Commit 4ea09eb

cholesky solve
1 parent fb3b7e8 commit 4ea09eb

13 files changed: +1023 -0 lines changed

linalg/cholesky-solve/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
!*

(The !* pattern re-includes every file, overriding ignore rules inherited from parent directories.)

linalg/cholesky-solve/_prof_is_square_matrix.txt

Whitespace-only changes.

linalg/cholesky-solve/_prox_is_nrhs_eq_1.txt

Whitespace-only changes.

linalg/cholesky-solve/linalg-prof.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import torch
import time
import itertools
import gc
import json

from torch.testing._internal.common_utils import random_hermitian_pd_matrix

TIME_MULTIPLIER = 1e6
TIME_UNIT = 'us'

nb = 200  # timed iterations per configuration
# nb = 1

torch.manual_seed(42)
torch.cuda.manual_seed(42)

def compare(x, y, *, rtol, atol):
    if isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor):
        if not x.is_cuda:
            x = x.cuda()
        if not y.is_cuda:
            raise RuntimeError("y tensor should be cuda, but it's not")
        return torch.testing._compare_tensors_internal(x, y, rtol=rtol, atol=atol, equal_nan=False)

    # tuple outputs: compare element-wise, collecting mismatch reports per element
    a = True
    b = {}
    for x_, y_, s_ in zip(x, y, ['U', 'S', 'V']):
        a_, b_ = compare(x_, y_, rtol=rtol, atol=atol)

        a = a and a_
        if not a_:
            b[s_] = b_

    return a, json.dumps(b, indent=2)


def main(s: str = ''):
    def prof(b_, n_, dtype=torch.float, p=None, flag=None):
        gc.collect()
        torch.cuda.empty_cache()

        if p is None:
            p = lambda x, z: x  # no-op fallback; p is always called with two arguments

        # print(b_, n_)
        # x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)
        zo = random_hermitian_pd_matrix(n_, *b_, device='cuda', dtype=torch.float64)
        z = torch.cholesky(zo).to(dtype=dtype)  # lower-triangular Cholesky factor of zo
        x = torch.randn(*b_, n_, n_, device='cuda').to(dtype=dtype)  # right-hand sides
        # x = torch.randn(*b_, n_, 1, device='cuda').to(dtype=dtype)

        xc = x.clone().cpu()
        zc = z.clone().cpu()

        # cpu timing
        t1 = time.time()
        for _ in range(nb):
            yc = p(xc, zc)
        t2 = time.time()
        cpu_time = (t2 - t1) / nb * TIME_MULTIPLIER
        # print('cpu', cpu_time, TIME_UNIT)

        if torch.isnan(yc).any() or torch.isnan(zc).any():
            print('cpu output contains nan')

        # warmup
        for _ in range(nb):
            y_warmup = p(x, z)
        torch.cuda.synchronize()

        # check that the inputs were not modified in place
        c, d = compare(xc, x, rtol=1e-7, atol=1e-7)
        if not c:
            print('original matrix compare')
            print(d)
            raise RuntimeError('original value x modified')
        c1, d1 = compare(zc, z, rtol=1e-7, atol=1e-7)
        if not c1:
            print('original matrix compare')
            print(d1)
            raise RuntimeError('original value z modified')

        # one profiled call, annotated with NVTX ranges for an external profiler
        torch.cuda.profiler.start()
        with torch.autograd.profiler.emit_nvtx(record_shapes=True):
            y = p(x, z)
            torch.cuda.synchronize()
        torch.cuda.profiler.stop()

        torch.cuda.synchronize()

        # gpu timing
        t1 = time.time()
        for _ in range(nb):
            # y = torch.cholesky(x)
            y = p(x, z)
        torch.cuda.synchronize()
        t2 = time.time()
        gpu_time = (t2 - t1) / nb * TIME_MULTIPLIER
        # print('gpu', gpu_time, TIME_UNIT)

        # bitwise determinism check: repeated calls must produce identical output
        e, f = compare(y_warmup, y, rtol=0, atol=0)
        if not e:
            print('non-determinism: cholesky_solve value output')
            print(f)
            raise RuntimeError('non-deterministic output')

        # reconstruct the right-hand side: zo @ y should recover x
        torch.backends.cuda.matmul.allow_tf32 = False
        reconstruct = (zo @ y.double()).float()
        torch.backends.cuda.matmul.allow_tf32 = True

        a, b = compare(x, reconstruct, rtol=1e-3, atol=1e-3)
        # a, b = compare(yc, y, rtol=1e-3, atol=1e-3)
        if not a:
            print('numerical mismatch: reconstruct value compare')
            print(b)

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f} {gpu_time : .3f}')
        # f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) +
          f'cpu_time({TIME_UNIT}), gpu_time({TIME_UNIT})')

    # sweep batch sizes ([], [1], [2], ..., [1024]) and matrix sizes (2 .. 2048),
    # skipping configurations where batch * matrix size gets too large
    for b, n in itertools.product(
        [[]] + [[2**i] for i in range(11)],
        [2**j for j in range(1, 12, 1)]
    ):
        if b and b[0] * n >= 2**14:
            continue
        prof(b, n, p=torch.cholesky_solve)

if __name__ == "__main__":
    main()

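For context, a minimal sketch (not part of this commit) of the operation linalg-prof.py benchmarks: torch.cholesky_solve(b, u) solves A x = b given a precomputed Cholesky factor u of A, which is what prof invokes through p(x, z).

import torch

# Hypothetical standalone example of the benchmarked call (not from the commit).
A = torch.randn(4, 4, dtype=torch.float64)
A = A @ A.T + 4 * torch.eye(4, dtype=torch.float64)  # make A symmetric positive definite
b = torch.randn(4, 2, dtype=torch.float64)           # two right-hand sides

u = torch.cholesky(A)            # lower-triangular factor, A = u @ u.T
x = torch.cholesky_solve(b, u)   # solves A @ x = b without refactorizing A

print(torch.allclose(A @ x, b))  # True up to rounding, mirroring prof's reconstruct check

The torch.cuda.profiler.start()/stop() and emit_nvtx calls in the script are meant to be run under an external profiler with capture initially disabled (e.g. nvprof --profile-from-start off), so that only the annotated call is recorded.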
linalg/cholesky-solve/parse.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import glob
from collections import defaultdict
import json
import io
import numpy as np

BEFORE = 'before-commit'
AFTER = 'after-commit'

# fixed column order for the markdown table
SORT_KEY = {
    "cpu": -1,
    "before_magma": 0,
    "after_potrs_64bit": 2,
    "after_heuristics": 3
}

class Markdown:
    """Accumulates markdown text in an in-memory binary buffer."""

    def __init__(self):
        self.buffer = io.BufferedRandom(io.BytesIO())
        self.enc = 'utf-8'

    def write(self, s: str):
        self.buffer.write(s.encode(self.enc))

    def read(self) -> bytes:
        self.buffer.seek(0)
        return self.buffer.read()

def main():
    profs = glob.glob('./prof*.txt')
    # profs = glob.glob('./prox*.txt')

    dt_gpu = defaultdict(dict)
    dt_cpu = defaultdict(dict)
    columns = ["cpu"]

    for prof in profs:
        impl_key = prof[7:-4]  # './prof-<impl>.txt' -> '<impl>'
        columns.append(impl_key)

        with open(prof, 'r') as f:
            fl = f.readlines()

        # result rows start with the batch shape, e.g. '[2] 64 torch.float32 ... cpu gpu'
        al = [line.rstrip().split() for line in fl if line.startswith('[')]

        for line in al:
            shape = ' '.join(line[:2])  # batch shape and matrix size, e.g. '[2] 64'
            t_cpu, t_gpu = (float(x) for x in line[-2:])

            dt_gpu[shape][impl_key] = t_gpu
            dt_cpu[shape][impl_key] = t_cpu

    columns.sort(key=SORT_KEY.__getitem__)

    print(json.dumps(dt_gpu, indent=2))
    # print(dt_cpu)

    md = Markdown()
    md.write('time is in **us** (10^-6 s)\n\n')
    md.write('|shape|' + '|'.join(columns) + '|\n')
    md.write('|---:' * (len(columns) + 1) + '|\n')

    for shape in dt_gpu.keys():
        # cpu time is averaged over all runs; gpu times are reported per implementation
        t_cpu_avg = np.mean([x for x in dt_cpu[shape].values()])
        md.write(f'| {shape} | {t_cpu_avg : .3f} |')

        for column in columns[1:]:
            md.write(f' {dt_gpu[shape].get(column, -1) : .3f} |')

        md.write('\n')


    with open('readme.md', 'wb') as f:
    # with open('readme1.md', 'wb') as f:
        f.write(md.read())


if __name__ == "__main__":
    main()
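A note on the file-name assumption baked into prof[7:-4]: it expects result files named like ./prof-<impl>.txt, where <impl> is a key of SORT_KEY. The name below is hypothetical (this commit's .txt files are whitespace-only, so the real names are not shown), but the slice arithmetic is just:

# hypothetical file name, illustrating the prof[7:-4] slice used in parse.py;
# './prof-' is 7 characters and '.txt' is the last 4
prof = './prof-before_magma.txt'
print(prof[7:-4])  # prints 'before_magma', one of the SORT_KEY columns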
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
1.9.0a0+git2b5c5c4

batch_size, matrix_size, dtype     cpu_time(us), gpu_time(us)
[] 2 torch.float32 105.547 75.507
[] 4 torch.float32 9.592 75.125
[] 8 torch.float32 10.310 75.818
[] 16 torch.float32 10.427 68.911
[] 32 torch.float32 13.537 77.344
[] 64 torch.float32 60.569 86.546
[] 128 torch.float32 99.032 119.070
[] 256 torch.float32 280.218 201.018
[] 512 torch.float32 1089.866 490.519
[] 1024 torch.float32 6125.575 1335.486
[] 2048 torch.float32 42986.248 5497.439
[1] 2 torch.float32 9.669 73.801
[1] 4 torch.float32 9.311 73.138
[1] 8 torch.float32 10.223 73.413
[1] 16 torch.float32 10.821 67.235
[1] 32 torch.float32 13.647 69.747
[1] 64 torch.float32 56.102 83.778
[1] 128 torch.float32 164.089 109.557
[1] 256 torch.float32 300.865 185.843
[1] 512 torch.float32 835.133 427.641
[1] 1024 torch.float32 4356.145 1345.123
[1] 2048 torch.float32 26658.406 5495.042
[2] 2 torch.float32 10.254 48.923
[2] 4 torch.float32 10.238 48.424
[2] 8 torch.float32 10.865 49.670
[2] 16 torch.float32 12.029 49.565
[2] 32 torch.float32 18.553 335.974
[2] 64 torch.float32 83.658 405.704
[2] 128 torch.float32 170.118 529.372
[2] 256 torch.float32 365.396 830.517
[2] 512 torch.float32 1402.911 1562.380
[2] 1024 torch.float32 8500.582 3699.644
numerical mismatch: reconstruct value compare
With rtol=0.001 and atol=0.001, found 1 element(s) (out of 8388608) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0010547935962677002 (0.04455813765525818 vs. 0.04350334405899048), which occurred at index (1, 452, 1011).
[2] 2048 torch.float32 60374.918 13091.600
[4] 2 torch.float32 11.771 51.131
[4] 4 torch.float32 12.223 49.632
[4] 8 torch.float32 12.293 51.562
[4] 16 torch.float32 15.020 50.697
[4] 32 torch.float32 26.133 335.603
[4] 64 torch.float32 154.521 459.424
[4] 128 torch.float32 269.843 556.146
[4] 256 torch.float32 571.958 888.574
[4] 512 torch.float32 2527.016 1773.859
[4] 1024 torch.float32 17031.137 4997.247
[4] 2048 torch.float32 119452.786 21604.799
[8] 2 torch.float32 17.611 66.310
[8] 4 torch.float32 19.614 65.430
[8] 8 torch.float32 18.976 66.751
[8] 16 torch.float32 24.600 66.377
[8] 32 torch.float32 49.813 368.210
[8] 64 torch.float32 296.102 518.253
[8] 128 torch.float32 415.326 607.669
[8] 256 torch.float32 1095.607 1049.521
[8] 512 torch.float32 5024.378 2348.893
[8] 1024 torch.float32 42197.851 7945.452
[16] 2 torch.float32 23.073 66.698
[16] 4 torch.float32 24.247 66.334
[16] 8 torch.float32 25.295 66.991
[16] 16 torch.float32 36.662 66.900
[16] 32 torch.float32 86.474 375.259
[16] 64 torch.float32 520.860 456.016
[16] 128 torch.float32 715.033 654.156
[16] 256 torch.float32 2046.187 1219.178
[16] 512 torch.float32 10900.669 3345.146
[32] 2 torch.float32 31.379 66.758
[32] 4 torch.float32 37.876 66.538
[32] 8 torch.float32 39.243 67.152
[32] 16 torch.float32 59.557 67.266
[32] 32 torch.float32 157.140 383.520
[32] 64 torch.float32 955.098 512.199
[32] 128 torch.float32 1370.115 723.370
[32] 256 torch.float32 4047.383 1559.268
[64] 2 torch.float32 49.703 67.573
[64] 4 torch.float32 59.655 67.368
[64] 8 torch.float32 63.415 67.888
[64] 16 torch.float32 104.959 68.390
[64] 32 torch.float32 294.157 381.888
[64] 64 torch.float32 1776.475 486.399
[64] 128 torch.float32 2635.866 829.155
[128] 2 torch.float32 85.740 68.507
[128] 4 torch.float32 105.935 67.955
[128] 8 torch.float32 132.358 69.039
[128] 16 torch.float32 194.751 69.127
[128] 32 torch.float32 530.604 386.889
[128] 64 torch.float32 3484.117 522.555
[256] 2 torch.float32 159.428 68.678
[256] 4 torch.float32 199.956 68.533
[256] 8 torch.float32 207.843 69.817
[256] 16 torch.float32 370.517 73.783
[256] 32 torch.float32 998.839 415.101
[512] 2 torch.float32 312.570 72.967
[512] 4 torch.float32 386.612 73.049
[512] 8 torch.float32 401.845 75.147
[512] 16 torch.float32 663.637 79.657
[1024] 2 torch.float32 599.290 85.372
[1024] 4 torch.float32 766.145 84.642
[1024] 8 torch.float32 797.913 88.762
