Commit 8ef22c3
committed Mar 25, 2021

cusolver inverse

1 parent 8a8d394

7 files changed, +621 -0 lines changed
‎linalg/cholesky-inverse/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
!*
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
import torch
import time
import itertools
import gc
import json
import multiprocessing
import threading

from torch.testing._internal.common_utils import random_hermitian_pd_matrix

TIME_MULTIPLIER = 1e6
TIME_UNIT = 'us'

nb = 200
# nb = 1

torch.manual_seed(42)
torch.cuda.manual_seed(42)

def compare(x, y, *, rtol, atol):
    if isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor):
        if not x.is_cuda:
            x = x.cuda()
        if not y.is_cuda:
            raise RuntimeError("y tensor should be cuda, but it's not")
        return torch.testing._compare_tensors_internal(x, y, rtol=rtol, atol=atol, equal_nan=False)

    a = True
    b = {}
    for x_, y_, s_ in zip(x, y, ['U', 'S', 'V']):
        a_, b_ = compare(x_, y_, rtol=rtol, atol=atol)

        a = a and a_
        if not a_:
            b[s_] = b_

    return a, json.dumps(b, indent=2)


def main(s: str = ''):
    def prof(b_, n_, dtype=torch.float, p=None, flag=None):
        gc.collect()
        torch.cuda.empty_cache()

        if p is None:
            p = lambda x: x

        # print(b_, n_)
        # x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)
        zo = random_hermitian_pd_matrix(n_, *b_, device='cpu', dtype=torch.float64).cuda()
        z = torch.cholesky(zo.cpu()).to(dtype=dtype, device='cuda')
        # x = torch.randn(*b_, n_, n_, device='cuda').to(dtype=dtype)
        # x = torch.randn(*b_, n_, 1, device='cuda').to(dtype=dtype)

        # xc = x.clone().cpu()
        zc = z.clone().cpu()

        # cpu timing
        t1 = time.time()
        for _ in range(nb):
            yc = p(zc)
        t2 = time.time()
        cpu_time = (t2-t1)/nb*TIME_MULTIPLIER
        # print('cpu', cpu_time, 'ms')

        if torch.isnan(yc).any() or torch.isnan(zc).any():
            print('cpu output contains nan')

        # warmup
        for _ in range(nb):
            y_warmup = p(z)
        torch.cuda.synchronize()

        # c, d = compare(xc, x, rtol=1e-7, atol=1e-7)
        # if not c:
        #     print('original matrix compare')
        #     print(d)
        #     raise RuntimeError('original value x modified')
        c1, d1 = compare(zc, z, rtol=1e-7, atol=1e-7)
        if not c1:
            print('original matrix compare')
            print(d1)
            raise RuntimeError('original value z modified')

        torch.cuda.profiler.start()
        with torch.autograd.profiler.emit_nvtx(record_shapes=True):
            y = p(z)
            torch.cuda.synchronize()
        torch.cuda.profiler.stop()

        torch.cuda.synchronize()

        # gpu timing
        t1 = time.time()
        for _ in range(nb):
            # y = torch.cholesky(x)
            y = p(z)
        torch.cuda.synchronize()
        t2 = time.time()
        gpu_time = (t2-t1)/nb*TIME_MULTIPLIER
        # print('gpu', gpu_time, 'ms')

        e, f = compare(y_warmup, y, rtol=0, atol=0)
        if not e:
            print('non-determinism: cholesky_solve value output')
            print(f)
            raise RuntimeError('non-deterministic output')

        torch.backends.cuda.matmul.allow_tf32 = False
        reconstruct = (zo @ y.double()).float()
        torch.backends.cuda.matmul.allow_tf32 = True

        a, b = compare(torch.eye(n_).expand(*b_, n_, n_), reconstruct, rtol=1e-3, atol=1e-3)
        # a, b = compare(yc, y, rtol=1e-3, atol=1e-3)
        if not a:
            print('numerical mismatch: reconstruct value compare')
            print(b)

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f} {gpu_time : .3f}')
        # f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) +
          f'cpu_time({TIME_UNIT}), gpu_time({TIME_UNIT})')

    for b, n in itertools.product(
            # [[]] + [[2**i] for i in range(11)],
            # [[], [1]],
            [[2**i] for i in range(1, 11)],
            [2**j for j in range(1, 12, 1)]
    ):
        if b and b[0] * n >= 2**14:
            continue
        prof(b, n, p=torch.cholesky_inverse)


if __name__ == "__main__":
    main()

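For readers skimming the diff, the correctness check in the script above boils down to the Cholesky-inverse identity: if A is Hermitian positive definite with Cholesky factor L (A = L Lᴴ), then cholesky_inverse(L) ≈ A⁻¹, so A @ cholesky_inverse(L) ≈ I. A minimal, standalone sketch of that check using the same calls as the benchmark (a CUDA-enabled build of PyTorch from this era is assumed):

import torch
from torch.testing._internal.common_utils import random_hermitian_pd_matrix

# Build a Hermitian positive-definite matrix A and its lower Cholesky factor L.
n = 64
a = random_hermitian_pd_matrix(n, device='cpu', dtype=torch.float64).cuda()
l = torch.cholesky(a.cpu()).to(dtype=torch.float32, device='cuda')

# cholesky_inverse consumes the factor L and approximates A^{-1}.
a_inv = torch.cholesky_inverse(l)

# Reconstruction test: A @ A^{-1} should be close to the identity.
# The benchmark disables TF32 for this matmul so the check is not noisy.
torch.backends.cuda.matmul.allow_tf32 = False
reconstruct = a.float() @ a_inv
print(torch.allclose(reconstruct, torch.eye(n, device='cuda'), rtol=1e-3, atol=1e-3))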
‎linalg/cholesky-inverse/parse.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import glob
from collections import defaultdict
import json
import io
import numpy as np

BEFORE = 'before-commit'
AFTER = 'after-commit'

SORT_KEY = {
    "cpu": -1,
    "before_magma": 0,
    "after_potrs_based": 2,
    "after_heuristics": 3,
}

class Markdown:
    def __init__(self):
        self.buffer = io.BufferedRandom(io.BytesIO())
        self.enc = 'utf-8'

    def write(self, s: str):
        self.buffer.write(s.encode(self.enc))

    def read(self) -> bytes:
        self.buffer.seek(0)
        return self.buffer.read()

def main():
    profs = glob.glob('./prof*.txt')
    # profs = glob.glob('./prox*.txt')

    dt_gpu = defaultdict(dict)
    dt_cpu = defaultdict(dict)
    columns = ["cpu"]

    for prof in profs:
        impl_key = prof[7:-4]
        columns.append(impl_key)

        with open(prof, 'r') as f:
            fl = f.readlines()

        al = [line.rstrip().split(' ') for line in fl if line.startswith('[')]

        for line in al:
            shape = line[0]
            t_cpu, t_gpu = (float(x) for x in line[-2:])

            dt_gpu[shape][impl_key] = t_gpu
            dt_cpu[shape][impl_key] = t_cpu

    columns.sort(key=SORT_KEY.__getitem__)

    print(json.dumps(dt_gpu, indent=2))
    # print(dt_cpu)

    md = Markdown()
    md.write('time is in **us** (10^-6 s)\n\n')
    md.write('|shape|' + '|'.join(columns) + '|\n')
    md.write('|---:' * (len(columns)+1) + '|\n')

    for shape in dt_gpu.keys():
        t_cpu_avg = np.mean([x for x in dt_cpu[shape].values()])
        md.write(f'| {shape} | {t_cpu_avg : .3f} |')

        for column in columns[1:]:
            md.write(f' {dt_gpu[shape].get(column, -1) : .3f} |')

        md.write('\n')


    with open('readme.md', 'wb') as f:
    # with open('readme1.md', 'wb') as f:
        f.write(md.read())


if __name__ == "__main__":
    main()
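parse.py assumes each data row in a ./prof*.txt file starts with the bracketed batch shape and ends with the CPU and GPU timings, exactly as printed by the benchmark above; the column name is derived from the file name via prof[7:-4]. The profile files themselves are not part of this commit, so the snippet below is only an illustrative sketch of the per-line parse, using one row from the results file further down; if the shape column is padded with repeated spaces, the empty tokens produced by split(' ') have to be filtered out, which the filter here adds:

# One data row as it appears in the results file below (single-space separated here).
row = '[2] 64 torch.float32 69.199 202.242'

fields = [tok for tok in row.rstrip().split(' ') if tok]  # drop empty tokens from padding
shape = fields[0]                                         # '[2]': the batch shape
t_cpu, t_gpu = (float(x) for x in fields[-2:])            # last two columns are timings in us

print(shape, t_cpu, t_gpu)  # [2] 69.199 202.242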
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@

1.9.0a0+gitf83bb72

batch_size, matrix_size, dtype     cpu_time(us), gpu_time(us)
[] 2 torch.float32 10.827 68.474
[] 4 torch.float32 10.290 68.400
[] 8 torch.float32 11.035 68.010
[] 16 torch.float32 13.291 60.335
[] 32 torch.float32 18.488 59.669
[] 64 torch.float32 40.630 59.838
[] 128 torch.float32 544.492 81.584
[] 256 torch.float32 545.278 183.367
[] 512 torch.float32 1765.593 467.857
[] 1024 torch.float32 7690.465 1377.743
[] 2048 torch.float32 50593.435 5687.451
[1] 2 torch.float32 10.318 66.886
[1] 4 torch.float32 10.408 66.848
[1] 8 torch.float32 11.165 67.350
[1] 16 torch.float32 13.280 59.938
[1] 32 torch.float32 18.553 58.808
[1] 64 torch.float32 35.299 59.521
[1] 128 torch.float32 587.937 107.318
[1] 256 torch.float32 665.547 155.816
[1] 512 torch.float32 2052.648 420.599
[1] 1024 torch.float32 7829.883 1377.732
[1] 2048 torch.float32 47425.777 5707.585
[2] 2 torch.float32 12.114 67.579
[2] 4 torch.float32 11.870 68.905
[2] 8 torch.float32 13.413 69.492
[2] 16 torch.float32 17.011 70.037
[2] 32 torch.float32 26.886 134.109
[2] 64 torch.float32 69.199 202.242
[2] 128 torch.float32 1211.519 391.432
[2] 256 torch.float32 1169.972 611.967
[2] 512 torch.float32 4103.551 1224.111
[2] 1024 torch.float32 15942.370 3580.601
[2] 2048 torch.float32 101954.410 12943.521
[4] 2 torch.float32 11.822 66.485
[4] 4 torch.float32 13.098 67.027
[4] 8 torch.float32 15.808 66.571
[4] 16 torch.float32 22.434 67.693
[4] 32 torch.float32 40.867 127.492
[4] 64 torch.float32 117.341 190.632
[4] 128 torch.float32 1923.342 343.292
[4] 256 torch.float32 2123.984 675.958
[4] 512 torch.float32 6900.259 1573.567
[4] 1024 torch.float32 33191.272 4883.418
[4] 2048 torch.float32 204610.639 21273.334
[8] 2 torch.float32 17.005 89.726
[8] 4 torch.float32 19.516 89.639
[8] 8 torch.float32 24.843 92.627
[8] 16 torch.float32 38.671 163.162
[8] 32 torch.float32 125.901 155.975
[8] 64 torch.float32 231.619 226.618
[8] 128 torch.float32 3673.230 393.283
[8] 256 torch.float32 3719.603 825.180
[8] 512 torch.float32 13651.674 2176.757
[8] 1024 torch.float32 70810.848 7957.207
[16] 2 torch.float32 17.269 90.749
[16] 4 torch.float32 21.216 90.303
[16] 8 torch.float32 30.577 90.498
[16] 16 torch.float32 54.492 90.675
[16] 32 torch.float32 127.280 161.965
[16] 64 torch.float32 496.632 241.767
[16] 128 torch.float32 7933.570 392.665
[16] 256 torch.float32 8850.931 892.092
[16] 512 torch.float32 29227.349 3448.063
[32] 2 torch.float32 17.402 69.261
[32] 4 torch.float32 24.254 69.902
[32] 8 torch.float32 42.560 70.021
[32] 16 torch.float32 89.972 70.196
[32] 32 torch.float32 241.292 140.296
[32] 64 torch.float32 923.823 228.437
[32] 128 torch.float32 14301.009 546.384
[32] 256 torch.float32 17318.430 1514.449
[64] 2 torch.float32 32.297 91.580
[64] 4 torch.float32 50.187 91.859
[64] 8 torch.float32 86.789 91.848
[64] 16 torch.float32 179.645 92.436
[64] 32 torch.float32 542.165 167.738
[64] 64 torch.float32 2208.329 276.698
[64] 128 torch.float32 30164.434 648.333
[128] 2 torch.float32 47.898 71.030
[128] 4 torch.float32 77.578 70.615
[128] 8 torch.float32 147.750 71.871
[128] 16 torch.float32 335.081 72.927
[128] 32 torch.float32 1780.635 150.491
[128] 64 torch.float32 4148.735 279.559
[256] 2 torch.float32 80.156 71.524
[256] 4 torch.float32 138.524 71.729
[256] 8 torch.float32 282.979 76.048
[256] 16 torch.float32 740.534 78.084
[256] 32 torch.float32 2066.872 178.185
[512] 2 torch.float32 152.835 76.374
[512] 4 torch.float32 267.853 77.028
[512] 8 torch.float32 560.991 78.712
[512] 16 torch.float32 1469.697 85.351
[1024] 2 torch.float32 298.941 88.992
[1024] 4 torch.float32 528.816 89.751
[1024] 8 torch.float32 1196.932 92.831
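To make the crossover in this table concrete: for small matrices the CUDA path sits at a roughly constant 60-90 us, likely dominated by fixed launch and synchronization overhead, so the CPU is faster, while for large matrices the CUDA path wins by a wide margin. A quick sketch of the CPU/GPU ratio for two rows above:

# Times taken from the '[]' rows of the table above, in microseconds.
rows = {
    ('[]', 64):   (40.630, 59.838),       # small matrix: fixed GPU overhead dominates
    ('[]', 2048): (50593.435, 5687.451),  # large matrix: GPU clearly faster
}
for (shape, n), (t_cpu, t_gpu) in rows.items():
    print(f'{shape} {n}: cpu/gpu = {t_cpu / t_gpu:.2f}x')  # 0.68x and 8.90x respectively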
