
Commit 00ceae2

Committed May 7, 2021
cholesky cusolver potrf batched
1 parent 22fa95a

File tree

10 files changed, +544 -0 lines changed
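The commit message and the column names in readme.md (bottom of this page) indicate what is being measured: `torch.cholesky` on CUDA, before and after batched inputs were switched from MAGMA's batched routine to cuSOLVER's `potrfBatched`. In the final table, the two builds track each other for unbatched and batch-1 inputs, while the cuSOLVER path is markedly faster for batch sizes of 2 and up (e.g. `[2] 1024 torch.float32`: 4393.322 µs → 1659.213 µs). A minimal sketch of the operation under test, with hypothetical sizes (`torch.cholesky` is the pre-`torch.linalg` API used throughout this commit):

```python
import torch

# A batch of symmetric positive-definite matrices, built the same way
# linalg-prof.py builds them: A @ A^T is positive semi-definite, and the
# small diagonal shift makes it strictly positive definite.
a = torch.randn(8, 64, 64, device='cuda')
spd = a @ a.transpose(-2, -1) + 1e-3 * torch.eye(64, device='cuda')

L = torch.cholesky(spd)  # batched potrf: one lower-triangular factor per matrix
assert torch.allclose(L @ L.transpose(-2, -1), spd, rtol=1e-3, atol=1e-3)
```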
 

‎linalg/cholesky-new/.gitignore

Lines changed: 1 addition & 0 deletions
!*
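(In .gitignore syntax, `!*` re-includes everything, overriding ignore rules inherited from parent directories; presumably this is so generated outputs such as the prof dumps and readme.md get committed.)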
linalg/cholesky-new/A100/linalg-prof.py

Lines changed: 1 addition & 0 deletions
../linalg-prof.py

‎linalg/cholesky-new/A100/parse.py

Lines changed: 1 addition & 0 deletions
../parse.py
linalg/cholesky-new/V100/linalg-prof.py

Lines changed: 1 addition & 0 deletions
../linalg-prof.py

‎linalg/cholesky-new/V100/parse.py

Lines changed: 1 addition & 0 deletions
../parse.py

‎linalg/cholesky-new/linalg-prof.py

Lines changed: 130 additions & 0 deletions
```python
import torch
import time
import itertools
import gc
import json

from torch.testing._internal.common_utils import random_symmetric_pd_matrix

torch.backends.cuda.matmul.allow_tf32 = False

TIME_MULTIPLIER = 1e6
TIME_UNIT = 'us'

nb = 200   # timed iterations per shape
# nb = 1

torch.manual_seed(42)
torch.cuda.manual_seed(42)

def compare(x, y, *, rtol, atol):
    if isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor):
        if not x.is_cuda:
            x = x.cuda()
        if not y.is_cuda:
            raise RuntimeError("y tensor should be cuda, but it's not")
        return torch.testing._core._compare_tensors_internal(x, y, rtol=rtol, atol=atol, equal_nan=False)

    # tuple-of-tensors case; the 'U', 'S', 'V' labels are a leftover from an
    # SVD benchmark that shares this harness
    a = True
    b = {}
    for x_, y_, s_ in zip(x, y, ['U', 'S', 'V']):
        a_, b_ = compare(x_, y_, rtol=rtol, atol=atol)

        a = a and a_
        if not a_:
            b[s_] = b_

    return a, json.dumps(b, indent=2)


def main(s: str = ''):
    def prof(b_, n_, dtype=torch.float, p=None, flag=None):
        gc.collect()
        torch.cuda.empty_cache()

        if p is None:
            p = lambda x: x

        # print(b_, n_)
        # x = random_symmetric_pd_matrix(n_, *b_, device='cuda').to(dtype=dtype)

        # symmetric positive-definite input: A A^T is PSD, and the small
        # diagonal shift makes it strictly positive definite
        _x = torch.randn(*b_, n_, n_, device='cuda', dtype=dtype)
        x = torch.matmul(_x, _x.transpose(-2, -1)) \
            + torch.eye(n_, n_, dtype=dtype, device='cuda') * 1e-3

        xc = x.clone().cpu()

        # cpu timing
        t1 = time.time()
        for _ in range(nb):
            yc = p(xc)
        t2 = time.time()
        cpu_time = (t2-t1)/nb*TIME_MULTIPLIER
        # print('cpu', cpu_time, 'ms')

        if torch.isnan(yc).any():
            print('cpu output contains nan')

        # warmup
        for _ in range(nb):
            y_warmup = p(x)
        torch.cuda.synchronize()

        # the factorization must not modify its input in place
        c, d = compare(xc, x, rtol=1e-7, atol=1e-7)
        if not c:
            print('original matrix compare')
            print(d)
            raise RuntimeError('original value modified')

        # one profiled launch with NVTX ranges, for an external profiler
        torch.cuda.profiler.start()
        with torch.autograd.profiler.emit_nvtx(record_shapes=True):
            y = p(x)
            torch.cuda.synchronize()
        torch.cuda.profiler.stop()

        torch.cuda.synchronize()

        # gpu timing
        t1 = time.time()
        for _ in range(nb):
            # y = torch.cholesky(x)
            y = p(x)
        torch.cuda.synchronize()
        t2 = time.time()
        gpu_time = (t2-t1)/nb*TIME_MULTIPLIER
        # print('gpu', gpu_time, 'ms')

        # repeated runs on identical input must agree bitwise
        e, f = compare(y_warmup, y, rtol=0, atol=0)
        if not e:
            print('non-determinism: cholesky value output')
            print(f)
            raise RuntimeError('non-deterministic output')

        # check that L L^T reconstructs the input
        reconstruct = torch.matmul(y, y.transpose(-1, -2))
        a, b = compare(x, reconstruct, rtol=1e-3, atol=1e-3)
        # a, b = compare(yc, y, rtol=1e-3, atol=1e-3)
        if not a:
            print('numerical mismatch: cholesky value compare')
            print(b)

        print(f'{b_} {n_} {dtype}'.ljust(35) + f'{cpu_time : .3f} {gpu_time : .3f}')
        # f.write(f'{b_} {n_} {dtype}; ' + f'{cpu_time : .3e}, {gpu_time : .3e}\n')
        torch.cuda.synchronize()

    print(s)
    print(torch.__version__)
    print()
    print('batch_size, matrix_size, dtype'.ljust(35) +
          f'cpu_time({TIME_UNIT}), gpu_time({TIME_UNIT})')

    # sweep batch shapes [], [1], [2], ..., [1024] and matrix sizes 2..2048,
    # skipping inputs of 2^24 elements or more
    for b, n in itertools.product(
        [[]] + [[2**i] for i in range(11)],
        [2**j for j in range(1, 12, 1)]
    ):
        if b and b[0] * n * n >= 2**24:
            continue
        prof(b, n, p=torch.cholesky)

if __name__ == "__main__":
    main()
```
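Two methodology notes on the script above. GPU time is host wall-clock around `nb` back-to-back launches with a single `torch.cuda.synchronize()` after the loop, so launch overhead is amortized across the loop; and `torch.cuda.profiler.start()`/`stop()` bracket one extra launch so that an external profiler (e.g. nvprof with `--profile-from-start off`) captures exactly one iteration annotated by `emit_nvtx`. The same per-iteration number can also be taken with CUDA events; a minimal sketch, not part of the commit, with a hypothetical helper name:

```python
import torch

def gpu_time_us(fn, x, iters=200, warmup=10):
    # Event-based alternative to the wall-clock loop in linalg-prof.py.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(warmup):        # warm up kernels and the allocator
        fn(x)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn(x)
    end.record()
    torch.cuda.synchronize()       # wait until the end event has completed
    return start.elapsed_time(end) / iters * 1e3  # elapsed_time() is in ms

# e.g.: gpu_time_us(torch.cholesky, spd_batch)
```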

‎linalg/cholesky-new/parse.py

Lines changed: 76 additions & 0 deletions
```python
import glob
from collections import defaultdict
import json
import io
import numpy as np

BEFORE = 'before-commit'
AFTER = 'after-commit'

# column order for the markdown table: cpu first, then the two gpu columns
SORT_KEY = {
    "cpu": -1,
    "before_potrf_and_magmaBatched": 0,
    "after_potrf_and_batched": 2,
}

class Markdown:
    def __init__(self):
        self.buffer = io.BufferedRandom(io.BytesIO())
        self.enc = 'utf-8'

    def write(self, s: str):
        self.buffer.write(s.encode(self.enc))

    def read(self) -> bytes:
        self.buffer.seek(0)
        return self.buffer.read()

def main():
    profs = glob.glob('./prof*.txt')

    dt_gpu = defaultdict(dict)
    dt_cpu = defaultdict(dict)
    columns = ["cpu"]

    for prof in profs:
        # strip the './prof-' prefix and '.txt' suffix to get the column name
        impl_key = prof[7:-4]
        columns.append(impl_key)

        with open(prof, 'r') as f:
            fl = f.readlines()

        # data rows start with the batch shape, e.g. '[] 2 ...' or '[8] 64 ...';
        # columns are separated by runs of two or more spaces (ljust padding),
        # so split on a double space rather than a single one
        al = [line.rstrip().split('  ') for line in fl if line.startswith('[')]

        for line in al:
            shape = line[0]
            t_cpu, t_gpu = (float(x) for x in line[-2:])

            dt_gpu[shape][impl_key] = t_gpu
            dt_cpu[shape][impl_key] = t_cpu

    columns.sort(key=SORT_KEY.__getitem__)

    print(json.dumps(dt_gpu, indent=2))
    # print(dt_cpu)

    md = Markdown()
    md.write('time is in **us** (10^-6 s)\n\n')
    md.write('|shape|' + '|'.join(columns) + '|\n')
    md.write('|---:' * (len(columns)+1) + '|\n')

    for shape in dt_gpu.keys():
        # cpu time is averaged across the dumps; it should not differ by build
        t_cpu_avg = np.mean([x for x in dt_cpu[shape].values()])
        md.write(f'| {shape} | {t_cpu_avg : .3f} |')

        for column in columns[1:]:
            md.write(f' {dt_gpu[shape].get(column, -1) : .3f} |')

        md.write('\n')


    with open('readme.md', 'wb') as f:
        f.write(md.read())


if __name__ == "__main__":
    main()
```
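A note on the parsing above: each benchmark row is keyed by the full shape string (e.g. `'[2] 64 torch.float32'`) and the last two fields are the times, so the split has to be on runs of at least two spaces. The shape field itself contains single spaces, while the `ljust(35)` padding and the `' .3f'` formats in linalg-prof.py guarantee at least two spaces between columns; a single-space split would truncate the key to `[2]` and produce empty fields. A hypothetical round trip of one line:

```python
# A benchmark line, padded the way linalg-prof.py pads it.
line = '[2] 64 torch.float32'.ljust(35) + ' 77.375  84.537'

fields = line.rstrip().split('  ')              # split on double spaces
shape = fields[0]                               # '[2] 64 torch.float32'
t_cpu, t_gpu = (float(x) for x in fields[-2:])  # float() tolerates padding
print(shape, t_cpu, t_gpu)                      # [2] 64 torch.float32 77.375 84.537
```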
linalg/cholesky-new/prof-after_potrf_and_batched.txt

Lines changed: 111 additions & 0 deletions

1.9.0a0+git29cec7c

batch_size, matrix_size, dtype  cpu_time(us), gpu_time(us)
[] 2 torch.float32  166.751  44.876
[] 4 torch.float32  14.199  44.647
[] 8 torch.float32  14.256  45.384
[] 16 torch.float32  17.726  45.079
[] 32 torch.float32  22.008  53.898
[] 64 torch.float32  70.720  77.246
[] 128 torch.float32  210.257  122.961
[] 256 torch.float32  342.160  213.780
[] 512 torch.float32  1105.834  396.389
[] 1024 torch.float32  4024.479  1027.658
[] 2048 torch.float32  29073.299  2866.979
[1] 2 torch.float32  17.226  49.818
[1] 4 torch.float32  17.081  49.938
[1] 8 torch.float32  18.842  50.026
[1] 16 torch.float32  21.275  49.798
[1] 32 torch.float32  26.360  50.294
[1] 64 torch.float32  59.795  74.153
[1] 128 torch.float32  191.315  111.419
[1] 256 torch.float32  315.719  193.673
[1] 512 torch.float32  527.442  387.703
[1] 1024 torch.float32  2236.570  1037.982
[1] 2048 torch.float32  12651.002  2883.768
[2] 2 torch.float32  16.831  55.482
[2] 4 torch.float32  16.716  55.149
[2] 8 torch.float32  17.506  55.062
[2] 16 torch.float32  21.101  55.068
[2] 32 torch.float32  28.524  62.721
[2] 64 torch.float32  77.375  84.537
[2] 128 torch.float32  297.496  135.024
[2] 256 torch.float32  521.365  250.014
[2] 512 torch.float32  1598.421  564.944
[2] 1024 torch.float32  6219.385  1659.213
[2] 2048 torch.float32  40012.587  6029.224
[4] 2 torch.float32  18.090  55.426
[4] 4 torch.float32  21.400  55.792
[4] 8 torch.float32  20.442  55.399
[4] 16 torch.float32  28.229  55.983
[4] 32 torch.float32  42.529  64.160
[4] 64 torch.float32  137.360  85.123
[4] 128 torch.float32  583.951  135.943
[4] 256 torch.float32  979.756  253.303
[4] 512 torch.float32  2742.286  653.256
[4] 1024 torch.float32  10081.830  2169.271
[8] 2 torch.float32  23.891  59.720
[8] 4 torch.float32  23.849  59.400
[8] 8 torch.float32  27.287  57.610
[8] 16 torch.float32  35.224  57.226
[8] 32 torch.float32  63.435  65.566
[8] 64 torch.float32  183.948  87.408
[8] 128 torch.float32  846.840  137.033
[8] 256 torch.float32  1544.204  283.318
[8] 512 torch.float32  4224.291  825.495
[8] 1024 torch.float32  21504.675  3183.084
[16] 2 torch.float32  23.084  55.476
[16] 4 torch.float32  24.315  54.772
[16] 8 torch.float32  30.618  57.551
[16] 16 torch.float32  48.567  57.892
[16] 32 torch.float32  91.910  64.478
[16] 64 torch.float32  323.658  85.677
[16] 128 torch.float32  1341.902  141.259
[16] 256 torch.float32  2446.045  358.053
[16] 512 torch.float32  7133.423  1193.165
[32] 2 torch.float32  26.637  58.398
[32] 4 torch.float32  27.893  56.289
[32] 8 torch.float32  38.693  56.460
[32] 16 torch.float32  72.627  57.133
[32] 32 torch.float32  139.066  65.295
[32] 64 torch.float32  515.858  91.805
[32] 128 torch.float32  2710.525  168.658
[32] 256 torch.float32  4628.189  507.494
[32] 512 torch.float32  18827.291  1955.204
[64] 2 torch.float32  35.092  60.304
[64] 4 torch.float32  37.804  57.956
[64] 8 torch.float32  55.552  56.902
[64] 16 torch.float32  132.011  59.494
[64] 32 torch.float32  243.195  64.955
[64] 64 torch.float32  1048.934  93.381
[64] 128 torch.float32  6201.448  254.424
[64] 256 torch.float32  10290.796  874.258
[128] 2 torch.float32  49.268  60.600
[128] 4 torch.float32  54.197  56.717
[128] 8 torch.float32  123.600  57.168
[128] 16 torch.float32  218.310  58.094
[128] 32 torch.float32  371.875  71.715
[128] 64 torch.float32  1960.315  127.599
[128] 128 torch.float32  11133.779  444.566
[128] 256 torch.float32  24017.075  1519.374
[256] 2 torch.float32  77.413  56.883
[256] 4 torch.float32  88.081  57.020
[256] 8 torch.float32  160.654  57.039
[256] 16 torch.float32  358.368  61.771
[256] 32 torch.float32  655.063  81.611
[256] 64 torch.float32  3945.091  218.042
[256] 128 torch.float32  20847.642  747.677
[512] 2 torch.float32  187.004  76.611
[512] 4 torch.float32  200.417  58.587
[512] 8 torch.float32  358.144  65.459
[512] 16 torch.float32  628.021  74.518
[512] 32 torch.float32  1254.412  134.076
[512] 64 torch.float32  7750.514  454.936
[512] 128 torch.float32  52983.167  1382.968
[1024] 2 torch.float32  315.468  57.831
[1024] 4 torch.float32  302.045  65.851
[1024] 8 torch.float32  465.969  74.170
[1024] 16 torch.float32  1112.744  90.847
[1024] 32 torch.float32  2358.574  228.770
[1024] 64 torch.float32  14549.073  831.982
linalg/cholesky-new/prof-before_potrf_and_magmaBatched.txt

Lines changed: 111 additions & 0 deletions

1.9.0a0+git3948ce2

batch_size, matrix_size, dtype  cpu_time(us), gpu_time(us)
[] 2 torch.float32  206.735  43.274
[] 4 torch.float32  14.042  43.120
[] 8 torch.float32  15.231  43.688
[] 16 torch.float32  17.363  44.925
[] 32 torch.float32  22.341  52.098
[] 64 torch.float32  57.448  75.674
[] 128 torch.float32  181.807  120.755
[] 256 torch.float32  335.406  218.886
[] 512 torch.float32  980.325  446.244
[] 1024 torch.float32  4017.303  1027.806
[] 2048 torch.float32  28989.214  2877.455
[1] 2 torch.float32  16.053  48.676
[1] 4 torch.float32  16.146  48.577
[1] 8 torch.float32  16.990  48.357
[1] 16 torch.float32  18.630  48.988
[1] 32 torch.float32  23.012  49.260
[1] 64 torch.float32  49.570  72.438
[1] 128 torch.float32  149.957  110.153
[1] 256 torch.float32  333.872  191.205
[1] 512 torch.float32  556.333  386.291
[1] 1024 torch.float32  2077.819  1035.727
[1] 2048 torch.float32  11273.186  2887.653
[2] 2 torch.float32  16.811  75.074
[2] 4 torch.float32  16.468  75.297
[2] 8 torch.float32  17.303  75.999
[2] 16 torch.float32  19.442  84.742
[2] 32 torch.float32  22.466  93.547
[2] 64 torch.float32  55.546  120.595
[2] 128 torch.float32  245.310  182.900
[2] 256 torch.float32  578.849  374.588
[2] 512 torch.float32  1589.254  1183.922
[2] 1024 torch.float32  6429.636  4393.322
[2] 2048 torch.float32  40620.950  7468.882
[4] 2 torch.float32  21.574  76.656
[4] 4 torch.float32  20.320  75.183
[4] 8 torch.float32  23.221  77.455
[4] 16 torch.float32  29.128  84.240
[4] 32 torch.float32  42.975  95.069
[4] 64 torch.float32  147.607  124.631
[4] 128 torch.float32  587.690  192.106
[4] 256 torch.float32  914.129  394.497
[4] 512 torch.float32  2885.733  1311.259
[4] 1024 torch.float32  9762.999  4892.240
[8] 2 torch.float32  17.601  77.999
[8] 4 torch.float32  18.094  76.505
[8] 8 torch.float32  20.335  77.841
[8] 16 torch.float32  30.015  83.659
[8] 32 torch.float32  48.658  97.371
[8] 64 torch.float32  168.009  128.245
[8] 128 torch.float32  759.834  199.677
[8] 256 torch.float32  1343.409  425.509
[8] 512 torch.float32  4386.839  1495.225
[8] 1024 torch.float32  21372.106  5863.535
[16] 2 torch.float32  18.562  76.997
[16] 4 torch.float32  19.741  76.300
[16] 8 torch.float32  23.991  76.799
[16] 16 torch.float32  41.455  83.600
[16] 32 torch.float32  78.890  98.929
[16] 64 torch.float32  306.462  127.835
[16] 128 torch.float32  1379.602  202.054
[16] 256 torch.float32  2409.948  507.530
[16] 512 torch.float32  7665.732  1844.689
[32] 2 torch.float32  23.065  74.961
[32] 4 torch.float32  28.031  77.012
[32] 8 torch.float32  31.961  77.581
[32] 16 torch.float32  94.479  85.294
[32] 32 torch.float32  133.064  98.566
[32] 64 torch.float32  540.137  128.793
[32] 128 torch.float32  2548.141  217.744
[32] 256 torch.float32  4898.430  689.526
[32] 512 torch.float32  18985.937  2800.300
[64] 2 torch.float32  29.178  76.237
[64] 4 torch.float32  31.106  76.892
[64] 8 torch.float32  47.171  78.007
[64] 16 torch.float32  135.556  84.151
[64] 32 torch.float32  216.458  98.063
[64] 64 torch.float32  939.356  129.018
[64] 128 torch.float32  5006.846  302.775
[64] 256 torch.float32  9406.040  1043.229
[128] 2 torch.float32  44.152  79.353
[128] 4 torch.float32  45.574  77.045
[128] 8 torch.float32  106.797  78.263
[128] 16 torch.float32  198.779  84.773
[128] 32 torch.float32  350.193  98.732
[128] 64 torch.float32  1847.135  148.139
[128] 128 torch.float32  10121.361  519.588
[128] 256 torch.float32  23357.860  1925.991
[256] 2 torch.float32  66.094  78.578
[256] 4 torch.float32  80.675  78.168
[256] 8 torch.float32  172.222  79.124
[256] 16 torch.float32  343.161  86.131
[256] 32 torch.float32  616.329  102.046
[256] 64 torch.float32  3859.301  235.053
[256] 128 torch.float32  18831.398  863.646
[512] 2 torch.float32  170.538  77.360
[512] 4 torch.float32  221.990  78.505
[512] 8 torch.float32  289.357  80.695
[512] 16 torch.float32  589.947  89.736
[512] 32 torch.float32  1167.077  123.539
[512] 64 torch.float32  7573.780  453.908
[512] 128 torch.float32  52992.334  1599.033
[1024] 2 torch.float32  297.906  79.390
[1024] 4 torch.float32  294.363  81.875
[1024] 8 torch.float32  465.537  84.850
[1024] 16 torch.float32  1099.160  97.232
[1024] 32 torch.float32  2342.029  221.044
[1024] 64 torch.float32  14075.669  777.911
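The two dumps above appear to be the raw stdout of linalg-prof.py on the two builds being compared: 1.9.0a0+git29cec7c, whose gpu times match the after_potrf_and_batched column of readme.md, and 1.9.0a0+git3948ce2, whose gpu times match before_potrf_and_magmaBatched. parse.py merges them into the table that follows.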

‎linalg/cholesky-new/readme.md

Lines changed: 111 additions & 0 deletions
time is in **us** (10^-6 s)

|shape|cpu|before_potrf_and_magmaBatched|after_potrf_and_batched|
|---:|---:|---:|---:|
| [] 2 torch.float32 | 186.743 | 43.274 | 44.876 |
| [] 4 torch.float32 | 14.120 | 43.120 | 44.647 |
| [] 8 torch.float32 | 14.744 | 43.688 | 45.384 |
| [] 16 torch.float32 | 17.544 | 44.925 | 45.079 |
| [] 32 torch.float32 | 22.175 | 52.098 | 53.898 |
| [] 64 torch.float32 | 64.084 | 75.674 | 77.246 |
| [] 128 torch.float32 | 196.032 | 120.755 | 122.961 |
| [] 256 torch.float32 | 338.783 | 218.886 | 213.780 |
| [] 512 torch.float32 | 1043.080 | 446.244 | 396.389 |
| [] 1024 torch.float32 | 4020.891 | 1027.806 | 1027.658 |
| [] 2048 torch.float32 | 29031.256 | 2877.455 | 2866.979 |
| [1] 2 torch.float32 | 16.639 | 48.676 | 49.818 |
| [1] 4 torch.float32 | 16.614 | 48.577 | 49.938 |
| [1] 8 torch.float32 | 17.916 | 48.357 | 50.026 |
| [1] 16 torch.float32 | 19.953 | 48.988 | 49.798 |
| [1] 32 torch.float32 | 24.686 | 49.260 | 50.294 |
| [1] 64 torch.float32 | 54.683 | 72.438 | 74.153 |
| [1] 128 torch.float32 | 170.636 | 110.153 | 111.419 |
| [1] 256 torch.float32 | 324.796 | 191.205 | 193.673 |
| [1] 512 torch.float32 | 541.888 | 386.291 | 387.703 |
| [1] 1024 torch.float32 | 2157.195 | 1035.727 | 1037.982 |
| [1] 2048 torch.float32 | 11962.094 | 2887.653 | 2883.768 |
| [2] 2 torch.float32 | 16.821 | 75.074 | 55.482 |
| [2] 4 torch.float32 | 16.592 | 75.297 | 55.149 |
| [2] 8 torch.float32 | 17.404 | 75.999 | 55.062 |
| [2] 16 torch.float32 | 20.271 | 84.742 | 55.068 |
| [2] 32 torch.float32 | 25.495 | 93.547 | 62.721 |
| [2] 64 torch.float32 | 66.460 | 120.595 | 84.537 |
| [2] 128 torch.float32 | 271.403 | 182.900 | 135.024 |
| [2] 256 torch.float32 | 550.107 | 374.588 | 250.014 |
| [2] 512 torch.float32 | 1593.838 | 1183.922 | 564.944 |
| [2] 1024 torch.float32 | 6324.511 | 4393.322 | 1659.213 |
| [2] 2048 torch.float32 | 40316.768 | 7468.882 | 6029.224 |
| [4] 2 torch.float32 | 19.832 | 76.656 | 55.426 |
| [4] 4 torch.float32 | 20.860 | 75.183 | 55.792 |
| [4] 8 torch.float32 | 21.831 | 77.455 | 55.399 |
| [4] 16 torch.float32 | 28.678 | 84.240 | 55.983 |
| [4] 32 torch.float32 | 42.752 | 95.069 | 64.160 |
| [4] 64 torch.float32 | 142.483 | 124.631 | 85.123 |
| [4] 128 torch.float32 | 585.821 | 192.106 | 135.943 |
| [4] 256 torch.float32 | 946.942 | 394.497 | 253.303 |
| [4] 512 torch.float32 | 2814.010 | 1311.259 | 653.256 |
| [4] 1024 torch.float32 | 9922.414 | 4892.240 | 2169.271 |
| [8] 2 torch.float32 | 20.746 | 77.999 | 59.720 |
| [8] 4 torch.float32 | 20.971 | 76.505 | 59.400 |
| [8] 8 torch.float32 | 23.811 | 77.841 | 57.610 |
| [8] 16 torch.float32 | 32.620 | 83.659 | 57.226 |
| [8] 32 torch.float32 | 56.047 | 97.371 | 65.566 |
| [8] 64 torch.float32 | 175.978 | 128.245 | 87.408 |
| [8] 128 torch.float32 | 803.337 | 199.677 | 137.033 |
| [8] 256 torch.float32 | 1443.807 | 425.509 | 283.318 |
| [8] 512 torch.float32 | 4305.565 | 1495.225 | 825.495 |
| [8] 1024 torch.float32 | 21438.391 | 5863.535 | 3183.084 |
| [16] 2 torch.float32 | 20.823 | 76.997 | 55.476 |
| [16] 4 torch.float32 | 22.028 | 76.300 | 54.772 |
| [16] 8 torch.float32 | 27.304 | 76.799 | 57.551 |
| [16] 16 torch.float32 | 45.011 | 83.600 | 57.892 |
| [16] 32 torch.float32 | 85.400 | 98.929 | 64.478 |
| [16] 64 torch.float32 | 315.060 | 127.835 | 85.677 |
| [16] 128 torch.float32 | 1360.752 | 202.054 | 141.259 |
| [16] 256 torch.float32 | 2427.997 | 507.530 | 358.053 |
| [16] 512 torch.float32 | 7399.577 | 1844.689 | 1193.165 |
| [32] 2 torch.float32 | 24.851 | 74.961 | 58.398 |
| [32] 4 torch.float32 | 27.962 | 77.012 | 56.289 |
| [32] 8 torch.float32 | 35.327 | 77.581 | 56.460 |
| [32] 16 torch.float32 | 83.553 | 85.294 | 57.133 |
| [32] 32 torch.float32 | 136.065 | 98.566 | 65.295 |
| [32] 64 torch.float32 | 527.997 | 128.793 | 91.805 |
| [32] 128 torch.float32 | 2629.333 | 217.744 | 168.658 |
| [32] 256 torch.float32 | 4763.310 | 689.526 | 507.494 |
| [32] 512 torch.float32 | 18906.614 | 2800.300 | 1955.204 |
| [64] 2 torch.float32 | 32.135 | 76.237 | 60.304 |
| [64] 4 torch.float32 | 34.455 | 76.892 | 57.956 |
| [64] 8 torch.float32 | 51.361 | 78.007 | 56.902 |
| [64] 16 torch.float32 | 133.784 | 84.151 | 59.494 |
| [64] 32 torch.float32 | 229.827 | 98.063 | 64.955 |
| [64] 64 torch.float32 | 994.145 | 129.018 | 93.381 |
| [64] 128 torch.float32 | 5604.147 | 302.775 | 254.424 |
| [64] 256 torch.float32 | 9848.418 | 1043.229 | 874.258 |
| [128] 2 torch.float32 | 46.710 | 79.353 | 60.600 |
| [128] 4 torch.float32 | 49.886 | 77.045 | 56.717 |
| [128] 8 torch.float32 | 115.198 | 78.263 | 57.168 |
| [128] 16 torch.float32 | 208.544 | 84.773 | 58.094 |
| [128] 32 torch.float32 | 361.034 | 98.732 | 71.715 |
| [128] 64 torch.float32 | 1903.725 | 148.139 | 127.599 |
| [128] 128 torch.float32 | 10627.570 | 519.588 | 444.566 |
| [128] 256 torch.float32 | 23687.467 | 1925.991 | 1519.374 |
| [256] 2 torch.float32 | 71.754 | 78.578 | 56.883 |
| [256] 4 torch.float32 | 84.378 | 78.168 | 57.020 |
| [256] 8 torch.float32 | 166.438 | 79.124 | 57.039 |
| [256] 16 torch.float32 | 350.764 | 86.131 | 61.771 |
| [256] 32 torch.float32 | 635.696 | 102.046 | 81.611 |
| [256] 64 torch.float32 | 3902.196 | 235.053 | 218.042 |
| [256] 128 torch.float32 | 19839.520 | 863.646 | 747.677 |
| [512] 2 torch.float32 | 178.771 | 77.360 | 76.611 |
| [512] 4 torch.float32 | 211.204 | 78.505 | 58.587 |
| [512] 8 torch.float32 | 323.750 | 80.695 | 65.459 |
| [512] 16 torch.float32 | 608.984 | 89.736 | 74.518 |
| [512] 32 torch.float32 | 1210.745 | 123.539 | 134.076 |
| [512] 64 torch.float32 | 7662.147 | 453.908 | 454.936 |
| [512] 128 torch.float32 | 52987.751 | 1599.033 | 1382.968 |
| [1024] 2 torch.float32 | 306.687 | 79.390 | 57.831 |
| [1024] 4 torch.float32 | 298.204 | 81.875 | 65.851 |
| [1024] 8 torch.float32 | 465.753 | 84.850 | 74.170 |
| [1024] 16 torch.float32 | 1105.952 | 97.232 | 90.847 |
| [1024] 32 torch.float32 | 2350.302 | 221.044 | 228.770 |
| [1024] 64 torch.float32 | 14312.371 | 777.911 | 831.982 |
