-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: use setfield and make make_zero!! type-stable
- Loading branch information
Showing
9 changed files
with
92 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,23 @@ | ||
function Lux.Training.compute_gradients(::AutoTracker, obj_fn::F, data, | ||
ts::TrainState{<:TrainingBackendCache{:Tracker, FT}}) where {F, FT} | ||
dparams = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters) | ||
ps_tracked = construct_tracked_params(ts.parameters, dparams) | ||
ts::TrainState{<:TrainingBackendCache{AutoTracker}}) where {F} | ||
dps = Training.dparameters(ts.cache) | ||
ps_tracked = construct_tracked_params(ts.parameters, dps) | ||
|
||
loss, st, stats = obj_fn(ts.model, ps_tracked, ts.states, data) | ||
Tracker.back!(loss) | ||
|
||
ts_new = TrainState( | ||
TrainingBackendCache{:Tracker, false}(ts.cache.dparameters, nothing), obj_fn, | ||
ts.model, ts.parameters, st, ts.optimizer, ts.optimizer_state, ts.step) | ||
@set! ts.cache.first_try = False() | ||
@set! ts.objective_function = obj_fn | ||
@set! ts.states = st | ||
|
||
return dparams, loss.data, stats, ts_new | ||
return dps, loss.data, stats, ts | ||
end | ||
|
||
function Lux.Training.compute_gradients( | ||
::AutoTracker, obj_fn::F, data, ts::TrainState) where {F} | ||
ad::AutoTracker, obj_fn::F, data, ts::TrainState) where {F} | ||
grads = Lux.recursive_make_zero(ts.parameters) | ||
ts_new = TrainState( | ||
TrainingBackendCache{:Tracker, true}(grads, nothing), obj_fn, ts.model, | ||
ts.parameters, ts.states, ts.optimizer, ts.optimizer_state, ts.step) | ||
return Lux.Training.compute_gradients(AutoTracker(), obj_fn, data, ts_new) | ||
cache = TrainingBackendCache(ad, True(), grads, nothing) | ||
@set! ts.cache = cache | ||
@set! ts.objective_function = obj_fn | ||
return Lux.Training.compute_gradients(ad, obj_fn, data, ts) | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
f60db4d
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
f60db4d
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/115175
Tip: Release Notes
Did you know you can add release notes too? Just add markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates, e.g.:

    @JuliaRegistrator register

    Release notes:

    ## Breaking changes

    - blah

To add them here, just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

    git tag -a v<version> -m "<description of version>" f60db4d
    git push origin v<version>
f60db4d
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
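Lux Benchmarks
(Table layout was lost in extraction. Each entry below spans four lines: benchmark name, first time, second time, and ratio (first ÷ second) — presumably the current and previous commits, respectively. The `ns` that begins the third and fourth lines is the unit of the value on the line above it, e.g. `414500 ns`, `411125 ns`, ratio `1.01`.)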
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s)
414500
ns411125
ns1.01
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s)
322250
ns322750
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s)
322708.5
ns244083
ns1.32
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s)
741958
ns740229
ns1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA
44250.5
ns43576
ns1.02
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s)
1327167
ns1361688
ns0.97
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s)
2451688
ns2448167
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s)
14209750
ns16505500
ns0.86
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s)
2193937.5
ns2198042
ns1.00
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA
207380
ns207361
ns1.00
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s)
1468292
ns1419479
ns1.03
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s)
923959
ns931729
ns0.99
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s)
1598937.5
ns1582917
ns1.01
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s)
2242395.5
ns2213229
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1762396
ns1768708
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1028250
ns1072541.5
ns0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1537583
ns1542417
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2885833.5
ns3010167
ns0.96
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
208790
ns208923
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12117833
ns12164458
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
8811750
ns8831167
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9165333.5
ns9231125
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18605125
ns18575542
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1497201
ns1506706
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17314916
ns17297875
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
13952000
ns13966709
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14449937
ns14490229
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21832333
ns21825958
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250356604.5
ns250077771
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
148503729
ns148351292
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115663250
ns116742208
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
452727834
ns446235042
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5471701
ns5474148
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1224679334
ns1226735000
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
932428750
ns933099541
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
831047479.5
ns833488083
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1654023458
ns1628798917
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
31662494
ns31247743
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1141591625
ns1139513458
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1004360417
ns1004012958
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1322994750
ns1343460771
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1741933375
ns1729098333
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
1120833.5
ns1084187.5
ns1.03
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
1620917
ns1632875
ns0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
3462083
ns3807833
ns0.91
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
779667
ns781500
ns1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA
270336.5
ns269181
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2988271
ns2973917
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
4139875
ns4123458
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
9659916
ns11391021
ns0.85
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3132834
ns3140229.5
ns1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1134352.5
ns1147789
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
2338166
ns2327458.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1437021
ns1427875
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1669291
ns1552208
ns1.08
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
4193000
ns4203041
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
210459.5
ns209123
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
19441042
ns19423562
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
16082770.5
ns16279416
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
17400416.5
ns17361812
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
25866000
ns25815125
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1593435
ns1606839
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
34177125
ns34524104
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
30976000
ns31057875
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
31151000
ns31105416
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
36261000
ns36883875
ns0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
4537333
ns4526208.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2776604
ns2777083.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2913645.5
ns2685312.5
ns1.09
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
8378750
ns8381562.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
420670
ns373639
ns1.13
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
38891374.5
ns38887521
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
32306292
ns32509584
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
32384208
ns32333229
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
51948083
ns51833125
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2620746.5
ns2633953
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
88847729
ns88607687.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
114070333.5
ns113743125
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
226493250
ns227726583
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
73885250
ns74951083
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
268317334
ns267716166
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
159216084
ns159256375
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
127078708
ns123708895.5
ns1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
492762417
ns485091625
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
6963353
ns7022924
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1469208062.5
ns1478680979
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
1179701333
ns1179547083
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
1064469187.5
ns1066054563
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
2018298416.5
ns2001889209
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34585385
ns34822377.5
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1726168042
ns1724298291
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1532131312.5
ns1565497271
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1753217833
ns1925114250
ns0.91
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
2220540250
ns2239111625
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
2032250
ns2028500
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
2850166.5
ns2967646
ns0.96
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
7482625
ns8104667
ns0.92
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2429979
ns2308041.5
ns1.05
lenet(28, 28, 1, 128)/forward/GPU/CUDA
267353.5
ns272667
ns0.98
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
9603854
ns9619395.5
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
11874437.5
ns12015166
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
24867021
ns26324292
ns0.94
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
11308542
ns11677541
ns0.97
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1173785
ns1188628.5
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
380634584
ns383215354.5
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
287745375
ns284366604.5
ns1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
243501229
ns261725395.5
ns0.93
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
452284375.5
ns453056042
ns1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
5016811.5
ns5009701
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
1137459875
ns1160384584
ns0.98
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
943993333
ns912166042
ns1.03
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
898262625
ns984922208
ns0.91
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
1411909416
ns1396092167
ns1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
18115193
ns18111984
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1060437
ns1053833
ns1.01
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
2017041.5
ns1605958
ns1.26
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
5113542
ns5411083
ns0.95
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1366833
ns1296875
ns1.05
lenet(28, 28, 1, 64)/forward/GPU/CUDA
265207
ns265721
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6505083
ns6510958
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
12271187.5
ns13082584
ns0.94
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
18806687.5
ns21760833.5
ns0.86
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
6078250
ns5984375
ns1.02
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1214045
ns1208949
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70581646
ns70494333
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43485459
ns43641125
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39436292
ns39690584
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132675958
ns133468354
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1863920
ns1945255.5
ns0.96
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
355687833.5
ns356723479.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
270693083.5
ns271306709
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
254405500.5
ns254269771
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
538777458
ns536238459
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
12367452
ns12301288
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
396200000
ns395599834
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
402727854
ns377440167
ns1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
668679417
ns697289229.5
ns0.96
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
708861625
ns708495833
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
1187349792
ns1188885083
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
694829104
ns692916625
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
629932709
ns642915416.5
ns0.98
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
1779143271
ns1776695937.5
ns1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
13225818
ns12306515
ns1.07
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
3622108083.5
ns3668882667
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
2828172709
ns2834396125
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
2724737708
ns2699395792
ns1.01
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
5083300000
ns5050853166
ns1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49807086.5
ns49852240.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3420729.5
ns3422958
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2074875
ns2075583
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2525042
ns2513666
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6011833
ns6018396
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
315086
ns317455.5
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
26295500
ns26048666
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18987458
ns19094062.5
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19862667
ns19316000
ns1.03
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
39218853.5
ns39190562.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2478386
ns2466381
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
55626729.5
ns55369583
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
81917708
ns82210395.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
172510354
ns173994812.5
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
45569417
ns45354333
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1782395.5
ns1779187.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1093791.5
ns1097834
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1586291.5
ns1568791
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
3026979
ns3021312
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213440.5
ns210623
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12557083
ns12543916
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9205917
ns9277708.5
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9717709
ns9594229.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18945396
ns18987604.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1545222
ns1527868.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17667958
ns17650708
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14312292
ns14335458
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14670667
ns14544250
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
22150709
ns22174250
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
70496583.5
ns70431125
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
43541375
ns43537125
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
39470417
ns39620583
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
132760312.5
ns132531916.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1958343
ns1888879
ns1.04
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
358409083
ns360439083.5
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
346583313
ns347132666.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
304589375
ns304637542
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
725990125
ns722631792
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13320357
ns13304668
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
418971104
ns419234750
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
419729042
ns421465729
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
662505333
ns724319500
ns0.91
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
715138292
ns714217917
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
1450437
ns1705416
ns0.85
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
1298979
ns1350333.5
ns0.96
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
1344645.5
ns1170667
ns1.15
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
2365917
ns2385333.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
590150.5
ns580442.5
ns1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
8684833
ns8948271
ns0.97
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
12890000
ns12980437.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
30836166.5
ns32353312.5
ns0.95
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
9843750
ns9804417
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1473920
ns1427987.5
ns1.03
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17999292
ns17962354
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
16546208
ns17440000
ns0.95
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
29181291
ns29738291
ns0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
14097584
ns14431937.5
ns0.98
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s)
693250
ns669833.5
ns1.03
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s)
521417
ns529250
ns0.99
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s)
1040750
ns1065708.5
ns0.98
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s)
724875
ns725395.5
ns1.00
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA
48072
ns47647
ns1.01
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s)
1566292
ns1549104
ns1.01
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s)
1002937.5
ns1038917
ns0.97
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s)
1370333.5
ns1517584
ns0.90
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s)
2257250
ns2269896
ns0.99
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA
238196.5
ns233022
ns1.02
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s)
1571020.5
ns1582916
ns0.99
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s)
1080916
ns1087854.5
ns0.99
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s)
1541833
ns1464166
ns1.05
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s)
2236209
ns2190854
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3399875
ns3413625
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2047875
ns2047083
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2515021
ns2507333.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
6005375
ns6011813
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
286172.5
ns284231.5
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24087042
ns24149000
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
17224041.5
ns17330312.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17292291
ns17059271
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
37522062.5
ns37480499.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2407498
ns2394265
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
53768270.5
ns53573937.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
83654187.5
ns83649500
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
169263021
ns172928458
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
44565333.5
ns44425187.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
250492042
ns249999250
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
148428250
ns148223583
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
115397479.5
ns116384896
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
450610604
ns447335937.5
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5443833
ns5449146
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1101924667
ns1105347792
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
855192187.5
ns857822708.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
827218333.5
ns830398396
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1763706625
ns1762030583
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
29367206
ns28862807
ns1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1019223979
ns1020245354
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
945177042
ns966178875
ns0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1303173167
ns1293466208
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1739257541.5
ns1724193375.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1211708
ns1306896.5
ns0.93
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
981875
ns984292
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
948167
ns778437.5
ns1.22
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2062875
ns1958750
ns1.05
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
569657
ns566426
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
5819083.5
ns6042375
ns0.96
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
4699250
ns6715125
ns0.70
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
24610750.5
ns26872708
ns0.92
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
7096333
ns6973417
ns1.02
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1369164.5
ns1365853
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
11390750
ns11215770.5
ns1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
9112562.5
ns10033208
ns0.91
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
17263667
ns17672208
ns0.98
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
8694666.5
ns8568500
ns1.01
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s)
384000
ns399500
ns0.96
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s)
364688
ns399291.5
ns0.91
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s)
2302437.5
ns3544167
ns0.65
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s)
89750
ns88459
ns1.01
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA
27591.5
ns27618
ns1.00
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s)
391125
ns397459
ns0.98
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s)
382584
ns445041.5
ns0.86
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s)
4380375
ns4819375
ns0.91
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s)
258417
ns259833
ns0.99
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA
220859
ns219889.5
ns1.00
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s)
421604
ns428313
ns0.98
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s)
411750
ns475541
ns0.87
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s)
4491917
ns4960437.5
ns0.91
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s)
271250
ns271333
ns1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s)
329896
ns343709
ns0.96
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s)
300084
ns333937.5
ns0.90
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s)
750333
ns769833
ns0.97
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s)
54375
ns53125
ns1.02
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA
27841
ns28016
ns0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s)
355792
ns362209
ns0.98
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s)
247167
ns342792
ns0.72
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s)
868125
ns897833
ns0.97
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s)
151750
ns152583
ns0.99
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA
205968
ns205326.5
ns1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s)
368375
ns378500
ns0.97
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s)
261709
ns358042
ns0.73
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s)
714208
ns728708
ns0.98
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s)
151125
ns150833.5
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
601673542
ns603479208
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
433401687
ns429058104
ns1.01
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
378552750
ns385950542
ns0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
874120625
ns872372584
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7030592
ns7023071
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
2007087354.5
ns2010730958
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
1632009874.5
ns1608264687.5
ns1.01
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
1618542583.5
ns1653085833
ns0.98
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
2637429416
ns2638084625
ns1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26054721.5
ns25932761
ns1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s)
523500
ns535250
ns0.98
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s)
435895.5
ns433291.5
ns1.01
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s)
1828249.5
ns3023791.5
ns0.60
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s)
866354
ns880791
ns0.98
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA
47636
ns46986
ns1.01
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s)
1763270.5
ns1881604
ns0.94
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s)
2797458.5
ns2798729
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s)
14370145.5
ns16356750
ns0.88
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s)
2769562.5
ns2759229
ns1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA
248789.5
ns246659.5
ns1.01
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s)
1945916.5
ns1962958.5
ns0.99
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s)
5043500
ns5070604
ns0.99
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s)
14572416
ns16396875
ns0.89
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s)
2785979.5
ns2785625.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1374375
ns1614125
ns0.85
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1189542
ns1235583
ns0.96
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1224645.5
ns1027208
ns1.19
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2299000
ns2300875
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
583268.5
ns587018.5
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
5918791
ns5921542
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
7147000
ns5089688
ns1.40
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
24359584
ns26372271
ns0.92
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
7320208
ns7288250
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
1348690.5
ns1379747.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
13093542
ns13324958
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
12017167
ns12237645.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
20888000
ns21281499.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
10214417
ns10668750
ns0.96
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s)
2375
ns4417
ns0.54
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s)
2500
ns2583.5
ns0.97
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s)
3333.5
ns2750
ns1.21
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s)
2958
ns2500
ns1.18
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA
24628
ns24754
ns0.99
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s)
7291.5
ns7459
ns0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s)
7083
ns7250
ns0.98
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s)
7333.5
ns7333
ns1.00
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s)
7083
ns7083
ns1
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA
209898.5
ns213008
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s)
8250
ns8375
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s)
8208
ns8583
ns0.96
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s)
8375
ns8459
ns0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s)
5958
ns5834
ns1.02
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s)
10458
ns10625
ns0.98
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s)
12937.5
ns13708
ns0.94
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s)
10708
ns12042
ns0.89
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s)
7250
ns7500
ns0.97
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA
24907
ns25091.5
ns0.99
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s)
19875
ns20250
ns0.98
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s)
20104.5
ns19959
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s)
20125
ns20083
ns1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s)
20000
ns19875
ns1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA
230594
ns231793
ns0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s)
23583.5
ns23625
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s)
23708
ns23667
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s)
23625
ns23666
ns1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s)
21333
ns21084
ns1.01
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s)
28459
ns28708
ns0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s)
28542
ns29292
ns0.97
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s)
28770.5
ns28375
ns1.01
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s)
45917
ns46584
ns0.99
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA
25803
ns26247
ns0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s)
230250
ns222250
ns1.04
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s)
288166
ns279729.5
ns1.03
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s)
4212042
ns4335396.5
ns0.97
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s)
145000
ns145208
ns1.00
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA
207914
ns203061
ns1.02
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s)
342187.5
ns333124.5
ns1.03
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s)
333166
ns322500
ns1.03
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s)
411895.5
ns861333
ns0.48
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s)
160646
ns160750
ns1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s)
1750
ns1875
ns0.93
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s)
1791
ns1958
ns0.91
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s)
2250
ns2416
ns0.93
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s)
1958
ns1792
ns1.09
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA
23251.5
ns23061
ns1.01
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s)
5208
ns5458
ns0.95
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s)
5208
ns5500
ns0.95
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s)
5500
ns5375
ns1.02
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s)
5291
ns5375
ns0.98
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA
245332
ns243257
ns1.01
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s)
11291.5
ns11333.5
ns1.00
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s)
11375
ns11208
ns1.01
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s)
11458
ns11667
ns0.98
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s)
6959
ns6833
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
79898667
ns79834791
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
49104563
ns49125291
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
44920792
ns43259375
ns1.04
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
151542042
ns151428917
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2713787
ns2726005
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
665144875
ns498680292
ns1.33
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
414328875
ns414152083
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
399605708
ns396991709
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
687317792
ns689086500
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
14579874
ns14585553
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
718439500
ns712438146
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
685447833
ns683887166
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
1000305625
ns1013847083
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
992652792
ns999589459
ns0.99
This comment was automatically generated by workflow using github-action-benchmark.