-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update perf numbers given the new `dt0` methodology: best of 3 runs, with a rough ("bogo") uncertainty estimate based on 1/3 of the difference between the minimum and the next-smallest time. Maybe more significantly, point out to the reader that these are hot-everything measurements and that incremental wall time is potentially misleading.
- Loading branch information
Showing
1 changed file
with
44 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
## FastIEEESinglePrecNaturalLogAbs; Just arctanh Taylor@1. Was 5X fastr'n mid00s | ||
## x87. On SkyLake/glibc2.40/gcc14 ~1.1-1.4X faster; ARM64glibc somehow(fastHW?) | ||
## x87. On SkyLake/glibc2.40/gcc14 ~1.2-2X faster; ARM64glibc somehow(fastHW?) | ||
## ~4X faster. Unsure about Win/OSX. See news.ycombinator.com/item?id=40758562 | ||
type f4s {.packed.} = object # De-structuring object for IEEE-single | ||
frac1 {.bitsize: 16}: cuint # Little-Endian format only right now | ||
|
@@ -65,7 +65,7 @@ when isMainModule: | |
inc n | ||
if not (l.isNaN or 2*x==x): sum += l | ||
let dt = epochTime() - t0 - dt0 | ||
echo &"sum0: {sum0:.0f} sum: {sum} in {dt:.6f} second; n: {n}; {dt/n.float*1e9:.2f} ns/eval" | ||
echo &"S0:{sum0:.2g} sL:{sum} in {dt:.6f} second;n: {n}; {dt/n.float*1e9:.2f} ns/eval" | ||
else: | ||
when not declared(stdout): import std/[syncio, formatFloat] | ||
import std/[math, heapqueue] | ||
|
@@ -117,42 +117,45 @@ when isMainModule: | |
echo "abs: ";(while abErr.len>0:(let e=abErr.pop;echo " ",e,lnaT(e[1]))) | ||
echo "rel: ";(while rlErr.len>0:(let e=rlErr.pop;echo " ",e,lnaT(e[1]))) | ||
#[ b=(chrt 99 taskset -c 2-3 env -i HOME=/u/cb PATH=/u/cb/bin:/usr/local/bin:/usr/bin) | ||
for p in *Fast *Std *FastFIM *StdFIM *FastFM *StdFM;{ec $p;repeat 2 nor 0 $b ./$p} | ||
AlderLake (i7-1370P @5.2 GHz); frq v | ||
lnaFast - 1.194x faster | ||
sum: 1652640836.519009 in 10.62613010406494 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 10.62076020240784 seconds; n: 4278190080 | ||
lnaStd | ||
sum: inf in 12.71785449981689 seconds; n: 4278190080 | ||
sum: inf in 12.68324708938599 seconds; n: 4278190080 | ||
lnaFastFIM - 1.174x faster | ||
sum: 1652640836.519009 in 10.93485021591187 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 11.22672414779663 seconds; n: 4278190080 | ||
lnaStdFIM | ||
sum: inf in 12.90901017189026 seconds; n: 4278190080 | ||
sum: inf in 12.83666086196899 seconds; n: 4278190080 | ||
lnaFastFM - 1.104x faster | ||
sum: 1652640836.519009 in 9.682320833206177 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 9.732645034790039 seconds; n: 4278190080 | ||
lnaStdFM | ||
sum: inf in 10.70303058624268 seconds; n: 4278190080 | ||
sum: inf in 10.68658685684204 seconds; n: 4278190080 | ||
SkyLake (i7-6700k@4.7GHz): frq f | ||
lnaFast - 1.373x faster | ||
sum: 1652640836.519009 in 19.2596070766449 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 19.25886845588684 seconds; n: 4278190080 | ||
lnaStd | ||
sum: inf in 26.43976330757141 seconds; n: 4278190080 | ||
sum: inf in 26.44279456138611 seconds; n: 4278190080 | ||
lnaFastFIM - 1.138x faster | ||
sum: 1652640836.519009 in 19.66579365730286 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 19.66761422157288 seconds; n: 4278190080 | ||
lnaStdFIM | ||
sum: inf in 22.48359608650208 seconds; n: 4278190080 | ||
sum: inf in 22.38936853408813 seconds; n: 4278190080 | ||
lnaFastFM - 1.388x faster | ||
sum: 1652640836.519009 in 15.90760946273804 seconds; n: 4278190080 | ||
sum: 1652640836.519009 in 15.90723752975464 seconds; n: 4278190080 | ||
lnaStdFM | ||
sum: inf in 22.09396910667419 seconds; n: 4278190080 | ||
sum: inf in 22.07609820365906 seconds; n: 4278190080 ]# | ||
i7_6700k$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} | ||
S0:5.3e+36 sL:1652640659.073322 in 13.312009 second;n: 8556380160; 1.56 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 13.285249 second;n: 8556380160; 1.55 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 13.296462 second;n: 8556380160; 1.55 ns/eval | ||
1.55 +- 0.003 | ||
S0:5.3e+36 sL:1652640659.073322 in 10.756718 second;n: 8556380160; 1.26 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 10.712787 second;n: 8556380160; 1.25 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 10.718144 second;n: 8556380160; 1.25 ns/eval | ||
1.25 +- 0.003 | ||
S0:5.3e+36 sL:1652640659.073322 in 11.040576 second;n: 8556380160; 1.29 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 11.030243 second;n: 8556380160; 1.29 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 10.904071 second;n: 8556380160; 1.27 ns/eval | ||
1.27 +- 0.006 | ||
S0:5.3e+36 sL:1641011596.122295 in 20.227257 second;n: 8556380160; 2.36 ns/eval | ||
S0:5.3e+36 sL:1641011596.122295 in 20.227509 second;n: 8556380160; 2.36 ns/eval | ||
S0:5.3e+36 sL:1641011596.122295 in 20.231506 second;n: 8556380160; 2.36 ns/eval | ||
2.36 +- 0.003 | ||
i7_1370P$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} | ||
S0:5.3e+36 sL:1652640659.073322 in 6.615819 second;n: 8556380160; 0.77 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 6.980933 second;n: 8556380160; 0.82 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 7.320486 second;n: 8556380160; 0.86 ns/eval | ||
0.773 +- 0.017 | ||
S0:5.3e+36 sL:1652640659.073322 in 7.609612 second;n: 8556380160; 0.89 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 7.661034 second;n: 8556380160; 0.90 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 8.191683 second;n: 8556380160; 0.96 ns/eval | ||
0.889 +- 0.003 | ||
S0:5.3e+36 sL:1652640659.073322 in 8.293185 second;n: 8556380160; 0.97 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 8.342474 second;n: 8556380160; 0.98 ns/eval | ||
S0:5.3e+36 sL:1652640659.073322 in 8.330391 second;n: 8556380160; 0.97 ns/eval | ||
0.969 +- 0.003 | ||
S0:5.3e+36 sL:1641011596.122295 in 7.963845 second;n: 8556380160; 0.93 ns/eval | ||
S0:5.3e+36 sL:1641011596.122295 in 8.605017 second;n: 8556380160; 1.01 ns/eval | ||
S0:5.3e+36 sL:1641011596.122295 in 9.766621 second;n: 8556380160; 1.14 ns/eval | ||
0.931 +- 0.027 | ||
In Summary: Skylake(4.7GHz) AlderLake (5.2GHzPcore) | ||
1.55 +- 0.003 0.773 +- 0.017 | ||
1.25 +- 0.003 0.889 +- 0.003 | ||
1.27 +- 0.006 0.969 +- 0.003 | ||
1.89x 2.36 +- 0.003 1.20x 0.931 +- 0.027 | ||
Note that assessing CPU superscalar pipeline util is much more subtle than raw | ||
wall clock time. These "speed-ups" are really ratios of "incremental wall time | ||
per loop per lna() eval" in best possible, hot-everything cases. ]# |