liangbch
发表于 2010-12-14 18:46:02
贴上在E8500的运行结果,函数包含至105楼。
Test function: UInt128x128To256_ANSI_C32(..) 10000000 times...
Elapsed time: 896.603 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_40F(..) 10000000 times...
Elapsed time: 335.639 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_42F(..) 10000000 times...
Elapsed time: 302.855 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_54F(..) 10000000 times...
Elapsed time: 310.400 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_56F(..) 10000000 times...
Elapsed time: 264.212 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_58F(..) 10000000 times...
Elapsed time: 271.680 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_69F(..) 10000000 times...
Elapsed time: 282.345 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_93F(..) 10000000 times...
Elapsed time: 374.652 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_96F(..) 10000000 times...
Elapsed time: 265.366 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_102F(..) 10000000 times...
Elapsed time: 400.549 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
Test function: UInt128x128To256_SSE2_105F(..) 10000000 times...
Elapsed time: 292.557 ms
21BC4991 10D8039C A196539D 146A3CBE * E3580EAB 9F5C4DAD D4D6030B 4E7C3B55
= 1DF58FE3 D59883DF 459F83F8 724F3842 3D686D6C 1EA80B08 E6980EFD 934DF516
G-Spider
发表于 2010-12-14 20:59:53
感觉n多时间消耗在函数调度上,而感受不到函数本身的内在价值。如果把运算次数作为一个参数,最速下降处理这个参数........,或许更好一点。可这样又或许对UInt128x128To256模块的设计不妥,因为我们通常不需要这个参数。
gxqcn
发表于 2010-12-15 08:06:34
131# liangbch
正好印证了我62#的猜测:谢谢楼上的测试!:handshake
看来不同机器对指令的处理周期相差很大,
也许在 Core2 上,56F 反超 58F 也是可能的。。。
gxqcn 发表于 2008-3-24 12:47 http://bbs.emath.ac.cn/images/common/back.gif
mizne
发表于 2011-3-20 03:14:18
线程的创建和销毁代价也是非常大的
thewangj
发表于 2011-3-20 14:06:02
真是高手如云啊
佩服!学习中。。。
smiler
发表于 2012-5-15 08:54:49
学习! 准备将之修改到256×256的算法中去
guxd
发表于 2013-7-6 08:49:17
楼主真厉害
无心人
发表于 2014-11-4 11:37:21
有最新CPU的同学更新下测试结果,另外,是不是需要弄个SSE4.2跟AVX2的版本来?
还有,64位也测试下?
只是呼吸
发表于 2014-11-27 13:30:32
本帖最后由 只是呼吸 于 2014-11-27 13:33 编辑
我用我的这台计算机试了一下。
机器数据: 操作系统win7
cpu Intel(R) Core(TM) i3-3240 cpu @ 3.40GHz 3.40GHz
内存 4.0GB(3.47GB可用)
系统类型 32位操作系统
编译系统 vc++6.0
对函数"UInt128x128To256_ANSI_C32"的测试结果是 10000000次1023.014ms
对函数"UInt128x128To256_ALU_102F"的测试结果是 10000000次280.132ms
其余的函数不能通过编译,运行不了。
l4m2
发表于 2023-5-20 02:04:57
用15年後的電腦,限制使用16位(8086)指令,10 000 000次耗時2.5秒。
; Benchmark driver (flat assembler syntax): calls mult repeatedly on the two
; 128-bit operands mul1 and mul2, with mulF as the 256-bit result buffer.
; org 100h sets the load origin; presumably this is built as a DOS .COM
; program -- TODO confirm.
org 100h
L0:
; Pass the operand and result addresses in the registers listed in the
; header comments of mult (BX and SI inputs, DI output).
mov BX, mul1
mov SI, mul2
mov DI, mulF
call mult
; NOTE(review): the memory operand of this DEC is missing from the listing
; (apparently stripped during extraction); presumably [counter] -- TODO confirm.
; It is a 32-bit decrement, so it needs a 386 or later even though the code
; is otherwise 16-bit.
dec dword
; Repeat until the decremented value reaches zero.
jnz L0
ret
; Pad with uninitialized bytes up to address 11Ch.
db 11Ch-$ dup ?
; 32-bit iteration count: 10 000 000.
counter:
dd 10000000
; Pad with uninitialized bytes up to address 120h.
db 120h-$ dup ?
; First 128-bit operand: two all-ones quadwords.
mul1:
dq -1, -1
; Second 128-bit operand: two all-ones quadwords.
mul2:
dq -1, -1
; Result buffer: four uninitialized quadwords (32 bytes).
mulF:
dq 4 dup ?
; mult -- fully unrolled multiply built from 16-bit MUL instructions. It emits
; one column of partial products at a time and stores one result word per
; column (15 columns plus a final top word = 16 words).
; NOTE(review): every memory operand (the bracketed address) is missing from
; this listing, apparently stripped when the post was extracted, so the routine
; does not assemble as shown. Restore the operands from the original post.
mult:
; Input SI
; Input BX
; Output DI
; Destroy ALL
; Buffer BX CX BP
; Eight word moves through AX, with i running from -7 to 0. Source and
; destination are missing; presumably this copies the BX operand elsewhere,
; because BX is reused below as an accumulator -- TODO confirm.
rept 8 i:-7 {
mov AX,
mov , AX
}
; Column 0: AX still holds the last word moved above (i = 0); MUL leaves the
; 32-bit product in DX:AX. The display directive prints "00 " at assembly time.
mul word
display 48+0, 48+(0), 32
; Store the low word (destination missing) and seed the accumulator with the
; high word.
mov , AX
mov BX, DX
xor CX, CX
; Z1:Z2:Z3 name the three-word column accumulator, low word first. The names
; are rotated after every column instead of moving data between registers.
Z1 equ BX
Z2 equ CX
Z3 equ BP
; Columns 1..7: column i sums the i+1 products of word j and word i-j, j = 0..i.
rept 7 i: 1 {
; Clear the word that catches carries out of Z2.
xor Z3, Z3
rept i+1 j: 0 \{
; Operands missing; the display line below prints the digit pair j, i-j at
; assembly time, which presumably identifies the two words multiplied here.
mov AX,
mul word
display 48+j, 48+(i-j), 32
; Add the 32-bit product DX:AX into Z1:Z2 and propagate the carry into Z3.
add Z1, AX
adc Z2, DX
adc Z3, 0
\}
; Z1 is the finished result word for this column (destination missing).
mov , Z1
; Rotate the names: old Z2 and Z3 become the low and middle words of the next
; column, and the register just stored is recycled as the new Z3.
Z4 equ Z1
Z1 equ Z2
Z2 equ Z3
Z3 equ Z4
}
; Columns 8..14: column i sums the 15-i products with j = i-7..7.
rept 7 i: 8 {
; The last column (i = 14) skips the third accumulator word: both the clear
; and the carry into Z3 are assembled only when i < 14.
if i < 14
xor Z3, Z3
end if
rept 15-i j: i-7 \{
; Operands missing, as above.
mov AX,
mul word
display 48+j, 48+(i-j), 32
add Z1, AX
adc Z2, DX
if i < 14
adc Z3, 0
end if
\}
; Store this column's result word (destination missing) and rotate the names.
mov , Z1
Z4 equ Z1
Z1 equ Z2
Z2 equ Z3
Z3 equ Z4
}
; After the final rotation Z1 names the remaining top word; store it
; (destination missing).
mov , Z1
ret
作為對比,5億次16位乘法耗時1秒:
; Comparison benchmark: a loop running one 16-bit MUL per iteration for
; 500 000 000 iterations.
; NOTE(review): the memory operands of the MOV and MUL below are missing from
; the listing (apparently stripped during extraction), so it does not assemble
; as shown -- restore them from the original post.
org 100h
; 32-bit loop counter (ECX needs a 386 or later).
mov ecx, 500000000
L:
; Load a word into AX and multiply it by a word in memory; both addresses are
; missing.
mov ax,
mul word
; Count down and repeat until ECX reaches zero.
dec ecx
jnz L
ret
页:
4
5
6
7
8
9
10
11
12
13
[14]
15