- 注册时间
- 2007-12-28
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 12769
- 在线时间
- 小时
|
发表于 2008-5-4 15:40:57
|
显示全部楼层
增加了2个版本,先给出在PIV 2.6上的测试结果
说明:
binAdd_base30_ALU_4way_unroll2 为 binAdd_base30_ALU_4way_unroll的改进版,指令数更少,速度更快些。
binAdd_base30_ALU_8way_unroll2 为 binAdd_base30_ALU_8way_unroll的改进版,指令数更少,速度更快些,但基本与binAdd_base30_ALU_4way_unroll2 持平。
Test function: binAdd_ALU(..) 100000 times...
Elapsed time: 1904.675 ms
Test function: binAdd_GMP_ALU_8way_unroll(..) 100000 times...
Elapsed time: 1357.660 ms
Test function: binAdd_Yaos_MMX_4way_unroll(..) 100000 times...
Elapsed time: 772.390 ms
Test function: binAdd_base30_ALU(..) 100000 times...
Elapsed time: 1203.820 ms
Test function: binAdd_base30_ALU_4way_unroll(..) 100000 times...
Elapsed time: 1225.580 ms
Test function: binAdd_base30_ALU_4way_unroll2(..) 100000 times...
Elapsed time: 1190.035 ms
Test function: binAdd_base30_ALU_8way_unroll(..) 100000 times...
Elapsed time: 1213.810 ms
Test function: binAdd_base30_ALU_8way_unroll2(..) 100000 times...
Elapsed time: 1189.320 ms
Test function: binAdd_Base30_MMX_4way_unroll(..) 100000 times...
Elapsed time: 903.745 ms
再给出测试代码:
//基为2^30的版本另一个版本,采用4路循环展开- _declspec(naked)
- DWORD binAdd_base30_ALU_4way_unroll2(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push edi
- push esi
- push ebx
- sub esp,4
-
- mov ebx,[esp+16+PARAM_DST]
- mov esi,[esp+16+PARAM_SRC1]
- mov edi,[esp+16+PARAM_SRC2]
- mov edx,[esp+16+PARAM_SIZE]
-
- lea eax,[esi+edx*4]
- mov [esp],eax
-
- xor ecx,ecx //clear carry
- and edx,3
- jz a10
-
- loop00:
- add ecx,[esi]
- add ecx,[edi]
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec edx
- jnz loop00
- a10:
- jmp cmp00
-
- ALIGN 8
- loop01:
- add ecx, [esi]
- add ecx, [edi]
- mov eax, ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
-
- add ecx, [esi+4]
- add ecx, [edi+4]
- mov edx, ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+4],edx
-
-
- add ecx, [esi+8]
- add ecx, [edi+8]
- mov eax, ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+8],eax
-
-
- add ecx, [esi+12]
- add ecx, [edi+12]
- mov edx, ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+12],edx
-
-
- lea esi,[esi+16]
- lea edi,[edi+16]
- lea ebx,[ebx+16]
- cmp00:
- cmp esi,[esp]
- jb loop01
-
- thisExit:
- add esp,4
-
- pop ebx
- pop esi
- pop edi
- mov eax,0
- adc eax,0
- ret
- }
- }
复制代码 //基为2^30的版本另一个版本,指令数更少,采用8路循环展开- _declspec(naked)
- DWORD binAdd_base30_ALU_8way_unroll2(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push edi
- push esi
- push ebx
- sub esp,4
-
- mov ebx,[esp+16+PARAM_DST]
- mov esi,[esp+16+PARAM_SRC1]
- mov edi,[esp+16+PARAM_SRC2]
- mov edx,[esp+16+PARAM_SIZE]
-
- lea eax,[esi+edx*4]
- mov [esp],eax
-
- xor ecx,ecx
- and edx,7
- jz a10
-
- loop00:
- add ecx,[esi]
- add ecx,[edi]
- mov eax,ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec edx
- jnz loop00
- a10:
- jmp cmp00
-
- ALIGN 8
- loop01:
- add ecx,[esi]
- add ecx,[edi]
- mov eax,ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
-
-
- add ecx,[esi+4]
- add ecx,[edi+4]
- mov edx,ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+4],edx
-
-
- add ecx,[esi+8]
- add ecx,[edi+8]
- mov eax,ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+8],eax
-
-
- add ecx,[esi+12]
- add ecx,[edi+12]
- mov edx,ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+12],edx
-
-
- add ecx,[esi+16]
- add ecx,[edi+16]
- mov eax,ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+16],eax
-
-
- add ecx,[esi+20]
- add ecx,[edi+20]
- mov edx,ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+20],edx
-
-
- add ecx,[esi+24]
- add ecx,[edi+24]
- mov eax,ecx
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+24],eax
-
-
- add ecx,[esi+28]
- add ecx,[edi+28]
- mov edx,ecx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+28],edx
-
-
- lea esi,[esi+32]
- lea edi,[edi+32]
- lea ebx,[ebx+32]
- cmp00:
- cmp esi,[esp]
- jb loop01
-
- thisExit:
- add esp,4
- pop ebx
- pop esi
- pop edi
- mov eax,0
- adc eax,0
- ret
- }
- }
复制代码
|
|