- 注册时间
- 2007-12-28
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 12769
- 在线时间
- 小时
|
发表于 2008-4-16 16:43:45
|
显示全部楼层
// 接楼上,给出剩余的几个采用$2^30$进制的大数加法的函数
//基为2^30的版本,不采用循环展开- _declspec(naked)
- DWORD binAdd_base30_ALU(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push edi
- push esi
- push ebx
-
-
- mov ebx,[esp+12+PARAM_DST]
- mov esi,[esp+12+PARAM_SRC1]
- mov edi,[esp+12+PARAM_SRC2]
- mov ecx,[esp+12+PARAM_SIZE]
-
- xor edx,edx //clear carry
- or ecx,ecx
- jz thisExit
-
- loop00:
- mov eax,[esi]
- add eax,[edi]
- add eax,edx
- mov edx,eax
- and eax,BASE30_MASK
- shr edx,30
- mov [ebx],eax
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec ecx
- jnz loop00
-
- thisExit:
-
- pop ebx
- pop esi
- pop edi
- mov eax,0
- adc eax,0
- ret
- }
- }
复制代码 //基为2^30的版本,采用4路循环展开- _declspec(naked)
- DWORD binAdd_base30_ALU_4way_unroll(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push edi
- push esi
- push ebx
- sub esp,4
-
- mov ebx,[esp+16+PARAM_DST]
- mov esi,[esp+16+PARAM_SRC1]
- mov edi,[esp+16+PARAM_SRC2]
- mov edx,[esp+16+PARAM_SIZE]
-
- lea eax,[esi+edx*4]
- mov [esp],eax
-
- xor ecx,ecx //clear carry
- and edx,3
- jz a10
-
- loop00:
- mov eax,[esi]
- add eax,[edi]
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec edx
- jnz loop00
- a10:
- jmp cmp00
-
- ALIGN 8
- loop01:
- //calc [ebx] and [ebx+4]
- mov edx,[esi]
- mov eax,[esi+4]
- add edx,[edi]
- add eax,[edi+4]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+4],eax
-
- //calc [ebx+8] and [ebx+12]
- mov edx,[esi+8]
- mov eax,[esi+12]
- add edx,[edi+8]
- add eax,[edi+12]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+8],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+12],eax
-
- lea esi,[esi+16]
- lea edi,[edi+16]
- lea ebx,[ebx+16]
- cmp00:
- cmp esi,[esp]
- jb loop01
-
- thisExit:
- add esp,4
-
- pop ebx
- pop esi
- pop edi
- mov eax,0
- adc eax,0
- ret
- }
- }
复制代码 //基为2^30的版本,采用8路循环展开- _declspec(naked)
- DWORD binAdd_base30_ALU_8way_unroll(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push edi
- push esi
- push ebx
- sub esp,4
-
- mov ebx,[esp+16+PARAM_DST]
- mov esi,[esp+16+PARAM_SRC1]
- mov edi,[esp+16+PARAM_SRC2]
- mov edx,[esp+16+PARAM_SIZE]
-
- lea eax,[esi+edx*4]
- mov [esp],eax
-
- xor ecx,ecx
- and edx,7
- jz a10
-
- loop00:
- mov eax,[esi]
- add eax,[edi]
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx],eax
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec edx
- jnz loop00
- a10:
- jmp cmp00
-
- ALIGN 8
- loop01:
- //calc [ebx] and [ebx+4]
- mov edx,[esi]
- mov eax,[esi+4]
- add edx,[edi]
- add eax,[edi+4]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+4],eax
-
- //calc [ebx+8] and [ebx+12]
- mov edx,[esi+8]
- mov eax,[esi+12]
- add edx,[edi+8]
- add eax,[edi+12]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+8],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+12],eax
-
- //calc [ebx+16] and [ebx+20]
- mov edx,[esi+16]
- mov eax,[esi+20]
- add edx,[edi+16]
- add eax,[edi+20]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+16],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+20],eax
-
- //calc [ebx+24] and [ebx+28]
- mov edx,[esi+24]
- mov eax,[esi+28]
- add edx,[edi+24]
- add eax,[edi+28]
-
- add edx,ecx
- mov ecx,edx
- and edx,BASE30_MASK
- shr ecx,30
- mov [ebx+24],edx
-
- add eax,ecx
- mov ecx,eax
- and eax,BASE30_MASK
- shr ecx,30
- mov [ebx+28],eax
-
- lea esi,[esi+32]
- lea edi,[edi+32]
- lea ebx,[ebx+32]
- cmp00:
- cmp esi,[esp]
- jb loop01
-
- thisExit:
- add esp,4
- pop ebx
- pop esi
- pop edi
- mov eax,0
- adc eax,0
- ret
- }
- }
复制代码 // 使用4路循环展开- _declspec(naked)
- DWORD binAdd_Base30_MMX_4way_unroll(DWORD *dst,
- const DWORD *src1,
- const DWORD *src2,
- int size)
- {
- #define STACK_FRAME 12
- #undef BASE30_MASK
- #define BASE30_MASK 0x3fffffff
- _asm
- {
- push esi
- push edi
- push ebx
-
- mov ebx,dword ptr [esp+4+STACK_FRAME] //dst
- mov esi,dword ptr [esp+8+STACK_FRAME] //src1
- mov edi,dword ptr [esp+12+STACK_FRAME] //src2
- mov ecx,dword ptr [esp+16+STACK_FRAME] //size
-
- pxor mm0,mm0
- mov eax,BASE30_MASK
- movd mm7,eax
-
- mov eax,ecx
- shr ecx,2
- and eax,3
- jz a10
-
- loop0:
- movd mm1, [esi]
- movd mm2, [edi]
- paddq mm0, mm1
- paddq mm0, mm2
-
- movq mm1, mm0
- psrlq mm0, 30
- pand mm1, mm7
- movd [ebx],mm1
-
- lea esi,[esi+4]
- lea edi,[edi+4]
- lea ebx,[ebx+4]
- dec eax
- jnz loop0
- a10:
- or ecx,ecx
- jz thisExit
- loop1:
- //calc [ebx] and [ebx+4]
- movd mm1,[esi]
- movd mm3,[edi]
- movd mm2,[esi+4]
- movd mm4,[edi+4]
- paddq mm1,mm3
- paddq mm2,mm4
-
- paddq mm0,mm1
- movq mm3,mm0
- psrlq mm0,30
- pand mm3,mm7
- movd [ebx],mm3
-
- paddq mm0,mm2
- movq mm4,mm0
- psrlq mm0,30
- pand mm4,mm7
- movd [ebx+4],mm4
-
- //calc [ebx+8] and [ebx+12]
-
- movd mm1,[esi+8]
- movd mm3,[edi+8]
- movd mm2,[esi+12]
- movd mm4,[edi+12]
- paddq mm1,mm3
- paddq mm2,mm4
-
- paddq mm0,mm1
- movq mm3,mm0
- psrlq mm0,30
- pand mm3,mm7
- movd [ebx+8],mm3
-
- paddq mm0,mm2
- movq mm4,mm0
- psrlq mm0,30
- pand mm4,mm7
- movd [ebx+12],mm4
-
- add esi, 16
- add edi, 16
- add ebx, 16
- sub ecx,1
- jne loop1
-
- thisExit:
-
- pop ebx
- pop edi
- pop esi
-
- movd eax,mm0
- emms
- ret
- }
- }
复制代码 |
|