liangbch
发表于 2008-4-16 15:22:46
回29楼,当然!
无心人
发表于 2008-4-16 15:30:59
:P
那,怎么说呢
我觉得为了一点速度的提高
有点不值吧
无心人
发表于 2008-4-16 15:31:56
而且,2^32也可以一次加2^30个整数而不溢出啊
无心人
发表于 2008-4-16 15:35:51
另外
有一个操作2^30基是无法高效的
就是x=x+1操作
liangbch
发表于 2008-4-16 16:35:14
为了阅读方便,现将23楼附件中的2进制 大数加法的代码给出。
#define PARAM_SIZE 16
#define PARAM_SRC2 12
#define PARAM_SRC1 8
#define PARAM_DST4
//1. 普通的加法,不使用循环展开_declspec(naked)
DWORD binAdd_ALU(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
_asm
{
push edi
push esi
mov edi,
mov esi,
mov edx,
mov ecx,
orecx,ecx
jz thisExit
loop00:
mov eax,
adc eax,
mov ,eax
lea esi,
lea edx,
lea edi,
dec ecx
jnz loop00
thisExit:
mov eax,0
adc eax,0
pop esi
pop edi
ret
}
}//使用4路循环展开的MMX版本
// 循环主题采用yaos的代码,(有bug,己经fix),liangbch加上了收尾部分和变量初始化部分
// 使用4路循环展开_declspec(naked)
DWORD binAdd_Yaos_MMX_4way_unroll(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
#define STACK_FRAME 12
_asm
{
push esi
push edi
push ebx
mov ebx,dword ptr //dst
mov esi,dword ptr //src1
mov edi,dword ptr //src2
mov ecx,dword ptr //size
pxor mm0,mm0
mov eax,ecx
shr ecx,2
and eax,3
jza10
loop0:
movd mm1,
movd mm2,
paddq mm0,mm1
paddq mm0,mm2
movd , mm0
psrlq mm0, 32
lea esi,
lea edi,
lea ebx,
dec eax
jnz loop0
a10:
or ecx,ecx
jz thisExit
loop1:
movd mm1,
movd mm2,
paddq mm1,mm2
movd mm3,
movd mm4,
paddq mm3, mm4
movd mm5,
movd mm6,
paddq mm5, mm6
movd mm7,
movd mm2,
paddq mm7, mm2
paddq mm0, mm1
movd , mm0
psrlq mm0, 32
paddq mm0, mm3
movd , mm0
psrlq mm0, 32
paddq mm0, mm5
movd , mm0
psrlq mm0, 32
paddq mm0, mm7
movd , mm0
psrlq mm0, 32
add esi, 16
add edi, 16
add ebx, 16
sub ecx,1
jne loop1
thisExit:
pop ebx
pop edi
pop esi
movd eax,mm0
emms
ret
}
}//使用8路循环展开的ALU版本
/*
下面的代码由原始的GMP代码转译而来,限於对GMP的理解的限制,
部分代码不得不重写,GMP源代码请参见 aors_n.asm
使用8路循环展开
*/_declspec(naked)
DWORD binAdd_GMP_ALU_8way_unroll(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
_asm
{
push edi
push esi
mov edi,
mov esi,
mov edx,
mov ecx,
mov eax,ecx
shr ecx,3
and eax,7
jza10
push ecx
movecx,eax
loop00:
mov eax,
adc eax,
mov ,eax
lea esi,
lea edx,
lea edi,
dec ecx
jnz loop00
pop ecx
a10:
mov eax,0
adc eax,0
or ecx,ecx
jz thisExit
ALIGN 8
loop01:
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
mov eax,
adc eax,
mov ,eax
lea esi,
lea edx,
lea edi,
dec ecx
jnz loop01
mov eax,0
adc eax,0
thisExit:
pop esi
pop edi
ret
}
}
无心人
发表于 2008-4-16 16:39:54
:lol
你还没回答我问题呢?
2^32的MMX版本也是可以多次加不进位的
X = X+1的2^30版本如何高效?
liangbch
发表于 2008-4-16 16:43:45
// 接楼上,给出剩余的几个采用2^30进制的大数加法的函数
//基为2^30的版本,不采用循环展开_declspec(naked)
DWORD binAdd_base30_ALU(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
#undefBASE30_MASK
#define BASE30_MASK 0x3fffffff
_asm
{
push edi
push esi
push ebx
mov ebx,
mov esi,
mov edi,
mov ecx,
xor edx,edx //clear carry
orecx,ecx
jz thisExit
loop00:
mov eax,
add eax,
add eax,edx
mov edx,eax
and eax,BASE30_MASK
shr edx,30
mov ,eax
lea esi,
lea edi,
lea ebx,
dec ecx
jnz loop00
thisExit:
pop ebx
pop esi
pop edi
mov eax,0
adc eax,0
ret
}
}//基为2^30的版本,采用4路循环展开_declspec(naked)
DWORD binAdd_base30_ALU_4way_unroll(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
#undefBASE30_MASK
#define BASE30_MASK 0x3fffffff
_asm
{
push edi
push esi
push ebx
subesp,4
mov ebx,
mov esi,
mov edi,
mov edx,
lea eax,
mov ,eax
xor ecx,ecx //clear carry
and edx,3
jza10
loop00:
mov eax,
add eax,
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
lea esi,
lea edi,
lea ebx,
dec edx
jnz loop00
a10:
jmp cmp00
ALIGN 8
loop01:
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
lea esi,
lea edi,
lea ebx,
cmp00:
cmp esi,
jb loop01
thisExit:
add esp,4
pop ebx
pop esi
pop edi
mov eax,0
adc eax,0
ret
}
}//基为2^30的版本,采用8路循环展开_declspec(naked)
DWORD binAdd_base30_ALU_8way_unroll(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
#undefBASE30_MASK
#define BASE30_MASK 0x3fffffff
_asm
{
push edi
push esi
push ebx
subesp,4
mov ebx,
mov esi,
mov edi,
mov edx,
lea eax,
mov ,eax
xor ecx,ecx
and edx,7
jza10
loop00:
mov eax,
add eax,
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
lea esi,
lea edi,
lea ebx,
dec edx
jnz loop00
a10:
jmp cmp00
ALIGN 8
loop01:
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
//calc and
mov edx,
mov eax,
add edx,
add eax,
add edx,ecx
mov ecx,edx
and edx,BASE30_MASK
shr ecx,30
mov ,edx
add eax,ecx
mov ecx,eax
and eax,BASE30_MASK
shr ecx,30
mov ,eax
lea esi,
lea edi,
lea ebx,
cmp00:
cmp esi,
jb loop01
thisExit:
add esp,4
pop ebx
pop esi
pop edi
mov eax,0
adc eax,0
ret
}
}// 使用4路循环展开_declspec(naked)
DWORD binAdd_Base30_MMX_4way_unroll(DWORD *dst,
const DWORD *src1,
const DWORD *src2,
int size)
{
#define STACK_FRAME 12
#undefBASE30_MASK
#define BASE30_MASK 0x3fffffff
_asm
{
push esi
push edi
push ebx
mov ebx,dword ptr //dst
mov esi,dword ptr //src1
mov edi,dword ptr //src2
mov ecx,dword ptr //size
pxor mm0,mm0
moveax,BASE30_MASK
movd mm7,eax
mov eax,ecx
shr ecx,2
and eax,3
jza10
loop0:
movd mm1,
movd mm2,
paddq mm0,mm1
paddq mm0,mm2
movq mm1, mm0
psrlq mm0, 30
pand mm1, mm7
movd ,mm1
lea esi,
lea edi,
lea ebx,
dec eax
jnz loop0
a10:
or ecx,ecx
jz thisExit
loop1:
//calc and
movd mm1,
movd mm3,
movd mm2,
movd mm4,
paddq mm1,mm3
paddq mm2,mm4
paddq mm0,mm1
movq mm3,mm0
psrlq mm0,30
pand mm3,mm7
movd ,mm3
paddq mm0,mm2
movq mm4,mm0
psrlq mm0,30
pand mm4,mm7
movd ,mm4
//calc and
movd mm1,
movd mm3,
movd mm2,
movd mm4,
paddq mm1,mm3
paddq mm2,mm4
paddq mm0,mm1
movq mm3,mm0
psrlq mm0,30
pand mm3,mm7
movd ,mm3
paddq mm0,mm2
movq mm4,mm0
psrlq mm0,30
pand mm4,mm7
movd ,mm4
add esi, 16
add edi, 16
add ebx, 16
sub ecx,1
jne loop1
thisExit:
pop ebx
pop edi
pop esi
movd eax,mm0
emms
ret
}
}
liangbch
发表于 2008-4-16 17:01:23
1.
2^32的MMX版本也是可以多次加不进位的
我不带明白你指的是什么,是大数乘法还是大数加法。
如果是大数加法,使用ALU指令必然会进位。
我们不但要考虑MMX指令,更多的情况下,我们需要考虑ALU指令,我们不能假定我的程序总是运行在支持SSE2指令的计算机。我的程序应该自动侦测CPU,在任何386以上的CPU上运行。
2.对于大数x,计算x=x+1,并不是一个常用的操作,一般情况下,其复杂度为常数,不值得考虑。
无心人
发表于 2008-4-16 17:04:17
哈
当然是加法了
就你说的 A+B+C+D
。。。。。。。。。。。。。。。。。。。。。。。。
2.对于大数x,计算x=x+1,并不是一个常用的操作,一般情况下,其复杂度为常数,不值得考虑。这个不是吧,在不等长的加法里是常见的操作啊
liangbch
发表于 2008-4-16 17:07:01
在99.99999999...%的情况下,x=x+1,不会连续进位,所以我说是一个常数时间。