B计划之移位
对这个专题有些生疏了
先贴上很早以前的代码:
// 移位,假设 es = ds
//左移 pL << s = pA (s < 32)
void AsmShl(unsigned long * pL, unsigned long * pA, unsigned long t, unsigned long s)
{
if (t == 0)
return;
asm {
push ds
mov ebx, t
mov ecx, s
lea esi, dword ptr pL
lea edi, dword ptr pA
mov edx, 0
cld
}
AsmShl1:
asm {
lodsd
push eax
shld eax, edx, cl
pop edx
stosd
dec ebx
jnz AsmShl1
mov bl, 32
sub bl, cl
mov cl, bl
shr edx, cl
mov , edx
pop ds
}
}
//右移 pL >> s = pA (s < 32)
void AsmShr(unsigned long * pL, unsigned long * pA, unsigned long t, unsigned long s)
{
if (t == 0)
return;
asm {
push ds
mov ebx, t
mov ecx, s
lea esi, dword ptr pL
lea edi, dword ptr pA
mov edx,
add esi, 4
dec ebx
jz AsmShr2
cld
}
AsmShr1:
asm {
lodsd
shrd edx, eax, cl
xchg edx, eax
stosd
dec ebx
jnz AsmShr1
}
AsmShr2:
asm {
shr edx, cl
mov , edx
pop ds
}
}
然后考虑做出优化的代码来
可以用MMX/SSE/SSE2等指令
typedef unsigned int DWORD;

// Left shift of a multi-word number: pA = pL << s, with s < 32.
//
//   pA  destination, t dwords, least-significant dword first
//   pL  source, t dwords
//   t   number of dwords; MUST be non-zero (not checked here, the caller
//       is responsible; t == 0 would make the loop run away)
//   s   bit count, 0..31 (the CPU only uses the low 5 bits of cl)
//
// Returns the bits shifted out of the most significant dword
// (pL[t - 1] >> (32 - s), or 0 when s == 0).
//
// Each source dword is loaded before the matching destination dword is
// stored, so pA may be the same buffer as pL.
//
// Naked __cdecl function, 32-bit x86. After the three pushes below the
// stack holds: [esp+12] return address, [esp+16] pA, [esp+20] pL,
// [esp+24] t, [esp+28] s.
__declspec(naked)
DWORD __cdecl BAsmShl(DWORD * pA, DWORD * pL, DWORD t, DWORD s)
{
    __asm {
        push    ebx                         // callee-saved registers
        push    esi
        push    edi

        mov     ecx, [esp + 28]             // cl  = s
        mov     ebx, [esp + 24]             // ebx = t
        shl     ebx, 2                      // ebx = byte length
        mov     esi, [esp + 20]             // esi = pL
        add     esi, ebx                    // esi -> one past the end of pL
        mov     edi, [esp + 16]             // edi = pA
        add     edi, ebx                    // edi -> one past the end of pA
        neg     ebx                         // ebx = -bytes, counts up to 0
        xor     edx, edx                    // edx = previous source dword

    AsmShl1:
        mov     eax, dword ptr [esi + ebx]  // current source dword
        shld    eax, edx, cl                // (cur << s) | (prev >> (32 - s))
        mov     edx, dword ptr [esi + ebx]  // unshifted cur becomes prev
        mov     dword ptr [edi + ebx], eax
        add     ebx, 4
        jnz     AsmShl1

        xor     eax, eax                    // return the top s bits of the
        shld    eax, edx, cl                // last dword; stays 0 if s == 0

        pop     edi
        pop     esi
        pop     ebx
        ret
    }
}
优化1, 通用版本 好了
2#的程序测试完成了
另外,不知道ALU形式的代码做4路循环展开是否合适
下一个考虑MMX形式的
// MMX variant of the multi-word left shift: pA = pL << s, with s < 32.
//
//   pA  destination, t dwords, least-significant dword first
//   pL  source, t dwords
//   t   number of dwords; MUST be non-zero (not checked here)
//   s   bit count
//
// Returns the bits shifted out of the most significant dword.
//
// Each source dword is loaded before the matching destination dword is
// stored, so pA may be the same buffer as pL.
//
// Naked __cdecl function, 32-bit x86, no registers pushed:
// [esp+0] return address, [esp+4] pA, [esp+8] pL, [esp+12] t, [esp+16] s.
// Only eax, ecx, edx and mm0-mm2 are used; emms is issued before returning.
__declspec(naked)
DWORD __cdecl BAsmShl_MMX(DWORD * pA, DWORD * pL, DWORD t, DWORD s)
{
    __asm {
        movd    mm0, dword ptr [esp + 16]   // mm0 = s (64-bit shift count)
        mov     ecx, dword ptr [esp + 12]   // ecx = t
        mov     eax, dword ptr [esp + 4]    // eax = pA
        shl     ecx, 2                      // ecx = byte length
        add     eax, ecx                    // eax -> one past the end of pA
        mov     edx, dword ptr [esp + 8]    // edx = pL
        add     edx, ecx                    // edx -> one past the end of pL
        neg     ecx                         // ecx = -bytes, counts up to 0
        pxor    mm1, mm1                    // mm1 = carry from the dword below

    asmLoop1:
        movd    mm2, dword ptr [edx + ecx]  // zero-extended source dword
        psllq   mm2, mm0                    // low = shifted, high = carry-out
        por     mm2, mm1                    // merge carry-in into the low half
        movd    dword ptr [eax + ecx], mm2  // store the low dword
        punpckhdq mm2, mm1                  // low = carry-out, high = 0
        movq    mm1, mm2                    // becomes carry-in for next dword
        add     ecx, 4
        jnz     asmLoop1

        movd    eax, mm1                    // return the final carry-out
        emms                                // leave MMX state for x87 users
        ret
    }
}
时间几乎和ALU版本相同 Core 2 XEON 1.6G
测试内容:对1024个DWORD左移3位,重复100万次
两个版本耗时都是1.95秒
差距在0.01秒以下
大约3.07 Clock/DWORD
页:
[1]