- 注册时间
- 2008-2-6
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 51573
- 在线时间
- 小时
|
楼主 |
发表于 2008-3-19 11:46:30
|
显示全部楼层
// UInt128x128To256_SSE2_54F
//
// Multiplies two 128-bit unsigned integers and stores the 256-bit product.
//
//   result : receives eight 32-bit words, written at byte offsets 0..28,
//            least-significant word first.
//   left   : four 32-bit words (16 bytes), least-significant word first.
//   right  : four 32-bit words (16 bytes), least-significant word first.
//
// Both operands are read with movdqu (no alignment requirement) into
// xmm5/xmm6 before the first store to result.
//
// Registers used by the __asm block: esi, edi, ebx, xmm0-xmm7.
//   xmm5 = left, xmm6 = right, xmm7 = mask 0x00000000FFFFFFFF in each qword.
//
// Notation in the comments below: "i:j" is the 64-bit product
// left[i]*right[j]; "X/Y" means X sits in the high qword and Y in the low
// qword of a register. Products are grouped by column k = i+j.
//
// Two idioms repeat throughout:
//   split : pshufd 00010000b + pand xmm7 turns the product held in the LOW
//           qword into { qword0 = its low dword, qword1 = its high dword };
//           pshufd 00110010b + pand xmm7 does the same for the product held
//           in the HIGH qword. pshufd 11011100b does the same without a mask
//           for a register whose upper qword is already zero.
//   carry : after a column sum is formed (qword0 = low halves + carry-in,
//           qword1 = high halves), movd stores the low dword of qword0, then
//           psrldq 4 / pshufd 11111001b / pshufd 11111100b / paddq leaves
//           (qword0 >> 32) + qword1 in the low qword (upper qword zero),
//           which is the carry into the next column.
void UInt128x128To256_SSE2_54F( UINT32 * const result,
const UINT32 * const left,
const UINT32 * const right )
{
//all-SSE2 version
__asm
{
/* (forum note) the actual code is in the next code area; login required to view */
mov esi, dword ptr [left]
mov edi, dword ptr [right]
mov ebx, dword ptr [result]
//load both 128-bit operands (unaligned loads)
movdqu xmm5, [esi]
movdqu xmm6, [edi]
//column 0: xmm0 = 1:1/0:0 (pmuludq multiplies dwords 0 and 2)
pshufd xmm0, xmm5, 11011100b //modified 2008-03-21
pshufd xmm1, xmm6, 11011100b //modified 2008-03-21
pmuludq xmm0, xmm1
//build the per-qword low-dword mask
pcmpeqd xmm7, xmm7
psrlq xmm7, 32 //00000000FFFFFFFF00000000FFFFFFFF
movd [ebx], xmm0 //store low dword of 0:0 -> result[0]
//column 1: 1:0 and 0:1
pshufd xmm1, xmm5, 00010000b
pshufd xmm2, xmm6, 00000001b
pmuludq xmm1, xmm2 //xmm1= 1:0/0:1
//xmm4 = high dword of 0:0 (carry into column 1), everything else zero
movq xmm4, xmm0
pshufd xmm4, xmm4, 11111101b
//split 0:1 (low qword of xmm1) into xmm3, 1:0 (high qword) into xmm2
pshufd xmm3, xmm1, 00010000b
pand xmm3, xmm7
pshufd xmm2, xmm1, 00110010b
pand xmm2, xmm7
paddq xmm2, xmm3 //0:1+1:0=xmm2
paddq xmm2, xmm4
psrldq xmm0, 8
//xmm0=1:1
movd [ebx+4], xmm2
//carry out of column 1 -> xmm2
psrldq xmm2, 4
pshufd xmm3, xmm2, 11111001b
pshufd xmm2, xmm2, 11111100b
paddq xmm2, xmm3
//column 2: 2:0 0:2 (plus 1:1 still held in xmm0)
pshufd xmm4, xmm5, 00100000b
pshufd xmm3, xmm6, 00000010b
pmuludq xmm4, xmm3 //xmm4=2:0/0:2
pshufd xmm3, xmm4, 00010000b
pand xmm3, xmm7
pshufd xmm4, xmm4, 00110010b
pand xmm4, xmm7
paddq xmm3, xmm4 //xmm3=2:0 + 0:2 xmm0=1:1 xmm2=carry
//split 1:1 (upper qword of xmm0 is zero, so no mask is needed)
pshufd xmm0, xmm0, 11011100b
paddq xmm3, xmm2
paddq xmm3, xmm0
movd [ebx+8], xmm3
//carry out of column 2 -> xmm0
psrldq xmm3, 4
pshufd xmm0, xmm3, 11111001b
pshufd xmm3, xmm3, 11111100b
paddq xmm0, xmm3 //xmm0=carry
//column 3: 1:2 2:1 3:0 0:3
pshufd xmm1, xmm5, 00010010b
pshufd xmm4, xmm6, 00100001b
pmuludq xmm1, xmm4
pshufd xmm2, xmm5, 00110000b
pshufd xmm3, xmm6, 00000011b
pmuludq xmm2, xmm3 //xmm0=carry xmm1=1:2/2:1 xmm2=3:0/0:3
//split and accumulate 2:1 and 1:2
pshufd xmm4, xmm1, 00010000b
pand xmm4, xmm7
pshufd xmm1, xmm1, 00110010b
pand xmm1, xmm7
paddq xmm1, xmm4
paddq xmm0, xmm1
//split and accumulate 0:3 and 3:0
pshufd xmm3, xmm2, 00010000b
pand xmm3, xmm7
pshufd xmm2, xmm2, 00110010b
pand xmm2, xmm7
paddq xmm2, xmm3
paddq xmm0, xmm2
movd [ebx+12], xmm0
//carry out of column 3 -> xmm0
psrldq xmm0, 4
pshufd xmm3, xmm0, 11111001b
pshufd xmm0, xmm0, 11111100b
paddq xmm0, xmm3 //xmm0=carry
//column 4: 1:3 2:2 3:1 (3:3 is computed here too and kept in xmm4 for column 6)
pshufd xmm1, xmm5, 00110010b
pshufd xmm3, xmm6, 00110010b
pmuludq xmm1, xmm3 //xmm1=3:3/2:2
pshufd xmm2, xmm5, 00110001b
pshufd xmm4, xmm6, 00010011b
pmuludq xmm2, xmm4 //xmm2=3:1/1:3
movdqa xmm4, xmm1
psrldq xmm4, 8 //xmm4=3:3
movq xmm1, xmm1 //xmm1=2:2 (movq onto itself zeroes the upper qword)
//split and add 3:1 and 1:3
pshufd xmm3, xmm2, 00110010b
pand xmm3, xmm7
pshufd xmm2, xmm2, 00010000b
pand xmm2, xmm7
paddq xmm2, xmm3
//split 2:2 (upper qword is zero, so no mask is needed)
pshufd xmm1, xmm1, 11011100b
paddq xmm1, xmm2
paddq xmm0, xmm1
movd [ebx+16], xmm0
//carry out of column 4 -> xmm0
psrldq xmm0, 4
pshufd xmm3, xmm0, 11111001b
pshufd xmm0, xmm0, 11111100b
paddq xmm0, xmm3 //xmm0=carry
//column 5: 2:3 3:2
pshufd xmm1, xmm5, 00100011b
pshufd xmm2, xmm6, 00110010b
pmuludq xmm1, xmm2
pshufd xmm3, xmm1, 00010000b
pand xmm3, xmm7
pshufd xmm1, xmm1, 00110010b
pand xmm1, xmm7
paddq xmm1, xmm3
paddq xmm0, xmm1
movd [ebx+20], xmm0
//carry out of column 5 -> xmm0
psrldq xmm0, 4
pshufd xmm2, xmm0, 11111001b
pshufd xmm0, xmm0, 11111100b
paddq xmm0, xmm2 //carry
//column 6: split 3:3 (saved in xmm4, upper qword zero) and add the carry
pshufd xmm4, xmm4, 11011100b
paddq xmm0, xmm4
movd [ebx+24], xmm0
//column 7: the remaining carry is the top dword of the product
psrldq xmm0, 4
pshufd xmm1, xmm0, 11111001b
pshufd xmm0, xmm0, 11111100b
paddq xmm0, xmm1
movd [ebx+28], xmm0
// ret;
}
}
//MMX 1 681 885us
//SSE2s 1 841 345us
//SSE2 1 841 528us
|