- 注册时间
- 2008-2-6
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 51573
- 在线时间
- 小时
|
楼主 |
发表于 2008-3-18 14:12:09
|
显示全部楼层
运行时间高于MMX寄存器版本
指令数多16条
/*
 * UInt128x128To256_SSE2_42F
 *
 * Multiplies two 128-bit unsigned integers and stores the full 256-bit
 * product.
 *
 *   result : receives 8 x 32-bit limbs, least-significant limb first.
 *   left   : 4 x 32-bit limbs of the first operand, least-significant first.
 *   right  : 4 x 32-bit limbs of the second operand, least-significant first.
 *
 * Method: schoolbook multiplication, one output column at a time. Every
 * 32x32->64 partial product is formed with PMULUDQ; only the low 64 bits of
 * each XMM register are used. For each column the partial products are split
 * into low/high dwords, the low dwords are summed to produce the output limb,
 * and the high dwords plus the overflow of the low sum become the carry that
 * is fed into the next column (always held in xmm0).
 *
 * Aliasing: the columns are written into a local buffer and copied to
 * `result` only after every operand limb has been read, so `result` may
 * overlap `left` and/or `right`.
 *
 * Register roles inside the __asm block:
 *   esi = left, edi = right, ebx = local product buffer,
 *   eax = 0xFFFFFFFF (moved into an XMM register whenever a low-dword mask
 *         is needed), xmm0 = running carry, xmm1..xmm7 = scratch.
 *
 * NOTE(review): written for the 32-bit MSVC inline assembler; this relies on
 * the compiler preserving EBX/ESI/EDI around the __asm block -- confirm
 * before porting to another toolchain.
 */
void UInt128x128To256_SSE2_42F( UINT32 * const result,
                                const UINT32 * const left,
                                const UINT32 * const right )
{
    UINT32 product[8];  /* private output buffer: keeps the routine alias-safe */
    int i;

    __asm
    {
        mov     esi, left
        mov     edi, right
        lea     ebx, product
        mov     eax, -1                     // low-dword mask source (0xFFFFFFFF)

        // ---- column 0: l0*r0 ------------------------------------------
        movd    xmm0, dword ptr [esi]
        movd    xmm2, dword ptr [edi]
        pmuludq xmm0, xmm2
        movd    dword ptr [ebx], xmm0       // limb 0
        psrlq   xmm0, 32                    // xmm0 = carry into column 1

        // ---- column 1: l0*r1, l1*r0 -----------------------------------
        movd    xmm1, dword ptr [esi]
        movd    xmm3, dword ptr [edi+4]
        pmuludq xmm1, xmm3
        movd    xmm2, dword ptr [edi]
        movd    xmm4, dword ptr [esi+4]
        pmuludq xmm2, xmm4
        // sum = xmm0 + xmm1 + xmm2
        paddq   xmm0, xmm1                  // carry (< 2^32) + one product: still fits in 64 bits
        movd    xmm1, eax
        pand    xmm1, xmm0
        psrlq   xmm0, 32                    // split: xmm0 (high) : xmm1 (low)
        movd    xmm3, eax
        pand    xmm3, xmm2
        psrlq   xmm2, 32                    // split: xmm2 (high) : xmm3 (low)
        paddq   xmm1, xmm3                  // sum of low dwords
        paddq   xmm0, xmm2                  // sum of high dwords
        movd    dword ptr [ebx+4], xmm1     // limb 1
        psrlq   xmm1, 32                    // overflow of the low-dword sum
        paddq   xmm0, xmm1                  // xmm0 = carry into column 2

        // ---- column 2: l1*r1, l0*r2, l2*r0 ----------------------------
        movd    xmm1, dword ptr [esi+4]
        movd    xmm4, dword ptr [edi+4]
        pmuludq xmm1, xmm4
        movd    xmm2, dword ptr [esi]
        movd    xmm5, dword ptr [edi+8]
        pmuludq xmm2, xmm5
        movd    xmm3, dword ptr [edi]
        movd    xmm7, dword ptr [esi+8]
        pmuludq xmm3, xmm7
        // sum = xmm0 + xmm1 + xmm2 + xmm3
        movd    xmm6, eax
        pand    xmm6, xmm0
        psrlq   xmm0, 32                    // split: xmm0 (high) : xmm6 (low)
        movd    xmm5, eax
        pand    xmm5, xmm1
        psrlq   xmm1, 32                    // split: xmm1 (high) : xmm5 (low)
        paddq   xmm6, xmm5
        paddq   xmm0, xmm1
        movd    xmm4, eax
        pand    xmm4, xmm2
        psrlq   xmm2, 32                    // split: xmm2 (high) : xmm4 (low)
        paddq   xmm6, xmm4
        paddq   xmm0, xmm2
        movd    xmm7, eax
        pand    xmm7, xmm3
        psrlq   xmm3, 32                    // split: xmm3 (high) : xmm7 (low)
        paddq   xmm6, xmm7
        paddq   xmm0, xmm3
        movd    dword ptr [ebx+8], xmm6     // limb 2
        psrlq   xmm6, 32
        paddq   xmm0, xmm6                  // xmm0 = carry into column 3

        // ---- column 3: l0*r3, l1*r2, l2*r1, l3*r0 ---------------------
        movd    xmm1, dword ptr [esi]
        movd    xmm5, dword ptr [edi+12]
        pmuludq xmm1, xmm5
        movd    xmm2, dword ptr [esi+4]
        movd    xmm6, dword ptr [edi+8]
        pmuludq xmm2, xmm6
        movd    xmm3, dword ptr [esi+8]
        movd    xmm7, dword ptr [edi+4]
        pmuludq xmm3, xmm7
        movd    xmm4, dword ptr [esi+12]
        movd    xmm6, dword ptr [edi]
        pmuludq xmm4, xmm6
        // sum = xmm0 + xmm1 + xmm2 + xmm3 + xmm4
        // Here the incoming carry is not split: the low dwords are added
        // straight onto xmm0 and the high dwords are collected in xmm1.
        movd    xmm5, eax
        pand    xmm5, xmm1
        psrlq   xmm1, 32                    // split: xmm1 (high) : xmm5 (low)
        paddq   xmm0, xmm5
        movd    xmm6, eax
        pand    xmm6, xmm2
        psrlq   xmm2, 32                    // split: xmm2 (high) : xmm6 (low)
        paddq   xmm0, xmm6
        paddq   xmm1, xmm2
        movd    xmm7, eax
        pand    xmm7, xmm3
        psrlq   xmm3, 32                    // split: xmm3 (high) : xmm7 (low)
        paddq   xmm0, xmm7
        paddq   xmm1, xmm3
        movd    xmm5, eax
        pand    xmm5, xmm4
        psrlq   xmm4, 32                    // split: xmm4 (high) : xmm5 (low)
        paddq   xmm0, xmm5
        paddq   xmm1, xmm4
        movd    dword ptr [ebx+12], xmm0    // limb 3
        psrlq   xmm0, 32
        paddq   xmm0, xmm1                  // xmm0 = carry into column 4

        // ---- column 4: l1*r3, l2*r2, l3*r1 ----------------------------
        movd    xmm2, dword ptr [esi+4]
        movd    xmm4, dword ptr [edi+12]
        pmuludq xmm2, xmm4
        movd    xmm1, dword ptr [esi+8]
        movd    xmm5, dword ptr [edi+8]
        pmuludq xmm1, xmm5
        movd    xmm3, dword ptr [esi+12]
        movd    xmm6, dword ptr [edi+4]
        pmuludq xmm3, xmm6
        // sum = xmm0 + xmm1 + xmm2 + xmm3
        movd    xmm7, eax
        pand    xmm7, xmm0
        psrlq   xmm0, 32                    // split: xmm0 (high) : xmm7 (low)
        movd    xmm6, eax
        pand    xmm6, xmm1
        psrlq   xmm1, 32                    // split: xmm1 (high) : xmm6 (low)
        paddq   xmm7, xmm6
        paddq   xmm0, xmm1
        movd    xmm5, eax
        pand    xmm5, xmm2
        psrlq   xmm2, 32                    // split: xmm2 (high) : xmm5 (low)
        paddq   xmm7, xmm5
        paddq   xmm0, xmm2
        movd    xmm4, eax
        pand    xmm4, xmm3
        psrlq   xmm3, 32                    // split: xmm3 (high) : xmm4 (low)
        paddq   xmm7, xmm4
        paddq   xmm0, xmm3
        movd    dword ptr [ebx+16], xmm7    // limb 4
        psrlq   xmm7, 32
        paddq   xmm0, xmm7                  // xmm0 = carry into column 5

        // ---- column 5: l2*r3, l3*r2 -----------------------------------
        movd    xmm1, dword ptr [esi+8]
        movd    xmm3, dword ptr [edi+12]
        pmuludq xmm1, xmm3
        movd    xmm2, dword ptr [esi+12]
        movd    xmm4, dword ptr [edi+8]
        pmuludq xmm2, xmm4
        // sum = xmm0 + xmm1 + xmm2
        movd    xmm7, eax
        pand    xmm7, xmm0
        psrlq   xmm0, 32                    // split: xmm0 (high) : xmm7 (low)
        movd    xmm6, eax
        pand    xmm6, xmm1
        psrlq   xmm1, 32                    // split: xmm1 (high) : xmm6 (low)
        paddq   xmm7, xmm6
        paddq   xmm0, xmm1
        movd    xmm5, eax
        pand    xmm5, xmm2
        psrlq   xmm2, 32                    // split: xmm2 (high) : xmm5 (low)
        paddq   xmm7, xmm5
        paddq   xmm0, xmm2
        movd    dword ptr [ebx+20], xmm7    // limb 5
        psrlq   xmm7, 32
        paddq   xmm0, xmm7                  // xmm0 = carry into column 6

        // ---- column 6: l3*r3 ------------------------------------------
        movd    xmm1, dword ptr [esi+12]
        movd    xmm2, dword ptr [edi+12]
        pmuludq xmm1, xmm2
        // sum = xmm0 + xmm1
        movd    xmm6, eax
        pand    xmm6, xmm0
        psrlq   xmm0, 32                    // split: xmm0 (high) : xmm6 (low)
        movd    xmm7, eax
        pand    xmm7, xmm1
        psrlq   xmm1, 32                    // split: xmm1 (high) : xmm7 (low)
        paddq   xmm6, xmm7
        paddq   xmm0, xmm1
        movd    dword ptr [ebx+24], xmm6    // limb 6
        psrlq   xmm6, 32
        paddq   xmm0, xmm6                  // final carry
        movd    dword ptr [ebx+28], xmm0    // limb 7
    }

    /* Publish the product only now, after all operand reads are finished. */
    for ( i = 0; i < 8; ++i )
    {
        result[i] = product[i];
    }
}
|