- 注册时间
- 2010-10-22
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 2292
- 在线时间
- 小时
|
楼主 |
发表于 2011-6-23 18:52:01
|
显示全部楼层
发现上面的调用还不够刺激,直接试一下GotoBlas2的内核。
sasum 用于求单精度浮点数组中所有元素的绝对值之和。
比如:float X1[4]={1.0, 2.0, 7.0, -8.0} ; 所有的元素绝对值之和为18.
在语法上改写sasum内核,CPU必须支持SSE2(代码末尾用到的haddps指令还需要SSE3),保存下面的代码为:sasum_k.asm
- ;//GotoBlas2内核之sasum
- .686p ; accept the Pentium Pro (P6) instruction set, including privileged instructions
- .xmm ; enable the XMM (SSE-family) instructions used by the kernel below
- .model flat,c ; flat 32-bit memory model, C naming and calling convention
- option casemap :none ; keep identifiers case-sensitive
-
- .code ; start of the code segment
-
- sasum_k proc ; sum of absolute values of N single-precision floats read from X with element stride INCX; result is left in st(0)
- ; stack arguments in order: N, X, INCX; the result is 0.0 when N <= 0 or INCX <= 0
- push esi ; esi and ebx are used as pointer/stride below, so save them
- push ebx
- mov ecx, [esp+8+4] ; ecx = N (element count); +8 skips the two pushes, +4 the return address
- mov esi, [esp+8+8] ; esi = X (pointer to the first float)
- mov ebx, [esp+8+0ch] ; ebx = INCX (stride, in elements)
- xorps xmm0, xmm0 ; accumulator 0 = 0.0 (also the value returned by the early exits)
- test ecx, ecx
- jle loc_2A0 ; N <= 0: return 0.0
- test ebx, ebx
- jle loc_2A0 ; INCX <= 0: return 0.0
- xorps xmm1, xmm1 ; accumulator 1 = 0.0
- pcmpeqb xmm3, xmm3 ; xmm3 = all bits set
- psrld xmm3, 1 ; xmm3 = 7FFFFFFFh in each dword: AND-mask that clears a float's sign bit
- lea ebx, ds:0[ebx*4] ; convert the stride from elements to bytes
- cmp ebx, 4
- jnz loc_1F0 ; stride != 1 element: use the scalar strided path
- sub esi, 0FFFFFF80h ; esi += 128: bias the pointer so the loads below address [esi-80h]..[esi+30h] (presumably to keep displacements within a signed byte)
- cmp ecx, 3
- jle loc_1B8 ; 3 or fewer elements: only the 2- and 1-element tail steps apply
- test esi, 4 ; is the pointer off 8-byte alignment? (the 128-byte bias does not change the low address bits)
- jz short loc_68
- movss xmm0, dword ptr [esi-80h] ; peel one element straight into accumulator 0 (it is still zero here)
- andps xmm0, xmm3 ; |x|
- add esi, 4
- dec ecx
- jle loc_290 ; nothing left: go reduce
- nop ; filler, presumably alignment padding for the next label
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_68:
- test esi, 8 ; is the pointer off 16-byte alignment?
- jz short loc_88
- movsd xmm1, qword ptr [esi-80h] ; peel two elements straight into accumulator 1 (it is still zero here)
- andps xmm1, xmm3 ; |x|
- add esi, 8
- sub ecx, 2
- jle loc_290 ; nothing left: go reduce
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_88:
- mov eax, ecx ; from here on the movaps loads require 16-byte alignment (assumes X was at least 4-byte aligned -- TODO confirm with callers)
- sar eax, 5 ; eax = number of full 32-element blocks
- jle loc_148 ; no full block: go to the tail steps
- movaps xmm4, xmmword ptr [esi-80h] ; preload the first 16 floats of the block
- movaps xmm5, xmmword ptr [esi-70h]
- movaps xmm6, xmmword ptr [esi-60h]
- movaps xmm7, xmmword ptr [esi-50h]
- dec eax
- jle short loc_100 ; exactly one block: skip the loop and drain it
- db 66h ; operand-size prefix for the following nop (two-byte filler)
- nop
-
- loc_A8:
- andps xmm4, xmm3 ; main loop: each pass accumulates 32 floats and preloads the first 16 of the next block
- addps xmm0, xmm4 ; even vectors go to accumulator 0
- movaps xmm4, xmmword ptr [esi-40h] ; reload the register with the second half of the current block
- andps xmm5, xmm3
- addps xmm1, xmm5 ; odd vectors go to accumulator 1
- movaps xmm5, xmmword ptr [esi-30h]
- andps xmm6, xmm3
- addps xmm0, xmm6
- movaps xmm6, xmmword ptr [esi-20h]
- andps xmm7, xmm3
- addps xmm1, xmm7
- movaps xmm7, xmmword ptr [esi-10h]
- andps xmm4, xmm3
- addps xmm0, xmm4
- movaps xmm4, xmmword ptr [esi] ; start preloading the next block (offsets 0..30h are past the current 128 bytes)
- andps xmm5, xmm3
- addps xmm1, xmm5
- movaps xmm5, xmmword ptr [esi+10h]
- andps xmm6, xmm3
- addps xmm0, xmm6
- movaps xmm6, xmmword ptr [esi+20h]
- andps xmm7, xmm3
- addps xmm1, xmm7
- movaps xmm7, xmmword ptr [esi+30h]
- sub esi, 0FFFFFF80h ; esi += 128: advance one block
- dec eax
- jg short loc_A8
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_100:
- andps xmm4, xmm3 ; last full block: consume the 16 preloaded floats and the remaining 16, with no preload beyond the block
- addps xmm0, xmm4
- movaps xmm4, xmmword ptr [esi-40h]
- andps xmm5, xmm3
- addps xmm1, xmm5
- movaps xmm5, xmmword ptr [esi-30h]
- andps xmm6, xmm3
- addps xmm0, xmm6
- movaps xmm6, xmmword ptr [esi-20h]
- andps xmm7, xmm3
- addps xmm1, xmm7
- movaps xmm7, xmmword ptr [esi-10h]
- andps xmm4, xmm3
- addps xmm0, xmm4
- andps xmm5, xmm3
- addps xmm1, xmm5
- andps xmm6, xmm3
- addps xmm0, xmm6
- andps xmm7, xmm3
- addps xmm1, xmm7
- sub esi, 0FFFFFF80h ; esi += 128: step past the last full block
- nop ; filler
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_148:
- test ecx, 10h ; tail: bit 4 of the remaining count set -> 16 more floats
- jz short loc_180
- movaps xmm4, xmmword ptr [esi-80h]
- andps xmm4, xmm3
- addps xmm0, xmm4
- movaps xmm5, xmmword ptr [esi-70h]
- andps xmm5, xmm3
- addps xmm1, xmm5
- movaps xmm6, xmmword ptr [esi-60h]
- andps xmm6, xmm3
- addps xmm0, xmm6
- movaps xmm7, xmmword ptr [esi-50h]
- andps xmm7, xmm3
- addps xmm1, xmm7
- add esi, 40h ; advance 64 bytes (16 floats)
- nop ; filler
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_180:
- test ecx, 8 ; tail: bit 3 set -> 8 more floats
- jz short loc_1A0
- movaps xmm4, xmmword ptr [esi-80h]
- andps xmm4, xmm3
- addps xmm0, xmm4
- movaps xmm5, xmmword ptr [esi-70h]
- andps xmm5, xmm3
- addps xmm1, xmm5
- add esi, 20h ; advance 32 bytes (8 floats)
- nop ; filler
-
- loc_1A0:
- test ecx, 4 ; tail: bit 2 set -> 4 more floats
- jz short loc_1B8
- movaps xmm4, xmmword ptr [esi-80h]
- andps xmm4, xmm3
- addps xmm0, xmm4
- add esi, 10h ; advance 16 bytes (4 floats)
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_1B8:
- test ecx, 2 ; tail: bit 1 set -> 2 more floats (also the entry point for N <= 3)
- jz short loc_1D0
- movsd xmm4, qword ptr [esi-80h] ; 8-byte load of two floats; the upper half of xmm4 is zeroed
- andps xmm4, xmm3
- addps xmm1, xmm4
- add esi, 8
- db 66h ; operand-size prefix for the following nop (two-byte filler)
- nop
-
- loc_1D0:
- test ecx, 1 ; tail: bit 0 set -> 1 more float
- jz loc_290
- movss xmm4, dword ptr [esi-80h] ; scalar load; the upper three lanes of xmm4 are zeroed
- andps xmm4, xmm3
- addps xmm0, xmm4
- jmp loc_290 ; contiguous path done: go reduce
- align 10h ; start the strided path on a 16-byte boundary
-
- loc_1F0:
- mov eax, ecx ; strided path: esi is the unbiased pointer, ebx the stride in bytes
- sar eax, 3 ; eax = number of 8-element groups
- jle short loc_270 ; fewer than 8 elements: remainder loop only
- mov esi, esi ; filler: no effect
- lea edi, [edi+0] ; filler: edi is unchanged
-
- loc_200:
- movss xmm4, dword ptr [esi] ; 8x unrolled scalar loop; elements alternate between accumulators 0 and 1
- add esi, ebx ; step to the next element
- andps xmm4, xmm3 ; |x|
- addss xmm0, xmm4
- movss xmm5, dword ptr [esi]
- add esi, ebx
- andps xmm5, xmm3
- addss xmm1, xmm5
- movss xmm6, dword ptr [esi]
- add esi, ebx
- andps xmm6, xmm3
- addss xmm0, xmm6
- movss xmm7, dword ptr [esi]
- add esi, ebx
- andps xmm7, xmm3
- addss xmm1, xmm7
- movss xmm4, dword ptr [esi]
- add esi, ebx
- andps xmm4, xmm3
- addss xmm0, xmm4
- movss xmm5, dword ptr [esi]
- add esi, ebx
- andps xmm5, xmm3
- addss xmm1, xmm5
- movss xmm6, dword ptr [esi]
- add esi, ebx
- andps xmm6, xmm3
- addss xmm0, xmm6
- movss xmm7, dword ptr [esi]
- add esi, ebx
- andps xmm7, xmm3
- addss xmm1, xmm7
- dec eax
- jg short loc_200
- nop ; filler
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_270:
- and ecx, 7 ; ecx = elements left over after the 8-element groups
- jle short loc_290 ; none left: go reduce
- lea esi, [esi+0] ; filler: esi is unchanged
- lea edi, [edi+0] ; filler: edi is unchanged
-
- loc_280:
- movss xmm4, dword ptr [esi] ; remainder loop: one element per pass
- andps xmm4, xmm3
- addss xmm0, xmm4
- add esi, ebx
- dec ecx
- jg short loc_280
-
- loc_290:
- addps xmm0, xmm1 ; merge the two accumulators lane by lane
- haddps xmm0, xmm0 ; two horizontal adds leave the total of all four lanes in lane 0 (haddps is an SSE3 instruction)
- haddps xmm0, xmm0
- nop ; filler
- lea esi, [esi+0] ; filler: esi is unchanged
-
- loc_2A0:
- movss dword ptr [esp+8+4], xmm0 ; spill the result into the stack slot that held N (used as scratch)
- fld dword ptr [esp+8+4] ; load it onto the x87 stack: the return value is in st(0)
- pop ebx ; restore the saved registers in reverse order
- pop esi
- ret
- sasum_k endp
-
- end
复制代码 |
|