- 注册时间
- 2007-12-28
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 12785
- 在线时间
- 小时
|
楼主 |
发表于 2011-1-2 22:06:50
|
显示全部楼层
给出6个单线程函数的源代码:
- #include <stdio.h>
- #include <stdlib.h>
- #include <assert.h>
-
- #include "defs.h"
- #include "findMaxMin.h"
-
- void getMaxMin1(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- int i;
- *min=~0;
- *max=0;
- for (i=0; i<len; i++)
- {
- if ( pData[ i ] > *max)
- *max=pData[ i ];
- if (pData[ i ]< *min)
- *min=pData[ i ];
- }
- }
-
-
- void getMaxMin1_asm(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- DWORD _min, _max;
- __asm
- {
- mov esi, pData
- mov ecx, len
- lea edi, [esi+ecx*4]
-
- mov eax, 0xffffffff //min
- mov edx, 0 //max
-
- loop_start:
- cmp esi, edi
- jge next10
-
- cmp [esi], eax
- cmovb eax,[esi]
-
- cmp [esi], edx
- cmova edx,[esi]
-
- add esi, 4
- jmp loop_start
- next10:
- mov _min, eax
- mov _max, edx
- }
-
- *min=_min;
- *max=_max;
- }
-
-
- void getMaxMin2(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- int i;
-
- if (len % 2 ==0 )
- {
- if ( pData[0] > pData[1] )
- {
- *max= pData[0];
- *min= pData[1];
- }
- else
- {
- *max= pData[1];
- *min= pData[0];
- }
- i=2;
- }
- else
- {
- *max= *min= pData[0];
- i=1;
- }
-
- for (;i<len;i+=2)
- {
- if ( pData[ i ] > pData[ i+1] )
- {
- if ( pData[ i ]> *max)
- *max=pData[ i ];
-
- if ( pData[ i+1]< *min)
- *min=pData[ i+1];
- }
- else
- {
- if ( pData[ i ] < *min)
- *min=pData[ i ];
-
- if ( pData[ i+1]> *max)
- *max=pData[ i+1];
- }
- }
- }
-
- void getMaxMin2_asm1(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- DWORD _min, _max;
- __asm
- {
- mov esi, pData
- mov ecx, len
- lea edi, [esi+ecx*4]
-
- and ecx,1
- jz next10
-
- //next00: next % 2 ==1
- mov eax, [esi]
- mov edx, [esi]
- add esi,4
- jmp next20
-
- next10: // len % 2 ==0
- mov eax,[esi] // eax: min
- mov edx,[esi+4] // edx, max
- cmp eax,edx
- jb next15
- xchg eax,edx
- next15:
- add esi,8
- jmp next20
-
- loop_start:
- mov ebx, [esi] //ebx=min(pData [i] , pData [ i+1]
- mov ecx, [esi+4] //ebx=max(pData [i] , pData [ i+1]
- cmp ebx, ecx
- cmova ebx, [esi+4]
- cmova ecx, [esi]
-
- cmp eax, ebx
- cmova eax, ebx
-
- cmp edx, ecx
- cmovb edx, ecx
-
- add esi, 8
- next20:
- cmp esi, edi
- jb loop_start
-
- mov _min, eax
- mov _max, edx
- }
- *min=_min;
- *max=_max;
- }
-
- _declspec(naked)
- void getMaxMin2_asm2(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
-
- #define PAR_PDATA 4
- #define PAR_LEN 8
- #define PAR_MIN 12
- #define PAR_MAX 16
-
- #define REG_MIN esi
- #define REG_MAX edi
-
- #define REG_CUR_TMP ecx
- #define REG_CUR_MIN eax
- #define REG_CUR_MAX edx
- #define REG_PDATA ebx // pData
- #define REG_PEND ebp // pData+len
-
- __asm
- {
- push ebx
- push ebp
- push esi
- push edi
-
- mov REG_PDATA, dword ptr[esp+16+PAR_PDATA]
- mov ecx,dword ptr[esp+16+PAR_LEN]
-
- test ecx,1
- jz next10
-
- //next00: next % 2 ==1
- lea REG_PEND, [REG_PDATA+ecx*4]
- mov REG_MIN, [REG_PDATA]
- mov REG_MAX, [REG_PDATA]
- add REG_PDATA,4
- jmp next20
-
- next10: // len % 2 ==0
- lea REG_PEND, [REG_PDATA+ecx*4]
- mov REG_MIN,[REG_PDATA]
- mov REG_MAX,[REG_PDATA+4]
- cmp REG_MIN,REG_MAX
- jb next15
- xchg REG_MIN,REG_MAX
- next15:
- add REG_PDATA,8
- jmp next20
-
- loop_start:
- mov REG_CUR_MIN, [REG_PDATA] //ebx=min(pData [i] , pData [ i+1]
- mov REG_CUR_MAX, [REG_PDATA+4] //ebx=max(pData [i] , pData [ i+1]
-
- cmp REG_CUR_MIN, REG_CUR_MAX
- mov REG_CUR_TMP, REG_CUR_MIN
-
- cmova REG_CUR_MIN, REG_CUR_MAX
- cmova REG_CUR_MAX, REG_CUR_TMP
-
- cmp REG_MIN, REG_CUR_MIN
- cmova REG_MIN, REG_CUR_MIN
-
- cmp REG_MAX, REG_CUR_MAX
- cmovb REG_MAX, REG_CUR_MAX
-
- add REG_PDATA, 8
- next20:
- cmp REG_PDATA, REG_PEND
- jb loop_start
-
- mov eax, [esp+16+PAR_MIN]
- mov edx, [esp+16+PAR_MAX]
- mov [eax], REG_MIN
- mov [edx], REG_MAX
-
- pop edi
- pop esi
- pop ebp
- pop ebx
- ret
- }
-
- }
-
- _declspec(naked)
- void getMaxMin1_SSE4(DWORD *pData,int len,DWORD *min,DWORD *max)
-
- // 使用SSE4.1 指令 计算数组的最大最小值
- // 寄存器的使用
- // eax: 存放最小值
- // edx: 存放最大值
- // xmm0: 当前取得的4个整数
- // xmm1: 存放最小值
- // xmm2: 存放最大值
-
- /* PMINUD and PMAXUD are SSE4.1 instruciton
- The usage for PMINUD
- Compares packed unsigned dword integers in the destination operand (first operand)
- and the source operand (second operand), and returns the minimum for each packed
- value in the destination operand.
-
- Operation
- IF (DEST[31:0] < SRC[31:0])
- THEN DEST[31:0] 􀃅 DEST[31:0];
- ELSE DEST[31:0] 􀃅 SRC[31:0]; FI;
-
- IF (DEST[63:32] < SRC[63:32])
- THEN DEST[63:32] 􀃅 DEST[63:32];
- ELSE DEST[63:32] 􀃅 SRC[63:32]; FI;
-
- IF (DEST[95:64] < SRC[95:64])
- THEN DEST[95:64] 􀃅 DEST[95:64];
- ELSE DEST[95:64] 􀃅 SRC[95:64]; FI;
-
- IF (DEST[127:96] < SRC[127:96])
- THEN DEST[127:96] 􀃅 DEST[127:96];
- ELSE DEST[127:96] 􀃅 SRC[127:96]; FI;
- */
-
- #define MIN_128REG xmm1
- #define MAX_128REG xmm2
- #define MIN_32REG eax
- #define MAX_32REG edx
- #define PT_REG esi
- #define END_REG edi
-
- #define PAR_PDATA 4
- #define PAR_LEN 8
- #define PAR_MIN 12
- #define PAR_MAX 16
-
- {
- __asm
- {
- push esi
- push edi
-
- mov MIN_32REG, 0xffffffff // min=0xffffffff
- xor MAX_32REG, MAX_32REG // max=0
-
- //phase1:
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- mov PT_REG, dword ptr[esp+8+PAR_PDATA] // pData
- lea END_REG, [PT_REG+ecx*4] // edi=pData + len
- jmp cmp10
-
- loop1_start:
- cmp [PT_REG], MIN_32REG
- cmovb MIN_32REG,[PT_REG]
-
- cmp [PT_REG], MAX_32REG
- cmova MAX_32REG,[PT_REG]
-
- add PT_REG, 4
- cmp10:
- test PT_REG, 0x0f
- jz phase2
-
- cmp PT_REG, END_REG
- jb loop1_start
-
- phase2:
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- mov END_REG, dword ptr[esp+8+PAR_PDATA] // pData
- lea END_REG, [END_REG+ecx*4] // edi=pData + len
- and END_REG, 0xfffffff0 // clear bit0-bit3 and make edi % 16==0
-
- movd MIN_128REG, MIN_32REG
- pshufd MIN_128REG, MIN_128REG, 00000000b // xmm1 = R0:R0:R0:R0
- movd MAX_128REG, MAX_32REG
- pshufd MAX_128REG, MAX_128REG, 00000000b // xmm2 = R0:R0:R0:R0
- jmp cmp20
-
- loop2_start:
- movdqa xmm0, xmmword ptr[PT_REG]
- PMINUD MIN_128REG, xmm0
- PMAXUD MAX_128REG, xmm0
-
- add PT_REG,16
- cmp20:
- cmp PT_REG, END_REG
- jb loop2_start
-
-
- phase3:
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit0-31(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit32-63(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit64-95(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit96-127(MIN_128REG) )
-
- //-----------------------------------------------
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit0-31(MAX_128REG))
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit32-63(MAX_128REG))
-
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit64-95(MAX_128REG))
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MIN_32REG=max(max_32REG,bit96-127(MIN_128REG))
-
- mov END_REG, dword ptr[esp+8+PAR_PDATA] // pData
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- lea END_REG, [END_REG+ecx*4] // edi=pData + len
- jmp cmp30
-
- loop3_start:
- cmp [PT_REG], MIN_32REG
- cmovb MIN_32REG,[PT_REG]
-
- cmp [PT_REG], MAX_32REG
- cmova MAX_32REG,[PT_REG]
-
- add PT_REG, 4
- cmp30:
- cmp PT_REG, END_REG
- jb loop3_start
-
- RET_VALUE:
- mov ecx, dword ptr[esp+8+PAR_MIN] //*min
- mov dword ptr [ecx],MIN_32REG
-
- mov ecx, dword ptr[esp+8+PAR_MAX] //*max
- mov dword ptr [ecx],MAX_32REG
-
- emms
- pop edi
- pop esi
-
- ret
- }
- }
复制代码 |
|