- 注册时间
- 2007-12-28
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 12785
- 在线时间
- 小时
|
楼主 |
发表于 2010-12-14 22:58:26
|
显示全部楼层
- #include <stdio.h>
- #include <stdlib.h>
- #include <assert.h>
- #include <windows.h>
-
- #include "defs.h"
- #include "findMaxMin.h"
- #include "MTVERIFY.h"
-
-
- DWORD WINAPI ThreadFunc(LPVOID n)
- {
- THREAD_FUNCTION_ST *pTF=(THREAD_FUNCTION_ST *)n;
- SEARCH_SORT_ST *p;
-
- switch (pTF->fun_ID)
- {
- case TF_FIND_MAX_MIN_ALU:
- p=(SEARCH_SORT_ST *)n;
- getMaxMin1(p->pdata,p->len,&(p->min),&(p->max));
- break;
- case TF_FIND_MAX_MIN_SSE4:
- p=(SEARCH_SORT_ST *)n;
- getMaxMin_SSE4(p->pdata,p->len,&(p->min),&(p->max));
- break;
- default:
- printf("Invalid thread function ID in %d line\n",__LINE__);
-
- }
- return 0;
- }
-
-
- void getMaxMin1(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- int i;
- *min=~0;
- *max=0;
- for (i=0; i<len; i++)
- {
- if ( pData[ i ] > *max)
- *max=pData[ i ];
- if (pData[ i ]< *min)
- *min=pData[ i ];
- }
- }
-
- void getMaxMin_asm(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- DWORD _min, _max;
- __asm
- {
- mov esi, pData
- mov ecx, len
- lea edi, [esi+ecx*4]
-
- mov eax, 0xffffffff //min
- mov edx, 0 //max
-
- loop_start:
- cmp esi, edi
- jge next10
-
- cmp [esi], eax
- cmovb eax,[esi]
-
- cmp [esi], edx
- cmova edx,[esi]
-
- add esi, 4
- jmp loop_start
- next10:
- mov _min, eax
- mov _max, edx
- }
-
- *min=_min;
- *max=_max;
-
- }
-
- _declspec(naked)
- void getMaxMin_SSE4(DWORD *pData,int len,DWORD *min,DWORD *max)
-
- // 使用SSE4.1 指令 计算数组的最大最小值
- // 寄存器的使用
- // eax: 存放最小值
- // edx: 存放最大值
- // xmm0: 当前取得的4个整数
- // xmm1: 存放最小值
- // xmm2: 存放最大值
-
- /* PMINUD and PMAXUD are SSE4.1 instruciton
- The usage for PMINUD
- Compares packed unsigned dword integers in the destination operand (first operand)
- and the source operand (second operand), and returns the minimum for each packed
- value in the destination operand.
-
- Operation
- IF (DEST[31:0] < SRC[31:0])
- THEN DEST[31:0] 􀃅 DEST[31:0];
- ELSE DEST[31:0] 􀃅 SRC[31:0]; FI;
-
- IF (DEST[63:32] < SRC[63:32])
- THEN DEST[63:32] 􀃅 DEST[63:32];
- ELSE DEST[63:32] 􀃅 SRC[63:32]; FI;
-
- IF (DEST[95:64] < SRC[95:64])
- THEN DEST[95:64] 􀃅 DEST[95:64];
- ELSE DEST[95:64] 􀃅 SRC[95:64]; FI;
-
- IF (DEST[127:96] < SRC[127:96])
- THEN DEST[127:96] 􀃅 DEST[127:96];
- ELSE DEST[127:96] 􀃅 SRC[127:96]; FI;
- */
-
- #define MIN_128REG xmm1
- #define MAX_128REG xmm2
- #define MIN_32REG eax
- #define MAX_32REG edx
- #define PT_REG esi
- #define END_REG edi
-
- #define PAR_PDATA 4
- #define PAR_LEN 8
- #define PAR_MIN 12
- #define PAR_MAX 16
-
- {
- __asm
- {
- push esi
- push edi
-
- mov MIN_32REG, 0xffffffff // min=0xffffffff
- xor MAX_32REG, MAX_32REG // max=0
-
- //phase1:
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- mov PT_REG, dword ptr[esp+8+PAR_PDATA] // pData
- lea END_REG, [PT_REG+ecx*4] // edi=pData + len
- jmp cmp10
-
- loop1_start:
- cmp [PT_REG], MIN_32REG
- cmovb MIN_32REG,[PT_REG]
-
- cmp [PT_REG], MAX_32REG
- cmova MAX_32REG,[PT_REG]
-
- add PT_REG, 4
- cmp10:
- test PT_REG, 0x0f
- jz phase2
-
- cmp PT_REG, END_REG
- jb loop1_start
-
- phase2:
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- mov END_REG, dword ptr[esp+8+PAR_PDATA] // pData
- lea END_REG, [END_REG+ecx*4] // edi=pData + len
- and END_REG, 0xfffffff0 // clear bit0-bit3 and make edi % 16==0
-
- movd MIN_128REG, MIN_32REG
- pshufd MIN_128REG, MIN_128REG, 00000000b // xmm1 = R0:R0:R0:R0
- movd MAX_128REG, MAX_32REG
- pshufd MAX_128REG, MAX_128REG, 00000000b // xmm2 = R0:R0:R0:R0
- jmp cmp20
-
- loop2_start:
- movdqa xmm0, xmmword ptr[PT_REG]
- PMINUD MIN_128REG, xmm0
- PMAXUD MAX_128REG, xmm0
-
- add PT_REG,16
- cmp20:
- cmp PT_REG, END_REG
- jb loop2_start
-
-
- phase3:
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit0-31(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit32-63(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit64-95(MIN_128REG))
-
- psrldq MIN_128REG, 4 //MIN_128REG >>=4
- movd ecx, MIN_128REG
- cmp ecx, MIN_32REG
- cmovb MIN_32REG,ecx //MIN_32REG=min(min_32REG,bit96-127(MIN_128REG) )
-
- //-----------------------------------------------
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit0-31(MAX_128REG))
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit32-63(MAX_128REG))
-
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MAX_32REG=max(max_32REG,bit64-95(MAX_128REG))
-
- psrldq MAX_128REG, 4 //MAX_128REG >>=4
- movd ecx, MAX_128REG
- cmp ecx, MAX_32REG
- cmova MAX_32REG,ecx //MIN_32REG=max(max_32REG,bit96-127(MIN_128REG))
-
- mov END_REG, dword ptr[esp+8+PAR_PDATA] // pData
- mov ecx, dword ptr[esp+8+PAR_LEN] // len
- lea END_REG, [END_REG+ecx*4] // edi=pData + len
- jmp cmp30
-
- loop3_start:
- cmp [PT_REG], MIN_32REG
- cmovb MIN_32REG,[PT_REG]
-
- cmp [PT_REG], MAX_32REG
- cmova MAX_32REG,[PT_REG]
-
- add PT_REG, 4
- cmp30:
- cmp PT_REG, END_REG
- jb loop3_start
-
- RET_VALUE:
- mov ecx, dword ptr[esp+8+PAR_MIN] //*min
- mov dword ptr [ecx],MIN_32REG
-
- mov ecx, dword ptr[esp+8+PAR_MAX] //*max
- mov dword ptr [ecx],MAX_32REG
-
- emms
- pop edi
- pop esi
-
- ret
- }
- }
-
-
-
- void getMaxMin_MT_n(int threadCount,int version, DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- int slot,width;
- SEARCH_SORT_ST thrdPara[16];
- HANDLE hThrds[16];
- DWORD threadId;
-
- assert(threadCount<=16);
- width= (len + threadCount-1)/threadCount;
-
- for (slot=0;slot<threadCount;slot++)
- {
- if (version==1)
- thrdPara[slot].fun_ID=TF_FIND_MAX_MIN_ALU;
- else
- thrdPara[slot].fun_ID=TF_FIND_MAX_MIN_SSE4;
-
- thrdPara[slot].pdata=pData+slot*width;
- thrdPara[slot].len=width;
- }
-
- if (len % width !=0)
- {
- thrdPara[threadCount-1].len=len % width;
- }
-
- for (slot=0;slot<threadCount;slot++)
- {
- MTVERIFY( hThrds[slot] = CreateThread(NULL,
- 0,
- ThreadFunc,
- (LPVOID)(thrdPara+slot),
- 0,
- &threadId ) );
- }
-
- for (slot=0; slot<threadCount; slot++)
- {
- WaitForSingleObject(hThrds[slot], INFINITE);
- MTVERIFY( CloseHandle(hThrds[slot]) );
- }
-
- *min=thrdPara[0].min;
- *max=thrdPara[0].max;
- for (slot=1;slot<threadCount;slot++)
- {
- if (thrdPara[slot].max > *max)
- *max=thrdPara[slot].max;
- if (thrdPara[slot].min < *min)
- *min=thrdPara[slot].min;
- }
- }
-
- void getMaxMin_MT2_v1(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- getMaxMin_MT_n(2,1,pData,len,min,max);
- }
-
- void getMaxMin_MT4_v1(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- getMaxMin_MT_n(4,1,pData,len,min,max);
- }
-
-
- void getMaxMin_MT2_v2(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- getMaxMin_MT_n(2,2,pData,len,min,max);
- }
-
- void getMaxMin_MT4_v2(DWORD *pData,int len,DWORD *min,DWORD *max)
- {
- getMaxMin_MT_n(4,2,pData,len,min,max);
- }
复制代码 |
|