- 注册时间
- 2010-10-22
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 2292
- 在线时间
- 小时
|
楼主 |
发表于 2010-12-24 22:14:48
|
显示全部楼层
17# G-Spider
修正了一个 bug(现在可以精确拷贝到字节),顺便加上硬预取方式;不足一个块的剩余小字节量用 movsd/movsb 收尾。
测试平台:
测试 32.1 MB 文件的内存拷贝:
_fast_memcpy1 (movsd)
33 ms
_fast_memcpy9 (SSE 系列)
23 ms
_block_prefetch (硬预取 block_size 8KB)
22 ms
代码:- ;************************************************************
- ;-==-: fast_memcpyTest By G-Spider @2010
- ;-==-: ml /c /coff memcpyTest.asm
- ;-==-: link /subsystem:console memcpyTest.obj
- ;************************************************************
- .686p
- .XMM
-
- .model flat,stdcall
- option casemap:none
-
- include windows.inc
- include user32.inc
- include kernel32.inc
- include msvcrt.inc
-
- includelib user32.lib
- includelib kernel32.lib
- includelib msvcrt.lib
-
- ; Bytes handled per outer iteration of _block_prefetch. The inner loops
- ; there advance by 128 and stop on an exact match with this value, and the
- ; tail length is computed with (BLOCK_SIZE-1) as a mask, so this has to be
- ; a power of two that is a multiple of 128.
- BLOCK_SIZE equ 8192
-
- .data
- ; Scale factor applied to the tick delta before dividing by the counter
- ; frequency: 1000 reports milliseconds, 1000000 reports microseconds.
- dwlm dd 1000 ; 1000 = milliseconds, 1000000 = microseconds
- ; printf format strings: result header and one line per timed run.
- fmt db '计算用时:',0dh,0ah,0
- fmt1 db '%6lld ms',0dh,0ah,0
-
- szFileName db 'xinyu.avi',0 ; source file (32,954 KB)
- szOutName db 'output.avi',0 ; output file
-
- ; Alternative small test file; switch dwlm to microseconds when using it.
- ;szFileName db 'test.png',0 ; source file (63 KB), use microseconds as the unit
- ;szOutName db 'output.png',0 ; output file
-
- ; Command line handed to crt_system at the end of the program.
- szPause db 'Pause',0
-
- .data?
- hHandle dd ? ; handle of the source file
- hHandle1 dd ? ; handle of the output file
- lpInputBuf dd ? ; 16-byte aligned pointer into the source buffer
- lpOutputBuf dd ? ; 16-byte aligned pointer into the destination buffer
- dwStrlen dd ? ; size of the source file in bytes (from GetFileSize)
- lpNumberOfBytes dd ? ; receives the byte count from ReadFile / WriteFile
-
- dwOldProcessP dd ? ; process priority class saved before each timed run
- dwOldThreadP dd ? ; thread priority saved before each timed run
-
-
- ;-------------------------------------
- ; 64-bit timing values
- dqTickCounter1 dq ? ; performance counter before the copy
- dqTickCounter2 dq ? ; performance counter after the copy, then the delta
- dqFreq dq ? ; performance counter frequency (ticks per second)
- dqTime dq ? ; elapsed time in the unit selected by dwlm
-
- .code
- ;*************************************
- ;-----------------------------------------------------------------------
- ; _fast_memcpy1 - baseline copy built on the x86 string instructions.
- ; Proto:  stdcall _fast_memcpy1(lpdst, lpsrc, dwlen)
- ; In:     lpdst = destination buffer
- ;         lpsrc = source buffer
- ;         dwlen = number of bytes to copy (0 is allowed)
- ; Out:    eax = 0
- ; Clobb:  ecx, flags. esi and edi are callee-saved under stdcall, so they
- ;         are preserved through USES (the assembler emits the push/pop).
- ; Notes:  Copies forward (low to high addresses); relies on the direction
- ;         flag being clear, which the Win32 calling convention guarantees
- ;         on entry. No alignment requirement.
- ;-----------------------------------------------------------------------
- _fast_memcpy1 proc uses esi edi lpdst,lpsrc,dwlen
-
-     mov     esi, lpsrc              ; esi = read cursor
-     mov     edi, lpdst              ; edi = write cursor
-     mov     ecx, dwlen
-     mov     eax, ecx                ; keep the full length for the tail
-
-     shr     ecx, 2                  ; ecx = number of whole dwords
-     rep movsd                       ; no-op when ecx == 0
-
-     mov     ecx, eax
-     and     ecx, 3                  ; ecx = 0..3 leftover bytes
-     rep movsb                       ; no-op when ecx == 0
-
-     xor     eax, eax                ; always report success
-     ret
- _fast_memcpy1 endp
-
- ;***************************************
- ;-----------------------------------------------------------------------
- ; _fast_memcpy9 - SSE copy: 128 bytes per iteration through xmm0-xmm7,
- ;                 loaded with movdqa and stored with non-temporal movntdq.
- ; Proto:  stdcall _fast_memcpy9(lpdst, lpsrc, dwlen)
- ; In:     lpdst = destination buffer
- ;         lpsrc = source buffer
- ;         dwlen = number of bytes to copy (0 is allowed)
- ; Out:    eax = 0
- ; Clobb:  ecx, xmm0-xmm7, flags. ebx, esi and edi are callee-saved under
- ;         stdcall and are preserved through USES.
- ; Notes:  movdqa and movntdq fault on addresses that are not 16-byte
- ;         aligned, so the SSE loop only runs when BOTH pointers are
- ;         aligned; otherwise the whole length goes through rep movsd/movsb.
- ;         The loop prefetches up to 224 bytes past the current source
- ;         position; prefetch instructions do not fault on invalid addresses.
- ;         Copies forward and relies on DF being clear (Win32 convention).
- ;-----------------------------------------------------------------------
- _fast_memcpy9 proc uses ebx esi edi lpdst,lpsrc,dwlen
-
-     mov     esi, lpsrc              ; esi = read cursor
-     mov     edi, lpdst              ; edi = write cursor
-     mov     ecx, dwlen              ; ecx = bytes left for the tail copy
-
-     mov     eax, esi
-     or      eax, edi
-     test    eax, 0Fh                ; either pointer off a 16-byte boundary?
-     jnz     tail_copy               ; yes: copy everything with rep movs
-
-     mov     ebx, ecx
-     and     ecx, 07Fh               ; ecx = remainder below 128 bytes
-     shr     ebx, 7                  ; ebx = number of 128-byte chunks
-     jz      tail_copy               ; shr set ZF: nothing for the SSE loop
-
-     align 16
- loop_copy:
-     prefetchnta [esi+128]           ; SSE non-temporal prefetch of the next chunk
-     prefetchnta [esi+160]
-     prefetchnta [esi+192]
-     prefetchnta [esi+224]
-
-     movdqa  xmm0, [esi+0]           ; load 128 bytes from the source
-     movdqa  xmm1, [esi+16]
-     movdqa  xmm2, [esi+32]
-     movdqa  xmm3, [esi+48]
-     movdqa  xmm4, [esi+64]
-     movdqa  xmm5, [esi+80]
-     movdqa  xmm6, [esi+96]
-     movdqa  xmm7, [esi+112]
-
-     movntdq [edi+0], xmm0           ; store them, bypassing the cache
-     movntdq [edi+16], xmm1
-     movntdq [edi+32], xmm2
-     movntdq [edi+48], xmm3
-     movntdq [edi+64], xmm4
-     movntdq [edi+80], xmm5
-     movntdq [edi+96], xmm6
-     movntdq [edi+112], xmm7
-
-     add     esi, 128
-     add     edi, 128
-     dec     ebx
-     jnz     loop_copy
-
-     sfence                          ; order the non-temporal stores before later stores
-
- tail_copy:
-     mov     eax, ecx                ; keep the byte count for the last 0..3 bytes
-     shr     ecx, 2                  ; convert to DWORD count
-     rep movsd                       ; no-op when ecx == 0
-
-     mov     ecx, eax
-     and     ecx, 3
-     rep movsb                       ; no-op when ecx == 0
-
-     xor     eax, eax                ; always report success
-     ret
-
- _fast_memcpy9 endp
-
-
-
- ;-----------------------------------------------------------------------
- ; _block_prefetch - block copy with a "touch first, copy second" scheme.
- ;                   For every BLOCK_SIZE bytes the source block is first
- ;                   read once (one load per 64 bytes) to pull it into the
- ;                   cache, then copied 128 bytes at a time with movdqa
- ;                   loads and non-temporal movntdq stores.
- ; Proto:  stdcall _block_prefetch(lpdst, lpsrc, dwlen)
- ; In:     lpdst = destination buffer
- ;         lpsrc = source buffer
- ;         dwlen = number of bytes to copy (0 is allowed)
- ; Out:    eax = 0
- ; Clobb:  ecx, edx, xmm0-xmm7, flags. esi and edi are callee-saved under
- ;         stdcall and are preserved through USES.
- ; Notes:  BLOCK_SIZE must be a power of two and a multiple of 128; both
- ;         masks below are derived from it. movaps/movdqa/movntdq fault on
- ;         addresses that are not 16-byte aligned, so the block loop only
- ;         runs when BOTH pointers are aligned; otherwise the whole length
- ;         goes through rep movsd/movsb. Copies forward and relies on DF
- ;         being clear (Win32 convention).
- ;-----------------------------------------------------------------------
- _block_prefetch proc uses esi edi lpdst,lpsrc,dwlen
-
-     mov     edi, lpdst              ; edi = write cursor
-     mov     esi, lpsrc              ; esi = read cursor
-     mov     eax, dwlen              ; eax = bytes left for the tail copy
-
-     mov     ecx, esi
-     or      ecx, edi
-     test    ecx, 0Fh                ; either pointer off a 16-byte boundary?
-     jnz     tail_copy               ; yes: copy everything with rep movs
-
-     mov     edx, eax
-     and     eax, (BLOCK_SIZE-1)     ; eax = remainder below one block
-     and     edx, NOT (BLOCK_SIZE-1) ; edx = bytes covered by whole blocks
-     jz      tail_copy               ; and set ZF: no whole block to copy
-
-     align 16
- main_loop:
-     ; Pass 1: touch the block, one load per 64 bytes, so that the copy pass
-     ; below finds it in the cache. The loaded values are discarded.
-     xor     ecx, ecx
-     align 16
- prefetch_loop:
-     movaps  xmm0, [esi+ecx]
-     movaps  xmm0, [esi+ecx+64]
-     add     ecx, 128
-     cmp     ecx, BLOCK_SIZE
-     jne     prefetch_loop
-
-     ; Pass 2: copy the block, 128 bytes per iteration.
-     xor     ecx, ecx
-     align 16
- cpy_loop:
-     movdqa  xmm0, [esi+ecx]
-     movdqa  xmm1, [esi+ecx+16]
-     movdqa  xmm2, [esi+ecx+32]
-     movdqa  xmm3, [esi+ecx+48]
-     movdqa  xmm4, [esi+ecx+64]
-     movdqa  xmm5, [esi+ecx+80]
-     movdqa  xmm6, [esi+ecx+96]
-     movdqa  xmm7, [esi+ecx+112]
-
-     movntdq [edi+ecx], xmm0
-     movntdq [edi+ecx+16], xmm1
-     movntdq [edi+ecx+32], xmm2
-     movntdq [edi+ecx+48], xmm3
-     movntdq [edi+ecx+64], xmm4
-     movntdq [edi+ecx+80], xmm5
-     movntdq [edi+ecx+96], xmm6
-     movntdq [edi+ecx+112], xmm7
-     add     ecx, 128
-     cmp     ecx, BLOCK_SIZE
-     jne     cpy_loop
-
-     add     esi, ecx                ; ecx == BLOCK_SIZE here: advance one block
-     add     edi, ecx
-     sub     edx, ecx
-     jnz     main_loop
-
-     sfence                          ; order the non-temporal stores before later stores
-
- tail_copy:
-     mov     ecx, eax
-     shr     ecx, 2                  ; convert to DWORD count
-     rep movsd                       ; no-op when ecx == 0
-
-     mov     ecx, eax
-     and     ecx, 3
-     rep movsb                       ; no-op when ecx == 0
-
-     xor     eax, eax                ; always report success
-     ret
-
- _block_prefetch endp
-
- ;*****************************************************
- ;-----------------------------------------------------------------------
- ; start - program entry point and benchmark driver.
- ;   1. Reads szFileName into a 16-byte aligned buffer.
- ;   2. Allocates a second 16-byte aligned buffer of the same size.
- ;   3. Times one copy routine five times with QueryPerformanceCounter,
- ;      at raised process/thread priority, and prints each result.
- ;   4. Writes the destination buffer to szOutName so the copy can be
- ;      compared with the source.
- ; The process exits with code 1 if a file cannot be opened or a buffer
- ; cannot be allocated. Buffers are not freed; the process ends right after.
- ;-----------------------------------------------------------------------
- start:
-     invoke CreateFile,offset szFileName,GENERIC_READ,FILE_SHARE_READ,\
-     NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL
-     .if eax == INVALID_HANDLE_VALUE
-         invoke MessageBox,NULL,0,0,0
-         invoke ExitProcess,1        ; nothing to benchmark without the source file
-     .endif
-     mov hHandle,eax
-
-     invoke GetFileSize,eax,NULL
-     mov dwStrlen,eax
-
-     ; Source buffer: over-allocate by 16 so the pointer can be rounded up
-     ; to the 16-byte boundary that movdqa/movntdq require.
-     add eax,16
-     invoke crt_malloc,eax
-     .if eax == 0
-         invoke CloseHandle,hHandle
-         invoke ExitProcess,1        ; out of memory
-     .endif
-     add eax,15
-     and eax,NOT 15                  ; round up to a multiple of 16
-     mov lpInputBuf,eax
-
-     invoke RtlZeroMemory,lpInputBuf,dwStrlen
-     invoke ReadFile,hHandle,lpInputBuf,dwStrlen,offset lpNumberOfBytes,NULL
-
-     ; Destination buffer: same size, same alignment treatment.
-     mov eax,dwStrlen
-     add eax,16
-     invoke crt_malloc,eax
-     .if eax == 0
-         invoke CloseHandle,hHandle
-         invoke ExitProcess,1        ; out of memory
-     .endif
-     add eax,15
-     and eax,NOT 15                  ; round up to a multiple of 16
-     mov lpOutputBuf,eax
-     invoke RtlZeroMemory,lpOutputBuf,dwStrlen
-
-     ;----------------------------------------------------
-     invoke crt_printf,offset fmt
-     mov ecx,5                       ; number of timed runs
-     .while ecx!=0
-         push ecx                    ; ecx is volatile across the API calls below
-
-         ; Remember the current priorities, then raise them so the
-         ; measurement is disturbed as little as possible.
-         invoke GetCurrentProcess
-         invoke GetPriorityClass,eax
-         mov dwOldProcessP,eax
-
-         invoke GetCurrentThread
-         invoke GetThreadPriority,eax
-         mov dwOldThreadP,eax
-
-         invoke GetCurrentProcess
-         invoke SetPriorityClass,eax,REALTIME_PRIORITY_CLASS
-         invoke GetCurrentThread
-         invoke SetThreadPriority,eax,THREAD_PRIORITY_TIME_CRITICAL
-         ;--------------------------------------------------
-
-         invoke QueryPerformanceCounter,addr dqTickCounter1
-         ; Routine under test: enable exactly one of the three calls.
-         ;invoke _fast_memcpy1,lpOutputBuf,lpInputBuf,dwStrlen
-         ;invoke _fast_memcpy9,lpOutputBuf,lpInputBuf,dwStrlen
-         invoke _block_prefetch,lpOutputBuf,lpInputBuf,dwStrlen
-
-         ; End of the timed region.
-         invoke QueryPerformanceCounter,addr dqTickCounter2
-         invoke QueryPerformanceFrequency,addr dqFreq
-
-         ; dqTickCounter2 -= dqTickCounter1 as a 64-bit subtraction: the high
-         ; dword must use sbb so the borrow from the low dword is applied.
-         mov eax,dword ptr dqTickCounter1
-         mov edx,dword ptr dqTickCounter1[4]
-         sub dword ptr dqTickCounter2,eax
-         sbb dword ptr dqTickCounter2[4],edx
-
-         ;----------------------------------------------------
-         ; Restore the original priorities.
-         invoke GetCurrentThread
-         invoke SetThreadPriority,eax,dwOldThreadP
-
-         invoke GetCurrentProcess
-         invoke SetPriorityClass,eax, dwOldProcessP
-
-         ; dqTime = ticks * dwlm / frequency, computed on the x87 stack.
-         finit
-         fild dqFreq                 ; st0 = frequency
-         fild dqTickCounter2         ; st0 = ticks, st1 = frequency
-         fimul dwlm                  ; st0 = ticks * dwlm
-         fdivr                       ; st0 = (ticks * dwlm) / frequency, one value popped
-         fistp dqTime                ; elapsed time; milliseconds with dwlm = 1000
-         ;---------------------------------------------------
-
-         invoke crt_printf,offset fmt1,dqTime
-
-         pop ecx
-         dec ecx
-     .endw
-
-     ; Write the copied data out so it can be compared with the source file.
-     invoke CreateFile,offset szOutName,GENERIC_WRITE,FILE_SHARE_READ,\
-     NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL
-     .if eax == INVALID_HANDLE_VALUE
-         invoke MessageBox,NULL,0,0,0
-         invoke CloseHandle,hHandle
-         invoke ExitProcess,1        ; cannot create the output file
-     .endif
-     mov hHandle1,eax
-     invoke WriteFile,eax,lpOutputBuf,dwStrlen,offset lpNumberOfBytes,NULL
-
-     invoke CloseHandle,hHandle
-     invoke CloseHandle,hHandle1
-
-     invoke crt_system,offset szPause
-     invoke ExitProcess,0
-
- end start
复制代码 |
|