无心人
发表于 2008-4-30 13:53:17
:)
你不知道movdqu很慢么?
如果那么做,还不如4路展开的双字拷贝
另外,这种情况下,总是双字对齐的
所以可比字节对齐要多些选择
liangbch
发表于 2008-4-30 13:59:44
具体性能如何,等测试完才知道。
我可能在下周一给出结论,五一休息3天,不准备写代码。
无心人
发表于 2008-4-30 14:03:21
:)
关键是涉及到两个串
另外,串传送指令在这里不如普通指令
liangbch
发表于 2008-4-30 14:28:53
我手边有2年前写的代码,在PIV 2.6G 上重新运行,结果如下。
4 copy function speed test,time unit: second,length unint: DWORD
address =16X+0
length    A_Copy            A_Copy_With_SSE2  memcpy            A_Copy_R4
16384     0.000013566506    0.000015056064    0.000009504872    0.000015172883
4 copy function speed test,time unit: second,length unint: DWORD
address =16X+4
length    A_Copy            A_Copy_With_SSE2  memcpy            A_Copy_R4
16384     0.000025271835    0.000048835081    0.000018014919    0.000015590487
说明:
1.A_Copy 使用"rep movsd"
2.A_Copy_With_SSE2 使用movdqu 和movdqa,可能不是最优化的。
3.A_Copy_R4 使用4路展开的 mov指令,代码如下:
mov eax, [esi]
mov ebx, [esi+4]
mov ecx, [esi+8]
mov edx, [esi+12]
mov [edi], eax
mov [edi+4], ebx
mov [edi+8], ecx
mov [edi+12], edx
add esi,16
add edi,16
结论:
1.当16字节对齐时, memcpy 最快,A_Copy 次之,A_Copy_With_SSE2 和 A_Copy_R4 速度大致相当,慢于memcpy。
2.当16x+4字节对齐时,A_Copy_R4 最快,memcpy 次之,A_Copy 第三,而A_Copy_With_SSE2则比前几个慢上好多。
3.值得注意的是,memcpy性能相当好,超过了SSE2的版本,这验证了我在 http://bbs.emath.ac.cn/thread-273-3-1.html 23# 所言。
题外话,代码大全中谈到关于优化的规则:第一:不要用汇编优化,第二:还是不要用汇编优化。不经测试,一味地使用最先进的指令,极可能使得程序更慢。
无心人
发表于 2008-4-30 14:30:41
:)
你说的是intel编译器里的memcpy还是标准库?
liangbch
发表于 2008-4-30 14:33:51
程序是使用VC6.0编译的,未使用intel的编译器。
无心人
发表于 2008-4-30 14:43:17
:)
标准库似乎就是串操作吧?
无心人
发表于 2008-4-30 14:45:10
你可以写个程序试一试,可能你先前的观念是错误的。pshufd 绝不会比 movdqa 快。
无心人
发表于 2008-4-30 15:16:14
复制存在一个问题
块小了用movdqa快, 大了用movntdq快
不知道如何判断
liangbch
发表于 2008-4-30 15:23:02
原帖由 无心人 于 2008-4-30 14:43 发表 http://images.5d6d.net/dz60/common/back.gif
:)
标准库似乎就是串操作吧?
错,标准库的memcpy代码很复杂,使用的并非是串操作,核心部分是一个8路循环展开,采用以下的形式的代码
mov eax,[esi+ecx*4-28] ;U - get dword from source
;V - spare
mov [edi+ecx*4-28],eax ;U - put dword into destination
下面贴出vc 6.0 中的 memcpy的源代码:
page ,132
title memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
; Copyright (c) 1986-1997, Microsoft Corporation. All right reserved.
;
; Purpose:
; memcpy() copies a source memory buffer to a destination buffer.
; Overlapping buffers are not treated specially, so propagation may occur.
; memmove() copies a source memory buffer to a destination buffer.
; Overlapping buffers are treated specially, to avoid propagation.
;
;*******************************************************************************
.xlist
include cruntime.inc
.list
; M_EXIT - common exit sequence used by every return path of the routine.
; Expands to a plain near return; no stack bytes are popped by the ret itself.
M_EXIT  macro
        ret                     ; _cdecl return
        endm                    ; M_EXIT
CODESEG
page
;***
;memcpy - Copy source buffer to destination buffer
;
; Purpose:
; memcpy() copies a source memory buffer to a destination memory buffer.
; This routine does NOT recognize overlapping buffers, and thus can lead
; to propagation.
; For cases where propagation must be avoided, memmove() must be used.
;
; Algorithm:
;
; void * memcpy(void * dst, void * src, size_t count)
; {
; void * ret = dst;
;
; /*
; * copy from lower addresses to higher addresses
; */
; while (count--)
; *dst++ = *src++;
;
; return(ret);
; }
;
;memmove - Copy source buffer to destination buffer
;
; Purpose:
; memmove() copies a source memory buffer to a destination memory buffer.
; This routine recognizes overlapping buffers to avoid propagation.
; For cases where propagation is not a problem, memcpy() can be used.
;
; Algorithm:
;
; void * memmove(void * dst, void * src, size_t count)
; {
; void * ret = dst;
;
; if (dst <= src || dst >= (src + count)) {
; /*
; * Non-Overlapping Buffers
; * copy from lower addresses to higher addresses
; */
; while (count--)
; *dst++ = *src++;
; }
; else {
; /*
; * Overlapping Buffers
; * copy from higher addresses to lower addresses
; */
; dst += count - 1;
; src += count - 1;
;
; while (count--)
; *dst-- = *src--;
; }
;
; return(ret);
; }
;
;
;Entry:
; void *dst = pointer to destination buffer
; const void *src = pointer to source buffer
; size_t count = number of bytes to copy
;
;Exit:
; Returns a pointer to the destination buffer in AX/DX:AX
;
;Uses:
; CX, DX
;
;Exceptions:
;*******************************************************************************
; Select the public name of the routine assembled from this file:
; memmove when MEM_MOVE is defined at assembly time, memcpy otherwise.
ifdef   MEM_MOVE
_MEM_   equ     <memmove>
else    ; MEM_MOVE
_MEM_   equ     <memcpy>
endif   ; MEM_MOVE

; The leading % makes the assembler expand the _MEM_ text macro on this line,
; so the symbol exported is the selected name rather than the literal "_MEM_".
%       public  _MEM_
_MEM_   proc \
        dst:ptr byte, \
        src:ptr byte, \
        count:IWORD

        ; dst   - destination pointer
        ; src   - source pointer
        ; count - number of bytes to copy
        ;
        ; Register roles inside the routine:
        ;   esi = current source pointer
        ;   edi = current destination pointer
        ;   ecx = byte count on entry, later the dword count
        ;   edx = byte count on entry, later the trailing byte count (0..3)
        ;   eax = scratch; loaded with dst just before every return
        ;
        ; The ";U", ";V" and ";N" tags on each line presumably mark the
        ; intended Pentium pipe pairing (U pipe, V pipe, non-pairable) - they
        ; are scheduling notes only and have no effect on behavior.
        ;
        ; NOTE(review): the explicit frame setup below is commented out,
        ; presumably because PROC with declared arguments makes the assembler
        ; emit the ebp frame itself - confirm against the MASM PROC docs.

;       push    ebp                     ;U - save old frame pointer
;       mov     ebp, esp                ;V - set new frame pointer

        push    edi                     ;U - save edi
        push    esi                     ;V - save esi

        mov     esi,[src]               ;U - esi = source
        mov     ecx,[count]             ;V - ecx = number of bytes to move

        mov     edi,[dst]               ;U - edi = dest

;
; Check for overlapping buffers:
;       If (dst <= src) Or (dst >= src + Count) Then
;               Do normal (Upwards) Copy
;       Else
;               Do Downwards Copy to avoid propagation
;

        mov     eax,ecx                 ;V - eax = byte count...

        mov     edx,ecx                 ;U - edx = byte count...
        add     eax,esi                 ;V - eax = point past source end

        cmp     edi,esi                 ;U - dst <= src ?
        jbe     short CopyUp            ;V - yes, copy toward higher addresses

        cmp     edi,eax                 ;U - dst < (src + count) ?
        jb      CopyDown                ;V - yes, copy toward lower addresses

;
; Copy toward higher addresses.
;
;
; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination.  This
; occurs in 3 steps.
;
;   - move x = ((4 - Dest & 3) & 3) bytes
;   - move y = ((L-x) >> 2) dwords
;   - move (L - x - y*4) bytes
;

CopyUp:
        test    edi,11b                 ;U - destination dword aligned?
        jnz     short CopyLeadUp        ;V - if we are not dword aligned already, align

        shr     ecx,2                   ;U - shift down to dword count
        and     edx,11b                 ;V - trailing byte count

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp      ;V - if so, then jump

        rep     movsd                   ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;
; Code to do optimal memory copies for non-dword-aligned destinations.
;
; The following length check is done for two reasons:
;
;    1. to ensure that the actual move length is greater than any possible
;       alignment move, and
;
;    2. to skip the multiple move logic for small moves where it would
;       be faster to move the bytes with one instruction.
;

        align   @WordSize
CopyLeadUp:

        mov     eax,edi                 ;U - get destination offset
        mov     edx,11b                 ;V - prepare for mask

        sub     ecx,4                   ;U - check for really short string - sub for adjust
        jb      short ByteCopyUp        ;V - branch to just copy bytes

        and     eax,11b                 ;U - get offset within first dword
        add     ecx,eax                 ;V - update size after leading bytes copied
                                        ;    (ecx = count - (4 - offset))

        jmp     dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes (eax = 1..3)

        align   @WordSize
ByteCopyUp:
        ; ecx = count - 4 here (-4..-1); the +16 rebases it to TrailUpVec[count]
        jmp     dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes

        align   @WordSize
CopyUnwindUp:
        jmp     dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy (ecx = 0..7 dwords)

        align   @WordSize
LeadUpVec       dd      LeadUp1, LeadUp2, LeadUp3

        align   @WordSize
LeadUp1:                                ; dest offset 1: copy 3 bytes to reach alignment
        and     edx,ecx                 ;U - trailing byte count
        mov     al,[esi]                ;V - get first byte from source

        mov     [edi],al                ;U - write first byte to destination
        mov     al,[esi+1]              ;V - get second byte from source

        mov     [edi+1],al              ;U - write second byte to destination
        mov     al,[esi+2]              ;V - get third byte from source

        shr     ecx,2                   ;U - shift down to dword count
        mov     [edi+2],al              ;V - write third byte to destination

        add     esi,3                   ;U - advance source pointer
        add     edi,3                   ;V - advance destination pointer

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp      ;V - if so, then jump

        rep     movsd                   ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadUp2:                                ; dest offset 2: copy 2 bytes to reach alignment
        and     edx,ecx                 ;U - trailing byte count
        mov     al,[esi]                ;V - get first byte from source

        mov     [edi],al                ;U - write first byte to destination
        mov     al,[esi+1]              ;V - get second byte from source

        shr     ecx,2                   ;U - shift down to dword count
        mov     [edi+1],al              ;V - write second byte to destination

        add     esi,2                   ;U - advance source pointer
        add     edi,2                   ;V - advance destination pointer

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp      ;V - if so, then jump

        rep     movsd                   ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadUp3:                                ; dest offset 3: copy 1 byte to reach alignment
        and     edx,ecx                 ;U - trailing byte count
        mov     al,[esi]                ;V - get first byte from source

        mov     [edi],al                ;U - write first byte to destination
        inc     esi                     ;V - advance source pointer

        shr     ecx,2                   ;U - shift down to dword count
        inc     edi                     ;V - advance destination pointer

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp      ;V - if so, then jump

        rep     movsd                   ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;
; Unrolled forward dword copy for 0..7 dwords.  Entry at UnwindUpN copies N
; dwords; the ecx*4-4k displacements make every entry start at [esi]/[edi].
;
        align   @WordSize
UnwindUpVec     dd      UnwindUp0, UnwindUp1, UnwindUp2, UnwindUp3
                dd      UnwindUp4, UnwindUp5, UnwindUp6, UnwindUp7

UnwindUp7:
        mov     eax,[esi+ecx*4-28]      ;U - get dword from source
                                        ;V - spare
        mov     [edi+ecx*4-28],eax      ;U - put dword into destination
UnwindUp6:
        mov     eax,[esi+ecx*4-24]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-24],eax      ;U - put dword into destination
UnwindUp5:
        mov     eax,[esi+ecx*4-20]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-20],eax      ;U - put dword into destination
UnwindUp4:
        mov     eax,[esi+ecx*4-16]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-16],eax      ;U - put dword into destination
UnwindUp3:
        mov     eax,[esi+ecx*4-12]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-12],eax      ;U - put dword into destination
UnwindUp2:
        mov     eax,[esi+ecx*4-8]       ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-8],eax       ;U - put dword into destination
UnwindUp1:
        mov     eax,[esi+ecx*4-4]       ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4-4],eax       ;U - put dword into destination

        lea     eax,[ecx*4]             ;V - compute update for pointer

        add     esi,eax                 ;U - update source pointer
        add     edi,eax                 ;V - update destination pointer
UnwindUp0:
        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

;
; Forward trailing-byte handlers: TrailUpN copies the last N bytes (0..3),
; loads the return value and exits.
;
        align   @WordSize
TrailUpVec      dd      TrailUp0, TrailUp1, TrailUp2, TrailUp3

        align   @WordSize
TrailUp0:
        mov     eax,[dst]               ;U - return pointer to destination
        pop     esi                     ;V - restore esi
        pop     edi                     ;U - restore edi
                                        ;V - spare
        M_EXIT

        align   @WordSize
TrailUp1:
        mov     al,[esi]                ;U - get byte from source
                                        ;V - spare
        mov     [edi],al                ;U - put byte in destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

        align   @WordSize
TrailUp2:
        mov     al,[esi]                ;U - get first byte from source
                                        ;V - spare
        mov     [edi],al                ;U - put first byte into destination
        mov     al,[esi+1]              ;V - get second byte from source
        mov     [edi+1],al              ;U - put second byte into destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

        align   @WordSize
TrailUp3:
        mov     al,[esi]                ;U - get first byte from source
                                        ;V - spare
        mov     [edi],al                ;U - put first byte into destination
        mov     al,[esi+1]              ;V - get second byte from source
        mov     [edi+1],al              ;U - put second byte into destination
        mov     al,[esi+2]              ;V - get third byte from source
        mov     [edi+2],al              ;U - put third byte into destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------

;
; Copy down to avoid propagation in overlapping buffers.
;
; From here on esi/edi address the dword that ENDS each buffer, so single
; bytes still to be copied sit at offsets +3, +2, +1 from the pointers.
;
        align   @WordSize
CopyDown:
        lea     esi,[esi+ecx-4]         ;U - point to 4 bytes before src buffer end
        lea     edi,[edi+ecx-4]         ;V - point to 4 bytes before dest buffer end
;
; See if the destination start is dword aligned
;

        test    edi,11b                 ;U - test if dword aligned
        jnz     short CopyLeadDown      ;V - if not, jump

        shr     ecx,2                   ;U - shift down to dword count
        and     edx,11b                 ;V - trailing byte count

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown    ;V - if so, then jump

        std                             ;N - set direction flag
        rep     movsd                   ;N - move all of our dwords
        cld                             ;N - clear direction flag back

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
CopyUnwindDown:
        neg     ecx                     ;U - negate dword count for table merging
                                        ;V - spare

        ; ecx = -n (n = 0..7); +28 turns it into table slot 7-n = UnwindDown<n>
        jmp     dword ptr UnwindDownVec[ecx*4+28] ;N - unwind copy

        align   @WordSize
CopyLeadDown:

        mov     eax,edi                 ;U - get destination offset
        mov     edx,11b                 ;V - prepare for mask

        cmp     ecx,4                   ;U - check for really short string
        jb      short ByteCopyDown      ;V - branch to just copy bytes

        and     eax,11b                 ;U - get offset within first dword
        sub     ecx,eax                 ;U - to update size after lead copied

        jmp     dword ptr LeadDownVec[eax*4-4] ;N - process leading bytes (eax = 1..3)

        align   @WordSize
ByteCopyDown:
        jmp     dword ptr TrailDownVec[ecx*4] ;N - process just bytes (ecx = 0..3)

        align   @WordSize
LeadDownVec     dd      LeadDown1, LeadDown2, LeadDown3

        align   @WordSize
LeadDown1:                              ; end offset 1: copy 1 byte to reach alignment
        mov     al,[esi+3]              ;U - load first byte
        and     edx,ecx                 ;V - trailing byte count

        mov     [edi+3],al              ;U - write out first byte
        dec     esi                     ;V - point to last src dword

        shr     ecx,2                   ;U - shift down to dword count
        dec     edi                     ;V - point to last dest dword

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown    ;V - if so, then jump

        std                             ;N - set direction flag
        rep     movsd                   ;N - move all of our dwords
        cld                             ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadDown2:                              ; end offset 2: copy 2 bytes to reach alignment
        mov     al,[esi+3]              ;U - load first byte
        and     edx,ecx                 ;V - trailing byte count

        mov     [edi+3],al              ;U - write out first byte
        mov     al,[esi+2]              ;V - get second byte from source

        shr     ecx,2                   ;U - shift down to dword count
        mov     [edi+2],al              ;V - write second byte to destination

        sub     esi,2                   ;U - point to last src dword
        sub     edi,2                   ;V - point to last dest dword

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown    ;V - if so, then jump

        std                             ;N - set direction flag
        rep     movsd                   ;N - move all of our dwords
        cld                             ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadDown3:                              ; end offset 3: copy 3 bytes to reach alignment
        mov     al,[esi+3]              ;U - load first byte
        and     edx,ecx                 ;V - trailing byte count

        mov     [edi+3],al              ;U - write out first byte
        mov     al,[esi+2]              ;V - get second byte from source

        mov     [edi+2],al              ;U - write second byte to destination
        mov     al,[esi+1]              ;V - get third byte from source

        shr     ecx,2                   ;U - shift down to dword count
        mov     [edi+1],al              ;V - write third byte to destination

        sub     esi,3                   ;U - point to last src dword
        sub     edi,3                   ;V - point to last dest dword

        cmp     ecx,8                   ;U - test if small enough for unwind copy
        jb      CopyUnwindDown          ;V - if so, then jump

        std                             ;N - set direction flag
        rep     movsd                   ;N - move all of our dwords
        cld                             ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;------------------------------------------------------------------

;
; Unrolled backward dword copy for 0..7 dwords.  ecx holds the NEGATED dword
; count here, so the ecx*4+4k displacements make every entry start at
; [esi]/[edi] and walk toward lower addresses.
;
        align   @WordSize
UnwindDownVec   dd      UnwindDown7, UnwindDown6, UnwindDown5, UnwindDown4
                dd      UnwindDown3, UnwindDown2, UnwindDown1, UnwindDown0

UnwindDown7:
        mov     eax,[esi+ecx*4+28]      ;U - get dword from source
                                        ;V - spare
        mov     [edi+ecx*4+28],eax      ;U - put dword into destination
UnwindDown6:
        mov     eax,[esi+ecx*4+24]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+24],eax      ;U - put dword into destination
UnwindDown5:
        mov     eax,[esi+ecx*4+20]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+20],eax      ;U - put dword into destination
UnwindDown4:
        mov     eax,[esi+ecx*4+16]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+16],eax      ;U - put dword into destination
UnwindDown3:
        mov     eax,[esi+ecx*4+12]      ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+12],eax      ;U - put dword into destination
UnwindDown2:
        mov     eax,[esi+ecx*4+8]       ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+8],eax       ;U - put dword into destination
UnwindDown1:
        mov     eax,[esi+ecx*4+4]       ;U(entry)/V(not) - get dword from source
                                        ;V(entry) - spare
        mov     [edi+ecx*4+4],eax       ;U - put dword into destination

        lea     eax,[ecx*4]             ;V - compute update for pointer (negative)

        add     esi,eax                 ;U - update source pointer
        add     edi,eax                 ;V - update destination pointer
UnwindDown0:
        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

;
; Backward trailing-byte handlers: TrailDownN copies the remaining N bytes
; (0..3) from the top of the current dword downward, loads the return value
; and exits.
;
        align   @WordSize
TrailDownVec    dd      TrailDown0, TrailDown1, TrailDown2, TrailDown3

        align   @WordSize
TrailDown0:
        mov     eax,[dst]               ;U - return pointer to destination
                                        ;V - spare
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown1:
        mov     al,[esi+3]              ;U - get byte from source
                                        ;V - spare
        mov     [edi+3],al              ;U - put byte in destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown2:
        mov     al,[esi+3]              ;U - get first byte from source
                                        ;V - spare
        mov     [edi+3],al              ;U - put first byte into destination
        mov     al,[esi+2]              ;V - get second byte from source
        mov     [edi+2],al              ;U - put second byte into destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown3:
        mov     al,[esi+3]              ;U - get first byte from source
                                        ;V - spare
        mov     [edi+3],al              ;U - put first byte into destination
        mov     al,[esi+2]              ;V - get second byte from source
        mov     [edi+2],al              ;U - put second byte into destination
        mov     al,[esi+1]              ;V - get third byte from source
        mov     [edi+1],al              ;U - put third byte into destination
        mov     eax,[dst]               ;V - return pointer to destination
        pop     esi                     ;U - restore esi
        pop     edi                     ;V - restore edi
        M_EXIT

_MEM_   endp
end