把 4 路乘法展开后同时计算的话,
速度大约能提升 30%
(本机测试耗时从 10x ms 降到 7x ms)。注意:函数参数的顺序跟题主的略有不同。
extern "C" uint64_t asm_avx2_x256_mulx_4_4(const uint64_t a[], const uint64_t b[], uint64_t result[]);
;-----------------------------------------------------------------------
; asm_avx2_x256_mulx_4_4 -- 256-bit x 256-bit -> 512-bit unsigned multiply
;
; C prototype:
;   extern "C" uint64_t asm_avx2_x256_mulx_4_4(const uint64_t a[],
;                                               const uint64_t b[],
;                                               uint64_t result[]);
;
; Syntax: MASM (ml64).   ABI: Microsoft x64.
; In:     rcx = a       4 x 64-bit limbs, least significant limb first
;         rdx = b       4 x 64-bit limbs, least significant limb first
;         r8  = result  8 x 64-bit limbs, least significant limb first
; Out:    result[0..7] = a * b
;         rax is NOT set to a meaningful return value; it is left holding
;         an intermediate limb of the last round.
; Clobb:  rax, rcx, rdx, r9, r10, r11, xmm1-xmm4, flags (all volatile)
; Saved:  rbx, rbp, rsi, rdi, r12-r15 (pushed in the prolog, popped on exit)
; CPU:    requires BMI2 (MULX).
;
; Argument order note (from the original author): compared with the
; thread starter's version, the result pointer and the first operand
; pointer are swapped, so the destination is now the third argument (r8).
;
; NOTE(review): the memory operands were missing from the listing this was
; taken from; they are reconstructed here from the prototype above and the
; register-role comments (a -> r11..r14, destination -> r8). Confirm
; against the author's original file if it is available.
;
; Method: schoolbook multiplication, one limb of b per round. Each round
; computes the 5-limb row  b[i] * a  in rbx:rax:rsi:r9:r15 (low -> high),
; adds the four upper limbs carried over from the previous round, stores
; the lowest limb to result[i], and parks the remaining four limbs in
; xmm1..xmm4 because no general-purpose register is left to hold them.
;-----------------------------------------------------------------------
asm_avx2_x256_mulx_4_4 proc frame
        ; Prolog: every non-volatile register used below is saved here and
        ; announced with .pushreg so the unwind data matches the real stack.
        push    rbx
        .pushreg rbx
        push    r12
        .pushreg r12
        push    r13
        .pushreg r13
        push    r14
        .pushreg r14
        push    r15
        .pushreg r15
        push    rdi
        .pushreg rdi
        push    rsi
        .pushreg rsi
        push    rbp
        .pushreg rbp
        .endprolog

        ; Load all of a up front: r11..r14 = a[0..3].
        mov     r11, [rcx]
        mov     r12, [rcx + 8]
        mov     r13, [rcx + 16]
        mov     r14, [rcx + 24]

        ; rdx is MULX's implicit multiplicand and gets overwritten every
        ; round, so keep the pointer to b in rcx (free now that a is loaded).
        mov     rcx, rdx

        ; Scratch registers from here on:
        ;   rax, rbx, rsi, rdi, r9, r10, r15, rbp

        ;---------------- round 1: b[0] * a ----------------
        mov     rdx, [rcx]
        mulx    rax, rbx, r11           ; rax:rbx = b[0] * a[0]
        mulx    rsi, rdi, r12           ; rsi:rdi = b[0] * a[1]
        mulx    r9,  r10, r13           ; r9:r10  = b[0] * a[2]
        mulx    r15, rbp, r14           ; r15:rbp = b[0] * a[3]
        add     rax, rdi                ; fold each high half into the next
        adc     rsi, r10                ;   low half; MULX leaves the flags
        adc     r9,  rbp                ;   alone, so the carry chain is safe
        adc     r15, 0
        mov     [r8], rbx               ; result[0] is final
        movd    xmm1, rax               ; park upper limbs (64-bit move; ml64
        movd    xmm2, rsi               ;   spells the GPR<->XMM qword move
        movd    xmm3, r9                ;   as movd)
        movd    xmm4, r15

        ;---------------- round 2: b[1] * a ----------------
        mov     rdx, [rcx + 8]
        mulx    rax, rbx, r11
        mulx    rsi, rdi, r12
        mulx    r9,  r10, r13
        mulx    r15, rbp, r14
        add     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0
        movd    rdx, xmm1               ; fetch the limbs parked last round;
        movd    rdi, xmm2               ;   rdx/rdi/r10/rbp are dead here
        movd    r10, xmm3
        movd    rbp, xmm4
        add     rbx, rdx                ; row += carried-over limbs
        adc     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0
        mov     [r8 + 8], rbx           ; result[1] is final
        movd    xmm1, rax
        movd    xmm2, rsi
        movd    xmm3, r9
        movd    xmm4, r15

        ;---------------- round 3: b[2] * a ----------------
        mov     rdx, [rcx + 16]
        mulx    rax, rbx, r11
        mulx    rsi, rdi, r12
        mulx    r9,  r10, r13
        mulx    r15, rbp, r14
        add     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0
        movd    rdx, xmm1
        movd    rdi, xmm2
        movd    r10, xmm3
        movd    rbp, xmm4
        add     rbx, rdx
        adc     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0
        mov     [r8 + 16], rbx          ; result[2] is final
        movd    xmm1, rax
        movd    xmm2, rsi
        movd    xmm3, r9
        movd    xmm4, r15

        ;---------------- round 4: b[3] * a ----------------
        mov     rdx, [rcx + 24]
        mulx    rax, rbx, r11
        mulx    rsi, rdi, r12
        mulx    r9,  r10, r13
        mulx    r15, rbp, r14
        add     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0
        movd    rdx, xmm1
        movd    rdi, xmm2
        movd    r10, xmm3
        movd    rbp, xmm4
        add     rbx, rdx
        adc     rax, rdi
        adc     rsi, r10
        adc     r9,  rbp
        adc     r15, 0

        ; Last round: all five limbs are final.
        mov     [r8 + 24], rbx          ; result[3]
        mov     [r8 + 32], rax          ; result[4]
        mov     [r8 + 40], rsi          ; result[5]
        mov     [r8 + 48], r9           ; result[6]
        mov     [r8 + 56], r15          ; result[7]

        ; Epilog: restore in reverse push order.
        pop     rbp
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        ret
asm_avx2_x256_mulx_4_4 endp
页:
1
[2]