64位平台的256位乘法测试
先贴上我测试通过的代码__declspec(naked)
void UInt256x256To512(uint64_t* result, const uint64_t* left, const uint64_t* right)
{
__asm
{
push rbx
mov r9, rdx
mov r10,
mov r11,
mov r12,
mov r13,
mov r14,
mov rax, r10
mul r11
mov , rax
mov r15, rdx
mov rax, r10
mul r12
add rax, r15
adc rdx, 0
mov r15, rdx
movq xmm1, rax
mov rax, r10
mul r13
add rax, r15
adc rdx, 0
mov r15, rdx
movq xmm2, rax
mov rax, r10
mul r14
add rax, r15
adc rdx, 0
mov r15, rdx
movq xmm3, rax
movq xmm4, rdx
mov r10,
movq r15, xmm1
mov rax, r10
mul r11
add rax, r15
adc rdx, 0
mov , rax
mov rbx, rdx
movq r15, xmm2
mov rax, r10
mul r12
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm2, rax
mov rbx, rdx
movq r15, xmm3
mov rax, r10
mul r13
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm3, rax
mov rbx, rdx
movq r15, xmm4
mov rax, r10
mul r14
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm4, rax
movq xmm5, rdx
mov r10,
movq r15, xmm2
mov rax, r10
mul r11
add rax, r15
adc rdx, 0
mov , rax
mov rbx, rdx
movq r15, xmm3
mov rax, r10
mul r12
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm3, rax
mov rbx, rdx
movq r15, xmm4
mov rax, r10
mul r13
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm4, rax
mov rbx, rdx
movq r15, xmm5
mov rax, r10
mul r14
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
movq xmm5, rax
movq xmm6, rdx
mov r10,
movq r15, xmm3
mov rax, r10
mul r11
add rax, r15
adc rdx, 0
mov , rax
mov rbx, rdx
movq r15, xmm4
mov rax, r10
mul r12
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
mov , rax
mov rbx, rdx
movq r15, xmm5
mov rax, r10
mul r13
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
mov , rax
mov rbx, rdx
movq r15, xmm6
mov rax, r10
mul r14
add rax, r15
adc rdx, 0
add rax, rbx
adc rdx, 0
mov , rax
mov , rdx
pop rbx
ret
}
} 晕,发这些汇编代码上来干什么?一点注释都没有,谁有兴趣去看呀? 为什么用xmmi?是寄存器不够用吗?这些寄存器不需要保存原先的初值吗? 为什么不用 YMM 寄存器? 63行汇编,含IO占10个寄存器,读写内存。
llvm-mca表示运行时间43cycle,但mca可能有bug,实际时间没那么长
Timeline view:
0123456789 0123456789
Index 0123456789 0123456789 012
DeeeeeER. . . . . . . . movq (%rsi), %rdx
DeeeeeeeeeER . . . . . . . mulxq (%rbx), %rax, %r8
D========eER . . . . . . . movq %rax, (%rdi)
.DeeeeeeeeeER. . . . . . . mulxq 8(%rbx), %rax, %r9
.D========eER. . . . . . . addq %rax, %r8
. DeeeeeeeeeER . . . . . . . mulxq 16(%rbx), %rax, %r10
. D========eER . . . . . . . adcq %rax, %r9
.DeeeeeeeeeER. . . . . . . mulxq 24(%rbx), %rax, %r11
.D========eER. . . . . . . adcq %rax, %r10
.D=========eER . . . . . . adcq $0, %r11
. D----------R . . . . . . xorl %ecx, %ecx
. DeeeeeE----R . . . . . . movq 8(%rsi), %rdx
. DeeeeeeeeeER . . . . . . mulxq (%rbx), %rax, %rcx
. D=======eER . . . . . . adoxq %rax, %r8
. D========eER . . . . . . adcxq %rcx, %r9
. D========eER . . . . . . movq %r8, 8(%rdi)
. .DeeeeeeeeeER. . . . . . mulxq 8(%rbx), %rax, %rcx
. .D========eER. . . . . . adoxq %rax, %r9
. .D=========eER . . . . . . adcxq %rcx, %r10
. . DeeeeeeeeeER . . . . . . mulxq 16(%rbx), %rax, %rcx
. . D=========eER. . . . . . adoxq %rax, %r10
. . D==========eER . . . . . adcxq %rcx, %r11
. .DeeeeeeeeeE-R . . . . . mulxq 24(%rbx), %rax, %rcx
. .D==========eER . . . . . adoxq %rax, %r11
. .DeE----------R . . . . . movl $0, %r8d
. . D==========eER. . . . . adoxq %r8, %r8
. . D===========eER . . . . . adcxq %rcx, %r8
. . DeeeeeE-------R . . . . . movq 16(%rsi), %rdx
. . DeeeeeeeeeE--R . . . . . mulxq (%rbx), %rax, %rcx
. . D===========eER. . . . . adoxq %rax, %r9
. . D============eER . . . . adcxq %rcx, %r10
. . .D===========eER . . . . movq %r9, 16(%rdi)
. . .DeeeeeeeeeE---R . . . . mulxq 8(%rbx), %rax, %rcx
. . .D============eER . . . . adoxq %rax, %r10
. . . D============eER. . . . adcxq %rcx, %r11
. . . DeeeeeeeeeE----R. . . . mulxq 16(%rbx), %rax, %rcx
. . . D=============eER . . . . adoxq %rax, %r11
. . .D=============eER. . . . adcxq %rcx, %r8
. . .DeeeeeeeeeE-----R. . . . mulxq 24(%rbx), %rax, %rcx
. . .D==============eER . . . adoxq %rax, %r8
. . . DeE-------------R . . . movl $0, %r9d
. . . D==============eER . . . adoxq %r9, %r9
. . . D===============eER. . . adcxq %rcx, %r9
. . . DeeeeeE-----------R. . . movq 24(%rsi), %rdx
. . . DeeeeeeeeeE------R. . . mulxq (%rbx), %rax, %rcx
. . . D===============eER . . . adoxq %rax, %r10
. . . D================eER. . . adcxq %rcx, %r11
. . . .D===============eER. . . movq %r10, 24(%rdi)
. . . .DeeeeeeeeeE-------R. . . mulxq 8(%rbx), %rax, %rcx
. . . .D================eER . . adoxq %rax, %r11
. . . . D================eER . . adcxq %rcx, %r8
. . . . D================eER . . movq %r11, 32(%rdi)
. . . . DeeeeeeeeeE--------R . . mulxq 16(%rbx), %rax, %rcx
. . . .D================eER. . adoxq %rax, %r8
. . . .D=================eER . . adcxq %rcx, %r9
. . . .D=================eER . . movq %r8, 40(%rdi)
. . . . DeeeeeeeeeE--------R . . mulxq 24(%rbx), %rax, %rcx
. . . . D=================eER. . adoxq %rax, %r9
. . . . DeE-----------------R. . movl $0, %r10d
. . . . D=================eER . movq %r9, 48(%rdi)
. . . . D=================eER . adoxq %r10, %r10
. . . . D==================eER. adcxq %rcx, %r10
. . . . D===================eER movq %r10, 56(%rdi)
運行崩潰:r8未初始化,崩潰位置在 mov r10, [r8](把之前的错误代码删除了)
(进位的时候应该按BASE进位,我进成了DOUBLE,不出错就见鬼了)
use std::ops::Mul;
/// Limb type: one word of the multi-precision integer.
type BASE = u64;
/// Double-width type that can hold the full product of two limbs.
type DOUBLE = u128;
// Compile-time check that DOUBLE is exactly twice as wide as BASE.
const _: () = assert!(std::mem::size_of::<BASE>() * 2 == std::mem::size_of::<DOUBLE>());
/// Width of one limb in bits. It shares its name with the `BASE` type alias;
/// types and constants live in separate namespaces, so both can coexist.
const BASE: u32 = 8 * std::mem::size_of::<BASE>() as u32;
/// Number of limbs in a 256-bit value.
const BASE_LEN: usize = 256 / BASE as usize;

/// 256-bit unsigned integer stored as little-endian limbs.
///
/// NOTE(review): the attribute, the field type and every index expression in
/// this snippet had lost their bracketed text. They are reconstructed from how
/// the type is used: `r * r` in the test driver requires `Copy`, and the
/// product is iterated as eight limbs. Confirm against the original post.
#[derive(Clone, Copy)]
struct U256([BASE; BASE_LEN]);

// Earlier, incorrect version kept for reference: the overflow flag of a
// DOUBLE-wide accumulator was added back with weight 1 instead of being
// shifted left by BASE bits before being added to the next accumulator.
//
// impl Mul for U256 {
//     type Output = [BASE; BASE_LEN*2];
//     fn mul(self, rhs: Self) -> Self::Output {
//         let mut out0=[0 as DOUBLE; BASE_LEN*2];
//         for i in 0..BASE_LEN {
//             let mut carry=false;
//             for j in 0..BASE_LEN {
//                 (out0[i+j],carry)=out0[i+j].overflowing_add(self.0[i] as DOUBLE*rhs.0[j] as DOUBLE+carry as DOUBLE)
//             }
//             let mut j=BASE_LEN;
//             while carry {
//                 (out0[i+j],carry)=out0[i+j].overflowing_add(carry as DOUBLE);
//                 j+=1;
//             }
//         }
//         let mut out=[0 as BASE; BASE_LEN*2];
//         let mut carry=0 as DOUBLE;
//         for i in 0..BASE_LEN*2 {
//             out[i]=(out0[i]+carry) as BASE;
//             carry=(carry>>BASE)+(out0[i]>>BASE);
//         }
//         out
//     }
// }

impl Mul for U256 {
    /// Full 512-bit product as little-endian limbs.
    type Output = [BASE; BASE_LEN * 2];

    /// Schoolbook multiplication.
    ///
    /// Phase 1 accumulates every limb product `self[i] * rhs[j]` into a
    /// DOUBLE-wide accumulator at position `i + j`. Phase 2 folds the
    /// accumulators into single limbs, propagating the upper halves.
    fn mul(self, rhs: Self) -> Self::Output {
        let mut out0 = [0 as DOUBLE; BASE_LEN * 2];
        for i in 0..BASE_LEN {
            let mut carry = false;
            for j in 0..BASE_LEN {
                // An overflow out of accumulator k is worth 2^(2*BASE) at
                // position k, i.e. 2^BASE at position k + 1, hence the shift.
                // product + (1 << BASE) cannot itself overflow DOUBLE.
                (out0[i + j], carry) = out0[i + j].overflowing_add(
                    self.0[i] as DOUBLE * rhs.0[j] as DOUBLE + ((carry as DOUBLE) << BASE),
                );
            }
            // Push any remaining overflow further up.
            let mut j = BASE_LEN;
            while carry {
                (out0[i + j], carry) = out0[i + j].overflowing_add((carry as DOUBLE) << BASE);
                j += 1;
            }
        }

        let mut out = [0 as BASE; BASE_LEN * 2];
        let mut carry = 0 as DOUBLE;
        for i in 0..BASE_LEN * 2 {
            let (res, of) = out0[i].overflowing_add(carry);
            out[i] = res as BASE;
            // Upper half of the sum, plus 2^BASE if the addition wrapped.
            carry = (res >> BASE) + ((of as DOUBLE) << BASE);
        }
        out
    }
}
对应汇编
# Compiler-generated x86-64 listing (AT&T syntax) for the symbol that demangles
# to <lib::U256 as core::ops::arith::Mul>::mul.
# The body contains sixteen mulxq instructions, one per pair of 64-bit limbs of
# the two 4-limb operands, and combines them with plain add/adc chains (no
# adcx/adox). Eight 64-bit words are stored through the pointer received in %rdi.
# NOTE(review): presumably %rsi = self, %rdx = rhs and %rdi = return slot; confirm.
.text
.file "lib.9a6cb7ee-cgu.0"
.section ".text._ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E","ax",@progbits
.globl _ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E
.p2align 4, 0x90
.type _ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E,@function
_ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E:
.cfi_startproc
# Prologue: save rbp, r15, r14, r13, r12, rbx and reserve 80 bytes of spill space.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $80, %rsp
.cfi_def_cfa_offset 136
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# mulx takes one factor implicitly from %rdx, so the pointer that arrived in
# %rdx is moved to %rcx before %rdx is reused for limb values.
movq %rdx, %rcx
movq 8(%rdx), %r10
movq (%rsi), %rbx
movq %rbx, 72(%rsp)
movq %r10, %rdx
mulxq %rbx, %r11, %r13
movq 16(%rcx), %r9
movq %r9, %rdx
mulxq %rbx, %r15, %r14
# Keep the output pointer in %rax; %rdi is reused below as a multiplier limb.
movq %rdi, %rax
movq (%rcx), %r12
movq 24(%rcx), %r8
movq %r8, %rdx
mulxq %rbx, %rdx, %rcx
movq %rdx, 16(%rsp)
movq %rcx, (%rsp)
# Products against the limb at 8(%rsi); the %rsi pointer itself is spilled to 32(%rsp).
movq 8(%rsi), %rdi
movq %rsi, 32(%rsp)
movq %r12, %rdx
mulxq %rdi, %rdx, %rcx
addq %r11, %rdx
movq %rdx, 56(%rsp)
adcq %r13, %rcx
movq %rcx, 64(%rsp)
movq %r10, %rdx
mulxq %rdi, %rbp, %r13
adcq $0, %r13
addq %r15, %rbp
adcq %r14, %r13
movq %r9, %rdx
movq %r9, 40(%rsp)
mulxq %rdi, %rbx, %rcx
movq %r8, %r15
movq %r8, 48(%rsp)
movq %r8, %rdx
mulxq %rdi, %rdx, %r14
movq %rdx, 8(%rsp)
adcq $0, %rcx
addq 16(%rsp), %rbx
# Products against the limb at 16(%rsi).
movq 16(%rsi), %rdi
movq %r12, %rdx
movq %r12, %rsi
movq %r12, 24(%rsp)
mulxq %rdi, %r11, %r8
adcq (%rsp), %rcx
adcq $0, %r14
addq %rbp, %r11
movq %r10, %rdx
mulxq %rdi, %rbp, %r12
adcq %r13, %r8
adcq $0, %r12
addq %rbx, %rbp
movq %r9, %rdx
mulxq %rdi, %rbx, %r9
adcq %rcx, %r12
adcq $0, %r9
addq 8(%rsp), %rbx
movq %r15, %rdx
mulxq %rdi, %rcx, %r15
movq %rcx, (%rsp)
adcq %r14, %r9
adcq $0, %r15
# Reload the spilled pointer and multiply against the limb at offset 24.
movq 32(%rsp), %rcx
movq 24(%rcx), %rcx
movq %rsi, %rdx
mulxq %rcx, %r14, %r13
addq %rbp, %r14
adcq %r12, %r13
movq %r10, %rdx
mulxq %rcx, %r10, %rsi
adcq $0, %rsi
addq %rbx, %r10
adcq %r9, %rsi
movq 40(%rsp), %rdx
mulxq %rcx, %rbp, %rdi
adcq $0, %rdi
movq 48(%rsp), %rdx
mulxq %rcx, %r12, %r9
# Last product: the two limbs spilled at 24(%rsp) and 72(%rsp).
movq 24(%rsp), %rdx
mulxq 72(%rsp), %rcx, %rdx
addq (%rsp), %rbp
adcq %r15, %rdi
# Store the eight result words through %rax. setb/movzbl materialise the carry
# flag as 0/1 so it can be fed into the next add/adc pair.
movq %rcx, (%rax)
adcq $0, %r9
addq 56(%rsp), %rdx
movq %rdx, 8(%rax)
adcq 64(%rsp), %r11
movq %r11, 16(%rax)
adcq $0, %r8
setb %cl
addq %r14, %r8
movzbl %cl, %ecx
adcq %r13, %rcx
movq %r8, 24(%rax)
setb %dl
addq %r10, %rcx
movzbl %dl, %edx
adcq %rsi, %rdx
movq %rcx, 32(%rax)
setb %cl
addq %rbp, %rdx
movzbl %cl, %ecx
adcq %rdi, %rcx
movq %rdx, 40(%rax)
setb %dl
addq %r12, %rcx
movzbl %dl, %edx
adcq %r9, %rdx
movq %rcx, 48(%rax)
movq %rdx, 56(%rax)
# Epilogue: release the spill area and restore the saved registers.
addq $80, %rsp
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size _ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E, .Lfunc_end0-_ZN51_$LT$lib..U256$u20$as$u20$core..ops..arith..Mul$GT$3mul17h5eee3b70e1af20c7E
.cfi_endproc
.section ".note.GNU-stack","",@progbits
测试
// Test driver: multiplies a U256 value by itself and prints every element of
// the product in lowercase hexadecimal, one per line.
// NOTE(review): the constructor argument of `U256()` appears to have been lost
// from this listing. The output quoted right after this snippet
// (1, 0, 0, 0, fffffffffffffffe, then three ffffffffffffffff) is what squaring
// 2^256 - 1 would print, so the value was presumably all-ones — confirm.
fn main(){
let r=U256();
(r*r).into_iter().for_each(|x|println!("{x:x}"))
}
输出1
0
0
0
fffffffffffffffe
ffffffffffffffff
ffffffffffffffff
ffffffffffffffff 本帖最后由 l4m2 于 2023-5-4 00:02 编辑
不使用adx指令,多用幾個寄存器,效率接近
.section .text
.globl main
/*
 * Benchmark driver: runs the inlined 256x256 -> 512-bit multiply 2^32 times
 * (%ebx starts at 0 and is decremented until it wraps back to 0).
 * The operands are whatever happens to be on the stack; only timing matters.
 *
 * Stack frame (128 bytes):
 *     0(%rsp) ..  63(%rsp)   dst, 8 limbs   -> %rdi
 *    64(%rsp) ..  95(%rsp)   B,   4 limbs   -> %rcx
 *    96(%rsp) .. 127(%rsp)   A,   4 limbs   -> %rsi
 *
 * Register use inside the multiply:
 *    %rdx        current limb of B (implicit mulx factor)
 *    %r8-%r11    rotating window of four partial-sum limbs plus the new top limb
 *    %r12-%r14   high halves of the first three products of a row
 *    %rax        low half of the current product
 *
 * Fixes relative to the first posted version:
 *    - A pointed at 128(%rsp), outside the 128-byte frame; it is now 96(%rsp).
 *    - In rows 2 and 3 the high half of the last product was written into
 *      %r9, destroying a live partial-sum limb; it now goes into the register
 *      that was just stored and is free (%r10 in row 2, %r11 in row 3).
 *    - The final carry of row 3 was added to %r10 instead of the new top limb.
 *    - The upper four result limbs were stored through %rsi (the input A)
 *      instead of %rdi (dst).
 *    - %rbx and %r12-%r14 are callee-saved in the System V ABI and are now
 *      saved and restored; main returns 0 instead of a leftover value.
 */
main:
    pushq %rbx
    pushq %r12
    pushq %r13
    pushq %r14
    addq $-128, %rsp
    movq %rsp, %rdi
    leaq 64(%rsp), %rcx
    leaq 96(%rsp), %rsi
    xorl %ebx, %ebx
lp:
    /* mul256 start */
    /* row 0: B[0] * A[0..3]  ->  dst[0]; window = r9, r10, r11, r8 */
    movq (%rcx), %rdx
    mulxq (%rsi), %r8, %r12
    movq %r8, (%rdi)
    mulxq 8(%rsi), %r9, %r13
    addq %r12, %r9
    mulxq 16(%rsi), %r10, %r14
    adcq %r13, %r10
    mulxq 24(%rsi), %r11, %r8
    adcq %r14, %r11
    adcq $0, %r8
    /* row 1: B[1] * A[0..3]  ->  dst[1]; window = r10, r11, r8, r9 */
    movq 8(%rcx), %rdx
    mulxq (%rsi), %rax, %r12
    addq %rax, %r9
    movq %r9, 8(%rdi)
    mulxq 8(%rsi), %rax, %r13
    adcq %rax, %r10
    mulxq 16(%rsi), %rax, %r14
    adcq %rax, %r11
    mulxq 24(%rsi), %rax, %r9
    adcq %rax, %r8
    adcq $0, %r9
    addq %r12, %r10
    adcq %r13, %r11
    adcq %r14, %r8
    adcq $0, %r9
    /* row 2: B[2] * A[0..3]  ->  dst[2]; window = r11, r8, r9, r10 */
    movq 16(%rcx), %rdx
    mulxq (%rsi), %rax, %r12
    addq %rax, %r10
    movq %r10, 16(%rdi)
    mulxq 8(%rsi), %rax, %r13
    adcq %rax, %r11
    mulxq 16(%rsi), %rax, %r14
    adcq %rax, %r8
    /* r10 was just stored, so it is free to receive the new top limb */
    mulxq 24(%rsi), %rax, %r10
    adcq %rax, %r9
    adcq $0, %r10
    addq %r12, %r11
    adcq %r13, %r8
    adcq %r14, %r9
    adcq $0, %r10
    /* row 3: B[3] * A[0..3]  ->  dst[3..7] */
    movq 24(%rcx), %rdx
    mulxq (%rsi), %rax, %r12
    addq %rax, %r11
    movq %r11, 24(%rdi)
    mulxq 8(%rsi), %rax, %r13
    adcq %rax, %r8
    mulxq 16(%rsi), %rax, %r14
    adcq %rax, %r9
    /* r11 was just stored, so it is free to receive the new top limb */
    mulxq 24(%rsi), %rax, %r11
    adcq %rax, %r10
    adcq $0, %r11
    /* mov does not touch the flags, so the stores can sit inside the adc chain */
    addq %r12, %r8
    movq %r8, 32(%rdi)
    adcq %r13, %r9
    movq %r9, 40(%rdi)
    adcq %r14, %r10
    movq %r10, 48(%rdi)
    adcq $0, %r11
    movq %r11, 56(%rdi)
    /* mul256 end */
    subl $1, %ebx
    jnz lp
    subq $-128, %rsp
    popq %r14
    popq %r13
    popq %r12
    popq %rbx
    xorl %eax, %eax
    ret
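// rdi = dst, rsi = A, rdx => rcx = B (B is moved to rcx because mulx reads rdx implicitly)
// r8-r11: the four 64-bit limbs of the partial sum currently being accumulated
// r12-r14: high halves of the mulx products
// rax: temporary (low half of a product), rdx: implicit mulx factor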
/*
mul256:
movq %rdx, %rcx
movq (%rdx), %rdx
movq %r12, -8(%rsp)
mulxq (%rsi), %r8, %r12
movq %r8, (%rdi)
movq %r13, -16(%rsp)
mulxq 8(%rsi), %r9, %r13
addq %r12, %r9
movq %r14, -24(%rsp)
mulxq 16(%rsi), %r10, %r14
adcq %r13, %r10
mulxq 24(%rsi), %r11, %r8
adcq %r14, %r11
adcq $0, %r8
movq 8(%rcx), %rdx
mulxq (%rsi), %rax, %r12
addq %rax, %r9
movq %r9, 8(%rdi)
mulxq 8(%rsi), %rax, %r13
adcq %rax, %r10
mulxq 16(%rsi), %rax, %r14
adcq %rax, %r11
mulxq 24(%rsi), %rax, %r9
adcq %rax, %r8
adcq $0, %r9
addq %r12, %r10
adcq %r13, %r11
adcq %r14, %r8
adcq $0, %r9
movq 16(%rcx), %rdx
mulxq (%rsi), %rax, %r12
addq %rax, %r10
movq %r10, 16(%rdi)
mulxq 8(%rsi), %rax, %r13
adcq %rax, %r11
mulxq 16(%rsi), %rax, %r14
adcq %rax, %r8
mulxq 24(%rsi), %rax, %r9
adcq %rax, %r9
adcq $0, %r10
addq %r12, %r11
adcq %r13, %r8
adcq %r14, %r9
adcq $0, %r10
movq 24(%rcx), %rdx
mulxq (%rsi), %rax, %r12
addq %rax, %r11
movq %r11, 24(%rdi)
mulxq 8(%rsi), %rax, %r13
adcq %rax, %r8
mulxq 16(%rsi), %rax, %r14
adcq %rax, %r9
mulxq 24(%rsi), %rax, %r9
adcq %rax, %r10
adcq $0, %r10
addq %r12, %r8
movq %r8, 32(%rsi)
adcq %r13, %r9
movq %r9, 40(%rsi)
adcq %r14, %r10
movq %r10, 48(%rsi)
adcq $0, %r11
movq %r11, 56(%rsi)
movq -8(%rsp), %r12
movq -16(%rsp), %r13
movq -24(%rsp), %r14
ret
*/ Performance counter stats for './a.out':
25,636.00 msec task-clock # 0.999 CPUs utilized
4,351 context-switches #169.722 /sec
283 cpu-migrations # 11.039 /sec
47 page-faults # 1.833 /sec
100,564,298,132 cycles # 3.923 GHz (83.32%)
304,933,814 stalled-cycles-frontend # 0.30% frontend cycles idle (83.33%)
77,948,700,897 stalled-cycles-backend # 77.51% backend cycles idle (83.34%)
262,154,531,908 instructions # 2.61insn per cycle
# 0.30stalled cycles per insn(83.33%)
4,334,044,774 branches #169.061 M/sec (83.34%)
1,346,833 branch-misses # 0.03% of all branches (83.34%)
25.669427577 seconds time elapsed
25.625846000 seconds user
0.011995000 seconds sys
Timeline view:
0123456789 0123
Index 0123456789 0123456789
DeER . . . . . .. movq %rdx, %rcx
DeeeeeER. . . . .. movq (%rdx), %rdx
DeE----R. . . . .. movq %r12, -8(%rsp)
.DeeeeeeeeeER. . . .. mulxq (%rsi), %r8, %r12
.D========eER. . . .. movq %r8, (%rdi)
.D=========eER . . . .. movq %r13, -16(%rsp)
. DeeeeeeeeeER . . . .. mulxq 8(%rsi), %r9, %r13
. D========eER . . . .. addq %r12, %r9
. D=========eER. . . .. movq %r14, -24(%rsp)
.DeeeeeeeeeER. . . .. mulxq 16(%rsi), %r10, %r14
.D========eER. . . .. adcq %r13, %r10
. DeeeeeeeeeER . . .. mulxq 24(%rsi), %r11, %r8
. D========eER . . .. adcq %r14, %r11
. D=========eER . . .. adcq $0, %r8
. DeeeeeE----R . . .. movq 8(%rcx), %rdx
. DeeeeeeeeeER . . .. mulxq (%rsi), %rax, %r12
. D========eER . . .. addq %rax, %r9
. .D========eER. . .. movq %r9, 8(%rdi)
. .DeeeeeeeeeER. . .. mulxq 8(%rsi), %rax, %r13
. .D========eER. . .. adcq %rax, %r10
. . DeeeeeeeeeER . . .. mulxq 16(%rsi), %rax, %r14
. . D========eER . . .. adcq %rax, %r11
. .DeeeeeeeeeER. . .. mulxq 24(%rsi), %rax, %r9
. .D========eER. . .. adcq %rax, %r8
. .D=========eER . .. adcq $0, %r9
. . D======eE--R . .. addq %r12, %r10
. . D=======eE-R . .. adcq %r13, %r11
. . D========eER . .. adcq %r14, %r8
. . D=========eER . .. adcq $0, %r9
. . DeeeeeE-----R . .. movq 16(%rcx), %rdx
. . DeeeeeeeeeER . .. mulxq (%rsi), %rax, %r12
. . D========eER . .. addq %rax, %r10
. . D=========eER. .. movq %r10, 16(%rdi)
. . .DeeeeeeeeeER. .. mulxq 8(%rsi), %rax, %r13
. . .D========eER. .. adcq %rax, %r11
. . . DeeeeeeeeeER . .. mulxq 16(%rsi), %rax, %r14
. . . D========eER . .. adcq %rax, %r8
. . .DeeeeeeeeeER. .. mulxq 24(%rsi), %rax, %r9
. . .D=========eER .. adcq %rax, %r9
. . .D==========eER .. adcq $0, %r10
. . . D======eE---R .. addq %r12, %r11
. . . D=======eE--R .. adcq %r13, %r8
. . . D=========eER .. adcq %r14, %r9
. . . D==========eER.. adcq $0, %r10
. . . DeeeeeE------R.. movq 24(%rcx), %rdx
. . . DeeeeeeeeeE-R.. mulxq (%rsi), %rax, %r12
. . . D========eE-R.. addq %rax, %r11
. . . D=========eER.. movq %r11, 24(%rdi)
. . . .DeeeeeeeeeER.. mulxq 8(%rsi), %rax, %r13
. . . .D========eER.. adcq %rax, %r8
. . . . DeeeeeeeeeER .. mulxq 16(%rsi), %rax, %r14
. . . . D========eER .. adcq %rax, %r9
. . . .DeeeeeeeeeER.. mulxq 24(%rsi), %rax, %r9
. . . .D========eER.. adcq %rax, %r10
. . . .D=========eER. adcq $0, %r10
. . . . D======eE--R. addq %r12, %r8
. . . . D=======eE-R. movq %r8, 32(%rsi)
. . . . D========eER. adcq %r13, %r9
. . . . D=========eER . movq %r9, 40(%rsi)
. . . . D=========eER . adcq %r14, %r10
. . . . D==========eER. movq %r10, 48(%rsi)
. . . . D=========eER. adcq $0, %r11
. . . . D==========eER movq %r11, 56(%rsi)
. . . . DeeeeeE------R movq -8(%rsp), %r12
. . . . DeeeeeE------R movq -16(%rsp), %r13
. . . . D=eeeeeE-----R movq -24(%rsp), %r14
. . . . .DeeeeeeeE---R retq
补充内容 (2023-5-5 09:32):
程序修復後測試了正確性及效率 https://pastebin.com/nu1xu5jE 汇编是怎么写怎么调试的? https://pastebin.com/gktPUxJY
這個要40s但是我不知道瓶頸在哪。B讀進寄存器也差不多40
页:
[1]