- 注册时间
- 2014-6-29
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 812
- 在线时间
- 小时
|
发表于 2023-5-4 00:01:02
|
显示全部楼层
本帖最后由 l4m2 于 2023-5-4 00:02 编辑
不使用adx指令,多用幾個寄存器,效率接近
- // Benchmark driver for an inlined 256x256 -> 512-bit schoolbook multiply
- // (BMI2 mulx with plain add/adc carry chains, no ADX instructions).
- // Syntax: AT&T / GAS, x86-64, System V AMD64 ABI.
- // Frame layout after the 128-byte allocation:
- //   0..63(%rsp)    product, addressed through rdi
- //   64..95(%rsp)   operand B, addressed through rcx
- //   96..127(%rsp)  operand A, addressed through rsi
- // The operands are never initialised, so only the timing is meaningful.
- // Register roles inside the loop:
- //   r8-r11   sliding window of the four live product limbs
- //   r12-r14  high halves of the first three mulx results of a row
- //   rax      low half of the current mulx result
- //   rdx      current limb of B (implicit mulx source)
- .section .text
- .globl main
- main:
- // rbx and r12-r14 are callee-saved and main is called from the C runtime,
- // so they have to be preserved.
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- // -128 fits a sign-extended 8-bit immediate, +128 does not.
- addq $-128, %rsp
- movq %rsp, %rdi
- leaq 64(%rsp), %rcx
- // A is the last 32 bytes of the frame; 128(%rsp) would lie outside it.
- leaq 96(%rsp), %rsi
- // ebx starts at 0, so the subl/jnz pair below wraps: 2^32 iterations.
- xorl %ebx, %ebx
- lp:
- // mul256 start
- // row 0: limbs 0..4 <- A * B[0]; window afterwards r9,r10,r11,r8
- movq (%rcx), %rdx
- mulxq (%rsi), %r8, %r12
- movq %r8, (%rdi)
- mulxq 8(%rsi), %r9, %r13
- addq %r12, %r9
- mulxq 16(%rsi), %r10, %r14
- adcq %r13, %r10
- mulxq 24(%rsi), %r11, %r8
- adcq %r14, %r11
- adcq $0, %r8
- // row 1: add A * B[1] at limb 1; window afterwards r10,r11,r8,r9
- movq 8(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r9
- movq %r9, 8(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r10
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r11
- // r9 was stored above, so it is free to receive the top high half.
- mulxq 24(%rsi), %rax, %r9
- adcq %rax, %r8
- adcq $0, %r9
- addq %r12, %r10
- adcq %r13, %r11
- adcq %r14, %r8
- adcq $0, %r9
- // row 2: add A * B[2] at limb 2; window afterwards r11,r8,r9,r10
- movq 16(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r10
- movq %r10, 16(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r11
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r8
- // The high half goes to the already-stored r10; r9 still holds limb 5.
- mulxq 24(%rsi), %rax, %r10
- adcq %rax, %r9
- adcq $0, %r10
- addq %r12, %r11
- adcq %r13, %r8
- adcq %r14, %r9
- adcq $0, %r10
- // row 3: add A * B[3] at limb 3; window afterwards r8,r9,r10,r11
- movq 24(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r11
- movq %r11, 24(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r8
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r9
- // The high half goes to the already-stored r11; r9 still holds limb 5.
- mulxq 24(%rsi), %rax, %r11
- adcq %rax, %r10
- adcq $0, %r11
- // Fold in the high halves and store limbs 4..7 to the destination.
- // movq does not touch the flags, so the carry chain survives the stores.
- addq %r12, %r8
- movq %r8, 32(%rdi)
- adcq %r13, %r9
- movq %r9, 40(%rdi)
- adcq %r14, %r10
- movq %r10, 48(%rdi)
- adcq $0, %r11
- movq %r11, 56(%rdi)
- // mul256 end
- subl $1, %ebx
- jnz lp
- subq $-128, %rsp
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- // Return 0 instead of whatever the last mulx left in rax.
- xorl %eax, %eax
- ret
- // mul256: 256x256 -> 512-bit multiply (kept commented out, reference version)
- // rdi(dst, 8 qwords) rsi(A, 4 qwords) rdx=>rcx(B, 4 qwords)
- // r8-r11 the four product limbs (64-bit words) currently being accumulated
- // r12-r14 himul high; callee-saved, spilled below rsp (red zone, leaf function only)
- // rax(tmp) rdx(mulx reg)
- /*
- mul256:
- movq %rdx, %rcx
- movq (%rdx), %rdx
- // row 0: limbs 0..4 <- A * B[0]; the r12-r14 spills are interleaved
- movq %r12, -8(%rsp)
- mulxq (%rsi), %r8, %r12
- movq %r8, (%rdi)
- movq %r13, -16(%rsp)
- mulxq 8(%rsi), %r9, %r13
- addq %r12, %r9
- movq %r14, -24(%rsp)
- mulxq 16(%rsi), %r10, %r14
- adcq %r13, %r10
- mulxq 24(%rsi), %r11, %r8
- adcq %r14, %r11
- adcq $0, %r8
- // row 1: add A * B[1] at limb 1; window afterwards r10,r11,r8,r9
- movq 8(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r9
- movq %r9, 8(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r10
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r11
- mulxq 24(%rsi), %rax, %r9
- adcq %rax, %r8
- adcq $0, %r9
- addq %r12, %r10
- adcq %r13, %r11
- adcq %r14, %r8
- adcq $0, %r9
- // row 2: add A * B[2] at limb 2; window afterwards r11,r8,r9,r10
- movq 16(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r10
- movq %r10, 16(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r11
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r8
- // high half goes to the already-stored r10; r9 still holds limb 5
- mulxq 24(%rsi), %rax, %r10
- adcq %rax, %r9
- adcq $0, %r10
- addq %r12, %r11
- adcq %r13, %r8
- adcq %r14, %r9
- adcq $0, %r10
- // row 3: add A * B[3] at limb 3; window afterwards r8,r9,r10,r11
- movq 24(%rcx), %rdx
- mulxq (%rsi), %rax, %r12
- addq %rax, %r11
- movq %r11, 24(%rdi)
- mulxq 8(%rsi), %rax, %r13
- adcq %rax, %r8
- mulxq 16(%rsi), %rax, %r14
- adcq %rax, %r9
- // high half goes to the already-stored r11; r9 still holds limb 5
- mulxq 24(%rsi), %rax, %r11
- adcq %rax, %r10
- adcq $0, %r11
- // fold in the high halves and store limbs 4..7 to dst (rdi), not to A (rsi)
- addq %r12, %r8
- movq %r8, 32(%rdi)
- adcq %r13, %r9
- movq %r9, 40(%rdi)
- adcq %r14, %r10
- movq %r10, 48(%rdi)
- adcq $0, %r11
- movq %r11, 56(%rdi)
- // restore the callee-saved registers spilled at entry
- movq -8(%rsp), %r12
- movq -16(%rsp), %r13
- movq -24(%rsp), %r14
- ret
- */
复制代码
- Performance counter stats for './a.out':
- 25,636.00 msec task-clock # 0.999 CPUs utilized
- 4,351 context-switches # 169.722 /sec
- 283 cpu-migrations # 11.039 /sec
- 47 page-faults # 1.833 /sec
- 100,564,298,132 cycles # 3.923 GHz (83.32%)
- 304,933,814 stalled-cycles-frontend # 0.30% frontend cycles idle (83.33%)
- 77,948,700,897 stalled-cycles-backend # 77.51% backend cycles idle (83.34%)
- 262,154,531,908 instructions # 2.61 insn per cycle
- # 0.30 stalled cycles per insn (83.33%)
- 4,334,044,774 branches # 169.061 M/sec (83.34%)
- 1,346,833 branch-misses # 0.03% of all branches (83.34%)
- 25.669427577 seconds time elapsed
- 25.625846000 seconds user
- 0.011995000 seconds sys
复制代码
- Timeline view:
- 0123456789 0123
- Index 0123456789 0123456789
- [0,0] DeER . . . . . . . movq %rdx, %rcx
- [0,1] DeeeeeER . . . . . . movq (%rdx), %rdx
- [0,2] DeE----R . . . . . . movq %r12, -8(%rsp)
- [0,3] .DeeeeeeeeeER . . . . . mulxq (%rsi), %r8, %r12
- [0,4] .D========eER . . . . . movq %r8, (%rdi)
- [0,5] .D=========eER . . . . . movq %r13, -16(%rsp)
- [0,6] . DeeeeeeeeeER . . . . . mulxq 8(%rsi), %r9, %r13
- [0,7] . D========eER . . . . . addq %r12, %r9
- [0,8] . D=========eER. . . . . movq %r14, -24(%rsp)
- [0,9] . DeeeeeeeeeER. . . . . mulxq 16(%rsi), %r10, %r14
- [0,10] . D========eER. . . . . adcq %r13, %r10
- [0,11] . DeeeeeeeeeER . . . . mulxq 24(%rsi), %r11, %r8
- [0,12] . D========eER . . . . adcq %r14, %r11
- [0,13] . D=========eER . . . . adcq $0, %r8
- [0,14] . DeeeeeE----R . . . . movq 8(%rcx), %rdx
- [0,15] . DeeeeeeeeeER . . . . mulxq (%rsi), %rax, %r12
- [0,16] . D========eER . . . . addq %rax, %r9
- [0,17] . .D========eER . . . . movq %r9, 8(%rdi)
- [0,18] . .DeeeeeeeeeER . . . . mulxq 8(%rsi), %rax, %r13
- [0,19] . .D========eER . . . . adcq %rax, %r10
- [0,20] . . DeeeeeeeeeER . . . . mulxq 16(%rsi), %rax, %r14
- [0,21] . . D========eER . . . . adcq %rax, %r11
- [0,22] . . DeeeeeeeeeER. . . . mulxq 24(%rsi), %rax, %r9
- [0,23] . . D========eER. . . . adcq %rax, %r8
- [0,24] . . D=========eER . . . adcq $0, %r9
- [0,25] . . D======eE--R . . . addq %r12, %r10
- [0,26] . . D=======eE-R . . . adcq %r13, %r11
- [0,27] . . D========eER . . . adcq %r14, %r8
- [0,28] . . D=========eER . . . adcq $0, %r9
- [0,29] . . DeeeeeE-----R . . . movq 16(%rcx), %rdx
- [0,30] . . DeeeeeeeeeER . . . mulxq (%rsi), %rax, %r12
- [0,31] . . D========eER . . . addq %rax, %r10
- [0,32] . . D=========eER . . . movq %r10, 16(%rdi)
- [0,33] . . .DeeeeeeeeeER . . . mulxq 8(%rsi), %rax, %r13
- [0,34] . . .D========eER . . . adcq %rax, %r11
- [0,35] . . . DeeeeeeeeeER . . . mulxq 16(%rsi), %rax, %r14
- [0,36] . . . D========eER . . . adcq %rax, %r8
- [0,37] . . . DeeeeeeeeeER. . . mulxq 24(%rsi), %rax, %r9
- [0,38] . . . D=========eER . . adcq %rax, %r9
- [0,39] . . . D==========eER . . adcq $0, %r10
- [0,40] . . . D======eE---R . . addq %r12, %r11
- [0,41] . . . D=======eE--R . . adcq %r13, %r8
- [0,42] . . . D=========eER . . adcq %r14, %r9
- [0,43] . . . D==========eER . . adcq $0, %r10
- [0,44] . . . DeeeeeE------R . . movq 24(%rcx), %rdx
- [0,45] . . . DeeeeeeeeeE-R . . mulxq (%rsi), %rax, %r12
- [0,46] . . . D========eE-R . . addq %rax, %r11
- [0,47] . . . D=========eER . . movq %r11, 24(%rdi)
- [0,48] . . . .DeeeeeeeeeER . . mulxq 8(%rsi), %rax, %r13
- [0,49] . . . .D========eER . . adcq %rax, %r8
- [0,50] . . . . DeeeeeeeeeER . . mulxq 16(%rsi), %rax, %r14
- [0,51] . . . . D========eER . . adcq %rax, %r9
- [0,52] . . . . DeeeeeeeeeER. . mulxq 24(%rsi), %rax, %r9
- [0,53] . . . . D========eER. . adcq %rax, %r10
- [0,54] . . . . D=========eER . adcq $0, %r10
- [0,55] . . . . D======eE--R . addq %r12, %r8
- [0,56] . . . . D=======eE-R . movq %r8, 32(%rsi)
- [0,57] . . . . D========eER . adcq %r13, %r9
- [0,58] . . . . D=========eER . movq %r9, 40(%rsi)
- [0,59] . . . . D=========eER . adcq %r14, %r10
- [0,60] . . . . D==========eER. movq %r10, 48(%rsi)
- [0,61] . . . . D=========eER. adcq $0, %r11
- [0,62] . . . . D==========eER movq %r11, 56(%rsi)
- [0,63] . . . . DeeeeeE------R movq -8(%rsp), %r12
- [0,64] . . . . DeeeeeE------R movq -16(%rsp), %r13
- [0,65] . . . . D=eeeeeE-----R movq -24(%rsp), %r14
- [0,66] . . . . .DeeeeeeeE---R retq
复制代码
补充内容 (2023-5-5 09:32):
程序修復後測試了正確性及效率 https://pastebin.com/nu1xu5jE |
|