找回密码
 欢迎注册
楼主: 无心人

[讨论] 二进制32位整数快速平方根

[复制链接]
发表于 2009-2-13 09:38:58 | 显示全部楼层
我也利用54楼的思想写了一个函数。 这个函数包含2种做法,当 定义了 LOOP_UNROLL, 使用swich case 技巧做了循环展开,速度应该更快。当LOOP_UNROLL未定义,使用常规的循环算法,楼主可否测试一下这个版本的速度。 下面给出源代码。
  1. typedef unsigned long DWORD;
  2. typedef unsigned char BYTE;
  3. inline DWORD log2(DWORD n)
  4. {
  5. _asm
  6. {
  7. mov ecx,n
  8. bsr eax,ecx
  9. }
  10. }
  11. const unsigned char sqrtTab2[]=
  12. {
  13. 0, 1, 1, 1, 2, 2, 2, 2,
  14. 2, 3, 3, 3, 3, 3, 3, 3,
  15. 4, 4, 4, 4, 4, 4, 4, 4,
  16. 4, 5, 5, 5, 5, 5, 5, 5,
  17. 5, 5, 5, 5, 6, 6, 6, 6,
  18. 6, 6, 6, 6, 6, 6, 6, 6,
  19. 6, 7, 7, 7, 7, 7, 7, 7,
  20. 7, 7, 7, 7, 7, 7, 7, 7,
  21. 8, 8, 8, 8, 8, 8, 8, 8,
  22. 8, 8, 8, 8, 8, 8, 8, 8,
  23. 8, 9, 9, 9, 9, 9, 9, 9,
  24. 9, 9, 9, 9, 9, 9, 9, 9,
  25. 9, 9, 9, 9, 10, 10, 10, 10,
  26. 10, 10, 10, 10, 10, 10, 10, 10,
  27. 10, 10, 10, 10, 10, 10, 10, 10,
  28. 10, 11, 11, 11, 11, 11, 11, 11,
  29. 11, 11, 11, 11, 11, 11, 11, 11,
  30. 11, 11, 11, 11, 11, 11, 11, 11,
  31. 12, 12, 12, 12, 12, 12, 12, 12,
  32. 12, 12, 12, 12, 12, 12, 12, 12,
  33. 12, 12, 12, 12, 12, 12, 12, 12,
  34. 12, 13, 13, 13, 13, 13, 13, 13,
  35. 13, 13, 13, 13, 13, 13, 13, 13,
  36. 13, 13, 13, 13, 13, 13, 13, 13,
  37. 13, 13, 13, 13, 14, 14, 14, 14,
  38. 14, 14, 14, 14, 14, 14, 14, 14,
  39. 14, 14, 14, 14, 14, 14, 14, 14,
  40. 14, 14, 14, 14, 14, 14, 14, 14,
  41. 14, 15, 15, 15, 15, 15, 15, 15,
  42. 15, 15, 15, 15, 15, 15, 15, 15,
  43. 15, 15, 15, 15, 15, 15, 15, 15,
  44. 15, 15, 15, 15, 15, 15, 15, 15
  45. };
  46. const unsigned char sqrTab2[]=
  47. {
  48. 0, 1, 4, 9, 16, 25, 36, 49,
  49. 64, 81, 100,121,144,169,196,225
  50. };
  51. #define LOOP_UNROLL
  52. extern "C"
  53. DWORD __fastcall UintSqrt2(DWORD x)
  54. {
  55. DWORD bc,m1,m2,n,t,i;
  56. if (x<256)
  57. return sqrtTab2[x];
  58. bc=log2(x)/2;
  59. m2=(x>>(bc*2-6));
  60. n=sqrtTab2[m2];
  61. m1=sqrTab2[n];
  62. #ifdef LOOP_UNROLL
  63. switch(bc)
  64. {
  65. case 15:
  66. m2= (x>>22);
  67. n <<= 1; m1<<= 2;
  68. t = m1 + (n<<1)+1;
  69. if ( t<=m2) { m1=t; n++; }
  70. case 14:
  71. m2= (x>>20);
  72. n <<= 1; m1<<= 2;
  73. t = m1 + (n<<1)+1;
  74. if ( t<=m2) { m1=t; n++; }
  75. case 13:
  76. m2= (x>>18);
  77. n <<= 1; m1<<= 2;
  78. t = m1 + (n<<1)+1;
  79. if ( t<=m2) { m1=t; n++; }
  80. case 12:
  81. m2= (x>>16);
  82. n <<= 1; m1<<= 2;
  83. t = m1 + (n<<1)+1;
  84. if ( t<=m2) { m1=t; n++; }
  85. case 11:
  86. m2= (x>>14);
  87. n <<= 1; m1<<= 2;
  88. t = m1 + (n<<1)+1;
  89. if ( t<=m2) { m1=t; n++; }
  90. case 10:
  91. m2= (x>>12);
  92. n <<= 1; m1<<= 2;
  93. t = m1 + (n<<1)+1;
  94. if ( t<=m2) { m1=t; n++; }
  95. case 9:
  96. m2= (x>>10);
  97. n <<= 1; m1<<= 2;
  98. t = m1 + (n<<1)+1;
  99. if ( t<=m2) { m1=t; n++; }
  100. case 8:
  101. m2= (x>>8);
  102. n <<= 1; m1<<= 2;
  103. t = m1 + (n<<1)+1;
  104. if ( t<=m2) { m1=t; n++; }
  105. case 7:
  106. m2= (x>>6);
  107. n <<= 1; m1<<= 2;
  108. t = m1 + (n<<1)+1;
  109. if ( t<=m2) { m1=t; n++; }
  110. case 6:
  111. m2= (x>>4);
  112. n <<= 1; m1<<= 2;
  113. t = m1 + (n<<1)+1;
  114. if ( t<=m2) { m1=t; n++; }
  115. case 5:
  116. m2= (x>>2);
  117. n <<= 1; m1<<= 2;
  118. t = m1 + (n<<1)+1;
  119. if ( t<=m2) { m1=t; n++; }
  120. case 4:
  121. m2= x;
  122. n <<= 1; m1<<= 2;
  123. t = m1 + (n<<1)+1;
  124. if ( t<=m2) { m1=t; n++; }
  125. case 3:
  126. case 2:
  127. case 1:
  128. case 0:;
  129. }
  130. #else
  131. for (i=bc;i>=4;i--)
  132. {
  133. m2=(x>>(i*2-8));
  134. n <<= 1; m1 <<= 2;
  135. t= m1 + (n<<1)+1;
  136. if ( t<=m2)
  137. { m1=t; n++; }
  138. }
  139. #endif
  140. return n;
  141. }
复制代码
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 09:49:32 | 显示全部楼层
GxQ的代码 好像,并不快啊
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 09:51:26 | 显示全部楼层
GxQ代码的汇编版本 提高50%左右性能
  1. _declspec(naked)
  2. UINT32 __fastcall iSqrt_GxQ_asm( UINT32 n )
  3. {
  4. /*
  5. UINT32 t;
  6. UINT32 ret = 0;
  7. #define SQRT_CORE(x) \
  8. t = ret + (1UL<<(x)); \
  9. ret >>= 1; \
  10. if ( n >= t ) \
  11. { \
  12. n -= t; \
  13. ret += (1UL<<(x)); \
  14. }
  15. SQRT_CORE(30, 28, ..., ..., 0);
  16. */
  17. __asm
  18. {
  19. push ebx //t
  20. push esi
  21. push edi
  22. xor eax, eax //ret
  23. mov ebx, eax
  24. mov edx, 1
  25. shl edx, 30
  26. //30
  27. add ebx, edx
  28. shr eax, 1
  29. mov esi, ecx
  30. sub esi, ebx
  31. mov edi, eax
  32. add edi, edx
  33. cmp ecx, ebx
  34. cmovae ecx, esi
  35. cmovae eax, edi
  36. //28
  37. shr edx, 2
  38. mov ebx, eax
  39. add ebx, edx
  40. shr eax, 1
  41. mov esi, ecx
  42. sub esi, ebx
  43. mov edi, eax
  44. add edi, edx
  45. cmp ecx, ebx
  46. cmovae ecx, esi
  47. cmovae eax, edi
  48. //26
  49. shr edx, 2
  50. mov ebx, eax
  51. add ebx, edx
  52. shr eax, 1
  53. mov esi, ecx
  54. sub esi, ebx
  55. mov edi, eax
  56. add edi, edx
  57. cmp ecx, ebx
  58. cmovae ecx, esi
  59. cmovae eax, edi
  60. //24
  61. shr edx, 2
  62. mov ebx, eax
  63. add ebx, edx
  64. shr eax, 1
  65. mov esi, ecx
  66. sub esi, ebx
  67. mov edi, eax
  68. add edi, edx
  69. cmp ecx, ebx
  70. cmovae ecx, esi
  71. cmovae eax, edi
  72. //22
  73. shr edx, 2
  74. mov ebx, eax
  75. add ebx, edx
  76. shr eax, 1
  77. mov esi, ecx
  78. sub esi, ebx
  79. mov edi, eax
  80. add edi, edx
  81. cmp ecx, ebx
  82. cmovae ecx, esi
  83. cmovae eax, edi
  84. //20
  85. shr edx, 2
  86. mov ebx, eax
  87. add ebx, edx
  88. shr eax, 1
  89. mov esi, ecx
  90. sub esi, ebx
  91. mov edi, eax
  92. add edi, edx
  93. cmp ecx, ebx
  94. cmovae ecx, esi
  95. cmovae eax, edi
  96. //18
  97. shr edx, 2
  98. mov ebx, eax
  99. add ebx, edx
  100. shr eax, 1
  101. mov esi, ecx
  102. sub esi, ebx
  103. mov edi, eax
  104. add edi, edx
  105. cmp ecx, ebx
  106. cmovae ecx, esi
  107. cmovae eax, edi
  108. //16
  109. shr edx, 2
  110. mov ebx, eax
  111. add ebx, edx
  112. shr eax, 1
  113. mov esi, ecx
  114. sub esi, ebx
  115. mov edi, eax
  116. add edi, edx
  117. cmp ecx, ebx
  118. cmovae ecx, esi
  119. cmovae eax, edi
  120. //14
  121. shr edx, 2
  122. mov ebx, eax
  123. add ebx, edx
  124. shr eax, 1
  125. mov esi, ecx
  126. sub esi, ebx
  127. mov edi, eax
  128. add edi, edx
  129. cmp ecx, ebx
  130. cmovae ecx, esi
  131. cmovae eax, edi
  132. //12
  133. shr edx, 2
  134. mov ebx, eax
  135. add ebx, edx
  136. shr eax, 1
  137. mov esi, ecx
  138. sub esi, ebx
  139. mov edi, eax
  140. add edi, edx
  141. cmp ecx, ebx
  142. cmovae ecx, esi
  143. cmovae eax, edi
  144. //10
  145. shr edx, 2
  146. mov ebx, eax
  147. add ebx, edx
  148. shr eax, 1
  149. mov esi, ecx
  150. sub esi, ebx
  151. mov edi, eax
  152. add edi, edx
  153. cmp ecx, ebx
  154. cmovae ecx, esi
  155. cmovae eax, edi
  156. //8
  157. shr edx, 2
  158. mov ebx, eax
  159. add ebx, edx
  160. shr eax, 1
  161. mov esi, ecx
  162. sub esi, ebx
  163. mov edi, eax
  164. add edi, edx
  165. cmp ecx, ebx
  166. cmovae ecx, esi
  167. cmovae eax, edi
  168. //6
  169. shr edx, 2
  170. mov ebx, eax
  171. add ebx, edx
  172. shr eax, 1
  173. mov esi, ecx
  174. sub esi, ebx
  175. mov edi, eax
  176. add edi, edx
  177. cmp ecx, ebx
  178. cmovae ecx, esi
  179. cmovae eax, edi
  180. //4
  181. shr edx, 2
  182. mov ebx, eax
  183. add ebx, edx
  184. shr eax, 1
  185. mov esi, ecx
  186. sub esi, ebx
  187. mov edi, eax
  188. add edi, edx
  189. cmp ecx, ebx
  190. cmovae ecx, esi
  191. cmovae eax, edi
  192. //2
  193. shr edx, 2
  194. mov ebx, eax
  195. add ebx, edx
  196. shr eax, 1
  197. mov esi, ecx
  198. sub esi, ebx
  199. mov edi, eax
  200. add edi, edx
  201. cmp ecx, ebx
  202. cmovae ecx, esi
  203. cmovae eax, edi
  204. //0
  205. shr edx, 2
  206. mov ebx, eax
  207. add ebx, edx
  208. shr eax, 1
  209. mov esi, ecx
  210. sub esi, ebx
  211. mov edi, eax
  212. add edi, edx
  213. cmp ecx, ebx
  214. cmovae ecx, esi
  215. cmovae eax, edi
  216. //ret
  217. pop edi
  218. pop esi
  219. pop ebx
  220. ret
  221. }
  222. }
复制代码
FPU2 Version: 3589.069 ms iSqrt_16 Version: 26237.479 ms iSqrt_GxQ Version: 16068.268 ms iSqrt_GxQ_asm Version: 7884.029 ms
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 10:02:58 | 显示全部楼层
FPU2 Version: 3596.297 ms iSqrt_16 Version: 26235.439 ms iSqrt_GxQ Version: 16177.534 ms iSqrt_GxQ_asm Version: 7868.024 ms UintSqrt2 Version: 21584.053 ms ========================================= 急需各位机器的测试结果 我这里Core 2 XEON 1.6似乎还是浮点方式快的多
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 10:04:42 | 显示全部楼层
我的代码参考46 iSqrt_16在55 iSqrt_GxQ在56 iSqrtGxQ_在63 UintSqrt2在62
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 10:07:47 | 显示全部楼层
测试方法 1、从1循环到2^32-1 2、测试2100000000到2200000000
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
发表于 2009-2-13 10:59:38 | 显示全部楼层
这个主题中,出现了各种不同版本开平方版本,我想对这些版本全部测一遍。 1.首先由作者提出需要测试的程序,在哪一楼,那个函数。 2.某个人负责将这些程序收集起来,做成一个测试程序,打包上传。 3.任何人就可以下载编译并测试了,测试完毕提交测试环境和测试结果。 2点说明: 1. 测试程序应包括正确性测试和性能测试。 2 我的3个整数版本中,运行速度和被开放数的大小有关。 所以我建议,在做性能测试时,增加一个case,测试n=1到65535时的速度。 我先开个头。 我总共编写了6个有效的程序,其中包含一个汇编版,我需要测试的程序有 1. 11楼的 C 整数版 2. 42楼的3个浮点汇编版 3. 61楼的C 整数非牛顿迭代版。
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
 楼主| 发表于 2009-2-13 11:53:31 | 显示全部楼层
我需要测试46, 55, 63 这个测试很耗费时间 我的机器测试46三个版本是160秒左右 55版本是1200秒左右 63版本是330秒 56大概是670秒
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
发表于 2009-2-13 14:23:39 | 显示全部楼层
代码整理中,请稍候。。。
毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
发表于 2009-2-13 14:56:27 | 显示全部楼层
全部函数和测试程序源码包。

Uint32sqrt.rar

8.21 KB, 下载次数: 19, 下载积分: 金币 -1 枚, 经验 1 点, 下载 1 次

毋因群疑而阻独见  毋任己意而废人言
毋私小惠而伤大体  毋借公论以快私情
您需要登录后才可以回帖 登录 | 欢迎注册

本版积分规则

小黑屋|手机版|数学研发网 ( 苏ICP备07505100号 )

GMT+8, 2024-12-22 11:10 , Processed in 0.026134 second(s), 17 queries .

Powered by Discuz! X3.5

© 2001-2024 Discuz! Team.

快速回复 返回顶部 返回列表