我也利用54楼的思想写了一个函数。
这个函数包含两种做法:当定义了 LOOP_UNROLL 时,使用 switch-case 技巧做循环展开,速度应该更快;当 LOOP_UNROLL 未定义时,使用常规的循环算法。楼主可否测试一下这个版本的速度?
下面给出源代码。
-
/*
 * Win32-style integer aliases used by the square-root routines below.
 *
 * NOTE(review): DWORD is `unsigned long`, which is 32 bits with MSVC but
 * 64 bits on LP64 platforms (Linux/macOS x86-64) -- confirm which width the
 * callers expect before porting.
 */
typedef unsigned long DWORD;
typedef unsigned char BYTE;
-
- inline DWORD log2(DWORD n)
- {
- _asm
- {
- mov ecx,n
- bsr eax,ecx
- }
- }
-
/*
 * sqrtTab2[i] == floor(sqrt(i)) for every i in 0..255.
 * Sixteen entries per row; the comment on each row gives its index range.
 */
const unsigned char sqrtTab2[256] =
{
     0,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3, /*   0.. 15 */
     4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5, /*  16.. 31 */
     5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, /*  32.. 47 */
     6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, /*  48.. 63 */
     8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, /*  64.. 79 */
     8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, /*  80.. 95 */
     9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, /*  96..111 */
    10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, /* 112..127 */
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, /* 128..143 */
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, /* 144..159 */
    12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, /* 160..175 */
    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, /* 176..191 */
    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, /* 192..207 */
    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, /* 208..223 */
    14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, /* 224..239 */
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15  /* 240..255 */
};
-
/*
 * sqrTab2[r] == r * r for r in 0..15: the squares of every root value that
 * sqrtTab2 can produce. The largest entry (225) still fits in one byte.
 */
const unsigned char sqrTab2[16] =
{
      0,   1,   4,   9,  16,  25,  36,  49,
     64,  81, 100, 121, 144, 169, 196, 225
};
-
-
- #define LOOP_UNROLL
- extern "C"
- DWORD __fastcall UintSqrt2(DWORD x)
- {
- DWORD bc,m1,m2,n,t,i;
-
- if (x<256)
- return sqrtTab2[x];
-
- bc=log2(x)/2;
- m2=(x>>(bc*2-6));
- n=sqrtTab2[m2];
- m1=sqrTab2[n];
-
- #ifdef LOOP_UNROLL
- switch(bc)
- {
- case 15:
- m2= (x>>22);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 14:
- m2= (x>>20);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 13:
- m2= (x>>18);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 12:
- m2= (x>>16);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 11:
- m2= (x>>14);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 10:
- m2= (x>>12);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 9:
- m2= (x>>10);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 8:
- m2= (x>>8);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 7:
- m2= (x>>6);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 6:
- m2= (x>>4);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 5:
- m2= (x>>2);
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 4:
- m2= x;
- n <<= 1; m1<<= 2;
- t = m1 + (n<<1)+1;
- if ( t<=m2) { m1=t; n++; }
- case 3:
- case 2:
- case 1:
- case 0:;
- }
- #else
- for (i=bc;i>=4;i--)
- {
- m2=(x>>(i*2-8));
- n <<= 1; m1 <<= 2;
- t= m1 + (n<<1)+1;
- if ( t<=m2)
- { m1=t; n++; }
- }
- #endif
- return n;
- }
复制代码 |