二进制32位整数快速平方根

liangbch · 发表于 2009-2-13 09:38:58

我也利用54楼的思想写了一个函数。这个函数包含2种做法，当定义了 LOOP_UNROLL，使用swich case 技巧做了循环展开，速度应该更快。当LOOP_UNROLL未定义，使用常规的循环算法，楼主可否测试一下这个版本的速度。下面给出源代码。

typedef unsigned long DWORD;
typedef unsigned char BYTE;
inline DWORD log2(DWORD n)
{
_asm
{
mov ecx,n
bsr eax,ecx
}
}
const unsigned char sqrtTab2[]=
{
0, 1, 1, 1, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4,
4, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6,
6, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8,
8, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10,
10, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12,
12, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15
};
const unsigned char sqrTab2[]=
{
0, 1, 4, 9, 16, 25, 36, 49,
64, 81, 100,121,144,169,196,225
};
#define LOOP_UNROLL
extern "C"
DWORD __fastcall UintSqrt2(DWORD x)
{
DWORD bc,m1,m2,n,t,i;
if (x<256)
return sqrtTab2[x];
bc=log2(x)/2;
m2=(x>>(bc*2-6));
n=sqrtTab2[m2];
m1=sqrTab2[n];
#ifdef LOOP_UNROLL
switch(bc)
{
case 15:
m2= (x>>22);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 14:
m2= (x>>20);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 13:
m2= (x>>18);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 12:
m2= (x>>16);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 11:
m2= (x>>14);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 10:
m2= (x>>12);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 9:
m2= (x>>10);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 8:
m2= (x>>8);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 7:
m2= (x>>6);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 6:
m2= (x>>4);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 5:
m2= (x>>2);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 4:
m2= x;
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 3:
case 2:
case 1:
case 0:;
}
#else
for (i=bc;i>=4;i--)
{
m2=(x>>(i*2-8));
n <<= 1; m1 <<= 2;
t= m1 + (n<<1)+1;
if ( t<=m2)
{ m1=t; n++; }
}
#endif
return n;
}

复制代码

无心人 · 发表于 2009-2-13 09:49:32

GxQ的代码好像，并不快啊

无心人 · 发表于 2009-2-13 09:51:26

GxQ代码的汇编版本提高50%左右性能

_declspec(naked)
UINT32 __fastcall iSqrt_GxQ_asm( UINT32 n )
{
/*
UINT32 t;
UINT32 ret = 0;
#define SQRT_CORE(x) \
t = ret + (1UL<<(x)); \
ret >>= 1; \
if ( n >= t ) \
{ \
n -= t; \
ret += (1UL<<(x)); \
}
SQRT_CORE(30, 28, ..., ..., 0);
*/
__asm
{
push ebx //t
push esi
push edi
xor eax, eax //ret
mov ebx, eax
mov edx, 1
shl edx, 30
//30
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//28
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//26
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//24
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//22
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//20
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//18
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//16
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//14
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//12
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//10
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//8
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//6
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//4
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//2
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//0
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//ret
pop edi
pop esi
pop ebx
ret
}
}

复制代码

FPU2 Version: 3589.069 ms iSqrt_16 Version: 26237.479 ms iSqrt_GxQ Version: 16068.268 ms iSqrt_GxQ_asm Version: 7884.029 ms

无心人 · 发表于 2009-2-13 10:02:58

FPU2 Version: 3596.297 ms iSqrt_16 Version: 26235.439 ms iSqrt_GxQ Version: 16177.534 ms iSqrt_GxQ_asm Version: 7868.024 ms UintSqrt2 Version: 21584.053 ms ========================================= 急需各位机器的测试结果我这里Core 2 XEON 1.6似乎还是浮点方式快的多

无心人 · 发表于 2009-2-13 10:04:42

我的代码参考46 iSqrt_16在55 iSqrt_GxQ在56 iSqrtGxQ_在63 UintSqrt2在62

无心人 · 发表于 2009-2-13 10:07:47

测试方法 1、从1循环到2^32-1 2、测试2100000000到2200000000

liangbch · 发表于 2009-2-13 10:59:38

这个主题中，出现了各种不同版本开平方版本，我想对这些版本全部测一遍。 1.首先由作者提出需要测试的程序，在哪一楼，那个函数。 2.某个人负责将这些程序收集起来，做成一个测试程序，打包上传。 3.任何人就可以下载编译并测试了，测试完毕提交测试环境和测试结果。 2点说明： 1. 测试程序应包括正确性测试和性能测试。 2 我的3个整数版本中，运行速度和被开放数的大小有关。所以我建议，在做性能测试时，增加一个case,测试n=1到65535时的速度。我先开个头。我总共编写了6个有效的程序，其中包含一个汇编版，我需要测试的程序有 1. 11楼的 C 整数版 2. 42楼的3个浮点汇编版 3. 61楼的C 整数非牛顿迭代版。

无心人 · 发表于 2009-2-13 11:53:31

我需要测试46， 55， 63 这个测试很耗费时间我的机器测试46三个版本是160秒左右 55版本是1200秒左右 63版本是330秒 56大概是670秒

liangbch · 发表于 2009-2-13 14:23:39

代码整理中，请稍候。。。

liangbch · 发表于 2009-2-13 14:56:27

全部函数和测试程序源码包。

账号		自动登录	找回密码
密码			欢迎注册

[讨论] 二进制32位整数快速平方根