liangbch
发表于 2009-2-13 09:38:58
我也利用54楼的思想写了一个函数。
这个函数包含2种做法,当 定义了 LOOP_UNROLL, 使用swich case 技巧做了循环展开,速度应该更快。当LOOP_UNROLL未定义,使用常规的循环算法,楼主可否测试一下这个版本的速度。
下面给出源代码。
typedef unsigned long DWORD;
typedef unsigned char BYTE;
inline DWORD log2(DWORD n)
{
_asm
{
mov ecx,n
bsr eax,ecx
}
}
const unsigned char sqrtTab2[]=
{
0,1,1,1,2,2,2,2,
2,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4,
4,5,5,5,5,5,5,5,
5,5,5,5,6,6,6,6,
6,6,6,6,6,6,6,6,
6,7,7,7,7,7,7,7,
7,7,7,7,7,7,7,7,
8,8,8,8,8,8,8,8,
8,8,8,8,8,8,8,8,
8,9,9,9,9,9,9,9,
9,9,9,9,9,9,9,9,
9,9,9,9, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10,
10, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12,
12, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14,
14, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15
};
const unsigned char sqrTab2[]=
{
0, 1, 4, 9, 16, 25, 36, 49,
64, 81, 100,121,144,169,196,225
};
#define LOOP_UNROLL
extern "C"
DWORD __fastcall UintSqrt2(DWORD x)
{
DWORD bc,m1,m2,n,t,i;
if (x<256)
return sqrtTab2;
bc=log2(x)/2;
m2=(x>>(bc*2-6));
n=sqrtTab2;
m1=sqrTab2;
#ifdef LOOP_UNROLL
switch(bc)
{
case 15:
m2=(x>>22);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 14:
m2=(x>>20);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 13:
m2=(x>>18);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 12:
m2=(x>>16);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 11:
m2=(x>>14);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 10:
m2=(x>>12);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 9:
m2=(x>>10);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 8:
m2=(x>>8);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 7:
m2=(x>>6);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 6:
m2=(x>>4);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 5:
m2=(x>>2);
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 4:
m2=x;
n <<= 1; m1<<= 2;
t = m1 + (n<<1)+1;
if ( t<=m2) { m1=t; n++; }
case 3:
case 2:
case 1:
case 0:;
}
#else
for (i=bc;i>=4;i--)
{
m2=(x>>(i*2-8));
n<<= 1; m1 <<= 2;
t= m1 + (n<<1)+1;
if ( t<=m2)
{ m1=t; n++; }
}
#endif
return n;
}
无心人
发表于 2009-2-13 09:49:32
:)
GxQ的代码
好像,并不快啊
无心人
发表于 2009-2-13 09:51:26
GxQ代码的汇编版本
提高50%左右性能
_declspec(naked)
UINT32 __fastcall iSqrt_GxQ_asm( UINT32 n )
{
/*
UINT32 t;
UINT32 ret = 0;
#define SQRT_CORE(x) \
t = ret + (1UL<<(x)); \
ret >>= 1; \
if ( n >= t ) \
{ \
n -= t; \
ret += (1UL<<(x));\
}
SQRT_CORE(30, 28, ..., ..., 0);
*/
__asm
{
push ebx//t
push esi
push edi
xor eax, eax //ret
mov ebx, eax
mov edx, 1
shl edx, 30
//30
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//28
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//26
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//24
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//22
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//20
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//18
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//16
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//14
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//12
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//10
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//8
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//6
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//4
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//2
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//0
shr edx, 2
mov ebx, eax
add ebx, edx
shr eax, 1
mov esi, ecx
sub esi, ebx
mov edi, eax
add edi, edx
cmp ecx, ebx
cmovae ecx, esi
cmovae eax, edi
//ret
pop edi
pop esi
pop ebx
ret
}
}
FPU2 Version: 3589.069 ms
iSqrt_16 Version: 26237.479 ms
iSqrt_GxQ Version: 16068.268 ms
iSqrt_GxQ_asm Version: 7884.029 ms
无心人
发表于 2009-2-13 10:02:58
FPU2 Version: 3596.297 ms
iSqrt_16 Version: 26235.439 ms
iSqrt_GxQ Version: 16177.534 ms
iSqrt_GxQ_asm Version: 7868.024 ms
UintSqrt2 Version: 21584.053 ms
=========================================
急需各位机器的测试结果
我这里Core 2 XEON 1.6似乎还是浮点方式快的多
无心人
发表于 2009-2-13 10:04:42
我的代码参考46
iSqrt_16在55
iSqrt_GxQ在56
iSqrtGxQ_在63
UintSqrt2在62
无心人
发表于 2009-2-13 10:07:47
测试方法
1、从1循环到2^32-1
2、测试2100000000到2200000000
liangbch
发表于 2009-2-13 10:59:38
这个主题中,出现了各种不同版本开平方版本,我想对这些版本全部测一遍。
1.首先由作者提出需要测试的程序,在哪一楼,那个函数。
2.某个人负责将这些程序收集起来,做成一个测试程序,打包上传。
3.任何人就可以下载编译并测试了,测试完毕提交测试环境和测试结果。
2点说明:
1. 测试程序应包括正确性测试和性能测试。
2我的3个整数版本中,运行速度和被开放数的大小有关。 所以我建议,在做性能测试时,增加一个case,测试n=1到65535时的速度。
我先开个头。
我总共编写了6个有效的程序,其中包含一个汇编版,我需要测试的程序有
1.11楼的 C 整数版
2.42楼的3个浮点汇编版
3.61楼的C 整数非牛顿迭代版。
无心人
发表于 2009-2-13 11:53:31
我需要测试46, 55, 63
这个测试很耗费时间
我的机器测试46三个版本是160秒左右
55版本是1200秒左右
63版本是330秒
56大概是670秒
liangbch
发表于 2009-2-13 14:23:39
代码整理中,请稍候。。。
liangbch
发表于 2009-2-13 14:56:27
全部函数和测试程序源码包。
页:
1
2
3
4
5
6
[7]
8
9
10
11
12
13
14
15