你的P4和我的不同 GreaterEqualBit2_24F(314159265)=0x14000000
GreaterEqualBit2_25F(314159265)=0x14000000
GreaterEqualBit2_36F(314159265)=0x14000000
GreaterEqualBit2_57F(314159265)=0x14000000
GreaterEqualBit2_58F(314159265)=0x14000000
GreaterEqualBit2_59F(314159265)=0x14000000
GreaterEqualBit2_75F(314159265)=0x14000000
24# 56449.240 ms
25# 51673.252 ms
36# 49594.482 ms
57# 52741.690 ms
58# 49340.659 ms
59# 52868.823 ms
75# 49191.056 ms
以上是AMD CPU的测试数据,Intel 的等我回家测。。。 无心人编译的程序无法在我的机器上运行,
所以自己写代码编译测试(顺带将86# shshsh_0510 的代码一并测试),
结果如下:GreaterEqualBit2_24F(314159265)=0x14000000
GreaterEqualBit2_25F(314159265)=0x14000000
GreaterEqualBit2_36F(314159265)=0x14000000
GreaterEqualBit2_57F(314159265)=0x14000000
GreaterEqualBit2_58F(314159265)=0x14000000
GreaterEqualBit2_59F(314159265)=0x14000000
GreaterEqualBit2_75F(314159265)=0x14000000
GreaterEqualBit2_86F(314159265)=0x14000000
n: 1-2^31
24#: 25 s
25#: 29 s
36#: 32 s
57#: 26 s
58#: 31 s
59#: 28 s
75#: 24 s
86#: 17 s
Press any key to continue
注:本帖所测代码最后修改于 2009-02-04 20:00 25#可进一步优化为
_declspec(naked)
UINT32 GreaterEqualBit2_25F(UINT32 n)
{
__asm {
mov eax,
mov edx, eax
bsr ecx, edx
btr edx, ecx
bsrecx, edx
jz exit25
btr edx, ecx
cmp edx, 0
je exit25
mov edx, 1
shledx, cl
sub edx, 1
or eax, edx
add eax, 1
exit25:
ret
}
}
==============================
Core2 Xeon 1.6G测试
24# 24303.416 ms
25# 22223.639 ms
36# 26344.195 ms
57# 22893.317 ms
58# 25275.783 ms
59# 24195.312 ms
75# 23295.180 ms
=====================
已经完全超越其他函数
并测试从1-15的结果均对
=======================
代码OK, 重新更新下载去 79#更新了
GxQ如果是以79#测试的, 要重新测试
25#应该时间减少
================================
另外, 25#可以输入0, 并正常产生输出, 结果是0
24也是, 36结果不对, 其他么测, 程序就异常了 49# 的程序我用汇编写了一下
我进行了简单测试,由于比较复杂不知是否正确,但应该差不多吧。
发现之所以慢,是因为功能本身太小,如果代码较长编译器稍稍放松一点就白费劲了。
另外,全部展开太复杂,就偷懒在3,4层将gxq的copy进去了,总之就是想说,不用bsr 未必就不行 :)
第一次写汇编,还请各位多指教///////////////////////////////////////////////////////////////////////////////////////////////////////////////
//m 1-2 ,2 bits :mark
// 0: at least 3 ones
// 1: 2 ones
// 2: 1 one
// 3: 0 one
//x 3-12 ,10 bits:
// when contains at least 2 ones ,then x= upper-one
// eg : 11001 - 100000 ,1001 - 10000 ,0011-0100
//y 13-22 ,10 bits:
// when contains2 ones, y= GreaterEqualBit2(first byte+1)
// eg :1010-1100 ,0011 -0100
//z 23-32 ,10 bits:
// when contains at least 3 ones , it is y =GreaterEqualBit2(first byte) ,so just return z<<24
// eg:0110010 -1000000,101011 -110000
///////////////////////////////////////////////////////////////////////////////////////////////////////////////
const UINT32 Data = {
0xC0000000,0x80000000,0x80000000,0x40401000,0x80000000,0x40801800,0x40802000,0x00800008,
0x80000000,0x41002800,0x41003000,0x0100000C,0x41004000,0x01000010,0x01000010,0x01000010,
0x80000000,0x42004800,0x42005000,0x02000014,0x42006000,0x02000018,0x02000018,0x02000018,
0x42008000,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,
0x80000000,0x44008800,0x44009000,0x04000024,0x4400A000,0x04000028,0x04000028,0x04000028,
0x4400C000,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,
0x44010000,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,
0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,
0x80000000,0x48010800,0x48011000,0x08000044,0x48012000,0x08000048,0x08000048,0x08000048,
0x48014000,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,
0x48018000,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,
0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,
0x48020000,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
0x80000000,0x50020800,0x50021000,0x10000084,0x50022000,0x10000088,0x10000088,0x10000088,
0x50024000,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,
0x50028000,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,
0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,
0x50030000,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
0x50040000,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////
// if(first byte has at least 3 ones)
// {
// get z and return z<<24
// }
// if(first byte has 2 ones)
// {
// if ( other bits areall 0)
// return (first byte)<<24
// else
// get y,return y<<24
// }
// if (first byte has 1 one )
// {
// if(second byte has at least 2 ones)
// return (first byte)<<24 + (x of second byte)<<16
// go to GXQ;
//
// }
// if (first byte has no one ) {
// go to GXQ;
// }
///////////////////////////////////////////////////////////////////////////////////////////////////////////
_declspec(naked) int GreaterEqualBit2_SHSHSH(int n)
{
__asm
{
movecx, dword ptr; //save n to ecx
moveax, ecx;
shreax, 24; //get first byte
shleax, 2;//mul 4
movedx, dword ptr; //get table item
moveax, 0xC0000000; //get mark
andeax, edx;
cmpeax,0;
jne first_byte_l_3_ones;
// at least 3 ones in first byte
moveax, 0x000003FF; //get z 00000000 00000000 00000011 11111111
andeax, edx;
shleax, 24;
ret;
first_byte_l_3_ones:
cmpeax,1;
jne first_byte_l_2_ones;
// exactly 2 ones in first byte
moveax, ecx;
shleax, 8; //other data
cmpeax,0;
jne first_byte_2_ones_plus_sth;
moveax,ecx;
andeax,0xFF000000;
ret;
first_byte_2_ones_plus_sth:
moveax, 0x000FFC00; //get y 00000000 00001111 11111100 00000000
andeax, edx;
shleax, 14;
ret;
first_byte_l_2_ones:
cmpeax,2;
jne return_hardcase;
//1 one,get second byte
moveax, 0x00FF0000;
andeax, ecx;
shreax, 14;
movedx, dword ptr; //get table item
moveax, 0xC0000000; //get mark
andeax, edx;
cmpeax, 0;
jnesecond_byte_l_3_ones;
//second byte has at least 3 ones
moveax, 0x3FF00000; //get x 00111111 11110000 00000000 00000000
andeax, edx;
shreax, 4;
andecx, 0xFF000000;
addeax, ecx;
ret;
second_byte_l_3_ones:
cmpeax, 1;
jnesecond_byte_l_2_ones;
//second byte has 2 ones ,same to above
moveax, 0x3FF00000; //get x 00111111 11110000 00000000 00000000
andeax, edx;
shreax, 4;
andecx, 0xFF000000;
addeax, ecx;
ret;
second_byte_l_2_ones:
//second byte has at most 1 one
jmp return_hardcase;
return_hardcase: //copy from GXQ:)
mov edx, ecx;
mov eax, 1;
bsr ecx, edx;
shl eax, cl;
push eax;
xor edx, eax;
bsr ecx, edx;
mov eax, 0;
setne al;
shl eax, cl;
lea ecx, ;
xor edx, eax;
cmovneeax, ecx;
pop edx;
add eax, edx;
ret;
}
}
怎么说呢
感觉是杂货摊
有点不好啊 目前来说
24, 25, 57, 58, 75代表了几种不同的实现
回复 86# shshsh_0510 的帖子
厉害啊!第一次写就这么强!在 82# 的测试中一举夺冠! :)
明白shshsh_0510代码的意思了
呵呵
属于查表法的变形
变形的很巧妙
不过,是否能简化呢?