无心人 发表于 2009-2-4 16:47:31

希望能得到GxQ机器的结果
你的P4和我的不同

gxqcn 发表于 2009-2-4 17:06:27

GreaterEqualBit2_24F(314159265)=0x14000000
GreaterEqualBit2_25F(314159265)=0x14000000
GreaterEqualBit2_36F(314159265)=0x14000000
GreaterEqualBit2_57F(314159265)=0x14000000
GreaterEqualBit2_58F(314159265)=0x14000000
GreaterEqualBit2_59F(314159265)=0x14000000
GreaterEqualBit2_75F(314159265)=0x14000000
24# 56449.240 ms
25# 51673.252 ms
36# 49594.482 ms
57# 52741.690 ms
58# 49340.659 ms
59# 52868.823 ms
75# 49191.056 ms

以上是AMD CPU的测试数据,Intel 的等我回家测。。。

gxqcn 发表于 2009-2-4 17:07:11

无心人编译的程序无法在我的机器上运行,
所以自己写代码编译测试(顺带将86# shshsh_0510 的代码一并测试),
结果如下:GreaterEqualBit2_24F(314159265)=0x14000000
GreaterEqualBit2_25F(314159265)=0x14000000
GreaterEqualBit2_36F(314159265)=0x14000000
GreaterEqualBit2_57F(314159265)=0x14000000
GreaterEqualBit2_58F(314159265)=0x14000000
GreaterEqualBit2_59F(314159265)=0x14000000
GreaterEqualBit2_75F(314159265)=0x14000000
GreaterEqualBit2_86F(314159265)=0x14000000


n: 1-2^31

24#: 25 s
25#: 29 s
36#: 32 s
57#: 26 s
58#: 31 s
59#: 28 s
75#: 24 s
86#: 17 s
Press any key to continue

注:本帖所测代码最后修改于 2009-02-04 20:00

无心人 发表于 2009-2-4 17:08:08

25#可进一步优化为

_declspec(naked)
UINT32 GreaterEqualBit2_25F(UINT32 n)
{
__asm {
mov eax,
    mov edx, eax
    bsr ecx, edx
    btr edx, ecx
    bsrecx, edx
    jz exit25
    btr edx, ecx
    cmp edx, 0
    je exit25
    mov edx, 1
    shledx, cl
    sub edx, 1
    or eax, edx
    add eax, 1
exit25:
ret
    }
}

==============================
Core2 Xeon 1.6G测试
24# 24303.416 ms
25# 22223.639 ms
36# 26344.195 ms
57# 22893.317 ms
58# 25275.783 ms
59# 24195.312 ms
75# 23295.180 ms
=====================
已经完全超越其他函数
并测试从1-15的结果均对
=======================
代码OK, 重新更新下载去

无心人 发表于 2009-2-4 17:16:49

79#更新了

GxQ如果是以79#测试的, 要重新测试

25#应该时间减少
================================
另外, 25#可以输入0, 并正常产生输出, 结果是0
24也是, 36结果不对, 其他么测, 程序就异常了

shshsh_0510 发表于 2009-2-4 17:18:09

49# 的程序我用汇编写了一下
我进行了简单测试,由于比较复杂不知是否正确,但应该差不多吧。
发现之所以慢,是因为功能本身太小,如果代码较长编译器稍稍放松一点就白费劲了。
另外,全部展开太复杂,就偷懒在3,4层将gxq的copy进去了,总之就是想说,不用bsr 未必就不行 :)
第一次写汇编,还请各位多指教///////////////////////////////////////////////////////////////////////////////////////////////////////////////
//m 1-2 ,2 bits :mark
//         0: at least 3 ones
//         1: 2 ones
//         2: 1 one
//         3: 0 one
//x 3-12 ,10 bits:
//          when contains at least 2 ones ,then x= upper-one
//          eg : 11001 - 100000 ,1001 - 10000 ,0011-0100
//y 13-22 ,10 bits:
//          when contains2 ones, y= GreaterEqualBit2(first byte+1)
//          eg   :1010-1100 ,0011 -0100
//z 23-32 ,10 bits:
//          when contains at least 3 ones , it is y =GreaterEqualBit2(first byte) ,so just return z<<24
//          eg:0110010 -1000000,101011 -110000
///////////////////////////////////////////////////////////////////////////////////////////////////////////////

const UINT32 Data = {
        0xC0000000,0x80000000,0x80000000,0x40401000,0x80000000,0x40801800,0x40802000,0x00800008,
        0x80000000,0x41002800,0x41003000,0x0100000C,0x41004000,0x01000010,0x01000010,0x01000010,
        0x80000000,0x42004800,0x42005000,0x02000014,0x42006000,0x02000018,0x02000018,0x02000018,
        0x42008000,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,0x02000020,
        0x80000000,0x44008800,0x44009000,0x04000024,0x4400A000,0x04000028,0x04000028,0x04000028,
        0x4400C000,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,0x04000030,
        0x44010000,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,
        0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,0x04000040,
        0x80000000,0x48010800,0x48011000,0x08000044,0x48012000,0x08000048,0x08000048,0x08000048,
        0x48014000,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,0x08000050,
        0x48018000,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,
        0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,0x08000060,
        0x48020000,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
        0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
        0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
        0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,0x08000080,
        0x80000000,0x50020800,0x50021000,0x10000084,0x50022000,0x10000088,0x10000088,0x10000088,
        0x50024000,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,0x10000090,
        0x50028000,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,
        0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,0x100000A0,
        0x50030000,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
        0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
        0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
        0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,0x100000C0,
        0x50040000,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,
        0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100,0x10000100
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////
//        if(first byte has at least 3 ones)
//        {
//               get z and return z<<24
//        }
//        if(first byte has 2 ones)
//        {
//               if ( other bits areall 0)
//                          return (first byte)<<24
//               else
//                          get y,return y<<24
//        }
//        if (first byte has 1 one )
//        {
//               if(second byte has at least 2 ones)
//                       return (first byte)<<24 + (x of second byte)<<16
//               go to GXQ;
//                          
//        }
//        if (first byte has no one ) {
//               go to GXQ;
//        }
///////////////////////////////////////////////////////////////////////////////////////////////////////////
_declspec(naked) int GreaterEqualBit2_SHSHSH(int n)
{
        __asm
        {
                movecx, dword ptr; //save n to ecx
                moveax, ecx;
                shreax, 24; //get first byte
                shleax, 2;//mul 4
                movedx, dword ptr; //get table item
               
                moveax, 0xC0000000; //get mark
                andeax, edx;

                cmpeax,0;
                jne   first_byte_l_3_ones;
                // at least 3 ones in first byte
                moveax, 0x000003FF; //get z 00000000 00000000 00000011 11111111
                andeax, edx;
                shleax, 24;
                ret;

first_byte_l_3_ones:
                cmpeax,1;   
                jne   first_byte_l_2_ones;
                // exactly 2 ones in first byte
                moveax, ecx;
                shleax, 8; //other data
                cmpeax,0;
                jne   first_byte_2_ones_plus_sth;
                moveax,ecx;
                andeax,0xFF000000;
                ret;
first_byte_2_ones_plus_sth:
                moveax, 0x000FFC00; //get y 00000000 00001111 11111100 00000000
                andeax, edx;
                shleax, 14;
                ret;
first_byte_l_2_ones:
                cmpeax,2;
                jne return_hardcase;
                //1 one,get second byte
                moveax, 0x00FF0000;
                andeax, ecx;
                shreax, 14;
                movedx, dword ptr; //get table item

                moveax, 0xC0000000; //get mark
                andeax, edx;
               
                cmpeax, 0;
                jnesecond_byte_l_3_ones;
                //second byte has at least 3 ones
                moveax, 0x3FF00000; //get x 00111111 11110000 00000000 00000000
                andeax, edx;
                shreax, 4;
                andecx, 0xFF000000;
                addeax, ecx;
                ret;
second_byte_l_3_ones:
                cmpeax, 1;
                jnesecond_byte_l_2_ones;
                //second byte has 2 ones ,same to above
                moveax, 0x3FF00000; //get x 00111111 11110000 00000000 00000000
                andeax, edx;
                shreax, 4;
                andecx, 0xFF000000;
                addeax, ecx;
                ret;
second_byte_l_2_ones:
                //second byte has at most 1 one
                jmp return_hardcase;

return_hardcase: //copy from GXQ:)
      mov   edx, ecx;
      mov   eax, 1;
      bsr   ecx, edx;
      shl   eax, cl;
      push    eax;
      xor   edx, eax;
      bsr   ecx, edx;
      mov   eax, 0;
      setne   al;
      shl   eax, cl;
      lea   ecx, ;
      xor   edx, eax;
      cmovneeax, ecx;
      pop   edx;
      add   eax, edx;
      ret;
        }
}

无心人 发表于 2009-2-4 17:23:13

怎么说呢

感觉是杂货摊
有点不好啊

无心人 发表于 2009-2-4 17:26:49

目前来说

24, 25, 57, 58, 75代表了几种不同的实现

gxqcn 发表于 2009-2-4 20:05:20

回复 86# shshsh_0510 的帖子

厉害啊!第一次写就这么强!
在 82# 的测试中一举夺冠!

无心人 发表于 2009-2-4 20:21:40

:)

明白shshsh_0510代码的意思了
呵呵

属于查表法的变形
变形的很巧妙

不过,是否能简化呢?
页: 1 2 3 4 5 6 7 8 [9] 10 11
查看完整版本: 求bit位为1数目不超出2的正整数