一个快速得到bit位索引的算法 - 第2页 - 算法交流 - 数学研发论坛

mathe 发表于 2008-3-4 08:43:41

根据那个链接，数1的比特位数目应该用查询表还要快一些。（特别对于16比特整数）

gxqcn 发表于 2008-3-4 08:50:41

原帖由 mathe 于 2008-3-4 08:40 发表 http://bbs.emath.ac.cn/images/common/back.gif
我们知道,像下面的代码（来源自http://www.everything2.com/index.pl?node_id=1181258）可以
比较快速计算出一个int中比特1的数目，稍微修改一下，就可以变成计算short中比特1的数目的代码（可能需要4个时钟周期？？
通过预先计算后半个short部分比特1的数目，我们应该可以将代码拆分成两个可以并行的部分。...

这个想法好是好，但最后如何并行并不容易啊。。。

不知道CPU是否支持函数级的并行：当两个紧邻的函数名一致、而输入输出没有依赖关系时？

mathe 发表于 2008-3-4 08:58:56

还有，gxqcn的代码可以改成C代码unsigned find(unsigned code, int index[])
{
int i;
int *p=index;
for(i=0;i<32;i++){
*p=i;
p+=(code&(1<<i))!=0;
}
*p=-1;
}看看，当然编译器优化选项要打开（这样循环才会被展开）。
如果当心编译器优化选项打开会影响测试，可以创建保换两个文件的工程，将这部分代码单独放在一个文件中，然后单独对这个文件优化。

mathe 发表于 2008-3-4 09:00:45

函数级的不行，没法做到指令级并行，而线程级并行显然不能使用。
发现你那个代码如果拆分成并行的两部分或者四部分，主要问题在于最后对于那些多余的0有些问题

无心人 发表于 2008-3-4 09:01:43

别想了
目前的X86没并行功能
考虑Cell级别的CPU可能还行

哈 :)

mathe 发表于 2008-3-4 09:10:01

将上面代码用gcc -O3 -funroll-all-loops -S编译出来代码为:
不过不知道为什么前面要做一次跳转指令，估计是为了让内存同Cacheline边界对齐。
如果我们能够保证输入数组空间已经内存对齐，那么这部分指令可以拿掉。（特别对于重复运行多次进行测试的代码，这部分应该拿掉） .file "tta1.c"
.text
.p2align 4,,15
.globl find
.type find, @function
find:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
movl \$1, %esi
pushl %ebx
xorl %ebx, %ebx
subl \$4, %esp
movl 12(%ebp), %eax
movl 8(%ebp), %edi
movl %eax, -16(%ebp)
jmp .L2
.p2align 4,,7
.L20:
movl -16(%ebp), %eax
.L2:
movb %bl, %cl
movl %esi, %edx
sall %cl, %edx
andl %edi, %edx
movl %ebx, (%eax)
cmpl \$1, %edx
movl -16(%ebp), %eax
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %eax, %edx
movl %esi, %eax
leal 1(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 2(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 3(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 4(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 5(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 6(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 7(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
addl \$8, %ebx
notl %ecx
andl \$4, %ecx
cmpl \$32, %ebx
leal (%edx,%ecx), %eax
movl %eax, -16(%ebp)
jne .L20
movl 12(%ebp), %ecx
subl %ecx, -16(%ebp)
sarl \$2, -16(%ebp)
movl -16(%ebp), %eax
popl %edx
popl %ebx
popl %esi
popl %edi
popl %ebp
retgxqcn: 刚才对上述代码进行了重新排版，
1、将所有的美元符前添加“\”，以防止误解析为数学公式
2、将所有的Tab符替换成四个空格，并适当删减空格使之对齐

mathe 发表于 2008-3-4 09:18:43

好像上面编译器展开的代码有点问题，看不大懂，还是直接用手工展开的代码吧:#define STORE(i)*p=i; \
               p+=(code&(1<<i))!=0;

unsigned find(unsigned code, int index)
{
int i;
int *p=index;

   STORE(0)
   STORE(1)
   STORE(2)
   STORE(3)
   STORE(4)
   STORE(5)
   STORE(6)
   STORE(7)
   STORE(8)
   STORE(9)
   STORE(10)
   STORE(11)
   STORE(12)
   STORE(13)
   STORE(14)
   STORE(15)
   STORE(16)
   STORE(17)
   STORE(18)
   STORE(19)
   STORE(20)
   STORE(21)
   STORE(22)
   STORE(23)
   STORE(24)
   STORE(25)
   STORE(26)
   STORE(27)
   STORE(28)
   STORE(29)
   STORE(30)
   STORE(31)
*p=-1;
return p-index;
}对应gcc编译汇编代码为:find:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
pushl %ebx
movl 12(%ebp), %ebx
movl %eax, %edx
movl %eax, %ecx
andl \$2, %edx
andl \$1, %ecx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
leal (%ebx,%ecx,4), %ecx
andl \$4, %edx
movl \$0, (%ebx)
addl %ecx, %edx
movl \$1, (%ecx)
movl %eax, %ecx
andl \$4, %ecx
movl \$2, (%edx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$3, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$4, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$32, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$5, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$64, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$6, (%ecx)
addl %ecx, %edx
movb %al, %cl
andb \$-128, %cl
cmpb \$1, %cl
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$7, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$256, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$8, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$512, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$9, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1024, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$10, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$2048, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$11, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$4096, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$12, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8192, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$13, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16384, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$14, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$32768, %ecx
cmpw \$1, %cx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$15, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$65536, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$16, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$131072, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$17, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$262144, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$18, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$524288, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$19, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1048576, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$20, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$2097152, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$21, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$4194304, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$22, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8388608, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$23, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16777216, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$24, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$33554432, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$25, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$67108864, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %ecx, %edx
movl \$26, (%ecx)
movl %eax, %ecx
andl \$134217728, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
addl %edx, %ecx
movl \$27, (%edx)
movl %eax, %edx
andl \$268435456, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %ecx, %edx
movl \$28, (%ecx)
movl %eax, %ecx
andl \$536870912, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$29, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1073741824, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
sarl \$31, %eax
addl %ecx, %edx
andl \$4, %eax
addl %edx, %eax
movl \$30, (%ecx)
movl \$31, (%edx)
movl \$-1, (%eax)
subl %ebx, %eax
popl %ebx
sarl \$2, %eax
popl %ebp
ret对应

gxqcn 发表于 2008-3-4 09:33:54

注意：我对16#、17#帖子进行过编辑排版。

似乎楼上的汇编指令数比我那版多了不少。:lol

mathe 发表于 2008-3-4 09:54:31

指令数目并不说明任何问题，我觉得编译器这么选择应该有其理由，毕竟这种细节问题人通常没有编译器考虑得好

无心人 发表于 2008-3-10 10:06:37

总感觉不对劲

终于觉醒了
似乎我是测试的1000万, 你们测试的是100万
把GXQ代码和我代码聚集在一起

终于比较出来了
还是我的快

我冤啊

页: 1 [2] 3 4 5 6 7

数学研发论坛's Archiver