mathe
发表于 2008-3-4 08:43:41
根据那个链接,数1的比特位数目应该用查询表还要快一些。(特别对于16比特整数)
gxqcn
发表于 2008-3-4 08:50:41
原帖由 mathe 于 2008-3-4 08:40 发表 http://bbs.emath.ac.cn/images/common/back.gif
我们知道,像下面的代码(来源自http://www.everything2.com/index.pl?node_id=1181258)可以
比较快速计算出一个int中比特1的数目,稍微修改一下,就可以变成计算short中比特1的数目的代码(可能需要4个时钟周期??
通过预先计算后半个short部分比特1的数目,我们应该可以将代码拆分成两个可以并行的部分。...
这个想法好是好,但最后如何并行并不容易啊。。。
不知道CPU是否支持函数级的并行:当两个紧邻的函数名一致、而输入输出没有依赖关系时?
mathe
发表于 2008-3-4 08:58:56
还有,gxqcn的代码可以改成C代码unsigned find(unsigned code, int index[])
{
int i;
int *p=index;
for(i=0;i<32;i++){
*p=i;
p+=(code&(1<<i))!=0;
}
*p=-1;
}看看,当然编译器优化选项要打开(这样循环才会被展开)。
如果当心编译器优化选项打开会影响测试,可以创建保换两个文件的工程,将这部分代码单独放在一个文件中,然后单独对这个文件优化。
mathe
发表于 2008-3-4 09:00:45
函数级的不行,没法做到指令级并行,而线程级并行显然不能使用。
发现你那个代码如果拆分成并行的两部分或者四部分,主要问题在于最后对于那些多余的0有些问题
无心人
发表于 2008-3-4 09:01:43
别想了
目前的X86没并行功能
考虑Cell级别的CPU可能还行
哈 :)
mathe
发表于 2008-3-4 09:10:01
将上面代码用gcc -O3 -funroll-all-loops -S编译出来代码为:
不过不知道为什么前面要做一次跳转指令,估计是为了让内存同Cacheline边界对齐。
如果我们能够保证输入数组空间已经内存对齐,那么这部分指令可以拿掉。(特别对于重复运行多次进行测试的代码,这部分应该拿掉) .file "tta1.c"
.text
.p2align 4,,15
.globl find
.type find, @function
find:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
movl \$1, %esi
pushl %ebx
xorl %ebx, %ebx
subl \$4, %esp
movl 12(%ebp), %eax
movl 8(%ebp), %edi
movl %eax, -16(%ebp)
jmp .L2
.p2align 4,,7
.L20:
movl -16(%ebp), %eax
.L2:
movb %bl, %cl
movl %esi, %edx
sall %cl, %edx
andl %edi, %edx
movl %ebx, (%eax)
cmpl \$1, %edx
movl -16(%ebp), %eax
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %eax, %edx
movl %esi, %eax
leal 1(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 2(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 3(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 4(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 5(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 6(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %eax, %eax
notl %eax
andl \$4, %eax
addl %eax, %edx
movl %esi, %eax
leal 7(%ebx), %ecx
sall %cl, %eax
movl %ecx, (%edx)
movl %eax, %ecx
andl %edi, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
addl \$8, %ebx
notl %ecx
andl \$4, %ecx
cmpl \$32, %ebx
leal (%edx,%ecx), %eax
movl %eax, -16(%ebp)
jne .L20
movl 12(%ebp), %ecx
subl %ecx, -16(%ebp)
sarl \$2, -16(%ebp)
movl -16(%ebp), %eax
popl %edx
popl %ebx
popl %esi
popl %edi
popl %ebp
retgxqcn: 刚才对上述代码进行了重新排版,
1、将所有的美元符前添加“\”,以防止误解析为数学公式
2、将所有的Tab符替换成四个空格,并适当删减空格使之对齐
mathe
发表于 2008-3-4 09:18:43
好像上面编译器展开的代码有点问题,看不大懂,还是直接用手工展开的代码吧:#define STORE(i)*p=i; \
p+=(code&(1<<i))!=0;
unsigned find(unsigned code, int index)
{
int i;
int *p=index;
STORE(0)
STORE(1)
STORE(2)
STORE(3)
STORE(4)
STORE(5)
STORE(6)
STORE(7)
STORE(8)
STORE(9)
STORE(10)
STORE(11)
STORE(12)
STORE(13)
STORE(14)
STORE(15)
STORE(16)
STORE(17)
STORE(18)
STORE(19)
STORE(20)
STORE(21)
STORE(22)
STORE(23)
STORE(24)
STORE(25)
STORE(26)
STORE(27)
STORE(28)
STORE(29)
STORE(30)
STORE(31)
*p=-1;
return p-index;
}对应gcc编译汇编代码为:find:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
pushl %ebx
movl 12(%ebp), %ebx
movl %eax, %edx
movl %eax, %ecx
andl \$2, %edx
andl \$1, %ecx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
leal (%ebx,%ecx,4), %ecx
andl \$4, %edx
movl \$0, (%ebx)
addl %ecx, %edx
movl \$1, (%ecx)
movl %eax, %ecx
andl \$4, %ecx
movl \$2, (%edx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$3, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$4, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$32, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$5, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$64, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$6, (%ecx)
addl %ecx, %edx
movb %al, %cl
andb \$-128, %cl
cmpb \$1, %cl
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$7, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$256, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$8, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$512, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$9, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1024, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$10, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$2048, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$11, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$4096, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$12, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8192, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$13, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16384, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$14, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$32768, %ecx
cmpw \$1, %cx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$15, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$65536, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$16, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$131072, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$17, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$262144, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$18, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$524288, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$19, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1048576, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$20, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$2097152, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$21, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$4194304, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$22, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$8388608, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$23, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$16777216, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
movl \$24, (%ecx)
addl %ecx, %edx
movl %eax, %ecx
andl \$33554432, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$25, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$67108864, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %ecx, %edx
movl \$26, (%ecx)
movl %eax, %ecx
andl \$134217728, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
addl %edx, %ecx
movl \$27, (%edx)
movl %eax, %edx
andl \$268435456, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
addl %ecx, %edx
movl \$28, (%ecx)
movl %eax, %ecx
andl \$536870912, %ecx
cmpl \$1, %ecx
sbbl %ecx, %ecx
notl %ecx
andl \$4, %ecx
movl \$29, (%edx)
addl %edx, %ecx
movl %eax, %edx
andl \$1073741824, %edx
cmpl \$1, %edx
sbbl %edx, %edx
notl %edx
andl \$4, %edx
sarl \$31, %eax
addl %ecx, %edx
andl \$4, %eax
addl %edx, %eax
movl \$30, (%ecx)
movl \$31, (%edx)
movl \$-1, (%eax)
subl %ebx, %eax
popl %ebx
sarl \$2, %eax
popl %ebp
ret对应
gxqcn
发表于 2008-3-4 09:33:54
注意:我对16#、17#帖子进行过编辑排版。
似乎楼上的汇编指令数比我那版多了不少。:lol
mathe
发表于 2008-3-4 09:54:31
指令数目并不说明任何问题,我觉得编译器这么选择应该有其理由,毕竟这种细节问题人通常没有编译器考虑得好
无心人
发表于 2008-3-10 10:06:37
总感觉不对劲
终于觉醒了
似乎我是测试的1000万, 你们测试的是100万
把GXQ代码和我代码聚集在一起
终于比较出来了
还是我的快
我冤啊