织女的六把黄金尺子问题 - 第4页 - 编程擂台 - 数学研发论坛

liangbch 发表于 2014-12-18 08:19:46

本楼是讨论汇编优化的，不喜欢汇编的人请忽略本楼。
在28#的代码中，“if ( g_used [ i ] )    continue” 这句是用来查找一个值为0的元素，这使我立即想起c运行时刻函数strlen,这个函数也是用来查找值为0的元素，不过其数组是char型而不是整型。如果将这段代码改为调用strlen函数，效果如何呢？说干就干，于是我又写了一版调用函数strlen的版本。另外，我有加了2个strlen函数的汇编的版本。源代码是来自于丹麦的Anger Fog。下面是调用strlen函数的版本。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>

/* 说明：
我们定义尺子编号为i,(i>=0), 尺子i总是有i+1段。
我们首先搜索编号最大的尺子，然后搜索编号次小的尺子，最后搜索编号为0的尺子，在本程序尺子编号用rule_idx来表示
*/

#define MAX_RULES_COUNT 8 //最多可搜索8把尺子
#define MAX_SEG_COUNT120 //最多可覆盖120种不同的长度
char g_used; //used=1表示长度i可被尺子测量
int g_maxSegArr[]={1,4,10,20,35,56,84,120}; //maxSegArr表示尺子i可测量的不同长度的总数
int g_result; // result表示尺子i第j+1段的长度
int g_count;

// The asm code is from Anger Fog "optimization_manuals"
_declspec(naked)
int anger_strlen( char *s )
{
__asm
{
push ebx
   mov ecx,       ; get pointer to string
   mov eax, ecx             ; copy pointer
   and ecx, 3             ; lower 2 bits of address, check alignment
   jz    L2                   ; string is aligned by 4. Go to loop
   and eax, -4             ; align pointer by 4
   mov ebx,          ; read from nearest preceding boundary
   shl ecx, 3             ; mul by 8 = displacement in bits
   mov edx, -1
   shl edx, cl             ; make byte mask
   not edx                ; mask = 0FFH for false bytes
   or    ebx, edx             ; mask out false bytes

   ; check first four bytes for zero
   lea ecx, ; subtract 1 from each byte
   not ebx                ; invert all bytes
   and ecx, ebx             ; and these two
   and ecx, 80808080H       ; test all sign bits
   jnz L3                   ; zero-byte found

   ; Main loop, read 4 bytes aligned
L1: add eax, 4             ; increment pointer by 4
L2: mov ebx,          ; read 4 bytes of string
   lea ecx, ; subtract 1 from each byte
   not ebx                ; invert all bytes
   and ecx, ebx             ; and these two
   and ecx, 80808080H       ; test all sign bits
   jz    L1                   ; no zero bytes, continue loop

L3: bsf ecx, ecx             ; find right-most 1-bit
   shr ecx, 3             ; divide by 8 = byte index
   sub eax,       ; subtract start address
   add eax, ecx             ; add index to byte
   pop ebx
   ret
}
}
// The asm code is from Anger Fog "optimization_manuals"
_declspec(naked)
int anger_strlen_sse2( char *s )
{
__asm
{
   mov    eax,       ; get pointer to string
   mov    ecx,eax          ; copy pointer
   pxor xmm0, xmm0          ; set to zero
   and    ecx,15          ; lower 4 bits indicate misalignment
   and    eax,-16          ; align pointer by 16
   movdqa xmm1,       ; read from nearest preceding boundary
   pcmpeqbxmm1, xmm0          ; compare 16 bytes with zero
   pmovmskb edx,xmm1          ; get one bit for each byte result
   shr    edx,cl          ; shift out false bits
   shl    edx,cl          ; shift back again
   bsf    edx,edx          ; find first 1-bit
   jnz    L2                ; found

   ; Main loop, search 16 bytes at a time
L1: add    eax,16          ; increment pointer by 16
   movdqa xmm1,       ; read 16 bytes aligned
   pcmpeqbxmm1, xmm0          ; compare 16 bytes with zero
   pmovmskb edx,xmm1          ; get one bit for each byte result
   bsf    edx,edx          ; find first 1-bit
   jz    L1                ; loop if not found

L2: ; Zero-byte found. Compute string length
   sub    eax,       ; subtract start address
   add    eax,edx          ; add byte index
   ret
}
}

void output_result( int rule_count)
{
int i,j;
for(i=0;i<rule_count;i++)
{
printf("(");
for(j=0;j<=i;j++)
printf("%d,",g_result);
printf(")\n");
}
printf("\n");
g_count++;
}

int check(int rule_idx, int deep, int *pNewSegCount, int newSegs[] )
{
int i,c,sum;
for (c=0,sum=0,i=deep-1;i>=0;i--)
{
sum += g_result;
if ( g_used )
return 0;
g_used=32;
newSegs=sum;
*pNewSegCount=c;
}
return 1;
}

//deep: 表示将要确定尺子第deep+1段的长度
void new_mark(int rule_count,int seg_count,int rule_idx, int deep)
{
int i,j,mid,flag,sum,new_used_count;
int new_used;

mid=rule_idx/2; //第rule_idx+1把尺子中间段的编号
for (sum=0,i=0;i<deep;i++)
sum += g_result;

for (i=1;i<=seg_count;i++)
{
//i+= strlen(g_used+i);
//i+= anger_strlen(g_used+i);
i+= anger_strlen_sse2(g_used+i);
if ( i + sum > seg_count)
break;

if ( deep==mid+1 && i< g_result )
continue;//消除重复的结果

g_result=i;
flag=check(rule_idx,deep+1,&new_used_count,new_used);
if (flag )
{
if ( rule_idx==0)
output_result(rule_count);
if ( deep==rule_idx)
new_mark(rule_count,seg_count,rule_idx-1,0);
else
new_mark(rule_count,seg_count,rule_idx,deep+1);
}
for ( j=0;j<new_used_count;j++)
g_used]=0;
}
}

void search(int n)
{
DWORD t;
printf("\nIs searching the soultion when rule count is %d\n",n);
t=::GetTickCount();
g_count=0;memset(g_result,0,sizeof(g_result));
new_mark(n,g_maxSegArr,n-1,0);
t=::GetTickCount()-t;
printf("It take %d ms and find %d solution\n",t,g_count);
}

//int main(int argc, char* argv[])
//{
// int i;
// for (i=1;i<=6;i++ ) search(i);
// return 0;
//}

liangbch 发表于 2014-12-18 08:30:42

效果？
效果不是很明显，调用3种strlen函数的版本的速度区别不大，在忽略误差的情况下，与原始版本差不多，可能的原因有
1. 这段代码不是热点。
2. 值为0的元素相对比较多，函数strlen查找的长度较小；
3. g_used[ i ]的地址在绝大部分情况下不是4字节对齐。汇编版本难以施展其威力。
我们不应怀疑汇编函数的意义。在字符串长度相对较长的情况下，在字符串地址是4字节对齐的情况下。汇编版本的strlen应该具有很大的优势。

zeroieme 发表于 2014-12-18 18:14:05

渣机器渣代码跑了半小时……
于是，我想能否转成跳舞链的01矩阵

毒酒滴冻鸭 发表于 2014-12-19 11:02:32

上次忘了，还有一张把全部56个长度的量法都列出的图。。。

毒酒滴冻鸭 发表于 2014-12-28 23:35:59

非常感谢gxqcn大大的赐精！:loveliness:

有空再多发两道困扰我多时的题目。。。:)

王守恩 发表于 2019-12-18 11:21:12

我们好像可以约定每把尺子的长度。
1 把尺子=1
2 把尺子=4, 3
3 把尺子=10, 9, 7
4 把尺子=20, 19, 17, 14
5 把尺子=35, 34, 32, 29,25
6 把尺子=56, 55, 53, 50, 46,41
7 把尺子=84, 83, 81, 78, 74, 69, 63

页: 1 2 3 [4]

数学研发论坛's Archiver