优化版本
//http://bbs.emath.ac.cn/viewthread.php?tid=1689&page=2&fromuid=8#pid21088#include <iostream>
#include <ctime>
#include <vector>
#include <map>
using namespace std;
typedef unsigned int UINT32, *PUINT32;
typedef unsigned __int64 UINT64, *PUINT64;
typedef struct node
{
UINT32key;
UINT32value;
}NODE;
typedef vector< NODE > NODE_VECTOR;
typedef map< UINT32, UINT32 > KEY_INDEX_MAP;
typedef KEY_INDEX_MAP::value_type MVT;
#ifndef UInt32x32To64
# define UInt32x32To64(a, b) ((UINT64)(((UINT64)(a)) * ((UINT64)(b))))
#endif
char szResult;
//http://bbs.emath.ac.cn/viewthread.php?tid=1207&page=6&fromuid=8#pid16108
UINT32 Sqrt( UINT32 n )
{
UINT32 t;
UINT32 ret = 0;
#define SQRT_CORE(x) \
t = ret + (1UL<<(x)); \
ret >>= 1; \
if ( n >= t ) \
{ \
n -= t; \
ret += (1UL<<(x));\
}
SQRT_CORE(30);
SQRT_CORE(28);
SQRT_CORE(26);
SQRT_CORE(24);
SQRT_CORE(22);
SQRT_CORE(20);
SQRT_CORE(18);
SQRT_CORE(16);
SQRT_CORE(14);
SQRT_CORE(12);
SQRT_CORE(10);
SQRT_CORE(8);
SQRT_CORE(6);
SQRT_CORE(4);
SQRT_CORE(2);
SQRT_CORE(0);
#undef SQRT_CORE
return ret;
}
const char * GetMin01( UINT32 n )
{
char * p = szResult;
UINT32 i=0, j=0, d, ten, bits, index, size0, size1;
UINT32 key, SqrtN;
UINT32 value;
NODE info;
NODE_VECTOR vNode;
KEY_INDEX_MAP mKeyIndex;
KEY_INDEX_MAP::iterator it;
*( p += 127 ) = '\0';
//if ( 0 == n ) return p;
while ( 0 == ( 1 & n ))
{
++i; //times of prime 2
n >>= 1;
}
while ( 0 == n % 5 )
{
++j; //times of prime 5
n /= 5;
}
if ( i < j ) i = j; //max times of prime 2 and prime 5
memset( p -= i, '0', i * sizeof(char));
if ( 1 == n )
{
*(--p) = '1';
return p;
}
//special case that n+1 is 10^j
i = n + 1;
j = 0;
while ( 0 == i % 10 )
{
++j;
i /= 10;
}
if ( 1 == i )
{
j *= 9;
memset( p -= j, '1', j * sizeof(char));
return p;
}
switch( n )
{
case 3:
case 37: n = 111;
break;
case 7:
case 13: n = 1001;
break;
case 337: n = 1011;
break;
case 367: n = 1101;
break;
default:
break;
}
i = n;
j = 0;
while (( d = i % 10 ) <= 1 )
{
*(--p) = '0' + d;
if ( 0 == ( i /= 10 ))
{
return p;
}
++j;
}
p += j;
const UINT32 INIT4BITS[] = { 0, 1, 10, 11, 100, 101, 110, 111, 1000, 1001, 1010, 1011, 1100, 1101, 1110, 1111, };
for ( i=0; 16!=i; ++i )
{
info.key = INIT4BITS % n;
info.value = i;
vNode.push_back( info );
mKeyIndex.insert( MVT( info.key, i ));
}
index = i - 1;
SqrtN = Sqrt( n );
for ( bits=4, ten=10000%n, size1=size0=vNode.size(); ; )
{
for ( i=1; size0!=i; ++i )
{
d = (UINT32)( UInt32x32To64( vNode.key, ten ) % n );
if ( mKeyIndex.end() != ( it = mKeyIndex.find( n - d )))
{
value = vNode[(*it).second].value;
do
{
*(--p) = '0' + ( 1 & value );
value >>= 1;
} while( 0 != --bits );
value = vNode.value;
do
{
*(--p) = '0' + ( 1 & value );
} while( 0 != ( value >>= 1 ));
return p;
}
}
for ( i=1; size1!=i; ++i )
{
d = (UINT32)( UInt32x32To64( vNode.key, ten ) % n );
value = vNode.value << bits;
for ( j=0; size0!=j; ++j )
{
key = vNode.key + d;
if ( key < d || key >= n )
{
key -= n;
}
if ( mKeyIndex.end() == mKeyIndex.find( key ))
{
mKeyIndex.insert( MVT( info.key = key, ++index ));
info.value = vNode.value | value;
vNode.push_back( info );
}
}
}
if ( size1 == size0 )
{
bits <<= 1;
ten = (UINT32)( UInt32x32To64( ten, ten ) % n );
}
else
{
++bits;
ten = (UINT32)( UInt32x32To64( ten, 10 ) % n );
}
size0 = vNode.size();
if ( size0 <= SqrtN /*&& bits < 16*/ )
{
size1 = size0;
}
else
{
size1 = 2;
}
}
return p;
}
int main( void )
{
UINT32 n;
const char * p;
while( 1 )
{
cout << "N=";
cin >> n;
if( 0 == n )
{
break;
}
clock_t start=clock();
p = GetMin01( n );
clock_t end=clock();
cout << p << "\n";
cout<< "Total cost " << end-start << "ms" << "\n" << endl;
}
return 0;
}Change list:
1、新增整数开方函数;
2、将 value 由 UINT64 修改成 UINT32 呃
那个函数代码行数有点大哦 运行大量数据的统计结果,11#的程序速度并没有优势.
我这里运行时比如数据2230767,2384613,2389761都比较慢 很奇怪,在我这里结论是相反的:
2#的测试结果如下:N=2230767
1000001010001010101110111110111110111111101
Total cost 1418ms
N=2384613
1000001010001010111110111111101110101110111
Total cost 1637ms
N=2389761
1000011101111111110111111011111111111111111
Total cost 1606ms
N=4294967291
1011101001001001001010000111001
Total cost 312ms
N=4294967279
1111011110011001110000101001001001
Total cost 407ms
N=4294967231
11111001101001000000010011010111
Total cost 437ms
N=899999999
1110010111111110000001111011010101101010101
Total cost 2296ms
N=我的则为:N=2230767
1000001010001010101110111110111110111111101
Total cost 702ms
N=2384613
1000001010001010111110111111101110101110111
Total cost 779ms
N=2389761
1000011101111111110111111011111111111111111
Total cost 779ms
N=4294967291
1011101001001001001010000111001
Total cost 62ms
N=4294967279
1111011110011001110000101001001001
Total cost 234ms
N=4294967231
11111001101001000000010011010111
Total cost 93ms
N=899999999
1110010111111110000001111011010101101010101
Total cost 750ms
N=我现在最新的代码如下(删除了开方函数,消减了初始化数量,新增了快速除5、除10的子函数)://http://bbs.emath.ac.cn/viewthread.php?tid=1689&page=2&fromuid=8#pid21101
#include <iostream>
#include <ctime>
#include <vector>
#include <map>
using namespace std;
typedef unsigned int UINT32, *PUINT32;
typedef unsigned __int64 UINT64, *PUINT64;
typedef struct node
{
UINT32key;
UINT32value;
}NODE;
typedef vector< NODE > NODE_VECTOR;
typedef map< UINT32, UINT32 > KEY_INDEX_MAP;
typedef KEY_INDEX_MAP::value_type MVT;
#ifndef UInt32x32To64
# define UInt32x32To64(a, b) ((UINT64)(((UINT64)(a)) * ((UINT64)(b))))
#endif
char szResult;
__declspec(naked)
UINT32 DivMod5( UINT32& n )
{
__asm
{
mov ecx, dword ptr;
mov edx, dword ptr;
mov eax, 0xCCCCCCCD;
push edx;
mul edx;
shr edx, 2;
mov dword ptr, edx;
lea edx, ;
pop eax;
sub eax, edx;
ret;
}
}
__declspec(naked)
UINT32 DivMod10( UINT32& n )
{
__asm
{
mov ecx, dword ptr;
mov edx, dword ptr;
mov eax, 0xCCCCCCCD;
push edx;
mul edx;
shr edx, 3;
mov dword ptr, edx;
imul edx, 10;
pop eax;
sub eax, edx;
ret;
}
}
const char * GetMin01( UINT32 n )
{
char * p = szResult;
UINT32 i=0, j=0, d, ten, bits, index, size0, size1;
UINT32 key;
UINT32 value;
NODE info;
NODE_VECTOR vNode;
KEY_INDEX_MAP mKeyIndex;
KEY_INDEX_MAP::iterator it;
*( p += 127 ) = '\0';
//if ( 0 == n ) return p;
while ( 0 == ( 1 & n ))
{
++i; //times of prime 2
n >>= 1;
}
while ( 0 == ( d = DivMod5( n )))
{
++j; //times of prime 5
}
n = n * 5 + d;
if ( i < j ) i = j; //max times of prime 2 and prime 5
memset( p -= i, '0', i * sizeof(char));
if ( 1 == n )
{
*(--p) = '1';
return p;
}
//special case that n+1 is 10^j
i = n + 1;
j = 0;
while ( 0 == ( d = DivMod10( i )))
{
++j;
}
if ( 0 == i && 1 == d )
{
j *= 9;
memset( p -= j, '1', j * sizeof(char));
return p;
}
i = n;
j = 0;
while (( d = DivMod10( i )) <= 1 )
{
*(--p) = '0' + d;
if ( 0 == i )
{
return p;
}
++j;
}
p += j;
info.value = info.key = 0;
vNode.push_back( info );
mKeyIndex.insert( MVT( info.key, index = 0 ));
info.value = info.key = 1;
vNode.push_back( info );
mKeyIndex.insert( MVT( info.key, ++index ));
for ( bits=1, ten=10%n, size1=size0=vNode.size(); ; )
{
for ( i=1; size0!=i; ++i )
{
d = (UINT32)( UInt32x32To64( vNode.key, ten ) % n );
if ( mKeyIndex.end() != ( it = mKeyIndex.find( n - d )))
{
value = vNode[(*it).second].value;
do
{
*(--p) = '0' + ( 1 & value );
value >>= 1;
} while( 0 != --bits );
value = vNode.value;
do
{
*(--p) = '0' + ( 1 & value );
} while( 0 != ( value >>= 1 ));
return p;
}
}
for ( i=1; size1!=i; ++i )
{
d = (UINT32)( UInt32x32To64( vNode.key, ten ) % n );
value = vNode.value << bits;
for ( j=0; size0!=j; ++j )
{
key = vNode.key + d;
if ( key < d || key >= n )
{
key -= n;
}
if ( mKeyIndex.end() == mKeyIndex.find( key ))
{
mKeyIndex.insert( MVT( info.key = key, ++index ));
info.value = vNode.value | value;
vNode.push_back( info );
}
}
}
if ( size1 == size0 )
{
bits <<= 1;
ten = (UINT32)( UInt32x32To64( ten, ten ) % n );
}
else
{
++bits;
ten = (UINT32)( UInt32x32To64( ten, 10 ) % n );
}
size0 = vNode.size();
if ( bits < 16 )
{
size1 = size0;
}
else
{
size1 = 2;
}
}
return p;
}
int main( void )
{
UINT32 n;
const char * p;
while( 1 )
{
cout << "N=";
cin >> n;
if( 0 == n )
{
break;
}
clock_t start=clock();
p = GetMin01( n );
clock_t end=clock();
cout << p << "\n";
cout<< "Total cost " << end-start << "ms" << "\n" << endl;
}
return 0;
}这是用VC2008编译的版本:
应该U不同的原因
毕竟小肚子的机器忒好了
而且是正宗的Intel机器
郭的程序受缓存,指令周期的影响太大 另外,哦只是另外
我没仔细考虑过
单纯用堆栈使用迭代的浮点运算
从头部计算是否能避免一些复杂数据结构的冗余带来的性能降低? 我在家里的CPU也是intel的,不如mathe的好那是肯定的了。
无心人可以下载 14# 编译好的程序运行对比一下。
另外,这个问题如何用浮点解决? mathe 的程序强大在输入不局限32位内,可以适当超出一些。
我的估计内存耗用会少些(自我感觉,不敢确定),因为我是用32位存储的。 呵呵,我没有说你的程序慢,只是觉得性能基本相当.
还有我没有测试单个数据的速度区别(只是用gxqcn的程序找出几个比较慢的数据,在我的机器上500ms左右)
不过我的测量也不是完全准确,因为我的程序里面很多对最后结果转换部分删除了(我只统计每个结果的长度,而不管结果本身,所以可以删除部分代码).但是我觉得这部分代码对总体运行时间影响不大,所以就直接比较了.
而机器的区别在于我的机器内存缓存会比较好,所以对内存的访问速度要好很多 我那个程序输入如果超过32位,还是需要修改一下,提供一个64比特整数乘64比特整数然后模一个64比特整数的函数,现在的程序结果会错的.但是可以大概用来评估一下计算超过32位时数据的速度.