哈哈
你没保存状态阿
多运行的8小时你也不知道
运行到哪里了阿
我是每隔2^32就输出一次当前平方的
我没有服务器,不打算继续算下去,所以就没打算输出中间的状态。
其实这个算法计算量规模是容易估算的,当时我就知道 LIMBS 可大于 6,但不需超过 7。 28位是可达到的
32位是遥远的 0.0000001 0001
0.0001294 0004
0.0001459 0009
0.0001547 0016
0.00016313 0049
0.00017110 0064
0.00018016 0169
0.00018919 0289
0.00019818 0576
0.00020722 1849
0.00021727 3969
0.00022625 4489
0.00023531 6889
0.00024628 00017956
0.00025634 00027889
0.00026736 00069696
0.00027840 00097969
0.00028737 00098596
0.00030343 00499849
0.00031546 00698896
0.00033445 01887876
0.00034949 02778889
0.00036852 04999696
0.00039054 09696996
0.00042355 19998784
0.00046858 46689889
0.00049861 66699889
0.00052163 79869969
0.00067264 000277788889
0.00087067 000478996996
0.00116770 000876988996
0.00161873 001749999889
0.00230372 003679999569
0.00282276 005599977889
0.00369779 007998976969
0.00383281 008998988769
0.00499882 017999978896
0.00625985 036799899889
0.00898388 088998998929
0.02429690 297889998849
0.02449191 299879997769
0.07182094 897977978689
0.07287597 975979998889
0.108861100 0002699997789889
0.11846199 0003957779999889
0.240214103 0009879498789889
0.240772106 0009998768898889
0.409552108 0029998985899689
0.690780109 0085986989688889
0.785017112 0097888999968769
1.331215115 0386999898769969
1.434825117 0429998989997889
1.657168118 0578889999977689
1.983984121 0898999897988929
2.869437124 1959999889996996
3.850002127 3699998989898689
5.131660126 6788999798879769
6.460858130 9895699989899689
12.925727133 00038896878989988889
12.961732136 00038999699989995889
16.786534135 00067699789959899889
27.896467139 00188997899869998769
33.394664142 00279869897899999969
43.629142144 00498999778899898896
61.863666148 00989879999979599689
85.523343145 01877896979979898969
156.117377153 05899989587897999889
167.352661151 06979497898999879969
呵呵,修改的GxQ的代码,时间在数量级上和他一样
但我是P4 2.0 CPU / 1280M /Ubuntu 8.04
机器慢些 原帖由 无心人 于 2008-7-14 08:19 发表 http://bbs.emath.ac.cn/images/common/back.gif
... 另外,不知道linux如何计时和输出?呵呵
GxQ老想知道我的运算时间是多少,嘿嘿
Linux里面C代码可以调用clock()命令比较精确的计算当前程序运行的时间.
而输出很简单呀,你可以重定向输出结果到一个文件里面就可以了(在代码中每次输出后面最好添加一个fflush(stdout))
而如果只需要统计整个程序的运行时间,在命令行对使用命令前面添加time命令就可以了,如
\$time ./your_prog.out input_parameter_list 呵呵,找到一个gettimeofday调用,是微秒级的精度
挺好用的
另外,刚下载了intel c++ 10.1.017
优化后,速度提升特明显
我的终极优化的代码
实现了我先前的一些想法,在计算 X = 162 时,已比 72# 的快了 50%!我的终极优化版本#include < stdlib.h >#include < stdio.h >
#include "../../../HugeCalc_API/CppAPI/Include/HugeCalc.h" // 公共接口
#pragma message( "automatic link to ../../../HugeCalc_API/CppAPI/Lib/HugeCalc.lib" )
#pragma comment( lib, "../../../HugeCalc_API/CppAPI/Lib/HugeCalc.lib" )
#define TEN_POW2 100UL
#define TEN_POW4 10000UL
#define LIMBS 5
UINT32 m, s;
#ifdef __INTEL_COMPILER
# undefLIMBS
# define LIMBS 8
__declspec(align(16))
#endif
UINT32 pool[ TEN_POW4 + LIMBS * 39 ];
#define table(x) pool
#define value(x) pool[(x) + TEN_POW4]
#define delta(x) pool[(x) + TEN_POW4 + LIMBS]
#define index(x) pool[(x) + TEN_POW4 + LIMBS * 2]
#define exist(x) pool[(x) + TEN_POW4 + LIMBS * 3]
void init( void )
{
UINT32 i1, i2, i3, i4;
UINT32 * p = &table(0) - 1;
memset( pool, 0, sizeof( pool ));
for ( i1 = 0; i1 < 10; ++i1 )
{
for ( i2 = 0; i2 < 10; ++i2 )
{
for ( i3 = 0; i3 < 10; ++i3 )
{
for ( i4 = 0; i4 < 10; ++i4 )
{
*(++p) = i1 + i2 + i3 + i4;
}
}
}
}
}
#define add_value(x) value(x) += delta(x)
#define mod_value(x) if ( value(x) >= TEN_POW4 ){ value(x) -= TEN_POW4; ++value(x+1); }
#define lookup(x) table( value(x) )
void sumOfDigits( void )
{
s = 0;
for ( UINT32 i = 0; i <= m; ++i )
{
add_value(i);
mod_value(i); /* 当 i==m 时不会进入 if,因此时平方根<100^(m+1) */
s += lookup(i);
}
}
void sumOfDigits_1( void )
{
add_value(0);
s = lookup(0);
}
void sumOfDigits_2( void )
{
add_value(0);
add_value(1);
mod_value(0);
s = lookup(0) + lookup(1);
}
void sumOfDigits_3( void )
{
add_value(0);
add_value(1);
add_value(2);
mod_value(0);
mod_value(1);
s = lookup(0) + lookup(1) + lookup(2);
}
void sumOfDigits_4( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
mod_value(0);
mod_value(1);
mod_value(2);
s = lookup(0) + lookup(1) + lookup(2) + lookup(3);
}
void sumOfDigits_5( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
s = lookup(0) + lookup(1) + lookup(2) + lookup(3) \
+ lookup(4);
}
void sumOfDigits_6( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
add_value(5);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
mod_value(4);
s = lookup(0) + lookup(1) + lookup(2) + lookup(3) \
+ lookup(4) + lookup(5);
}
void sumOfDigits_7( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
add_value(5);
add_value(6);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
mod_value(4);
mod_value(5);
s = lookup(0) + lookup(1) + lookup(2) + lookup(3) \
+ lookup(4) + lookup(5) + lookup(6);
}
void sumOfDigits_8( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
add_value(5);
add_value(6);
add_value(7);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
mod_value(4);
mod_value(5);
mod_value(6);
s = lookup(0) + lookup(1) + lookup(2) + lookup(3) \
+ lookup(4) + lookup(5) + lookup(6) + lookup(7);
}
typedef void ( *lpfnSumOfDigits )( void );
lpfnSumOfDigits lpfn[ ] = { sumOfDigits, sumOfDigits_1, sumOfDigits_2, sumOfDigits_3, \
sumOfDigits_4, sumOfDigits_5, sumOfDigits_6, sumOfDigits_7, sumOfDigits_8 };
int main( void )
{
UINT32 i;
lpfnSumOfDigits SumOfDigits = lpfn[ 1 ];
HugeCalc::EnableTimer();
HugeCalc::ResetTimer();
init();
index(0) = 1;
delta(0) = -1;
m = 0;
for ( ; ; )
{
delta(0) += 2;
if ( delta(0) > TEN_POW4 )
{
delta(0) = 1;
i = 0;
while ( TEN_POW4 == ++delta(++i) )
{
delta(i) = 0;
}
}
SumOfDigits();
if ( 0 == exist(s) )
{
exist(s) = 1;
printf( HugeCalc::GetTimerStr( FT_HHMMSS_ms ));
printf( "\tX = %u\t\tS = %u", s, value( i = m ) );
while ( 0 != i )
{
printf( "%04u", value(--i) );
}
printf( "\n" );
}
i = -1;
while ( TEN_POW2 == ++index(++i) )
{
index(i) = 0;
}
if ( m < i )
{
m = i;
printf( "\n****** %s: Searched all perfect squares less then 10^%u ******\n\n", HugeCalc::GetTimerStr( FT_DOT06SECs ), 4*m );
if ( LIMBS == i )
{
break;
}
#if ( LIMBS > 8)
if ( m + 1 >= sizeof( lpfn ) / sizeof( lpfn ))
{
SumOfDigits = lpfn[ 0 ];
}
else
#endif
{
SumOfDigits = lpfn[ m + 1 ];
}
}
else if ( i == m && 10 == index(m) )
{
printf( "\n****** %s: Searched all perfect squares less then 10^%u ******\n\n", HugeCalc::GetTimerStr( FT_DOT06SECs ), 4*m + 2 );
}
}
printf( "\nComputation took %s\n\n", HugeCalc::GetTimerStr( FT_DOT06SEC_s ) );
system( "pause" );
return 0;
} 0.000004: 1 1
0.000121: 4 4
0.000132: 9 9
0.000140: 7 16
0.000149: 13 49
0.000157: 10 64
0.000165: 16 169
0.000173: 19 289
0.000181: 18 576
0.000190: 221849
0.000198: 273969
0.000206: 254489
0.000215: 316889
0.000223: 28 17956
0.000232: 34 27889
0.000242: 36 69696
0.000251: 40 97969
0.000260: 37 98596
0.000274: 43 499849
0.000284: 46 698896
0.000300: 45 1887876
0.000313: 49 2778889
0.000330: 52 4999696
0.000350: 54 9696996
0.000379: 5519998784
0.000418: 5846689889
0.000445: 6166699889
0.000465: 6379869969
0.000600: 64 277788889
0.000696: 67 478996996
0.000835: 70 876988996
0.001048: 73 1749999889
0.001372: 72 3679999569
0.001621: 76 5599977889
0.002040: 79 7998976969
0.002164: 81 8998988769
0.002986: 82 17999978896
0.004208: 85 36799899889
0.006426: 88 88998998929
0.016116: 90297889998849
0.016196: 91299879997769
0.049611: 94897977978689
0.050550: 97975979998889
0.092371: 100 2699997789889
0.109283: 99 3957779999889
0.168103: 103 9879498789889
0.168708: 106 9998768898889
0.262466: 108 29998985899689
0.411295: 109 85986989688889
0.431866: 112 97888999968769
0.779954: 115 386999898769969
0.839113: 117 429998989997889
0.961269: 118 578889999977689
1.162582: 121 898999897988929
1.656253: 1241959999889996996
2.180752: 1273699998989898689
3.101664: 1266788999798879769
3.674470: 1309895699989899689
7.287076: 133 38896878989988889
7.294342: 136 38999699989995889
9.633264: 135 67699789959899889
16.111251: 139 188997899869998769
19.601210: 142 279869897899999969
26.191958: 144 498999778899898896
36.927668: 148 989879999979599689
50.888507: 145 1877896979979898969
90.289988: 153 5899989587897999889
98.299096: 151 6979497898999879969
和上面对比,发现速度提升明显,且编译提示发现可向量化的代码了
编译命令是
icc -fast -msse2
运行结果
00:00:00.000 X = 1 S = 100:00:00.000 X = 4 S = 4
00:00:00.000 X = 9 S = 9
00:00:00.000 X = 7 S = 16
00:00:00.000 X = 13 S = 49
00:00:00.000 X = 10 S = 64
****** 0.000901s: Searched all perfect squares less then 10^2 ******
00:00:00.000 X = 16 S = 169
00:00:00.001 X = 19 S = 289
00:00:00.001 X = 18 S = 576
00:00:00.001 X = 22 S = 1849
00:00:00.001 X = 27 S = 3969
00:00:00.001 X = 25 S = 4489
00:00:00.001 X = 31 S = 6889
****** 0.001846s: Searched all perfect squares less then 10^4 ******
00:00:00.001 X = 28 S = 17956
00:00:00.001 X = 34 S = 27889
00:00:00.001 X = 36 S = 69696
00:00:00.002 X = 40 S = 97969
00:00:00.002 X = 37 S = 98596
00:00:00.002 X = 43 S = 499849
00:00:00.002 X = 46 S = 698896
****** 0.002744s: Searched all perfect squares less then 10^6 ******
00:00:00.002 X = 45 S = 1887876
00:00:00.003 X = 49 S = 2778889
00:00:00.003 X = 52 S = 4999696
00:00:00.003 X = 54 S = 9696996
00:00:00.003 X = 55 S = 19998784
00:00:00.003 X = 58 S = 46689889
00:00:00.003 X = 61 S = 66699889
00:00:00.003 X = 63 S = 79869969
****** 0.003801s: Searched all perfect squares less then 10^8 ******
00:00:00.003 X = 64 S = 277788889
00:00:00.004 X = 67 S = 478996996
00:00:00.004 X = 70 S = 876988996
00:00:00.004 X = 73 S = 1749999889
00:00:00.005 X = 72 S = 3679999569
00:00:00.006 X = 76 S = 5599977889
00:00:00.006 X = 79 S = 7998976969
00:00:00.006 X = 81 S = 8998988769
****** 0.006982s: Searched all perfect squares less then 10^10 ******
00:00:00.007 X = 82 S = 17999978896
00:00:00.008 X = 85 S = 36799899889
00:00:00.010 X = 88 S = 88998998929
00:00:00.014 X = 90 S = 297889998849
00:00:00.014 X = 91 S = 299879997769
00:00:00.020 X = 94 S = 897977978689
00:00:00.021 X = 97 S = 975979998889
****** 0.022476s: Searched all perfect squares less then 10^12 ******
00:00:00.033 X = 100 S = 2699997789889
00:00:00.039 X = 99 S = 3957779999889
00:00:00.059 X = 103 S = 9879498789889
00:00:00.059 X = 106 S = 9998768898889
00:00:00.100 X = 108 S = 29998985899689
00:00:00.169 X = 109 S = 85986989688889
00:00:00.182 X = 112 S = 97888999968769
****** 0.185879s: Searched all perfect squares less then 10^14 ******
00:00:00.367 X = 115 S = 386999898769969
00:00:00.386 X = 117 S = 429998989997889
00:00:00.450 X = 118 S = 578889999977689
00:00:00.556 X = 121 S = 898999897988929
00:00:00.822 X = 124 S = 1959999889996996
00:00:01.143 X = 127 S = 3699998989898689
00:00:01.546 X = 126 S = 6788999798879769
00:00:01.859 X = 130 S = 9895699989899689
****** 1.867877s: Searched all perfect squares less then 10^16 ******
00:00:03.942 X = 133 S = 38896878989988889
00:00:03.948 X = 136 S = 38999699989995889
00:00:05.279 X = 135 S = 67699789959899889
00:00:08.984 X = 139 S = 188997899869998769
00:00:10.970 X = 142 S = 279869897899999969
00:00:14.728 X = 144 S = 498999778899898896
00:00:20.848 X = 148 S = 989879999979599689
****** 20.955593s: Searched all perfect squares less then 10^18 ******
00:00:28.825 X = 145 S = 1877896979979898969
00:00:51.330 X = 153 S = 5899989587897999889
00:00:55.840 X = 151 S = 6979497898999879969
00:01:03.080 X = 154 S = 8899988895999696889
00:01:54.156 X = 157 S = 28979978999958969889
00:03:10.483 X = 160 S = 78897999969769888996
00:03:21.335 X = 162 S = 87989899898866889889
****** 214.569824s: Searched all perfect squares less then 10^20 ******
Computation took 214.570197 s
请按任意键继续. . . 你新写的代码等于是给手工展开了吧??? 也可以这么说,
我不太相信编译器的优化能力。:lol
注:我用的是 VC6,没有用 ICC 编译器。