无心人
发表于 2008-7-14 10:13:06
那我修改下,用icc给编译下
如果时间比你少
甚至时间是你的120%
都算icc编译器厉害,呵呵
mathe
发表于 2008-7-14 10:16:44
呵呵,最好采用profiling再编译一把,这样编译器产生的代码可能会更好:lol
无心人
发表于 2008-7-14 10:42:29
0.000112 X = 1 S = 1
0.000235 X = 4 S = 4
0.000248 X = 9 S = 9
0.000260 X = 7 S = 16
0.000271 X = 13 S = 49
0.000282 X = 10 S = 64
****** 0.000292: Searched all perfect squares less then 10^2 ******
0.000307 X = 16 S = 169
0.000319 X = 19 S = 289
0.000330 X = 18 S = 576
0.000341 X = 22 S = 1849
0.000352 X = 27 S = 3969
0.000363 X = 25 S = 4489
0.000374 X = 31 S = 6889
****** 0.000385: Searched all perfect squares less then 10^4 ******
0.000400 X = 28 S = 17956
0.000412 X = 34 S = 27889
0.000425 X = 36 S = 69696
0.000438 X = 40 S = 97969
0.000616 X = 37 S = 98596
0.000638 X = 43 S = 499849
0.000652 X = 46 S = 698896
****** 0.000666: Searched all perfect squares less then 10^6 ******
0.000689 X = 45 S = 1887876
0.000707 X = 49 S = 2778889
0.000729 X = 52 S = 4999696
0.000758 X = 54 S = 9696996
0.000797 X = 55 S = 19998784
0.000855 X = 58 S = 46689889
0.000894 X = 61 S = 66699889
0.000921 X = 63 S = 79869969
****** 0.000954: Searched all perfect squares less then 10^8 ******
0.001139 X = 64 S = 277788889
0.001282 X = 67 S = 478996996
0.001489 X = 70 S = 876988996
0.001809 X = 73 S = 1749999889
0.002297 X = 72 S = 3679999569
0.002669 X = 76 S = 5599977889
0.003054 X = 79 S = 7998976969
0.003204 X = 81 S = 8998988769
****** 0.003346: Searched all perfect squares less then 10^10 ******
0.004232 X = 82 S = 17999978896
0.005733 X = 85 S = 36799899889
0.013798 X = 88 S = 88998998929
0.019962 X = 90 S = 297889998849
0.020022 X = 91 S = 299879997769
0.070643 X = 94 S = 897977978689
0.073355 X = 97 S = 975979998889
****** 0.073682: Searched all perfect squares less then 10^12 ******
0.206107 X = 100 S = 2699997789889
0.295351 X = 99 S = 3957779999889
0.438211 X = 103 S = 9879498789889
0.439921 X = 106 S = 9998768898889
0.780301 X = 108 S = 29998985899689
1.278534 X = 109 S = 85986989688889
1.387117 X = 112 S = 97888999968769
****** 1.396139: Searched all perfect squares less then 10^14 ******
2.541476 X = 115 S = 386999898769969
2.713739 X = 117 S = 429998989997889
3.123124 X = 118 S = 578889999977689
3.854804 X = 121 S = 898999897988929
5.541493 X = 124 S = 1959999889996996
7.524612 X = 127 S = 3699998989898689
10.197799 X = 126 S = 6788999798879769
12.371455 X = 130 S = 9895699989899689
****** 12.431427: Searched all perfect squares less then 10^16 ******
24.814522 X = 133 S = 38896878989988889
24.862220 X = 136 S = 38999699989995889
32.795631 X = 135 S = 67699789959899889
53.410336 X = 139 S = 188997899869998769
64.464989 X = 142 S = 279869897899999969
85.710894 X = 144 S = 498999778899898896
119.855649 X = 148 S = 989879999979599689
****** 120.465921: Searched all perfect squares less then 10^18 ******
164.041346 X = 145 S = 1877896979979898969
289.218007 X = 153 S = 5899989587897999889
314.673115 X = 151 S = 6979497898999879969
354.978235 X = 154 S = 8899988895999696889
无心人
发表于 2008-7-14 10:43:56
修改后代码
基本维持GxQ原意
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#define TEN_POW2 100UL
#define TEN_POW4 10000UL
#define UINT32 unsigned long
#define LIMBS 5
UINT32 m, s;
UINT32 pool[ TEN_POW4 + LIMBS * 39 + 1 ];
#define table(x) pool
#define value(x) pool[(x) + TEN_POW4]
#define delta(x) pool[(x) + TEN_POW4 + LIMBS]
#define index(x) pool[(x) + TEN_POW4 + LIMBS * 2]
#define exist(x) pool[(x) + TEN_POW4 + LIMBS * 3 + 1]
void init( void )
{
UINT32 i1, i2, i3, i4;
UINT32 * p = &table(0) - 1;
memset( pool, 0, sizeof( pool ));
for ( i1 = 0; i1 < 10; ++i1 )
{
for ( i2 = 0; i2 < 10; ++i2 )
{
for ( i3 = 0; i3 < 10; ++i3 )
{
for ( i4 = 0; i4 < 10; ++i4 )
{
*(++p) = i1 + i2 + i3 + i4;
}
}
}
}
}
void sumOfDigits( void )
{
s = 0;
for ( UINT32 i = 0; i <= m; ++i )
{
value(i) += delta(i);
if ( value(i) >= TEN_POW4 )
{/* 当 i==m 时不会进入,因此时平方根<100^(m+1) */
value(i) -= TEN_POW4;
++value(i+1);
}
s += table( value(i) );
}
}
#define add_value(x) value(x) += delta(x)
#define mod_value(x) if ( value(x) >= TEN_POW4 ){ value(x) -= TEN_POW4; ++value(x+1); }
#define lookup(x) table( value(x) )
void sumOfDigits_1( void )
{
add_value(0);
s = lookup(0);
}
void sumOfDigits_2( void )
{
add_value(0);
add_value(1);
mod_value(0);
s = lookup( 0 ) + lookup( 1 );
}
void sumOfDigits_3( void )
{
add_value(0);
add_value(1);
add_value(2);
mod_value(0);
mod_value(1);
s = lookup( 0 ) + lookup( 1 ) + lookup( 2 );
}
void sumOfDigits_4( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
mod_value(0);
mod_value(1);
mod_value(2);
s = lookup( 0 ) + lookup( 1 ) + lookup( 2 ) + lookup( 3 );
}
void sumOfDigits_5( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
s = lookup( 0 ) + lookup( 1 ) + lookup( 2 ) + lookup( 3 ) + lookup(
4 );
}
void sumOfDigits_6( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
add_value(5);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
mod_value(4);
s = lookup( 0 ) + lookup( 1 ) + lookup( 2 ) + lookup( 3 ) + lookup(
4 ) \
+ lookup( 5 );
}
void sumOfDigits_7( void )
{
add_value(0);
add_value(1);
add_value(2);
add_value(3);
add_value(4);
add_value(5);
add_value(6);
mod_value(0);
mod_value(1);
mod_value(2);
mod_value(3);
mod_value(4);
mod_value(5);
s = lookup( 0 ) + lookup( 1 ) + lookup( 2 ) + lookup( 3 ) \
+ lookup( 4 ) + lookup( 5 ) + lookup( 6 );
}
typedef void ( *lpfnSumOfDigits )( void );
lpfnSumOfDigits lpfn[ ] = { sumOfDigits, sumOfDigits_1, sumOfDigits_2,
sumOfDigits_3, \
sumOfDigits_4, sumOfDigits_5, sumOfDigits_6, sumOfDigits_7 };
int main( void )
{
UINT32 i;
struct timeval start, end;
double timeuse;
lpfnSumOfDigits SumOfDigits = lpfn[ 1 ];
gettimeofday(&start, NULL);
init();
index(0) = 1;
delta(0) = -1;
m = 0;
for ( ; ; )
{
delta(0) += 2;
if ( delta(0) > TEN_POW4 )
{
delta(0) = 1;
i = 0;
while ( TEN_POW4 == ++delta(++i) )
{
delta(i) = 0;
}
}
SumOfDigits();
if ( 0 == exist(s) )
{
exist(s) = 1;
gettimeofday(&end, NULL);
timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
timeuse /= 1000000;
printf( "%6.6f", timeuse );
printf( "\tX = %u\t\tS = %u", s, value( i = m ) );
while ( 0 != i )
{
printf( "%04u", value(--i) );
}
printf( "\n" );
}
i = -1;
while ( TEN_POW2 == ++index(++i) )
{
index(i) = 0;
}
if ( m < i )
{
m = i;
gettimeofday(&end, NULL);
timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
timeuse /= 1000000;
printf( "\n****** %6.6f: Searched all perfect squares less then 10^%u ******\n\n", timeuse, 4*m );
if ( LIMBS == i )
{
break;
}
#if ( LIMBS > 7)
if ( m + 1 >= sizeof( lpfn ) / sizeof( lpfn ))
{
SumOfDigits = lpfn[ 0 ];
}
else
#endif
{
SumOfDigits = lpfn[ m + 1 ];
}
}
else if ( i == m && 10 == index(m) )
{
gettimeofday(&end, NULL);
timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
timeuse /= 1000000;
printf( "\n****** %6.6f: Searched all perfect squares less then 10^%u ******\n\n", timeuse, 4*m + 2 );
}
}
gettimeofday(&end, NULL);
timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
timeuse /= 1000000;
printf( "\nComputation took %6.6f\n\n", timeuse );
system( "pause" );
return 0;
}
////////////////////
可以看出,所用时间远大于我原来贴的时间
下面我贴我修改的他原始程序的代码
无心人
发表于 2008-7-14 10:45:09
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#define TEN_POW2 100UL
#define TEN_POW4 10000UL
int main( void )
{
struct timeval start, end;
double timeuse;
unsigned long table[ TEN_POW4 ];
unsigned long mark[ 9*32 + 1 ];
unsigned long value[ 8 ], delta[ 8 ];
unsigned long * p = table;
unsigned long i, j, k, m, s;
long t;
unsigned long i2, i3, i4, s2, s3;
for ( i4 = 0; i4 < 10; ++i4 )
{
for ( i3 = 0; i3 < 10; ++i3 )
{
s3 = i4 + i3;
for ( i2 = 0; i2 < 10; ++i2 )
{
s2 = s3 + i2;
for ( i = 0; i < 10; ++i )
{
*p++ = s2 + i;
}
}
}
}
for (i=0; i < 8; i ++)
{
value = 0;
delta = 0;
}
for (i = 0; i < 9*32 + 1; i ++)
mark = 0;
value[ 0 ] = 0;
delta[ 0 ] = 1;
m = 1;
k = 0;
gettimeofday(&start, NULL);
for ( ; ; )
{
s = 0;
for ( i = 0; i < m; ++i )
{
value[ i ] += delta[ i ];
if ( value[ i ] >= TEN_POW4 )
{
value[ i ] -= TEN_POW4;
++value[ i + 1 ];
}
s += table[ value[ i ] ];
}
if (value)
{
m++;
s++;
}
if ( 0 == mark[ s ])
{
mark = 1;
gettimeofday(&end, NULL);
timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
timeuse /= 1000000;
printf("%8.6f:%", timeuse);
printf("%4d", s);
for (t=m-1;t>=0;t--)
printf("%4d", value);
printf("\n");
}
if (m >= 9)
break;
delta[ 0 ] += 2;
i = 0;
while ( delta[ i ] >= TEN_POW4 )
{
delta[ i ] -= TEN_POW4;
++delta[ ++i ];
}
k ++;
if (k == 0)
{
printf("\n");
printf("Current: ");
for (t=m-1;t>=0; t--)
printf("%04d", value);
printf("\n\n");
}
}
s = 0;
i = 0;
printf( "\n" );
return 0;
}
///////////////////////////
我估计原因在于他分离了加和下面的调整进位造成的
无心人
发表于 2008-7-14 10:49:56
TO: mathe
如何进行二次优化??
用intel C++?
我对105#原始程序进行下优化
mathe
发表于 2008-7-14 11:05:30
查看了一下,第一次编译用
-prof-gen
选项
第二次编译在同一个目录下用
-prof-use
gxqcn
发表于 2008-7-14 11:11:43
我曾对比过,用 ICC 编译该代码,效率不升反降。:L 不知问题出在哪里?:Q:
各位不妨测试一下,尤其是 无心人 对比测试一下看看。。。
无心人
发表于 2008-7-14 11:16:30
:lol
可能是你优化的不如icc彻底
反而让icc给优化倒了
你不用优化选项
光产生sse2代码试下
无心人
发表于 2008-7-14 11:37:26
0.000004: 1 1
0.000119: 4 4
0.000130: 9 9
0.000138: 7 16
0.000146: 13 49
0.000154: 10 64
0.000162: 16 169
0.000170: 19 289
0.000179: 18 576
0.000187: 221849
0.000195: 273969
0.000203: 254489
0.000212: 316889
0.000221: 28 17956
0.000230: 34 27889
0.000240: 36 69696
0.000249: 40 97969
0.000258: 37 98596
0.000273: 43 499849
0.000284: 46 698896
0.000301: 45 1887876
0.000314: 49 2778889
0.000330: 52 4999696
0.000350: 54 9696996
0.000379: 5519998784
0.000419: 5846689889
0.000446: 6166699889
0.000466: 6379869969
0.000600: 64 277788889
0.000697: 67 478996996
0.000837: 70 876988996
0.001050: 73 1749999889
0.001375: 72 3679999569
0.001624: 76 5599977889
0.001891: 79 7998976969
0.001994: 81 8998988769
0.002726: 82 17999978896
0.003747: 85 36799899889
0.005625: 88 88998998929
0.011515: 90297889998849
0.011592: 91299879997769
0.052307: 94897977978689
0.054137: 97975979998889
0.096983: 100 2699997789889
0.105409: 99 3957779999889
0.163203: 103 9879498789889
0.163819: 106 9998768898889
0.255277: 108 29998985899689
0.399672: 109 85986989688889
0.422920: 112 97888999968769
0.765848: 115 386999898769969
0.832703: 117 429998989997889
1.053660: 118 578889999977689
1.342495: 121 898999897988929
1.908987: 1241959999889996996
2.504875: 1273699998989898689
3.311136: 1266788999798879769
3.927256: 1309895699989899689
二次优化后的结果