- 注册时间
- 2008-2-6
- 最后登录
- 1970-1-1
- 威望
- 星
- 金币
- 枚
- 贡献
- 分
- 经验
- 点
- 鲜花
- 朵
- 魅力
- 点
- 上传
- 次
- 下载
- 次
- 积分
- 51573
- 在线时间
- 小时
|
楼主 |
发表于 2008-3-22 09:16:53
|
显示全部楼层
P4 Xeon2/1G/DDR 570ms/如何优化?
#include
#include
#include < wtypes.h >
#define LCM (16*3*5*7*11*13)
char BaseCache[LCM], WorkCache[LCM];
unsigned int Prime[6000000];
unsigned int CurrPos; //当前素数表位置
unsigned int LowSqrt, HighSqrt; //起始位置和结束位置的平方根
unsigned int __fastcall intSieveSqrt(unsigned int n)
{
__asm
{
mov ebx, n
cvtsi2sd xmm0, ebx
sqrtsd xmm0, xmm0
cvtsd2si eax, xmm0
mov ecx, eax
mul ecx
cmp eax, ebx
jbe exit1
sub ecx, 1
exit1:
mov eax, ecx
}
}
void initCache(void)
{
__asm
{
mov eax, 01000100h
movd xmm0, eax
pshufd xmm0, xmm0, 00000000b
mov ecx, LCM
sub ecx, 16
loop1:
movdqa BaseCache[ecx], xmm0
sub ecx, 16
jnc loop1
mov ecx, LCM
sub ecx, 3
loop2:
mov byte ptr BaseCache[ecx], 0
sub ecx, 6
jnc loop2
mov ecx, LCM
sub ecx, 5
loop3:
mov byte ptr BaseCache[ecx], 0
sub ecx, 10
jnc loop3
mov ecx, LCM
sub ecx, 7
loop4:
mov byte ptr BaseCache[ecx], 0
sub ecx, 14
jnc loop4
mov ecx, LCM
sub ecx, 11
loop5:
mov byte ptr BaseCache[ecx], 0
sub ecx, 22
jnc loop5
mov ecx, LCM
sub ecx, 13
loop6:
mov byte ptr BaseCache[ecx], 0
sub ecx, 26
jnc loop6
}
}
void CopyCache(void)
{
__asm
{
mov ecx, LCM
sub ecx, 80
loop1:
pshufd xmm0, BaseCache[ecx], 11100100b
pshufd xmm1, BaseCache[ecx+16], 11100100b
pshufd xmm2, BaseCache[ecx+32], 11100100b
pshufd xmm3, BaseCache[ecx+48], 11100100b
pshufd xmm4, BaseCache[ecx+64], 11100100b
movdqa WorkCache[ecx], xmm0
movdqa WorkCache[ecx+16], xmm1
movdqa WorkCache[ecx+32], xmm2
movdqa WorkCache[ecx+48], xmm3
movdqa WorkCache[ecx+64], xmm4
sub ecx, 80
jnc loop1
}
}
void firstSieve(void)
{
unsigned int i, j, k, k2, l, SieveIndex, p, p2;
LowSqrt = intSieveSqrt(LCM - 1);
CopyCache();
Prime[0] = 2;
Prime[1] = 3;
Prime[2] = 5;
Prime[3] = 7;
Prime[4] = 11;
Prime[5] = 13;
j = 5;
//得到已筛出的初始素数, 小于17 * 17
for (i = 17; i < (17 * 17); i += 2)
{
if (WorkCache[i])
Prime[++j] = i;
}
//利用新筛出的素数筛选, 最大应该283
for (i = 6; i <= j; i ++)
{
p = Prime[i];
p2 = p << 1;
for (k = p*p; k < LCM; k+=p2)
WorkCache[k] = 0;
}
// <= Prime[j] * Prime[j]已筛
//从SieveIndex下个素数筛
SieveIndex = j + 1;
k = Prime[j] * Prime[j];
for (i = Prime[SieveIndex - 1] + 2; i <= k; i += 2)
{
if (WorkCache[i])
Prime[++j] = i;
}
//最终筛,到sqrt(LCM)结束
for (i = SieveIndex; Prime[i] <= LowSqrt; i ++)
{
k = Prime[i] * Prime[i];
k2 = k << 1;
for (l = k; l < LCM; l += k2)
WorkCache[l] = 0;
}
//输出最后筛结果
k = Prime[j] * Prime[j];
for (i = k + 2; i < LCM; i += 2)
{
if (WorkCache[i])
Prime[++j] = i;
}
CurrPos = j;
}
void __fastcall OneSieve(unsigned int base)
{
unsigned int i, j, k, l, p2;
CopyCache();
HighSqrt = intSieveSqrt(base + LCM - 1);
for (i = 6; Prime[i] <= LowSqrt; i ++)
{
p2 = Prime[i] << 1;
j = (base / Prime[i]) * Prime[i];
if (j < base) j+= Prime[i];
if (j % 2 == 0) j += Prime[i];
j -= base;
for (k = j; k < LCM; k += p2)
WorkCache[k] = 0;
}
l = i;
for (i = l ; Prime[i] <= HighSqrt; i ++)
{
p2 = Prime[i] << 1;
j = Prime[i] * Prime[i] - base;
for (k = j; k < LCM; k += p2)
WorkCache[k] = 0;
}
for (i = 1; i < LCM; i +=2)
if (WorkCache[i])
Prime[++CurrPos] = base + i;
LowSqrt = HighSqrt;
}
int main(void)
{
UINT64 s_u64Frequency = 1;
UINT64 s_u64Start, s_u64End;
unsigned int i;
QueryPerformanceFrequency((LARGE_INTEGER *)&s_u64Frequency );
QueryPerformanceCounter((LARGE_INTEGER *)&s_u64Start );
initCache();
firstSieve();
for (i = 1; LCM * i <= 100000000; i ++)
OneSieve(i * LCM);
QueryPerformanceCounter((LARGE_INTEGER *)&s_u64End );
printf( "Elapsed time: %.3f ms\n",
(double)(( s_u64End - s_u64Start ) * 1000.0 / (double)s_u64Frequency ));
return 0;
}
代码见附件
Sieve8.rar
(2.62 KB, 下载次数: 14)
|
|