{
int i,count;
DWORD d0,d1,d2,d3, nlogSum;
UINT128 *pData=NULL;
scanf("%d",&count);
pData=new UINT128;
for (i=0;i<count;i++)
{
scanf("%u %u %u %u",&d0,&d1,&d2,&d3);
pData.U=d0;
pData.U=d1;
pData.U=d2;
pData.U=d3;
}
qsort( (void *)pData, (size_t)count, sizeof(UINT128), Uint128Compare );
nlogSum= logSum(pData,count);
delete[] pData; pData=NULL;
printf("%lu\n",nlogSum);
return 0;
} :b:
这么多汇编,可就有点麻烦了
呵呵
回复 161# liangbch 的帖子
for (i=0;i<count;i++){
scanf("%u %u %u %u",&d0,&d1,&d2,&d3);
pData.U=d0;
pData.U=d1;
pData.U=d2;
pData.U=d3;
}也可简化成以下形式吧?for (i=0;i<count;i++)
{
scanf("%u %u %u %u",&(pData.U),&(pData.U),&(pData.U),&(pData.U));
} 祝贺各位!
正如 litaoye 在 151# 所言:
原帖由 litaoye 于 2009-3-20 04:05 发表 http://bbs.emath.ac.cn/images/common/back.gif
。。。这道题的平均水平都被大家带高了! 这其中似乎有别的因素在影响时间
比如L2 Cache等
回复 163# gxqcn 的帖子
完全可以呀,不过对性能几乎没有影响。#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef struct
{
unsigned int d;
} UINT128;
unsigned int CompTable ={0,0, 0,0, 0,0, 0,1, 1,0, 1,0, 1,1, 2,0, 2,0, 2,1, 3,0, 3,0, 3,0, 3,1,
4,0,4,0,4,1,5,0,5,0,5,1,6,0,6,0,6,0,6,1,7,0,7,0,7,1,8,0,8,0,
8,1,9,0,9,0,9,0,9,1,10,0,10,0,10,1,11,0,11,0,11,1,12,0,12,
0,12,0,12,1,13,0,13,0,13,1,14,0,14,0,14,1,15,0,15,0,15,0,15,
1,16,0,16,0,16,1,17,0,17,0,17,1,18,0,18,0,18,0,18,1,19,0,19,
0,19,1,20,0,20,0,20,1,21,0,21,0,21,0,21,1,22,0,22,0,22,1,23,
0,23,0,23,1,24,0,24,0,24,0,24,1,25,0,25,0,25,1,26,0,26,0,26,
1,27,0,27,0,27,0,27,1,28,0,28,0,28,1,29,0,29,0,29,1,30,0,30,
0,30,1,31,0,31,0,31,0,31,1,32,0,32,0,32,1,33,0,33,0,33,1,34,
0,34,0,34,0,34,1,35,0,35,0,35,1,36,0,36,0,36,1,37,0,37,0,37,0,37,1,38,0};
UINT128 pow10 = {1,0,0,0,
10,0,0,0,
100,0,0,0,
1000,0,0,0,
10000,0,0,0,
100000,0,0,0,
1000000,0,0,0,
10000000,0,0,0,
100000000,0,0,0,
1000000000,0,0,0,
1410065408,2,0,0,
1215752192,23,0,0,
3567587328,232,0,0,
1316134912,2328,0,0,
276447232,23283,0,0,
2764472320,232830,0,0,
1874919424,2328306,0,0,
1569325056,23283064,0,0,
2808348672,232830643,0,0,
2313682944,2328306436,0,0,
1661992960,1808227885,5,0,
3735027712,902409669,54,0,
2990538752,434162106,542,0,
4135583744,46653770,5421,0,
2701131776,466537709,54210,0,
1241513984,370409800,542101,0,
3825205248,3704098002,5421010,0,
3892314112,2681241660,54210108,0,
268435456,1042612833,542101086,0,
2684354560,1836193738,1126043566,1,
1073741824,1182068202,2670501072,12,
2147483648,3230747430,935206946,126,
0,2242703233,762134875,1262,
0,952195850,3326381459,12621,
0,932023908,3199043520,126217,
0,730304488,1925664130,1262177,
0,3008077584,2076772117,12621774,
0,16004768,3587851993,126217744,
0,160047680,1518781562,1262177448,
0, 0, 0, 0};
/*
__declspec(naked)
int __fastcall log2(UINT128 * n)
{
__asm
{
bsr eax,
bsr edx,
add edx, 32
cmp , 0
cmovne eax, edx
bsr edx,
add edx, 64
cmp , 0
cmovne eax, edx
bsr edx,
add edx, 96
cmp , 0
cmovne eax, edx
ret
}
}
*/
int compare( const void * a, const void * b )
{
unsigned int * l, * r;
l = (unsigned int *)a; r = (unsigned int *)b;
if (*(l+3) != *(r + 3)) return (*(l + 3) > *(r+3) ? 1 : -1);
if (*(l+2) != *(r + 2)) return (*(l + 2) > *(r+2) ? 1 : -1);
if (*(l+1) != *(r + 1)) return (*(l + 1) > *(r+1) ? 1 : -1);
return (* l > * r ? 1 : (* l == * r ? 0 : -1));
}
__declspec(naked)
int __fastcall log_10(UINT128 * n)
{
__asm
{
bsr edx,
bsr eax,
add eax, 32
cmp dword ptr , 0
cmovne edx, eax
bsr eax,
add eax, 64
cmp dword ptr , 0
cmovne edx, eax
bsr eax,
add eax, 96
cmp dword ptr , 0
cmovne edx, eax
mov eax, dword ptr CompTable
cmp dword ptr CompTable, 0
je ex
add eax, 1
mov edx, eax
shl edx, 4
push ebx
mov ebx,
sub ebx, pow10
mov ebx,
sbb ebx, pow10
mov ebx,
sbb ebx, pow10
mov ebx,
sbb ebx, pow10
sbb ebx, ebx
add eax, ebx
pop ebx
ex:
ret
}
}
__declspec(naked)
int __fastcall logxor(UINT128 * a, UINT128 * b)
{
__asm
{
mov eax, 0
ret
}
}
int main(int argc, char* argv[])
{
int count,result,i,j, k;
UINT128 data, r;
scanf("%d",&count);
result=0;
i = 0;
while (i < count) {
scanf("%u", &data.d);
scanf("%u", &data.d);
scanf("%u", &data.d);
scanf("%u", &data.d);
i++;
}
qsort( (void *)data, (size_t)count, 16, compare );
for (i= 0; i < count - 1; i ++)
{
for (j = i + 1; j < count; j ++)
{
r.d = data.d ^ data.d;
r.d = data.d ^ data.d;
r.d = data.d ^ data.d;
r.d = data.d ^ data.d;
// printf("%u, %u, %u, %u", r.d, r.d, r.d, r.d);
// printf(":%u\n", log_10(&r));
result += log_10(&r);
}
}
printf("%u",result*2);
return 0;
}
0.156
和各位还存在一点点差距 有点昏昏的
好像,大概,似乎,寄存器不够用了啊
不知道 宝宝怎么设定的寄存器
回复 168# 无心人 的帖子
我的建议,如果先求128bit的xor运算再求 log10 就比较慢。我是将2者结合在一起的。在我的程序中,先对两个128bit数的最高32bit的边求xor,如果值为0,继续求下32bit。 直到结果非0。 如果前64bit是0,那么在和10^n比较的时候就只需比较后64bit 就可以了。我的xorLog10函数没有内存写指令,也不用SSE,SSE2,MMX 和 REP cmpd 等指令,寄存器完全够用了。 :)
俺用了xmm寄存器了
所以也够用了
就是汇编语句有点长
快100行了
所以毛病太多了
使劲调试中