无心人
发表于 2008-5-7 14:46:08
:)
你还没考虑SSE2的8路展开版本呢
无心人
发表于 2008-5-7 14:48:58
另外,你没比较超过L2缓存大小的数据
liangbch
发表于 2008-5-7 15:58:45
SSE2 的版本使用循环展开后提速不明显。我的版本在每个循环步使用2条movdqa指令,也就是说和A_Clear_R8一样,每个循环步处理8个DWORD;你的那个版本每个循环步使用1条movdqa指令,但速度没有明显变化。对于SSE2指令的版本,movdqa 是瓶颈,循环控制所花的时间可以忽略不计。
liangbch
发表于 2008-5-7 15:59:13
很大的内存块(超过L2cache)清零的优化,我暂时不考虑。
无心人
发表于 2008-5-7 16:13:40
:)
movq呢?
movdqa可能因为CPU数据总线是64位的,会有冲突
多几次movq看下效果如何?
无心人
发表于 2008-5-7 20:15:06
我对4#有疑问,具体自己检查
另外,你时间不要写这么多零好不好?
我眼睛不好,也数不过来
:lol
最后希望能看到你测试的源代码
还有,是否用了Intel编译器?
liangbch
发表于 2008-5-8 10:07:36
1.好吧,容我将单位改为毫秒。
2.编译器使用VC6.0,如果使用intel编译器,他会将简单循环作4路展开,C语言版的函数和汇编语言的4路循环展开版本大致相当。
无心人
发表于 2008-5-8 17:53:33
说几个对你测试的问题
1、清零的速度还可以提升
我算过,清零速度可达到2.5GB/s以上。我的机器不如你的,但我测试的清零速度似乎能达到2.0GB/s以上,而你的似乎达不到。
2、加法的时钟数:你在P4上的三操作数C代码是每双字6.0个时钟,而我算出我的是4.9个,说明加法要再改进一个时钟是很困难的。你的代码虽然是2^30进制,但和我的2^32进制应该不会存在明显的差异,因为你多出的操作应该不会造成很明显的延时。
liangbch
发表于 2008-5-8 22:46:29
这里给出测试代码,请检查我的测试方法是否科学。int main(int argc, char* argv[])
{
/* Entry point of the test driver: runs each test group once, in a fixed
   order (clear, copy, add, subtract, negate). argc and argv are not used. */
test_a_clear();
test_a_copy();
test_add();
test_sub();
test_neg();
/* Always reports success; none of the test groups return a status. */
return 0;
}
/*
 * Runs test_clear_speed() on an 8192-DWORD buffer once for every value
 * 0..7 of its second (offset) argument, in ascending order.
 * The createFolders() and test_clear_function(256) calls stay disabled.
 */
void test_a_clear()
{
    const int dword_count = 8192;
    int offset;

    for (offset = 0; offset <= 7; offset++) {
        test_clear_speed(dword_count, offset);
    }
}
/*
 * Runs test_copy_speed() on an 8192-DWORD buffer for all 16 combinations
 * of source offset 0..3 and target offset 0..3. The source offset is the
 * outer loop, so the call order is (0,0), (0,1), ... (3,3).
 * The createFolders() and test_copy_function(256) calls stay disabled.
 */
void test_a_copy()
{
    const int dword_count = 8192;
    int src_off;
    int dst_off;

    for (src_off = 0; src_off < 4; src_off++) {
        for (dst_off = 0; dst_off < 4; dst_off++) {
            test_copy_speed(dword_count, src_off, dst_off);
        }
    }
}
/*
 * Addition test group: calls test_add_function() with length 1024,
 * then test_add_speed() with length 8192.
 */
void test_add()
{
    enum { FUNCTION_TEST_LEN = 1024, SPEED_TEST_LEN = 8192 };

    test_add_function(FUNCTION_TEST_LEN);
    test_add_speed(SPEED_TEST_LEN);
}
/*
 * Subtraction test group: calls test_sub_function() with length 1024,
 * then test_sub_speed() with length 8192.
 */
void test_sub()
{
    enum { FUNCTION_TEST_LEN = 1024, SPEED_TEST_LEN = 8192 };

    test_sub_function(FUNCTION_TEST_LEN);
    test_sub_speed(SPEED_TEST_LEN);
}
/*
 * Negation test group: calls test_neg_function() with length 1024,
 * then test_neg_speed() with length 8192.
 */
void test_neg()
{
    enum { FUNCTION_TEST_LEN = 1024, SPEED_TEST_LEN = 8192 };

    test_neg_function(FUNCTION_TEST_LEN);
    test_neg_speed(SPEED_TEST_LEN);
}
/*
 * Rounds p up to the next multiple of `alignment` bytes.
 * `alignment` must be a power of two. The address arithmetic is done in
 * size_t rather than DWORD so the pointer is not truncated on builds where
 * pointers are wider than 32 bits.
 */
static DWORD *clear_speed_align_up(DWORD *p, size_t alignment)
{
    size_t addr = (size_t)p;

    addr = (addr + (alignment - 1)) & ~(alignment - 1);
    return (DWORD *)addr;
}

/*
 * Times five ways of zeroing `len` DWORDs: memset, A_Clear, A_Clear_R8,
 * A_Clear_With_SSE2 and yaos_AsmMemZero0, and prints the average time of
 * one call of each.
 *
 * len            - number of DWORDs cleared by every call.
 * align16_offset - number of DWORDs added to each 64-byte-aligned buffer
 *                  start, used to test differently aligned destinations.
 *
 * Each function gets its own destination buffer. Before every round all
 * buffers are refilled from pData so each call clears non-zero data.
 * Rounds are repeated until the accumulated memset time reaches 0.001
 * (presumably seconds, since results are multiplied by 1000 and labelled
 * "ms" -- confirm against currTime()).
 *
 * If any allocation fails, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test_clear_speed( int len,int align16_offset)
{
    enum { BUF_COUNT = 5, ALIGN_BYTES = 64 };
    DWORD *pData;
    DWORD *pRaw[BUF_COUNT];   /* pointers returned by malloc, kept for free() */
    DWORD *pBuff[BUF_COUNT];  /* aligned + offset destinations actually used  */
    double total[BUF_COUNT];  /* accumulated time per function                */
    double start;
    int count = 0;
    int all_allocated;
    int i;

    /* The malloc casts are kept so the file still builds as C++. */
    pData = (DWORD *)malloc( (len+64)* sizeof(DWORD));
    all_allocated = (pData != NULL);
    for (i = 0; i < BUF_COUNT; i++) {
        pRaw[i] = (DWORD *)malloc( (len+64)* sizeof(DWORD));
        if (pRaw[i] == NULL) {
            all_allocated = 0;
        }
        total[i] = 0.00;
    }
    if (!all_allocated) {
        fprintf(stderr, "test_clear_speed: out of memory\n");
        goto cleanup;
    }

    MadeNum(pData,len+32);

    /* The 64 spare DWORDs per buffer cover the alignment padding plus offset. */
    for (i = 0; i < BUF_COUNT; i++) {
        pBuff[i] = clear_speed_align_up(pRaw[i], ALIGN_BYTES) + align16_offset;
    }

    do {
        for (i = 0; i < BUF_COUNT; i++) {
            memcpy(pBuff[i],pData,(len)*sizeof(DWORD));
        }

        start = currTime();
        memset(pBuff[0],0,len*sizeof(DWORD));
        total[0] += currTime()-start;

        start = currTime();
        A_Clear(pBuff[1],len);
        total[1] += currTime()-start;

        start = currTime();
        A_Clear_R8(pBuff[2],len);
        total[2] += currTime()-start;

        start = currTime();
        A_Clear_With_SSE2(pBuff[3],len);
        total[3] += currTime()-start;

        start = currTime();
        yaos_AsmMemZero0(pBuff[4],len);
        total[4] += currTime()-start;

        count++;
    } while (total[0] < 0.001);   /* only the memset total decides when to stop */

    for (i = 0; i < BUF_COUNT; i++) {
        total[i] /= count;
    }

    printf("\n\nClear zero function speed test,length unit: DWORD, time unit: ms\n");
    for (i = 0; i < BUF_COUNT; i++) {
        /* %p instead of %x: a pointer may not fit the unsigned int %x expects. */
        printf("pBuff%d=%p\t", i + 1, (void *)pBuff[i]);
    }
    printf("\nlength\taddress\tmemset\tA_Clean\tA_Clean_R8\tA_Clear_With_SSE2\tyaos_AsmMemZero0\n");
    printf("%d\t32X+%d\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\n",
           len,align16_offset*4,
           total[0]*1000,total[1]*1000,total[2]*1000,total[3]*1000,total[4]*1000);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(pData);
    for (i = 0; i < BUF_COUNT; i++) {
        free(pRaw[i]);
    }
}
/*
 * Rounds p up to the next multiple of `alignment` bytes.
 * `alignment` must be a power of two. The address arithmetic is done in
 * size_t rather than DWORD so the pointer is not truncated on builds where
 * pointers are wider than 32 bits.
 */
static DWORD *copy_speed_align_up(DWORD *p, size_t alignment)
{
    size_t addr = (size_t)p;

    addr = (addr + (alignment - 1)) & ~(alignment - 1);
    return (DWORD *)addr;
}

/*
 * Times four ways of copying `len` DWORDs: memcpy, A_Copy,
 * A_Copy_With_SSE2 and A_Copy_R8, and prints the best per-call time of each.
 *
 * len        - number of DWORDs copied by every call.
 * src_offset - DWORDs added to the 32-byte-aligned source start.
 * tag_offset - DWORDs added to each 32-byte-aligned target start.
 *
 * Every function is measured in 8 batches of 1024 calls. The reported value
 * is the smallest batch time divided by 1024, multiplied by 1000 for
 * printing (labelled "ms" in the output).
 *
 * If any allocation fails, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test_copy_speed( int len,int src_offset,int tag_offset)
{
    enum { TARGET_COUNT = 4, ALIGN_BYTES = 32, BATCHES = 8 };
    DWORD *pData;
    DWORD *pRaw[TARGET_COUNT];  /* pointers returned by malloc, kept for free() */
    DWORD *ptag[TARGET_COUNT];  /* aligned + offset destinations                */
    DWORD *psrc;                /* aligned + offset source                      */
    double best[TARGET_COUNT];
    double start;
    double per_call;
    int calls = 1024;
    int all_allocated;
    int batch;
    int call;
    int i;

    /* The malloc casts are kept so the file still builds as C++. */
    pData = (DWORD *)malloc( (len+32)* sizeof(DWORD));
    all_allocated = (pData != NULL);
    for (i = 0; i < TARGET_COUNT; i++) {
        pRaw[i] = (DWORD *)malloc( (len+32)* sizeof(DWORD));
        if (pRaw[i] == NULL) {
            all_allocated = 0;
        }
        best[i] = 1000;   /* larger than any expected per-call time */
    }
    if (!all_allocated) {
        fprintf(stderr, "test_copy_speed: out of memory\n");
        goto cleanup;
    }

    MadeNum(pData,len+32);

    /* The 32 spare DWORDs per buffer cover the alignment padding plus offset. */
    psrc = copy_speed_align_up(pData, ALIGN_BYTES) + src_offset;
    for (i = 0; i < TARGET_COUNT; i++) {
        ptag[i] = copy_speed_align_up(pRaw[i], ALIGN_BYTES) + tag_offset;
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            memcpy(ptag[0],psrc,len*sizeof(DWORD));
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best[0]) {
            best[0] = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            A_Copy(ptag[1],psrc,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best[1]) {
            best[1] = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            A_Copy_With_SSE2(ptag[2],psrc,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best[2]) {
            best[2] = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            A_Copy_R8(ptag[3],psrc,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best[3]) {
            best[3] = per_call;
        }
    }

    printf("\n\n4 copy function speed test,time unit: ms,length unint:DWORD\n");
    printf("length\tsource address\ttarget address\tmemcpy\tA_Copy\tA_Copy_With_SSE2\tA_Copy_R8\n");
    printf("%d\t16X+%d\t16X+%d\t%.6f\t%.6f\t%.6f\t%.6f\n",
           len,src_offset*4,tag_offset*4,
           best[0]*1000,best[1]*1000,best[2]*1000,best[3]*1000);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(pData);
    for (i = 0; i < TARGET_COUNT; i++) {
        free(pRaw[i]);
    }
}
liangbch
发表于 2008-5-8 22:47:21
接楼上:
/*
 * Times five addition routines on `len`-DWORD operands and prints the best
 * per-call time of each: BIN_Add_C, BIN_Add, BIN_Add_4way_unroll (called
 * with a result buffer and one operand) and BIN_Add_3P_C,
 * BIN_Add_3P_4way_unroll (called with a result buffer and two operands).
 *
 * len - operand length in DWORDs passed to every routine.
 *
 * Every routine is measured in 8 batches of 1024 calls. The reported value
 * is the smallest batch time divided by 1024, multiplied by 1000 for
 * printing (labelled "ms" in the output). The two-argument routines are
 * called repeatedly on the same result buffer, so only the timing, not the
 * buffer contents, is meaningful here.
 *
 * If any allocation fails, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test_add_speed( int len)
{
    enum { BATCHES = 8 };
    /* The malloc casts are kept so the file still builds as C++. */
    DWORD *pData1=(DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pData2=(DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult1= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult2= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult3= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult4= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult5= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    double best1=1000, best2=1000, best3=1000, best4=1000, best5=1000;
    double start = 0;
    double per_call = 0;
    int calls = 1024;
    int batch = 0;
    int call = 0;

    if (pData1 == NULL || pData2 == NULL || pResult1 == NULL ||
        pResult2 == NULL || pResult3 == NULL || pResult4 == NULL ||
        pResult5 == NULL) {
        fprintf(stderr, "test_add_speed: out of memory\n");
        goto cleanup;
    }

    MadeNum(pData1,len+4);
    MadeNum(pData2,len+4);
    /* Only the first three result buffers are seeded from pData1; pResult4
       and pResult5 are passed as the first argument of the 3P routines. */
    A_Copy(pResult1,pData1,len+4);
    A_Copy(pResult2,pData1,len+4);
    A_Copy(pResult3,pData1,len+4);

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Add_C(pResult1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best1) {
            best1 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Add(pResult2,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best2) {
            best2 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Add_4way_unroll(pResult3,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best3) {
            best3 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Add_3P_C(pResult4,pData1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best4) {
            best4 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Add_3P_4way_unroll(pResult5,pData1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best5) {
            best5 = per_call;
        }
    }

    printf("\n\n5 add function speed test,time unit: ms,length unint:DWORD\n");
    printf("length\tBIN_ADD_C\tBIN_Add\tBIN_Add_4way_unroll\tBIN_Add_3P_C\tBIN_Add_3P_4way_unroll\n");
    printf("%d\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\n",len,
           best1*1000,best2*1000,best3*1000,best4*1000,best5*1000);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(pData1);
    free(pData2);
    free(pResult1);
    free(pResult2);
    free(pResult3);
    free(pResult4);
    free(pResult5);
}
/*
 * Times five subtraction routines on `len`-DWORD operands and prints the
 * best per-call time of each: BIN_Sub_C, BIN_Sub, BIN_Sub_4way_unroll
 * (called with a result buffer and one operand) and BIN_Sub_3P_C,
 * BIN_Sub_3P_4way_unroll (called with a result buffer and two operands).
 *
 * len - operand length in DWORDs passed to every routine.
 *
 * Every routine is measured in 8 batches of 1024 calls. The reported value
 * is the smallest batch time divided by 1024, multiplied by 1000 for
 * printing (labelled "ms" in the output).
 *
 * If any allocation fails, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test_sub_speed( int len)
{
    enum { BATCHES = 8 };
    /* The malloc casts are kept so the file still builds as C++. */
    DWORD *pData1=(DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pData2=(DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult1= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult2= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult3= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult4= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult5= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    double best1=1000, best2=1000, best3=1000, best4=1000, best5=1000;
    double start = 0;
    double per_call = 0;
    int calls = 1024;
    int batch = 0;
    int call = 0;

    if (pData1 == NULL || pData2 == NULL || pResult1 == NULL ||
        pResult2 == NULL || pResult3 == NULL || pResult4 == NULL ||
        pResult5 == NULL) {
        fprintf(stderr, "test_sub_speed: out of memory\n");
        goto cleanup;
    }

    /* NOTE(review): this is the only speed test here that calls
       createFolders(); its effect is not visible in this file -- confirm
       it is still needed. */
    createFolders();
    MadeNum(pData1,len+4);
    MadeNum(pData2,len+4);
    /* Only the first three result buffers are seeded from pData1; pResult4
       and pResult5 are passed as the first argument of the 3P routines. */
    A_Copy(pResult1,pData1,len+4);
    A_Copy(pResult2,pData1,len+4);
    A_Copy(pResult3,pData1,len+4);

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Sub_C(pResult1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best1) {
            best1 = per_call;
        }
    }

    /* Fixed: this loop used to time BIN_Sub_C a second time although its
       result is printed under the "BIN_Sub" column. It now calls BIN_Sub,
       mirroring the BIN_Add_C / BIN_Add pair in test_add_speed().
       NOTE(review): BIN_Sub is assumed to take the same arguments as
       BIN_Sub_C -- confirm against its declaration. */
    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Sub(pResult2,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best2) {
            best2 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Sub_4way_unroll(pResult3,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best3) {
            best3 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Sub_3P_C(pResult4,pData1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best4) {
            best4 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Sub_3P_4way_unroll(pResult5,pData1,pData2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best5) {
            best5 = per_call;
        }
    }

    printf("\n\n5sub function speed test,time unit: ms,length unint:DWORD\n");
    printf("length\tBIN_Sub_C\tBIN_Sub\tBIN_sub_4way_unroll\tBIN_sub_3P_C\tBIN_sub_3P_4way_unroll\n");
    printf("%d\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\n",
           len,best1*1000,best2*1000,best3*1000,best4*1000,best5*1000);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(pData1);
    free(pData2);
    free(pResult1);
    free(pResult2);
    free(pResult3);
    free(pResult4);
    free(pResult5);
}
/*
 * Times four negation routines on a `len`-DWORD operand and prints the best
 * per-call time of each: BIN_Negate, BIN_Negate_C, BIN_Negate_4way_unroll
 * and BIN_Negate_SSE2. Each routine is called on its own buffer, seeded
 * with a copy of the same data.
 *
 * len - operand length in DWORDs passed to every routine.
 *
 * Every routine is measured in 8 batches of 1024 calls. The reported value
 * is the smallest batch time divided by 1024, multiplied by 1000 for
 * printing (labelled "ms" in the output).
 *
 * If any allocation fails, a message is written to stderr and the function
 * returns without running the benchmark.
 */
void test_neg_speed( int len)
{
    enum { BATCHES = 8 };
    /* The malloc casts are kept so the file still builds as C++. */
    DWORD *pData1=(DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult1= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult2= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult3= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    DWORD *pResult4= (DWORD *)malloc( (len+4)* sizeof(DWORD));
    double best1=1000, best2=1000, best3=1000, best4=1000;
    double start = 0;
    double per_call = 0;
    int calls = 1024;
    int batch = 0;
    int call = 0;

    if (pData1 == NULL || pResult1 == NULL || pResult2 == NULL ||
        pResult3 == NULL || pResult4 == NULL) {
        fprintf(stderr, "test_neg_speed: out of memory\n");
        goto cleanup;
    }

    MadeNum(pData1,len+4);
    A_Copy(pResult1,pData1,len+4);
    A_Copy(pResult2,pData1,len+4);
    A_Copy(pResult3,pData1,len+4);
    A_Copy(pResult4,pData1,len+4);

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Negate(pResult1,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best1) {
            best1 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Negate_C(pResult2,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best2) {
            best2 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Negate_4way_unroll(pResult3,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best3) {
            best3 = per_call;
        }
    }

    for (batch = 0; batch < BATCHES; batch++) {
        start = currTime();
        for (call = 0; call < calls; call++) {
            BIN_Negate_SSE2(pResult4,len);
        }
        per_call = (currTime()-start) / calls;
        if (per_call < best4) {
            best4 = per_call;
        }
    }

    printf("\n\n4 neg function speed test,time unit: ms,length unint:DWORD\n");
    printf("length\tBIN_NEG\t\tBIN_NEG_C\tBIN_NEG_4way_unroll\tBIN_Negate_SSE2\n");
    printf("%d\t%.6f\t%.6f\t%.6f\t%.6f\n",len,
           best1*1000,best2*1000,best3*1000,best4*1000);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
    free(pData1);
    free(pResult1);
    free(pResult2);
    free(pResult3);
    free(pResult4);
}