| 
注册时间2007-12-26最后登录1970-1-1威望 星金币 枚贡献 分经验 点鲜花 朵魅力 点上传 次下载 次积分92885在线时间 小时 
 | 
 
 发表于 2008-3-5 19:41:41
|
显示全部楼层 
测试程序模板| 感觉这次擂台比上次的 UInt96x96To192() 更有意义, 为了大家方便交流和测试,我做了一个模板:仅一个.c文件再加一个dsp文件即可。
 
 其中为了方便大家自测算法的正确性,我已完成了一个标准c的版本,
 该版本的结果与我用HugeCalc的一致,并经过反复推敲论证过的,应该是无bug且高效的。
 代码如下:
 
/************************************************************************/
/* UInt128x128To256.c                                                   */
/************************************************************************/
#include < wtypes.h >
#include < stdio.h >
#if 0
#   define _RAND_TEST   /* 随机测试 */
#endif
#ifndef _DEBUG          /* 测试次数 */
#  define _TEST_TIMES   10000000UL
#else
#  define _TEST_TIMES   1UL     /* 先DEBUG算法的正确性 */
#endif
typedef void ( *lpfn_UInt128x128To256 )( UINT32 * const result,
                                        const UINT32 * const left,
                                        const UINT32 * const right );
static BOOL s_bSupport_MMX = FALSE;
static BOOL s_bSupport_SSE = FALSE;
static BOOL s_bSupport_SSE2 = FALSE;
static UINT64 s_u64Frequency = 1;
static UINT64 s_u64Start, s_u64End;
static BYTE buffer[(4+4+8)*4+15];
static UINT32 * left = NULL;
static UINT32 * right = NULL;
static UINT32 * result = NULL;
void initParam( void )
{
    /* 16字节对齐,以便于 SSE/SSE2 优化 */
    left = (UINT32 *)( ((UINT32)( buffer+15 )) & -16 );
    right = left + 4;
    result = right + 4;
    /* 待测函数中不得假定三个指针的相对偏移量情况 */
#ifndef _RAND_TEST
    left[0] = 0xFFFFFFFF;
    left[1] = 0xFFFFFFFF;
    left[2] = 0xFFFFFFFF;
    left[3] = 0xFFFFFFFF;
    right[0] = 0xFFFFFFFF;
    right[1] = 0xFFFFFFFF;
    right[2] = 0xFFFFFFFF;
    right[3] = 0xFFFFFFFF;
#else
    /* 随机数确保可充满32bits */
#   define RAND_VAL()    (( (UINT32)(rand()) << 17 ) | rand())
    srand(GetTickCount());
    left[0] = RAND_VAL();
    left[1] = RAND_VAL();
    left[2] = RAND_VAL();
    left[3] = RAND_VAL();
    right[0] = RAND_VAL();
    right[1] = RAND_VAL();
    right[2] = RAND_VAL();
    right[3] = RAND_VAL();
#endif
    /* 测试 CPU 支持的指令集 */
    __asm
    {
        mov     eax, 1;
        cpuid;
        mov     ecx, 800000h;   /* 23 bit */
        and     ecx, edx;
        neg     ecx;
        sbb     ecx, ecx;
        neg     ecx;
        mov     dword ptr[ s_bSupport_MMX ], ecx;
        mov     ecx, 2000000h;  /* 25 bit */
        and     ecx, edx;
        neg     ecx;
        sbb     ecx, ecx;
        neg     ecx;
        mov     dword ptr[ s_bSupport_SSE ], ecx;
        mov     ecx, 4000000h;  /* 26 bit */
        and     ecx, edx;
        neg     ecx;
        sbb     ecx, ecx;
        neg     ecx;
        mov     dword ptr[ s_bSupport_SSE2 ], ecx;
    }
    QueryPerformanceFrequency((LARGE_INTEGER *)&s_u64Frequency );
}
/************************************************************************/
/* UInt128x128To256 ANSI C 版,经过了严格测试                           */
/************************************************************************/
void UInt128x128To256_ANSI_C32( UINT32 * const result,
                               const UINT32 * const left,
                               const UINT32 * const right )
{
    typedef union tag_UINT64
    {
        UINT64 u64Val;
        UINT32 u32LH[2];
    } UInt64;
    UInt64 u64_0x0, u64_0x1, u64_0x2, u64_0x3;
    UInt64 u64_1x0, u64_1x1, u64_1x2, u64_1x3;
    UInt64 u64_2x0, u64_2x1, u64_2x2, u64_2x3;
    UInt64 u64_3x0, u64_3x1, u64_3x2, u64_3x3;
    u64_0x0.u64Val = UInt32x32To64( left[0], right[0] );
    u64_0x1.u64Val = UInt32x32To64( left[0], right[1] );
    u64_0x2.u64Val = UInt32x32To64( left[0], right[2] );
    u64_0x3.u64Val = UInt32x32To64( left[0], right[3] );
    u64_1x0.u64Val = UInt32x32To64( left[1], right[0] );
    u64_1x1.u64Val = UInt32x32To64( left[1], right[1] );
    u64_1x2.u64Val = UInt32x32To64( left[1], right[2] );
    u64_1x3.u64Val = UInt32x32To64( left[1], right[3] );
    u64_2x0.u64Val = UInt32x32To64( left[2], right[0] );
    u64_2x1.u64Val = UInt32x32To64( left[2], right[1] );
    u64_2x2.u64Val = UInt32x32To64( left[2], right[2] );
    u64_2x3.u64Val = UInt32x32To64( left[2], right[3] );
    u64_3x0.u64Val = UInt32x32To64( left[3], right[0] );
    u64_3x1.u64Val = UInt32x32To64( left[3], right[1] );
    u64_3x2.u64Val = UInt32x32To64( left[3], right[2] );
    u64_3x3.u64Val = UInt32x32To64( left[3], right[3] );
    /*                                        FF FE  00 01 --[0][0]
                                       FF FE  00 01 --[0][1]
                                       FF FE  00 01 --[1][0]
                                FF FE  00 01 --[0][2]
                                FF FE  00 01 --[1][1]
                                FF FE  00 01 --[2][0]
                         FF FE  00 01 --[0][3]
                         FF FE  00 01 --[1][2]
                         FF FE  00 01 --[2][1]
                         FF FE  00 01 --[3][0]
                  FF FE  00 01 --[1][3]
                  FF FE  00 01 --[2][2]
                  FF FE  00 01 --[3][1]
           FF FE  00 01 --[2][3]
           FF FE  00 01 --[3][2]
    FF FE  00 01 --[3][3]  表示  FFFF FFFE  0000 0001*/
    u64_0x1.u64Val += u64_0x0.u32LH[1];
    u64_0x1.u64Val += u64_1x0.u64Val;
    u64_2x0.u32LH[1] += ( u64_0x1.u64Val < u64_1x0.u64Val );
    u64_0x2.u64Val += u64_0x1.u32LH[1];
    u64_0x2.u64Val += u64_1x1.u64Val;
    u64_3x0.u32LH[1] += ( u64_0x2.u64Val < u64_1x1.u64Val );
    u64_0x2.u64Val += u64_2x0.u64Val;
    u64_2x1.u32LH[1] += ( u64_0x2.u64Val < u64_2x0.u64Val );
    u64_0x3.u64Val += u64_0x2.u32LH[1];
    u64_0x3.u64Val += u64_1x2.u64Val;
    u64_3x1.u32LH[1] += ( u64_0x3.u64Val < u64_1x2.u64Val );
    u64_0x3.u64Val += u64_2x1.u64Val;
    u64_2x2.u32LH[1] += ( u64_0x3.u64Val < u64_2x1.u64Val );
    u64_0x3.u64Val += u64_3x0.u64Val;
    u64_2x3.u64Val += ( u64_0x3.u64Val < u64_3x0.u64Val );
    u64_1x3.u64Val += u64_0x3.u32LH[1];
    u64_1x3.u64Val += u64_2x2.u64Val;
    u64_3x2.u32LH[1] += ( u64_1x3.u64Val < u64_2x2.u64Val );
    u64_1x3.u64Val += u64_3x1.u64Val;
    u64_3x3.u64Val += ( u64_1x3.u64Val < u64_3x1.u64Val );
    u64_2x3.u64Val += u64_1x3.u32LH[1];
    u64_2x3.u64Val += u64_3x2.u64Val;
    u64_3x3.u32LH[1] += ( u64_2x3.u64Val < u64_3x2.u64Val );
    u64_3x3.u64Val += u64_2x3.u32LH[1];
    result[0] = u64_0x0.u32LH[0];
    result[1] = u64_0x1.u32LH[0];
    result[2] = u64_0x2.u32LH[0];
    result[3] = u64_0x3.u32LH[0];
    result[4] = u64_1x3.u32LH[0];
    result[5] = u64_2x3.u32LH[0];
    result[6] = u64_3x3.u32LH[0];
    result[7] = u64_3x3.u32LH[1];
}
/************************************************************************/
/* 测试函数代码粘贴区开始 begin{                                        */
/*----------------------------------------------------------------------*/
/* 待测函数命名规范(推荐):UInt128x128To256_{1}_{2}(...)                */
/*     其中{1}代表用到的最高级指令集;                                  */
/*         {2}代表发帖楼层,如果写错了可以自行修订或请管理员帮助修改    */
/* 以后大家只需发自己待测函数的代码即可,不必再贴全测试代码             */
/************************************************************************/
_declspec(naked)
void UInt128x128To256_MMX_xxxF( UINT32 * const result,
                               const UINT32 * const left,
                               const UINT32 * const right )
{
    __asm
    {
        /* do something ... */
        ret;
    }
}
_declspec(naked)
void UInt128x128To256_SSE_xxxF( UINT32 * const result,
                               const UINT32 * const left,
                               const UINT32 * const right )
{
    __asm
    {
        /* do something ... */
        ret;
    }
}
_declspec(naked)
void UInt128x128To256_SSE2_11F( UINT32 * const result,
                               const UINT32 * const left,
                               const UINT32 * const right )
{
    __asm
    {
        /* 具体代码在 11#,需登陆才可见 */
        ret;
    }
}
/************************************************************************/
/* }end 测试函数代码粘贴区结束                                          */
/************************************************************************/
void testFun( const lpfn_UInt128x128To256 pFun,
             const LPCTSTR lpszFunName,
             const UINT32 u32TestTimes )
{
    UINT32 i;
    printf( "\nTest function: %s(..) %u times... \n",
        lpszFunName, u32TestTimes );
    QueryPerformanceCounter((LARGE_INTEGER *)&s_u64Start );
    for ( i = 0; i < u32TestTimes; ++i )
    {
        (*pFun)( result, left, right );
    }
    QueryPerformanceCounter((LARGE_INTEGER *)&s_u64End );
    i = (UINT32)(( s_u64End - s_u64Start ) * 1000000UL / s_u64Frequency );
    printf( "Elapsed time: %d.%03u ms\n", i / 1000, i % 1000 );
    printf( "  %08X %08X %08X %08X * %08X %08X %08X %08X\n"
        "= %08X %08X %08X %08X   %08X %08X %08X %08X\n",
        left[3], left[2], left[1], left[0],
        right[3], right[2], right[1], right[0],
        result[7], result[6], result[5], result[4],
        result[3], result[2], result[1], result[0] );
}
int main(int argc, char* argv[])
{
    initParam();
    /* 标准结果 */
    testFun( UInt128x128To256_ANSI_C32,
            "UInt128x128To256_ANSI_C32", /*1*/ _TEST_TIMES );
    /* MMX 版本测试 */
    if ( s_bSupport_MMX )
    {
        testFun( UInt128x128To256_MMX_xxxF,
                "UInt128x128To256_MMX_xxxF", _TEST_TIMES );
        /* test other functions:
        testFun( ... );
        */
    }
    /* SSE 版本测试 */
    if ( s_bSupport_SSE )
    {
        testFun( UInt128x128To256_SSE_xxxF,
                "UInt128x128To256_SSE_xxxF", _TEST_TIMES );
        /* test other functions:
        testFun( ... );
        */
    }
    /* SSE2 版本测试 */
    if ( s_bSupport_SSE2 )
    {
        testFun( UInt128x128To256_SSE2_11F,
                "UInt128x128To256_SSE2_11F", _TEST_TIMES );
        /* test other functions:
        testFun( ... );
        */
    }
    printf( "\n" );
    system( "pause" );
    return 0;
}
 完整的测试模板包如下:
  UInt128x128To256.zip
(3.75 KB, 下载次数: 12)
(包含一个.c文件,及dsp文件) 
 请有兴趣的朋友在本地机上编译运行;如发现任何问题,请及时反馈,以便修正,谢谢!
 | 
 |