liangbch
发表于 2017-1-15 17:36:33
仍然使用11楼的思想,做进一步优化。这个版本使用1次查表(表的大小为128个64bit数),2次比较,4次32位数乘法,10次32位数和64位数移位运算。
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
typedef int BOOL;
#define TRUE 1
#define FALSE 0
typedef unsigned int uint32_t;
#if defined(_MSC_VER)
typedef unsigned __int64 uint64_t;
#define RAD 1000000000ui64
#define HALF_RAD 500000000ui64
#define I64_1 1i64
uint64_t prodArr[]=
{
0ui64,
144115188000000000ui64,
288230376000000000ui64,
432345564000000000ui64,
576460752000000000ui64,
720575940000000000ui64,
864691128000000000ui64,
1008806316000000000ui64,
1152921504000000000ui64,
1297036692000000000ui64,
1441151880000000000ui64,
1585267068000000000ui64,
1729382256000000000ui64,
1873497444000000000ui64,
2017612633000000000ui64,
2161727821000000000ui64,
2305843009000000000ui64,
2449958197000000000ui64,
2594073385000000000ui64,
2738188573000000000ui64,
2882303761000000000ui64,
3026418949000000000ui64,
3170534137000000000ui64,
3314649325000000000ui64,
3458764513000000000ui64,
3602879701000000000ui64,
3746994889000000000ui64,
3891110078000000000ui64,
4035225266000000000ui64,
4179340454000000000ui64,
4323455642000000000ui64,
4467570830000000000ui64,
4611686018000000000ui64,
4755801206000000000ui64,
4899916394000000000ui64,
5044031582000000000ui64,
5188146770000000000ui64,
5332261958000000000ui64,
5476377146000000000ui64,
5620492334000000000ui64,
5764607523000000000ui64,
5908722711000000000ui64,
6052837899000000000ui64,
6196953087000000000ui64,
6341068275000000000ui64,
6485183463000000000ui64,
6629298651000000000ui64,
6773413839000000000ui64,
6917529027000000000ui64,
7061644215000000000ui64,
7205759403000000000ui64,
7349874591000000000ui64,
7493989779000000000ui64,
7638104968000000000ui64,
7782220156000000000ui64,
7926335344000000000ui64,
8070450532000000000ui64,
8214565720000000000ui64,
8358680908000000000ui64,
8502796096000000000ui64,
8646911284000000000ui64,
8791026472000000000ui64,
8935141660000000000ui64,
9079256848000000000ui64,
9223372036000000000ui64,
9367487224000000000ui64,
9511602413000000000ui64,
9655717601000000000ui64,
9799832789000000000ui64,
9943947977000000000ui64,
10088063165000000000ui64,
10232178353000000000ui64,
10376293541000000000ui64,
10520408729000000000ui64,
10664523917000000000ui64,
10808639105000000000ui64,
10952754293000000000ui64,
11096869481000000000ui64,
11240984669000000000ui64,
11385099857000000000ui64,
11529215046000000000ui64,
11673330234000000000ui64,
11817445422000000000ui64,
11961560610000000000ui64,
12105675798000000000ui64,
12249790986000000000ui64,
12393906174000000000ui64,
12538021362000000000ui64,
12682136550000000000ui64,
12826251738000000000ui64,
12970366926000000000ui64,
13114482114000000000ui64,
13258597302000000000ui64,
13402712491000000000ui64,
13546827679000000000ui64,
13690942867000000000ui64,
13835058055000000000ui64,
13979173243000000000ui64,
14123288431000000000ui64,
14267403619000000000ui64,
14411518807000000000ui64,
14555633995000000000ui64,
14699749183000000000ui64,
14843864371000000000ui64,
14987979559000000000ui64,
15132094747000000000ui64,
15276209936000000000ui64,
15420325124000000000ui64,
15564440312000000000ui64,
15708555500000000000ui64,
15852670688000000000ui64,
15996785876000000000ui64,
16140901064000000000ui64,
16285016252000000000ui64,
16429131440000000000ui64,
16573246628000000000ui64,
16717361816000000000ui64,
16861477004000000000ui64,
17005592192000000000ui64,
17149707381000000000ui64,
17293822569000000000ui64,
17437937757000000000ui64,
17582052945000000000ui64,
17726168133000000000ui64,
17870283321000000000ui64,
18014398509000000000ui64,
18158513697000000000ui64,
18302628885000000000ui64
};
#elif defined(__GNUC__)
typedef unsigned long long uint64_t;
#define HALF_RAD 500000000LL
#define RAD 1000000000LL
#define I64_11LL
uint64_t prodArr[]=
{
0LL,
144115188000000000LL,
288230376000000000LL,
432345564000000000LL,
576460752000000000LL,
720575940000000000LL,
864691128000000000LL,
1008806316000000000LL,
1152921504000000000LL,
1297036692000000000LL,
1441151880000000000LL,
1585267068000000000LL,
1729382256000000000LL,
1873497444000000000LL,
2017612633000000000LL,
2161727821000000000LL,
2305843009000000000LL,
2449958197000000000LL,
2594073385000000000LL,
2738188573000000000LL,
2882303761000000000LL,
3026418949000000000LL,
3170534137000000000LL,
3314649325000000000LL,
3458764513000000000LL,
3602879701000000000LL,
3746994889000000000LL,
3891110078000000000LL,
4035225266000000000LL,
4179340454000000000LL,
4323455642000000000LL,
4467570830000000000LL,
4611686018000000000LL,
4755801206000000000LL,
4899916394000000000LL,
5044031582000000000LL,
5188146770000000000LL,
5332261958000000000LL,
5476377146000000000LL,
5620492334000000000LL,
5764607523000000000LL,
5908722711000000000LL,
6052837899000000000LL,
6196953087000000000LL,
6341068275000000000LL,
6485183463000000000LL,
6629298651000000000LL,
6773413839000000000LL,
6917529027000000000LL,
7061644215000000000LL,
7205759403000000000LL,
7349874591000000000LL,
7493989779000000000LL,
7638104968000000000LL,
7782220156000000000LL,
7926335344000000000LL,
8070450532000000000LL,
8214565720000000000LL,
8358680908000000000LL,
8502796096000000000LL,
8646911284000000000LL,
8791026472000000000LL,
8935141660000000000LL,
9079256848000000000LL,
9223372036000000000LL,
9367487224000000000LL,
9511602413000000000LL,
9655717601000000000LL,
9799832789000000000LL,
9943947977000000000LL,
10088063165000000000LL,
10232178353000000000LL,
10376293541000000000LL,
10520408729000000000LL,
10664523917000000000LL,
10808639105000000000LL,
10952754293000000000LL,
11096869481000000000LL,
11240984669000000000LL,
11385099857000000000LL,
11529215046000000000LL,
11673330234000000000LL,
11817445422000000000LL,
11961560610000000000LL,
12105675798000000000LL,
12249790986000000000LL,
12393906174000000000LL,
12538021362000000000LL,
12682136550000000000LL,
12826251738000000000LL,
12970366926000000000LL,
13114482114000000000LL,
13258597302000000000LL,
13402712491000000000LL,
13546827679000000000LL,
13690942867000000000LL,
13835058055000000000LL,
13979173243000000000LL,
14123288431000000000LL,
14267403619000000000LL,
14411518807000000000LL,
14555633995000000000LL,
14699749183000000000LL,
14843864371000000000LL,
14987979559000000000LL,
15132094747000000000LL,
15276209936000000000LL,
15420325124000000000LL,
15564440312000000000LL,
15708555500000000000LL,
15852670688000000000LL,
15996785876000000000LL,
16140901064000000000LL,
16285016252000000000LL,
16429131440000000000LL,
16573246628000000000LL,
16717361816000000000LL,
16861477004000000000LL,
17005592192000000000LL,
17149707381000000000LL,
17293822569000000000LL,
17437937757000000000LL,
17582052945000000000LL,
17726168133000000000LL,
17870283321000000000LL,
18014398509000000000LL,
18158513697000000000LL,
18302628885000000000LL
};
#else
#error do not support this compiler
#endif
void output_table()
{
uint64_t i,n,q,prod;
uint64_t imax,max;
uint64_t low57bits=(I64_1 << 57)-I64_1;;
printf("uint64_t prodArr[]=\n{\n");
for (max=0,i=0;i<128;i++)
{
n=( i << 57);
q= n / RAD;
prod=q * RAD;
printf("\t%I64u,\n", prod);
imax= n + low57bits- prod;
if (imax>max)
max=imax;
}
printf("};\n");
// printf("\nmax remainder =%I64u",max);
//max remainder =144115189068469759
}
/*
1. 关于 计算 a/10^9
a/10^9 = a * 10^-9=(a * 2199.023255552) / 2^41,故我们采用2种方法来近似计算 q= a/(10^9)
方法1.
u32_q =(uint32)(x>>37);
u32_q *= 2199;
u32_q >>= 17;
u64_q = (uint64_t)u32_q
u64_q <<= 13
方法2.
u32_q =(uint32)(a>>23)
u32_q *= 2199
u32_q >>=18
2. 关于 计算 u32_q * 10^9
当u32_q <= 29820, 可使用下面的方法 计算u64_x= u32_q * 10^9
u32_prod= u32_q * 144027
u64_t1=(uint64_t)u32_q
u64_t2=(uint64_t)u32_prod
u64_x= (u64_t1<<30) - (u64_t2<<9)
*/
uint32_t Uint64ModBillion(uint64_t a)
{
int i;
uint64_t u64_t1,u64_t2,u64_prod;
uint32_t u32_q,u32_prod;
// step1
i =(int) (a >> 57);
// 根据最高7bit idx,找到一个值 x= prodArr, 使得 prodArr是10^9的倍数,且prodArr>=a
a -=prodArr; //至此,a的最大值为 144115189068469759= 1.0000000068876424494379584473336 * 2^57
// step2
u32_q=(uint32_t)(a >>37); // max(u32_q)= max(a)/2^37=144115189068469759/2^37=1048576
u32_q *= 2199; // max(u32_q)= 1048576*2199=2305818624 < 2^32
u32_q >>=17; // max(u32_q)= (1048576*2199) >>17 =17592 < 29820
//误差分析
//这里 u32_q 约等于 int((a/10^9)/2^13),
//若 q=(a/10^9)/2^13, 则 max(a-u32_q) <= 1+ max(a)/(2^54)*0.023255552= 1+0.1860= 1.1860
u32_prod= u32_q * 144027;
u64_t1 = (uint64_t)u32_q;
u64_t2 = (uint64_t)u32_prod;
u64_prod= (u64_t1<<30) - (u64_t2<<9); // u64_prod= (uint64_t)u32_q * 10^9
a-= (u64_prod<<13);
// 这里a的最大值= (2^37-1)+1.1860*(2^13)*(10^9)= 9853514819840<2^44
// step3
u32_q=(uint32_t)(a >>23); // max(u32_q)= max(a)/2^23=9853514819840/2^23 = 1174630
u32_q *= 2199; // max(u32_q)= 1174630*2199=2583011370 < 2^32
u32_q >>=18; // max(u32_q)= (2583011370>>18)=9853 < 29820
u32_prod= u32_q * 144027;
u64_t1 = (uint64_t)u32_q;
u64_t2 = (uint64_t)u32_prod;
u64_prod= (u64_t1<<30) - (u64_t2<<9); // u64_prod= (uint64_t)u32_q * 10^9
a-= u64_prod;
//误差分析
//这里 u32_q 约等于 int((a/10^9))
//若q=(a/10^9), 则 max(a-u32_q)<= 1+ max(a)/(2^41)*0.023255552= 1+0.1042= 1.1042
// 这里a的最大值= (2^23-1)+1.1042*(10^9)= 1112588607
if ( a>=RAD)//因为a的最大值为1112588607,故只需调整余数一次
a-=RAD;
return (uint32_t)a;
}
int main(int argc, char* argv[])
{
uint64_t a;
uint32_t mod;
//output_table();
a=9223372036854775807I64;
mod=Uint64ModBillion(a);
if ( mod >=HALF_RAD)
printf("TRUE");
else
printf("FALSE");
return 0;
}
happysxyf
发表于 2017-1-15 22:27:22
本帖最后由 happysxyf 于 2017-1-15 23:26 编辑
liangbch 发表于 2017-1-15 10:43
还有, 我把 ”if(a-((a>>1)
主要是站长没说可以用uint64_t类型的与运算,我也不知道他们的硬件是否允许uint64_t的与运算。
你的代码优化的非常好。我还在寻找能否继续减少循坏次数,因为他的是纳秒。如果是10负9次方的响应,那C语言根本达不到,因为C语言计算个1+1耗时都要1纳秒。总之,我的取余程序根本达不到小于10的负9次方秒的瞬间响应。
liangbch
发表于 2017-1-16 07:33:13
那C语言根本达不到,因为C语言计算个1+1耗时都要1纳秒
不是的,如果你做大量的测试,会发现,像这种简单加法,耗时可以低于1个时钟周期。我的测试表明,在大量循环的情况下,在Intel两年内出的I7 CPU运行,下面4条指令总的执行时间可以低于1.3个时钟周期。
mov eax, DP
mul DP
add ecx, eax
adc ebx, edx
mathe
发表于 2017-1-16 08:26:17
http://www.cnblogs.com/shines77/p/4189074.html
mathe
发表于 2017-1-16 08:29:27
郭要求的其实是除以$5^9*2^8$商的奇偶性
mathe
发表于 2017-1-16 08:38:59
另外感觉去掉被除数末八比特除以5^9求奇偶性也可
gxqcn
发表于 2017-1-16 08:41:32
mathe 的提示切中要害!
既然需要的是求被 \(10^9\) 除后的余数与 \(2^8*5^9\) 进行大小比较,
那么被除数的低 \(8\) 位对结果不产生任何影响。
继续推导下去,可将被除数、除数及余数同时右移 \(8\) 位,等式仍成立,
要比较的数值降低为 \(5^9\)。
对于除法运算来说,在除数腾空 \(8\) 位之后,\(32\) 位下的运算优化腾挪余地就大多了。。。
liangbch
发表于 2017-1-16 10:55:01
除以10^9判断余数,需要求出35bit商,除以5*10^8判断商的奇偶性,需要求出36bit商。在第二种情况,因为只需要考虑商(不考虑余数),被除数和除数可同时缩小,商不变。使用第2种方法,可减少移位次数。但乘法次数很难减少。我21楼的代码,使用了1次查表(表格可改为uint32进一步缩小空间),4次32位数乘法。我想不管用什么方法,很难将乘法次数将到4次以下。除非增加相当多的的移位指令和比较指令。由于分支预测的关系,比较跳转指令对显著降低了程序的速度,所以我建议尽可能少用比较或者跳转指令。
为什么是“4次32位数乘法”?
以乘代除法的本质是将一次除法转化为2次乘法。因为商是35bit,使用查表得到6bit商,其后的第1次以乘代除法,得到15bit商,使用2次乘法,第2次以乘代除法,得到大约14bit商,同样使用2次乘法。基于严密的误差分析,第1次以乘代除法没有调整余数的过程,第2次以乘代除法也只需1次调整余数的过程。
liangbch
发表于 2017-1-16 10:57:06
关于以乘代除法,事实上我做了更多的独立地研究,国外也早就有类似的论文,如《Division by Invariant Integers using Multiplication》
对于郭子的论文,我补充几点。
1. http://www.cnblogs.com/shines77/p/4189074.html 中提到。"只有q的误差E<1/c, 则能保证q1=q"
当被除数为32比特以内的数,c可能有多个。但是在实践中,更常用的是长除法,即被除数a并不满足a<RAD, 但满足 a<=RAD*c-1时。在这种情况下,c只有一个,那就是3.故GMP中有一个单独的除法函数mpn_divexact_by3。
2. 因为以乘代除法计算a/c时,得到的商q0和真实的商q有偏差,故在完成a=a-q0*c后,a可能小于0或者大于c,这是就需要调整余数和商,每次调整需要1次比较和一次加减法,在实践中,我们需要分析余数a的最大值或者最大误差,这样就可事先确定最多要做几次调整。郭子的论文中,没有给出进一步的误差分析。
gxqcn
发表于 2017-1-16 13:33:39
现在,公布一下我们的作法:#include <stdio.h>
#include <stdint.h>
#include <tchar.h>
#include <assert.h>
typedef union
{
uint64_t u64;
struct
{
uint32_t u32L;
uint32_t u32H;
};
uint8_t u8[ 8 ];
}UINT64;
#define HMOD 1953125u // = 5**9
#define MOD 3906250u // = 2 * HMOD
#define C24 1152216u // = 2**24 % MOD
#define C32 1998546u // = 2**32 % MOD
#define C40 3815276u // = 2**40 % MOD
#define C48 148156u // = 2**48 % MOD
// 255 * ( C24 + C32 + C40 + C48 ) = 1 814 119 470
uint32_t Algo_1( uint64_t v )
{
UINT64 u;
u.u64 = v;
u.u64 >>= 8u;
// 整体移位后,u.u8[ 7 ] 一定为 0
return (( u.u32L % MOD )
+ u.u8[ 4 ] * C32
+ u.u8[ 5 ] * C40
+ u.u8[ 6 ] * C48 ) % MOD;
}
uint32_t Algo_2( uint64_t v )
{
UINT64 u;
u.u64 = v;
u.u64 >>= 8u;
// 在 Algo_1 基础上,减少一次 % 运算
return (( u.u32L < 2000000000u ? u.u32L : u.u32L - 2000000000u )
+ u.u8[ 4 ] * C32
+ u.u8[ 5 ] * C40
+ u.u8[ 6 ] * C48 ) % MOD;
}
uint32_t Algo_3( uint64_t v )
{
UINT64 u;
u.u64 = v;
// 不整体移位,仅低 32 位移位,消除分支,全程仅一次 % 运算
return (( u.u32L >> 8u )
+ u.u8[ 4 ] * C24
+ u.u8[ 5 ] * C32
+ u.u8[ 6 ] * C40
+ u.u8[ 7 ] * C48 ) % MOD;
}
int _tmain(int argc, _TCHAR* argv[])
{
uint64_t v = 9223372036854775807ULL;
uint32_t r, r1, r2, r3;
r = ( v % 1000000000u ) >> 8u;
r1 = Algo_1( v );
r2 = Algo_2( v );
r3 = Algo_3( v );
assert( r1 == r );
assert( r2 == r );
assert( r3 == r );
printf( "%s", ( r3 < HMOD ? "true" : "false" ));
return 0;
}
其中在 Algo_3 里,32位的 >>运算1次,*运算4次,+运算4次,%运算1次。
注意到 u.u8[ 4 ] * C24 + u.u8[ 5 ] * C32 + u.u8[ 6 ] * C40 + u.u8[ 7 ] * C48 \( \leqslant 255 * ( C24 + C32 + C40 + C48 ) = 1 814 119 470 \lt 2^{32} - 2^{24}\),
故全程运算均在 \(32\) 位以内,不存在溢出风险。