// The routine roots64scilk computes the roots using our Tangent Graeffe // root finding Cilk C implementation. // Copyright Michael Monagan March 2020 #include #include #include // Timer double realtime() { struct timeval t; gettimeofday(&t, NULL); t.tv_sec -= (2014-1970) * 3600 * 24 * 365; return (1. + t.tv_usec + t.tv_sec * 1000000.)/1000000.; } #define DEBUG 0 #define LONG long long int #define ULONG unsigned long long int unsigned LONG seed; unsigned LONG mult; LONG rand64s(LONG p); void randvec64s( LONG *A, int n, LONG p ); LONG * array( int n ); LONG * array64s( LONG n ); #define UINT64 unsigned long long typedef struct { UINT64 s; /* shift */ UINT64 v; /* reciprocal of d */ UINT64 d0; /* divisor shifted up */ UINT64 d1; } recint; recint recip1(UINT64 p); UINT64 mulrec64(UINT64 a, UINT64 b, recint v); /******************************************************************************************/ /* Zp utilities */ /******************************************************************************************/ #define max32s max32smain #define min32s min32smain #define max64s max64smain #define min64s min64smain #define add64s add64smain #define sub64s sub64smain #define mul64s mul64smain #define neg64s neg64smain LONG min64s( LONG a, LONG b ) { if( ab ) return a; else return b; } LONG neg64s(LONG a, LONG p) { return (a==0) ? 0 : p-a; } LONG add64s(LONG a, LONG b, LONG p) { LONG t; t = (a-p)+b; t += (t>>63) & p; return t; } LONG sub64s(LONG a, LONG b, LONG p) { LONG t; t = a-b; t += (t>>63) & p; return t; } LONG mul64s(LONG a, LONG b, LONG p) { LONG q, r; __asm__ __volatile__( \ " mulq %%rdx \n\t" \ " divq %4 \n\t" \ : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p)); return r; } LONG modinv64s( LONG c, LONG p ); LONG powmod64s( LONG a, LONG n, LONG p ); /******************************************************************************************/ /* Polynomial utitlities from polyalg2 */ /******************************************************************************************/ LONG poleval64s(LONG *a, int d, LONG x, LONG p); int polmul64s( LONG * A, LONG * B, LONG * C, int da, int db, LONG p); int poldiv64s( LONG * A, LONG * B, int da, int db, LONG p ); void vecprint64s( LONG *A, int n ); int vecequal64s( LONG *A, LONG *B, int n ); void polprint64s( LONG *A, int d, LONG p ); int polequal64s( LONG *a, LONG *b, int d ); void polcopy64s( LONG *A, int d, LONG *B ); int poldiff64s( LONG *f, int d, LONG *fp, LONG p ); void polcopy64s( LONG *A, int d, LONG *B ); void FFTgraeffe64s( LONG * f, LONG * g, LONG d, LONG *T, LONG p ); LONG getomega64s( LONG p, LONG n ); LONG poleval64s(LONG *a, int d, LONG x, LONG p); LONG bluestein64s( LONG *a, LONG n, LONG s, LONG *v, LONG p ); LONG FFTbluestein64s( LONG *a, LONG n, LONG s, LONG *v, LONG *T, LONG p ); cilk LONG FFTbluestein64scilk( LONG *a, LONG n, LONG s, LONG *v, LONG *T, LONG p ); void tangGraeffe64s( LONG *f, LONG *g, LONG d, LONG p ); void FFTtangGraeffe64s( LONG *f, LONG *g, LONG d, int k, LONG *T, LONG p ); cilk int FFTtangGraeffe64scilk( LONG *f, LONG *g, LONG d, int k, LONG *T, LONG p ); int FFTpoldiv64s( LONG *a, LONG *b, LONG da, LONG db, long p ); void fastLambda( LONG * v, LONG n, LONG *f, LONG p ); cilk int fastLambdacilk( LONG * v, LONG n, LONG *f, LONG p ); void changebase( LONG *f, LONG n, LONG alpha, LONG p ); void changebase2( LONG *f, LONG n, LONG alpha, LONG p ); void changebase3( LONG *f, LONG n, LONG alpha, LONG p ); cilk LONG changebase3cilk( LONG *f, LONG n, LONG alpha, LONG p ); void Lambda( LONG *v, LONG n, LONG *f, LONG p ) { LONG i; LONG a[2]; a[1] = 1; f[0] = 1; for( i=0; i=i ; ) { if( D[i]40 ) { i = vecpartition64s(D, n); sortvec64s(D,i); sortvec64s(D+i+1,n-i-1); return; } sortvec64s(D,n-1); x = D[n-1]; for( i=n-1; i>0 && x1; k++ ) q = q >> 1; while( s<2*n ) { k--; s = 2*s; } return s; } LONG extractroots( LONG *v, LONG *x, LONG *y, LONG s, LONG k, LONG alpha, LONG omega, LONG *R, LONG p, recint P ) { LONG i,m,w,t,z,r,N; // R is for the roots w = 1; N = 0; m = 0; for( i=0; i=0 ) m++; //printf("rootsfound2 =%lld\n", m ); for( m=i=0; i=0 ) roots[m++] = roots[i]; return m; } LONG rootsrec( LONG *lambda, LONG n, LONG *R, LONG p, LONG *f, LONG *g, LONG *P, LONG *Q, LONG *v, LONG *x, LONG *y, LONG *W, LONG *T, int TIMINGS ) { LONG i, k, m, okay, N; LONG alpha, omega, q, s; double st,tt,ST,TT; recint pp; m = 0; if( n>0 && lambda[0]==0 ) { R[0] = 0; lambda++; n--; R++; m++; } if( n==0 ) return m; if( n==1 ) { R[0] = neg64s(lambda[0],p); return m+1; } ST = realtime(); pp = recip1(p); if( TIMINGS ) st = realtime(); do { for( i=0; i<=n; i++ ) f[i] = lambda[i]; alpha = rand64s(p); changebase3( f, n, alpha, p ); // inplace } while( f[0]==0 ); if( TIMINGS ) tt = realtime(); /* choose s parameter in the range 2n <= s < 4n */ for( s=(p-1)/2; (s&1)==0; s=s/2 ); q = (p-1)/s; for( k=0; q>1; k++ ) q = q >> 1; while( s<2*n ) { k--; s = 2*s; } printf("ROOTS :: n=%d k=%d s=%lld\n",n,k,s); if( DEBUG ) printf("alpha=%lld\n", alpha); if( TIMINGS ) printf("Base time = %8.3f s\n",tt-st); if( DEBUG ) { polcopy64s( f, n, g ); changebase( g, n, p-alpha, p ); // change it back if( n<20 ) { printf("g := "); polprint64s(g,n,p); } for( okay=1,i=0; i<=n; i++ ) if( g[i]!=lambda[i] ) okay = 0; if( okay ) printf("base okay\n"); else printf("base bug\n"); } if( DEBUG ) { polcopy64s( lambda, n, g ); if( n<20 ) { printf("lambda := "); polprint64s(g,n,p); } changebase( g, n, alpha, p ); if( n<20 ) { printf("g := "); polprint64s(g,n,p); } for( okay=1,i=0; i<=n; i++ ) if( g[i]!=f[i] ) okay = 0; if( okay ) printf("base2 okay\n"); else printf("base2 bug\n"); } for( i=0; i<=n; i++ ) P[i] = f[i]; //translate(f,n,Q,p); // Translate(f,n,P,Q,p); poldiff64s(f,n,Q,p); // Translate(f,n,P,Q,p); polcopy64s(P,n,f); polcopy64s(Q,n-1,g); //if( n<20 ) { printf("P := "); polprint64s(P,n,p); } //if( n<20 ) { printf("Q := "); polprint64s(Q,n-1,p); } for( N=8; N<=2*n; N=2*N ); if( TIMINGS ) st = realtime(); FFTtangGraeffe64s( P, Q, n, k, T, p ); if( TIMINGS ) tt = realtime(); if( TIMINGS ) printf("Tangent Graeffe time = %8.3f s\n",tt-st); if( DEBUG ) { for( i=0; i0 && lambda[0]==0 ) { R[0] = 0; lambda++; n--; R++; m++; } if( n==0 ) return m; if( n==1 ) { R[0] = neg64s(lambda[0],p); return m+1; } ST = realtime(); pp = recip1(p); if( TIMINGS ) st = realtime(); do { for( i=0; i<=n; i++ ) f[i] = lambda[i]; alpha = rand64s(p); //changebase2( f, n, alpha, p ); //changebase3( f, n, alpha, p ); DUMMY = spawn changebase3cilk( f, n, alpha, p ); sync; } while( f[0]==0 ); if( TIMINGS ) tt = realtime(); /* choose s parameter in the range 2n <= s < 4n */ for( s=(p-1)/2; (s&1)==0; s=s/2 ); q = (p-1)/s; for( k=0; q>1; k++ ) q = q >> 1; while( s<2*n ) { k--; s = 2*s; } printf("ROOTS :: n=%d k=%d s=%lld\n",n,k,s); if( TIMINGS ) printf("Base time = %8.3f s\n",tt-st); for( i=0; i<=n; i++ ) P[i] = f[i]; poldiff64s(f,n,Q,p); // Translate(f,n,P,Q,p); polcopy64s(P,n,f); polcopy64s(Q,n-1,g); for( N=8; N<=2*n; N=2*N ); if( TIMINGS ) st = realtime(); dummy = spawn FFTtangGraeffe64scilk( P, Q, n, k, T, p ); sync; if( TIMINGS ) tt = realtime(); if( TIMINGS ) printf("Tangent Graeffe time = %8.3f s\n",tt-st); if( TIMINGS ) st = realtime(); omega = spawn FFTbluestein64scilk( P, n, s, v, T2, p ); //sync; omega = spawn FFTbluestein64scilk( Q, n-1, s, x, T3, p ); //sync; //poldiff64s(P,n,Q,p); // derivative is in Q and has degree n-1 //omega = spawn FFTbluestein64scilk( Q, n-1, s, y, T, p ); poldiff64s(P,n,R,p); // derivative is in Q and has degree n-1 omega = spawn FFTbluestein64scilk( R, n-1, s, y, T, p ); sync; if( TIMINGS ) tt = realtime(); if( TIMINGS ) printf("Bluestein time = %8.3f s\n",tt-st); if( TIMINGS ) st = realtime(); //m = extractroots(v,x,y,s,k,alpha,omega,R,p,pp); m = spawn rootsblocks(v,x,y,s,k,alpha,omega,R,p,pp); sync; if( m==-1 ) { printf("No roots\n"); return 0; } if( TIMINGS ) tt = realtime(); if( TIMINGS ) printf("Extract roots time = %8.3f s\n",tt-st); if( TIMINGS ) st = realtime(); //fastLambda( R, m, g, p ); // Lambda( R, m, g, p ); dummy = spawn fastLambdacilk( R, m, g, p ); // Lambda( R, m, g, p ); sync; if( TIMINGS ) tt = realtime(); if( TIMINGS ) printf("Lambda time = %8.3f s\n",tt-st); if( TIMINGS ) st = realtime(); for( i=0; i<=n; i++ ) f[i] = lambda[i]; k = FFTpoldiv64s( f, g, n, m, p ); if( TIMINGS ) tt = realtime(); if( TIMINGS ) TT = realtime(); if( TIMINGS ) printf("divide time = %8.3f s\n",tt-st); if( TIMINGS ) printf("TOTAL time = %10.3f s\n",TT-ST); if( k!=-1 ) printf("roots: DIVIDE bad\n"); if( n<100 ) printf("ROOTS :: %d of %d roots found\n",m,n); else printf("ROOTS :: %d (%4.1f%)roots found\n",m,(100.0*m)/n); // if( mN ) N = M; T = array64s(N); S += N; R = array64s(s); S += s; v = array64s(s); S += s; x = array64s(s); S += s; y = array64s(s); S += s; W = array64s(s); S += s; S = S/1000000; if( S > 2000 ) printf(" Space = %8.2f gigs\n", S/1000.0 ); else printf( " Space = %8.2f megs\n", S/1.0 ); m = rootsrec( L, n, R, p, f, g, P, Q, v, x, y, W, T, 1 ); for( i=0; iN ) N = M; T = array64s(N); S += N; T2 = array64s(N); S += N; T3 = array64s(N); S += N; R = array64s(s); S += s; v = array64s(s); S += s; x = array64s(s); S += s; y = array64s(s); S += s; W = array64s(s); S += s; S = S/1000000; if( S > 2000 ) printf(" Space = %8.2f gigs\n", S/1000.0 ); else printf( " Space = %8.2f megs\n", S/1.0 ); //m = rootsrec( L, n, R, p, f, g, P, Q, v, x, y, W, T, 1 ); m = spawn rootsreccilk( L, n, R, p, f, g, P, Q, v, x, y, W, T, T2, T3, 1 ); sync; for( i=0; i1000 ) printf("Lambda time = %8.3f s\n",tt); if( n<20 ) { printf("f := "); polprint64s(f,n,p); } if( DEBUG ) { okay = 1; for( i=0; i1000 ) printf("Total roots time = %10.3f s\n",tt); if( DEBUG ) { okay = 1; for( i=0; i