// This file is Cilk versions of the code in fftutil8.c // I copied my fftutil8.c code and parallelized selected routines. // Copyright, Michael Monagan, March 2020. #include #include // Switch from int i,j,n; to LONG i,j,n; to allow FFTs of size 2^31 and 2^32 #define LONG long long int #define DEBUG 0 /********************************************************************************/ /* Zp utilities */ /********************************************************************************/ #define ADD64s fftadd64scilk #define ADD32s fftadd32scilk #define SUB64s fftsub64scilk #define NEG64s fftneg64scilk #define MUL64s fftmul64scilk #define MOD64s fftmod64scilk #define MIN64s fftmin64scilk #define INV64s modinv64s LONG ADD64s(LONG a, LONG b, LONG p) { LONG t; t = (a-p)+b; t += (t>>63) & p; return t; } LONG ADD32s(int a, int b, int p) { int t; t = (a-p)+b; t += (t>>31) & p; return t; } LONG SUB64s(LONG a, LONG b, LONG p) { LONG t; t = a-b; t += (t>>63) & p; return t; } LONG NEG64s(LONG a, LONG p) { LONG t; t = -a; t += (t>>63) & p; return t; } LONG MUL64s(LONG a, LONG b, LONG p) { LONG q, r; __asm__ __volatile__( \ " mulq %%rdx \n\t" \ " divq %4 \n\t" \ : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p)); return r; } #define UINT64 unsigned long long typedef struct { UINT64 s; /* shift */ UINT64 v; /* reciprocal of d */ UINT64 d0; /* divisor shifted up */ UINT64 d1; } recint; recint recip1(UINT64 p); UINT64 mulrec64(UINT64 a, UINT64 b, recint v); LONG MOD64s(LONG a,LONG p) { return(a % p); } LONG MIN64s(LONG a, LONG b) { if(a 2 #define FFTCUTOFF 200000 void FFT64s1( LONG n, LONG *a, LONG *W, LONG p, recint P ); void FFT64s2( LONG n, LONG *a, LONG *W, LONG p, recint P ); void FFTwork1( LONG n2, LONG *a, LONG *b, LONG *W, LONG p, recint P ); void FFTwork2( LONG n2, LONG *a, LONG *b, LONG *W, LONG p, recint P ); #define FFTSIZE 262144 cilk LONG FFT64s1workcilk( LONG n2, LONG *a, LONG *b, LONG *W, LONG p, recint P ) { LONG i,dummy,m; if( n2<=FFTSIZE ) { FFTwork1(n2,a,b,W,p,P); return 1; } m = n2/FFTSIZE; if( n2-m*FFTSIZE != 0 ) printf("FFTSIZE bug\n"); for( i=0; i da+db int d; LONG i,n,dc; LONG dummy,w,t,*A,*B,*W; recint P; if( da==-1 || db==-1 ) return -1; P = recip1(p); dc = da+db; for( n=1; n=0 && c[dc]==0; dc-- ); return dc; } w = getomega64s(p,n); //printf("w := %lld;\n",w); if( w==0 ) { printf("omega does not exist: dc=%d n=%d\n",dc,n); exit(1); } A = T; B = T+n; W = B+n; MakeW64(n,w,W,p); //printf("W := "); vecprint32s(W,n); polpad64s(da,n,A,a); //printf("A := "); vecprint32s(A,n); dummy = spawn FFT64s1cilk(n,A,W,p,P); //printf("F := "); vecprint32s(A,n); if( a==b ) B = A; // c = a^2 else { polpad64s(db,n,B,b); //printf("B := "); vecprint32s(B,n); dummy = spawn FFT64s1cilk(n,B,W,p,P); //printf("F := "); vecprint32s(B,n); } sync; for( i=0; i0 ) { // double the order MakeWinv64(n,W,p); // reset it //VECZIPMUL64s(A+n2,W,n2,P); //for( i=0; i= m LONG i,n; LONG w,winv,*W,*Winv,*Y; if( m==1 ) { y[0] = INV64s(f[0],p); return 1; } for( n=1; nda LONG i,dr,dq,n; LONG *ra, *rb, *rq, *y, *q, *r; if( db<0 ) { printf("division by zero\n"); exit(1); } if( da da. //Q = array64s(4*dq+4); //T = array64s(4*n); ra = Q; y = Q + dq + 1; rb = Q + 2*dq + 2; for( i=0; i<=dq; i++ ) ra[i] = a[da-i]; // ra = recip(a) //printf("p:="); printf("%lld;\n",p); //printf("ra:="); polprint64s(ra,dq); for( i=0; i<=MIN64s(db,dq); i++ ) rb[i] = b[db-i]; // rb = recip(b) //printf("rb:="); polprint64s(rb,dq); while( i<=dq ) rb[i++] = 0; FFTinv64s( rb, dq+1, y, T, p ); // compute 1/rb to O(x^(dq+1)) //printf("inv:="); polprint64s(y,dq); rq = Q + 2*dq + 2; FFTmul64s( ra, y, rq, dq, dq, T, p ); // T must be size 3n //printf("rq:="); polprint64s(rq,2*dq); for( i=0; i<=dq; i++ ) q[i] = rq[dq-i]; r = T; FFTmul64s( b, q, r, db, dq, T+n, p ); // T must be size 4n for( i=0; i=0 && a[dr]==0; dr-- ); // compute deg(r) return dr; } int FFTpoldiv64s( LONG *a, LONG *b, LONG da, LONG db, LONG p ); cilk int FFTpoldiv64scilk( LONG *a, LONG *b, LONG da, LONG db, LONG p ) { LONG dr,dq,n; LONG *Q, *T; if( db<0 ) { printf("division by zero\n"); exit(1); } if( da da. //printf("FFTpoldiv64s: da=%d n=%d p=%lld\n",da,n,p); Q = array64s(4*dq+4); T = array64s(4*n); dr = FFTpoldivinp64s( a, b, da, db, Q, T, p ); //printf("FFTpoldiv64s: dr=%d\n",dr); free(Q); free(T); return dr; } /***************************** TREE MUL ALGORITHM ********************************/ LONG treemulspace( LONG n, LONG m ); #define CUTOFF 16 #define LAMBDACUTOFF 100000 void fastLambdarec( LONG *alpha, LONG n, LONG *f, LONG *T, LONG s, LONG *W, LONG p ); cilk int fastLambdareccilk( LONG *alpha, LONG n, LONG *f, LONG *T, LONG s, LONG *W, LONG Wsize, LONG p ) { int dummy; LONG m; if( n n. //printf("N = %d\n",N); Wsize = 3*N; W = array64s(Wsize); dummy = spawn fastLambdareccilk( v, n, F, T, s, W, Wsize, p ); sync; polcopy64s(F,n,f); free(F); free(W); free(T); return 1; } /***************************** Fast change of basis ******************************/ void makedivisors( LONG alpha, LONG n, LONG *T, LONG p ); void makepowers( LONG alpha, LONG n, LONG *T, LONG p ); void recbase( LONG *f, LONG n, LONG alpha, LONG *T, LONG m, LONG p ); void recbase2( LONG *f, LONG n, LONG alpha, LONG *T, LONG m, LONG *W, LONG *S, LONG p ); int divisorsspace( LONG n ); void changebase( LONG *f, LONG n, LONG alpha, LONG p ); void changebase2( LONG *f, LONG n, LONG alpha, LONG p ); void polrev64s( LONG *f, LONG d ); void changebase3( LONG *f, LONG n, LONG alpha, LONG p ); cilk LONG changebase3cilk( LONG *f, LONG n, LONG alpha, LONG p ) { // Input f a polynomial of degree n in Zp[x] // Output f(x+alpha) mod p in the array f // This version computes f(x+alpha) in O( M(n) ) instead of O( M(n) log n ) // Since the method computes 1/i! mod p we cannot use it for p<=n. // Computing inverses is expensive so some thought is needed for that. // Joris suggested precomputing (1/i!)^(-1) and going backwards. // Since the we have to compute i! first anyway this is free. LONG i,fac,inv,*g,*h; int dummy; recint P; if( p<=n || n<3000 ) { changebase2(f,n,alpha,p); return 1; } if( n0; i-- ) { // compute g[i] = alpha^i/i! mod p g[i] = mulrec64(fac,g[i],P); fac = mulrec64((LONG) i,fac,P); } h = array64s(2*n+1); dummy = spawn FFTpolmul64scilk(f,g,h,n,n,p); sync; polrev64s(h,n); f[0] = h[0]; fac = inv; for( i=n; i>0; i-- ) { // compute f[i] = h[i]/i! mod p f[i] = mulrec64(fac,h[i],P); fac = mulrec64(fac,(LONG) i,P); } free(g); free(h); return 1; }