// This file contains FFT code and FFT based polynomial routines // Copyright Michael Monagan, March 2020 #include #include #include "int128g.c" // Switch from int i,j,n; to LONG i,j,n; to allow FFTs of size 2^31 and 2^32 #define LONG long long int /********************************************************************************/ /* Zp utilities */ /********************************************************************************/ #define ADD64s fftadd64s #define ADD32s fftadd32s #define SUB64s fftsub64s #define NEG64s fftneg64s #define MUL64s fftmul64s #define MOD64s fftmod64s #define MIN64s fftmin64s #define INV64s modinv64s LONG ADD64s(LONG a, LONG b, LONG p) { LONG t; t = (a-p)+b; t += (t>>63) & p; return t; } LONG ADD32s(int a, int b, int p) { int t; t = (a-p)+b; t += (t>>31) & p; return t; } LONG SUB64s(LONG a, LONG b, LONG p) { LONG t; t = a-b; t += (t>>63) & p; return t; } LONG NEG64s(LONG a, LONG p) { LONG t; t = -a; t += (t>>63) & p; return t; } LONG MUL64s(LONG a, LONG b, LONG p) { LONG q, r; __asm__ __volatile__( \ " mulq %%rdx \n\t" \ " divq %4 \n\t" \ : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p)); return r; } #define UINT64 unsigned long long //typedef struct { // UINT64 s; /* shift */ // UINT64 v; /* reciprocal of d */ // UINT64 d0; /* divisor shifted up */ // UINT64 d1; //} recint; //recint recip1(UINT64 p); //UINT64 mulrec64(UINT64 a, UINT64 b, recint v); LONG MOD64s(LONG a,LONG p) { return(a % p); } LONG MIN64s(LONG a, LONG b) { if(a= 0\n"); exit(1); } n = n * sizeof( LONG ); A = (LONG *) malloc( n ); if( A==0 ) { printf("array64s: malloc failed\n"); exit(1); } return A; } /********************************************************************************/ /* Polynomial routines */ /********************************************************************************/ int poladd64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG p); int polsqr64s( LONG * A, LONG * C, int d, LONG p ); int polmul64s( LONG * A, LONG * B, LONG * C, int da, int db, LONG p ); int poldiv64s( LONG * A, LONG * B, int da, int db, LONG p ); void polprint64s( LONG *A, int d ); void polcopy64s( LONG *A, int d, LONG *B ); LONG poleval64s(LONG *a, int d, LONG x, LONG p); /********************************************************************************/ /* FFT utilities and routines */ /********************************************************************************/ int vecequal64s( LONG *A, LONG *B, int n ); LONG getprimelem64s( LONG p ); // in primitive.c LONG getomega64s( LONG p, LONG n ) { // p is assumed prime > 2 LONG a,m,omega; m = (p-1)/n; if( (p-1)-n*m ) return 0; // there is no such omega a = getprimelem64s( p ); // printf("primelem = %lld\n",a); omega = powmod64s( a, m, p ); return omega; } void FFTwork1( LONG n2, LONG *a, LONG *b, LONG *W, LONG p, recint P ) { LONG i,t; for( i=0; i0; m=m/2 ) { for( j=0; j0; m=m/2 ) { for( j=0; j da+db LONG i,n,dc; LONG *A, *B, *W; recint P; if( da==-1 || db==-1 ) return -1; P = recip1(p); dc = da+db; for( n=1; n=0 && c[dc]==0; dc-- ); return dc; } LONG w = getomega64s(p,n); //printf("w := %lld;\n",w); if( w==0 ) { printf("omega does not exist: dc=%d n=%d\n",dc,n); exit(1); } A = T; B = T+n; W = B+n; MakeW64(n,w,W,p); //printf("W := "); vecprint32s(W,n); polpad64s(da,n,A,a); //printf("A := "); vecprint32s(A,n); FFT64s1(n,A,W,p,P); //printf("F := "); vecprint32s(A,n); if( a==b ) B = A; // c = a^2 else { polpad64s(db,n,B,b); //printf("B := "); vecprint32s(B,n); FFT64s1(n,B,W,p,P); //printf("F := "); vecprint32s(B,n); } for( i=0; i2d // There is an optimization available for the inverse transform of vectors // e.g. (n=8) [a0, a0, a1, a1, a2, a2, a3, a3] LONG i,j; LONG n,n2; LONG w, t, *A, *B, *C, *D, *W; recint P; P = recip1(p); if( d<1 ) { printf("degree d must be at least 1\n"); exit(1); } for( n=1; n<=2*d; n*=2 ); //printf("n := %d;\n",n); w = getomega64s(p,n); //printf("w := %lld;\n",w); if( w==0 ) { printf("omega does not exist: d=%d n=%d\n",d,n); exit(1); } n2 = n/2; A = T; C = T+n; W = C+n; // B = W+n; D = B+n; MakeW64(n,w,W,p); //printf("W := "); vecprint64s(W,n); polpad64s(d,n,A,f); //printf("A := "); vecprint64s(A,n); polpad64s(d-1,n,C,g); //printf("C := "); vecprint64s(C,n-1); FFT64s1(n,A,W,p,P); // A = FFT1(f(x)) //printf("A := "); vecprint64s(A,n); FFT64s1(n,C,W,p,P); // C = FFT1(g(x)) //printf("C := "); vecprint64s(A,n); for( j=0; j1000000 ) printf(" TG: j=%d\n", j ); if( j>0 ) { // double the order MakeWinv64(n,W,p); // reset it VECZIPMUL64s(A+n2,W,n2,P); //for( i=0; i= m LONG i,n; LONG w,winv,*W,*Winv,*Y; if( m==1 ) { y[0] = INV64s(f[0],p); return; } for( n=1; nda LONG i,dr,dq,n; LONG *ra, *rb, *rq, *y, *q, *r; if( db<0 ) { printf("division by zero\n"); exit(1); } if( da da. //Q = array64s(4*dq+4); //T = array64s(4*n); ra = Q; y = Q + dq + 1; rb = Q + 2*dq + 2; for( i=0; i<=dq; i++ ) ra[i] = a[da-i]; // ra = recip(a) //printf("p:="); printf("%lld;\n",p); //printf("ra:="); polprint64s(ra,dq); for( i=0; i<=MIN64s(db,dq); i++ ) rb[i] = b[db-i]; // rb = recip(b) //printf("rb:="); polprint64s(rb,dq); while( i<=dq ) rb[i++] = 0; FFTinv64s( rb, dq+1, y, T, p ); // compute 1/rb to O(x^(dq+1)) //printf("inv:="); polprint64s(y,dq); rq = Q + 2*dq + 2; FFTmul64s( ra, y, rq, dq, dq, T, p ); // T must be size 3n //printf("rq:="); polprint64s(rq,2*dq); for( i=0; i<=dq; i++ ) q[i] = rq[dq-i]; r = T; FFTmul64s( b, q, r, db, dq, T+n, p ); // T must be size 4n for( i=0; i=0 && a[dr]==0; dr-- ); // compute deg(r) return dr; } int FFTpoldiv64s( LONG *a, LONG *b, LONG da, LONG db, LONG p ) { LONG i,dr,dq,n; LONG *Q, *T; if( db<0 ) { printf("division by zero\n"); exit(1); } if( da da. //printf("FFTpoldiv64s: da=%d n=%d p=%lld\n",da,n,p); Q = array64s(4*dq+4); T = array64s(4*n); dr = FFTpoldivinp64s( a, b, da, db, Q, T, p ); //printf("FFTpoldiv64s: dr=%d\n",dr); free(Q); free(T); return dr; } /***************************** TREE MUL ALGORITHM ********************************/ LONG treemulspace( LONG n, LONG m ) { LONG q; if( n>1) + (n&1); return 2*treemulspace(q,m); } #define CUTOFF 16 void fastLambdarec( LONG *alpha, LONG n, LONG *f, LONG *T, LONG s, LONG *W, LONG p ) { LONG i,m; if( n==0 ) { f[0] = 1; return; } f[0] = NEG64s(alpha[0],p); f[1] = 1; if( n==1 ) return; if( n n. //printf("N = %d\n",N); W = array64s(3*N); fastLambdarec( v, n, f, T, s, W, p ); free(W); free(T); return; } } /***************************** Fast change of basis ******************************/ void makedivisors( LONG alpha, LONG n, LONG *T, LONG p ) { // construct T = [ (x-alpha)^n (x-alpha)^(n/2) ... (x-alpha)^2 (x-alpha) ] LONG m; LONG *g, h[2]; if( n<1 ) return; if( n==1 ) { T[0] = NEG64s(alpha,p); T[1] = 1; return; } m = n/2; g = T+n+1; makedivisors( alpha, m, g, p ); FFTpolmul64s( g, g, T, m, m, p ); return; } void makepowers( LONG alpha, LONG n, LONG *T, LONG p ) { // construct T = [ (x+alpha)^n (x+alpha)^(n/2) ... (x+alpha)^2 (x+alpha) ] LONG m; LONG *g, h[2]; if( n<1 ) return; if( n==1 ) { T[0] = alpha; T[1] = 1; return; } m = n/2; g = T+n+1; makepowers( alpha, m, g, p ); FFTpolmul64s( g, g, T, m, m, p ); return; } void recbase( LONG *f, LONG n, LONG alpha, LONG *T, LONG m, LONG p ) { // f(x) has degree n // T = [ (x-alpha)^m (x-alpha)^(m/2) ... (x-alpha)^2 (x-alpha) ] // m is a power of 2 // Let (q,r) = f div (x-alpha)^m. // Output r(x+alpha) + x^m + q(x+alpha) LONG i,dr; if( n<=0 ) return; if( n==1 ) { f[0] = ADD64s( f[0], MUL64s(f[1],alpha,p), p ); return; } if( n=0 && f[dr]==0; dr-- ); recbase2( f, dr, alpha, T+(m+1), m/2, W, S, p ); recbase2( f+m, n-m, alpha, T+(m+1), m/2, W, S, p ); if( n<20 || m<20 || (LONG) m*n < 4096 ) d = polmul64s( f+m, T, W, n-m, m, p ); else d = FFTmul64s( f+m, T, W, n-m, m, S, p ); //d = FFTpolmul64s( f+m, T, W, n-m, m, p); if( d!=n ) printf("recbase2 degree bug\n"); poladd64s( W, f, f, n, dr, p ); return; } int divisorsspace( LONG n ) { // space required for polynomials in (x-alpha)^n list for n in [1,2,4,8,...,n=2^k] if( n<1 ) return 0; if( n==1 ) return 2; return( n+1 + divisorsspace(n/2) ); } void changebase( LONG *f, LONG n, LONG alpha, LONG p ) { // write f(x+alpha) = sum( a_i (x-alpha)^i ) and output [a0,a1,...,an] in f LONG m,s; LONG *T; if( n==0 ) { return; } if( n==1 ) { f[0] = MUL64s(f[1],alpha,p); return; } for( m=1; 2*m0; i-- ) { // compute g[i] = alpha^i/i! mod p g[i] = mulrec64(fac,g[i],P); fac = mulrec64((LONG) i,fac,P); } h = array64s(2*n+1); FFTpolmul64s(f,g,h,n,n,p); polrev64s(h,n); f[0] = h[0]; fac = inv; for( i=n; i>0; i-- ) { // compute f[i] = h[i]/i! mod p f[i] = mulrec64(fac,h[i],P); fac = mulrec64(fac,(LONG) i,P); } free(g); free(h); return; }