
// Copyright Michael Monagan 2019-2022
// Compile with  
// gcc -O3 -shared -o gcd8.so -fPIC gcd8.c
// gcc -O3 -c gcd8.c

// This file has my classical O(d^2) arithmetic library for Fp[x]
// It supports 63 bit primes i.e., p < 2^63
// It eliminates the O(d^2) divisions by p using accumulators
// It also uses Roman Pearce's mulrec64 routine for multiplication mod p from the file "int128g.c"
// For 2021 I've added routines to support the multivariate GCD using Kronecker substitution
// Fall 2021 I've added fast evaluation for Ayoola

// April 2022 gcd8.c  I've added fast evaluation for Tian in evalLIST
// Copyright Michael Monagan 2000--2022.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>


#define LONG long long int
#define ULONG unsigned long long int

/******************************************************************************************/
/*  Zp utilities                                                                          */
/******************************************************************************************/

#define UINT64 unsigned long long

#include "int128g.c"

//typedef struct {
//        UINT64 s;       /* shift */
//        UINT64 v;       /* reciprocal of d */
//        UINT64 d0;      /* divisor shifted up */
//        UINT64 d1;
//} recint;
//
//recint recip1(UINT64  p);
//UINT64 mulrec64(UINT64  a, UINT64  b, recint  v);


/* State of the multiplicative generator behind rand64s: every step replaces
   seed by mult*seed (mod 2^64).  Both must be set before rand64s is used. */
ULONG seed;
ULONG mult;

/* Return a pseudo-random value in [0,p), p > 0.
   Two generator steps are taken; the upper 32 bits of each new state are
   combined as (hi << 31) | lo and the result is reduced modulo p. */
LONG rand64s(LONG p) {
    LONG hi, lo;
    seed *= mult;
    hi = seed >> 32;
    seed *= mult;
    lo = seed >> 32;
    return ((hi << 31) | lo) % p;
}


/* Smaller of two ints. */
int  min32s(int a, int b) { return ( a<b ) ? a : b; }
/* Larger of two ints. */
int  max32s(int a, int b) { return ( a>b ) ? a : b; }
/* a+b mod p for operands already reduced into [0,p).
   The sum is formed as (a-p)+b, which lies in [-p,p) and so cannot overflow;
   p is added back when that intermediate is negative. */
LONG add64s(LONG a, LONG b, LONG p) {
    LONG sum = (a-p)+b;
    if( sum<0 ) sum += p;
    return sum;
}
/* a-b mod p for operands already reduced into [0,p):
   a negative difference is lifted back into range by adding p. */
LONG sub64s(LONG a, LONG b, LONG p) {
    LONG diff = a-b;
    if( diff<0 ) diff += p;
    return diff;
}
/* -a mod p; zero maps to zero so the result is never p itself. */
LONG neg64s(LONG a, LONG p) {
    if( a!=0 ) return p-a;
    return 0;
}
/* Return a*b mod p using x86-64 mulq/divq.
   a is placed in rax and b in rdx (the "0"/"1" constraints tie them to the
   outputs).  mulq leaves the 128-bit product in rdx:rax; divq then divides
   that by p (operand %4), leaving the quotient in rax (q, unused) and the
   remainder in rdx (r).
   divq traps if the quotient does not fit in 64 bits, so the high word of the
   product must be below p; that holds when a and b are both less than p. */
LONG mul64s(LONG a, LONG b, LONG p) {
        LONG q, r;
        __asm__ __volatile__(           \
        "       mulq    %%rdx           \n\t" \
        "       divq    %4              \n\t" \
        : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p));
        return r;
}


        /* z += a1:a0 */
        #define zadd(z,a0,a1) __asm__(\
        "       addq    %4, %0  \n\t" \
        "       adcq    %5, %1  \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "r"(a0), "r"(a1))

        /* z -= a1:a0 */
        #define zsub(z,a0,a1) __asm__(\
        "       subq    %4, %0  \n\t" \
        "       sbbq    %5, %1  \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "r"(a0), "r"(a1))

        /* z = a*b */
        #define zmul(z,a,b) __asm__(\
        "       mulq    %%rdx   \n\t" \
                : "=a"(z[0]), "=d"(z[1]) : "a"(a), "d"(b))

        /* z += a*b */
        #define zfma(z,a,b) do {        \
        unsigned long u,v;              \
        __asm__(                        \
        "       mulq    %%rdx           \n\t" \
        "       addq    %%rax, %0       \n\t" \
        "       adcq    %%rdx, %1       \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]), "=a"(u), "=d"(v) : "0"(z[0]), "1"(z[1]), "a"(a), "d"(b));\
        } while (0)

        /* z -= a*b */
        #define zfms(z,a,b) do {        \
        unsigned long u,v;              \
        __asm__(                        \
        "       mulq    %%rdx           \n\t" \
        "       subq    %%rax, %0       \n\t" \
        "       sbbq    %%rdx, %1       \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]), "=a"(u), "=d"(v) : "0"(z[0]), "1"(z[1]), "a"(a), "d"(b));\
        } while (0)
        /* z[0] = z % p */
        /* z[1] = z / p */
        /* quotient can overflow */
        #define zdiv(z,p) __asm__(\
        "       divq    %4      \n\t" \
                : "=a"(z[1]), "=d"(z[0]) : "a"(z[0]), "d"(z[1]), "r"(p))

        /* z = z % p safe */
        #define zmod(z,p) __asm__(\
        "       divq    %4      \n\t" \
        "       xorq    %0, %0  \n\t" \
                : "=a"(z[1]), "=d"(z[0]) : "a"(z[0]), "d"(z[1] < p ? z[1] : z[1] % p), "r"(p))

        /* z = z << s */
        #define zshl(z,s) __asm__(\
        "       shldq   %%cl, %0, %1    \n\t" \
        "       shlq    %%cl, %0        \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "c"(s))


/* Inverse of c modulo p by the extended Euclidean algorithm.
   Intended for 0 < c < p < 2^63.
   The loop runs Euclid on (c,p) while tracking the cofactor of c.
   Returns 0 when gcd(c,p) != 1 (no inverse exists); otherwise returns the
   cofactor, shifted up by p when it comes out negative. */
LONG modinv64s( LONG c, LONG p )
{
    LONG g = c, h = p;   /* current remainder pair               */
    LONG u = 1, v = 0;   /* cofactors of the original c for g, h */
    while( h != 0 ) {
        LONG quo = g / h;
        LONG rem = g - quo*h;
        LONG w   = u - quo*v;
        g = h;   u = v;
        h = rem; v = w;
    }
    if( g != 1 ) return 0;
    return ( u < 0 ) ? u + p : u;
}


/* a^n mod p by binary exponentiation (low bit first), intended for
   0 <= a < p < 2^63.  A negative a is first lifted by p.
   n == 0 yields 1 and n == 1 yields a without any multiplication;
   a negative n skips the loop and yields 1. */
LONG powmod64s( LONG a, LONG n, LONG p )
{
    LONG acc, sq;
    if( a<0 ) a += p;   // protect from bad input
    if( n==0 ) return 1;
    if( n==1 ) return a;
    acc = 1;
    sq = a;
    while( n>0 ) {
        if( n & 1 ) acc = mul64s(acc,sq,p);
        sq = mul64s(sq,sq,p);
        n /= 2;
    }
    return acc;
}

/* a^n mod p by binary exponentiation, with every product done by mulrec64
   using the reciprocal P (the argument p itself is not read).
   Intended for 0 <= a < p < 2^63.  The lowest exponent bit seeds the
   accumulator; the remaining bits are consumed while squaring. */
LONG powmodP64s( LONG a, LONG n, LONG p, recint P )
{
    LONG acc, sq;
    if( n==0 ) return 1;
    if( n==1 ) return a;
    acc = ( n&1 ) ? a : 1;
    sq = a;
    n = n/2;
    while( n>0 ) {
        sq = mulrec64(sq,sq,P);
        if( n&1 ) acc = mulrec64(acc,sq,P);
        n = n/2;
    }
    return acc;
}


/******************************************************************************************/
/* Polynomial routines                                                                    */
/******************************************************************************************/

/* Store x into A[0..n-1]. */
void vecfill64s( LONG x, LONG *A, int n )
{
    int k = 0;
    while( k<n ) A[k++] = x;
}

/* Copy the degree-d polynomial A (coefficients A[0..d]) into B, lowest
   coefficient first.  Nothing is copied when d < 0. */
void polcopy64s( LONG *A, int d, LONG *B )
{
    int k = 0;
    while( k<=d ) { B[k] = A[k]; k++; }
}

/* Fill A with a monic polynomial of degree d: A[0..d-1] are drawn from
   rand64s(p) in increasing index order and A[d] is set to 1. */
void polrand64s( LONG *A, int d, LONG p ) {
    int k = 0;
    while( k<d ) { A[k] = rand64s(p); k++; }
    A[d] = 1;
}

/* Print A[0..n-1] to stdout as "[a0, a1, ..., an-1];" followed by a newline. */
void vecprint64s( LONG *A, int n )
{
    int k = 0;
    printf("[");
    while( k<n ) {
        printf("%lld",A[k]);
        k++;
        if( k<n ) printf(", ");   /* separator between elements only */
    }
    printf("];\n");
}

/* Return 1 when A[i]==B[i] for all 0 <= i < n, otherwise 0.
   Elements after the first mismatch are not read. */
int vecequal64s( LONG *A, LONG *B, int n ) {
    int k;
    for( k=0; k<n; k++ ) {
        if( A[k]!=B[k] ) return 0;
    }
    return 1;
}

/* Print the polynomial [a0,a1,...,ad] to stdout as ad*x^d+...+a1*x^1+a0;
   Zero coefficients of positive degree are skipped; the constant term is
   always printed.  d == -1 prints "0;".  The argument p is not used. */
void polprint64s( LONG *A, int d, LONG p ) {
    int e = d;
    if( d==-1 ) { printf("0;\n"); return; }
    while( e>0 ) {
        if( A[e]!=0 ) printf("%lld*x^%d+",A[e],e);
        e--;
    }
    printf("%lld;\n",A[0]);
}

/* Return 1 when the coefficient arrays a[0..d] and b[0..d] agree, else 0.
   Coefficients after the first mismatch are not read. */
int polequal64s( LONG *a, LONG *b, int d ) {
    int k;
    for( k=0; k<=d; k++ ) {
        if( a[k]!=b[k] ) return 0;
    }
    return 1;
}

/* Evaluate the degree-d polynomial a at x modulo p by Horner's rule,
   a[0]+x(a[1]+x(a[2]+...)), and return the value.  d == -1 (zero
   polynomial) gives 0. */
LONG poleval64s(LONG *a, int d, LONG x, LONG p) {
    recint P = recip1(p);
    LONG acc;
    int k;
    if( d==-1 ) return 0;
    acc = a[d];
    for( k=d-1; k>=0; k-- ) {
        LONG prod = mulrec64(x,acc,P);
        acc = add64s(a[k],prod,p);
    }
    return acc;
}

/* Evaluate the degree-d polynomial a at each of the n points x[0..n-1],
   storing y[j] = a(x[j]) mod p. */
void polmultieval64s(LONG *a, int d, LONG *x, LONG *y, int n, LONG p) {
    int k = 0;
    while( k<n ) {
        y[k] = poleval64s(a,d,x[k],p);
        k++;
    }
}

/* c = a + b mod p, where a has degree da and b has degree db.
   Returns the degree of c.  Only when da == db can the leading terms cancel,
   so only then is the result scanned for leading zeros. */
int poladd64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG p) {
   int k, low = min32s(da,db);
   for( k=0; k<=low; k++ ) c[k] = add64s(a[k],b[k],p);
   if( da==db ) {
       int dc = da;
       while( dc>=0 && c[dc]==0 ) dc--;
       return dc;
   }
   if( da<db ) {
       // b is longer: its upper coefficients pass through unchanged
       for( k=da+1; k<=db; k++ ) c[k] = b[k];
       return db;
   }
   // a is longer
   for( k=db+1; k<=da; k++ ) c[k] = a[k];
   return da;
}

/* c = a - b mod p, where a has degree da and b has degree db.
   Returns the degree of c.  Only when da == db can the leading terms cancel,
   so only then is the result scanned for leading zeros. */
int polsub64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG p) {
   int k, low = min32s(da,db);
   for( k=0; k<=low; k++ ) c[k] = sub64s(a[k],b[k],p);
   if( da==db ) {
       int dc = da;
       while( dc>=0 && c[dc]==0 ) dc--;
       return dc;
   }
   if( da<db ) {
       // b is longer: its upper coefficients enter negated
       for( k=da+1; k<=db; k++ ) c[k] = neg64s(b[k],p);
       return db;
   }
   // a is longer: its upper coefficients pass through unchanged
   for( k=db+1; k<=da; k++ ) c[k] = a[k];
   return da;
}


/* A := A - (a*x + b)*B in Zp[x]; returns the degree of the updated A.
   dA, dB are the degrees of A and B (dB == -1 means B = 0, A is unchanged).
   A must have room for dB+2 coefficients: it is zero-padded up to index dB+1.
   The middle coefficients use the two-word accumulator z so that
   a*B[i-1] + b*B[i] is reduced mod p only once per coefficient. */
int polsubmul( LONG *A, LONG *B, LONG a, LONG b, int dA, int dB, LONG p ) {
   // compute A = A - (ax+b) B efficiently

   LONG t; int i; ULONG z[2];

   if( dB==-1 ) return dA; // B = 0
   z[0] = z[1] = 0LL;

   // if deg(A) <= deg(B) then pad A with zeroes
   while( dA<=dB ) A[++dA] = 0;

   // constant term is special: only b*B[0] contributes
   t = mul64s(b,B[0],p) ;
   A[0] = sub64s(A[0],t,p);

   // coefficient i of (ax+b)B is a*B[i-1] + b*B[i]; z holds the unreduced
   // sum, zmod leaves it mod p in z[0], and the subtraction is fixed up by
   // adding p when it goes negative
   for( i=1; i<=dB; i++ ) { zmul(z,a,B[i-1]); zfma(z,b,B[i]); zmod(z,p);
        t = A[i]-z[0]; A[i] = t + ((t>>63)&p); }

   // update leading term from B: only a*B[dB] contributes
   t = mul64s(a,B[dB],p);
   A[dB+1] = sub64s(A[dB+1],t,p);

   // compute and return degree (a stored value of p also counts as zero)
   while( dA>=0 && (A[dA]==0 || A[dA]==p) ) dA--;
   return dA;
}


/* A := A - (a*x + b)*B in Zp[x], with all products done by mulrec64 using
   the reciprocal P of p.  Returns the degree of the updated A.
   dA, dB are the degrees of A and B; dB == -1 (B = 0) leaves A untouched.
   A is zero-padded up to index dB+1, so it must have room for dB+2 entries.
   A diagnostic line is printed when the returned degree equals the degree
   passed in. */
int polsubmulP( LONG *A, LONG *B, LONG a, LONG b, int dA, int dB, LONG p, recint P ) {

   int k, degin;
   LONG u, v;

   if( dB==-1 ) return dA; // B = 0: nothing to subtract

   degin = dA;

   // zero-extend A so every coefficient touched below exists
   while( dA<=dB ) { dA++; A[dA] = 0; }

   // constant term: only b*B[0] contributes
   u = mulrec64(b,B[0],P);
   A[0] = sub64s(A[0],u,p);

   // middle terms: coefficient k of (a x + b) B is a*B[k-1] + b*B[k]
   k = 1;
   while( k<=dB ) {
       u = mulrec64(a,B[k-1],P);
       v = mulrec64(b,B[k],P);
       u = add64s(u,v,p);
       A[k] = sub64s(A[k],u,p);
       k++;
   }

   // top term: only a*B[dB] contributes
   u = mulrec64(a,B[dB],P);
   A[dB+1] = sub64s(A[dB+1],u,p);

   // strip leading zeros (a stored value of p also counts as zero)
   while( dA>=0 && (A[dA]==0 || A[dA]==p) ) dA--;

   if( dA==degin ) printf("polsubmul failure: dAin=%d dAout=%d\n",degin,dA);
   return dA;
}


/* Formal derivative: fp = f' mod p for f of degree d, i.e.
   fp[i-1] = i*f[i] for 1 <= i <= d.  Returns the degree of fp
   (leading zeros stripped; -1 when the derivative is zero). */
int poldiff64s( LONG *f, int d, LONG *fp, LONG p ) {
    recint P = recip1(p);
    int k, top;
    for( k=1; k<=d; k++ ) fp[k-1] = mulrec64(f[k],(LONG) k,P);
    top = d-1;
    while( top>=0 && fp[top]==0 ) top--;
    return top;
}

/* compute C(x) = A(x)^2 mod p and return deg(C) */
/* we allow C to overwrite A i.e. polsqr64s(A,A,d,p) */
/* A has degree d (d < 0 is returned unchanged); C needs 2*d+1 entries.      */
/* Coefficient k is the sum of A[i]*A[k-i] over all valid i.  The symmetric  */
/* pairs i < k-i are accumulated once in the two-word accumulator z and then */
/* doubled; the middle square A[i]^2 (when k is even) is added afterwards.   */
/* Invariant: the high word z[1] is brought below p before every step that   */
/* could otherwise overflow 128 bits; subtracting p from z[1] changes z by a */
/* multiple of p, so the residue is unaffected.                              */
int polsqr64s( LONG * A, LONG * C, int d, LONG p )
{
    int i,k,m,dc; ULONG z[2];
    if( d<0 ) return d;
    for( k=2*d; k>=0; k-- ) {
       m = min32s(k,d);
       i = max32s(0,k-d);
       z[0] = 0; z[1] = 0;
       // two symmetric pairs per pass, one reduction in between
       while( i<m-2 ) {
            zfma(z,A[i],A[m]); i++; m--;
            if( z[1]>=p ) z[1] -= p;
            zfma(z,A[i],A[m]); i++; m--;
       }
       if( i<m ) { zfma(z,A[i],A[m]); i++; m--; }
       // z[1] can be up to 1.5*p here, so reduce before doubling: with
       // z[1] < p the doubled value stays below 2*p*2^64 < 2^128
       if( z[1]>=p ) z[1] -= p;
       zadd(z,z[0],z[1]);                 // z = 2*z
       if( z[1]>=p ) z[1] -= p;
       if( i==m ) zfma(z,A[i],A[i]);      // middle term, counted once
       zmod(z,p);
       C[k] = z[0];
    }
    // p may not be prime, so the leading coefficient of the square can vanish
    for( dc = 2*d; dc>=0 && C[dc]==0; dc-- );
    return( dc );
}


/* compute C(x) = A(x) * B(x) mod p and return deg(C) */
/* we allow C to overwrite either A or B i.e. polmul64s(A,B,A,da,db,p) */
/* da, db are the degrees of A and B; C needs da+db+1 entries.              */
/* If either operand is zero (negative degree) the product is zero: C is    */
/* left untouched and that negative degree is returned.                     */
/* Each coefficient is accumulated in the two-word accumulator z; the high  */
/* word is reduced below p every second product so z cannot overflow, and a */
/* single zmod finishes the coefficient.                                    */
int polmul64s( LONG * A, LONG * B, LONG * C, int da, int db, LONG p)
{
    int i,k,m,dc; ULONG z[2];
    if( da<0 ) return da;
    if( db<0 ) return db;   // zero second factor: product is zero, not degree da
    dc = da+db;
    for( k=dc; k>=0; k-- ) {
       i = max32s(0,k-db);
       m = min32s(k,da);
       z[0] = 0; z[1] = 0;
       // two products per pass, one reduction in between
       while( i<m ) {
           zfma(z,A[i],B[k-i]); i++;
           if( z[1]>=p ) z[1] -= p;
           zfma(z,A[i],B[k-i]); i++;
       }
       if( i==m ) {
           zfma(z,A[i],B[k-i]);
           if( z[1]>=p ) z[1] -= p;
       }
       zmod(z,p);
       C[k] = z[0];
    }
    for( ; dc>=0 && C[dc]==0; dc-- );
    return( dc );
}


/* divide A by B and put the remainder and quotient in A */
/* return the degree of the remainder                    */
/* da, db are the degrees of A and B.  db < 0 prints "division by zero" and */
/* exits.  If da < db, A is left unchanged and da is returned.              */
/* On return A[0..db-1] holds the remainder and A[db..da] the quotient.     */
int poldiv64s( LONG * A, LONG * B, int da, int db, LONG p )
{
    int dq,dr,k,j,m; LONG t,inv; ULONG z[2];
    if( db<0 ) { printf("division by zero\n"); exit(1); }
    if( da<db ) return da; else { dq = da-db; dr = db-1; }
    // inverse of the leading coefficient of B (0 if it is not invertible)
    inv = modinv64s(B[db],p);
    // work from the top coefficient down; entries A[k] with k >= db become
    // quotient coefficients, entries below db become remainder coefficients
    for( k=da; k>=0; k-- ) {
        z[0] = 0ll; z[1] = 0ll;
        m = min32s(dr,k);
        j = max32s(0,k-dq);
        //for( j=max32s(0,k-dq); j<=m; j++ ) { t -= ((LONG) B[j])*A[k-j+db]; t += (t>>63) & M; }
        // z = sum of B[j]*A[k-j+db] over j, two products per pass with the
        // high word reduced below p in between so z cannot overflow
        while( j<m ) {
            zfma(z,B[j],A[k-j+db]); j++;
            if( z[1]>=p ) z[1] -= p;
            zfma(z,B[j],A[k-j+db]); j++;
        }
        if( j==m ) zfma(z,B[j],A[k-j+db]);
        if( z[1]>=p ) z[1] -= p;
        zmod(z,p);
        // t = A[k] - z mod p
        t = A[k] - z[0];
        t += (t>>63) & p;
        // quotient coefficients are scaled by 1/lc(B); skipped when B is monic
        if( k>=db && inv!=1 ) t = mul64s(t,inv,p);
        A[k] = t;
    }
    while( dr>=0 && A[dr]==0 ) dr--;
    return( dr );
}


/* Multiply the degree-d polynomial A in place by the scalar x modulo p.
   x == 1 is a no-op and x == -1 negates each coefficient; any other x goes
   through mul64s. */
void polscamul64s( LONG x, LONG *A, int d, LONG p ) {
    int k;
    if( x==1 ) return;
    for( k=0; k<=d; k++ ) {
        A[k] = ( x==-1 ) ? neg64s(A[k],p) : mul64s(x,A[k],p);
    }
}


/* Scale the degree-d polynomial A in place so its leading coefficient is 1.
   Nothing is done for d < 0 or when A is already monic. */
void monic64s( LONG *A, int d, LONG p ) {
    LONG lcinv;
    int k;
    if( d<0 ) return;
    if( A[d]==1 ) return;
    lcinv = modinv64s(A[d],p);
    k = 0;
    while( k<d ) {
        A[k] = mul64s(lcinv, A[k], p);
        k++;
    }
    A[d] = 1;   // leading coefficient is 1 by construction
}


/* B = A(x+alpha) mod p for A of degree d; B receives d+1 coefficients.
   Horner's rule on polynomials: starting from B = A[d], each step replaces
   B by (x+alpha)*B + A[i] for i = d-1 down to 0, updating B in place from
   the top coefficient downwards. */
void poltranslate64s( LONG *A, int d, LONG alpha, LONG *B, LONG p ) {
    int deg, k;
    recint P = recip1(p);
    B[0] = A[d];
    for( deg=0; deg<d; deg++ ) {          // deg = current degree of B
        B[deg+1] = B[deg];                // new leading coefficient
        for( k=deg; k>=1; k-- ) {
            B[k] = add64s(mulrec64(B[k],alpha,P),B[k-1],p);
        }
        B[0] = add64s(mulrec64(B[0],alpha,P),A[d-1-deg],p);
    }
}

    
/* Compute the monic gcd of A and B in Zp[x] with the Euclidean algorithm,
   store it in A and return its degree.  Both A and B are destroyed.
   da, db are the input degrees; db < 0 prints "division by zero" and exits.
   When the degrees of the current pair differ by exactly one (the normal
   case) the linear quotient a*x + b is formed directly and applied with
   polsubmulP; otherwise a general division is done. */
int polgcd64s( LONG * A, LONG * B, int da, int db, LONG p ) {
    LONG *hi, *lo, *swap;      // hi = current dividend, lo = current divisor
    LONG lcinv, qa, qb;
    int drem, tmp;
    recint P;
    if( db<0 ) { printf("division by zero\n"); exit(1); }
    P = recip1(p);
    hi = A; lo = B;
    if( da<db ) { hi = B; lo = A; tmp = da; da = db; db = tmp; }
    for(;;) {
        if( db>0 && da-db==1 ) {
            // normal case: quotient = qa x + qb
            lcinv = modinv64s(lo[db],p);
            qa = mulrec64(hi[da],lcinv,P);
            qb = mulrec64(qa,lo[db-1],P);
            qb = mulrec64(lcinv,sub64s(hi[da-1],qb,p),P);
            drem = polsubmulP(hi,lo,qa,qb,da,db,p,P);  // hi = hi - (qa x + qb) lo
            if( drem>=db ) printf("failure\n");
        } else {
            drem = poldiv64s(hi,lo,da,db,p);
        }
        if( drem<0 ) {
            // lo divides hi, so gcd(A,B) = lo
            if( lo!=A ) polcopy64s(lo,db,A);
            monic64s( A, db, p );
            return db;
        }
        swap = hi; hi = lo; lo = swap;
        da = db; db = drem;
    }
}


/* Extended Euclidean algorithm in Zp[x].
   Inputs: A (degree da) and B (degree db), both non-zero; either being zero
   prints "inputs must be non-zero" and exits.
   Outputs: G, S, T and their degrees in dG[0], dS[0], dT[0]; S and/or T may
   be passed as null pointers, in which case they are not computed.
   W is caller-supplied scratch space. */
void polgcdext64s( LONG *A, LONG *B, int da, int db,
                  LONG *G, LONG *S, LONG *T, int *dG, int *dS, int *dT,
                  LONG *W, LONG p )
{
    // Solve S A + T B = G = monic gcd(A,B) for G,S,T in Zp[x]
    // The arrays A and B are used for the remainder sequence so they are destroyed
    // G,S,T must all be of size max(da+1,db+1)
    // W is working storage of size 2max(da+1,db+1)
    // if S==0 or T==0 then S (and/or T) are not computed

    int m,dr,ds,dt,dq,ds1,ds2,dt1,dt2; LONG a,b,u;
    LONG *q,*r,*r1,*r2,*s,*s1,*s2,*t,*t1,*t2;

    recint P; P = recip1(p);

    if( da<0 || db<0 ) { printf("inputs must be non-zero\n"); exit(1); }
    m = max32s(da+1,db+1);
    // (r1,r2) is the remainder pair, (s1,s2) and (t1,t2) the cofactor pairs:
    // initially r1 = A with s1 = 1, t1 = 0 and r2 = B with s2 = 0, t2 = 1
    r1 = A; r2 = B;
    if(S) { s1 = S; s2 = W;   s1[0]=1; ds1=0; ds2=-1; }
    if(T) { t1 = T; t2 = W+m; t2[0]=1; dt2=0; dt1=-1; }
    while( 1 ) {
        if( db>0 && da-db==1 ) { // normal case
            u = modinv64s(r2[db],p);
            a = mul64s(r1[da],u,p);
            b = mul64s(a,r2[db-1],p);
            b = mul64s(u,sub64s(r1[da-1],b,p),p);             // quotient = a x + b
            dr = polsubmul(r1,r2,a,b,da,db,p);                // r1 = r1 - (a x + b) r2
            if(S) ds = polsubmul(s1,s2,a,b,ds1,ds2,p);        // s1 = s1 - (a x + b) s2
            if(T) dt = polsubmul(t1,t2,a,b,dt1,dt2,p);        // t1 = t1 - (a x + b) t2
            //dr = polsubmulP(r1,r2,a,b,da,db,p,P);              // r1 = r1 - (a x + b) r2
            //if(S) ds = polsubmulP(s1,s2,a,b,ds1,ds2,p,P);      // s1 = s1 - (a x + b) s2
            //if(T) dt = polsubmulP(t1,t2,a,b,dt1,dt2,p,P);      // t1 = t1 - (a x + b) t2
        }
        else {
            // general case: poldiv64s leaves the quotient in r1[db..da];
            // G is used as scratch for the products q*s2 and q*t2
            dr = poldiv64s(r1,r2,da,db,p);                 // r1 = [remainder,quotient]
            q  = r1+db; dq = da-db;
            if(S) ds = polmul64s(q,s2,G,dq,ds2,p);
            if(S) ds = polsub64s(s1,G,s1,ds1,ds,p);        // s1 = s1 - q s2
            if(T) dt = polmul64s(q,t2,G,dq,dt2,p);
            if(T) dt = polsub64s(t1,G,t1,dt1,dt,p);        // t1 = t1 - q t2
        }
        if( dr<0 ) { /* r2 | r1 so gcd(A,B)=r2 */
            polcopy64s(r2,db,G);
            if(S) if( s2!=S ) polcopy64s(s2,ds2,S);
            if(T) if( t2!=T ) polcopy64s(t2,dt2,T);
            // normalize: scale G, S and T by 1/lc(G) so that G is monic
            if( G[db]!=1 ) {
                u = modinv64s(G[db],p);
                polscamul64s(u,G,db,p);
                if(S) polscamul64s(u,S,ds2,p);
                if(T) polscamul64s(u,T,dt2,p);
            }
            dG[0] = db;
            if(S) dS[0] = ds2;
            if(T) dT[0] = dt2;
            return;
        }
        // rotate the pairs for the next Euclidean step
        r = r1; r1 = r2; r2 = r;  da = db;   db = dr;
        if(S) { s = s1; s1 = s2; s2 = s; ds1 = ds2; ds2 = ds; }
        if(T) { t = t1; t1 = t2; t2 = t; dt1 = dt2; dt2 = dt; }
    }
}


/* C(x) := A(x)^n mod B(x) mod p;  0<=deg(A)<deg(B) and R must be of size 2*db-1 */
/* If A(x) is not reduced mod B(x) then A is first replaced by A mod B (in place) */
/* Returns deg(C).  n == 0 gives C = 1.  The exponent is processed from its  */
/* most significant bit by square-and-multiply; R is scratch for the         */
/* unreduced products.                                                       */
int polpowmod64s( LONG * A, LONG n, LONG * B, int da, int db, LONG *C, LONG *R, LONG p )
{
    int bits[63], nbits, dc;

    if( n==0 ) { C[0] = 1; return 0; }
    if( da>=db ) da = poldiv64s(A,B,da,db,p);       // reduce A mod B first

    // binary expansion of n, least significant bit first
    nbits = 0;
    while( n>0 ) { bits[nbits] = n&1; n = n/2; nbits++; }

    // the top bit is accounted for by starting from C = A
    polcopy64s(A,da,C);
    dc = da;
    for( nbits -= 2; nbits>=0; nbits-- ) {
        // C := C^2 mod B
        dc = polsqr64s(C,R,dc,p);
        dc = poldiv64s(R,B,dc,db,p);
        polcopy64s(R,dc,C);
        if( bits[nbits]==1 ) {
            // C := A*C mod B
            dc = polmul64s(A,C,R,da,dc,p);
            dc = poldiv64s(R,B,dc,db,p);
            polcopy64s(R,dc,C);
        }
    }
    return dc;
}


// Input f in Zp[x] of degree d > 0, a known product of d linear factors.
// Output roots of f in R.
// The input array f is destroyed.
// W is a scratch array of size at least 3*d
//
// Method: for a random alpha, g = gcd( (x+alpha)^((p-1)/2) + 1, f ) collects
// a subset of the linear factors of f.  When g is a proper factor, f is
// split into g and f/g and both are processed recursively.  An unlucky
// alpha (constant power, or g = 1) is retried in a loop with a fresh alpha.
void polsplit64s( LONG *f, int d, LONG *R, LONG *W, LONG p )
{
   int da,dg; LONG alpha, A[2];
   // linear factor x + f[0]: its root is -f[0] mod p
   if( d==1 ) { R[0] = neg64s(f[0],p); return; }
   for(;;) {
       alpha = rand64s(p); A[1] = 1; A[0] = alpha;
       da = polpowmod64s( A, (p-1)/2, f, 1, d, W, W+d, p );
       if( da==0 ) continue;                           // alpha is unlucky, try again
       W[0] = add64s(W[0],1,p);                        // W = (x+alpha)^((p-1)/2) + 1 mod f
       polcopy64s( f, d, W+d );
       dg = polgcd64s( W, W+d, da, d, p );             // g = gcd( W, f ) in W
       if( dg!=0 ) break;                              // g = 1 ==> alpha is unlucky, try again
   }
   poldiv64s(f,W,d,dg,p);                          // compute quotient q = f/g destroying f
   polcopy64s(W,dg-1,f);                           // f = [ g mod x^dg followed by q ]
   polsplit64s(f+dg,d-dg,R,W,p);                   // roots of q go to R[0..d-dg-1]
   f[dg] = 1;                                      // restore the leading 1 of the monic g
   polsplit64s(f,dg,R+d-dg,W,p);                   // roots of g go to R[d-dg..d-1]
   return;
}


/* Compute the distinct roots in Zp of f (degree d) and store them in R.
   Returns the number of roots found.  f is destroyed; W is scratch space
   that is passed on to polpowmod64s, polgcd64s and polsplit64s.
   A polynomial of degree <= 0 has no roots to report and returns 0; this
   also ends the recursion once all factors of x have been stripped.
   The generator state (seed, mult) is reset before splitting so results are
   reproducible. */
int polroots64s( LONG * f, int d, LONG * R, LONG *W, LONG p )
{
   int i, da, dg; LONG A[2]; extern ULONG seed,mult;
   if( d<=0 ) return 0;
   // strip factors of x: a zero low coefficient means 0 is a root
   for( i=0; i<d && f[i]==0; i++ );
   if( i>0 ) { R[0]=0; return( 1 + polroots64s(f+i,d-i,R+1,W,p) ); }
   if( f[d]!=1 ) monic64s(f,d,p);
   A[1] = 1;
   A[0] = 0;
   da = polpowmod64s( A, p-1, f, 1, d, W, W+d, p );    // W = x^(p-1) mod f
   if( da==0 && W[0]==1 ) dg = d; // f is all linear factors
   else { W[0] = sub64s(W[0],1,p); dg = polgcd64s( f, W, d, da, p ); }   // f = gcd(f,W-1)
   if( dg==0 ) return 0;
   seed = 1;
   mult = 6364136223846793003ll;
   polsplit64s( f, dg, R, W, p );
   return dg; // number of roots in R
}


/* Compute a monic polynomial Lambda(x) for the sequence a[0..N-1] over Zp
   using the half extended Euclidean algorithm on x^(2n) and the reversed
   sequence, where n = N/2 after trailing zeros of a are dropped.
   The result is written to L and its degree returned; -1 is returned when
   the (truncated) sequence is all zero.  L is also used as scratch for
   quotient products; W is caller-supplied working storage. */
int BerlekampMassey64s( LONG *a, int N, LONG *L, LONG *W, LONG p )
{
    // Input sequence a = [a1,a2,a3,...,aN]
    // Output polynomial Lambda(x) is written to L
    // Uses the half extended Euclidean algorithm
    int i,m,n,dr,dq,dr0,dr1,dv0,dv1,dt;
    LONG *r,*q,*r0,*r1,*v0,*v1,*t,u,A,b;
    //recint P;
    while( N>0 && a[N-1]==0 ) N--; // ignore trailing zero entries of a
    n = N/2;
    N = 2*n;                       // use an even number of terms
    if( N==0 ) { L[0] = 1; return 0; }
    m = N-1;
    // W is space for r0 = x^N and r1 of degree m and v0 and v1 of degree at most n
    r0 = W; r1 = r0+N+1; v0 = r1+N; v1 = v0+n+1;
    vecfill64s(0,r0,N); r0[N] = 1; dr0 = N;             // r0 = x^(2*n)
    for(i=0; i<N; i++) r1[i] = a[m-i];
    for(dr1=m; dr1>=0 && r1[dr1]==0; dr1--);            // r1 = sum(a[m-i]*x^i,i=0..m)
    if( dr1==-1 ) return -1;
    dv0 = -1;                                           // v0 = 0
    v1[0] = 1; dv1 = 0;                                 // v1 = 1
    //P = recip1(p);
    // Euclidean steps continue while deg(r1) >= n
    while( n <= dr1 ) {
        if( dr1>0 && dr0-dr1==1 ) { // normal case
            u = modinv64s(r1[dr1],p);
            A = mul64s(r0[dr0],u,p);
            b = mul64s(A,r1[dr1-1],p);
            b = mul64s(u,sub64s(r0[dr0-1],b,p),p);             // quotient q = A x + b
            //dr = polsubmulP(r0,r1,A,b,dr0,dr1,p,P);            // r0 = r0 - (A x + b) r1
            dr = polsubmul(r0,r1,A,b,dr0,dr1,p);            // r0 = r0 - (A x + b) r1
            // dt = polsubmulP(v0,v1,A,b,dv0,dv1,p,P);            // v0 = v0 - (A x + b) v1
            dt = polsubmul(v0,v1,A,b,dv0,dv1,p);            // v0 = v0 - (A x + b) v1
        } else {
           dr = poldiv64s(r0,r1,dr0,dr1,p);
           q = r0+dr1; dq = dr0-dr1;                           // q = quo(r0,r1)
           dt = polmul64s(q,v1,L,dq,dv1,p);                    // L = q*v1 (scratch)
           dt = polsub64s(v0,L,v0,dv0,dt,p);                   // v0 = v0 - q*v1
        }
        r = r0; r0 = r1; r1 = r; dr0 = dr1; dr1 = dr;         // r0,r1 = r1,rem(r0,r1)
        t = v0; v0 = v1; v1 = t; dv0 = dv1; dv1 = dt;         // v0,v1 = v1,v0 - q*v1
        //printf("r0 = "); polprint64s(r0,dr0);
        //printf("r1 = "); polprint64s(r1,dr1);
        //printf("v0 = "); polprint64s(v0,dv0);
        //printf("v1 = "); polprint64s(v1,dv1);
    }
    // the final v1, made monic, is Lambda
    if( dv1>=0 ) {
        polcopy64s(v1,dv1,L);
        monic64s(L,dv1,p);
    }
    return dv1;
}


void polLambda64s( LONG *R, int n, LONG *L, LONG *W, LONG p ) {
// Compute L = prod_{i=0}^{n-1} (x-R[i])
// L must be of length n+1 and W must be of length n
// You can use R for W as in polLambda64s( R, n, L, R, p ) which will destroy R
//
// Divide and conquer: the first m = n/2 roots give a monic factor x^m + lo1
// and the remaining d = n-m roots give x^d + lo2.  Their product is
//   x^n + x^d*lo1 + x^m*lo2 + lo1*lo2
// which is assembled in L from the low parts lo1, lo2 saved in W.
    int i,m,d;
    // base cases n = 0, 1, 2 are written out directly
    if( n==0 ) { L[0] = 1; return; }
    if( n==1 ) { L[0] = neg64s(R[0],p); L[1] = 1; return; }
    if( n==2 ) { L[0] = mul64s(R[0],R[1],p); L[1] = neg64s(add64s(R[0],R[1],p),p); L[2] = 1; return; }
    m = n/2; d = n-m; 
    polLambda64s( R, m, L, W, p );           // L = [ x, x, 1, -, - ]  if n=4
    polLambda64s( R+m, d, L+m, W, p );       // L = [ x, x, y, y, 1 ]
    for( i=0; i<n; i++ ) W[i] = L[i];        // W = [ x, x, y, y ]
    polmul64s( W, W+m, L, m-1, d-1, p );     // L = [ a, b, c, -, - ]
    L[n-1] = 0; L[n] = 1;                    // L = [ a, b, c, 0, 1 ]
    poladd64s( W, L+d, L+d, m-1, m-1, p );   //   + [ -, -, y, y, - ] 
    poladd64s( W+m, L+m, L+m, d-1, d-1, p ); //   + [ -, -, x, x, - ]
    return;
}


/* Allocate an uninitialized array of n LONG entries with malloc.
   Prints "out of memory" and exits if the allocation fails.
   The caller releases the array with free(). */
LONG * array(LONG n) {
    LONG *buf = (LONG *) malloc( n*sizeof(LONG) );
    if( buf==0 ) { printf("out of memory\n"); exit(1); }
    return buf;
}


/********************************************************************************************/
/*                                                                                          */
/*    Eval speedups for GCD code                                                            */
/*                                                                                          */
/********************************************************************************************/

/* Replace each exponent Y[i], 0 <= i < n, by alpha^Y[i] mod p. */
void evalinit64s( LONG *Y, int n, LONG alpha, LONG p ) {
   recint P = recip1(p);
   int k = 0;
   while( k<n ) {
       Y[k] = powmodP64s( alpha, Y[k], p, P );
       k++;
   }
}

/* Componentwise update C[i] = C[i]*Y[i] mod p for 0 <= i < n. */
void evalnext64s( LONG *C, LONG *Y, int n, LONG p ) {
   recint P = recip1(p);
   int k = 0;
   while( k<n ) {
       C[k] = mulrec64(C[k],Y[k],P);
       k++;
   }
}
 
/* Sum consecutive runs of C into the coefficients of B:
   B[i] is the sum mod p of the next T[i] entries of C, for i = 0..dx.
   Returns the degree of B (index of its last non-zero entry, or -1).
   The argument n is not read. */
int evaladd64s( LONG *C, int n, int *T, int dx, LONG *B, LONG p ) {
   int e, cnt, top, pos = 0;
   for( e=0; e<=dx; e++ ) {
       LONG sum = 0;
       for( cnt=0; cnt<T[e]; cnt++ ) {
           sum = add64s(sum,C[pos],p);
           pos++;
       }
       B[e] = sum;
   }
   top = dx;
   while( top>=0 && B[top]==0 ) top--;
   return top;
}


/* Exponents below MAXPOW are looked up in the caller's power table. */
#define MAXPOW 64
/* x^n mod p, where alpha[k] supplies the k-th power for 0 <= k < MAXPOW.
   Larger exponents recurse on n/2, square the result, and multiply by x
   when n is odd.  A negative n prints "powmod -ve" and exits. */
LONG powmodPnew( LONG x, LONG n, LONG *alpha, recint P ) {
    LONG half, sq;
    if( n<0 ) {printf("powmod -ve\n"); exit(1); }
    if( n<MAXPOW ) return alpha[n];
    half = powmodPnew(x,n/2,alpha,P);
    sq = mulrec64(half,half,P);
    return ( n&1 ) ? mulrec64(x,sq,P) : sq;
}

/*********************************************************************/
/*  These routines work with the Maple DAG directly                  */
/*********************************************************************/

/* DAG type tags, compared against the value returned by ID().           */
/* NOTE(review): the numeric values presumably mirror the Maple kernel's */
/* internal ids -- confirm against the Maple version in use.             */
#define NAME 8
#define TABLEREF 10
#define PROD 14
#define SUM 16
#define POLY 17
#define EXPSEQ 31
#define LIST 32
#define SET 33
#define INTNEG 1
#define INTPOS 2


/* Type tag of a DAG node: bits 32..37 of its header word a[0]. */
int ID( LONG *a ) {
    return (int) ((a[0] >> 32) & 63);
}

/* Length field of a DAG node: the low 32 bits of its header word a[0]. */
LONG LENGTH( LONG *a ) {
    const LONG low32 = (1LL << 32) - 1;
    return a[0] & low32;
}
    
/* Extract the degrees of f in all n variables simultaneously.
   f is a POLY DAG with nf terms (anything else prints an error and exits).
   The packed monomial of term t is read from f[2*t+2]; it is split into
   fields of 64/(n+1) bits, with the last variable in the lowest field.
   On return D[j] is the largest exponent seen for variable j, 0 <= j < n. */
void getdegs64s( LONG *f, int nf, int *D, int n )  {
    int t, v, width;
    LONG field, packed, e;
    if( ID(f)!=POLY ) { printf("getdeg64s: input is not a POLY DAG\n"); exit(1); }
    width = 64/(n+1);
    field = ( 1LL << width ) - 1;
    for( v=0; v<n; v++ ) D[v] = 0;
    for( t=0; t<nf; t++ ) {
        packed = f[2*t+2];
        v = n;
        while( v-- > 0 ) {              // v runs n-1 down to 0
            e = packed & field;
            if( e>D[v] ) D[v] = e;
            packed = packed >> width;
        }
    }
}

/* Count the terms of f by their degree in variable x.
   Input: f, a POLY dag in n variables with nf terms and degree dx in x
   (x is numbered from 1).  The packed monomial of term t is read from
   f[2*t+2]; each field is 64/(n+1) bits wide.
   Output: T[d] = the number of terms of f of degree d in x, 0 <= d <= dx.
   A term whose degree in x exceeds dx prints an error and exits. */
void getterms64s( LONG *f, int nf, int *T, int dx, int x, int n )  {
    int t, e, width, offset;
    LONG field;
    width = 64/(n+1);
    field = ( 1LL << width ) - 1;
    offset = width*(n-x);               // bit position of the x field
    for( e=0; e<=dx; e++ ) T[e] = 0;
    for( t=0; t<nf; t++ ) {
        e = (f[2*t+2] >> offset) & field;
        if( e>dx ) { printf("dx too small: d=%d  dx=%d\n",e,dx); exit(1); }
        T[e]++;
    }
}


/* Count the terms of f by total degree.
   Input: f, a POLY dag in n variables with nf terms and total degree D.
   The total degree of term t is the top field of its packed monomial
   f[2*t+2], i.e. the monomial shifted right by n*(64/(n+1)) bits.
   Output: T[d] = the number of terms of total degree d, 0 <= d <= D.
   The terms in f are supposed to be sorted by total degree.
   A term of total degree above D prints an error and exits. */
void gettermshomo64s( LONG *f, int nf, int *T, int D, int n )  {
    int t, e, offset;
    offset = (64/(n+1))*n;
    for( e=0; e<=D; e++ ) T[e] = 0;
    for( t=0; t<nf; t++ ) {
        e = f[2*t+2] >> offset;
        if( e>D ) { printf("D too small\n"); exit(1); }
        T[e]++;
    }
}


/* Count the terms of a SUM dag f by their degree in the variable x.
   Output: T[i] = number of terms of degree i in x, 0 <= i <= dx.
   Entries f[1], f[3], ... are the terms.  An odd word (immediate value), a
   NAME or a TABLEREF other than x counts as degree 0; x itself counts as
   degree 1; a PROD is scanned as (factor, exponent) pairs.
   Returns 1 on success, or a negative code:
     -1  x appears but dx == 0, or a term is neither NAME, TABLEREF nor PROD
     -2  an exponent in a PROD is not an immediate (odd) word
     -3  the degree in x exceeds dx
     -4  a PROD factor is neither NAME nor TABLEREF
   An input that is not a SUM prints an error and exits. */
int gettermsSUM64s( LONG *f, int *T, int dx, LONG x )  {
    int t, s, tag, seen;
    LONG term, factor, e, *prod;
    if( ID(f)!=SUM ) { printf("gettermsSUM: input not a SUM\n"); exit(1); }
    for( s=0; s<=dx; s++ ) T[s] = 0;
    for( t=1; t<LENGTH(f); t+=2 ) {
        term = f[t];
        if( term&1 ) { T[0]++; continue; }       // immediate constant
        if( term==x ) {                          // the bare variable
            if( dx==0 ) return -1;
            T[1]++;
            continue;
        }
        prod = (LONG *) term;
        tag = ID(prod);
        if( tag==NAME || tag==TABLEREF ) { T[0]++; continue; }
        if( tag!=PROD ) { printf("bad term in SUM: id=%d \n",tag); return -1; }
        seen = 0;
        for( s=1; s<LENGTH(prod); s+=2 ) {
            factor = prod[s];
            e = prod[s+1];
            if( !(e&1) ) { printf("exponent must be immediate\n");  return -2; }
            e = e>>1;                            // decode the immediate exponent
            if( factor==x ) {
                if( e>dx ) { printf("degree too big\n"); return -3; }
                T[e]++;
                seen = 1;
            }
            tag = ID((LONG *) factor);
            if( tag!=NAME && tag!=TABLEREF ) return -4;
        }
        if( !seen ) T[0]++;                      // product free of x
    }
    return 1;
}


/* Allocate an uninitialized array of n ints with malloc.
   Prints "out of memory" and exits if the allocation fails.
   The caller releases the array with free(). */
int * array32s( LONG n ) {
    int *buf = (int *) malloc( n*sizeof(int) );
    if( buf==0 ) { printf("out of memory\n"); exit(1); }
    return buf;
}


/* Split the POLY dag f (nf terms, n variables) by its degree in variable x
   (numbered from 1, degree at most dx) and evaluate the remaining variables.

   Example: f = 3*x^2*y^2+5*z^3*y+2*x^2+4*y*z, grouped on x as
            g = (5z^3y+4yz) x^0 + 0 x^1 + (3y^2+2) x^2
   T = [#coeff(f,x,i), i=0..dx]                     = [2,0,2]
   C = [(5,4),(),(3,2)]                             coefficients
   M = [(z^3*y,y*z),(),(y^2,1)] | y=alpha[1],z=alpha[2]   monomial values mod p

   C and M are flat arrays with one entry per term; the terms of x-degree k
   occupy positions K[k] .. K[k]+T[k]-1 where K is the prefix sum of T.
   alpha[j-1] is the evaluation point of variable j (the entry for x is not
   used for the monomial value).
   The routine prints a message and exits when n > 32, when x is outside
   1..n, or when a coefficient of f is not an immediate integer. */
void getsupport64s( LONG *f, int nf, LONG *alpha, int n, int x, int dx, int *T, LONG *C, LONG *M, LONG p ) {

    int i,j,k,numbits,d,index;
    LONG mask,y,z,c,m;
    recint P;
    LONG *Powers[32], *W;
    int D[32], *K;

    // validate n and x before they are used for shifts and array sizes
    if( n>32 ) { printf("too many variables\n"); exit(1); }
    if( x<1 || x>n ) { printf("x=%d out of range\n",x); exit(1); }

    // first pass: count the terms per x-degree and turn the counts into
    // starting offsets K[k] into C and M
    getterms64s(f,nf,T,dx,x,n);
    K = array32s(dx+1);
    K[0] = 0; for( i=0; i<dx; i++ ) K[i+1] = K[i]+T[i];

    numbits = 64/(n+1);
    mask = ( 1LL << numbits ) - 1;
    getdegs64s( f, nf, D, n );

    // power tables: Powers[i][e] = alpha[i]^e mod p for 0 <= e <= D[i]
    P = recip1(p);
    for( i=0; i<n; i++ ) {
        y = alpha[i];
        d = D[i];
        W = array(d+1);
        W[0] = 1;
        for( j=1; j<=d; j++ ) W[j] = mulrec64(y,W[j-1],P);
        Powers[i] = W;
    }

    // second pass: T is reused as a per-degree fill counter and ends up
    // holding the same counts as after the first pass
    for( i=0; i<=dx; i++ ) T[i] = 0;
    for( i=0; i<nf; i++ ) {
        z = f[2*i+2];
        c = f[2*i+3];
        // an immediate coefficient c is stored as the odd word 2*c+1
        if( !(c&1) ) { printf("coefficient is not immediate\n"); exit(1); }
        c = c >> 1;
        m = 1;
        k = 0;
        // unpack the exponents, last variable first
        for( j=n; j>0; j-- ) {
            d = z & mask;
            if( j==x ) k = d;
            else { y = Powers[j-1][d];
                   m = mulrec64(m,y,P);
            }
            z = z >> numbits;
        }
        index = T[k]+K[k];
        C[index] = c;
        M[index] = m;
        T[k] = T[k]+1;
    }
    free(K);
    for( i=0; i<n; i++ ) free(Powers[i]);
    return;
}


void getdilatedsupport64s( LONG *f, int nf, LONG *alpha, LONG *beta, int n, int x, int dx, int *T, LONG *C, LONG *M, LONG p ) {
// Group the nf terms of f by their degree in variable x (1 <= x <= n) and, for each term,
// evaluate the other variables at alpha (monomial evaluation) and at beta (dilation).
//
// f = 3x^2y^2 + 5z^3y + 2x^2 + 4yz
// sort terms in f on x so that g = (5z^3y+4yz) x^0 + 0 x^1 + (3y^2+2)x^2
// T = [#coeff(f,x,i), i=0..dx] = [2,0,2]
// d = [(z^3y,yz),(),(y^2,1)] | y=beta[1],z=beta[2] the dilation values
// C = [(5 d[1],4 d[2]),(),(3 d[3],2 d[4])] and
// M = [(z^3*y,y*z),(),(y^2,1)] | y=alpha[1],z=alpha[2] the monomial evaluations
//
// Term i is read from f[2*i+2] (exponents packed in fields of 64/(n+1) bits, variable n in
// the lowest field) and f[2*i+3] (an immediate coefficient word 2*c+1).
// alpha[j-1] and beta[j-1] are the values for variable j.
// Exits the program on invalid n or x, or on a coefficient that is not immediate.

    int i,j,k,numbits,d,index;
    LONG mask,y,z,c,m,s;
    recint P;
    LONG *ALPHA[32], *BETA[32], *W;
    int D[32], *K;

    // Validate n and x first: they size the fixed arrays below and are handed to getterms64s.
    if( n>32 ) { printf("too many variables\n"); exit(1); }
    if( x<1 || x>n ) { printf("x=%d out of range\n",x); exit(1); }

    // First pass: T[i] = number of terms of degree i in x.
    getterms64s(f,nf,T,dx,x,n);

    // K[i] = T[0]+...+T[i-1] is the offset in C and M where the terms of degree i start.
    K = array32s(dx+1);
    K[0] = 0; for( i=0; i<dx; i++ ) K[i+1] = K[i]+T[i];

    numbits = 64/(n+1);
    mask = ( 1LL << numbits ) - 1;

    // Tables of powers ALPHA[i][j] = alpha[i]^j and BETA[i][j] = beta[i]^j for 0 <= j <= D[i].
    getdegs64s( f, nf, D, n );
    for( i=0; i<n; i++ ) ALPHA[i] = array(D[i]+1);
    for( i=0; i<n; i++ ) BETA[i] = array(D[i]+1);

    P = recip1(p);
    for( i=0; i<n; i++ ) {
        d = D[i];
        y = alpha[i];
        W = ALPHA[i]; W[0] = 1;
        for( j=1; j<=d; j++ ) W[j] = mulrec64(y,W[j-1],P);
        y = beta[i];
        W = BETA[i]; W[0] = 1;
        for( j=1; j<=d; j++ ) W[j] = mulrec64(y,W[j-1],P);
    }

    // Second pass: T is reused as a per-degree fill counter; it ends up holding the counts again.
    for( i=0; i<=dx; i++ ) T[i] = 0;
    for( i=0; i<nf; i++ ) {
        c = f[2*i+3];
        if( !(c&1) ) { printf("coefficient is not immediate\n"); exit(1); }
        c = c >> 1;
        z = f[2*i+2];
        m = 1; // monomial evaluation = M(alpha)
        s = 1; // dilation evaluation = M(beta)
        for( j=n; j>0; j-- ) { // peel the exponents off from variable n down to variable 1
            d = z & mask;
            if( j==x ) k = d;
            else { y = ALPHA[j-1][d];
                   m = mulrec64(m,y,P);
                   y = BETA[j-1][d];
                   s = mulrec64(s,y,P);
            }
            z = z >> numbits;
        }
        index = T[k]+K[k];
        C[index] = mulrec64(c,s,P);
        M[index] = m;
        T[k] = T[k]+1;
    }

    free(K);
    for( i=0; i<n; i++ ) free(ALPHA[i]);
    for( i=0; i<n; i++ ) free(BETA[i]);
    return;
}


void getsupporthomo64s( LONG *f, int nf, LONG *alpha, int n, int deg, int *T, LONG *C, LONG *M, LONG p ) {
// f = 3*x^2*y^2+5*z^3*y+2*x^2+4*y*z+7
// The terms of f are already sorted by total degree g = (3x^2y^2+5z^3y) + (2x^2+4yz)
// Count the terms of total degree d in T for 0 <= d <= deg so that
// T = [1,0,2,0,2]
// C = [(3,5),(2,4),7]
// M = eval( [x^2y^2,z^3y,x^2,yz,1], {x=alpha[0],y=alpha[1],z=alpha[2]}) mod p
// Then the homogenized evaluated polynomial in w can be extracted as
// (3M[1]+5M[2]) + (2M[3]+4M[4])w^2 + 7w^4
//
// Term i is read from f[2*i+2] (packed exponents, 64/(n+1) bits per field) and f[2*i+3]
// (an immediate coefficient word 2*c+1).  After the n variable fields are shifted out,
// what is left of the monomial word is used as the term's total degree.
// C[i] and M[i] are stored in the same order as the terms of f.
// Exits the program if n>32, if a coefficient is not immediate, or if a degree exceeds deg.

    int i,j,numbits,d;
    LONG mask,y,c,m;
    ULONG z;   // unsigned so that the shifts do not sign-extend into the top (degree) field
    recint P;
    LONG *Powers[32], *W;
    int D[32];

    if( n>32 ) { printf("too many variables\n"); exit(1); }
    getdegs64s( f, nf, D, n );
    numbits = 64/(n+1);
    mask = ( 1LL << numbits ) - 1;
    P = recip1(p);

    // Powers[i][j] = alpha[i]^j mod p for 0 <= j <= D[i]
    for( i=0; i<n; i++ ) {
        y = alpha[i];
        d = D[i];
        W = array(d+1);
        W[0] = 1;
        for( j=1; j<=d; j++ ) W[j] = mulrec64(y,W[j-1],P);
        Powers[i] = W;
    }

    for( i=0; i<=deg; i++ ) T[i] = 0;
    for( i=0; i<nf; i++ ) {
        z = f[2*i+2];
        c = f[2*i+3];
        if( !(c&1) ) { printf("coefficient is not immediate\n"); exit(1); }
        c = c >> 1;
        m = 1;
        for( j=n; j>0; j-- ) {
            d = z & mask;
            y = Powers[j-1][d];
            m = mulrec64(m,y,P);
            z = z >> numbits;
        }
        d = z;  // remaining top field
        if( d>deg ) { printf("deg too small\n"); exit(1); }
        C[i] = c;
        M[i] = m;
        T[d] = T[d]+1;
    }
    for( i=0; i<n; i++ ) free(Powers[i]);
    return;
}



int evaldeg64s( LONG *f, int nf, LONG *g, int dx, LONG *alpha, int n, int x, LONG p ) {
// f is a POLY polynomial in n variables with nf terms
// The variables in f are numbered 1,2,...,n
// Compute g := eval( f, {x[i] = alpha[i] : i <> x}) mod p and return deg(g)
//
// g must have room for dx+1 coefficients; g[0..dx] is cleared first.
// alpha[j-1] is the value substituted for variable j.
// Returns the largest index i with g[i] != 0, or -1 if g is zero.
// Exits the program on malformed input, or if a term's degree in x does not fit in g.
    LONG mask,c,z,y;
    int numbits,k,d,i,j,id,nv;
    LONG *Powers[32], *W;
    int D[32];
    recint P;

    id = ID(f);
    if( id!=POLY ) { printf("evaldeg64s: input must be a POLY DAG\n"); exit(1); }
    nv = LENGTH((LONG *)f[1]);   // length of the variable sequence, which includes its header word
    if( nv-1!=n ) { printf("evaldeg64s: input has %d variables not %d\n",nv-1,n); exit(1); }
    if( n>32 ) { printf("too many variables\n"); exit(1); }
    if( x<1 || x>n ) { printf("x out of range\n"); exit(1); }
    numbits = 64/(n+1);
    mask = ( 1LL << numbits ) - 1;
    P = recip1(p);
    getdegs64s(f,nf,D,n);

    // Powers[i][j] = alpha[i]^j mod p for 0 <= j <= D[i]
    for( i=0; i<n; i++ ) {
        y = alpha[i];
        d = D[i];
        W = array(d+1);
        W[0] = 1;
        for( j=1; j<=d; j++ ) W[j] = mulrec64(y,W[j-1],P);
        Powers[i] = W;
    }

    for( i=0; i<=dx; i++ ) g[i] = 0;
    for( i=0; i<nf; i++ ) {
        z = f[2*i+2];   // packed exponents, variable n in the lowest field
        c = f[2*i+3];   // immediate coefficient word 2*c+1
        if( !(c&1) ) { printf("coefficient is not immediate\n"); exit(1); }
        c = c >> 1;
        k = 0;
        //   ^5 ^2 ^0 ^3  numbits=64/4=16
        for( j=n; j>0; j-- ) {
            d = z & mask;
            if( j==x ) k = d;
            else { y = Powers[j-1][d];
                   c = mulrec64(c,y,P);
            }
            z = z >> numbits;
        }
        // Refuse to write outside g when dx underestimates the degree in x.
        if( k<0 || k>dx ) { printf("evaldeg64s: dx too small\n"); exit(1); }
        g[k] = add64s(g[k],c,p);
    }
    while( dx>=0 && g[dx]==0 ) dx--;
    for( i=0; i<n; i++ ) free(Powers[i]);
    return dx;
}


LONG GMPsize( LONG *x ) {
    // x is a Maple INTPOS or INTNEG.
    // The size field (_mp_size) lives in the low 32 bits of the word x[1];
    // the upper 32 bits are masked away.
    const ULONG low32 = (1LL << 32) - 1;
    return (LONG) ( ((ULONG) x[1]) & low32 );
}

LONG* GMPint( LONG *x ) {
    // x is a Maple INTPOS or INTNEG.
    // Skip the three leading words: the GMP integer data begins at word 3.
    return &x[3];
}

LONG Zmodp( ULONG *y, LONG n, LONG p, recint P ) {
// Reduce a GMP integer stored in y=[y0|y1|...|yn-1] mod p <2^63
// y[0] is the least significant 64 bit word.  P must be recip1(p).
// Returns 0 when n <= 0.
    ULONG t; LONG x,B;
    t = ~0ULL;                // t = 2^64-1, built without shifting a negative value
    B = t % (ULONG) p;
    B = add64s(B,1LL,p);      // B = 2^64 mod p
    x = 0;
    for( n--; n>=0; n-- ) {   // use Horner y = y[0] + B(y[1] + B(y[2]))
        x = mulrec64(B,x,P);
        x = add64s(x,(LONG) (y[n] % (ULONG) p),p);
    }
    return x;
}

LONG evalpoly64s( LONG *f, LONG *X, LONG *alpha, LONG **T, LONG p ) {
// Evaluate the POLY f modulo p at the point described by the power tables T.
//   f     : POLY dag; f[1] is the EXPSEQ Y of its variables, followed by
//           (packed monomial, coefficient) pairs in f[2], f[3], ...
//   X     : EXPSEQ of the variables being evaluated (entries 1..LENGTH(X)-1)
//   alpha : not used here; the values come from T
//   T     : T[j][d] is read as the d'th power of the value of X[j]
// Coefficients may be immediate integers or Maple INTPOS/INTNEG big integers.
// Returns the value in [0,p), or a negative code:
//   -7  f[1] is not an EXPSEQ, f has too many variables, or a variable of f is not in X
//   -8  a coefficient is neither an immediate nor an INTPOS/INTNEG
    int id;
    LONG i, j, *Y, n, m, t, x, z, numbits, mask, d, s, *S[64];
    recint P;
    P = recip1(p);
    t = LENGTH(f);
    Y = (LONG *) f[1];
    id = ID(Y);
    if( id!=EXPSEQ ) return -7;
    n = LENGTH(Y);
    m = LENGTH(X);
    // S is indexed 1..n-1, so anything larger would overrun it.
    if( n>64 ) { printf("too many variables\n"); return -7; }

    // Eval( f(Y[1],...,Y[n]}, {X[1]=alpha[1],...,X[m]=alpha[m]} ) mod p
    // S[i] = power table of the variable Y[i], looked up by its position in X.
    for( i=1; i<n; i++ ) S[i] = 0;
    for( i=1; i<n; i++ ) {
        x = Y[i];
        for( j=1; j<m; j++ )
           if( x==X[j] ) S[i]=T[j];
    }
    for( i=1; i<n; i++ )
        if( S[i] == 0 )
            { printf("poly is not fully evaluated\n"); return -7; }

    numbits = 64/(n); // n is one more already
    // With no variables (n==1) numbits is 64 and a 64 bit shift would be undefined;
    // the mask is not used in that case because the exponent loop below is empty.
    mask = ( numbits<64 ) ? (((1LL) << numbits) - 1) : -1;

    s = 0;
    for( i=2; i<t; i+=2 ) {
        x = f[i];     // packed exponents
        z = f[i+1];   // coefficient
        if( z&1 ) { // immediate integer
            z = z >> 1;
            // z = z%p; // this is expensive
            if( z<-p || z>=p ) z = z%p;
            if( z<0 ) z = z+p;
        } else { LONG *y;
            y = (LONG *) f[i+1];
            id = ID(y);
            if( id==INTPOS || id==INTNEG ) {
                z = Zmodp((ULONG *) GMPint(y),GMPsize(y),p,P);
                if( id==INTNEG ) z = neg64s(z,p);
            }
            else return -8;
        }
        for( j=n-1; j>0; j-- ) {
            d = x & mask;
            if( d ) z = mulrec64(z,S[j][d],P);
            // case d=0 implies S[j][d]=1 so no multiplication is needed
            x = x >> numbits;
        }
        s = add64s(s,z,p);
    }
    return s;
}

int indexofx( LONG *X, int n, LONG x ) {
// X = [EXPSEQ|x1|x2|...|xn-1]
// Linear search of X[1..n-1] for the word x.
// Returns the first matching position, or 0 when x does not occur.
    int pos = 1;
    while( pos<n ) {
        if( X[pos]==x ) return pos;
        pos++;
    }
    return 0;
}

LONG evalrec( LONG *f, LONG *X, LONG *alpha, LONG **T, LONG p, recint P ) {
// Recursively evaluate the Maple expression f modulo p.
//   X : EXPSEQ of variables (entries 1..LENGTH(X)-1)
//   T : T[i][e] is read as the e'th power of the value of X[i]
//   alpha is only passed through to the recursive calls and to evalpoly64s
// Handles immediate integers, INTPOS, INTNEG, SUM, PROD, NAME, TABLEREF and POLY.
// Returns a value in [0,p), or a negative code on failure:
//   -1  non-immediate exponent in a PROD, or an unsupported dag type
//   -2  division by zero (negative power of something that evaluates to 0)
//   -4  a name that is not in X
//   -7, -8  passed up from evalpoly64s
    LONG x,L,i,j,id,a,b,n;
    x = (LONG) f;
    if( x&1 ) { x = x >> 1; x = x%p; if( x<0 ) x += p; return x; } // immediate integer
    id = ID(f);
    if( id==INTPOS ) {
        x = Zmodp((ULONG *) GMPint(f),GMPsize(f),p,P);
        return x;
    } else if( id==INTNEG ) {
        x = Zmodp((ULONG *) GMPint(f),GMPsize(f),p,P);
        x = neg64s(x,p);
        return x;
    } else if( id==SUM ) {
        // pairs (f[i], f[i+1]); the sum of their products is accumulated
        L = LENGTH(f);
        x = 0;
        for( i=1; i<L; i+=2 ) {
            a = evalrec( (LONG *) f[i], X, alpha, T, p, P );
            b = evalrec( (LONG *) f[i+1], X, alpha, T, p, P );
            if( a<0 ) return a;
            if( b<0 ) return b;
            x = add64s( x, mulrec64( a, b, P ), p );
        }
        return x;
    } else if( id==PROD ) {
        // pairs (factor f[i], immediate exponent f[i+1]); negative exponents mean inverses
        L = LENGTH(f);
        n = LENGTH(X);
        x = 1;
        for( i=1; i<L; i+=2 ) { LONG d;
            d = f[i+1];
            if( !(d&1) ) return -1;
            d = d >> 1;
            if( d<0 ) b = -d; else b = d;
            j = indexofx( X, n, f[i] );
            if( j ) a = T[j][b];   // factor is a variable of X: table lookup
            else { // e.g. x^2 (2xy^2 + 3y) and (x+y)/(x^2+y+2)
                a = evalrec( (LONG *) f[i], X, alpha, T, p, P );
                if( a<0 ) return a;
                a = powmodP64s(a,b,p,P);
            }
            if( d<0 ) {
                if( a==0 ) { printf("division by zero\n"); return -2; }
                a = modinv64s(a,p);
            }
            x = mulrec64( x, a, P );
        }
        return x;
    } else if( id==NAME || id==TABLEREF ) {
        n = LENGTH(X);
        // The value of X[i] is taken from entry 1 of its power table.
        // NOTE(review): this needs T[i] to have at least two entries -- confirm with callers.
        for( i=1; i<n; i++ ) if( f == (LONG *) X[i] ) return T[i][1];
        return -4;
    } else if( id==POLY ) {
        x = evalpoly64s(f,X,alpha,T,p);
        return x;
    }
    // id is a LONG, so convert it to match the %d conversion
    printf("idfail=%d\n",(int) id);
    return -1;
}

LONG eval64s( LONG *f, LONG *X, LONG *alpha, LONG *D, LONG p ) {
// X = [x1,x2,...,xm], alpha in Zp^m, D in Z^n degree maximums
// Evaluate the Maple expression f at X=alpha modulo p using evalrec.
// X, alpha and D are Maple LISTs of equal length; the entries of alpha and D must be
// immediate integers.  For each variable a table of powers 0..D[i] is precomputed.
// Returns the value from evalrec (see there for its negative codes), or
//   -2/-3   X is not a LIST of an EXPSEQ
//   -4/-5   alpha is not a LIST of an EXPSEQ
//   -7/-8   D is not a LIST of an EXPSEQ, or the lengths of X, alpha, D differ (-8)
//   -6      an entry of alpha is not immediate
//   -9      too many variables for the fixed size tables
//   -10     an entry of D is not an immediate non-negative integer
    int n; LONG x, i, j, d, space, *A, *B, beta[64], *T[64];
    recint P; P = recip1(p);

    if( ID(X)!=LIST ) return -2; else X = (LONG *) X[1];
    if( ID(X)!=EXPSEQ ) return -3;
    if( ID(alpha)!=LIST ) return -4; else alpha = (LONG *) alpha[1];
    if( ID(alpha)!=EXPSEQ ) return -5;
    if( ID(D)!=LIST ) return -7; else D = (LONG *) D[1];
    if( ID(D)!=EXPSEQ ) return -8;
    n = LENGTH(X);
    if( LENGTH(alpha) != n ) return -8;
    if( LENGTH(D) != n ) return -8;
    // beta and T are indexed 1..n-1
    if( n>64 ) return -9;

    // beta[i] = alpha[i] reduced into [0,p)
    for( i=1; i<n; i++ ) {
        x = alpha[i];
        if( x&1 ) { x = x>>1; x = x%p; if( x<0 ) x = x+p; beta[i] = x; } else return -6;
    }

    // Total table size; a negative bound would make the tables overlap, so reject it.
    for( space=0, i=1; i<n; i++) {
        d = D[i];
        if( !(d&1) ) return -10;
        d = d >> 1;
        if( d<0 ) return -10;
        space += d+1;
    }

    A = array(space);
    B = A;
    for( i=1; i<n; i++ ) {
        T[i] = B;               // T[i][j] = beta[i]^j for 0 <= j <= D[i]
        d = D[i] >> 1;
        x = beta[i];
        B[0] = 1;
        if( d>0 ) B[1] = x;
        for( j=2; j<=d; j++ ) B[j] = mulrec64(B[j-1],x,P);
        B = B + d+1;
    }
    x = evalrec( f, X, beta, T, p, P );
    free(A);
    return( x );
}

LONG evallists64s( LONG *LL, LONG *X, LONG *alpha, LONG *D, LONG *A, LONG p ) {
// A := Eval( [[x+y,2y^2+3x^2y],[3,5xy^3]], [x,y], [2,3], p );
// LL is a Maple list of r lists, each with c entries; entry (i,j) is evaluated with evalrec
// at X=alpha modulo p and stored in row-major order in A[(i-1)*c+(j-1)].
// X, alpha, D are Maple LISTs of equal length; D holds the maximum degree of each variable.
// Returns 1 on success, a negative code from evalrec if an entry cannot be evaluated, or
//   -1   X is not a LIST          -2   alpha is not a LIST      -5   D is not a LIST
//   -3   LENGTH(alpha) differs    -6   LENGTH(D) differs or D has a bad entry
//   -4   an entry of alpha is not immediate
//   -7   LL is not a LIST         -9   a row is not a LIST      -10  rows differ in length
//   -11  too many variables for the fixed size tables
    LONG n,r,c,d,i,j,x,*W,*B,*L,*T[64],beta[64],space,status;

    recint P; P = recip1(p);

    //  alpha := [ (1,2,3) ];   [ LIST 2, [ EXPSEQ 4 | 1 | 2 | 3 ] ]
    if( ID(X)==LIST ) X = (LONG *) X[1]; else return -1;
    if( ID(alpha)==LIST ) alpha = (LONG *) alpha[1]; else return -2;
    if( ID(D)==LIST ) D = (LONG *) D[1]; else return -5;
    n = LENGTH(X);
    if( LENGTH(alpha)!=n ) return -3;
    if( LENGTH(D) != n ) return -6;
    // beta and T are indexed 1..n-1
    if( n>64 ) { printf("too many variables\n"); return -11; }
    for( i=1; i<n; i++ ) {  // extract evaluation points into beta
        x = alpha[i];
        if( x&1 ) { x = x>>1; x = x%p; if( x<0 ) x = x+p; beta[i] = x; }
        else return -4;
    }
    for( i=1; i<n; i++ )
        if( !(D[i]&1) || D[i]<0 ) { printf("bad degrees\n"); return -6; }
    if( ID(LL)==LIST ) LL = (LONG *) LL[1]; else return -7;
    r = LENGTH(LL);
    c = 1;  // so that an empty LL yields c = 0 below instead of an uninitialised value
    for( i=1; i<r; i++ ) {
        L = (LONG *) LL[i];
        if( ID(L)==LIST ) L = (LONG *) L[1];
        else { printf("list expected\n"); return -9; }
        if( i==1 ) c = LENGTH(L);
        else if( c!= LENGTH(L) )
             { printf("lists must be same length\n"); return -10; }
    }
    r--; c--;
    // LL is an r x c matrix represented as a Maple list of lists
    // Maple lists are indexed from 1 not 0

    // D was validated above: every entry is an immediate, non-negative degree.
    for( space=0, i=1; i<n; i++) space += (D[i] >> 1) + 1;

    W = array(space); // table of powers [ alpha[i]^j : j=0,1,...,D[i] ]
    B = W; // pointer to position in W
    for( i=1; i<n; i++ ) {
        T[i] = B;
        d = D[i] >> 1;
        x = beta[i];
        B[0] = 1;
        if( d>0 ) B[1] = x;
        for( j=2; j<=d; j++ ) B[j] = mulrec64(B[j-1],x,P);
        B = B + d+1;
    }

    status = 1;
    for( i=1; i<=r && status==1; i++ ) {
        L = (LONG *) LL[i]; L = (LONG *) L[1];
        for( j=1; j<=c; j++ ) {
            x = evalrec( (LONG *) L[j], X, beta, T, p, P );
            if( x<0 ) { status = x; break; }  // fall through to free(W) instead of leaking it
            A[(i-1)*c+(j-1)] = x;             // row stride is the number of columns
        }
    }
    free(W);
    return status;
}


int getSUMsupport64s( LONG *f, LONG *X, LONG *alpha, LONG *x, int dx, int *T, LONG *C, LONG *M, LONG p ) {
// f = 3*x^2*y^2+5*z^3*y+2*x^2+4*y*z, X=[y,z], x=x, dx=2
// T = [#coeff(f,x,i), i=0..dx]
// sort terms in f on x so that g = (5z^3y+4yz) x^0 + 0 x^1 + (3y^2+2)x^2
// C = [(5,4),(),(3,2)] and M = [(z^3*y,y*z),(),(y^2,z^3*y)] | y=alpha[1],z=alpha[2]
//
// f is a Maple SUM read as pairs (term f[i], coefficient f[i+1]); X and alpha are Maple
// LISTs and x is a NAME or TABLEREF.  Every coefficient stored in C is reduced into [0,p).
// Returns 1 on success and -1 if an argument has the wrong type or the list lengths differ.
// Exits the program on terms it cannot handle.

    int i,j,k,d,y,id,n,*K;
    LONG t,c,m,*mon,*z;
    recint P;

    if( ID(f)!=SUM ) { printf("f must be a sum\n"); return -1; }
    if( ID(X)!=LIST ) { printf("X must be a list\n"); return -1; }
    if( ID(alpha)!=LIST ) { printf("alpha must be a list\n"); return -1; }
    if( ID(x)!=NAME && ID(x)!=TABLEREF ) { printf("x must be a name\n"); return -1; }
    X = (LONG *) X[1];
    alpha = (LONG *) alpha[1];
    n = LENGTH(X);
    if( n!=LENGTH(alpha) ) return -1;
    for( i=1; i<n; i++ ) {
        if( !(alpha[i]&1) ) { printf("alpha[i] must be immediate\n"); exit(1); }
        t = alpha[i]>>1;
        if( t<0 || t>=p ) { printf("alpha[i] must be on [0,p)\n"); exit(1); }
    }

    // First pass through f: compute T[i]=#coeff(f,x,i); and check f is expanded
    if( gettermsSUM64s(f,T,dx,(LONG) x)<0 ) { printf("gettermsSUM64s: not handled\n"); exit(1); }

    K = array32s(dx+1);
    K[0] = 0; for( i=0; i<dx; i++ ) K[i+1] = K[i]+T[i]; // K is an index into C and M
    for( i=0; i<=dx; i++ ) T[i] = 0;  // T is reused as a per-degree fill counter

    P = recip1(p);

    for( i=1; i<LENGTH(f); i+=2 ) {
        m = f[i];
        c = f[i+1];
        if( c&1 ) c = c >> 1; else { printf("coefficient is not immediate\n"); exit(1); }
        if( m&1 ) {
            // Constant term: the constant itself is in f[i].  Reduce it like every other
            // coefficient so that C only ever holds values in [0,p).
            t = (m >> 1) % p;
            if( t<0 ) t += p;
            j = T[0]+K[0]; C[j] = t; M[j] = 1; T[0]++;
            continue;
        }
        c = c % p;
        if( c<0 ) c += p;
        z = (LONG*) f[i];
        id = ID(z);
        if( id==NAME || id==TABLEREF ) {
            if( z==x ) { k = 1; m = 1; }   // the term is c*x
            else {
                y = indexofx(X,n,f[i]);
                if( y==0 ) { printf("variable not evaluated\n"); exit(1); }
                k = 0;
                m = alpha[y]>>1;
            }
        } else if( id==PROD ) {
            // pairs (variable mon[j], immediate exponent mon[j+1])
            k = 0;
            m = 1;
            mon = z;
            for( j=1; j<LENGTH(mon); j+=2 ) {
                z = (LONG *) mon[j];
                d = mon[j+1];
                if( d&1 ) d = d >> 1; else { printf("power too big\n"); exit(1); }
                if( z==x ) { k = d; continue; }
                y = indexofx(X,n,mon[j]);
                if( y==0 ) { printf("variable not evaluated %d\n",j); exit(1); }
                t = powmodP64s(alpha[y]>>1,d,p,P);
                m = mulrec64(t,m,P);
            }
        }
        else { printf("case not handled\n"); exit(1); }
        j = T[k]+K[k]; C[j] = c; M[j] = m; T[k]++;
    }

    free(K);
    return 1;
}


int evalSUM64s( LONG *f, LONG *X, LONG *alpha, LONG *x, int dx, LONG *g, LONG p ) {
// Eval( f, {X[i]=alpha[i]} ) mod p into g(x)
//
// f is a Maple SUM read as pairs (term f[i], coefficient f[i+1]); X and alpha are Maple
// LISTs of equal length and x is a NAME or TABLEREF.  The values in alpha must be
// immediate integers in [0,p).  g must have room for dx+1 coefficients.
// Returns the largest index i with g[i] != 0, or -1 if g is zero.
// Exits the program on bad arguments, on a term that is not a constant, a name or a
// product of powers of names, and on a term whose degree in x does not fit in g.

    int i,j,k,d,y,id,n;
    LONG t,c,m,*mon,*z;
    recint P;

    if( ID(f)!=SUM ) { printf("f must be a sum\n"); exit(1); }
    if( ID(X)!=LIST ) { printf("X must be a list\n"); exit(1); }
    if( ID(alpha)!=LIST ) { printf("alpha must be a list\n"); exit(1); }
    if( ID(x)!=NAME && ID(x)!=TABLEREF ) { printf("x must be a name\n"); exit(1); }
    X = (LONG *) X[1];
    alpha = (LONG *) alpha[1];
    n = LENGTH(X);
    if( n!=LENGTH(alpha) ) exit(1);
    for( i=1; i<n; i++ ) {
        if( !(alpha[i]&1) ) { printf("alpha[i] must be immediate\n"); exit(1); }
        t = alpha[i]>>1;
        if( t<0 || t>=p ) { printf("alpha[i] must be on [0,p)\n"); exit(1); }
    }

    for( i=0; i<=dx; i++ ) g[i] = 0;

    P = recip1(p);

    for( i=1; i<LENGTH(f); i+=2 ) {
        m = f[i];
        c = f[i+1];
        if( m&1 ) {
            // Constant term: the constant itself is in f[i].  add64s needs both operands
            // in [0,p), so reduce it first.
            c = (m >> 1) % p;
            if( c<0 ) c += p;
            g[0] = add64s(g[0],c,p);
            continue;
        }
        if( c&1 ) c = c >> 1; else { printf("coefficient is not immediate\n"); exit(1); }
        c = c % p;
        if( c<0 ) c += p;
        z = (LONG*) f[i];
        id = ID(z);
        k = 0;   // degree of this term in x
        m = 1;   // value of the rest of the monomial at alpha
        if( id==NAME || id==TABLEREF ) {
            if( z==x ) k = 1;
            else {
                y = indexofx(X,n,f[i]);
                if( y==0 ) { printf("variable not evaluated\n"); exit(1); }
                m = alpha[y]>>1;
            }
        } else if( id==PROD ) {
            // pairs (variable mon[j], immediate exponent mon[j+1])
            mon = z;
            for( j=1; j<LENGTH(mon); j+=2 ) {
                z = (LONG *) mon[j];
                d = mon[j+1];
                if( d&1 ) d = d >> 1; else { printf("power too big\n"); exit(1); }
                if( z==x ) { k = d; continue; }
                y = indexofx(X,n,mon[j]);
                if( y==0 ) { printf("variable not evaluated %d\n",j); exit(1); }
                t = powmodP64s(alpha[y]>>1,d,p,P);
                m = mulrec64(t,m,P);
            }
        }
        // Anything else used to be dropped silently, giving a wrong g.
        else { printf("case not handled\n"); exit(1); }
        // Refuse to write outside g when dx underestimates the degree in x.
        if( k<0 || k>dx ) { printf("evalSUM64s: dx too small\n"); exit(1); }
        c = mulrec64(c,m,P);
        g[k] = add64s(g[k],c,p);
    }
    while( dx>=0 && g[dx]==0 ) dx--;
    return dx;
}

LONG evalNAME64s( LONG *x, LONG *X, LONG *a, LONG p ) {
// Look up the name x in the sequence X and return the matching value a[i].
//   x : a NAME or TABLEREF
//   X : sequence of variables; the valid entries are X[1..LENGTH(X)-1]
//   a : values indexed like X (a[i] belongs to X[i])
//   p : not used
// Exits the program if x is not a name or does not occur in X.
    int i, n;
    if( ID(x)!=NAME && ID(x)!=TABLEREF ) { printf("evalNAME: name expected\n"); exit(1); }
    n = LENGTH(X);
    // LENGTH counts the header word, so the last entry is X[n-1].
    for( i=1; i<n; i++ )
        if( x == (LONG *) X[i] ) return a[i];
    printf("variable not evaluated\n");
    exit(1);
}

LONG modINT( LONG c, LONG p, recint P ) {
// Reduce the Maple integer word c modulo p into [0,p).
// An odd word is an immediate integer whose value is c>>1; an even word is treated as a
// pointer to an INTPOS or INTNEG big integer.  Exits the program for any other dag.
    LONG *big, r;
    int tag;

    if( !(c&1) ) {
        big = (LONG *) c;
        tag = ID(big);
        if( tag!=INTPOS && tag!=INTNEG ) { printf("integer expected\n"); exit(1); }
        r = Zmodp(GMPint(big),GMPsize(big),p,P);
        return ( tag==INTNEG ) ? neg64s(r,p) : r;
    }

    r = c >> 1;
    // only pay for the division when the value is outside (-p,p)
    if( r<-p || r>=p ) r = r%p;
    if( r<0 ) r += p;
    return r;
}
    

LONG evalPROD64s( LONG *f, LONG *X, LONG *a, LONG p, recint P ) {
// Evaluate a PROD modulo p.  f holds pairs (name f[j], immediate exponent f[j+1]);
// each name is looked up with evalNAME64s and raised to its power with powmodP64s.
// Exits the program on a non-immediate or negative exponent.
    int pos, len;
    LONG e, base, prod, *v;

    prod = 1;
    len = LENGTH(f);
    pos = 1;
    while( pos<len ) {
        v = (LONG *) f[pos];
        base = evalNAME64s( v, X, a, p );
        e = f[pos+1];
        if( !(e&1) ) { printf("power too big\n"); exit(1); }
        e = e>>1;
        if( e<0 ) { printf("negative powers not implemented\n"); exit(1); }
        prod = mulrec64( powmodP64s(base,e,p,P), prod, P );
        pos += 2;
    }
    return prod;
}

LONG evalSUM64s2( LONG *f, LONG *X, LONG *a, LONG p, recint P ) {
// Evaluate a SUM modulo p.  f holds pairs (term f[i], coefficient f[i+1]).
// A term that is itself an integer (immediate, INTPOS or INTNEG) is a constant and is
// added as is; a NAME, TABLEREF or PROD term is evaluated and multiplied by its coefficient.
// Exits the program on any other kind of term.
    int i,id; LONG c,s,t,*m;
    s = 0;
    for( i=1; i<LENGTH(f); i+=2 ) {
        c = f[i];
        // Immediate constant: reduce it with modINT, as is done for big constants below,
        // because add64s needs both operands in [0,p).
        if( c&1 ) { s = add64s(s,modINT(c,p,P),p); continue; }
        m = (LONG*) f[i];
        id = ID(m);
        if( id==INTPOS || id==INTNEG ) { c = modINT(c,p,P); s = add64s(s,c,p); continue; }
        c = modINT( f[i+1], p, P );
        if( id==NAME || id==TABLEREF ) t = evalNAME64s(m,X,a,p);
        else if( id==PROD ) t = evalPROD64s(m,X,a,p,P);
        else { printf("evalSUM: case not handled\n"); exit(1); }
        t = mulrec64(t,c,P);
        s = add64s(s,t,p);
    }
    return s;
}

LONG evalPOLY64s( LONG *f, LONG *X, LONG *alpha, LONG p, recint P ) {
// Evaluate the POLY f modulo p.
//   f     : POLY dag; f[1] is the EXPSEQ Y of its variables, followed by
//           (packed monomial, coefficient) pairs in f[2], f[3], ...
//   X     : sequence of variables (entries 1..LENGTH(X)-1)
//   alpha : values indexed like X (alpha[j] belongs to X[j])
// Coefficients are reduced with modINT.  Returns the value in [0,p).
// Exits the program if f is not a POLY, has too many variables, or uses a variable
// that is not in X.
    int i,j,n,numbits;
    LONG mask,m,c,d,s,z,*Y,a[33];

    if( ID(f)!=POLY ) { printf("evalPOLY: POLY expected\n"); exit(1); }
    Y = (LONG *) f[1];
    if( ID(Y)!=EXPSEQ ) { printf("evalPOLY: bad variables\n"); exit(1); }
    n = LENGTH(Y);
    // a is indexed 1..n-1
    if( n>33 ) { printf("too many variables\n"); exit(1); }

    // a[i] = value of the i'th variable of f
    for( i=1; i<n; i++ ) {
        j = indexofx( X, LENGTH(X), Y[i] );
        // Stop here: alpha[0] is not a valid entry, so carrying on would evaluate at garbage.
        if( j==0 ) { printf("variable not in X\n"); exit(1); }
        a[i] = alpha[j];
    }

    numbits = 64/(n); // n is one more already
    // With no variables (n==1) numbits is 64 and a 64 bit shift would be undefined;
    // the mask is not used in that case because the exponent loop below is empty.
    mask = ( numbits<64 ) ? (((1LL) << numbits) - 1) : -1;

    s = 0; // the result
    for( i=2; i<LENGTH(f); i+=2 ) {
        m = f[i]; // monomial
        c = f[i+1]; // coefficient
        z = 1;
        for( j=n-1; j>0; j-- ) { // exponents are packed with the last variable lowest
            d = m & mask;
            if( d ) z = mulrec64(z,powmodP64s(a[j],d,p,P),P);
            m = m >> numbits;
        }
        c = modINT(c,p,P);
        z = mulrec64(c,z,P);
        s = add64s(s,z,p);
    }
    return s;
}

void evalLIST64s( LONG *L, LONG *X, LONG *alpha, LONG *A, LONG p ) {
// Eval(L,{X[i]=alpha[i]}) mod p into A.
// L and X are Maple LISTs.  alpha is a plain C array indexed from 0 whose entries must
// already lie in [0,p).  One value is written to A per element of L, in order.
// Elements may be integers, POLY, SUM, PROD, NAME or TABLEREF dags.
// Exits the program on bad arguments or on an element of any other type.

    int pos,out,tag,nx;
    LONG v,w,val,*e;
    recint P;

    if( ID(L)!=LIST ) { printf("L must be a list\n"); exit(1); }
    if( ID(X)!=LIST ) { printf("X must be a list\n"); exit(1); }

    L = (LONG *) L[1];
    X = (LONG *) X[1];

    // The helpers index X and alpha from 1, so shift alpha down by one slot.
    alpha--;

    P = recip1(p);
    nx = LENGTH(X);

    for( pos=1; pos<nx; pos++ ) {
        v = alpha[pos];
        if( v<0 || v>=p ) { printf("a[%d]=%lld must be reduced mod p\n",pos,v); exit(1); }
    }

    out = 0; // next free slot in A
    for( pos=1; pos<LENGTH(L); pos++ ) {
        w = L[pos];
        if( w & 1 ) {
            val = modINT(w,p,P);   // immediate integer
        } else {
            e = (LONG *) L[pos];
            tag = ID(e);
            if( tag==POLY ) val = evalPOLY64s(e,X,alpha,p,P);
            else if( tag==SUM ) val = evalSUM64s2(e,X,alpha,p,P);
            else if( tag==PROD ) val = evalPROD64s(e,X,alpha,p,P);
            else if( tag==NAME || tag==TABLEREF ) val = evalNAME64s(e,X,alpha,p);
            else if( tag==INTPOS || tag==INTNEG ) val = modINT(w,p,P);
            else { printf("type not handled\n"); exit(1); }
        }
        A[out++] = val;
    }

    return;
}


/*  Test polgcd64s */
/*
int main() {
    LONG *R, *W, *L, p, *g, *a, *b;
    int i,n,d;
    p = 5; p = (p<<55)+1;
    printf("p := %lld;\n",p);
    n = 10;
    R = array(n); for( i=0; i<n; i++ ) R[i] = 2*i+1;
    printf("R := "); vecprint64s(R,n); printf("\n");
    L = array(n+1);
    W = array(n);
    polLambda64s(R,n,L,W,p);
    printf("L := "); polprint64s(L,n,p); printf("\n");

   seed = 1;
   mult = 6364136223846793003ll;

    d = 3;
    g = array(d+1);
    a = array(2*d+1);
    b = array(2*d+1);
    polrand64s(g,d,p);
    polrand64s(a,d,p);
    polrand64s(b,d,p);
    printf("g := "); polprint64s(g,d,p);
    printf("a := "); polprint64s(a,d,p);
    printf("b := "); polprint64s(b,d,p);
    polmul64s(g,a,a,d,d,p);
    polmul64s(g,b,b,d,d,p);
    polprint64s(g,d,p);
    polprint64s(a,2*d,p);
    polprint64s(b,2*d,p);
    d = polgcd64s(a,b,2*d,2*d,p);
    polprint64s(a,d,p);
    return 0;
}
*/
