// Copyright: Tian Chen and Michael Monagan, 2022

#define LONG long long int
#include <stdio.h>
#include <stdlib.h>
#include "int128g.c"


/******************************************************************************************/
/*       Zp arithmetic                                                                    */
/******************************************************************************************/

/* (a + b) mod p, branch-free.  Assumes 0 <= a,b < p < 2^63 and that >> on a
   negative LONG is an arithmetic shift (true for GCC/Clang). */
LONG add64s(LONG a, LONG b, LONG p) {
    LONG sum  = (a - p) + b;   /* subtract p first so the sum cannot overflow */
    LONG wrap = sum >> 63;     /* all ones when sum is negative, otherwise 0 */
    return sum + (wrap & p);   /* add p back only when the sum went negative */
}
/* (a - b) mod p, branch-free.  Assumes 0 <= a,b < p < 2^63 and that >> on a
   negative LONG is an arithmetic shift (true for GCC/Clang). */
LONG sub64s(LONG a, LONG b, LONG p) {
    LONG diff = a - b;
    LONG wrap = diff >> 63;    /* all ones when diff is negative, otherwise 0 */
    return diff + (wrap & p);  /* add p back only when the difference went negative */
}
/* (-a) mod p for 0 <= a < p; zero maps to zero so the result stays below p. */
LONG neg64s(LONG a, LONG p) {
    if (a != 0) return p - a;
    return 0;
}
/*
 * (a * b) mod p using x86-64 inline assembly (GCC-style extended asm).
 *
 * a is loaded into rax and b into rdx (the "0"/"1" constraints tie the inputs
 * to the two output registers).  mulq %rdx forms the unsigned 128-bit product
 * rdx:rax = rax * rdx; divq then divides rdx:rax by p, leaving the quotient in
 * rax (q, unused) and the remainder in rdx (r), which is returned.
 *
 * divq raises a divide error when the quotient does not fit in 64 bits, so the
 * operands must satisfy a*b / p < 2^64; this holds whenever 0 <= a,b < p.
 * p must be non-zero.  Only builds for x86-64 targets.
 */
LONG mul64s(LONG a, LONG b, LONG p) {
        LONG q, r;
        __asm__ __volatile__(           \
        "       mulq    %%rdx           \n\t" \
        "       divq    %4              \n\t" \
        : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p));
        return r;
}

/* Modular inverse c^(-1) mod p via the extended Euclidean algorithm,
   assuming 0 < c < p < 2^63.
   Returns 0 when no inverse exists (gcd(c,p) != 1), which includes c = 0. */
LONG inv64s( LONG c, LONG p ) {
    LONG g = c, rem = p;        /* remainder sequence; g ends up as gcd(c,p) */
    LONG s = 1, s_next = 0;     /* cofactors of the original c for g and rem */
    while( rem != 0 ) {
        LONG quo      = g / rem;
        LONG rem_next = g - quo*rem;
        LONG s_tmp    = s - quo*s_next;
        g = rem;      rem = rem_next;
        s = s_next;   s_next = s_tmp;
    }
    if( g != 1 ) return 0;      /* c is not a unit mod p */
    return ( s < 0 ) ? s + p : s;   /* normalise the cofactor into [0,p) */
}

/* Allocate an uninitialised array of n LONGs on the heap.
   Never returns NULL: on failure (including a negative n or a byte count that
   does not fit in size_t) prints "out of memory" and exits with status 1.
   n == 0 yields a valid pointer that may be passed to free().
   The caller owns the result and must free() it. */
LONG *array( LONG n ) {
    LONG *A;
    size_t bytes;
    /* reject sizes that would wrap when converted to a byte count */
    if( n < 0 || (unsigned long long) n > ((size_t) -1) / sizeof(LONG) ) {
        printf("out of memory\n"); exit(1);
    }
    bytes = (size_t) n * sizeof(LONG);
    if( bytes == 0 ) bytes = 1;   /* malloc(0) may legitimately return NULL */
    A = (LONG *) malloc( bytes );
    if( A==NULL ) { printf("out of memory\n"); exit(1); }
    return A;
}

/* Allocate an uninitialised n*n matrix of LONGs (stored as one flat array).
   Consistent with array(): never returns NULL; on allocation failure or a
   byte count that does not fit in size_t it prints "out of memory" and exits
   with status 1.  The callers in this file write through the result without
   checking it, so a NULL return would be dereferenced.
   The caller owns the result and must free() it. */
LONG *matrix64s( int n ) {
    LONG *A;
    LONG N = (LONG) n * (LONG) n;   /* at most 2^62, cannot overflow a LONG */
    size_t bytes;
    if( (unsigned long long) N > ((size_t) -1) / sizeof(LONG) ) {
        printf("out of memory\n"); exit(1);
    }
    bytes = (size_t) N * sizeof(LONG);
    if( bytes == 0 ) bytes = 1;     /* malloc(0) may legitimately return NULL */
    A = (LONG *) malloc( bytes );
    if( A==NULL ) { printf("out of memory\n"); exit(1); }
    return A;
}

/* Write the first n entries of A to stdout as [a0, a1, ..., an-1]
   (no trailing newline); n <= 0 prints "[]". */
void vecprint64s( LONG *A, int n ){
    int idx = 0;
    printf("[");
    while( idx < n ) {
        if( idx > 0 ) printf(", ");   /* separator goes between elements only */
        printf("%lld",A[idx]);
        idx++;
    }
    printf("]");
}

/* Fill A (row-major, n*n entries) with the symmetric Toeplitz matrix T_n whose
   entry (j,k) is a[|k-j|]; a must hold at least n values.
   Does nothing when n <= 0.  Indices are computed in 64 bits so that n*n may
   exceed INT_MAX. */
void filltoeplitz( LONG *A, LONG *a, int n ) {
    LONG N = n, j, k;
    if ( n <= 0 ) return;   /* an empty matrix has no entry to write */
    for ( j=0; j<N; j++ ) {
        A[j*N+j] = a[0];    /* main diagonal */
        for ( k=j+1; k<N; k++ ) {
            /* mirror the off-diagonal entry to keep the matrix symmetric */
            A[j*N+k] = a[k-j];
            A[k*N+j] = a[k-j];
        }
    }
}

/* Determinant mod p of the n*n matrix A by Gaussian elimination.
 *
 * A is stored row-major as a flat array: entry (i,j) is A[n*i+j].
 * A is overwritten (reduced towards a unit upper triangular form).
 * Entries are presumably expected to be reduced to [0,p) and p prime, with
 * recip1/mulrec64 (from int128g.c) providing multiplication mod p through a
 * precomputed reciprocal -- confirm against that file.
 *
 * Returns the determinant in [0,p); 0 if a column has no non-zero pivot.
 * Returns -1 after printing "division by 0" if a non-zero pivot has no
 * inverse mod p (inv64s returned 0), instead of silently continuing with a
 * zeroed row.  n == 0 returns 1.
 *
 * All index arithmetic is done in 64 bits so n*n may exceed INT_MAX.
 */
LONG det64s( LONG * A, int n, LONG p ) {
    LONG N = n;
    LONG i,j,k;
    LONG d,t;
    recint P;
    P = recip1(p);
    d = 1;
    for( k=0; k<N; k++ ) {
        for( i=k; i<N && A[N*i+k]==0; i++ ); // look for non-zero pivot in column k
        if( i>=N ) { d = 0; break; }         // whole column is zero: singular
        if( i!=k ) { // interchange row k with row i (columns < k are already zero)
             for( j=k; j<N; j++ ) { t = A[k*N+j]; A[k*N+j] = A[i*N+j]; A[i*N+j] = t; }
             d = neg64s(d,p);                // a row swap flips the sign
        }
        d = mulrec64(d,A[k*N+k],P);          // accumulate the pivot into the determinant
        t = inv64s(A[k*N+k],p);
        if( t==0 ) { printf("division by 0\n"); return -1; } // pivot is not a unit mod p
        A[N*k+k] = 1;
        for( j=k+1; j<N; j++ ) { A[N*k+j] = mulrec64(A[N*k+j],t,P); } // scale row k to a unit pivot
        for( i=k+1; i<N; i++ ) { // eliminate column k from row i
            if( A[i*N+k]!=0 )
                for( j=k+1; j<N; j++ )
                     // A[i,j] = A[i,j] - A[i,k] A[k,j] mod p
                     A[i*N+j] = sub64s(A[i*N+j],mulrec64(A[i*N+k],A[k*N+j],P), p );
            A[i*N+k] = 0;
        }
    }
    return d;
}

/* det(T_n) mod p, where T_n is the n*n symmetric Toeplitz matrix defined by a.
   A is caller-supplied scratch space for n*n LONGs; it is filled by
   filltoeplitz and then overwritten by the elimination in det64s. */
LONG tdet64s( LONG *a, int n, LONG p, LONG *A ) {
    LONG det;
    filltoeplitz( A, a, n );
    det = det64s( A, n, p );
    return det;
}

/* Fallback used when the Bareiss recurrence meets a zero pivot: build the full
   (n+1)*(n+1) Toeplitz matrix and take its determinant by Gaussian elimination. */
static LONG bareissFallbackGE( LONG *b, int n, LONG p ) {
    LONG *A = matrix64s(n+1);
    LONG d = tdet64s(b,n+1,p,A);
    free(A);
    return d;
}

/* Determinant mod p of a symmetric Toeplitz matrix.  Ref: Bareiss (1969) Algorithm (3.4)
 *
 * Input:  b = [b_0,...,b_n], the diagonals of the (n+1)*(n+1) symmetric
 *         Toeplitz matrix T (b_0 on the main diagonal, b_k on the k-th
 *         off-diagonals); n >= 0; p the modulus.
 * Output: Det(T) mod p.
 *
 * Whenever a pivot is zero (b_0 == 0 or a[n] becomes 0 in some step) the
 * result is computed by Gaussian elimination on the dense matrix instead.
 * All scratch arrays are released before returning.
 */
LONG detBareissSymmetric( LONG *b, int n, LONG p ) {
    LONG *a, *anew, *inv, prod, m, result = 0;
    int i, j, zeropivot = 0;
    if ( n==0 ) return b[0];   // 1x1 matrix; the recurrence below would read inv[-1]
    if ( b[0]==0 ) return bareissFallbackGE( b, n, p );
    a = array( 2*n+1 ); anew = array( 2*n+1 ); inv = array( n+1 );
    // a holds b reversed followed by b: a[n] = b_0 is the current pivot
    a[0] = b[n]; for ( j=1; j<n+1; j++ ){ a[j] = b[n-j]; a[j+n] = b[j]; }
    inv[0] = inv64s( a[n],p ); prod = a[n];
    for ( i=1; i<n; i++ ) {
        m = mul64s( a[n+i], inv[i-1], p );   // multiplier shared by every update of step i
        for ( j=i; j<n+1; j++ ) { // j=-n+i to 0 in (3.4b)
            anew[j] = sub64s( a[j], mul64s( m, a[2*n-j+i], p ), p );
        }
        if ( anew[n]==0 ) { zeropivot = 1; break; }   // cannot continue: use G.E.
        inv[i] = inv64s( anew[n],p ); prod = mul64s( anew[n], prod, p );
        for ( j=n+i+1; j<2*n+1; j++ ) { // j=i+1 to n in (3.4b)
            anew[j] = sub64s( a[j], mul64s( m, a[2*n-j+i], p ), p );
        }
        for ( j=0;j<2*n+1;j++ ) { a[j] = anew[j]; }
    }
    if ( !zeropivot ) {
        // last pivot; the determinant is the product of all pivots
        anew[n] = sub64s( a[n], mul64s(mul64s(a[2*n],inv[n-1],p), a[2*n], p ), p );
        result = mul64s( anew[n], prod, p );
    }
    free(a); free(anew); free(inv);
    if ( zeropivot ) return bareissFallbackGE( b, n, p );
    return result;
}


/* Precompute the inverses needed by Newton interpolation on the points
   A[0..n-1]: for each j in 1..n-1,
       I[j] = 1 / ( (A[j]-A[0]) (A[j]-A[1]) ... (A[j]-A[j-1]) )  mod p.
   I[0] is not written.  If some product is zero (two equal points) the
   function prints a message and exits with status 1. */
void NewtonInverses(LONG *A, LONG n, LONG *I, LONG p)
{   LONG k,m,acc;
    recint P = recip1(p);
    for (k=1; k<n; k++)
    {   acc = sub64s(A[k],A[0],p);
        m = 1;
        while (m < k)
        {   acc = mulrec64(acc,sub64s(A[k],A[m],p),P); // acc *= (A[k]-A[m])
            m++;
        }
        if( acc==0 ) { printf("x coordinates must be distinct\n"); exit(1); }
        I[k] = inv64s(acc,p);
    }
}

/* Newton interpolation mod p through the n points (A[k],Y[k]).
   I[k], k = 1..n-1, must hold the inverses of prod_{m<k} (A[k]-A[m]), as
   produced by NewtonInverses.  The Newton coefficients are built in V[0..n-1]
   and then converted in place to the standard (power) basis.
   Returns the degree d of the interpolant (-1 if it is zero or if n < 1);
   V[d+1..n-1] are zero.  Exits with status 1 if two points coincide. */
LONG NewtonInterp4(LONG *A, LONG *Y, LONG *I, LONG n, LONG *V, LONG p)
{   LONG deg,k,m,w,acc;
    recint P = recip1(p);
    if( n<1 ) return -1;
    V[0] = Y[0];
    for (k=1; k<n; k++)
    {   acc = V[0];                       // value at A[k] of the interpolant built so far
        w = sub64s(A[k],A[0],p);          // running product (A[k]-A[0])...(A[k]-A[m-1])
        for (m=1; m<k; m++)
        {   acc = add64s(acc,mulrec64(w, V[m], P),p);   // acc += V[m] w
            w = mulrec64(w,sub64s(A[k],A[m],p),P);      // w *= (A[k]-A[m])
        }
        if( w==0 ) { printf("x coordinates must be distinct\n"); exit(1); }
        V[k] = mulrec64(sub64s(Y[k],acc,p),I[k],P);
    }
    deg = n-1;
    while( deg>=0 && V[deg]==0 ) deg--;   // deg = deg(f)
    for (m=1; m<=deg; m++)                // convert to standard basis using in-place Horner
        for (k=deg-m; k<deg; k++)
            V[k] = sub64s(V[k],mulrec64(A[deg-m],V[k+1],P),p);
    return deg;
}

/* Evaluate f(a) mod p by Horner's rule, where f = f[0] + f[1] x + ... + f[d] x^d.
   P is the reciprocal structure passed through to mulrec64.
   Returns 0 for d < 0 (the zero polynomial). */
LONG poleval(LONG *f, LONG d, LONG a, LONG p, recint P ) {
     LONG acc,k;
     if( d<0 ) return 0;
     acc = f[d];
     k = d;
     while( k>0 ) {
         k--;
         acc = add64s(f[k],mulrec64(acc,a,P),p);   // acc = acc*a + f[k]
     }
     return acc;
}

/* LONG interp2var( LONG *beta, int n, int vi, int vj, LONG di, LONG dj, LONG dmax, LONG *xx, LONG *I, LONG *M, LONG p, LONG CNT )
{   // beta is an array of n evaluation points, di=deg(a,xi),dj=deg(a,xj),dmax=max(di,dj),
    // xx and I are interpolation points and pre-computed inverses, respectively, both have sizes dmax+1
    // Result is stored onto M, a long array of size (dj+1)*(di+1), coeffs of a(xi,xj).
    LONG i,j, *alpha=array(n), *Ytemp=array(dj+1), *V=array(dmax+1); int k; 
    for ( k=0; k<n; k++ ) alpha[k] = beta[k]; // copy evaluation points to alpha
    for ( i=0; i<di+1; i++ ) { alpha[vi-1] = xx[i]; 
	for ( j=0; j<dj+1; j++ ) { alpha[vj-1] = xx[j];
        Ytemp[j] = detBareissSymmetric( alpha, n-1, p ); CNT++; 
		}
	    NewtonInterp4( xx, Ytemp, I, dj+1, V, p );
	    for ( j=0; j<dj+1; j++ ) M[(di+1)*j+i] = V[j]; // copy results back to columns of M
    }
    for ( j=0; j<dj+1; j++ ) {
	    NewtonInterp4( xx, M+(di+1)*j, I, di+1, V, p );   
	    for ( i=0; i<di+1; i++ ) M[(di+1)*j+i] = V[i];  // copy results back to rows of M
    }
    free(alpha); free(Ytemp); free(V); 
    return CNT; 
}	 */

/* Tabulate the determinant on a (di+1) x (dj+1) grid of points.
   A private copy of the n values in beta is made; entry vi-1 is replaced by
   xx[i] and entry vj-1 by xx[j], and detBareissSymmetric is called on the
   copy (with size argument n-1) for every i in 0..di and j in 0..dj.
   The value for (i,j) is stored at M[(di+1)*j+i].
   CNT is incremented once per determinant and the updated count is returned. */
LONG callBB( LONG *beta, int n, int vi, int vj, LONG di, LONG dj, LONG *xx, LONG *M, LONG p, LONG CNT ) {
    LONG *pt = array(n);
    LONG stride = di+1;   // row length of M
    LONG col, row;
    int t;
    for ( t=0; t<n; t++ ) pt[t] = beta[t]; // work on a copy so beta is left untouched
    for ( col=0; col<stride; col++ ) {
        pt[vi-1] = xx[col];
        for ( row=0; row<dj+1; row++ ) {
            pt[vj-1] = xx[row];
            M[stride*row+col] = detBareissSymmetric( pt, n-1, p );
            CNT++;
        }
    }
    free(pt);
    return CNT;
}

/* Bivariate interpolation in place on M.
 *
 * On entry M[(di+1)*j+i] holds the value at (xx[i], xx[j]) for i in 0..di and
 * j in 0..dj (the layout written by callBB).  Each column of M is first
 * interpolated over xx[0..dj], then each row over xx[0..di]; on exit M holds
 * the results of those interpolations in the same layout.
 *
 * xx and I are the interpolation points and the inverses precomputed by
 * NewtonInverses; dmax must be at least max(di,dj) because it sizes the
 * scratch vector.  vi and vj are unused and kept only for interface
 * compatibility.
 *
 * Each column is interpolated exactly once, after all of its values have been
 * gathered into Ytemp, so NewtonInterp4 never sees a partly filled buffer.
 */
void interp2var2( int vi, int vj, LONG di, LONG dj, LONG dmax, LONG *xx, LONG *I, LONG *M, LONG p ) {
    LONG i,j,*V=array(dmax+1),*Ytemp=array(dj+1);
    (void) vi; (void) vj;
    for ( i=0; i<di+1; i++ ) {
        for ( j=0; j<dj+1; j++ ) Ytemp[j] = M[(di+1)*j+i];   // gather column i of M
        NewtonInterp4( xx, Ytemp, I, dj+1, V, p );
        for ( j=0; j<dj+1; j++ ) M[(di+1)*j+i] = V[j];       // copy results back to column i of M
    }
    for ( j=0; j<dj+1; j++ ) {
        NewtonInterp4( xx, M+(di+1)*j, I, di+1, V, p );
        for ( i=0; i<di+1; i++ ) M[(di+1)*j+i] = V[i];       // copy results back to row j of M
    }
    free(V);
    free(Ytemp);
}





