//polynomial bivariate functions, miscellaneous univariate ones

// Debug switch; it is not referenced in the visible part of this file.
#define DEBUG 0

// Integer aliases used throughout this file. These are textual macros, not typedefs.
#define M_INT long long int
#define LONG long long int
#define ULONG unsigned long long int
#define UINT32 unsigned int
#define UINT64 unsigned long long
// Hand-rolled boolean type: false == 0, true == 1.
// NOTE(review): this would clash with <stdbool.h> if that header were ever included - confirm it is not.
typedef enum{false, true} bool;

#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>

//#include "int128g.c"
#include "polyalg8.c"

//My Algorithms

//zero-fill the first len entries of A (does nothing when len <= 0)
void cleanArray64s(LONG *A, int len){
   int pos = 0;
   while(pos < len){
      A[pos] = 0;
      pos++;
   }
}

//Print the bivariate polynomial stored in A to stdout.
//  A    - coefficient array; entry A[i*(dx+1)+j] is printed as the coefficient of x^j*y^i
//  dx   - degree bound in x, dy - degree bound in y (-1 in either prints "0;")
//  x, y - names to print for the two variables
//Non-zero terms are printed from the highest index down; the constant term A[0]
//is always printed last, even when it is zero.
void polprintbivar64s( LONG *A, int dx, int dy, char * x, char * y){

    int row,col,idx;

    if(dx == -1 || dy == -1) { printf("0;\n"); return; }

    for (row=dy;row>=0;row--){
       for(col=dx;col>=0;col--){
          idx = row*(dx+1)+col;
          //the constant term is handled after the loops
          if(idx == 0) continue;
          if(A[idx] == 0) continue;
          printf("%lld*%s^%d*%s^%d+",A[idx],x,col,y,row);
       }
    }
    printf("%lld",A[0]);
 }

// Accumulates the products A[i]*B[d-i] for i = min..max with zfma into the
// two-word accumulator z, reduces z with zmod(z,p) and returns z[0].
// Two code paths are selected on the size of p:
//   p <  2^50 : products are accumulated with no intermediate correction;
//   p >= 2^50 : z[1] is conditionally reduced by p after (at most) every second product.
// NOTE(review): zfma/zmod are defined outside this file; presumably z is a 128-bit
// accumulator with z[1] the high word and zmod leaves the residue in z[0] - confirm.
ULONG convol64s(LONG *A, LONG *B, LONG min, LONG max, LONG d, LONG p){

   int i;     // loop index; note min and max are LONG and are narrowed to int here
   ULONG z[2];
   z[0] =0; z[1] = 0;

   if( p < 1LL << 50 ) { // small modulus: p < 2^50
      i = min;
      // main loop unrolled three times
      while(i<max-2){
             zfma(z,A[i],B[d-i]);i++;
             zfma(z,A[i],B[d-i]);i++;
             zfma(z,A[i],B[d-i]);i++;
      }
      // remaining terms
      while(i<=max){
         zfma(z,A[i],B[d-i]);i++;
      }
   }
   else{
     i = min;
     // main loop unrolled four times; the high word is corrected after the 1st and 3rd product
     while( i<max-3 ) {
        zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p;
        zfma(z,A[i],B[d-i]); i++;
        zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p;
        zfma(z,A[i],B[d-i]); i++;
     }
     // remaining terms, correcting the high word after every product
     while( i<=max ) { zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p; }

   }

   // final reduction of the accumulator
   zmod(z,p);
   return z[0];


}

//Coefficient-wise sum: C[k] = add64s(A[k],B[k],p) for k = 0..dx.
//C may be the same array as A or B (each entry is read before it is written).
void poladdFast(LONG *A,LONG *B, LONG *C,int dx, LONG p){
   int k = 0;
   while(k <= dx){
      C[k] = add64s(A[k],B[k],p);
      k++;
   }
}

//In-place triangular accumulation on A[0..dz]:
//for each pass first = 0..dz-1, walk pos from dz-1 down to first and
//replace A[pos] by add64s(A[pos],A[pos+1],p).
void addPol(LONG *A, int dz, LONG p){

   int first,pos;

   for(first=0;first<dz;first++){
      pos = dz-1;
      while(pos >= first){
         A[pos] = add64s(A[pos],A[pos+1],p);
         pos--;
      }
   }

}

//Rewrites A in place from powers of (y-alpha) to powers of y.
//  A     - (dy+1) rows of dx+1 coefficients; row i is the x-polynomial multiplying (y-alpha)^i
//  dx,dy - degree bounds in x and y
//  alpha - expansion point, p - modulus
//For deg = 1..dy the coefficients of (y-alpha)^deg are built incrementally in pw,
//row deg of A times pw[row] is added into every lower row, and row deg is
//multiplied by pw[deg].
void ExpandOutBivar(LONG *A,int dx, int dy, LONG alpha, LONG p){

   LONG negAlpha,*pw,*shifted,v;
   int hiRow,loRow,deg,row,col;

   pw = array(dy+1);        //coefficients of (y-alpha)^deg
   shifted = array(dy+1);   //scratch holding y*(y-alpha)^(deg-1)

   cleanArray64s(pw,dy+1);

   for (deg=1;deg<=dy;deg++)
   {
      if(deg==1)
      {
         //first power: pw = y - alpha
         negAlpha = p - alpha;
         pw[0] = negAlpha;
         pw[1] = 1;
      }
      else
      {
         //pw <- pw*(y-alpha) = y*pw + negAlpha*pw
         cleanArray64s(shifted,dy+1);
         for(row=0;row<deg;row++){
            shifted[row+1] = pw[row];
         }
         for(row=0;row<deg;row++){
            v = mul64s(negAlpha,pw[row],p);
            pw[row] = add64s(v,shifted[row],p);
         }
         pw[deg] = shifted[deg];
      }

      //fold row deg of A into the lower rows
      hiRow = deg*(dx+1);
      for(row=0;row<deg;row++){
         loRow = row*(dx+1);
         for(col=0;col<=dx;col++){
            v = mul64s(A[hiRow+col],pw[row],p);
            A[loRow+col] = add64s(A[loRow+col],v,p);
         }
      }
      //row deg itself is scaled by the top coefficient of the current power
      for(col=0;col<=dx;col++){
         A[hiRow+col] = mul64s(A[hiRow+col],pw[deg],p);
      }
   }

   free(pw);
   free(shifted);
}

// Expands each of the n factors stored in F from powers of (y-alpha) to powers
// of y (via ExpandOutBivar), multiplies the expanded factors together and
// subtracts the product from A coefficient-wise.
//   A     - reference polynomial, rows of dx+1 coefficients
//   F     - factors; factor i starts at F[i*2*(dx+1)*(dz+1)]
//   dx,dz - degree bounds, n - number of factors, alpha - expansion point, p - modulus
// NOTE(review): the difference is written into TEMP, which is freed immediately
// afterwards; nothing is returned, printed or stored, so as written the function
// has no observable result. The local `check` is never used.
void ComparePolyFactorsBivar(LONG *A,LONG *F,int dx,int dz,int n,LONG alpha,LONG p){

   LONG *TEMP,*TEMP2,t1,t2;
   int i,j,k,l,m,p1,p2,p3,p4,check;

   // two scratch buffers, each with n slots of 2*(dx+1)*(dz+1) entries
   TEMP = array(2*n*(dx+1)*(dz+1));
   TEMP2 = array(2*n*(dx+1)*(dz+1));

   cleanArray64s(TEMP,2*n*(dx+1)*(dz+1));
   cleanArray64s(TEMP2,2*n*(dx+1)*(dz+1));

   //Expand factors from (y-alpha) -> y
   for(i=0;i<n;i++){
      // copy factor i into slot 0 of TEMP
      p1 = i*2*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = F[p1 + p2 + k];
            TEMP[p2 + k] = t1;
         }
      }
      //Store them in TEMP
      ExpandOutBivar(TEMP,dx,dz,alpha,p);
      //printf("AllEXP:=");polprintbivar64s(TEMP,dx,dz,"x","y");printf(":\n");
      // every factor except the last is moved to slot n-1-i; the last one stays in slot 0
      if(i<n-1){
          p1 = (n-1-i)*2*(dx+1)*(dz+1);
          for(j=0;j<=dz;j++){
             p2 = j*(dx+1);
             for(k=0;k<=dx;k++){
                t1 = TEMP[p2 + k];
                TEMP[p1 + p2 + k] = t1;
             }
          }
       }
   }
   //printf("AllEXP:=");polprintbivar64s(TEMP,dx,n*dz,"x","y");printf(":\n");

   //Expand the factors into 1 large polynomial
   for(i=1;i<n;i++){

      cleanArray64s(TEMP2,n*(dx+1)*(dz+1));
      //multiply factors: slot i times the running product in slot 0, accumulated in TEMP2
      // NOTE(review): the product is indexed with row stride dx+1 while k+m can reach 2*dx,
      // so terms of x-degree above dx land in the next row - confirm this is intended.
      p1 = i*2*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP[p1+p2+k];
            if(t1 != 0){
               for(l=0;l<=dz;l++){
                  p3 = l*(dx+1);
                  for(m=0;m<=dx;m++){
                     t2 = TEMP[p3+m];
                     if(t2 != 0){
                        t2 = mul64s(t1,t2,p);
                        TEMP2[(j+l)*(dx+1)+(k+m)] = add64s(t2,TEMP2[(j+l)*(dx+1)+(k+m)],p);;
                     }
                  }
               }
            }
         }
      }

      //move current polynomial back to TEMP1
      // only rows 0..dz and columns 0..dx of the product are copied back
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP2[p2+k];
            TEMP[p2+k] = t1;
         }
      }

      //printf("this iteration:"); polprintbivar64s(TEMP,dx,dz,"x","y");printf("\n");

   }

   //Compare Check A - mul(F) = 0
   // the difference overwrites TEMP; it is not inspected before TEMP is freed
   for(j=0;j<=dz;j++){
      p1 = j*(dx+1);
      for(k=0;k<=dx;k++){
         t1 = sub64s(A[p1+k],TEMP[p1+k],p);
         TEMP[p1+k] = t1;
      }
   }
   free(TEMP);
   free(TEMP2);

   return;

}



//Divide bivariate A by (y-alpha) using Shaw and Traub's method
// What the code does, in three in-place passes over the rows of A
// (row i = dx+1 coefficients starting at A[i*(dx+1)], i = 0..dz):
//   1. row i is multiplied by alpha^i                    (i = 1..dz)
//   2. for k = 0..dz-1 and j = dz-1 down to k: row j += row j+1
//   3. row i is multiplied by alpha^(-i)                 (i = 1..dz)
// Pass 2 is done directly on A when dx+1 <= 32; otherwise the columns are
// processed in blocks of 32 that are copied into W, updated there and copied back.
//   W - workspace; (dz+1)*32 entries are cleared and used
// NOTE(review): the net effect looks like a Taylor shift of A in y by alpha rather
// than a single division - confirm against the caller.
// NOTE(review): modinv64s(alpha,p) presumably requires alpha to be invertible mod p.
void altDiv2(LONG *A,int dx, int dz, LONG alpha, LONG* W, LONG p){

   //local variables
   int i,j,k,b,blockSize,blocks;
   LONG x0,I,t,*p1,*pj,*pA,*pA2,*pW,*pW2;
   // timing accumulators; they are only consumed by the commented-out printfs at the end
   clock_t T1,T2,s1,s2,s3;

   s1=0;s2=0;s3=0;

   //Calculate A*x_0^j
   x0 = 1;
   blockSize = 32;
   blocks = (dx + blockSize)/blockSize;     // number of column blocks, the last one possibly short
   b = dx+1 - (blocks-1)*blockSize;         // width of the last block
   cleanArray64s(W,(dz+1)*blockSize);

   //iterative case
   // pass 1: x0 runs through alpha, alpha^2, ... and scales rows 1..dz
   T1 = clock();
   pj = &A[dx+1];
   for(i=1;i<=dz;i++){
      x0 = mul64s(x0,alpha,p);
      for (j=0;j<=dx;j++){
         pj[j] = mul64s(pj[j],x0,p);
      }
      pj = pj + dx+1;
   }
   T2 = clock();
   //printf("Mult1=%8.2fms\n",(T2-T1)/1000.0/1.0);


   //if there is one block of code
   if(blocks==1){
      // pass 2 directly on A: pA is row j, pA2 is row j+1
      for(k=0;k<=dz-1;k++){
         pA = &A[(dz-1)*(dx+1)]; pA2 = pA + dx+1;
         for(j=dz-1;j>=k;j--){
            poladdFast(pA,pA2,pA,dx,p);
            pA2 -= (dx+1); pA -= (dx+1);
         }
      }
   }
   //if there are multiple blocks of code
   else{

       //For each main block of data, compress Data, add it, then return it
       for(i=1;i<blocks;i++){

          //Move data
          // gather columns (i-1)*32 .. (i-1)*32+31 of every row into W, packed with row stride 32
          T1 = clock();
          pW = W; pA = &A[(i-1)*blockSize];
          for(j=0;j<=dz;j++){
             for(k=0;k<blockSize;k++){
                pW[k] = pA[k];
             }
             pW += blockSize; pA += dx+1;
          }
          T2 = clock();
          s1 =s1+ T2-T1;

          //Main addition Loop
          // pass 2 on the packed block: pW is row j, pW2 is row j+1
          T1 = clock();
          for(k=0;k<=dz-1;k++){
            pW = &W[(dz-1)*blockSize]; pW2 = pW + blockSize;
             for(j=dz-1;j>=k;j--){
                //poladdFast(&W[j*blockSize],&W[(j+1)*blockSize],&W[j*blockSize],blockSize-1,p);
                poladdFast(pW,pW2,pW,blockSize-1,p);
                pW2 = pW; pW -= blockSize;
             }
          }
          T2 = clock();
          s2 = s2+ T2-T1;

          // scatter the updated block back into A
          T1 = clock();
          pW = W; pA = &A[(i-1)*blockSize];
          for(j=0;j<=dz;j++){
             for(k=0;k<blockSize;k++){
                 pA[k] = pW[k];
             }
             pW += blockSize; pA += dx+1;
          }
          T2 = clock();
          s3 = s3 + T2-T1;

       }

       //Final, mis-sized block
       // same gather / add / scatter sequence with row stride b instead of 32
       if(b>0){

           T1 = clock();
           pW = W; pA = &A[(blocks-1)*blockSize];
           for(j=0;j<=dz;j++){
              for(k=0;k<b;k++){
                 pW[k] = pA[k];
              }
              pW += b; pA += dx+1;
           }
           T2 = clock();
           s1 = s1 + T2-T1;

           T1 = clock();
           for(k=0;k<=dz-1;k++){
              pW = &W[(dz-1)*b];
              pW2 = pW + b;
              for(j=dz-1;j>=k;j--){
                poladdFast(pW,pW2,pW,b-1,p);
                pW2 = pW; pW -= b;
              }
           }
           T2 = clock();
           s2 = s2 + T2-T1;

           T1 = clock();
           pW = W; pA = &A[(blocks-1)*blockSize];
           for(j=0;j<=dz;j++){
              for(k=0;k<b;k++){
                 pA[k] = pW[k];
              }
              pW += b; pA += dx+1;
           }
           T2 = clock();
           s3 = s3 + T2-T1;

        }
   }


   //multiply by inverse
   // pass 3: I runs through alpha^-1, alpha^-2, ... and scales rows 1..dz
   T1 = clock();
   x0 = modinv64s(alpha,p);
   I = x0;
   pj = &A[dx+1];
   for(i=1;i<=dz;i++){
      for(j=0;j<=dx;j++){
         pj[j] = mul64s(pj[j],I,p);
      }
      pj = pj + dx+1;
      I = mul64s(I,x0,p);
   }
   T2 = clock();

   /*printf("Shift 1 =%8.2fms\n",(s1)/1000.0/1.0);
   printf("Addition =%8.2fms\n",(s2)/1000.0/1.0);
   printf("Shift 2=%8.2fms\n",(s3)/1000.0/1.0);
   printf("Mult 2=%8.2fms\n",(T2-T1)/1000.0/1.0);
*/


   return;

}

void changebase3( LONG *f, LONG *g, LONG *F, LONG* dF, LONG n, LONG alpha, LONG p, recint P ) {
//  Input f a polynomial of degree n in Zp[x]
//  Output f(x+alpha) mod p in the array f
//  This version computes f(x+alpha) in O( M(n) ) instead of O( M(n) log n ).
//  All tables are supplied by the caller; nothing is precomputed here:
//    F[i]  multiplies f[i] before the product   (presumably i! mod p - confirm)
//    g     is the second factor of the product  (presumably alpha^i/i! mod p - confirm)
//    dF[i] multiplies the result afterwards     (presumably 1/i! mod p - confirm)
//  Since the method relies on 1/i! mod p it cannot be used for p<=n.
//  alpha itself is not read by this routine.
    LONG k,*prod;

    // scale the coefficients, then reverse so the product below is a correlation
    for( k=1; k<=n; k++ ) {
        f[k] = mulrec64(F[k],f[k],P);
    }
    polrev64s(f,n);

    // prod = f*g, of which only the low n+1 coefficients are used
    prod = array(2*n+1);
    polmul64s(f,g,prod,n,n,p);
    polrev64s(prod,n);

    // undo the scaling while copying back into f
    f[0] = prod[0];
    k = n;
    while( k>0 ) {
        f[k] = mulrec64(dF[k],prod[k],P);
        k--;
    }

    free(prod);
}

//Custom Functions

//Dot product of A[0..d] and B[0..d] reduced mod p.
//All d+1 products are accumulated with zfma into a two-word accumulator with no
//intermediate correction; zmod performs the single final reduction and the
//value left in the low word is returned.
LONG DotProduct(LONG *A, LONG *B, int d, LONG p){

   int k;
   ULONG acc[2];

   acc[0] = 0; acc[1] = 0;

   //main loop, three products per round
   for(k=0;k<=d-3;k+=3){
      zfma(acc,A[k],B[k]);
      zfma(acc,A[k+1],B[k+1]);
      zfma(acc,A[k+2],B[k+2]);
   }

   //leftover products
   for(;k<=d;k++){
      zfma(acc,A[k],B[k]);
   }

   zmod(acc,p);
   return acc[0];

}

//Dot product of A[0..d] and B[0..d] reduced mod p, variant with lazy correction:
//the high accumulator word is reduced by p (when it is >= p) after every second
//product in the main loop and after every product in the tail loop.
LONG DotProductP(LONG *A, LONG *B, int d, LONG p){

   int k;
   ULONG acc[2];

   acc[0] = 0; acc[1] = 0;

   //main loop, two products per round, one correction per round
   for(k=0;k<=d-2;k+=2){
      zfma(acc,A[k],B[k]);
      zfma(acc,A[k+1],B[k+1]);
      if(acc[1]>=p){acc[1] -=p;}
   }

   //leftover products
   for(;k<=d;k++){
      zfma(acc,A[k],B[k]);
      if(acc[1]>=p){acc[1] -=p;}
   }

   zmod(acc,p);
   return acc[0];

}

//Multiply-accumulate of two univariate polynomials mod p:
//    T[k] = T[k] + sum_{m} poly1[m]*poly2[k-m]   for k = 0..deg1+deg2.
//  poly1, deg1 - first factor and its degree
//  poly2, deg2 - second factor and its degree
//  T           - accumulator with at least deg1+deg2+1 entries; it is NOT cleared
//                here, so the caller must zero it to obtain a plain product
//  p           - modulus
//Each output coefficient is accumulated unreduced with zfma and reduced once
//with zmod before it is added into T.
//Returns 0. The original fell off the end of this non-void function, which is
//undefined behaviour as soon as a caller uses the value.
ULONG fastPolyMult(LONG *poly1,LONG *poly2, LONG *T, int deg1, int deg2, LONG p){

    //local variables
    int k,m,min,max;
    ULONG z[2];

    for(k=0;k<=deg1+deg2;k++){
       //range of m for which both poly1[m] and poly2[k-m] exist
       min = max32s(0,k-deg2);
       max = min32s(k,deg1);
       z[0] = 0; z[1] = 0;
       m = min;
       //main loop, three products per round
       while(m<=max-3){
          zfma(z,poly1[m],poly2[k-m]);m++;
          zfma(z,poly1[m],poly2[k-m]);m++;
          zfma(z,poly1[m],poly2[k-m]);m++;
       }
       while(m<=max){
          zfma(z,poly1[m],poly2[k-m]);m++;
       }
       zmod(z,p);
       T[k] = add64s(z[0],T[k],p);
     }

    return 0;
}

//perform the Extended Euclidean Algorithm
// Builds the partial products of the n polynomials in U, stores them in M and
// then checks with polgcdext64s that each U_i is coprime to the matching entry of M.
//   U - n polynomials of degree d, polynomial i at U + i*(d+1)
//   M - output: the running products U_n, U_n*U_(n-1), ..., stored so that the
//       longest product (degree (n-1)*d) sits at offset 0 and the shortest last
//   W - workspace
// Returns 0 on success and -1 as soon as a gcd of positive degree is found.
int MultiEEA(LONG *U, int d, int n, LONG *M, LONG *W, LONG p){
//size of U = n*(d+1)
//size of M = ...
//size of W = 2*((n-1)*d+1) + d+1
// NOTE(review): the code below clears 4*(n*d+1) + (d+1) entries of W and also hands
// W + 2*((n-1)*d+1) + d+1 to polgcdext64s as scratch, so the size stated above
// looks too small - confirm what callers allocate.

   //local variables
   int i,j,du,dm,ds,dt,dg,dmCalc,mPos,point;
   LONG *u,*m,*mCalc,*g,*Wptr,t,alpha2;

   //clean array
   cleanArray64s(W,4*(n*d+1) + (d+1));

   //set initial array pointers
   mCalc = W;
   g = mCalc + (n-1)*d+1;

   //define initial position in the M array
   // mPos = sum_{i=2..n-1} (i*d+1): the offset of the shortest product
   mPos = 0;
   for(i=n-1;i>1;i--){
      mPos = mPos + i*d+1;
   }

   mCalc[0] = 1;
   dmCalc = 0;

   //Main loop
   // walk U from the last polynomial to the second, growing the running product in mCalc
   u = U + (n-1)*(d+1);
   for (i=n;i>=2;i--){

      //calculate mcalc*U_i
      // NOTE(review): mCalc is both an input and the output here; this relies on
      // polmul64s tolerating that aliasing - confirm.
      dmCalc = polmul64s(mCalc,u,mCalc,dmCalc,d,p);
      //add new mcalc to M
      polcopy64s(mCalc,dmCalc,M+mPos);
      // step back to the slot of the next, longer product
      mPos = mPos - ((n-i+2)*d + 1);
      //update u pointer
      u -= d+1;
   }


   //make sure gcd(u_i,M_i) = 1 for all M
   // workspace layout from here on: mCalc | g | u | Wptr (scratch for polgcdext64s)
   u = g + (n-1)*d+1;
   Wptr = u + d+1;
   dmCalc = (n-1)*d;
   mPos = 0;

   for(i=0;i<n;i++){

      //move data to u,mCalc
      // NOTE(review): the first loop stores only n-1 products, but this loop runs n
      // times; on the last pass (dmCalc == 0) M+mPos points just past the stored
      // products - confirm M holds a valid entry there.
      polcopy64s(U+i*(d+1),d,u);
      polcopy64s(M+mPos,dmCalc,mCalc);
      polgcdext64s(u,mCalc,d,dmCalc,g,0,0,&dg,&ds,&dt,Wptr,p);

      //fail state
      if(dg>0) return -1;


      // advance to the next, shorter product
      dmCalc -= d;
      mPos += (n-i-1)*d+1;
   }

   return 0;

}

//For i = 1..n-1 runs the extended Euclidean routine on (M_i, U_i) and keeps the
//first cofactor.
//  U    - n polynomials, polynomial i at U + (i-1)*(du+1), degree bound du
//  M    - packed polynomials; entry i has degree bound (n-i)*du and the entries follow each other
//  S    - output: cofactor i is written at S + (i-1)*(dx+1)
//  sDeg - output: sDeg[i-1] receives the degree reported for cofactor i
//  W    - workspace; 4*n*(du+1) entries are cleared here
//  p    - modulus
void generateGcdexS(LONG *U, int n, int du, LONG *M, int dx, LONG *S, int *sDeg, LONG *W, LONG p){

    int idx,degU,degM,boundM,dg,ds,dt,offM;
    LONG *uBuf,*mBuf,*gBuf,*scratch;

    cleanArray64s(W,4*n*(du+1));

    //carve the workspace: copy of U_i | copy of M_i | gcd | scratch for polgcdext64s
    uBuf = W;
    mBuf = uBuf + du+1;
    gBuf = mBuf + (n-1)*du+1;
    scratch = gBuf + n*du+1;

    offM = 0;

    for (idx=0;idx<n-1;idx++){

       boundM = (n-1-idx)*du;

       //work on private copies and use the true degrees
       polcopy64s(U+idx*(du+1),du,uBuf);
       degU = poldeg64s(uBuf,du);
       polcopy64s(M+offM,boundM,mBuf);
       degM = poldeg64s(mBuf,boundM);
       offM += boundM+1;

       //only the first cofactor is requested (the second output pointer is 0)
       polgcdext64s(mBuf,uBuf,degM,degU,gBuf,S+idx*(dx+1),0,&dg,&ds,&dt,scratch,p);
       sDeg[idx] = ds;

    }

}

// Precomputes the Lagrange basis polynomials for the dx+1 points in xPoints and
// stores them in A in the layout that LagInterpEval reads.
//   A       - output, (dx+1)*(dx+1) entries
//   xPoints - dx+1 interpolation points
//   W       - workspace
//   p       - modulus
// NOTE(review): the master polynomial is started as x (W[1] = 1) and only the
// factors for xPoints[1..dx] are multiplied in, so xPoints[0] is assumed to be 0.
// NOTE(review): for even dx the packing step writes 2*size*(size-1)+1 entries of W,
// which exceeds the 2*dx+6 entries used by the first part once dx is large enough -
// confirm the workspace the callers provide.
void LagInterpSetup(LONG *A,LONG *xPoints,int dx, LONG *W, LONG p){

    //local variables
    LONG t,a,b,*p1,*p2;
    int i,j,k,m,dm,point1,point2,size;
    ULONG z[2];

    size = (dx+1)/2+1;

    // W[0..dx+1]: master polynomial; p1: two-entry linear factor; p2: dx+2 entries of scratch
    cleanArray64s(W,2*dx+6);
    p1 = &W[dx+2];
    p2 = &W[dx+4];

    //calculate master polynomial
    W[1] = 1;
    p1[1] = 1;
    for(i=1;i<=dx;i++){
       p1[0] = p-xPoints[i];
       polmul64s(W,p1,W,i,1,p);
    }

    //calculate Lagrange polynomials
    for(i=0;i<=dx;i++){
       //copy the master polynomial to spare space
       for(j=0;j<=dx+1;j++){p2[j] = W[j]; }
       p1[0] = p-xPoints[i];
       //perform division
       dm = poldiv64s(p2,p1,dx+1,1,p);
       //move polynomial to A
       // row i of A takes p2[1..dx+1]; presumably poldiv64s leaves the quotient there - confirm
       for(j=0;j<=dx;j++){ A[i*(dx+1)+j] = p2[j+1]; }

    }

    //calculate inverse and multiply every term by it
    p1 = A;
    for(i=0;i<=dx;i++){
        //calculate denom
        // a = product over j != i of (xPoints[i] - xPoints[j])
        a = 1;
        b = xPoints[i];
        for (j=0;j<=dx;j++){
            if (i!=j){
                a = mul64s(a,sub64s(b,xPoints[j],p),p);
            }
        }
        a = modinv64s(a,p);
        //multiply every term by a
        for(j=0;j<=dx;j++){
            if(p1[j]!=0){
                p1[j] = mul64s(p1[j],a,p);
            }
        }
        p1 = p1 + dx+1;
    }

    // leftover debug loop; its body is commented out
    for(i=0;i<=dx;i++){
        //polprint64s(&A[i*(dx+1)],dx);printf("\n");
    }

    //transpose matrix (efficiently)
    cleanArray64s(W,2*(size)*(size-1) + 1);


    // even dx: pack A into one scalar followed by two matrices with row stride `size`;
    // only row 0 and the odd-numbered rows of A are read
    if(dx%2 ==0){


        //single entry,
        W[0] = A[0];

        //even case
        // first matrix: even-index coefficients (2,4,...,dx)
        p1 = &W[1];
        //firstRow
        // column 0 comes from row 0 of A
        for(i=2,k=0;i<=dx+1;i+=2,k++){ p1[k*size] = A[i]; }
        //each column (even elements)
        // column k (k >= 1) comes from row 2k-1 of A
        for(i=2,k=1;i<=dx+1;i+=2,k++){
            for(j=2,m=0;j<=dx+1;j+=2,m++){
                p1[m*(size)+k] = A[(i-1)*(dx+1) + j];
            }
        }

        //odd
        // second matrix: odd-index coefficients (1,3,...,dx-1), same column sources
        p1 = p1 + size*(size-1);
        //firstRow
        for(i=1,k=0;i<=dx+1;i+=2,k++){ p1[k*size] = A[i]; }
        //each column (even elements)
        for(i=2,k=1;i<=dx+1;i+=2,k++){
            for(j=1,m=0;j<=dx+1;j+=2,m++){
                p1[m*(size)+k] = A[(i-1)*(dx+1) + j];
            }
        }
         //move elements back to A
         for(i=0;i<2*size*(size-1)+1;i++){ A[i] = W[i]; }
    }
    else{
       //transpose matrix A
       // odd dx: plain in-place transpose of the (dx+1) x (dx+1) matrix
       for(i=0;i<=dx;i++){
          for(j=i+1;j<=dx;j++){
              t = A[i*(dx+1)+j]; A[i*(dx+1)+j] = A[j*(dx+1)+i]; A[j*(dx+1)+i] = t;
          }
       }
    }

    return;

}

//Perform Lagrange interpolation with the tables prepared by LagInterpSetup.
//  LagPolys - Lagrange polynomial table in the layout written by LagInterpSetup
//  yPoints  - dx+1 values to interpolate (the x points are those given to LagInterpSetup)
//  Delta    - output: the dx+1 coefficients of the interpolated polynomial
//  dx       - degree of the interpolant
//  W        - scratch, (dx+1)/2+1 entries are used when dx is even
//  p        - modulus; DotProduct is used for p < 2^50, DotProductP otherwise
//  nummuls  - counter, incremented by one in the even case
//
//Even dx uses the packed layout: the table holds one scalar, then a matrix for
//the even-index coefficients and a matrix for the odd-index coefficients. The
//values at the paired points (yPoints[2i-1], yPoints[2i]) are combined by sum
//for the even coefficients and by difference for the odd ones.
//Odd dx uses the plain transposed matrix, one dot product per coefficient.
//
//Fixes relative to the previous version:
//  * odd dx with p >= 2^50 discarded the DotProductP result and never wrote Delta[i];
//  * dx == 0 read yPoints[-1]; the last pair is now produced by the same loop
//    as the others, which is empty for dx == 0.
void LagInterpEval(LONG *LagPolys,LONG *yPoints,LONG *Delta, int dx, LONG *W, LONG p, LONG *nummuls){

    //local variables
    int i,j,size;
    int smallPrime;
    LONG *row,*V;

    smallPrime = (p < 1LL << 50);

    if(dx%2==0){

        size = (dx+1)/2 + 1;
        V = W;
        cleanArray64s(V,size);
        V[0] = yPoints[0];

        //constant coefficient comes from the single leading table entry
        Delta[0] = mul64s(LagPolys[0],yPoints[0],p);
        *nummuls += 1;

        //even-index coefficients: V[i] = y[2i-1] + y[2i]
        for(i=1,j=1;i<size;i++,j+=2){
            V[i] = add64s(yPoints[j],yPoints[j+1],p);
        }
        row = &LagPolys[1];
        for(i=0,j=2;i<size-1;i++,j+=2){
           if(smallPrime) Delta[j] = DotProduct(row,V,size-1,p);
           else Delta[j] = DotProductP(row,V,size-1,p);
           row = row+size;
        }

        //odd-index coefficients: V[i] = y[2i-1] - y[2i]; row continues into the second matrix
        for(i=1,j=1;i<size;i++,j+=2){
            V[i] = sub64s(yPoints[j],yPoints[j+1],p);
        }
        for(i=0,j=1;i<size-1;i++,j+=2){
           if(smallPrime) Delta[j] = DotProduct(row,V,size-1,p);
           else Delta[j] = DotProductP(row,V,size-1,p);
           row = row+size;
        }
    }

    //when dx is odd
    else{
        Delta[0] = yPoints[0];
        for(i=1;i<=dx;i++){
           row = &LagPolys[i*(dx+1)];
           if(smallPrime) Delta[i] = DotProduct(row,yPoints,dx,p);
           else Delta[i] = DotProductP(row,yPoints,dx,p);
        }
    }

    return;
}



//Calculate the content of the polynomial F(x,y) in terms of y: the gcd of the
//y-polynomials that appear as coefficients of x^0 .. x^du.
//  F      - coefficients, F[j*(dx+1)+i] is the coefficient of x^i*y^j
//  cont   - output buffer with dz+1 entries, passed to polgcd64s as first operand
//           (presumably the gcd is left there - confirm against polgcd64s)
//  du     - highest power of x to inspect, dx - row stride minus one, dz - degree bound in y
//  p      - modulus
//  W      - scratch with dz+1 entries
//Returns the degree of the content, or -1 when every inspected column is zero
//(cont is then all zero). The original searched for the first non-zero column
//without an upper bound and ran past column du on a zero input.
int Content(LONG *F, LONG* cont, int du, int dx, int dz, LONG p, LONG *W){

int i,j,da,db;

	//find the first non-zero column and use it as the initial content
	i=0;
	cleanArray64s(cont,dz+1); cleanArray64s(W,dz+1);
	da = -1;
	while(da < 0 && i <= du){
		for(j=0;j<=dz;j++){
				cont[j] = F[j*(dx+1)+i];
				if(cont[j] != 0){da=j;}
		}
		i++;
	}

	//fold every remaining non-zero column into the gcd
	while(i<=du){
	   db = -1;  cleanArray64s(W,dz+1);
	   for(j=0;j<=dz;j++){
		   W[j] = F[j*(dx+1)+i];
		   if(W[j] != 0){db = j;}
	   }
	  if(db >= 0){
		 da = polgcd64s(cont,W,da,db,p);
	  }
	  i++;
	}

	return da;

}

void scalefactors( LONG *F, LONG *F0, int r, LONG du, LONG dx, LONG dy, LONG p ) {
    // Input: F is the answer of BHL (HenselLiftCubic), where f[rr][(dx+1)*j+i] = coeff( coeff(f[rr],y-alpha,j), x, i)
	// f[1] is at F[0], f[2] at F[2*(dx+1)*(dy+1)], f[3] at F[4*(dx+1)*(dy+1)] etc. f[r] at F[(r-1)*2*(dx+1)*(dy+1)]
    // r is the number of factors, F0 is the list of initial factors f0[rr], rr=1..r,
	// stored with stride du+1.
	// Output: Scale the answer to satisfy Eval(f[rr],y=alpha) mod p = f0[rr], rr=1..r.
	// Every factor is multiplied by c = LCF0/LCF, where LCF is the leading x-coefficient
	// of its row j=0 and LCF0 the coefficient of the same power in f0[rr].
	// The three diagnostic lines per factor are still printed; they now use %lld,
	// because the values are long long and %d was undefined behaviour.
	// A factor whose row j=0 is entirely zero has no leading coefficient and is
	// skipped (the original read one element before the factor in that case).
	LONG *Fr, k, dfx, c, LCF, LCF0, idx, len; int rr;
	len = (dx+1)*(dy+1);
	for ( rr=1; rr<=r; rr++ ) {
	    Fr = &F[2*(rr-1)*len];
		// locate the leading non-zero coefficient in x of row j=0
		k = dx; while ( k>=0 && Fr[k] == 0 ) k--;
		if ( k<0 ) continue;
		dfx = k; LCF = Fr[k];  printf("dfx = %lld, LCF = %lld\n",dfx, LCF);
		LCF0 = F0[(rr-1)*(du+1)+dfx]; printf("LCF0 = %lld\n",LCF0);
		c = mul64s( modinv64s( LCF, p ), LCF0, p ); printf("c=%lld\n",c);
		// scale all (dx+1)*(dy+1) coefficients of this factor
		for ( idx=0; idx<len; idx++ ) {
		    Fr[idx] = mul64s( c, Fr[idx], p );
		}
	}
	return;
}

// a(x) = a0 + a1 x + a2 x^2 + a3 x^3 + a4 x^4  in Zp[x]
// a(x+b) = a0 + a1 (x+b) + a2 (x+b)^2 + a3 (x+b)^3 + a4 (x+b)^4  in Zp[x]
//        = a0 + (x+b) [ a1 + (x+b) [ a2 + (x+b) [ a3 + (x+b) a4 ] ] ]

LONG TaylorShift2( LONG *a, LONG d, LONG b, LONG p ) {
// Replaces a(x) by a(x+b) mod p in place using repeated Horner steps.
//   a - d+1 coefficients, d - degree, b - shift, p - modulus
// Returns d; for d == -1 (zero polynomial) and d == 0 nothing is modified.
    LONG lo, pos;
    if( d==-1 || d==0 ) return d;
    lo = d-1;
    while ( lo>=0 ) {
        // one Horner sweep over coefficients lo..d-1
        for ( pos=lo; pos<d; pos++ ) {
            a[pos] = add64s( a[pos],mul64s(b,a[pos+1],p),p );
        }
        lo--;
    }
    return d;
}


void ExpandF2( LONG *F, int r, LONG dx, LONG dy, LONG alpha, LONG p) {
	// Input: F is the answer of BHL (scaled), where f[rr][(dx+1)*j+i] = coeff( coeff(f[rr],y-alpha,j), x, i)
	// Factor rr (rr = 1..r) starts at F[(rr-1)*2*(dx+1)*(dy+1)].
	// The input alpha is p - alpha[j] in CMSHL Maple code.
	// Output: F is overwritten with the expanded factors. For every power of x the
	// column of y-coefficients is copied out, shifted with TaylorShift2 and copied back.
	LONG *col, *base, stride, xi, yj, deg; int f;
	col = array(dy+1);
	stride = dx+1;
	for ( f=0; f<r; f++ ) {
	    base = F + 2*f*(dx+1)*(dy+1);
	    for ( xi=0; xi<=dx; xi++ ) {
	        // gather the y-polynomial attached to x^xi
	        for ( yj=0; yj<=dy; yj++ ) col[yj] = base[stride*yj+xi];
	        // its actual degree (-1 when the whole column is zero)
	        deg = dy; while ( deg>=0 && col[deg]==0 ) deg--;
	        TaylorShift2( col, deg, alpha, p );
	        for ( yj=0; yj<=dy; yj++ ) base[stride*yj+xi] = col[yj];
	    }
	}
	free(col);
}

//Perform bivariate polynomial multiplication between 2 polynomials: C = A*B mod p.
//Utilizes evaluation-interpolation in x: both inputs are evaluated at
//0,1,-1,2,-2,..., the univariate images in z are multiplied with polmul64s and
//each z-coefficient is interpolated back with LagInterpEval.
//  A, dxA, dzA - first factor, rows of dxA+1 coefficients, row k belongs to z^k
//  B, dxB, dzB - second factor, rows of dxB+1 coefficients
//  C           - output, rows of dx+1 coefficients, rows 0..dz are written
//  dx, dz      - degree bounds of the product
//  p           - modulus
//  W           - workspace
//Always returns 0.
//
//Fix relative to the previous version: B was indexed with row stride dzB+1
//instead of dxB+1, so B was read wrongly whenever dxB != dzB.
//
//NOTE(review): TEMP is given DZ+1 entries but receives DX+1 interpolated
//coefficients; when DX > DZ it spills into evalPoints, which is no longer needed
//at that point. The layout is left untouched so that existing workspace sizes
//stay valid.
int polmulbivar64s(LONG* A, LONG* B, LONG* C, int dxA, int dzA, int dxB, int dzB, int dx, int dz, LONG p, LONG *W){
    //The amount of space used is: (dx+1)^2 + (dx+5)(dz+1) + 2dx + 6
    //where dx = dxA+dxB+1 and dz = dzA+dzB

   //local variables
   int DX, DZ;
   LONG i,j,k, m, r;
   LONG *p1, *V, *TEMP, *evalPoints, *lagInterpPolys, *evalProducts, *evalFactors, *vector;

   DX = dx; DZ = dz;

   //the symmetric point set 0,+-1,+-2,... needs an even DX
   if(DX%2==1){DX += 1;}


   //assign space
   V = W;
   TEMP = V;                V += (DZ+1);
   evalPoints  =  V;        V += (DX+1);
   lagInterpPolys  =  V;  V += (DX+1)*(DX+1);
   evalProducts  =  V;    V += (DX+1)*(DZ+1);
   evalFactors  =  V;      V +=   2*(DZ+1);
   vector =  V;              V +=   (DX+1);

   //set evaluation points 0, 1, -1, 2, -2, ...
   evalPoints[0] = 0;
   p1 = evalPoints+1;
   for(i=1;i<=DX/2;i++){
      p1[0] = i;
      p1[1] = p-i;
      p1 +=2;
   }

   //set up polynomials for interpolation
   LagInterpSetup(lagInterpPolys,evalPoints,DX,V,p);


   //perform evaluation + multiplication
   cleanArray64s(evalProducts,(DX+1)*(DZ+1));
   for(j=0;j<=DX;j++){ //for each evaluation point

        cleanArray64s(evalFactors,2*(DZ+1));

        //evaluate A at x = evalPoints[j]; r runs through the powers of the point
        r = 1;
        for(m=0;m<=dxA;m++){// for each x
            for(k=0;k<=dzA;k++){ //iterate through the relative z's
                evalFactors[k] = add64s(evalFactors[k],mul64s(A[k*(dxA+1) + m],r,p),p);
            }
            r=mul64s(r,evalPoints[j],p);
        }

        //evaluate B the same way into the second half of evalFactors
        r=1;
        for(m=0;m<=dxB;m++){// for each x
            for(k=0;k<=dzB;k++){ //iterate through the relative z's
                evalFactors[(DZ+1)+k] = add64s(evalFactors[(DZ+1)+k],mul64s(B[k*(dxB+1) + m],r,p),p);
            }
            r=mul64s(r,evalPoints[j],p);
        }

        //multiply the polynomials together
        polmul64s(evalFactors,&evalFactors[DZ+1],&evalProducts[j*(DZ+1)],dzA,dzB,p);

   }

   //perform interpolation then populate C with the results
   for(i=0;i<=DZ;i++){

        //create vector to interpolate: coefficient of z^i at every point
        for(j=0;j<=DX;j++){ vector[j] = evalProducts[j*(DZ+1)+i]; }

        cleanArray64s(TEMP,DX);

        //perform Lagrange Interpolation (r only serves as the multiplication counter)
        LagInterpEval(lagInterpPolys,vector,TEMP,DX, V, p, &r);

        //populate C
        for(j=0;j<=dx;j++){C[i*(dx+1)+j] = TEMP[j];}

   }

   return 0;

}

//Bivariate polynomial addition c = a + b mod p, done row by row (one row per power of y).
//  a, dxA, dyA - first operand, rows of dxA+1 coefficients, rows 0..dyA
//  b, dxB, dyB - second operand, rows of dxB+1 coefficients, rows 0..dyB
//  c, dx       - result, rows of dx+1 coefficients
//  dy          - not used
//Rows present in both operands are added with poladd64s; the extra rows of the
//taller operand are copied with polcopy64s. Always returns 0.
int poladdbivar64s(LONG *a, LONG *b, LONG *c, int dxA, int dyA, int dxB, int dyB, int dx, int dy, LONG p){

   int row, shared;

   //rows that exist in both operands
   shared = min32s(dyA,dyB);
   for(row=0;row<=shared;row++){
      poladd64s(&a[row*(dxA+1)],&b[row*(dxB+1)],&c[row*(dx+1)],dxA,dxB,p);
   }

   //at most one of the two loops below runs: the remaining rows of the taller operand
   for(;row<=dyA;row++){
      polcopy64s(&a[row*(dxA+1)],dxA,&c[row*(dx+1)]);
   }
   for(;row<=dyB;row++){
      polcopy64s(&b[row*(dxB+1)],dxB,&c[row*(dx+1)]);
   }

   return 0;

}

//Perform bivariate polynomial multiplication between n >= 2 polynomials:
//C = A[0]*A[1]*...*A[n-1] mod p.
//Utilizes evaluation-interpolation in x: every factor is evaluated at
//0,1,-1,2,-2,..., the univariate images in z are multiplied together and each
//z-coefficient is interpolated back with LagInterpEval.
//  A        - the n factors; A[i] has rows of dxA[i]+1 coefficients, row k belongs to z^k
//  dxA, dzA - degrees of the factors in x and z
//  C        - output, rows of dx+1 coefficients, rows 0..dz are written
//  dx, dz   - degree bounds of the product
//  p        - modulus
//  W        - workspace
//Always returns 0.
//
//Fixes relative to the previous version:
//  * the evaluation points were filled only up to dx/2 instead of DX/2, leaving
//    the last two points uninitialised whenever dx is odd;
//  * factor i was indexed with row stride dzA[i]+1 instead of dxA[i]+1.
//
//NOTE(review): the running product is multiplied in place (output == first input
//of polmul64s); this relies on polmul64s tolerating that aliasing - confirm.
//NOTE(review): TEMP is given DZ+1 entries but receives DX+1 interpolated
//coefficients; when DX > DZ it spills into evalPoints, which is no longer needed
//at that point. The layout is left untouched so that existing workspace sizes
//stay valid.
int polmulbivarN64s(LONG ** A, LONG* C, int * dxA, int * dzA, int dx, int dz, int n, LONG p, LONG *W){
    //The amount of space used is: (dx+1)^2 + (dx+3)(dz+1) + n(dz+1) + 2dx+6
    //where dx = sum(dxA)+1 and dz = sum(dzA)

   //local variables
   int DX, DZ;
   LONG i,j,k, m, r;
   LONG *p1, *V, *TEMP, *evalPoints, *lagInterpPolys, *evalProducts, *evalFactors, *vector;

   DX = dx; DZ = dz;

   //the symmetric point set 0,+-1,+-2,... needs an even DX
   if(DX%2==1){DX += 1;}

   //assign space
   V = W;
   TEMP = V;                V += (DZ+1);
   evalPoints  =  V;        V += (DX+1);
   lagInterpPolys  =  V;  V += (DX+1)*(DX+1);
   evalProducts  =  V;    V += (DX+1)*(DZ+1);
   evalFactors  =  V;      V +=   n*(DZ+1);
   vector =  V;              V +=   (DX+1);

   //set evaluation points 0, 1, -1, 2, -2, ... (all DX+1 of them)
   evalPoints[0] = 0;
   p1 = evalPoints+1;
   for(i=1;i<=DX/2;i++){
      p1[0] = i;
      p1[1] = p-i;
      p1 +=2;
   }

   //set up polynomials for interpolation
   LagInterpSetup(lagInterpPolys,evalPoints,DX,V,p);

   //perform evaluation + multiplication
   cleanArray64s(evalProducts,(DX+1)*(DZ+1));
   for(j=0;j<=DX;j++){ //for each evaluation point

        cleanArray64s(evalFactors,n*(DZ+1));

        //evaluate the n polynomials; r runs through the powers of the point
        for(i=0;i<n;i++){
            r = 1;
            for(m=0;m<=dxA[i];m++){// for each x
                for(k=0;k<=dzA[i];k++){ //iterate through the relative z's
                    evalFactors[i*(DZ+1) + k] = add64s(evalFactors[i*(DZ+1) + k],mul64s(A[i][k*(dxA[i]+1) + m],r,p),p);
                }
               r=mul64s(r,evalPoints[j],p);
            }
        }

        //multiply the polynomials together: start from the first image, then fold in the rest
        cleanArray64s(&evalProducts[j*(DZ+1)],DZ);
        for(i=0;i<=dzA[0];i++){
            evalProducts[j*(DZ+1)+i] = evalFactors[i];
        }
        r = dzA[0];   //r now tracks the degree of the running product
        for(i=0;i<n-1;i++){
           r = polmul64s(&evalProducts[j*(DZ+1)],&evalFactors[(i+1)*(DZ+1)],&evalProducts[j*(DZ+1)],r,dzA[i+1],p);
        }
   }

   //perform interpolation then populate C with the results
   for(i=0;i<=DZ;i++){

        //create vector to interpolate: coefficient of z^i at every point
        for(j=0;j<=DX;j++){ vector[j] = evalProducts[j*(DZ+1)+i]; }

        //perform Lagrange Interpolation (r only serves as the multiplication counter)
        LagInterpEval(lagInterpPolys,vector,TEMP,DX, V, p, &r);

        //populate C
        for(j=0;j<=dx;j++){C[i*(dx+1)+j] = TEMP[j];}

   }

   return 0;

}
