//V35

#define DEBUG 0

#define M_INT long long int
#define LONG long long int
#define ULONG unsigned long long int
#define UINT32 unsigned int
#define UINT64 unsigned long long

#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>

#include "polybivar1.c"

#define WORDSIZE 64

ULONG seed, mult;
LONG rand64s(LONG p);

/*
 * Makes sure a coefficient array has room for index d1.
 *
 * A      - current coefficient array
 * H      - address of the bump pointer into the shared pool; advanced by the
 *          new capacity whenever a fresh array is handed out
 * Space  - address of the current capacity of A; overwritten with the new
 *          capacity when the array is moved
 * d1     - highest index that must be addressable
 * d2     - highest index of A that currently holds data and must be kept
 * k      - the new capacity is 2^k entries
 *
 * Returns A unchanged when d1 + 1 <= *Space. Otherwise returns the old value
 * of *H, with entries 0..d2 copied from A and the remaining entries up to the
 * new capacity set to 0. The old array is not released.
 */
LONG * extendSpace(LONG *A, LONG ** H, LONG * Space, LONG d1, LONG d2, LONG k){

    LONG *fresh, capacity;
    int idx;

    /* Already large enough: nothing to move. */
    if (!(d1 + 1 > *Space)){
        return A;
    }

    /* Take the next free slot of the pool and reserve 2^k entries in it. */
    fresh = *H;
    capacity = 1;
    for(idx = 0; idx < k; idx++){
        capacity *= 2;
    }
    *Space = capacity;
    *H += capacity;

    /* Keep the live coefficients ... */
    for(idx = 0; idx <= d2; idx++){
        fresh[idx] = A[idx];
    }
    /* ... and zero everything above them. */
    for(idx = d2 + 1; idx < *Space; idx++){
        fresh[idx] = 0;
    }

    return fresh;

}

/*
 * Returns a workspace size, in LONG entries, computed from the number of
 * factors n and the degree bounds dx, dz and du.
 *
 * The result is the sum of the fixed list of terms below; it only depends on
 * the arguments and on the helpers ceil2() and log2N().
 * NOTE(review): presumably this is the size of the TempSpace buffer expected
 * by HenselLiftCubic - confirm against the caller.
 */
LONG allocateSpaceCubic(int n, LONG dx, LONG dz, LONG du){

    LONG total, treeNodes, level, liftedDeg;

    liftedDeg = n*dz;

    /* 1 + n + ceil2(n) + ceil2(ceil2(n)) + ... down to (but excluding) 1 */
    treeNodes = 1;
    for(level = n; level > 1; level = ceil2(level)){
        treeNodes += level;
    }

    total  = (n-1)*n*du/2 + n;
    total += (dx+2)*(dz+1);
    total += 6*n*(du+1);
    total += 3*(dx+2);
    total += (dx+2)*(dx+2);
    total += (du+1)*(dx/2 + 2);
    total += (2*n+1)*(dx+2);
    total += 2*n*(dz+1);
    total += 4*(dx+2)*(liftedDeg+n)*(log2N(n)+1);
    total += 2*treeNodes*(dx+2);
    total += 32*(dz+1);
    total += (dx+2)*((dx+4)/2+1);

    return total;

}

//Evaluates row `iter` of every factor at x = 0,+-1,+-2,+-3,...
/*
 * G, H, Space, degrees - per-slot coefficient arrays, pool bump pointer,
 *                        capacities and degrees, all maintained through
 *                        extendSpace(); slot (e*numPolys + f) belongs to
 *                        evaluation point number e and factor f
 * F          - factor coefficients; factor f starts at F + f*2*(DX+1)*(dz+1)
 *              and row `iter` starts iter*(DX+1) entries further on
 * EvalPoints - table read in rows of du+1 entries, one row per i = 1..dx/2;
 *              entry m multiplies the coefficients of x^(2m) and x^(2m+1)
 *              (presumably (i*i)^m - confirm against the caller)
 * iter       - row being evaluated; the result is stored at index `iter`
 * n          - number of factors
 * degsX[f]   - x-degree of row `iter` of factor f
 * degsY[f]   - a factor is only evaluated at nonzero points when this
 *              equals `iter`
 * p          - modulus
 *
 * For x = 0 the constant coefficient is stored (only when it is nonzero).
 * For each i the row is split into an even part e and an odd part o*i, and
 * e+o*i goes to point slot 2i-1 while e-o*i goes to point slot 2i.
 */
void FastEval(LONG **G, LONG **H, LONG* Space, LONG* degrees, LONG *F, LONG *EvalPoints, int iter, int n, int numPolys, int *degsX, int *degsY, int du, int DX, int dz, LONG p){

   int pt, fac, deg, pw, evenDX, facStride, posSlot, negSlot;
   LONG *row, *scale, evenPart, oddPart;
   ULONG accEven[2], accOdd[2];

   /* round the x-degree up to an even number so points come in +- pairs */
   evenDX = DX;
   if(DX%2==1){evenDX += 1;}

   facStride = 2*(DX+1)*(dz+1);

   /* x = 0: the value is just the constant coefficient of the row */
   row = &F[iter*(DX+1)];
   for(fac=0; fac<n; fac++, row += facStride){
      if(*row == 0){ continue; }
      G[fac] = extendSpace(G[fac],H,&Space[fac],iter,degrees[fac],log2N(iter)+1);
      degrees[fac] = iter;
      G[fac][iter] = *row;
   }

   /* x = +-pt, one row of the scale table per pt */
   scale = EvalPoints;
   for(pt=1; pt<=evenDX/2; pt++, scale += du+1){

      row = F + iter*(DX+1);
      for(fac=0; fac<n; fac++, row += facStride){

         if(degsY[fac] != iter){ continue; }

         posSlot = (2*pt-1)*(numPolys)+fac;
         negSlot = (2*pt)*(numPolys)+fac;

         /* make room for index `iter` in both target arrays (+pt first) */
         G[posSlot] = extendSpace(G[posSlot],H,&Space[posSlot],iter,degrees[posSlot],log2N(iter)+1);
         G[negSlot] = extendSpace(G[negSlot],H,&Space[negSlot],iter,degrees[negSlot],log2N(iter)+1);
         degrees[posSlot] = iter;
         degrees[negSlot] = iter;

         /* accumulate the even and odd coefficients separately */
         accEven[0] = row[0]; accEven[1] = 0;
         accOdd[0] = row[1]; accOdd[1] = 0;

         if( p < 1LL << 50 ){ // if p < 2^50: no reduction of the high word inside the loop
            for(pw=1,deg=2; deg<=degsX[fac]; deg+=2,pw++){
               zfma(accEven,row[deg],scale[pw]);
               zfma(accOdd,row[deg+1],scale[pw]);
            }
         }
         else{
            for(pw=1,deg=2; deg<=degsX[fac]; deg+=2,pw++){
               zfma(accEven,row[deg],scale[pw]); if( accEven[1]>=p ) accEven[1] -= p;
               zfma(accOdd,row[deg+1],scale[pw]); if( accOdd[1]>=p ) accOdd[1] -= p;
            }
         }

         zmod(accEven,p); evenPart = accEven[0];
         zmod(accOdd,p); oddPart = mul64s(accOdd[0],pt,p);

         G[posSlot][iter] = add64s(evenPart,oddPart,p);
         G[negSlot][iter] = sub64s(evenPart,oddPart,p);
      }
   }

   return;
}

//calculate the content of polynomial F(x,y) in terms of y
/*int Content(LONG *F, LONG* cont, int du, int dx, int dz, LONG p, LONG *W){

int i,j,da,db;

	//get initial coefficient in x
	i=0;
	cleanArray64s(cont,dz+1); cleanArray64s(W,dz+1);
	da = -1;
	while(da < 0){
		for(j=0;j<=dz;j++){
				cont[j] = F[j*(dx+1)+i];
				if(cont[j] != 0){da=j;}
		}
		i++;
	}
	//polprint64s(cont,da,p);

	//get next and take gcd
	while(i<=du){
	   db = -1;  cleanArray64s(W,dz+1);
	   for(j=0;j<=dz;j++){
		   W[j] = F[j*(dx+1)+i];
		   if(W[j] != 0){db = j;}
	   }
	  if(db >= 0){
		 da = polgcd64s(cont,W,da,db,p);
	  }
	  i++;
	}

	return da;

}*/

/*
 * Computes the n "sigma" polynomials for the right-hand side c and writes
 * them to Sigmas.
 *
 * U       - n polynomials stored back to back, d+1 coefficients each
 * n       - number of polynomials in U
 * d       - degree bound of each polynomial in U
 * c       - right-hand side, read at indices 0..dx
 * dx      - highest index of c that is read
 * M       - packed products; block i (i = 1..n-1) holds (n-i)*d+1 entries
 * S       - precomputed multipliers, block i-1 starts at S + (i-1)*(dx+1)
 * sDeg    - sDeg[i-1] is the degree of block i-1 of S
 * W       - workspace; the first 2*(d+1) + 4*(n*d+1) + 2*(dx+1) entries are
 *           zeroed and carved up below, and a further 2*(dx+1) entries
 *           behind them are zeroed on every iteration
 * Sigmas  - output; n*(d+1) entries are zeroed, result i is then written
 *           starting at Sigmas + i*d (stride d, not d+1)
 * p       - modulus
 * numMuls - running multiplication counter, incremented here
 *
 * NOTE(review): the exact contracts of polmul64s / poldiv64s / polsub64s
 * (aliasing, where quotient and remainder are left) are not visible in this
 * file; the comments below describe only how their results are used here.
 */
void Dionvar(LONG *U, int n, int d, LONG *c, int dx, LONG *M, LONG *S, int *sDeg, LONG *W, LONG *Sigmas, LONG p, LONG *numMuls){
//Enter list of polynomials U of size (n*(d+1)), solution C of size at most (d+1), and polynomial M <- U[2]*U[3]...U[n]

//local variables
    int i,j,k,du,dm,dg,ds,dt,mPos,dck,drem,point;
    LONG t,*u,*uTemp,*m,*mTemp,*g,*s,*ck,*Wptr,*V;

    cleanArray64s(W,2*(d+1) + 4*(n*d+1) + 2*(dx+1));
    cleanArray64s(Sigmas,n*(d+1));

      // leftover debug loop: the body is commented out, so this has no effect
      for(i=0;i<n;i++){
      for(j=0;j<=d;j++){
         //printf("%d,   ",U[i*(d+1)+j]);
      }
   }

    // carve the zeroed workspace into the local buffers
    V = W;
    u = V;       V+= d+1;
    uTemp = V;   V+=  d+1;
    m = V;       V+=  n*d+1;
    mTemp = V;   V+=  n*d+1;
    s = V;       V+=  2*(dx+1);
    ck = V;      V+=  n*d+1;
    g = V;       V+= n*d+1;

    // read offset into the packed M array
    mPos = 0;

    //move c to ck and record the index of its last nonzero entry
    dck = 0;
    for (j=0;j<=dx;j++){
       t = c[j];
       ck[j] = t;
       if(t != 0){
          dck = j;
       }

    }

    //printf("c=");polprint64s(ck,dck);printf("\n");

    //Calculate the sigma term and subtract it from C
    for (i=1;i<n;i++){

       // reset the per-iteration scratch buffers
       cleanArray64s(uTemp,d+1);
       cleanArray64s(mTemp,(n-1)*d+1);
       cleanArray64s(s,2*(dx+1));
       cleanArray64s(g,n*d+1);
       cleanArray64s(V,2*(dx+1));

       //printf("ck:=");polprint64s(ck,dck);printf(":\n");

       //extract M_i,U_i
       // NOTE(review): du (and dm below) are only assigned when a nonzero
       // coefficient is found; for an all-zero block they keep their previous
       // value, or are unset on the first iteration - confirm callers never
       // pass a zero polynomial.
       point = (d+1)*(i-1);
       for (j=0;j<=d;j++){
          u[j] = U[point + j];
          uTemp[j] = U[point + j];
          if(U[point + j] != 0){
             du = j;
          }
       }

       //printf("ui:= "); polprint64s(uTemp,du); printf(":\n");
       for (j=0;j<(n-i)*d+1;j++){
          m[j] = M[mPos + j];
          mTemp[j] = M[mPos + j];
           if(M[mPos + j] != 0){
              dm = j;
            }
        }
        // advance to the next packed block of M
        mPos = mPos + (n-i)*d+1;

       //polgcdext64s(mTemp,uTemp,dm,du,g,s,0,&dg,&ds,&dt,&(W[wTemp]),p);

       //printf("S:");polprint64s(&S[(i-1)*(dx+1)],sDeg[i-1]);printf("\n\n");

       // s <- ck * S_(i-1)
       ds = polmul64s(ck,&S[(i-1)*(dx+1)],s,dck,sDeg[i-1],p);
       *numMuls = *numMuls + (dck+1)*(sDeg[i-1]+1);

       //printf("CS:");polprint64s(s,ds);printf("\n\n");

       // divide s by u; drem is the value returned by poldiv64s
       // (presumably the degree of the remainder, which stays in the low
       // entries of s - TODO confirm)
       drem = poldiv64s(s,u,ds,du,p);
       *numMuls = *numMuls + (du)*(ds-du+1);

       //printf("CS div F0:");polprint64s(s,drem);printf("\n\n");

       ds = ds - drem - d;

       // store entries 0..drem of s as sigma number i-1 (stride d)
       for(j=0;j<=drem;j++){
          t = s[j];
          Sigmas[(i-1)*(d)+j] = t;
       }
       // move the entries starting at index drem+d down to the front of s
       // and clear their old location
       for(j=0;j<=ds;j++){
          t = s[drem+j+d];
          s[j] = t;
          s[drem+j+d] = 0;
       }

       //calculate c - r*M_i (m is both an input and the output of each call)
       dm = polmul64s(m,&Sigmas[(i-1)*(d)],m,dm,drem,p);
       dm = polsub64s(ck, m, m, dck, dm, p);

       //c is reduced to 0, so the remaining sigmas must be 0.
       if(dm==-1){return;}

       //take quotient of c - r*M_i and U_i;
       drem = poldiv64s(m,u,dm,du,p);
       // entries du..dm of m become the new ck
       // NOTE(review): dck is not reset before this loop, so it keeps its old
       // value when every copied entry is zero.
       for(j=0;j<=dm-du;j++){
          t = m[du+j];
          ck[j] = t;
          if(t != 0){
             dck = j;
          }
       }
    }

    //Save final sigmas: whatever is left in ck becomes sigma number n-1
    for (j=0;j<=dck;j++){ Sigmas[(n-1)*d+j] = ck[j]; }

    return;
}

/*
 * Computes coefficient k of the product of the n polynomials G[0..n-1] by
 * walking a pairwise product tree stored in G.
 *
 * Level 0 occupies slots 0..n-1. A level of t slots starting at slot s has
 * its parent level starting at slot s+t: parent j is built from children
 * 2j and 2j+1, and when t is odd the last child is carried up unchanged.
 * Only entry k of each parent is computed here (through convol64s); the
 * other entries are left as they are.
 *
 * G, Space, degrees - per-slot arrays, capacities and degrees; parent slots
 *                     are grown with extendSpace() and their degrees updated
 * H                 - pool pointer handed to extendSpace()
 * n                 - number of leaf polynomials
 * k                 - index of the wanted coefficient
 * p                 - modulus
 *
 * Returns entry k of the root slot, or 0 when the root degree is below k.
 *
 * NOTE(review): H is received by value, so any advance extendSpace() makes
 * to it here is not visible to the caller - verify that growth never
 * actually happens inside this function.
 */
LONG getCoeff3(LONG ** G, LONG * H, LONG * Space, LONG *degrees,  int n, int k, LONG p){

     LONG pairIdx, leftDeg, rightDeg, levelSize, levelBase;
     LONG left, right, parent;
     LONG lo, hi;

     levelBase = 0;
     for(levelSize = n; levelSize > 1; levelBase += levelSize, levelSize = ceil2(levelSize)){

        /* combine the children two at a time */
        for(pairIdx = 0; pairIdx < levelSize/2; pairIdx++){
            left = levelBase + 2*pairIdx;
            right = left + 1;
            parent = levelSize + levelBase + pairIdx;

            leftDeg = degrees[left]; rightDeg = degrees[right];
            G[parent] = extendSpace(G[parent],&H,&Space[parent],leftDeg+rightDeg,degrees[parent],log2N(leftDeg+rightDeg)+1);
            degrees[parent] = leftDeg+rightDeg;

            if(k <= leftDeg+rightDeg){
                /* index range of the left child that contributes to x^k */
                lo = max32s(0,k-rightDeg); hi = min32s(k,leftDeg);
                G[parent][k] = convol64s(G[left], G[right], lo, hi, k,p);
            }
        }

        /* odd level: the unpaired last child is passed up as it is */
        if(levelSize%2==1){
            left = levelBase + 2*pairIdx;
            parent = levelSize + levelBase + pairIdx;

            G[parent] = extendSpace(G[parent],&H,&Space[parent],degrees[left],degrees[parent],log2N(degrees[left])+1);
            degrees[parent] = degrees[left];
            if(degrees[left]>=k){
               G[parent][k] = G[left][k];
            }
        }
     }

     /* levelBase now indexes the root of the tree */
     if(degrees[levelBase] >= k){
        return G[levelBase][k];
     }
     return 0;

}

/*
 * Builds a random test problem: n random bivariate factors, their images at
 * z = alpha, and their product A.
 *
 * A     - output product, stored as A[z*(dxA+1) + x]; (dxA+1)*(dzA+1) entries
 *         are cleared and then filled
 * F0    - output, factor i evaluated at z = alpha, dx+1 coefficients each
 * n     - number of factors
 * dx,dz - degree of every factor in x and in z
 * dxA   - expected to be n*dx
 * dzA   - expected to be n*dz
 * alpha - evaluation point for F0
 * p     - modulus
 * W     - unused; the scratch buffers are obtained from array() and freed
 *         before returning
 *
 * Every factor gets coefficients from rand64s(p) for x^0..x^(dx-1) in each
 * z-row, and the x^dx coefficient of its z^0 row is set to 1.
 *
 * The product is built one factor at a time. Rows of the running product are
 * i*dx+1 wide and rows of a factor are dx+1 wide, so the x-direction
 * convolution is bounded by i*dx and dx. The previous code bounded it by
 * i*dz and dz, which is only right when dx == dz.
 */
void createProblem(LONG *A, LONG *F0, int n, int dx, int dz, int dxA, int dzA, LONG alpha, LONG p, LONG *W){

    //Note: dxA = dx*n, dzA = dz*n

   LONG *TEMP,*TEMP2,*TEMP3,t1,alpha2,r;
   int i,j,k,l,p1,p2,p3,MIN,MAX,da,db,dc;

   (void)W;

   TEMP = array((dxA+1)*(dzA+1));   //the n factors, back to back
   TEMP2 = array((dxA+1)*(dzA+1));  //scratch
   TEMP3 = array((dxA+1)*(dzA+1));  //running product
   cleanArray64s(TEMP,(dxA+1)*(dzA+1));
   cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
   cleanArray64s(TEMP3,(dxA+1)*(dzA+1));
   cleanArray64s(A,(dzA+1)*(dxA+1));

   //generate the polynomials of F (A = f_1*f_2*...*f_n)
   for(i=0;i<n;i++){
      p1 = i*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<dx;k++){
            r = rand64s(p);
            TEMP[p1+p2+k] = r;
         }
      }
      //leading coefficient in x of the z^0 row
      TEMP[p1+dx] = 1;
   }

   //Generate F0 (ie. F_i(z=alpha))
   for(i=0;i<n;i++){
      cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
      p1 = i*(dx+1)*(dz+1);
      //move 1 of the n factors to TEMP2
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP[p1+p2+k];
            TEMP2[p2+k] = t1;
         }
      }

      //Evaluate it at z=alpha: add alpha^j * (row j) onto row 0
      alpha2=1;
      for(j=1;j<=dz;j++){
         alpha2 = mul64s(alpha2,alpha,p);
         p2 =  j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = mul64s(alpha2,TEMP2[p2+k],p);
            TEMP2[k] = add64s(TEMP2[k],t1,p);
         }
      }

      //move results to F0
      p1 = i*(dx+1);
      for(j=0;j<=dx;j++){
         t1 = TEMP2[j];
         F0[p1+j] = t1;
      }
   }

   //the running product starts out as the first factor
   if(n > 0){
      for(j=0;j<=dz;j++){
         for(k=0;k<=dx;k++){
            t1 = TEMP[j*(dx+1)+k];
            TEMP3[j*(dx+1)+k] = t1;
         }
      }
   }

   //multiply factor i into the running product
   for(i=1;i<n;i++){
      cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
      p1 = i*(dx+1)*(dz+1); //start of factor i
      da = i*dx;            //x-degree of the running product
      db = dx;              //x-degree of factor i
      dc = da+db;           //x-degree of the new product
      for(j=0;j<=dz;j++){          //z-row of factor i
         p2 = j*(dx+1);
         for(k=0;k<=i*dz;k++){     //z-row of the running product
            p3 = k*(i*dx+1);
            for(l=dc;l>=0;l--){    //x-coefficient of the result row k+j
               MIN = max32s(0,l-db);
               MAX = min32s(l,da);

               t1 = convol64s(&TEMP3[p3], &TEMP[p1+p2],MIN,MAX,l,p);
               TEMP2[(k+j)*((i+1)*dx+1)+l] = add64s(t1,TEMP2[(k+j)*((i+1)*dx+1)+l],p);
            }
         }
      }

      //move current polynomial back to TEMP3
      for(j=0;j<=(i+1)*dz;j++){
         p2 = j*((i+1)*dx+1);
         for(k=0;k<=(i+1)*dx;k++){
            t1 = TEMP2[p2+k];
            TEMP3[p2+k] = t1;
         }
      }
   }

   //move to A
   for(i=0;i<=dzA;i++){
      p1 = i*(dxA+1);
      for(j=0;j<=dxA;j++){
         t1 = TEMP3[p1+j];
         A[p1+j] = t1;
      }
   }

   free(TEMP);
   free(TEMP2);
   free(TEMP3);

   return;

}

//create a bivariate polynomial to factor using Hensel lifting
/*
 * Same purpose as createProblem, but the product is formed by evaluating the
 * factors at dxA+1 values of x, multiplying the univariate images in z and
 * interpolating back in x.
 *
 * A     - output product, stored as A[z*(dxA+1) + x]
 * F0    - output, factor i evaluated at z = alpha, dx+1 coefficients each
 * n     - number of factors; the code multiplies factor 0 by factor 1
 *         unconditionally, so it presumably requires n >= 2
 * dx,dz - degree bound of every factor in x and in z
 * dxA   - expected to be n*dx
 * dzA   - expected to be n*dz
 * alpha - evaluation point for F0
 * p     - modulus
 * W     - workspace, carved up sequentially below; whatever follows the
 *         named buffers is handed to the interpolation helpers as scratch
 *
 * Fixes relative to the previous version (none of them changes the result
 * when dx == dz):
 *  - the extra evaluation point is added when dxA (not dzA) is odd, since
 *    evalPoints has dxA+1 entries;
 *  - rows of a factor are dx+1 wide, so the coefficient of x^m z^k is at
 *    k*(dx+1)+m (was k*(dz+1)+m);
 *  - the interpolated row has dxA+1 coefficients and rows of A are dxA+1
 *    wide (was dzA+1 for both).
 */
void createProblem2(LONG *A, LONG *F0, int n, int dx, int dz, int dxA, int dzA, LONG alpha, LONG p, LONG *W){

    //Note: dxA = dx*n, dzA = dz*n

   LONG *TEMP,*TEMP2, *V, t1,alpha2,r;
   LONG *evalPoints, *lagInterpPolys, *evalProducts, *evalFactors, *vector;
   LONG i,j,k,m,p1,p2;

   //carve the workspace
   V = W;
   TEMP = V;                V += (dxA+1)*(dzA+1);
   TEMP2 =  V;              V += (dxA+1)*(dzA+1);
   evalPoints  =  V;        V += (dxA+1);
   lagInterpPolys  =  V;    V += (dxA+1)*(dxA+1);
   evalProducts  =  V;      V += (dxA+1)*(dzA+1);
   evalFactors  =  V;       V += (n*(dz+1));
   vector =  V;             V += (dxA+1);

   //generate the polynomials of F (A = f_1*f_2*...*f_n)
   for(i=0;i<n;i++){
      p1 = i*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            r = rand64s(p);
            TEMP[p1+p2+k] = r;
         }
      }
   }

   //Generate F0 (ie. F_i(z=alpha))
   for(i=0;i<n;i++){
      p1 = i*(dx+1)*(dz+1);
      //move 1 of the n factors to TEMP2
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP[p1+p2+k];
            TEMP2[p2+k] = t1;
         }
      }

      //Evaluate it at z=alpha: add alpha^j * (row j) onto row 0
      alpha2=1;
      for(j=1;j<=dz;j++){
         alpha2 = mul64s(alpha2,alpha,p);
         p2 =  j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = mul64s(alpha2,TEMP2[p2+k],p);
            TEMP2[k] = add64s(TEMP2[k],t1,p);
         }
      }

      //move results to F0
      p1 = i*(dx+1);
      for(j=0;j<=dx;j++){
         t1 = TEMP2[j];
         F0[p1+j] = t1;
      }
   }

   //set evalPoints: 0, 1, -1, 2, -2, ... (dxA+1 points in total)
   evalPoints[0] = 0;
   for(i=1;i<=dxA/2;i++){
      evalPoints[2*i-1] = i;
      evalPoints[2*i] = p-i;
   }
   if(dxA%2 == 1){
      //odd dxA: one more point is needed to fill index dxA
      evalPoints[dxA] = i;
   }

   //Setup polynomials for lagrange interpolation
   LagInterpSetup(lagInterpPolys,evalPoints,dxA,V,p);

   //perform evaluation + multiplication
   cleanArray64s(evalProducts,(dxA+1)*(dzA+1));
   for(j=0;j<=dxA;j++){ //for each evaluation point

        cleanArray64s(evalFactors,n*(dz+1));

        //evaluate the n polynomials; r runs through the powers of the point
        r = 1;
        for(m=0;m<=dx;m++){// for each power of x
            for(i=0;i<n;i++){ //for each factor
                for(k=0;k<=dz;k++){ //for each power of z
                   evalFactors[i*(dz+1)+k] = add64s(evalFactors[i*(dz+1)+k],mul64s(TEMP[i*(dx+1)*(dz+1) + k*(dx+1) +m],r,p),p);
                }
            }
            r=mul64s(r,evalPoints[j],p);
        }

        //multiply the polynomials together; r now tracks the product degree
        r = polmul64s(evalFactors,&evalFactors[dz+1],&evalProducts[j*(dzA+1)],dz,dz,p);

        for(i=2;i<n;i++){
            r = polmul64s(&evalProducts[j*(dzA+1)],&evalFactors[i*(dz+1)],&evalProducts[j*(dzA+1)],r,dz,p);
        }

   }

   //perform interpolation then populate A with the results
   for(i=0;i<=dzA;i++){

        //create vector to interpolate: z^i coefficient at every point
        for(j=0;j<=dxA;j++){ vector[j] = evalProducts[j*(dzA+1)+i]; }

        //perform Lagrange Interpolation (r is only used as a counter sink)
        LagInterpEval(lagInterpPolys,vector,TEMP2,dxA, V, p, &r);

        //populate row i of A with the dxA+1 coefficients in x
        for(j=0;j<=dxA;j++){A[i*(dxA+1)+j] = TEMP2[j];}

   }

   return;

}

//This performs Hensel lifting on n factors to factor polynomial A
/*
 * A         - polynomial to factor, read as A[z*(DX+1) + x]
 * DX, DZ    - degree bounds of A in x and in the lifted variable
 * DDX       - output: DDX[i*(DZ+1)+j] receives the x-degree of row j of
 *             factor i
 * DDZ       - output: DDZ[i] receives the degree of factor i in the lifted
 *             variable
 * f0        - the n starting factors, du+1 coefficients each
 * n         - number of factors, must be at least 2
 * du        - degree bound of the starting factors
 * F         - output factors; factor i starts at F + i*2*(DX+1)*(DZ+1) and
 *             its rows are DX+1 wide.
 *             NOTE(review): rows are read before they are written, so F is
 *             presumably expected to be zeroed by the caller - confirm.
 * alpha     - expansion point of the lifted variable
 * TempSpace - workspace, carved up sequentially below.
 *             NOTE(review): presumably sized with allocateSpaceCubic - confirm.
 * p         - prime modulus
 *
 * Returns 0 on success and -1 on failure (n < 2, a nonzero error term after
 * the factor degrees already add up to the bound, or a final degree sum that
 * does not match the bound).
 *
 * Changes relative to the previous version:
 *  - sumDeg starts at 0; it used to be read before its first assignment;
 *  - n is validated before anything is allocated, and every error return
 *    frees the integer arrays obtained from arrayint();
 *  - the coefficient of gamma's expansion is taken as 0 above gdeg instead
 *    of reading gammaTay beyond the entries that were written.
 */
int HenselLiftCubic(LONG *A, int DX, int DZ, LONG *DDX, LONG *DDZ, LONG *f0, int n, int du, LONG *F, LONG alpha, LONG *TempSpace, LONG p ){

   //local variables
   int i,j,k,l,d,m,maxSizeA,ndz,flag, *maxDeg, *sDeg, *curXDeg, *dr, *df0, sumDeg, InterpFlag, numPolys, gdeg, gdegN, dz, dx, dTF, MIN, MAX;
   LONG s,t,t2,z,*M,*E, *F2,*p1,*p2,*p3,*p4,*p5,*TEMP1,*Coeffs, *LagInterpPolys, *gamma, *gammaTay,*gammaN, *gammaNTay, *prods, *F0;
   LONG *a, *DioSVal, numDioMuls,numCoeffExtractMul,numEvalMuls,numInterpMuls;
   LONG *H, *Space, *degrees, *ck,*evalPoints,*interpPoints,*Delta,*EvalPointsMul, *W, wTemp;
   LONG temp,temp2,temp3,gammaK;
   LONG ** P1;
   recint P;
   clock_t T1,T2,T3,T4,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;

   //reject unsupported input before touching the workspace or allocating
   if( n < 2){
      printf("This algorithm requires n >= 2");
      return -1;
   }

   //set time to 0 (the s* accumulators are profiling counters only)
   s1=0;s2=0;s3=0;s4=0;s5=0;s6=0;s7=0;s8=0;s9=0;s10=0;s11=0;s12=0;s13=0;s14=0;s15=0;

   //calculate the correct number of evaluation points (dx is DX rounded up to even)
   dx = DX;
   if(dx%2==1){dx += 1;}

   //initial size calculations
   ndz = n*(DZ+1);
   numEvalMuls = 0;
   numDioMuls = 0;
   numCoeffExtractMul = 0;
   numInterpMuls = 0;
   InterpFlag = 0;
   sumDeg = 0;

   P = recip1(p);

   //find the number of slots in the product tree kept per evaluation point
   numPolys = 1; t = n;
   while (t>1){
      numPolys +=  t;
      t = ceil2(t);
   }

   dz = DZ;

   //declare arrays
   W = TempSpace;
   maxDeg = arrayint(n);   //degree of each factor in the lifted variable
   sDeg = arrayint(n);     //Degree of each s variable (for Diophantine Equation)
   dr = arrayint(n);
   df0 = arrayint(n);      //x-degree of each starting factor
   curXDeg = arrayint(n);  //Degree of x for each factor (efficiency)

   F0 = W;                             W += n*(du+1);
   gamma = W;                          W += (dz+1);
   gammaTay = W;                       W += (dz+1);
   gammaN = W;                         W += (n-1)*(dz+1);
   gammaNTay = W;                      W += (n-1)*(dz+1);
   prods = W;                          W += n*(DX+1);
   ck = W;                             W += DX+1;

   //Calculate gamma (the x^DX coefficient of A) and gamma^(n-1)
   gdeg = 0;
   for(i=0;i<=dz;i++){
      gamma[i] = A[(i)*(DX+1)+DX];
      if(gamma[i] != 0){ gdeg = i; }
   }
   for(i=0;i<=gdeg;i++){
      gammaN[i] = gamma[i];
      gammaTay[i] = gamma[i];
   }
   gdegN = gdeg;
   for(i=0;i<n-2;i++){
      gdegN = polmul64s(gamma,gammaN,gammaN,gdeg,gdegN,p);
   }
   //from here on dz is the degree bound of A * gamma^(n-1)
   dz = DZ + gdegN;
   maxSizeA = 2*(DX+1)*(DZ+1);

   M = W;                              W += (n-1)*n*du/2 + n - 1; //M polynomials (diophantine)
   E = W;                              W += (DX+1)*(DZ+1);        //The error at each iteration
   TEMP1 = W;                          W += n*(du+1);             //Temporary storage
   evalPoints = W;                     W += dx+1;
   interpPoints = W;                   W += dx+1;
   LagInterpPolys = W;                 W += (dx+1)*(dx+1);
   Delta = W;                          W += dx+1;
   EvalPointsMul = W;                  W += (du+1)*((dx/2)+2);
   DioSVal = W;                        W += (n-1)*(DX+1);
   Space = W;                          W += (dx+1)*(numPolys); // < (dx+1)*3*n
   degrees = W;                        W += (dx+1)*(numPolys); // < (dx+1)*3*n
   H = W;                              W += 4*(dx+1)*(log2N(n)+1)*(dz+n);

   //one coefficient array per (evaluation point, tree slot)
   LONG *G[(dx+1)*numPolys];

   cleanArray64s(H,4*(dx+1)*(log2N(n)+1)*(dz+n));

   //find taylor series of gamma and gamma^(n-1)
   cleanArray64s(gammaNTay,(n-1)*(dz+1));
   for(i=0;i<=gdegN;i++){ gammaNTay[i] = gammaN[i];  }

   if(alpha != 0){
      altDiv2(gammaTay,0,gdeg,alpha,W,p);
      altDiv2(gammaNTay,0,gdegN,alpha,W,p);
   }

   //move contents of F0 to F
   //Calculate exact degree bound of factors
   p1 = F;
   p2 = f0;
   p3 = F0;

   //Initialize matrix G, Space, and degrees: every slot starts with one entry
   for(i=0;i<(dx+1)*numPolys;i++){
      G[i] = H;
      Space[i] = 1;
      degrees[i] = 0;
      H += 1;
   }

   //scale every starting factor so its leading coefficient is gamma(alpha)
   s = poleval64s(gamma,gdeg,alpha,p);
   for(i=0;i<n;i++){
      curXDeg[i] = -1;
      for(j=0;j<=du;j++){
         if(p2[j] != 0){ curXDeg[i]=j;}
      }
      df0[i] = curXDeg[i];

      t = mul64s(s,modinv64s(p2[curXDeg[i]],p),p);

      //the last value assigned to s is the scaled leading coefficient,
      //which is what the next factor is scaled against
      for(j=0;j<=curXDeg[i];j++){
         s = mul64s(p2[j],t,p);
         p1[j] = s; p3[j] = s;
      }

      p1 += maxSizeA;
      p2 += du+1; p3 += du+1;
   }

   //calculate terms needed for delta: prods block i is the product of all
   //scaled starting factors except factor i
   if(n==2){
      dTF = df0[0] + df0[1];
      for(j=0;j<=du;j++){
         prods[j] = F[maxSizeA+j]; prods[DX+1+j] = F[j];
      }
   }
   else{
      cleanArray64s(TEMP1,DX+1);
      prods[0]=1; dTF = 0;
      for(i=0;i<n;i++){dTF=polmul64s(&F[i*maxSizeA],prods,prods,curXDeg[i],dTF,p);}
      for(i=1;i<n;i++){
         for(j=0;j<=DX;j++){prods[i*(DX+1)+j] = prods[j];}
         dr[i] = poldiv64s(&prods[i*(DX+1)],&F[i*maxSizeA],DX,curXDeg[i],p);
         for(j=0;j<=dTF-curXDeg[i];j++){prods[i*(DX+1)+j] = prods[i*(DX+1)+j+curXDeg[i]]; }
         for(j=dTF-curXDeg[i]+1;j<=DX;j++){prods[i*(DX+1)+j]=0;}
      }
      dr[0] = poldiv64s(prods,F,DX,curXDeg[0],p);
      for(j=0;j<=dTF-curXDeg[0];j++){prods[j] = prods[j+curXDeg[0]]; }
      for(j=dTF-curXDeg[0]+1;j<=DX;j++){prods[j]=0;}
   }

   //declare maxDeg, the size of each n factors
   for(i=0;i<n;i++){
      maxDeg[i] = 0;
   }

   //set evalPoints: 0, 1, -1, 2, -2, ...
   evalPoints[0] = 0;
   p1 = evalPoints+1;
   for(i=1;i<=dx/2;i++){
      p1[0] = i;
      p1[1] = p-i;
      p1 +=2;
   }
   if(dx%2 == 1){
      evalPoints[dx] = i;
   }

   //Generate Lagrange Polynomials for Interpolation in main loop
   T1 = clock();
   LagInterpSetup(LagInterpPolys,evalPoints,dx,W,p);
   s7 = clock() - T1;

   //calculate evaluation point constants: row i-1 holds (i*i)^j for j = 0..du
   p1 = EvalPointsMul;
   for(i=1;i<=dx/2;i++){
      t = 1;
      p1[0] = t;
      t2 = i*i;
      for(j=1;j<=du;j++){
         t = mul64s(t,t2,p);
         p1[j] = t;
      }
      p1 = p1 + du+1;
   }
   if(dx%2==1){
      t = 1;
      p1[0] = t;
      t2 = i*i;
      for(j=1;j<=du;j++){
         t = mul64s(t,t2,p);
         p1[j] = t;
      }
   }

   //Recover multiple of U for future calls to diophantine equation
   //NOTE(review): the return value is not checked; a failure check used to
   //be sketched here ("The input factors are not coprime") but was disabled.
   j = MultiEEA(F0,du,n,M,W,p);

   //Generate the S values from Gcdex
   generateGcdexS(F0,n,du,M,DX,DioSVal,sDeg,W,p);

   // declare initial Error
   for(i=0;i<(DX+1)*(DZ+1);i++){
      E[i] = A[i];
   }
   //Change it to its Taylor Representation E = a_0 + a_1(y-alpha) + ...
   T1 = clock();
   if(alpha != 0){
      altDiv2(E,DX,DZ,alpha,W,p);
   }
   s9 = clock()-T1;

   //Perform initial polynomial evaluations
   T1 = clock();
   FastEval(G,&H, Space, degrees, F,EvalPointsMul, 0, n, numPolys, curXDeg, maxDeg,  du, DX, DZ, p);
   s1 = clock()-T1;

   //initial CoeffCalc calculations
   T1 = clock();
   for(i=0;i<=dx;i++){
      //calculate coefficient for (y-alpha)^0
      interpPoints[i] = getCoeff3(&G[i*numPolys],H,&Space[i*numPolys],&degrees[i*numPolys],n,0,p);
   }
   s2 = clock()-T1;

   //main FOR loop: one pass per power k of (y-alpha)
   T3 = clock();
   for(k=1;k<=dz;k++){

      //Perform coefficient extraction at every evaluation point
      T1 = clock();
      for(i=0;i<=dx;i++){
         //calculate coefficient for (y-alpha)^k
         interpPoints[i] = getCoeff3(&G[i*numPolys],H,&Space[i*numPolys],&degrees[i*numPolys],n,k,p);
      }
      T2 = clock();
      s2 = s2 + T2 - T1;

      //Interpolate
      T1 = clock();
      LagInterpEval(LagInterpPolys,interpPoints,Delta,dx, W, p, &numInterpMuls);
      T2 = clock();
      s3 = s3 + T2 - T1;

      //Calculate ck for this iteration: coefficient k of E * gamma^(n-1)
      cleanArray64s(ck,DX+1);
      MIN = max32s(0,k-DZ); MAX = min32s(k,gdegN);
      for(j=MIN;j<=MAX;j++){
         poladdsca64s( ck, &E[(k-j)*(DX+1)],  ck, DX, DX, gammaNTay[j], p, P);
      }

      //subtract Delta from e
      temp = polsub64s(ck, Delta, ck, DX, DX, p);

      //subtract the contribution of gamma's k-th coefficient
      if(k <=gdeg){
         for(i=0;i<n;i++){
            t = gammaTay[k];
            p1 = &prods[i*(DX+1)];
            for(j=df0[i];j<=dTF;j++){
               ck[j] = sub64s(ck[j],mul64s(p1[j-df0[i]],t,p),p);
            }
         }
      }

      //check to make sure ck isn't zero
      flag = 0;
      for (i=0;i<=DX;i++){
         if (ck[i] != 0){
            flag = 1;
            break;
         }
      }

      //check fail case
      if(flag == 1 && sumDeg == dz){
         printf("Error: c_k != 0 when the sum of degrees is equal to dz\n\n");
         free(maxDeg); free(sDeg); free(curXDeg); free(dr); free(df0);
         return -1;
      }

      //if c_k isn't equal to zero, solve diophantine equation and perform updates/evaluations
      else if(flag == 1){

         //Solve Diophantine Equation
         cleanArray64s(TEMP1,n*(du+1));
         T1 = clock();
         Dionvar(F0, n, du, ck, DX, M, DioSVal,sDeg, W, TEMP1, p, &numDioMuls);
         T2 = clock();
         s4 = s4 + T2 - T1;

         //coefficient k of gamma's expansion; it is zero above gdeg, and
         //gammaTay holds nothing meaningful there
         gammaK = (k <= gdeg) ? gammaTay[k] : 0;

         //Update F
         //Update degree of X counter for factors
         p1 = &F[k*(DX+1)];
         p2 = TEMP1;
         InterpFlag = 1;
         for(i=0;i<n;i++){
            curXDeg[i]=-1;
            for(j=0;j<du;j++){
               if(p2[j] != 0){
                  InterpFlag = 0;
                  curXDeg[i]=j;
                  maxDeg[i] = k;
                  p1[j] = p2[j];
               }
            }
            //the leading coefficient in x is dictated by gamma
            p1[df0[i]] = gammaK;
            if(gammaK !=0) {curXDeg[i] = df0[i]; maxDeg[i] = k; InterpFlag=0;}

            p1 = p1 + maxSizeA;
            p2 = p2 + du;   //Dionvar stores its results with stride du
         }

         //add the sum of all the degrees (dz) for all polynomials
         sumDeg = 0;
         for (i=0;i<n;i++){
            sumDeg += maxDeg[i];
         }

         //Calculate evaluation points
         T1 = clock();
         FastEval(G,&H, Space, degrees, F,EvalPointsMul, k, n, numPolys, curXDeg, maxDeg, du, DX, DZ, p);
         T2 = clock();
         s1 = s1 + T2 - T1;

         //Update Coefficients: push the new k-th leaf coefficients up the
         //product tree of every evaluation point
         T1 = clock();
         for(i=0;i<=dx;i++){
            P1 = &G[i*numPolys];
            p2 = &Space[i*numPolys];
            p3 = &degrees[i*numPolys];
            s = 0; t = n;
            cleanArray64s(TEMP1, n);
            while(t>1){
               j=0;
               while(j<t/2){
                  if(s==0){
                     //leaf level: new coefficient times the partner's constant term
                     TEMP1[j] = 0;
                     if(p3[2*j]==k){TEMP1[j] = mul64s(P1[2*j][k],P1[2*j+1][0],p);}
                     if(p3[2*j+1]==k){TEMP1[j] = add64s(TEMP1[j],mul64s(P1[2*j+1][k],P1[2*j][0],p),p);}
                  }
                  else{
                     TEMP1[j] = add64s(mul64s(P1[s+2*j+1][0],TEMP1[2*j],p),mul64s(P1[s+2*j][0],TEMP1[2*j+1],p),p);
                  }

                  P1[t+s+j] = extendSpace(P1[t+s+j],&H,&p2[t+s+j],p3[s+2*j]+p3[s+2*j+1],p3[t+s+j],log2N(p3[s+2*j]+p3[s+2*j+1])+1);

                  p3[s+t+j] = p3[s+2*j] + p3[s+2*j+1];
                  if(p3[s+t+j] >= k){P1[s+t+j][k] = add64s(P1[s+t+j][k],TEMP1[j],p);}
                  j++;
               }

               if(t%2==1){
                  //odd level: the unpaired child is carried up unchanged
                  TEMP1[j] = 0;
                  if(p3[s+2*j]>=k){TEMP1[j] = P1[s+2*j][k];}

                  P1[t+s+j] = extendSpace(P1[t+s+j],&H,&p2[t+s+j],p3[s+2*j],p3[t+s+j],log2N(p3[s+2*j])+1);
                  p3[s+t+j] = p3[s+2*j];
                  if(p3[s+t+j] >= k){P1[t+s+j][k] = TEMP1[j];}
               }
               s += t; t = ceil2(t);
            }

         }
         T2 = clock();
         s5 = s5 + T2 - T1;

      }
   }

   T4 = clock();

   //check if sum of degrees = dz
   if(sumDeg != dz){
      printf("Error: sum of degrees not equal to %d\n",dz);
      free(maxDeg); free(sDeg); free(curXDeg); free(dr); free(df0);
      return -1;
   }

   //Calculate the content and divide each factor by it for the first n-1 factors
   //(H is reused as a scratch column from here on)
   for(i=0;i<n-1;i++){

      //Calculate content
      cleanArray64s(TEMP1,dz+1);
      p1 = &F[i*maxSizeA];
      t = Content(p1,TEMP1,du,DX,2*DZ,p,W);

      DDZ[i] = maxDeg[i]-t;

      //divide each factor by content, one x-column at a time
      for(j=0;j<=du;j++){
         d=-1;
         for(k=0;k<=2*DZ;k++){
            H[k] = p1[k*(DX+1)+j];
            if(H[k] != 0){d = k;}
         }
         poldiv64s(H,TEMP1,d,t,p);
         for(k=0;k<=d-t;k++){
            p1[k*(DX+1)+j] = H[k+t];
            if(H[k+t] != 0){ DDX[i*(DZ+1)+k] = j; }
         }
         for(k=d-t+1;k<=2*DZ;k++){p1[k*(DX+1)+j]=0;}
      }

      //remove the same content from gamma^(n-1)
      if (gdegN>0){
         poldiv64s(gammaNTay,TEMP1,gdegN,t,p);
         gammaNTay +=t;  gdegN = gdegN-t;
      }
      //NOTE(review): this branch updates gammaN, while the division of the
      //final factor below reads gammaNTay - confirm which one is intended.
      else{gammaN[0] = mul64s(gammaN[0],modinv64s(TEMP1[0],p),p);}

      p1 += maxSizeA;

   }

   //alter final factor: divide it by what is left of gamma^(n-1)
   for(j=0;j<=du;j++){
      d=0;
      for(k=0;k<=2*DZ;k++){
         H[k] = p1[k*(DX+1)+j];
         if(H[k] != 0){d = k;}
      }
      poldiv64s(H,gammaNTay,d,gdegN,p);
      for(k=0;k<=d-gdegN;k++){p1[k*(DX+1)+j] = H[k+gdegN];}
      for(k=d-gdegN+1;k<=2*DZ;k++){p1[k*(DX+1)+j]=0;}
   }
   DDZ[n-1] = maxDeg[n-1] - gdegN;

   //perform final change of base back to y
   s = sub64s(p,alpha,p);
   for(i=0;i<n;i++){
      if(alpha != 0){
         altDiv2(&F[i*maxSizeA],DX,DZ,s,W,p);
      }
      //record the x-degree of every row
      for(j=0;j<=DDZ[i];j++){
         DDX[i*(DZ+1)+j] = du;
         for(k=du;k>=0;k--){
            if(F[i*maxSizeA + j*(DX+1) + k] != 0){break;}
            DDX[i*(DZ+1)+j] -= 1;
         }
      }
   }

   //modify the factors so that the evaluations match the initial factorization input (f0)
   for(i=0;i<n;i++){
      //calculate the modifying constant
      s=1; t=0;
      for(j=0;j<=DDZ[i];j++){
         t = add64s(t,mul64s(s,F[i*maxSizeA + j*(DX+1) + DDX[i*(DZ+1)+DDZ[i]]],p),p);
         s = mul64s(s,alpha,p);
      }
      t = modinv64s(t,p);
      t = mul64s(f0[i*(du+1) + DDX[i*(DZ+1)+DDZ[i]]],t,p);

      //apply the constant to each factor
      for(j=0;j<=DDZ[i];j++){
         for(k=0;k<=DDX[i*(DZ+1)+j];k++){
            F[i*maxSizeA + j*(DX+1) + k] = mul64s(F[i*maxSizeA + j*(DX+1) + k],t,p);
         }
      }
   }

   free(maxDeg); free(sDeg); free(curXDeg); free(dr); free(df0);

   return 0;

}

int HenselLiftQuartic(LONG *A, int dx, int dz, LONG *F0, int n, int du, LONG *F, LONG alpha, LONG *W, LONG p ){
/*
Hensel lifting of n initial factors, one z-level at a time, using classical
products of the already lifted levels (no evaluation/interpolation).

A     - polynomial to factor, stored as (dz+1) rows of (dx+1) coefficients;
        it is copied into a private error array and never modified
dx    - degree bound on variable x of A
dz    - degree bound on variable z of A
F0    - initial factors, n blocks of (du+1) coefficients
n     - number of initial factors; must be at least 3, because the partial
        product table below is indexed with (n-3)
du    - degree bound of initial factors (could be extended to an array later)
F     - output, n blocks of (dx+1)*(dz+1) coefficients. Only non-zero
        coefficients are written, so the caller is assumed to pass F zeroed
        (TODO confirm against callers)
alpha - LONG integer passed to altDiv2 together with the copy of A
        (presumably the expansion point for z - confirm)
W     - working array handed to the helper routines (required size: tbd)
p     - prime modulus

Returns 0 when the lifting loop has run to completion and -1 when n < 3 or
MultiEEA reports failure (the input factors are not coprime).

NOTE(review): no final change of basis is applied here, so F is presumably
left expressed in the basis produced by altDiv2 - confirm.

Layout used throughout:
  F[i*(dx+1)*(dz+1) + k*(dx+1) + j]  coefficient j of level k of factor i
  facDegs[i*(dz+1) + k]              degree recorded for that row (-1 if all zero)
  CoeffCalc block (j-1)              level rows of the product of factors j..n-1
  coeffDegs[j*(dz+1) + k]            degree recorded for level k of that product
*/

   //local variables
   int i,j,k,m,q,flag,mSize,maxSizeA,*maxDeg,point1,point2,point3,point4,MIN,MAX,accDeg,curDeg,*sDeg,*facDegs,*coeffDegs,rc;
   LONG t,*M,*E,*TEMP1,*TEMP2,*CoeffCalc,*ck,*Delta,*DioSVal,numDioMuls,*poly1,*poly2,*poly3,*poly4;
   clock_t T1,T2,T3,T4,s1,s4,s5;

   //with fewer than 3 factors CoeffCalc would be indexed with a negative offset
   if(n < 3){
      printf("HenselLiftQuartic requires at least 3 factors\n");
      return -1;
   }

   rc = 0;

   //set time to 0
   s1=0;s4=0;s5=0;

   //initial size calculations
   maxSizeA = (dx+1)*(dz+1);
   mSize = ((n-1)*n)/2*du + n - 1;
   numDioMuls = 0;

   //declare arrays (all of them are released at the cleanup label)
   //NOTE(review): assumes array()/arrayint() return malloc-compatible storage,
   //as the free() calls on maxDeg/sDeg in the preceding routine suggest
   maxDeg = arrayint(n);
   sDeg = arrayint(n);
   facDegs = arrayint((n)*(dz+1));
   coeffDegs = arrayint(n*(dz+1));
   M = array(mSize);
   E = array(maxSizeA);
   TEMP1 = array(maxSizeA);
   TEMP2 = array(dx+1);
   Delta = array(dx+1);
   CoeffCalc = array((dx+1)*(n-2)*(dz+1));
   DioSVal = array((n-1)*(dx+1));

   cleanArray64s(M,mSize);
   cleanArray64s(TEMP1,maxSizeA);
   cleanArray64s(TEMP2,dx+1);
   cleanArray64s(CoeffCalc,(dx+1)*(n-2)*(dz+1));
   cleanArray64s(DioSVal,(n-1)*(dx+1));
   for(i=0;i<n*(dz+1);i++){facDegs[i]=0;coeffDegs[i]=0;}

   //move contents of F0 to level 0 of F and record each factor's degree
   for(i=0;i<n;i++){
      point1 = i*(dx+1)*(dz+1);
      point2 = i*(du+1);
      facDegs[i*(dz+1)] = -1;
      for(j=0;j<=du;j++){
         t = F0[point2 + j];
         if(t != 0){
            facDegs[i*(dz+1)] = j;
            F[point1 + j] = t;
         }
      }
   }

   //maxDeg[i] is the highest level of factor i that has been filled in so far
   for(i=0;i<n;i++){
      maxDeg[i] = 0;
   }

   //precompute the extended Euclidean data used by the diophantine solver
   T1 = clock();
   j = MultiEEA(F0,du,n,M,W,p);

   //if the EEA failed, release the scratch space and report failure
   if(j==-1){
      printf("The input factors are not coprime");
      rc = -1;
      goto cleanup;
   }
   T2 = clock();

   //printf("Perform EEA time=%8.2fms\n",(T2-T1)/1000.0/1.0);

   //declare initial Error: a private copy of A handed to altDiv2
   for(i=0;i<maxSizeA;i++){
      E[i] = A[i];
   }
   T1 = clock();
   altDiv2(E,dx,dz,alpha,W,p);
   T2 = clock();
   //printf("Division time=%8.2fms\n",(T2 - T1)/1000.0/1.0);

   //Generate the S values from Gcdex
   generateGcdexS(F0,n,du,M,dx,DioSVal,sDeg,W,p);

   //save initial CoeffCalc multiplications (level 0 of every partial product)
   poly1 = &F[(n-1)*(dz+1)*(dx+1)];
   poly2 = &F[(n-2)*(dz+1)*(dx+1)];
   coeffDegs[(n-2)*(dz+1)] = facDegs[(n-1)*(dz+1)] + facDegs[(n-2)*(dz+1)];
   fastPolyMult(poly1,poly2,&CoeffCalc[(n-3)*(dx+1)*(dz+1)],facDegs[(n-1)*(dz+1)],facDegs[(n-2)*(dz+1)],p);
   for(j=n-3;j>=1;j--){
       poly1 = &F[(j)*(dz+1)*(dx+1)];
       poly2 = &CoeffCalc[(j)*(dx+1)*(dz+1)];
       coeffDegs[(j)*(dz+1)] = facDegs[(j)*(dz+1)] + coeffDegs[(j+1)*(dz+1)];
       fastPolyMult(poly1,poly2,&CoeffCalc[(j-1)*(dx+1)*(dz+1)],facDegs[(j)*(dz+1)],coeffDegs[(j+1)*(dz+1)],p);
   }

   //main FOR loop: one pass per level k
   T3 = clock();
   for(k=1;k<=dz;k++){

      //printf("Iteration: %d\n\n",k);

      //get the error row for this level (ck aliases E, it owns no storage)
      point1 = k*(dx+1);
      ck = &(E[point1]);

      //accumulate into level k of every partial product the contributions
      //of the levels that are already known
      T1 = clock();
      i = k;

      //set degree totals
      accDeg = maxDeg[n-1];

      for (j=n-2; j>=1; j--){

          curDeg = accDeg;
          accDeg = accDeg + maxDeg[j];

          //make sure i need to do current calculations
          if(i <= accDeg){

              //first 2 polynomials multiplied
              if (j==n-2) {

                 MIN = max32s(0,i-maxDeg[j]);
                 MAX = min32s(i,maxDeg[j]);

                 point1 = (n-1)*(dx+1)*(dz+1);
                 point2 = (n-2)*(dx+1)*(dz+1);

                 //q is the level taken from factor n-1, i-q the level from factor n-2
                 for (q=MIN;q<=MAX;q++){

                    point3 = q*(dx+1);
                    point4 = (i-q)*(dx+1);
                    poly2 = &F[point1+point3];
                    poly1 = &F[point2+point4];
                    coeffDegs[(n-2)*(dz+1)+i] = max32s(coeffDegs[(n-2)*(dz+1)+i],facDegs[(n-1)*(dz+1)+(i-q)]+facDegs[(n-2)*(dz+1)+q]);

                    //multiply quickly
                    fastPolyMult(poly2,poly1,&CoeffCalc[(j-1)*(dx+1)*(dz+1) + i*(dx+1)],facDegs[(n-1)*(dz+1)+q],facDegs[(n-2)*(dz+1)+(i-q)],p);

                 }

              }
              //continued polynomial multiplication: factor j times the product of j+1..n-1
              else{

                 MIN = max32s(0,i-curDeg);
                 MAX = min32s(i,maxDeg[j]);

                 //the same block offset addresses factor j in F and the
                 //product of factors j+1..n-1 in CoeffCalc
                 point1 = (j)*(dx+1)*(dz+1);

                 //q is the level taken from factor j, i-q the level from the product
                 for (q=MIN;q<=MAX;q++){

                    point3 = q*(dx+1);
                    point4 = (i-q)*(dx+1);
                    poly2 = &F[point1+point3];
                    poly1 = &CoeffCalc[point1 + point4];
                    coeffDegs[(j)*(dz+1)+i] = max32s(coeffDegs[(j)*(dz+1)+i],coeffDegs[(j+1)*(dz+1)+(i-q)]+facDegs[(j)*(dz+1)+q]);

                    //multiply quickly
                    fastPolyMult(poly2,poly1,&CoeffCalc[(j-1)*(dx+1)*(dz+1) + i*(dx+1)],facDegs[(j)*(dz+1)+q],coeffDegs[(j+1)*(dz+1)+(i-q)],p);
                 }

              }
          }
      }

      //zero Delta array
      cleanArray64s(Delta,dx+1);

      curDeg = accDeg;
      accDeg = accDeg + maxDeg[0];

      MIN = max32s(0,k-accDeg);
      MAX = min32s(k,curDeg);

      //Calculate final coefficients (Delta): factor 0 times the product of factors 1..n-1
      for(m=MIN;m<=MAX;m++){
          poly1 = &F[m*(dx+1)];
          poly2 = &CoeffCalc[(k-m)*(dx+1)];

          fastPolyMult(poly1,poly2,Delta,facDegs[m],coeffDegs[(dz+1) + (k-m)],p);
      }
      T2 = clock();
      s1 = s1 + T2 - T1;

      //subtract delta from e
      T1 = clock();
      (void)polsub64s(ck, Delta, ck, dx, dx, p);
      T2 = clock();
      s5 = s5 + T2 - T1;
      //printf("ck:=");polprint64s(ck,dx);printf("\n");

      cleanArray64s(TEMP1,maxSizeA);

      //check to make sure ck isn't zero
      flag = 0;
      for (i=0;i<=dx;i++){
        if (ck[i] != 0){
            flag = 1;
            break;
        }
      }

      if(flag == 1) {
          //Solve Diophantine Equation; the n corrections land in TEMP1, du entries each
          T1 = clock();
          Dionvar(F0, n, du, ck, dx, M, DioSVal,sDeg, W, TEMP1, p, &numDioMuls);
          T2 = clock();
          s4 = s4 + T2 - T1;

          //printf("Del:=");polprint64s(TEMP1,maxSizeA);printf("\n");

          //Update F with level k of every factor
          point2 = k*(dx+1);
          for(i=0;i<n;i++){
             point1 = i*(dx+1)*(dz+1);
             facDegs[i*(dz+1)+k] = -1;
             for(j=0;j<du;j++){
                if(TEMP1[i*du+j] != 0){
                   maxDeg[i] = k;
                   facDegs[i*(dz+1)+k] = j;
                   F[point1 + point2 + j] = TEMP1[i*du+j];
                }
             }
          }

          //Update CoeffCalc with the terms that involve the new level k
          //Queue up first polynomial: level 0 of one factor times level k of the other

          //clear the whole dx+1 scratch row, not just its first dx entries
          cleanArray64s(TEMP1,dx+1);
          poly1 = &F[(n-1)*(dx+1)*(dz+1)];
          poly2 = &F[(n-2)*(dx+1)*(dz+1)];
          //each operand is paired with the degree recorded for its own factor
          fastPolyMult(poly1,&poly2[k*(dx+1)],TEMP1,facDegs[(n-1)*(dz+1)],facDegs[(n-2)*(dz+1)+k],p);
          fastPolyMult(&poly1[k*(dx+1)],poly2,TEMP1,facDegs[(n-1)*(dz+1)+k],facDegs[(n-2)*(dz+1)],p);
          accDeg = max32s(facDegs[(n-1)*(dz+1)]+facDegs[(n-2)*(dz+1)+k],facDegs[(n-1)*(dz+1)+k]+facDegs[(n-2)*(dz+1)]);
          coeffDegs[(n-2)*(dz+1)+k] = max32s(accDeg,coeffDegs[(n-2)*(dz+1)+k]);

          poly1 = &CoeffCalc[(n-3)*(dx+1)*(dz+1) + k*(dx+1)];
          for(i=0;i<=accDeg;i++){
              poly1[i] = add64s(poly1[i],TEMP1[i],p);
          }

          //propagate the new level-k terms through the remaining partial products
          for(i=n-3;i>=1;i--){
              poly1 = &F[(i)*(dx+1)*(dz+1)];
              poly2 = &F[(i)*(dx+1)*(dz+1) + k*(dx+1)];
              poly3 = &CoeffCalc[(i)*(dx+1)*(dz+1)];
              poly4 = &CoeffCalc[(i-1)*(dx+1)*(dz+1) + k*(dx+1)];

              cleanArray64s(TEMP2,dx+1);
              fastPolyMult(poly1,TEMP1,TEMP2,facDegs[(i)*(dz+1)],accDeg,p);
              fastPolyMult(poly2,poly3,TEMP2,facDegs[(i)*(dz+1)+k],coeffDegs[(i+1)*(dz+1)],p);
              accDeg = max32s(accDeg + facDegs[(i)*(dz+1)],facDegs[(i)*(dz+1)+k]+coeffDegs[(i+1)*(dz+1)]);
              coeffDegs[(i)*(dz+1)+k] =  max32s(accDeg,coeffDegs[(i)*(dz+1)+k]);

              //TEMP1 carries the new terms on to the next partial product
              for(j=0;j<=accDeg;j++){
                  TEMP1[j] = TEMP2[j];
                  poly4[j] = add64s(poly4[j],TEMP2[j],p);
              }
          }

      }
   }
   T4 = clock();

   //print statements
    /*
   printf("Polynomial Multiplication time=%8.2fms\n",(s1)/1000.0/1.0);
   printf("Diophantine time=%8.2fms\n",(s4)/1000.0/1.0);
   printf("Sub time=%8.2fms\n",(s5)/1000.0/1.0);
   printf("Total Loop time=%8.2fms\n",(T4-T3)/1000.0/1.0);
    */

cleanup:
   //release every scratch buffer on both the success and the failure path
   free(maxDeg); free(sDeg); free(facDegs); free(coeffDegs);
   free(M); free(E); free(TEMP1); free(TEMP2);
   free(Delta); free(CoeffCalc); free(DioSVal);

   return rc;

}
