\documentclass[11pt]{article}
\setlength{\topmargin}{-1cm}
\setlength{\oddsidemargin}{0in}
\setlength{\textheight}{9in}
\setlength{\textwidth}{6.51in}
\usepackage{amsfonts}
\author{ Simon Lo \footnotemark[1] , Michael Monagan \footnotemark[1] , Allan Wittkopf \thanks{Supported by NSERC of Canada and the MITACS NCE of Canada.}\\
{\tt \{sclo,mmonagan,wittkopf\}@cecm.sfu.ca } \\
Centre for Experimental and Constructive Mathematics, \\
Department of Mathematics, Simon Fraser University, \\
Burnaby, B.C., V5A 1S6, Canada. }
\title{A Modular Algorithm for Computing the Characteristic Polynomial of an Integer Matrix in Maple}
\date{}
\begin{document}
\maketitle
\begin{abstract}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers.
In this paper we present details of our Maple implementation of a modular method
for computing the characteristic polynomial of $\mathbf{A}$.
Our implementation considers several different representations for the
computation modulo primes, including the use of double precision floats.
The algorithm presently implemented in Maple releases 7--9 is the Berkowitz algorithm.
We present some timings comparing the two methods on a large sparse matrix
arising from an application in combinatorics, and also on some randomly
generated dense matrices.
\end{abstract}
\section{Introduction}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers.
One way to compute the characteristic polynomial
$c(x) = \det(x \mathbf{I} - \mathbf{A}) \in \mathbb{Z}[x]$
is to evaluate the characteristic matrix at $n$ points, compute $n$
determinants of integer matrices, then interpolate to obtain the
characteristic polynomial.
The determinants of integer matrices can be computed using a fraction-free
Gaussian elimination algorithm (see Chapter 9 of Geddes et al.\ \cite{geddes})
in $O(n^3)$ integer multiplications and divisions.
This approach leads to an algorithm that requires $O(n^4)$ integer operations.

The algorithm presently implemented in Maple releases 7--9 is the Berkowitz
algorithm \cite{berkowitz}.
It is a division-free algorithm and thus can be used to compute the
characteristic polynomial of a matrix over any commutative ring $R$.
It does $O(n^4)$ multiplications in $R$.
In \cite{jounaidi}, Abdeljaoued described a Maple implementation of a
sequential version of the Berkowitz algorithm and compared it with the
interpolation method and other methods.
His implementation exploits sparsity; it does $O(n^3)$ multiplications
when the matrix has $O(n)$ non-zero entries.

In section 2 we present details of our Maple implementation of a modular
method for computing $c(x)$, the characteristic polynomial of $\mathbf{A}$.
The algorithm computes the characteristic polynomial modulo a sequence of
primes and applies the Chinese remainder theorem.
Our implementation considers several different representations for the
computation modulo primes, including the use of double precision floats.
In section 3 we present some timings comparing our modular algorithm
implementations and the Berkowitz algorithm on a $364 \times 364$ sparse
matrix arising from an application in combinatorics from Quaintance
\cite{quaintance}, and also some timings for randomly generated large
dense matrices.
In section 4 we present two additional improvements that further reduce the
running time.
In section 5 we give a short asymptotic comparison of the algorithms.

\section{A Dense Deterministic Modular Algorithm}
Let $\mathbf{A} \in \mathbb{Z}^{n \times n}$ be the input matrix.
We want to compute the characteristic polynomial
$c(x)= \det(x \mathbf{I} - \mathbf{A}) \in \mathbb{Z}[x]$.
We use a modular algorithm: we compute the characteristic polynomial of
$\mathbf{A}$ modulo a sequence of primes $p_1, p_2, p_3, \ldots$ using the
Hessenberg matrix algorithm, then reconstruct $c(x)$ using the Chinese
remainder algorithm.
The cost of the Hessenberg approach is $O(n^3)$ arithmetic operations in
$\mathbb{Z}_p$ for each prime $p$.
An alternative to the Hessenberg matrix approach would be a Krylov approach,
which has the same asymptotic complexity.
In the Krylov approach one starts with a random vector $v \in \mathbb{Z}^n_p$
and computes the Krylov sequence of vectors
\[ v, {\bf A} v, {\bf A}^2 v, {\bf A}^3 v, ..., {\bf A}^n v \]
stopping when a linear dependence between them is found.
This linear dependence provides a factor of the minimal polynomial of
$\mathbf{A}$.

Here is our modular algorithm.
\bigskip

\noindent
{\bf Input:} Matrix $\mathbf{A} \in \mathbb{Z}^{n \times n}$ \\
{\bf Output:} Characteristic polynomial $c(x) = \det(x \mathbf{I} - \mathbf{A}) \in \mathbb{Z}[x]$
\begin{enumerate}
\item Compute a bound $S$ (see below) larger than the magnitude of the largest coefficient of $c(x)$.
\item Choose $t$ machine primes $p_1, p_2,\ldots, p_t $ such that
$ m = \prod_{i=1}^t p_i > 2S . $
\item {\bf for} $i=1$ {\bf to} $t$ {\bf do}
\begin{enumerate}
\item $\mathbf{A}_i \gets \mathbf{A} \bmod p_i.$
\item Compute $c_i(x)$ --- the characteristic polynomial of $\mathbf{A}_i$
over $\mathbb{Z}_{p_i}$ via the Hessenberg algorithm.
\end{enumerate}
\item Apply the Chinese remainder theorem: \\
\hspace*{5mm} Solve $c(x) \equiv c_i(x) \pmod {p_i}$ for $c(x) \in \mathbb{Z}_m[x].$
\item Output $c(x)$ in the symmetric range for $\mathbb{Z}_m.$
\end{enumerate}

\noindent
We can compute a bound $S$ for the largest coefficient of $c(x)$ by modifying
a bound for the determinant of $\mathbf{A}$. We have
\[ |\det(\mathbf{A})| \le n! \prod_{i=1}^n \max_{j=1}^n |a_{i,j}| \]
so for $S$ we may take this bound applied to the matrix $\mathbf{A'}$ where
\[ a'_{i,j} = \cases{1+ \left| a_{{i,i}} \right| &$i=j$\cr a_{{i,j}}&otherwise.} \]
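\noindent
The number of primes $t$ needed in step 2 can be estimated with logarithms,
avoiding any large integer arithmetic.
A sketch in C, assuming 25-bit primes as in the floating point versions of
section 3 (the routine name is illustrative only, not from our implementation),
might look as follows.
\begin{verbatim}
#include <math.h>

/* Estimate the number t of 25-bit primes needed so that
   p_1*p_2*...*p_t > 2*S, where S is the coefficient bound
   n! * prod_i max_j |a'_{i,j}|.  We work with log2 of the bound.
   The matrix A is stored row-major, A[i*n+j] = a_{i+1,j+1}. */
int num_primes_needed(int n, const long *A)
{
    double log2S = 0.0;
    for (int i = 0; i < n; i++) {
        double m = 1.0 + fabs((double)A[i*n + i]);   /* 1 + |a_ii| */
        for (int j = 0; j < n; j++)                  /* max_j |a'_{i,j}| */
            if (j != i && fabs((double)A[i*n + j]) > m)
                m = fabs((double)A[i*n + j]);
        log2S += log2(m);
    }
    for (int i = 2; i <= n; i++)                     /* add log2(n!) */
        log2S += log2((double)i);
    /* each 25-bit prime contributes at least 24 bits to the product */
    return (int)ceil((log2S + 1.0) / 24.0);          /* +1 for the factor 2 */
}
\end{verbatim}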
\subsection{Hessenberg Algorithm}
Recall that a square matrix $\mathbf{M} = (m_{i,j})$ is in upper Hessenberg
form if $m_{i,j}=0$ for all $i \ge j+2,$ in other words, the entries below
the first subdiagonal are zero.
\[ \left( \begin{array}{ccccccc}
m_{1,1} & m_{1,2} & m_{1,3} & \cdots & m_{1,n-2} & m_{1,n-1} & m_{1,n} \\
m_{2,1} & m_{2,2} & m_{2,3} & \cdots & m_{2,n-2} & m_{2,n-1} & m_{2,n} \\
0 & m_{3,2} & m_{3,3} & \cdots & m_{3,n-2} & m_{3,n-1} & m_{3,n} \\
0 & 0 & m_{4,3} & \cdots & m_{4,n-2} & m_{4,n-1} & m_{4,n} \\
\vdots & \vdots & \ddots & \ddots & \vdots & \vdots & \vdots \\
0 & 0 & 0 & \ddots & m_{n-1,n-2} & m_{n-1,n-1} & m_{n-1,n} \\
0 & 0 & 0 & \cdots & 0 & m_{n,n-1} & m_{n,n} \\
\end{array} \right) \]
\noindent
The Hessenberg algorithm (see \cite{cohen}) consists of the following two main steps:
\begin{description}
\item[Step 1:]
Reduce the matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ into upper
Hessenberg form using a series of elementary row and column operations while
preserving the characteristic polynomial.
In the algorithm below, $\mathbf{R}_i$ denotes the $i$'th row of $\mathbf{M}$
and $\mathbf{C}_j$ the $j$'th column of $\mathbf{M}$.
In total, $O(n^3)$ operations in $\mathbb{Z}_p$ are performed.

\bigskip
\noindent
{\bf Input:} Matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ \\
{\bf Output:} Matrix $\mathbf{M}$ in upper Hessenberg form with the same eigenvalues
\begin{tabbing}
000\=000\=000\=000\=\kill
{\bf for} $j=1$ {\bf to} $n-2$ {\bf do} \\
\>search for a nonzero entry $m_{i,j}$ where $j+2 \le i \le n$ \\
\>{\bf if} found such entry {\bf then} \\
\>\>{\bf do} $\mathbf{R}_i \leftrightarrow \mathbf{R}_{j+1}$ and $\mathbf{C}_i \leftrightarrow \mathbf{C}_{j+1}$ {\bf if} $m_{j+1,j}=0$ \\
\>\>{\bf for} $k=j+2$ {\bf to} $n$ {\bf do} \\
\>\>\>{\bf comment} reduce using $m_{j+1,j}$ as pivot \\
\>\>\>$u \gets m_{k,j}\, {m_{j+1,j}}^{-1}$ \\
\>\>\>$\mathbf{R}_k \gets \mathbf{R}_k - u \mathbf{R}_{j+1}$ \\
\>\>\>$\mathbf{C}_{j+1} \gets \mathbf{C}_{j+1} + u \mathbf{C}_k$ \\
\>\>{\bf end for} \\
\>{\bf end if} \\
\>{\bf comment} now the first $j$ columns of $\mathbf{M}$ are in upper Hessenberg form \\
{\bf end for} \\
\end{tabbing}
It is clear from the algorithm that at each step of the outer for loop, we are
performing elementary row and column operations, and that at the termination
of the outer for loop, the entire matrix is reduced into upper Hessenberg form.
Let $\mathbf{H}$ be the matrix $\mathbf{M}$ reduced into upper Hessenberg form,
let the elementary matrix $\mathbf{E}_i$ represent the $i$'th elementary row
operation performed by the algorithm, say there are $s$ of them in total, and
let $\mathbf{E} = \mathbf{E}_s \cdots \mathbf{E}_2 \mathbf{E}_1$.
We can write $\mathbf{H} = \mathbf{E} \mathbf{M} \mathbf{E}^{-1}$ since each
elementary column operation that we perform in the algorithm is the inverse of
the corresponding elementary row operation.
Thus, this is a similarity transformation.
To see that $\mathbf{H}$ has the same characteristic polynomial as the matrix
$\mathbf{M},$ note that $\mathbf{H} = \mathbf{E} \mathbf{M} \mathbf{E}^{-1}$ implies
\[ \begin{array}{r@{\;=\;}l}
\det(x \mathbf{I} - \mathbf{H}) & \det(x \mathbf{I} - \mathbf{E} \mathbf{M} \mathbf{E}^{-1}) \\
& \det(\mathbf{E} (x \mathbf{I}) \mathbf{E}^{-1} - \mathbf{E} \mathbf{M} \mathbf{E}^{-1}) \\
& \det(\mathbf{E} (x \mathbf{I} - \mathbf{M}) \mathbf{E}^{-1}) \\
& \det(\mathbf{E}) \det(x \mathbf{I} - \mathbf{M}) \det(\mathbf{E}^{-1}) \\
& \det(\mathbf{E}) \det(x \mathbf{I} - \mathbf{M}) \det (\mathbf{E})^{-1} \\
& \det(x \mathbf{I} - \mathbf{M}).
\end{array} \]
\bigskip
\item[Step 2:]
The characteristic polynomial $c(x) = p_{n+1}(x) \in \mathbb{Z}_p[x]$ of the
upper Hessenberg form can be efficiently computed bottom up using $O(n^2)$
operations in $\mathbb{Z}_p[x]$ ($O(n^3)$ operations in $\mathbb{Z}_p$) from
the following recurrence for $p_k(x).$
\[ p_{k+1}(x) = \cases{ 1 & $k=0$ \cr
(x-m_{k,k})\,p_k(x) - \sum\limits_{i=1}^{k-1} (\prod\limits_{j=i}^{k-1} m_{j+1,j})\, m_{i,k}\, p_i(x) & $1 \le k \le n$ } \]
\noindent
The algorithm below computes the above recurrence bottom up, and clearly shows
that $O(n^2)$ operations in $\mathbb{Z}_p[x]$ are required.
\noindent
{\bf Input:} Matrix $\mathbf{M} \in \mathbb{Z}_p^{n \times n}$ in upper Hessenberg form\\
{\bf Output:} Characteristic polynomial $c(x) \in \mathbb{Z}_p[x]$ of $\mathbf{M}$
\begin{tabbing}
000\=000\=000\=000\=\kill
$p_1(x) \gets 1$ \\
{\bf for} $k=1$ {\bf to} $n$ {\bf do} \\
\>$p_{k+1}(x) \gets (x-m_{k,k})\,p_k(x)$ \\
\>$t \gets 1$ \\
\>{\bf for} $i=1$ {\bf to} $k-1$ {\bf do} \\
\>\>$t \gets t\,m_{k-i+1,k-i}$ \\
\>\>$p_{k+1}(x) \gets p_{k+1}(x) - t\,m_{k-i,k}\,p_{k-i}(x)$ \\
\>{\bf end for} \\
{\bf end for} \\
{\bf output} $p_{n+1}(x)$ \\
\end{tabbing}
\end{description}
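\noindent
For concreteness, here is a sketch in C of the two steps over $\mathbb{Z}_p$.
It is an illustration only, not the code used in our implementation: the
matrix is stored row-major, 64-bit signed integers are used so that products
of reduced values do not overflow for $p < 2^{31}$, every result is reduced
immediately (the delayed mod improvements of section 3 are omitted), and the
routine names are chosen here for exposition.
\begin{verbatim}
#include <stdlib.h>

typedef long long i64;
#define E(i,j) M[(size_t)(i)*n + (j)]      /* row-major access */

/* a^(-1) mod p by the extended Euclidean algorithm, 0 < a < p */
static i64 inv_mod(i64 a, i64 p)
{
    i64 r0 = p, r1 = a, s0 = 0, s1 = 1;
    while (r1 != 0) {
        i64 q = r0 / r1, t;
        t = r0 - q*r1;  r0 = r1;  r1 = t;
        t = s0 - q*s1;  s0 = s1;  s1 = t;
    }
    return (s0 % p + p) % p;
}

/* Step 1: reduce M (entries in [0,p)) to upper Hessenberg form by
   simultaneous row and column operations, preserving c(x). */
void hessenberg(i64 *M, int n, i64 p)
{
    for (int j = 0; j + 2 < n; j++) {
        if (E(j+1, j) == 0)                 /* need a nonzero pivot */
            for (int i = j + 2; i < n; i++)
                if (E(i, j) != 0) {         /* swap R_i <-> R_{j+1} */
                    for (int c = 0; c < n; c++)
                        { i64 t = E(i,c); E(i,c) = E(j+1,c); E(j+1,c) = t; }
                    for (int r = 0; r < n; r++)   /* and C_i <-> C_{j+1} */
                        { i64 t = E(r,i); E(r,i) = E(r,j+1); E(r,j+1) = t; }
                    break;
                }
        if (E(j+1, j) == 0) continue;       /* column j already reduced */
        i64 inv = inv_mod(E(j+1, j), p);
        for (int k = j + 2; k < n; k++) {
            i64 u = E(k, j) * inv % p;      /* multiplier for the pivot */
            for (int c = 0; c < n; c++)     /* R_k <- R_k - u R_{j+1} */
                E(k, c) = ((E(k, c) - u * E(j+1, c)) % p + p) % p;
            for (int r = 0; r < n; r++)     /* C_{j+1} <- C_{j+1} + u C_k */
                E(r, j+1) = (E(r, j+1) + u * E(r, k)) % p;
        }
    }
}

/* Step 2: compute c[0..n], the coefficients of det(xI - M), for an
   upper Hessenberg matrix M, using the recurrence for p_{k+1}(x). */
void charpoly(const i64 *M, int n, i64 p, i64 *c)
{
    /* row k-1 of P holds the coefficients of p_k(x), of degree k-1 */
    i64 *P = calloc((size_t)(n+1)*(n+1), sizeof(i64));
    P[0] = 1;                               /* p_1(x) = 1 */
    for (int k = 1; k <= n; k++) {
        i64 *pk  = P + (size_t)(k-1)*(n+1);
        i64 *pk1 = P + (size_t)k*(n+1);
        i64 a = E(k-1, k-1);                /* m_{k,k} in 1-based terms */
        for (int d = k; d >= 0; d--)        /* (x - m_{k,k}) p_k(x) */
            pk1[d] = (((d > 0 ? pk[d-1] : 0) - a * pk[d]) % p + p) % p;
        i64 t = 1;
        for (int i = k-1; i >= 1; i--) {    /* subtract t m_{i,k} p_i(x) */
            t = t * E(i, i-1) % p;          /* t = prod_{j=i}^{k-1} m_{j+1,j} */
            i64 b = t * E(i-1, k-1) % p;
            const i64 *pi = P + (size_t)(i-1)*(n+1);
            for (int d = 0; d < i; d++)
                pk1[d] = ((pk1[d] - b * pi[d]) % p + p) % p;
        }
    }
    for (int d = 0; d <= n; d++) c[d] = P[(size_t)n*(n+1) + d];
    free(P);
}
\end{verbatim}
\noindent
The inner loops of \verb"hessenberg" account for almost all of the running
time, so it is there that the choice of representation for $\mathbb{Z}_p$
arithmetic, discussed next, matters most.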
\section{Timings and Implementations}
For a machine prime $p$, in order to improve the running time of our
algorithm, we have implemented the Hessenberg algorithm over $\mathbb{Z}_p$ in
the C programming language and the rest of the algorithm in Maple.
We used the Maple external function interface to call the C code (see \cite{maple}).
We have implemented both 32-bit and 64-bit integer versions, and also several
versions using 64-bit double precision floating point values for comparison.
The following table consists of some timings (in seconds) of our modular
Hessenberg algorithm for a sparse $364\times364$ input matrix arising from an
application in combinatorics (see \cite{quaintance}).
Rows 1--9 below are for the modular algorithm using different implementations
of arithmetic for $\mathbb{Z}_p.$
The accelerated floating point version {\bf fprem} using 25-bit primes
generally gives the best times.
\begin{center}
\begin{tabular}{|l||*{5}{r @{} l|}} \hline
Versions & \multicolumn{2}{l|}{{ Xeon}} & \multicolumn{2}{l|}{{ Opteron}} & \multicolumn{2}{l|}{{ AXP2800}} & \multicolumn{2}{l|}{{ Pentium M}} & \multicolumn{2}{l|}{{ Pentium 4}} \\
& \multicolumn{2}{l|}{{ 2.8 GHz}} & \multicolumn{2}{l|}{{ 2.0 GHz}} & \multicolumn{2}{l|}{{ 2.08 GHz}} & \multicolumn{2}{l|}{{ 2.00 GHz}} & \multicolumn{2}{l|}{{ 2.80 GHz}} \\ \hline \hline
{\bf 64int} & 100&.7 & 107&.4 & & & & & & \\ \hline
{\bf 32int} & 66&.3 & 73&.0 & 76&.8 & 35&.6 & 57&.4 \\ \hline
{\bf new 32int} &49&.7 & 54&.7 & 56&.3 & 25&.5 & 39&.6 \\ \hline
{\bf fmod} & 29&.5 & 32&.1 & 33&.0 & 35&.8 & 81&.1 \\ \hline
{\bf trunc} & 67&.8 & 73&.7 & 69&.6 & 88&.5 & 110&.6 \\ \hline
{\bf modtr} & 56&.3 & 62&.5 & 59&.5 & 81&.0 & 82&.6 \\ \hline
{\bf new fmod} & 11&.0 & 11&.6 & 14&.5 & 15&.2 & 28&.8 \\ \hline
{\bf fprem} & 10&.4 & 10&.9 & 13&.7 & 13&.9 & 26&.8 \\ \hline
{\bf fLA} & 17&.6 & 19&.9 & 21&.9 & 26&.2 & 27&.3 \\ \hline
{\bf Berkowitz} & 2053&.6 & 2262&.6 & & & & & & \\ \hline
\end{tabular}
\end{center}

\bigskip
{\large \bf \noindent Implementations}
\begin{description}
\item[64int] The 64-bit integer version is implemented using the
\emph{long long int} datatype in C, or equivalently the \emph{integer[8]}
datatype in Maple. We can use 32-bit primes.
All modular arithmetic first executes the corresponding 64-bit integer
machine instruction, then reduces the result mod $p$ since we work in
$\mathbb{Z}_p$.
We allow both positive and negative integers of magnitude less than $p$.
\item[32int] The 32-bit integer version is similar, but implemented using the
\emph{long int} datatype in C, or equivalently the \emph{integer[4]} datatype
in Maple. 16-bit primes are used here.
\item[new 32int] This is an improved {\bf 32int}, with various hand/compiler
optimizations.
\item[fmod] This 64-bit float version is implemented using the \emph{double}
datatype in C, or equivalently the \emph{float[8]} datatype in Maple.
64-bit float operations are used to simulate integer operations.
Operations such as additions, subtractions, and multiplications are followed
by a call to the C library function \emph{fmod()} to reduce the results mod
$p$, since we are working in $\mathbb{Z}_{p}$.
We allow both positive and negative floating point representations of
integers with magnitude less than $p$.
\item[trunc] This 64-bit float version is similar to the above, but uses the
C library function \emph{trunc()} instead of \emph{fmod()}.
To compute $b \gets a \bmod p$, we first compute
$c \gets a-p \times \emph{trunc(} a/p \emph{)}$, then set $b \gets c$ if
$c \neq \pm p$, and $b \gets 0$ otherwise.
The \emph{trunc()} function rounds towards zero to the nearest integer.
\item[modtr] A modified {\bf trunc} version, where we do not do the extra
check for equality to $\pm p$ at the end.
So to compute $b \gets a \bmod p$, we actually compute
$b \gets a-p \times \emph{trunc(} a/p \emph{)}$, which results in
$-p \leq b \leq p$.
\item[new fmod] An improved {\bf fmod} version, where we have reduced the
number of times \emph{fmod()} is called.
In other words, we reduce the results mod $p$ only when the number of
accumulated arithmetic operations on an entry exceeds a certain threshold.
In order to allow this, we are restricted to 25-bit primes.
We call this delayed mod acceleration. See the next subsection.
\item[fprem] Equivalent to the {\bf new fmod} version, but via direct assembly
programming using the {\it fprem} instruction, removing the function call
overhead and making some efficiency improvements.
\item[fLA] An improved {\bf trunc} version using delayed mod acceleration.
It is the default used in Maple's \verb"LA:-Modular" routines.
\end{description}

\subsection{Efficiency Considerations}
There are a few considerations for the use of floating point for mod $p$
computations.
Keeping these in mind, one can implement faster external code for the
algorithms than is possible with the integer codes, and still have the
advantage of using larger primes on 32-bit machines.
\begin{enumerate}
\item Although floating point integer computations can represent $53$-bit
numbers accurately, we restrict the modulus to $p < 2^{25}$, which allows for
more efficient mod operations, and allows multiple arithmetic operations
(up to 8) to occur before having to reduce modulo $p$.
We call this the delayed mod acceleration.
\item Leveraging the smaller primes allows up to 8 computations (using a
maximal size prime) to occur before we must perform a $\bmod$.
This can be efficiently utilized in row-based algorithms, as a counter
associated with each row can count the number of operations performed, and
the algorithms can be made to only perform the mod once the maximal number
of computations is reached.
\item Floating point computations have a number of ways in which a mod can be
performed, including, but not limited to: subtracting $p\,\lfloor a/p \rfloor$
from $a$, where $a/p$ is computed by multiplying by the precomputed inverse
of the modulus; the floating point mod operations {\it fmod} or {\it fprem};
or using {\it trunc}.
\end{enumerate}

\noindent
In our experiments we found the following:
Use of the smaller primes, and the delayed mod, mentioned in items $1$ and
$2$ above increased performance by a factor of $2$--$3$.
With these modifications, use of floating point modular arithmetic generally
demonstrated better performance than integer modular arithmetic.
The use of the C library {\it fmod} function or direct assembly programming
using the {\it fprem} instruction (essentially equivalent, modulo function
call overhead and some efficiency improvements made available for our
specific use of {\it fmod}) showed better performance than the other floating
point schemes, except on the Pentium 4, on which it was approximately equal.
Note also that on the Pentium M the {\it fprem} performance was nearly a
factor of 2 better.
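To make the delayed mod acceleration concrete, consider an inner product
modulo $p$, the kernel of the row operations in the Hessenberg reduction.
A minimal sketch in C, assuming $p < 2^{25}$ and both vectors already reduced
mod $p$ (the routine name is illustrative only):
\begin{verbatim}
#include <math.h>

/* Inner product of a and b mod p with delayed reduction.  With
   p < 2^25 each product is at most (p-1)^2 < 2^50, and a carried
   remainder plus 8 such products stays below 2^53, so the double
   precision accumulator s is always exact. */
double dot_mod(const double *a, const double *b, int n, double p)
{
    double s = 0.0;
    int pending = 0;           /* products added since the last mod */
    for (int i = 0; i < n; i++) {
        s += a[i] * b[i];
        if (++pending == 8) { s = fmod(s, p); pending = 0; }
    }
    return fmod(s, p);
}
\end{verbatim}
\noindent
Replacing the calls to \emph{fmod()} by direct use of the {\it fprem}
instruction gives the idea behind the {\bf fprem} scheme.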
\subsection{Timings for Dense Matrices}
The following table consists of some timings (in seconds) of our modular
Hessenberg algorithm using the float ({\bf fprem}) and integer
({\bf new 32int}) implementations on dense $n \times n$ matrices, with
uniformly random integer entries between $-$999 and 999.
We also compare with Maple's Berkowitz algorithm.
The timings were done on a dual Opteron 2.2 GHz processor running Unix.
\begin{center}
\begin{tabular}{|l||*{3}{r @{} l|}} \hline
n & \multicolumn{2}{l|}{{ float}} & \multicolumn{2}{l|}{{ integer}} & \multicolumn{2}{l|}{{ Berkowitz}} \\ \hline \hline
50 & $<$0&.1 & 0&.13 & 7&.85 \\ \hline
100 & 0&.61 & 1&.85 & 128&.6 \\ \hline
200 & 8&.82 & 30&.8 & 2248&.1 \\ \hline
300 & 45&.4 & 153&.2 & & \\ \hline
400 & 173&.5 & 493&.4 & & \\ \hline
600 & 1195&.2 & 2973&.1 & & \\ \hline
800 & 4968&.8 & & & & \\ \hline
\end{tabular}
\end{center}
\noindent
Note that the Berkowitz algorithm takes longer on a dense $200 \times 200$
integer matrix than on the sparse $364 \times 364$ matrix of the previous
subsection, a consequence of the cost of the large integer arithmetic.
As the timings show, Maple's Berkowitz algorithm is much slower than the
modular implementations.
From the above data, we can see that the float version is always faster than
the integer version (about 3 times faster).
Therefore in practice, we would always use the float version.

\section{Improvements to the Basic Algorithm}
A method to improve the running time further would be to use the early
termination technique described below. Consider the following matrix
\[ \mathbf{A} = \left( \begin{array}{cccc}
1 & u & v & w \\
0 & 2 & x & y \\
0 & 0 & 3 & z \\
0 & 0 & 0 & 4 \\
\end{array} \right). \]
The characteristic polynomial of $\mathbf{A}$ is
$c(x) = (x-1)(x-2)(x-3)(x-4) = x^4-10x^3+35x^2-50x+24.$
Notice that the largest coefficient of $c(x)$ does not depend on any of the
entries $u, v, w, x, y, z.$
So if $u, v, w, x, y, z$ are large, then the bound $S$ for the largest
coefficient of $c(x)$ would be arbitrarily far off.
Our modular algorithm would use too many primes and would do too many
unnecessary modular computations.

This observation suggests that we use an output sensitive version of the
algorithm and not use a bound at all.
We will incrementally apply the Chinese remainder theorem to reconstruct
$c(x)$ and stop the Chinese remaindering once $\zeta$ consecutive modular
images ``remain the same''.
Let $p_1, p_2,\ldots, p_s, p_{s+1}, \ldots, p_{s+\zeta}$ be machine primes,
and let $c_i(x) \equiv c(x) \pmod{p_i}$ and
$C_i(x) \equiv c(x) \pmod{p_1 \cdots p_i}$.
Application of the Chinese remainder theorem allows us to construct $C_i(x)$
from $c_i(x)$ and $C_{i-1}(x)$.
This is an incremental version of Garner's algorithm.
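One step of this incremental Chinese remaindering, applied to a single
coefficient, can be sketched as follows.
In the actual computation the modulus $m = p_1 \cdots p_{i-1}$ and the
coefficients of $C_{i-1}(x)$ are big integers handled in Maple; the 64-bit
types and the routine name below are for illustration only.
\begin{verbatim}
typedef long long i64;

i64 inv_mod(i64 a, i64 p);   /* extended Euclid, as sketched earlier */

/* Given C, a coefficient of C_{i-1}(x) with 0 <= C < m, and c, the
   corresponding coefficient of c_i(x) mod p, with gcd(m,p) = 1,
   return the unique value in [0, m*p) congruent to C mod m and to
   c mod p, i.e. the coefficient of C_i(x). */
i64 crt_step(i64 C, i64 m, i64 c, i64 p)
{
    /* find u in [0,p) with C + u*m = c (mod p) */
    i64 u = (((c - C) % p + p) % p) * inv_mod(m % p, p) % p;
    return C + u*m;
}
\end{verbatim}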
Now suppose that $C_s(x) = C_{s+1}(x) = \cdots = C_{s+\zeta}(x).$
Then there is a high probability that $c(x) = C_s(x).$
Choosing $\zeta$ carefully will ensure that the probability of premature
termination of the Chinese remaindering is low.
This output sensitive probabilistic version of the modular algorithm is much
faster than the deterministic version when the largest coefficient of the
characteristic polynomial is much smaller than the bound.
On the sparse $364 \times 364$ example in the previous section, the timing
improves by about 30\%.

\subsection{Strongly Connected Components}
For the example above, the Hessenberg algorithm does not need to do any work
since $\mathbf{A}$ is already in upper Hessenberg form.
Now consider the matrix $\mathbf{A}^T$, i.e.,
\[ \mathbf{A}^T = \left( \begin{array}{cccc}
1 & 0 & 0 & 0 \\
u & 2 & 0 & 0 \\
v & x & 3 & 0 \\
w & y & z & 4 \\
\end{array} \right) \]
which has the same characteristic polynomial.
Here the algorithm would be required to do some actual reductions.
This observation led us to ask the following question:
For what matrix structures can we compute the characteristic polynomial quickly?
Consider a matrix of the form
\[ \mathbf{B} = \left( \begin{array}{cccc}
a & b & w & x \\
c & d & y & z \\
0 & 0 & e & f \\
0 & 0 & g & h \\
\end{array} \right). \]
Let $c_\mathbf{B}(x)$ be the characteristic polynomial of $\mathbf{B}$.
Then $c_\mathbf{B}(x) = c_{\mathbf{B}_1}(x) c_{\mathbf{B}_2}(x)$ where
\[ \mathbf{B}_1 = \left( \begin{array}{cc}
a & b \\
c & d \\
\end{array} \right)
~{\rm and}~
\mathbf{B}_2 = \left( \begin{array}{cc}
e & f \\
g & h \\
\end{array} \right). \]
If the input matrix is block upper (lower) triangular, then the computation
of the characteristic polynomial is quick, since the characteristic
polynomial is just the product of the characteristic polynomials of the
diagonal blocks.
But often matrices are not block upper (lower) triangular, but are row and
column permutations of block upper (lower) triangular matrices.
For example,
\[ \mathbf{P} = \left( \begin{array}{cccc}
h & 0 & g & 0 \\
z & d & y & c \\
f & 0 & e & 0 \\
x & b & w & a \\
\end{array} \right) \]
is the matrix $\mathbf{B}$ with rows 1,4 and columns 1,4 interchanged, and is
no longer block upper triangular.
The main observation here is that simultaneous row and column interchanges do
not modify the characteristic polynomial (the arguments of section 2.1 also
work here), so that if the rows and columns of a matrix are permuted by the
same permutation, then the characteristic polynomial remains unchanged.

In order to apply the above observation, we need to efficiently compute the
permutation that would make the matrix block upper triangular.
It turns out that we can compute this permutation in linear time by finding
the strongly connected components of a directed graph.
Let $\mathbf{A}$ be an $n \times n$ matrix.
We denote by $Graph(\mathbf{A})$ the weighted directed graph with $n$
vertices such that $\mathbf{A}$ is the adjacency matrix of $Graph(\mathbf{A})$.
Recall that a directed graph $G$ is strongly connected if for each ordered
pair of vertices $u, v \in G, u \ne v$, there exists a path $u \leadsto v.$
Also, every directed graph can be partitioned into maximal strongly connected
components.
Note that permuting the vertices corresponds to permuting the rows and
columns of $\mathbf{A}.$
Denote by $\mathbf{A}_{(u_1,u_2,...,u_r),(v_1,v_2,...,v_s)}$ the $r \times s$
submatrix of $\mathbf{A}$ such that
\[(\mathbf{A}_{(u_1,u_2,...,u_r),(v_1,v_2,...,v_s)})_{i,j} = \mathbf{A}_{u_i,v_j}.\]
The method works as follows:
\begin{enumerate}
\item Compute (see below) the $k$ strongly connected components of
$Graph(\mathbf{A}): V_1, V_2, ..., V_k$ where
$V_i = \{{v_i}_1,{v_i}_2,...,{v_i}_{n_i}\}.$
\item For $1 \le i \le k$, compute the characteristic polynomial of the
submatrix $\mathbf{A}_{({v_i}_1,{v_i}_2,...,{v_i}_{n_i}),({v_i}_1,{v_i}_2,...,{v_i}_{n_i})}. $
We denote these submatrices as $\mathbf{A}_{V_i,V_i}.$
\item Output the product of the characteristic polynomials computed in
step 2; this product is the characteristic polynomial $c(x)$ of $\mathbf{A}$.
\end{enumerate}
To see why the above method works, let the $k$ strongly connected components
of $G = Graph(\mathbf{A})$ be $V_1, ..., V_k$.
We can topologically sort the $k$ strongly connected components such that
$V_i \prec V_j$ implies there does not exist $u \in V_i, v \in V_j$ such that
$(v,u) \in G$.
Without loss of generality assume $V_1 \prec V_2 \prec ... \prec V_k.$
Consider relabeling the vertices in increasing topological order (which is
not unique) and consider the adjacency matrix $\mathbf{A}'$ that represents
the relabeled graph.
First, it is clear that $\mathbf{A}'$ has the same characteristic polynomial
as $\mathbf{A}$ since $\mathbf{A}'$ is obtained by permuting rows and columns
of $\mathbf{A}$.
Note that the $(i,j)$'th blocks of $\mathbf{A}'$, denoted by
$\mathbf{A}'_{i,j}$, are precisely the submatrices $\mathbf{A}_{V_i,V_j}$.
Therefore, the diagonal blocks $\mathbf{A}'_{i,i}$ are precisely the
$\mathbf{A}_{V_i,V_i}$.
Furthermore, the relabeling guarantees that $\mathbf{A}'_{i,j} = \mathbf{0}$
for $i > j$, that is, $\mathbf{A}'$ is block upper triangular, and hence
\[ c(x) = \prod_{i=1}^{k} c_{\mathbf{A}_{V_i,V_i}}(x). \]
The strongly connected components can be computed in time linear in the
number of vertices and edges of $Graph(\mathbf{A})$ using Tarjan's algorithm.
The following Maple procedure, which we call \verb"CharPolyBlocks" here,
computes the strongly connected components of $Graph(\mathbf{M})$ and returns
the corresponding diagonal blocks.
\begin{verbatim}
CharPolyBlocks := proc(M::Matrix)
local StrongConnect, A, n, m, i, k, u, top,
      Stack, Number, Lowlink, OnStack, SCC;
  StrongConnect := proc(v) local w, t, j;
    i := i+1;
    Number[v] := i;  Lowlink[v] := i;
    top := top+1;  Stack[top] := v;  OnStack[v] := true;
    for w in A[v] do
      if Number[w] = 0 then              # tree edge: recurse
        StrongConnect(w);
        Lowlink[v] := min(Lowlink[v],Lowlink[w]);
      elif Number[w] < Number[v] and OnStack[w] then
        Lowlink[v] := min(Lowlink[v],Number[w]);
      end if
    end do;
    if Lowlink[v] = Number[v] then       # v is the root of a component
      t := top;
      while t>0 and Number[Stack[t]] >= Number[v] do
        OnStack[Stack[t]] := false; t := t-1;
      end do;
      SCC[k] := [seq(Stack[j],j=t+1..top)];
      top := t;
      k := k+1
    end if
  end proc:
  n,m := op(1,M);
  A := [seq([seq(`if`(M[i,j]=0,NULL,j),j=1..n)],i=1..n)];
  i := 0;  k := 1;
  Stack := Array(1..n);  top := 0;
  Number := Array(1..n);  Lowlink := Array(1..n);
  OnStack := Array(1..n,fill=false);
  for u to n do
    if Number[u] = 0 then StrongConnect(u) end if
  end do;
  [seq(`if`(nops(SCC[i])=1 and M[op(SCC[i]),op(SCC[i])]=0,
            NULL, M[SCC[i],SCC[i]]),i=1..k-1)]
end proc:
\end{verbatim}
The output is a list of non-zero square matrices
$\mathbf{A}_1,\mathbf{A}_2,...,\mathbf{A}_r$.
Let $m=\sum_{i=1}^r {\rm dim}(\mathbf{A}_i).$
If the input $\mathbf{M}$ is an $n \times n$ matrix, the output satisfies
\[c_{\mathbf{M}}(x) = x^{n-m} \prod_{i=1}^r c_{\mathbf{A}_i}(x)\]
where $c_{\mathbf{A}}(x)$ is the characteristic polynomial of $\mathbf{A}$.
Running this on the sparse $364 \times 364$ matrix yields 12 blocks of sizes
$\{5,9,10,22,48,54,76,93\}.$ It takes 0.25 seconds.
Note that the line
\begin{center}
\verb"A:=[seq([seq(`if`(M[i,j]=0,NULL,j),j=1..n)],i=1..n)]"
\end{center}
took 0.20 out of a total of 0.25 seconds.
Therefore if the matrix is sparse, one needs to implement more efficient
procedures for extracting the column or row indices of the nonzero entries
of a matrix.

In total, the two improvements mentioned above have reduced the time for
computing the characteristic polynomial for the sparse $364 \times 364$
example to under 0.5 seconds, compared to 10.4 seconds without the
improvements!

\section{Asymptotic Comparison of the Methods}
Let $\mathbf{A}$ be an $n \times n$ matrix of integers.
To compare the running times of the Berkowitz algorithm and the modular
algorithm, we suppose that the entries of $\mathbf{A}$ are bounded by $B^m$
in magnitude, that is, they are $m$ base $B$ digits in length.
For both algorithms, we need a bound $S$ on the size of the coefficients of
the characteristic polynomial $c(x)$.
The bound of section 2 has the same asymptotic size as the generic bound for
the determinant of $\mathbf{A}$, so we use the latter here.
The magnitude of the determinant of $\mathbf{A}$ is bounded by
$S = n! B^{mn}$ and its length is bounded by $n \log_B n + m n$ base $B$
digits.
If $B > 2^{15}$ then we may assume $\log_B n < 2$ in practice and hence the
length of the determinant is $O(m n)$ base $B$ digits.

In Berkowitz's algorithm, the $O(n^4)$ integer multiplications are on
integers of average size $O(m n)$ digits in length, hence the complexity
(assuming classical integer arithmetic is used) is
$O(n^4 (mn)^2) = O(m^2 n^6)$.
Since Maple 9 uses the FFT for integer multiplication and division, the
complexity is reduced to $\tilde{O}(n^5 m)$.

In the modular algorithm, we will need $O(mn)$ machine primes.
The cost of reducing the $n^2$ integers in $\mathbf{A}$ modulo one prime is
$O(m n^2)$.
The cost of computing the characteristic polynomial modulo each prime $p$ is
$O(n^3)$.
The cost of the Chinese remaindering, assuming a classical method for the
Chinese remainder algorithm (which is what Maple uses), is $O(n (mn)^2)$.
Thus the total complexity is
$O(mn \cdot m n^2 + mn \cdot n^3 + n (mn)^2) = O(m^2 n^3 + m n^4)$.

If we assume $m=O(n),$ that is, the size of the integers grows proportionally
with the size of the matrix, the complexity of the Berkowitz algorithm and
the modular algorithm is $\tilde{O}(n^6)$ and $O(n^5)$ respectively.
If we have $|A_{i,j}| < B$, that is, $m = 1$, the complexities are
$\tilde{O}(n^5)$ and $O(n^4)$ respectively.