\begin{thebibliography}{100} \bibitem{aboelaze91a} {\sc M.~Aboelaze, N.~Chrisochoides, and E.~Houstis}, {\em {The Parallelization of Level 2 and 3 BLAS Operations on Distributed Memory Machines}}, Tech. Rep. CSD-TR-91-007, Purdue University, West Lafayette, IN, 1991. \bibitem{agarwal94b} {\sc R.~Agarwal, F.~Gustavson, and M.~Zubair}, {\em {Improving Performance of Linear Algebra Algorithms for Dense Matrices Using Algorithmic Prefetching}}, IBM J. Res. Dev., 38 (1994), pp.~265--275. \bibitem{laug} {\sc E.~Anderson, Z.~Bai, C.~Bischof, J.~Demmel, J.~Dongarra, J.~Du~Croz, A.~Greenbaum, S.~Hammarling, A.~McKenney, S.~Ostrouchov, and D.~Sorensen}, {\em {LAPACK} Users' Guide}, Society for Industrial and Applied Mathematics, Philadelphia, PA, second~ed., 1995. \bibitem{lawn20} {\sc E.~Anderson, Z.~Bai, C.~Bischof, J.~Demmel, J.~Dongarra, J.~Du~Croz, A.~Greenbaum, S.~Hammarling, A.~McKenney, and D.~Sorensen}, {\em {LAPACK}: A portable linear algebra library for high-performance computers}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-90-105, University of Tennessee, Knoxville, TN, May 1990. \newblock (Also LAPACK Working Note \#20). \bibitem{lawn31} {\sc E.~Anderson, Z.~Bai, and J.~Dongarra}, {\em Generalized {QR} factorization and its applications}, Linear Algebra and Its Applications, 162--164 (1992), pp.~243--273. \newblock (Also LAPACK Working Note \#31). \bibitem{angus90a} {\sc I.~Angus, G.~Fox, J.~Kim, and D.~Walker}, {\em {Solving Problems on Concurrent Processors: Software for Concurrent Processors}}, vol.~2, Prentice Hall, Englewood Cliffs, NJ, 1990. \bibitem{ieee754} {\sc {ANSI/IEEE}}, {\em \mbox{IEEE~Standard~for~Binary~Floating~Point~Arithmetic}}, New York, {S}td 754-1985~ed., 1985. \bibitem{ieee854} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em \mbox{IEEE~Standard~for~Radix~Independent~Floating~Point~Arithmetic}}, New York, {S}td 854-1987~ed., 1987. \bibitem{ariolidemmelduff} {\sc M.~Arioli, J.~W. Demmel, and I.~S.
Duff}, {\em Solving sparse linear systems with sparse backward error}, {SIAM} J. Matrix Anal. Appl., 10 (1989), pp.~165--190. \bibitem{ashcraft90a} {\sc C.~Ashcraft}, {\em {The Distributed Solution of Linear Systems Using the Torus-wrap Data mapping}}, Tech. Rep. ECA-TR-147, Boeing Computer Services, Seattle, WA, 1990. \bibitem{bai92a} {\sc Z.~Bai and J.~Demmel}, {\em {Design of a parallel nonsymmetric eigenroutine toolbox, {P}art {I}}}, in Proceedings of the Sixth SIAM Conference on Parallel Processing for Scientific Computing, SIAM, 1993, pp.~391--398. \bibitem{baidemmel97} {\sc Z.~Bai and J.~Demmel}, {\em Using the matrix sign function to compute invariant subspaces}, {SIAM} J. Matrix Anal. Appl., (1997). \newblock to appear. \bibitem{lawn91} {\sc Z.~Bai, J.~Demmel, J.~Dongarra, A.~Petitet, H.~Robinson, and K.~Stanley}, {\em The spectral decomposition of nonsymmetric matrices on distributed memory computers}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-95-273, University of Tennessee, Knoxville, TN, 1995. \newblock (Also {LAPACK} {W}orking {N}ote No. 91), to appear in {SIAM} J. Sci. Stat. Comput. \bibitem{baidemmel92a} {\sc Z.~Bai and J.~W. Demmel}, {\em Design of a parallel nonsymmetric eigenroutine toolbox, {P}art {I}}, in Proceedings of the Sixth SIAM Conference on Parallel Processing for Scientific Computing, R.~F. Sincovec {\em et~al.}, eds., Philadelphia, PA, 1993, Society for Industrial and Applied Mathematics, pp.~391--398. \newblock Long version available as {C}omputer {S}cience {R}eport {CSD}-92-718, {U}niversity of {C}alifornia, {B}erkeley, 1992. \bibitem{barlowdemmel} {\sc J.~Barlow and J.~Demmel}, {\em Computing accurate eigensystems of scaled diagonally dominant matrices}, {SIAM} J. Numer. Anal., 27 (1990), pp.~762--791. \newblock (Also LAPACK Working Note \#7).
\bibitem{lawn111} {\sc J.~Bilmes, K.~Asanovic, J.~Demmel, D.~Lam, and C.~Chin}, {\em Optimizing matrix multiply using {PH}i{PAC}: A portable, high-performance, {ANSI} {C} coding methodology}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-96-326, University of Tennessee, Knoxville, TN, 1996. \newblock (Also LAPACK Working Note \#111). \bibitem{BV89} {\sc R.~H. Bisseling and J.~G.~G. {van de Vorst}}, {\em Parallel {LU} decomposition on a transputer network}, in Lecture Notes in Computer Science, Number 384, G.~A. {van Zee} and J.~G.~G. {van de Vorst}, eds., Springer-Verlag, 1989, pp.~61--77. \bibitem{blackford96a} {\sc L.~S. Blackford, J.~Choi, A.~Cleary, J.~Demmel, I.~Dhillon, J.~J. Dongarra, S.~Hammarling, G.~Henry, A.~Petitet, K.~Stanley, D.~W. Walker, and R.~C. Whaley}, {\em {ScaLAPACK}: A portable linear algebra library for distributed memory computers - design issues and performance}, in Proceedings of Supercomputing '96, Sponsored by ACM SIGARCH and IEEE Computer Society, 1996. \newblock (ACM Order Number: 415962, IEEE Computer Society Press Order Number: RS00126. http://www.supercomp.org/sc96/proceedings/). \bibitem{lawn112} {\sc L.~S. Blackford, A.~Cleary, J.~Demmel, I.~Dhillon, J.~Dongarra, S.~Hammarling, A.~Petitet, H.~Ren, K.~Stanley, and R.~C. Whaley}, {\em Practical experience in the dangers of heterogeneous computing}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-96-330, University of Tennessee, Knoxville, TN, July 1996. \newblock (Also LAPACK Working Note \#112), to appear ACM Trans. Math. Softw., 1997. \bibitem{brent92a} {\sc R.~Brent}, {\em {The LINPACK Benchmark on the AP 1000}}, in Frontiers, 1992, McLean, VA, 1992, pp.~128--135. \bibitem{brent93a} {\sc R.~Brent and P.~Strazdins}, {\em {Implementation of BLAS Level 3 and LINPACK Benchmark on the AP1000}}, Fujitsu Scientific and Technical Journal, 5 (1993), pp.~61--70. 
\bibitem{netlib1} {\sc S.~Browne, J.~Dongarra, S.~Green, E.~Grosse, K.~Moore, T.~Rowan, and R.~Wade}, {\em Netlib services and resources (rev. 1)}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-94-222, University of Tennessee, Knoxville, TN, 1994. \bibitem{netlib2} {\sc S.~Browne, J.~Dongarra, E.~Grosse, and T.~Rowan}, {\em The netlib mathematical software repository}, {D-Lib} Magazine (www.dlib.org), (1995). \bibitem{lawn93} {\sc J.~Choi, J.~Demmel, I.~Dhillon, J.~Dongarra, S.~Ostrouchov, A.~Petitet, K.~Stanley, D.~Walker, and R.~C. Whaley}, {\em Installation guide for {ScaLAPACK}}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-95-280, University of Tennessee, Knoxville, TN, March 1995. \newblock (Also LAPACK Working Note \#93). \bibitem{lawn95} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em {ScaLAPACK}: A portable linear algebra library for distributed memory computers - design issues and performance}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-95-283, University of Tennessee, Knoxville, TN, March 1995. \newblock (Also LAPACK Working Note \#95). \bibitem{lawn100} {\sc J.~Choi, J.~Dongarra, S.~Ostrouchov, A.~Petitet, D.~Walker, and R.~C. Whaley}, {\em A proposal for a set of parallel basic linear algebra subprograms}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-95-292, University of Tennessee, Knoxville, TN, May 1995. \newblock (Also LAPACK Working Note \#100). \bibitem{lawn55} {\sc J.~Choi, J.~Dongarra, R.~Pozo, and D.~Walker}, {\em {ScaLAPACK}: A scalable linear algebra library for distributed memory concurrent computers}, in Proceedings of the Fourth Symposium on the Frontiers of Massively Parallel Computation, McLean, Virginia, 1992, {IEEE} {C}omputer {S}ociety {P}ress, pp.~120--127. \newblock (Also LAPACK Working Note \#55). 
\bibitem{lawn92} {\sc J.~Choi, J.~Dongarra, and D.~Walker}, {\em The design of a parallel dense linear algebra software library: Reduction to {Hessenberg}, tridiagonal and bidiagonal form}, Numerical Algorithms, 10 (1995), pp.~379--399. \newblock (Also LAPACK Working Note \#92). \bibitem{choi94a} {\sc J.~Choi, J.~Dongarra, and D.~Walker}, {\em {PB-BLAS: A Set of Parallel Block Basic Linear Algebra Subroutines}}, Concurrency: Practice and Experience, 8 (1996), pp.~517--535. \bibitem{chtchelkanova95a} {\sc A.~Chtchelkanova, J.~Gunnels, G.~Morrow, J.~Overfelt, and R.~van~de Geijn}, {\em {Parallel Implementation of BLAS: General Techniques for Level 3 BLAS}}, Tech. Rep. TR95-49, Department of Computer Sciences, UT-Austin, 1995. \newblock Submitted to {Concurrency: Practice and Experience}. \bibitem{chu90a} {\sc E.~Chu and A.~George}, {\em {QR Factorization of a Dense Matrix on a Hypercube Multiprocessor}}, {SIAM} Journal on Scientific and Statistical Computing, 11 (1990), pp.~990--1028. \bibitem{lawn125} {\sc A.~Cleary and J.~Dongarra}, {\em Implementation in {ScaLAPACK} of divide-and-conquer algorithms for banded and tridiagonal linear systems}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-97-358, University of Tennessee, Knoxville, TN, April 1997. \newblock (Also LAPACK Working Note \#125). \bibitem{cosnard86a} {\sc M.~Cosnard, Y.~Robert, P.~Quinton, and M.~Tchuente}, eds., {\em Parallel Algorithms and Architectures}, North-Holland, 1986. \bibitem{berkeleynow} {\sc D.~E. Culler, A.~Arpaci-Dusseau, R.~Arpaci-Dusseau, B.~Chun, S.~Lumetta, A.~Mainwaring, R.~Martin, C.~Yoshikawa, and F.~Wong}, {\em Parallel computing on the {B}erkeley {NOW}}. \newblock To appear in JSPP'97 (9th Joint Symposium on Parallel Processing), Kobe, Japan, 1997. \bibitem{dayde94a} {\sc M.~Dayde, I.~Duff, and A.~Petitet}, {\em {A Parallel Block Implementation of Level 3 BLAS for MIMD Vector Processors}}, ACM Trans. Math. Softw., 20 (1994), pp.~178--193.
\bibitem{demoorvandooren92} {\sc B.~De~Moor and P.~Van~Dooren}, {\em Generalization of the singular value and {QR} decompositions}, {SIAM} J. Matrix Anal. Appl., 13 (1992), pp.~993--1014. \bibitem{demmel84} {\sc J.~Demmel}, {\em Underflow and the reliability of numerical software}, {SIAM} J. Sci. Stat. Comput., 5 (1984), pp.~887--919. \bibitem{demmelMA221} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Applied Numerical Linear Algebra}, SIAM, 1996. \newblock to appear. \bibitem{demmeleisenstatgilbertliliu} {\sc J.~Demmel, S.~Eisenstat, J.~Gilbert, X.~Li, and J.~W.~H. Liu}, {\em A supernodal approach to sparse partial pivoting}, {T}echnical {R}eport {UCB}//{CSD}-95-883, UC Berkeley Computer Science Division, September 1995. \newblock to appear in SIAM J. Matrix Anal. Appl. \bibitem{lawn86} {\sc J.~Demmel and K.~Stanley}, {\em The performance of finding eigenvalues and eigenvectors of dense symmetric matrices on distributed memory computers}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-94-254, University of Tennessee, Knoxville, TN, September 1994. \newblock (Also LAPACK Working Note \#86). \bibitem{shm_superlu97} {\sc J.~W. Demmel, J.~R. Gilbert, and X.~S. Li}, {\em An asynchronous parallel supernodal algorithm for sparse {G}aussian elimination}, February 1997. \newblock Submitted to SIAM J. Matrix Anal. Appl., special issue on Sparse and Structured Matrix Computations and Their Applications (Also LAPACK Working Note \#124). \bibitem{demmelli93} {\sc J.~W. Demmel and X.~Li}, {\em Faster numerical algorithms via exception handling}, {IEEE} Trans. Comp., 43 (1994), pp.~983--992. \newblock (Also LAPACK Working Note \#59). \bibitem{dhillon97_1} {\sc I.~S. Dhillon}, {\em Current inverse iteration software can fail}, (1997). \newblock Submitted for publication.
\bibitem{dhillonthesis} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em A Stable ${O}(n^2)$ Algorithm for the Symmetric Tridiagonal Eigenproblem}, PhD thesis, University of California, Berkeley, CA, May 1997. \bibitem{dhillonparlett97} {\sc I.~S. Dhillon and B.~Parlett}, {\em Orthogonal eigenvectors without {G}ram-{S}chmidt}, (1997). \newblock draft. \bibitem{dunigan96a} {\sc J.~Dongarra and T.~Dunigan}, {\em Message-passing performance of various computers}, Tech. Rep. {ORNL}/{TM}-13006, Oak Ridge National Laboratory, Oak Ridge, TN, 1996. \newblock Submitted and accepted to Concurrency: Practice and Experience. \bibitem{lawn110} {\sc J.~Dongarra, S.~Hammarling, and D.~Walker}, {\em Key Concepts for Parallel Out-Of-Core LU Factorization}, Society for Industrial and Applied Mathematics, Philadelphia, PA, 1996. \newblock (Also {LAPACK} Working Note \#110). \bibitem{henry97a} {\sc J.~Dongarra, G.~Henry, and D.~Watkins}, {\em A distributed memory implementation of the nonsymmetric {QR} algorithm}, in Proceedings of the Eighth SIAM Conference on Parallel Processing for Scientific Computing, Philadelphia, PA, 1997, Society for Industrial and Applied Mathematics. \bibitem{DPRT95} {\sc J.~Dongarra, C.~Randriamaro, L.~Prylli, and B.~Tourancheau}, {\em Array redistribution in {S}ca{LAPACK} using {PVM}}, in EuroPVM users' group, Hermes, 1995. \bibitem{lawn37} {\sc J.~Dongarra and R.~van~de Geijn}, {\em Two dimensional basic linear algebra communication subprograms}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-91-138, University of Tennessee, Knoxville, TN, 1991. \newblock (Also LAPACK Working Note \#37). \bibitem{lawn43} {\sc J.~Dongarra, R.~van~de Geijn, and D.~Walker}, {\em Scalability issues in the design of a library for dense linear algebra}, Journal of Parallel and Distributed Computing, 22 (1994), pp.~523--537. \newblock (Also LAPACK Working Note \#43). \bibitem{blacs} {\sc J.~Dongarra, R.~van~de Geijn, and R.~C. 
Whaley}, {\em Two dimensional basic linear algebra communication subprograms}, in Environments and Tools for Parallel Scientific Computing, Advances in Parallel Computing, J.~Dongarra and B.~Tourancheau, eds., vol.~6, Elsevier Science Publishers B.V., 1993, pp.~31--40. \bibitem{dongarra95a} {\sc J.~Dongarra and D.~Walker}, {\em Software libraries for linear algebra computations on high performance computers}, {SIAM} Review, 37 (1995), pp.~151--180. \bibitem{lawn94} {\sc J.~Dongarra and R.~C. Whaley}, {\em A user's guide to the {BLACS} v1.1}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-95-281, University of Tennessee, Knoxville, TN, 1995. \newblock (Also LAPACK Working Note \#94). \bibitem{lawn118} {\sc J.~J. Dongarra and E.~F. D'Azevedo}, {\em The design and implementation of the parallel out-of-core {ScaLAPACK} {LU}, {QR}, and {Cholesky} factorization routines}, {D}epartment of {C}omputer {S}cience {T}echnical {R}eport CS-97-347, University of Tennessee, Knoxville, TN, 1997. \newblock (Also LAPACK Working Note 118). \bibitem{blas3alg} {\sc J.~J. Dongarra, J.~Du~Croz, I.~S. Duff, and S.~Hammarling}, {\em Algorithm 679: A set of {L}evel 3 {B}asic {L}inear {A}lgebra {S}ubprograms}, {ACM} Trans. Math. Soft., 16 (1990), pp.~18--28. \bibitem{blas3} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em A set of {L}evel 3 {B}asic {L}inear {A}lgebra {S}ubprograms}, {ACM} Trans. Math. Soft., 16 (1990), pp.~1--17. \bibitem{blas2alg} {\sc J.~J. Dongarra, J.~Du~Croz, S.~Hammarling, and R.~J. Hanson}, {\em Algorithm 656: An extended set of {FORTRAN} {B}asic {L}inear {A}lgebra {S}ubroutines}, {ACM} Trans. Math. Soft., 14 (1988), pp.~18--32. \bibitem{blas2} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em An extended set of {FORTRAN} basic linear algebra subroutines}, {ACM} Trans. Math. Soft., 14 (1988), pp.~1--17. \bibitem{Dongarra87e} {\sc J.~J. 
Dongarra and E.~Grosse}, {\em Distribution of mathematical software via electronic mail}, Communications of the {ACM}, 30 (1987), pp.~403--407. \bibitem{dongarra92a} {\sc J.~J. Dongarra, R.~van~de Geijn, and D.~W. Walker}, {\em A look at scalable dense linear algebra libraries}, in Proceedings of the Scalable High-Performance Computing Conference, IEEE, ed., IEEE Publishers, 1992, pp.~372--379. \bibitem{lapwn27} {\sc J.~Du~Croz and N.~J. Higham}, {\em Stability of methods for matrix inversion}, {IMA} J. Numer. Anal., 12 (1992), pp.~1--19. \newblock (Also LAPACK Working Note \#27). \bibitem{falgout93a} {\sc R.~Falgout, A.~Skjellum, S.~Smith, and C.~Still}, {\em {The Multicomputer Toolbox Approach to Concurrent BLAS and LACS}}, in Proceedings of the Scalable High Performance Computing Conference SHPCC-92, {IEEE} Computer Society Press, 1992. \bibitem{mpi} {\sc {Message Passing Interface Forum}}, {\em {MPI}: A message passing interface standard}, International Journal of Supercomputer Applications and High Performance Computing, 8 (1994), pp.~3--4. \newblock Special issue on {MPI}. Also available electronically; the URL is {\tt ftp://www.netlib.org/mpi/mpi-report.ps}. \bibitem{fox88ab} {\sc G.~Fox, M.~Johnson, G.~Lyzenga, S.~Otto, J.~Salmon, and D.~Walker}, {\em Solving Problems on Concurrent Processors, Volume 1}, Prentice-Hall, Englewood Cliffs, NJ, 1988. \bibitem{fox94wm} {\sc G.~Fox, R.~Williams, and P.~Messina}, {\em Parallel Computing Works!}, Morgan Kaufmann Publishers, Inc., San Francisco, CA, 1994. \bibitem{FP:92} {\sc T.~L. Freeman and C.~Phillips}, {\em Parallel Numerical Algorithms}, Prentice-Hall, Hemel Hempstead, Hertfordshire, UK, 1992. \bibitem{pvm} {\sc A.~Geist, A.~Beguelin, J.~Dongarra, W.~Jiang, R.~Manchek, and V.~Sunderam}, {\em {PVM}: Parallel Virtual Machine. A Users' Guide and Tutorial for Networked Parallel Computing}, {MIT} Press, Cambridge, MA, 1994.
\bibitem{geist88a} {\sc G.~Geist and C.~Romine}, {\em {LU} factorization algorithms on distributed memory multiprocessor architectures}, SIAM J. Sci. Stat. Comput., 9 (1988), pp.~639--649. \bibitem{golub89a} {\sc G.~Golub and C.~Van~Loan}, {\em {Matrix Computations}}, Johns Hopkins University Press, Baltimore, MD, second~ed., 1989. \bibitem{GVL2} {\sc G.~Golub and C.~F. Van~Loan}, {\em Matrix Computations}, Johns Hopkins University Press, Baltimore, MD, third~ed., 1996. \bibitem{hager} {\sc W.~W. Hager}, {\em Condition estimators}, {SIAM} J. Sci. Stat. Comput., 5 (1984), pp.~311--316. \bibitem{hammarling86} {\sc S.~Hammarling}, {\em The numerical solution of the general {G}auss-{M}arkov linear model}, in Mathematics in Signal Processing, T.~S. Durani {\em et~al.}, eds., Clarendon Press, Oxford, UK, 1986. \bibitem{blas0} {\sc R.~Hanson, F.~Krogh, and C.~Lawson}, {\em A proposal for standard linear algebra subprograms}, {ACM SIGNUM} Newsl., 8 (1973). \bibitem{hatcher91a} {\sc P.~Hatcher and M.~Quinn}, {\em {Data-Parallel Programming On MIMD Computers}}, The MIT Press, Cambridge, Massachusetts, 1991. \bibitem{hendrickson94a} {\sc B.~Hendrickson and D.~Womble}, {\em The torus-wrap mapping for dense matrix calculations on massively parallel computers}, {SIAM} J. Sci. Stat. Comput., 15 (1994), pp.~1201--1226. \bibitem{gregthesis} {\sc G.~Henry}, {\em Improving Data Re-Use in Eigenvalue-Related Computations}, PhD thesis, Cornell University, Ithaca, NY, January 1994. \bibitem{lawn79} {\sc G.~Henry and R.~van~de Geijn}, {\em Parallelizing the {QR} algorithm for the unsymmetric algebraic eigenvalue problem: {Myths} and reality}, SIAM J. Sci. Comput., 17 (1996), pp.~870--883. \newblock (Also LAPACK Working Note \#79). \bibitem{lawn121} {\sc G.~Henry, D.~Watkins, and J.~Dongarra}, {\em A parallel implementation of the nonsymmetric {QR} algorithm for distributed memory architectures}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport CS-97-352, University of Tennessee, Knoxville, TN, March 1997.
\newblock (Also LAPACK Working Note \#121). \bibitem{higham1} {\sc N.~J. Higham}, {\em A survey of condition number estimation for triangular matrices}, {SIAM} Review, 29 (1987), pp.~575--596. \bibitem{nick2} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em {FORTRAN} codes for estimating the one-norm of a real or complex matrix, with applications to condition estimation}, {ACM} Trans. Math. Softw., 14 (1988), pp.~381--396. \bibitem{higham90} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Experience with a matrix norm estimator}, {SIAM} J. Sci. Stat. Comput., 11 (1990), pp.~804--809. \bibitem{higham93} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Perturbation theory and backward error for {$AX-XB=C$}}, {BIT}, 33 (1993), pp.~124--136. \bibitem{higham96} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Accuracy and Stability of Numerical Algorithms}, Society for Industrial and Applied Mathematics, Philadelphia, PA, 1996. \bibitem{huss94a} {\sc S.~Huss-Lederman, E.~Jacobson, A.~Tsao, and G.~Zhang}, {\em {Matrix Multiplication on the Intel Touchstone DELTA}}, Concurrency: Practice and Experience, 6 (1994), pp.~571--594. \bibitem{huss93a} {\sc S.~Huss-Lederman, A.~Tsao, and G.~Zhang}, {\em {A parallel implementation of the invariant subspace decomposition algorithm for dense symmetric matrices}}, in Proceedings of the Sixth SIAM Conference on Parallel Processing for Scientific Computing, SIAM, 1993, pp.~367--374. \bibitem{hwang93a} {\sc K.~Hwang}, {\em {Advanced Computer Architecture: Parallelism, Scalability, Programmability}}, McGraw-Hill, 1993. \bibitem{ibm1} {\sc {IBM Corporation}}, {\em {IBM} {RS6000}}, 1996. \newblock {(URL = {\tt http://www.rs6000.ibm.com/})}. \bibitem{intel} {\sc {Intel Corporation}}, {\em {Intel Supercomputer Technical Publications Home Page}}, 1995. \newblock {(URL = {\tt http://www.ssd.intel.com/pubs.html})}. \bibitem{kagstrom95b} {\sc B.~K{\aa}gstr\"{o}m, P.~Ling, and C.~Van
Loan}, {\em {GEMM}-based level 3 {BLAS}: High-performance model implementations and performance evaluation benchmark}, Tech. Rep. UMINF 95-18, Department of Computing Science, Ume{\aa} University, 1995. \newblock Submitted to ACM Trans. Math. Softw. \bibitem{hpf} {\sc C.~Koebel, D.~Loveman, R.~Schreiber, G.~Steele, and M.~Zosel}, {\em The High Performance Fortran Handbook}, {MIT} Press, Cambridge, Massachusetts, 1994. \bibitem{kumar94a} {\sc V.~Kumar, A.~Grama, A.~Gupta, and G.~Karypis}, {\em {Introduction to Parallel Computing -- Design and Analysis of Algorithms}}, The Benjamin/Cummings Publishing Company, Inc., Redwood City, CA, 1994. \bibitem{blas1} {\sc C.~L. Lawson, R.~J. Hanson, D.~Kincaid, and F.~T. Krogh}, {\em Basic linear algebra subprograms for {F}ortran usage}, {ACM} Trans. Math. Soft., 5 (1979), pp.~308--323. \bibitem{lawn72} {\sc R.~Lehoucq}, {\em The computation of elementary unitary matrices}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport {CS}-94-233, University of Tennessee, Knoxville, TN, 1994. \newblock (Also LAPACK Working Note 72). \bibitem{lewis92} {\sc T.~Lewis and H.~El-Rewini}, {\em {Introduction to Parallel Computing}}, Prentice-Hall, Inc., Englewood Cliffs, NJ, 1992. \bibitem{xiaoyelithesis} {\sc X.~Li}, {\em Sparse {G}aussian Elimination on High Performance Computers}, PhD thesis, Computer Science Division, Department of Electrical Engineering and Computer Science, University of California, Berkeley, CA, September 1996. \bibitem{lichtenstein93a} {\sc W.~Lichtenstein and S.~L. Johnsson}, {\em Block-cyclic dense linear algebra}, {SIAM} J. Sci. Stat. Comput., 14 (1993), pp.~1259--1288. \bibitem{am2} {\sc A.~Mainwaring and D.~E. Culler}, {\em Active message applications programming interface and communication subsystem organization}, Tech. Rep. UCB CSD-96-918, University of California at Berkeley, Berkeley, CA, October 1996. 
\bibitem{pacheco97} {\sc P.~Pacheco}, {\em Parallel Programming with {MPI}}, Morgan Kaufmann Publishers, Inc., San Francisco, CA, 1997. \bibitem{paige90} {\sc C.~Paige}, {\em Some aspects of generalized {QR} factorization}, in Reliable Numerical Computations, M.~Cox and S.~Hammarling, eds., Clarendon Press, 1990. \bibitem{parl80} {\sc B.~Parlett}, {\em The Symmetric Eigenvalue Problem}, Prentice-Hall, Englewood Cliffs, NJ, 1980. \bibitem{parlett96} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em The construction of orthogonal eigenvectors for tight clusters by use of submatrices}, Center for Pure and Applied Mathematics PAM-664, University of California, Berkeley, CA, January 1996. \newblock Submitted to {SIAM} J. Matrix Anal. Appl. \bibitem{parlettdhillon96} {\sc B.~Parlett and I.~Dhillon}, {\em On {F}ernando's method to find the most redundant equation in a tridiagonal system}, Linear Algebra and Its Applications, (1996). \newblock to appear. \bibitem{petitet96a} {\sc A.~Petitet}, {\em Algorithmic Redistribution Methods for Block Cyclic Decompositions}, PhD thesis, University of Tennessee, Knoxville, TN, 1996. \bibitem{Toolpack} {\sc A.~A. Pollicini}, ed., {\em Using Toolpack Software Tools}, 1989. \bibitem{PT96a} {\sc L.~Prylli and B.~Tourancheau}, {\em Efficient block cyclic data redistribution}, in EUROPAR'96, vol.~1 of Lecture Notes in Computer Science, Springer-Verlag, 1996, pp.~155--165. \bibitem{PT97} \leavevmode\vrule height 2pt depth -1.6pt width 23pt, {\em Efficient block cyclic array redistribution}, Journal of Parallel and Distributed Computing, (1997). \newblock To appear. \bibitem{schreiber87a} {\sc R.~Schreiber and C.~F. Van~Loan}, {\em A storage efficient {WY} representation for products of {H}ouseholder transformations}, {SIAM} J. Sci. Stat. Comput., 10 (1989), pp.~53--57. \bibitem{ugpetsc} {\sc B.~Smith, W.~Gropp, and L.~Curfman~McInnes}, {\em {PETSc} 2.0 users manual}, {T}echnical {R}eport {ANL}-95/11, Argonne National Laboratory, Argonne, IL, 1995.
\newblock (Available by anonymous ftp from {\tt ftp.mcs.anl.gov}). \bibitem{SOHWD:96} {\sc M.~Snir, S.~W. Otto, S.~Huss-Lederman, D.~W. Walker, and J.~J. Dongarra}, {\em {MPI}: The Complete Reference}, {MIT} Press, Cambridge, MA, 1996. \bibitem{SunSoft:XDR} {\sc SunSoft}, {\em The XDR Protocol Specification. Appendix A of ``Network Interfaces Programmer's Guide''}, SunSoft, 1993. \bibitem{velde94a} {\sc E.~van~de Velde}, {\em {Concurrent Scientific Computing}}, no.~16 in Texts in Applied Mathematics, Springer-Verlag, 1994. \bibitem{lawn73} {\sc R.~C. Whaley}, {\em Basic linear algebra communication subprograms: Analysis and implementation across multiple parallel architectures}, {C}omputer {S}cience {D}ept. {T}echnical {R}eport {CS}-94-234, University of Tennessee, Knoxville, TN, May 1994. \newblock (Also LAPACK Working Note \#73). \bibitem{wilkinson1} {\sc J.~H. Wilkinson}, {\em The Algebraic Eigenvalue Problem}, Oxford University Press, Oxford, UK, 1965. \end{thebibliography}