
      SUBROUTINE PDSDC( WANTT, WANTZ, N, A, IA, JA, DESCA,
     $                  Z, IZ, JZ, DESCZ, WR, WI,
     $                  WORK, LWORK, IWORK,
     $                  LIWORK, INFO )
*
*  -- ScaLAPACK auxiliary routine (version 1.3 ALPHA) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*
*     .. Scalar Arguments ..
      LOGICAL            WANTT, WANTZ
      INTEGER            N, IA, JA, IZ, JZ, LWORK, LIWORK, INFO
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * ), DESCZ( * ), IWORK( * )
      DOUBLE PRECISION   A( * ), WI( * ), WORK( * ), WR( * ), Z( * )
*     ..
*
*  Purpose
*  =======
*
*  PDSDC computes the Schur decomposition and or eigenvalues of a 
*  non-symmetric matrix. It uses the matrix sign function to
*  "(S)pectrally (D)ivide and (C)onquer" the spectrum.
*
*
*  Implementation details
*  =======================
*  This routine uses a dynamic (T)ask (L)ist to keep track of the divide and
*  conquer process. It is implemented in C and makes use of C's dynamic 
*  memory management routines (malloc, free). All its routines are prefixed
*  with the characters "TL_". Please see "tl.c" for  further details.
*
*  The algorithm operates in data-parallel mode until it becomes cheaper
*  to compute in task-parallel mode. The finction "DOSWITCH" indicates
*  when this threshold is reached and should ideally be fine tuned for
*  each architecture. It is implemented in "decide.c".
*
*  Current Restrictions
*  =====================
*
*  (1) IA must equal JA (i.e., input matrices must be diagonal submatrices);
*      This restriction will be removed in the next release.
*
*  Notes
*  =====
*
*  Please see the accompanying README file for a basic description of
*  how SDC works. More detailed information can be found in the 
*  lapack working note, or at "http://www.cs.berkeley.edu/~hbr/sdc".
*
*  Since this is an iterative process, we cannot know in advance 
*  exactly how much memory will be required. Thus a call with LWORK == -1
*  returns a reasonable (if imprecise) estimate. 
*  Since the ammount of memory available can influence the time to switch
*  from data parallel to task aprallel mode, try to supply more than
*  the estimated minimum.
*
*  A description vector is associated with each 2D block-cyclicly dis-
*  tributed matrix.  This vector stores the information required to
*  establish the mapping between a matrix entry and its corresponding
*  process and memory location.
*
*  In the following comments, the character _ should be read as
*  "of the distributed matrix".  Let A be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESCA:
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DT_A   (global) DESCA( DT_ )   The descriptor type.
*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) DESCA( M_ )    The number of rows in the distributed
*                                 matrix A.
*  N_A    (global) DESCA( N_ )    The number of columns in the distri-
*                                 buted matrix A.
*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
*                                 the rest M_A-IMB_A rows of A.
*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
*                                 the rest N_A-INB_A columns of A.
*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
*                                 row of the matrix A is distributed.
*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
*                                 first column of A is distributed.
*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
*                                 array storing the local blocks of the
*                                 distributed matrix A.
*                                 LLD_A >= MAX(1,LOCp(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCp( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCq( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCp() and LOCq() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*
*  Arguments
*  =========
*
*  WANTT   (global input) LOGICAL
*          = .TRUE. : the full Schur form T is required;
*          = .FALSE.: only eigenvalues are required.
*
*  WANTZ   (global input) LOGICAL
*          = .TRUE. : the matrix of Schur vectors Z is required;
*          = .FALSE.: Schur vectors are not required.
*
*  N       (global input) INTEGER
*          The order of the matrix A (and Z if WANTZ).
*          N >= 0.
*
*  A       (global input/output) DOUBLE PRECISION array, dimension
*          (DESCA(LLD_),*)
*          On entry, the distributed matrix to be solved.
*          On exit, if WANTT is .TRUE., A(IA:IA+N-1, JA:JA+N-1)
*          is upper quasi-triangular with 1-by-1 and 2-by-2 diagonal
*          blocks in standard form. If WANTT is .FALSE., the
*          contents of A are unspecified on exit.
*
*  IA      (global input) INTEGER
*          A's global row index, which points to the beginning of the
*          submatrix which is to be operated on.
*
*  JA      (global input) INTEGER
*          A's global column index, which points to the beginning of
*          the submatrix which is to be operated on.
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
*          The array descriptor for the distributed matrix A.
*
*  Z       (global output) DOUBLE PRECISION array.
*          If WANTZ is .TRUE., Z will contain the computed Schur vectors.
*          If WANTZ is .FALSE., Z is not referenced.
*
*  IZ      (global input) INTEGER
*          Z's global row index, which points to the beginning of the
*          submatrix which is to be operated on.
*
*  JZ      (global input) INTEGER
*          Z's global column index, which points to the beginning of
*          the submatrix which is to be operated on.
*
*  DESCZ   (global and local input) INTEGER array of dimension DLEN_.
*          The array descriptor for the distributed matrix Z.
*
*
*  WR      (global replicated output) DOUBLE PRECISION array,
*                                                         dimension (N)
*  WI      (global replicated output) DOUBLE PRECISION array,
*                                                         dimension (N)
*          The real and imaginary parts, respectively, of the N computed
*          eigenvalues are stored in  WR and WI. If two eigenvalues 
*          are computed as a complex conjugate pair, they are stored in 
*          consecutive elements of WR and WI, say the i-th and (i+1)th, with
*          WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the
*          eigenvalues are stored in the same order as on the diagonal
*          of the Schur form returned in A. 
*
*  WORK    (local workspace/output) DOUBLE PRECISION array of size LWORK
*          On successful exit:
*             WORK( 1 ) contains the minimal LWORK;
*             WORK( 2 ) is an estimate of the error accumulated in the 
*             divide and conquer process;
*             WORK( 3 ) is an estimate of the number of flops performed.
*             WORK( 4 ) is the total number of splits performed.
*
*  LWORK   (local input) INTEGER
*          WORK(LWORK) is a local array and LWORK is assumed big enough
*          so that LWORK >= 3*N +
*                MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCq(N),
*                     7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) )
*
*  IWORK   (global and local input) INTEGER array of size LIWORK
*
*  LIWORK  (local input) INTEGER
*
*  INFO    (global output) INTEGER
*          < 0: parameter number -INFO incorrect or inconsistent
*          = 0: successful exit
*          = 1: Too many attempts at deflating the matrix
*          = 2: Failure solving with DGEEV or DGEES
*          = 3: Problem splitting the matrix with PDHALFP
*          Please see below for details on how information is
*          returned in the cases of failure.
*
*  =====================================================================
*  Further Details
*  ===============
*
*  When there is a failure in the process, on exit:
*           IWORK( 2 ) == Number of unsolved blocks
*        and
*           IWORK( 3 + (I - 1) * 2 ) == Low index
*           IWORK( 3 + (I - 1) * 2 + 1 ) == High index
*           WORK( 4 + (I - 1) * 2 ) == lower bound on real part
*           WORK( 4 + (I - 1) * 2 + 1 ) == upper bound on real part
*        for the I'th largest block (blocks are arranged in decreasing
*        size).
*
*  =====================================================================
*
*  Working Note:
*  ============
*
*  This routine is still in the prototype stage and much of the debugging
*  code has been left in. We have tried to isolate these blocks of code
*  and identify them wherever possible.
*
*  =====================================================================

*     .. Parameters ..
      INTEGER            CSRC_, CTXT_, DLEN_, DT_,
     $                   LLD_, MB_, M_, NB_, N_, RSRC_
      INTEGER            NORMAL, NOSPLIT, NOCONV
      PARAMETER          ( DLEN_ = 9, DT_ = 1,
     $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
      PARAMETER          ( NORMAL = 1, NOSPLIT = 2, NOCONV = 3 )
      DOUBLE PRECISION   ZERO, ONE
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
*     ..
*     .. Local Scalars ..
      LOGICAL            LDUMMY
      INTEGER            IDUMMY, PAR_LIST, LOCAL_LIST, ILO2, IHI2,
     $                   N2, INFO2, NMYTASKS, NTASKS, IAM, IA2, JA2,
     $                   NPROCS, I, J, COUNT, MINPROC, ICTXT, LWMIN,
     $                   IPMAT, IPW, NPCOL, NPROW, MYCOL, MYROW,
     $                   IRESULT, MAXITER, NITER, ITRY, MAXTRY,
     $                   IROWDEST, ICOLDEST, IROWSRC, ICOLSRC, NSPLITS,
     $                   MINWORK, IFACTOR, STATE, IROFF, ICOFF, IPQT, 
     $                   IZ2, JZ2, MP, NQ, IQ, JQ, IPQ, IZ2ROW, JZ2COL
      DOUBLE PRECISION   X, LOW, HIGH, DUMMY, INFINITY, MINLOAD, 
     $                   ERREST, FLOPS, ERREST2, TRACEN2, 
     $                   GERSHBD, GERSHTOL
*     ..
*     .. Local Arrays ..
      INTEGER            DESCQ( DLEN_ )
*     ..
*     .. External Functions ..
      INTEGER            NUMROC, TL_INIT, TL_CREATE, TL_ADD, DOSWITCH,
     $                   TL_SIZE, TL_GET, TL_GET2, TL_FREE, TL_DESTROY,
     $                   INDXG2P
      DOUBLE PRECISION   PDLAMCH, PDLATRA
      EXTERNAL           NUMROC, PDLAMCH, PDLATRA, TL_INIT, TL_CREATE,
     $                   TL_ADD, DOSWITCH, TL_SIZE, TL_GET, TL_GET2,
     $                   TL_FREE, TL_DESTROY, INDXG2P
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDINFO, PDHALFP, DGEEV, DGEES,
     $                   IGAMN2D, INFOG1L, INFOG2L, PDLACONSB, PDLABAD,
     $                   PDLACP3, PXERBLA, BLACS_BARRIER, PDSDCUPD,
     $                   PDGEMM0, IGSUM2D, DGSUM2D, PDGERSH, PDLACPY
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          ABS, MAX, MIN, MOD, DBLE
***
***   Debugging variables/functions/subroutines
***
      logical            debug, printit
      double precision   dummy2, pdlange, dlange
      external           pdlange, dlange, dgemm, print_desc, 
     $                   pdsdcupd2

*     ..
*     .. Executable Statements ..
*
*     Get grid parameters
*
      ICTXT = DESCA( CTXT_ )
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
      NPROCS = NPROW * NPCOL
      IAM = MYROW * NPCOL + MYCOL
***
***   Debugging - intialization
***
      debug = .false.
      printit = (debug .and. myrow.eq.0 .and. mycol.eq.0)
*
*     Check input parameters
*     TODO: add better input checking once inner routines have settled

      INFO = 0
      FLOPS = ZERO
      NSPLITS = 0
      INFO = 0
      IF( NPROW .EQ. -1 ) THEN
         INFO = -(500+CTXT_)
      ELSE
         CALL CHK1MAT( N, 1, N, 1, IA, JA, DESCA, 7, INFO )
         IF ( INFO .EQ. 0 ) THEN
            IF ( WANTZ .AND. ( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) ) THEN
               INFO = -( 1300+CTXT_ )
            END IF
            IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN
               INFO = -( 700+NB_ )
            END IF
            IF( WANTZ .AND. (DESCZ( MB_ ).NE.DESCZ( NB_ )) ) THEN
               INFO = -( 1300+NB_ )
            END IF
            IF( WANTZ .AND. DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN
               INFO = -( 1300+MB_ )
            END IF
            CALL IGAMN2D( ICTXT, 'ALL', ' ', 1, 1, INFO, 1, -1, -1,
     $              -1, -1, -1 )
         END IF
      END IF
      
      IF ( INFO .GE. 0 ) THEN
*
*
*        Note that since this is an iterative process, we cannot
*        know in advance exactly how much memory will be required.
*
*        A reasonable guess is 1.5 times the memory necessary to perform 
*        the first split.
*
*        Call with LWORK == -1 to get workspace requirements
*
         CALL PDHALFP( WANTT, WANTZ, N, A, IA, JA, 1, N,
     $                 DESCA, Z, IZ, JZ, DESCZ, ZERO, .TRUE., 20,
     $                 'NONE', COUNT, ERREST2, NITER, WORK, -1,
     $                 IWORK, LIWORK, INFO2 )
         if (printit) print *, 'PDSDC: Min LWORK = ', anint(work(1))
         WORK( 1 ) = WORK( 1 ) * 1.5
         IF ( LWORK .EQ. -1 ) THEN
            RETURN
         ELSE IF ( LWORK .LT. WORK( 1 ) ) THEN
            INFO = -15
         END IF
         LWMIN = WORK( 1 )
      END IF

      IF ( INFO.LT.0 ) THEN
         CALL PXERBLA( ICTXT, 'PDSDC', -INFO )
         RETURN
      END IF

      ERREST = ZERO
*
*     Check for trivial problems
*
      IF( N .EQ. 0 .OR. N .EQ. 1 )
     $   RETURN
*
*     Initialize
*
      MAXITER = 25
      MAXTRY = 10
      STATE = NORMAL
      IFACTOR = 0
      INFINITY = PDLAMCH( ICTXT, 'OVERFLOW' )
      GERSHTOL = DBLE(N) * PDLAMCH( ICTXT, 'PRECISION' )

      IF ( WANTZ ) THEN
*
*        Initialize matrix Z = Identity
*
         CALL PDLASET( 'All', N, N, ZERO, ONE, Z, IZ, JZ, DESCZ )
      END IF
*
*     Initialize task list routines and create a replicated Task List on 
*     each processor.
*     Add the root task.
*
      IDUMMY = 2
      IRESULT = TL_INIT( IDUMMY )
      PAR_LIST = TL_CREATE( N )
      IRESULT = TL_ADD( PAR_LIST, 1, N, -INFINITY, INFINITY )

*
*     Data parallel phase. 
*
      ITRY = 0

100   IF ( TL_SIZE( PAR_LIST ) .EQ. 0 ) 
     $   GOTO 200
*
*     Function 'DOSWITCH' returns TRUE when it becomes cheaper to 
*     solve in task-parallel mode.
*
      IF ( DOSWITCH( PAR_LIST, NPROCS ) .EQ. 1 ) THEN
*
*        Its cheaper to operate in task-parallel mode. Make sure we
*        have enough workspace.
*        TODO: make this more accurate
*
         IRESULT = TL_GET2( PAR_LIST, 1, ILO2, IHI2, LOW, HIGH )
         N2 = IHI2 - ILO2 + 1
         MINWORK = N2 * N2 + 6 * N2
         IF ( WANTT .OR. WANTZ ) THEN
            MINWORK = MINWORK + 4 * N2 * N2
            if (debug) minwork = minwork + n2 * n2
         END IF
         IF ( LWORK .GE. MINWORK ) THEN
            if (printit) print *, 'Switching...'
            GOTO 200
         END IF
      END IF

*
*     Continue in data-parallel mode;
*     Get next task
*
      IRESULT = TL_GET( PAR_LIST, ILO2, IHI2, LOW, HIGH )
*
*     Working with the sub-matrix A(IA+ILO2-1:IA+IHI2-1, JA+ILO2-1:JA+IHI2-1)
*
      IA2 = IA + ILO2 - 1
      JA2 = JA + ILO2 - 1
      N2 = IHI2 - ILO2 + 1
*
*     The switching criteria should kick in long before this does!
*
      IF ( N2 .EQ. 1 ) THEN
         if (printit) print *, '===========>>>>>> got leaf node'
         GOTO 100
      END IF
***
***   Debugging - print current task
***
      if (printit) then
         print *, 'Task:'
         print *, '  ILOW =', ILO2, ' IHIGH =', IHI2
         print *, '  LOW =', LOW, ' HIGH =', HIGH
      end if
*
*     Compute the splitting point. Initially try the trace of the matrix
*     divided by N (the average eigenvalue).
*
150   IF ( STATE .EQ. NORMAL ) THEN
*
*        Normal processing
*
         TRACEN2 = PDLATRA( N2, A, IA2, JA2, DESCA ) / DBLE( N2 )
         X = TRACEN2
      ELSE IF ( STATE .EQ. NOSPLIT ) THEN
*
*        We tried to split here before and all the eigenvalues
*        were to the left or right of the splitting point. The
*        bounds on that intrval have been updated with that information.
*        Split at the average point.
*
         X = ( LOW + TRACEN2 + HIGH ) / 3.0D+0
      ELSE IF ( ITRY .LT. MAXTRY ) THEN
*
*        We tried to split here before but failed (i.e. STATE == NOCONV).
*        The interval has not been updated.
*        Try a slightly different point.
*
         IF ( ITRY .EQ. 1 ) THEN
            IFACTOR = 0 
*
*           First failure in this specific interval.
*           Compute the Gershgorin disks to detect multiples of the
*           identity matrix.
*
            CALL PDGERSH( 'REAL', N2, A, IA2, JA2, DESCA, GERSHBD, 
     $                    DUMMY, WORK )
            if (printit) print *, 'Gershgorin bound: ', GERSHBD
            if (printit) print *, 'Gershgorin tolerance: ', GERSHTOL
            IF ( GERSHBD .LT. GERSHTOL ) THEN
*
*              We have a multiple of the identity. Done with this interval.
*
               if (printit) print *, 'Multiple of the Identity.'
               STATE = NORMAL
               ITRY = 0
               GOTO 100
            ELSE
*
*              Use Gershgorin to narrow interval
*
               IF ( HIGH .GT. GERSHBD )
     $            HIGH = GERSHBD
               IF ( LOW .LT. -GERSHBD )
     $            LOW = -GERSHBD
            END IF
         END IF
*
*        Compute a new splitting point.
*        Use the formula:
*             Trace/N + (upper_bound - lower_bound)/(maxtry+1) * IFACTOR
*             where IFACTOR = { 1, -1, 2, -2, ... }
*        (the sequence of points produced by this formula will have at 
*         least MAXTRY points in the interval.)
*        If we compute a point outside the interval, move on to the next
*        in the sequence.
*
 175     IF ( IFACTOR .GT. 0 ) THEN
            IFACTOR = -IFACTOR
         ELSE
            IFACTOR = -IFACTOR + 1
         END IF

         X = TRACEN2 + ( HIGH - LOW )/DBLE( MAXTRY + 1 ) * 
     $                 DBLE( IFACTOR )
         IF ( ( X .LE. LOW ) .OR. ( X .GE. HIGH ) ) THEN
            if (printit) print *, 'Split point ', x, ' out of bounds',
     $                            low, high
            GOTO 175
         END IF

      ELSE
*
*        Failure to converge after MAXTRY tries.
*
*        ** NB ** A future version of this routine will attempt to split
*        in a different way at this point. (And so will detect 
*        pure imaginary eigenvalues.)
*
*        Return split information to the user (see RECBLCKS for details).
*
         if (printit) print *, 'PDSDC: Failed to converge.'

         CALL RECBLCKS( PAR_LIST, IWORK, WORK )

         IRESULT = TL_DESTROY( PAR_LIST )
         IRESULT = TL_FREE()
         INFO = 1
         GOTO 1000
      END IF

***
***   Debugging check
***
      if (printit) print *, '   Split at X = ', X
      IF (X .LE. LOW .OR. X .GE. HIGH) THEN
         if (printit) 
     $      print *, '********* Split point out of bounds!! *********'
         X = ( LOW + HIGH ) / 2.0D+0
      END IF
*
*     Split the spectrum at the line Re(x) = X 
*
      ERREST2 = ZERO
      CALL PDHALFP( WANTT, WANTZ, N, A, IA, JA, ILO2, N2,
     $              DESCA, Z, IZ, JZ, DESCZ, X, .TRUE., MAXITER,
     $              'NONE', COUNT, ERREST2, NITER, WORK, LWORK,
     $              IWORK, LIWORK, INFO2 )
      NSPLITS = NSPLITS + 1
*
*     This flop count could be more precise.
*
      FLOPS = FLOPS + ( 2.0D+0 * DBLE( NITER ) + 20.0D+0/3.0D+0 ) *
     $        DBLE( N2 )**3
      IF ( WANTZ .AND. INFO2 .EQ. 0 ) 
     $   FLOPS = FLOPS + 2 * DBLE( N2 )**3
      IF ( WANTT .AND. INFO2 .EQ. 0 ) 
     $   FLOPS = FLOPS + 2 * DBLE( N2 )**3

      IF ( INFO2 .NE. 0 ) THEN
*
*        Trouble in PDHALFP. Exit.
*
         IF ( INFO2 .NE. 1 ) THEN
            CALL RECBLCKS( PAR_LIST, IWORK, WORK )
            IRESULT = TL_DESTROY( PAR_LIST )
            IRESULT = TL_FREE()
            INFO = 3
            GOTO 1000
         END IF
*
*        Failure at this place. Try again at another location.
*
         STATE = NOCONV
         ITRY = ITRY + 1
         if (printit) print *, 'Failure in PDHALFP. Re-trying task.'
         GOTO 150
      END IF
*
*     PDHALFP returns an error estimate for its computations. Take the
*     max of these as the estimate for the entire process.
*
      IF ( ERREST2 .GT. ERREST )
     $   ERREST = ERREST2

      if (printit) print *, ' Number of eigenvalues in LHP = ', count
*
*     For a trivial split, re-try the task. Will split somewhere
*     else next time. Note that this is not consideded a failure
*     since we got some information and will use that to narrow 
*     the interval.
*     Note: This is an _extremely_ rare situation when the split is
*     performed at TRACE/N (the average eigenvalue). Probably
*     due to rounding/arithmetic errors.
*
      IF ( COUNT .EQ. 0 .OR. COUNT .EQ. N2 ) THEN
         STATE = NOSPLIT
         IF ( COUNT .EQ. 0 ) THEN
            HIGH = X
         ELSE
            LOW = X
         END IF
         if (printit) print *, 'PDSDC: Re-trying NOSPLIT task.'
         ITRY = 0
         GOTO 150
      END IF
*
*     Matrix has split into 2x2 block upper triangular form.
*     Submit the (1,1) and (2,2) blocks as new tasks to the list.
*
      ITRY = 0
      STATE = NORMAL
      IRESULT = TL_ADD( PAR_LIST, ILO2, ILO2 + COUNT - 1,  X, HIGH )
      IRESULT = TL_ADD( PAR_LIST, ILO2 + COUNT, IHI2, LOW, X )
*
*     Continue the splitting process
*
      GOTO 100
*
*     End of data parallel phase, start of task parallel.
*

 200  NMYTASKS = 0
      NTASKS = TL_SIZE( PAR_LIST )
      DO 250 I = 1, N
         WR( I ) = ZERO
         WI( I ) = ZERO
 250  CONTINUE

*	return
*
*     Go through all tasks. For each one, pick the processor with 
*     the least ammount of work and assign it the current task.
*     Note that this is a greedy approach.
*
      DO 300 I = 1, NPROCS
         WORK( I )  = ZERO
 300  CONTINUE

      DO 400 I = 1, NTASKS
         MINLOAD = INFINITY
*
*        WORK( 1:NPROCS ) records the 'load' for each processor (temporary);
*        IWORK( 1:NTASKS ) records the task-to-processor assignments.
*
         DO 350 J = 1, NPROCS
            IF ( WORK( J ) .LT. MINLOAD ) THEN
               MINLOAD = WORK( J )
               MINPROC = J - 1
            END IF
 350     CONTINUE

         IWORK( I ) = MINPROC

         IRESULT = TL_GET2( PAR_LIST, I, ILO2, IHI2, LOW, HIGH )
         N2 = IHI2 - ILO2 + 1
         WORK( MINPROC + 1 ) = WORK( MINPROC + 1 ) + N2 * N2 * N2

         IF ( IAM .EQ. MINPROC ) THEN
            NMYTASKS = NMYTASKS + 1
         END IF
*
*        TODO: Find a proper estimate of flops for DGEEV/S
*
         FLOPS = FLOPS + DBLE( N2 )**3
 400  CONTINUE

*
*     Create local task list on each processor. Copy each diagonal 
*     block of A to the processor that it has been assigned to.
*
*     Note: We use the local task list in a silghtly different capacity.
*     The 5th parameter to TL_GET2 (and the 4th to TL_GET) will now record
*     the index into the WORK array corresponding to the matrix.
*
      LOCAL_LIST = TL_CREATE( MAX( 1, NMYTASKS ) )
      IPMAT = 1
      DO 450 I = 1, NTASKS
         IRESULT = TL_GET2( PAR_LIST, I, ILO2, IHI2, LOW, HIGH )
         N2 = IHI2 - ILO2 + 1
         IA2 = IA + ILO2 - 1
         JA2 = JA + ILO2 - 1
         ICOLDEST = MOD( IWORK( I ), NPCOL )
         IROWDEST = IWORK( I ) / NPCOL
*
*        To use PDLACP3, IA must equal JA. This is the cause of the 
*        current restriction on IA and JA.
*        Should write a PDLACP4, or use PDGEMR2D
*
         if (debug) 
     $      dummy = pdlange('1', N2, N2, a, ia2, ia2, desca,work(ipmat))
         CALL PDLACP3( N2, IA2, A, DESCA, WORK( IPMAT ), N2, 
     $                 IROWDEST, ICOLDEST, 0 )
         IF ( IAM .EQ. IWORK( I ) ) THEN
            if (debug) 
     $         dummy2 = dlange('1',N2,N2,work(ipmat),N2,
     $                         work(ipmat+n2*n2))
            IF ( IAM .EQ. IWORK( I ) .and. debug .and.
     $           abs(dummy-dummy2) .gt. 1.0D-12)
     $         print *, 'Bad copy of A ORIGINAL!!', iam, dummy, dummy2
            DUMMY = IPMAT
            IRESULT = TL_ADD( LOCAL_LIST, ILO2, IHI2, DUMMY, DUMMY )
            IF ( WANTT .OR. WANTZ ) THEN
               IPMAT = IPMAT + 2 * N2 * N2
            ELSE
               IPMAT = IPMAT + N2 * N2
            END IF
         END IF
 450  CONTINUE
*
      IPW = IPMAT
***
***   Debugging: Print out local assignments
***
      if (debug) then
         DO 470 I = 1, NPROCS
            IF ( IAM .EQ. I-1 ) THEN
               print *, 'IAM =', IAM, ' Number of tasks = ', NMYTASKS
               DO 460 J = 1, NMYTASKS
                  IRESULT = TL_GET2( LOCAL_LIST, J, ILO2, IHI2, 
     $                               DUMMY, DUMMY )
                  print *, 'Task: ', J
                  print *, '  ILOW =', ILO2, ' IHIGH =', IHI2
                  print *, '  WORK index =', ANINT( DUMMY )
 460           CONTINUE
            END IF
 470     CONTINUE
      end if
*
*     Each processor solves its local tasks.
*
      INFO2 = 0
      DO 500 I = 1, NMYTASKS
         IRESULT = TL_GET( LOCAL_LIST, ILO2, IHI2, DUMMY, DUMMY )
         N2 = IHI2 - ILO2 + 1
         IPMAT = ANINT( DUMMY )
         IF ( WANTT .OR. WANTZ ) THEN
            CALL DGEES( 'Vectors', 'NoSort', LDUMMY, 
     $                  N2, WORK( IPMAT ), N2, IDUMMY, WR( ILO2 ),
     $                  WI( ILO2 ), WORK( IPMAT + N2 * N2 ), N2,
     $                  WORK( IPW ), LWORK-IPW+1, LDUMMY, INFO2 )
         ELSE
            CALL DGEEV( 'NoLeftVectors', 'NoRightVectors',
     $                  N2, WORK( IPMAT ), N2, WR( ILO2 ), WI( ILO2 ),
     $                  DUMMY, 1, DUMMY, 1, WORK( IPW ),
     $                  LWORK-IPW+1, INFO2 )
         END IF
         IF ( INFO2 .NE. 0 ) THEN
            if (debug) print *, 'Failed in DEGGV/DGEES', IAM
            GOTO 525
         END IF
 500  CONTINUE
*
*     Wait for all processors to finish and check to see if any failed 
*     in task computation.
*
 525  CALL BLACS_BARRIER( ICTXT, 'All' )
      CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, INFO2, 1, -1, -1 )
      IF ( INFO2 .NE. 0 ) THEN
*
*        [To do: Package up local unfinished tasks for return]
*
         INFO = 2
         IRESULT = TL_DESTROY( PAR_LIST )
         IRESULT = TL_DESTROY( LOCAL_LIST )
         IRESULT = TL_FREE()
         GOTO 1000
      END IF

      IF ( WANTT .OR. WANTZ ) THEN
*
*        Use the Schur vectors computed by DGEES to update
*        the Schur form and orthogonal matrix as required.
*        Copy the vectors from local to global matrix for these operations.
*
         IPMAT = 1
         DO 550 I = 1, NTASKS
            ICOLSRC = MOD( IWORK( I ), NPCOL )
            IROWSRC = IWORK( I ) / NPCOL
            IRESULT = TL_GET2( PAR_LIST, I, ILO2, IHI2, LOW, HIGH )
            N2 = IHI2 - ILO2 + 1
            IA2 = IA + ILO2 - 1
            JA2 = JA + ILO2 - 1
***
***         Debugging - compute accuracy of the orthogonal matrix 
***         for the current task.
***
            if (debug) then
               call blacs_barrier( ICTXT, 'All' )
               if (printit) print *, 'TASK: ', i
               if (printit) print *, '  ILOW =', ILO2, ' IHIGH =', IHI2,
     $                               'N = ', IHI2 - ILO2 + 1
               if (iam .eq. iwork(i)) then
                  dummy = dlange('1',N2,N2,work(ipmat+n2*n2),N2,
     $                           work(ipw))
                  print *, '|| Local Q || = ', dummy
                  call dgemm('Trans', 'No trans', N2, N2, N2,
     $                       ONE, work(ipmat+n2*n2), n2,
     $                       work(ipmat+n2*n2), n2, ZERO, 
     $                       work(ipw), n2)
                  dummy = dlange('1', N2, N2, work(ipw),n2,work(ipw))
                  print *, 'LOCAL || q^t * q || = ', dummy-one, dummy
               end if
            end if
*
*           Copy local Q to a distributed matrix.
*           Use diagonal block of A as workspace.
*
            IF ( IAM .EQ. IWORK( I ) ) THEN
               if (debug)
     $            dummy = dlange('1',N2,N2,work(ipmat+n2*n2),
     $                           n2,work(ipw))
               CALL PDLACP3( N2, IA2, A, DESCA,
     $                       WORK( IPMAT + N2*N2 ), N2, 
     $                       IROWSRC, ICOLSRC, 1 )
            ELSE
               CALL PDLACP3( N2, IA2, A, DESCA, 
     $                       WORK( IPW ), N2,
     $                       IROWSRC, ICOLSRC, 1 )
            END IF
***
***         Debugging
***
            if (debug) then
               dummy2 = pdlange('1', N2, N2, a, ia2, ja2, 
     $                          desca, work(ipw))
               if (iam .eq. iwork(i))  
     $            print *, '|| Global Q || = ', dummy
               if (iam .eq. iwork(i) .and.
     $             abs(dummy-dummy2) .gt. 1.0D-12)
     $            print *, 'Bad copy of Q!!', iam, dummy, dummy2
            end if
*
*           Copy to distributed matrix Q (created from workspace)
*
            IZ2 = IZ + ILO2 - 1
            JZ2 = JZ + ILO2 - 1
            IROFF = MOD( IZ2 - 1, DESCZ( MB_ ) )
            ICOFF = MOD( JZ2 - 1, DESCZ( NB_ ) )
            IZ2ROW = INDXG2P( IZ2, DESCZ( MB_ ), MYROW,
     $                         DESCZ( RSRC_ ), NPROW )
            JZ2COL = INDXG2P( JZ2, DESCZ( NB_ ), MYCOL,
     $                         DESCZ( CSRC_ ), NPCOL )
            MP = NUMROC( N2 + IROFF, DESCZ( MB_ ), MYROW,
     $                      IZ2ROW, NPROW )
            NQ = NUMROC( N2 + ICOFF, DESCZ( NB_ ), MYCOL,
     $                      JZ2COL, NPCOL )
            IQ = IROFF + 1
            JQ = ICOFF + 1
            CALL DESCSET( DESCQ, N2 + IROFF, N2 + ICOFF, DESCZ( MB_ ),
     $               DESCZ( NB_ ), IZ2ROW, JZ2COL, DESCZ( CTXT_ ),
     $               MAX( 1, MP ) )
            IPQ = IPW
            IPW = IPQ + MP * NQ

            CALL PDLACPY( 'FULL', N2, N2, A, IA2, JA2, DESCA,
     $              WORK( IPQ ), IQ, JQ, DESCQ )
***
***         Debugging
***
            if (debug) then
               dummy = pdlange('1', N2, N2, A, IA2, JA2, DESCA, 
     $                         work(IPW))
               if (printit) print *, '|| Q || = ', dummy
               dummy = pdlange('1', N2, N2, A, IA2, JA2, DESCA,
     $                         work(IPW))
               if (printit) print *, '|| Q after copy || = ', dummy
               dummy = pdlange('1', N2, N2, WORK(IPQ), IQ, JQ, 
     $                         DESCQ, WORK(IPW) )
               if (printit) print *, '|| Qcopy || = ', dummy
            end if

*
*           Get Q transpose as well. This step is required because of 
*           a limitation on PDGEMM0 that will vanish in the next
*           ScaLAPACK release.
*
            IF ( IAM .EQ. IWORK( I ) ) THEN
               CALL DLASET('ALL', N2, N2, ZERO, ONE, WORK( IPW ), N2)
               CALL DGEMM('Trans', 'No trans', N2, N2, N2, ONE,
     $           WORK( IPMAT + N2*N2 ), N2, WORK( IPW ), N2, ZERO,
     $           WORK( IPW + N2*N2), N2)
               CALL PDLACP3( N2, IA2, A, DESCA,
     $                       WORK( IPW + N2*N2 ), N2,
     $                       IROWSRC, ICOLSRC, 1 )
            ELSE
               CALL PDLACP3( N2, IA2, A, DESCA,
     $                       WORK( IPW ), N2,
     $                       IROWSRC, ICOLSRC, 1 )
            END IF
            IPQT = IPW
            IPW = IPQT + MP * NQ
            CALL PDLACPY( 'FULL', N2, N2, A, IA2, JA2, DESCA,
     $              WORK( IPQT ), IQ, JQ, DESCQ )
***
***         Debugging
***
            if (debug) then
               call pdgemm0('N', 'N', n2, n2, n2, one, WORK(IPQ),
     $            IQ, JQ, DESCQ, WORK( IPQT ), IQ, JQ, DESCQ, zero,
     $            WORK(ipw), 1, 1, descq)
***     $            WORK(ipw), iq, jq, descq)
***               dummy = pdlange('1', N2, N2, work(ipw), iq, jq,
               dummy = pdlange('1', N2, N2, work(ipw), 1, 1,
     $                   descq, work(ipw+2*n2*n2))
               if (printit) print *, '|| Q * Q^T || = ', dummy
            end if
*
*           Update the accumulated orthogonal matrix and the off-diagonal
*           blocks of the matrix as necessary.
*
            CALL PDSDCUPD2( WANTT, WANTZ, N, N2, ILO2, A, IA, JA,
     $                     DESCA, Z, IZ, JZ, DESCZ, 
     $                     WORK( IPQ ), IQ, JQ, DESCQ,
     $                     WORK( IPQT ), WORK( IPW ) )

            CALL BLACS_BARRIER( ICTXT, 'All' )

            IF ( WANTT ) THEN
*
*              Restore the now upper quasi-triangular matrices (in Schur 
*              form, with 1-by-1 and 2-by-2 diagonal blocks) to the 
*              global A matrix.
*
               IF ( IAM .EQ. IWORK( I ) ) THEN
                  if (debug) 
     $               dummy = dlange('1',N2,N2,work(ipmat),N2,work(ipw))
                  CALL PDLACP3( N2, IA2, A, DESCA, WORK( IPMAT ), N2,
     $                          IROWSRC, ICOLSRC, 1 )
               ELSE
                  CALL PDLACP3( N2, IA2, A, DESCA, WORK( IPW ), N2,
     $                          IROWSRC, ICOLSRC, 1 )
               END IF
***
***         Debugging
***
               if (debug) then
                  dummy2 = pdlange('1',N2,N2,a,ia2,ia2,desca,
     $                             work(ipw))
                  if ((iam .eq. iwork(i)) .and.
     $                 abs(dummy-dummy2) .gt. 1.0D-12)
     $               print *, 'Bad copy back to A!!', iam, dummy, 
     $                        dummy2
               end if

            END IF

            IF ( IAM .EQ. IWORK( I ) ) THEN
               IPMAT = IPMAT + 2 * N2 * N2
            END IF

 550     CONTINUE
      END IF
*
*     End of task-parallel phase.
*     Re-constitute the eigenvalues on all processors.
*
      CALL DGSUM2D( ICTXT, 'All', ' ', N, 1, WR, N, -1, -1 )
      CALL DGSUM2D( ICTXT, 'All', ' ', N, 1, WI, N, -1, -1 )
*
*     Cleanup
*     Free the task lists;
*
      IRESULT = TL_DESTROY( PAR_LIST )
      IRESULT = TL_DESTROY( LOCAL_LIST )
      IRESULT = TL_FREE()
*
*     Set WORK values
*
 1000 WORK( 1 ) = DBLE( LWMIN )
      WORK( 2 ) = ERREST
      WORK( 3 ) = FLOPS
      WORK( 4 ) = DBLE( NSPLITS )
*
*     END OF PDSDC
*
      END




      SUBROUTINE RECBLCKS( PAR_LIST, IWORK, WORK )
      INTEGER PAR_LIST, IWORK( * )
      DOUBLE PRECISION WORK( * )
*
*        Records the state of the unsolved diagonal blocks of the matrix.
*        Details:
*           IWORK( 2 ) == Number of unsolved blocks
*        and
*           IWORK( 3 + (I - 1) * 2 ) == Low index
*           IWORK( 3 + (I - 1) * 2 + 1 ) == High index
*           WORK( 4 + (I - 1) * 2 ) == lower bound on real part
*           WORK( 4 + (I - 1) * 2 + 1 ) == upper bound on real part
*        for the I'th largest block (blocks are arranged in decreasing
*        size).
*
      INTEGER I, NTASKS, IHI, ILO, IRESULT
      DOUBLE PRECISION LOW, HIGH
      INTEGER TL_GET2, TL_SIZE
      EXTERNAL TL_GET2, TL_SIZE

      NTASKS = TL_SIZE( PAR_LIST )
      IWORK( 2 ) = NTASKS
      DO 100 I = 1, NTASKS
         IRESULT = TL_GET2( PAR_LIST, I, ILO, IHI, LOW, HIGH )
         IWORK( 3 + (I - 1) * 2 ) = ILO
         IWORK( 3 + (I - 1) * 2 + 1 ) = IHI
         WORK( 4 + (I - 1) * 2 ) = LOW
         WORK( 4 + (I - 1) * 2 + 1 ) = HIGH
 100  CONTINUE

      RETURN
      END
