C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  (Stripped down PVM-only version (4/13/95), for use in ParkBench     #
C   benchmark suite)                                                   #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      SUBROUTINE LOGTRANS(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                    MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, LM,
     &                    LN, MX, A, WS, B)
C
C This subroutine calls routines that compute B = transpose(A) using
C an O(log P) transpose algorithm. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,LN,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,LN,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,LN,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,LN,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LOGTRANS requires that MAPSIZE be a power of two.
C
C Communication options (COMMOPT) for LOGTRANS include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LOGTRANS include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5) .AND. (BUFFERS .GT. 2)
C    recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: TRANSPOSE
C calls: LGTRNS1, LGTRNS2, LGTRNS3, LGTRNS4
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C local component of the array that is to be transposed, of size
C REAL (W,LM(MYINDEX),H1,H2,N)
      REAL A(1)
C
C     Work Space
C
C message buffers
C (large enough for REAL WS(W,MAX(LM),H1,H2,N,BUFFERS) 
C               and REAL WS(W,MAX(LN),H1,H2,M,BUFFERS) )
      REAL WS(1)
C
C     Output
C
C local component of the transposed array.
C (organized as REAL (W,MX,LN,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,LN,H2), (W,MX,LN,M,H2), or (W,MX,M,LN,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C temporary for checking legality of arguments
      INTEGER I, ML, NL
C declared size of first dimension of WS array in LGTRNS3 and LGTRNS4
      INTEGER WSSIZE
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Compute transpose.
C
C       Check that P is a power or two.
        I = 1
        DO WHILE (I .LT. MAPSIZE)
          I = I*2
        ENDDO
C
        IF (MAPSIZE .NE. I) THEN
          WRITE(0,101) MAP(MYINDEX), MAPSIZE
  101     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE TRANSPOSE ',/,
     &            ' O(log P) TRANSPOSE ALGORITHM REQUIRES THAT THE',
     &            ' NUMBER OF PROCESSORS BE A POWER OF TWO',/,
     &            ' PROCID = ',I4,' P = ',I4)
          STOP
        ENDIF
C
C       Check whether LM and LN are constant vectors and sum to 
C       M and N, respectively.
        ML = M/MAPSIZE
        NL = N/MAPSIZE
        I = 0
        DO WHILE ((I .LT. MAPSIZE) .AND. (LM(I) .EQ. ML) .AND. 
     &            (LN(I) .EQ. NL)) 
          I = I + 1
        ENDDO
C
C       Choose transpose algorithm.
        IF (I .EQ. MAPSIZE) THEN
C         everything regular, so use minimal copying algorithms 
          IF (BUFFERS .EQ. 2) THEN
C           double buffer algorithms
            CALL LGTRNS1(COMMOPT, PROTOPT, MAPSIZE, MAP,
     &                   MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL, 
     &                   MX, A, WS, B) 
C
          ELSEIF (BUFFERS .GE. 3) THEN
C           multiple buffer algorithms
            CALL LGTRNS2(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                   MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, 
     &                   NL, MX, A, WS, B) 
C
          ELSE
C           illegal number of buffers specified
            WRITE(0,100) MAP(MYINDEX), BUFFERS
  100       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE LOGTRANS ',/,
     &              ' ILLEGAL COMMUNICATION BUFFER SIZE SPECIFIED',/,
     &              ' PROCID = ',I4,' BUFFERS = ',I4)
            STOP
          ENDIF
C
        ELSE
C         irregular sizes, so use general algorithms
C
C         First, calculate declared workspace dimension.
          ML = LM(0)
          NL = LN(0)
          DO I=1,MAPSIZE-1
            IF (ML .LT. LM(I)) ML = LM(I)
            IF (NL .LT. LN(I)) NL = LN(I)
          ENDDO
          WSSIZE = MAX(ML*N,NL*M)*W*H1*H2
C          
C         Next, choose transpose algorithm.
          IF ((BUFFERS .EQ. 4) .OR. (BUFFERS .EQ. 5)) THEN
C           4 buffer algorithm
            CALL LGTRNS3(COMMOPT, PROTOPT, MAPSIZE, MAP,
     &                   MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN, 
     &                   LM(MYINDEX), MX, WSSIZE, A, WS, B) 
C
          ELSEIF (BUFFERS .GT. 5) THEN
C           recv-ahead algorithm (requires at least 6 buffers)
            CALL LGTRNS4(COMMOPT, BUFFERS/2, PROTOPT, 
     &                   MAPSIZE, MAP, MYINDEX, BASE, DIR, W, M, N, H1,
     &                   H2, LM, LN, LM(MYINDEX), MX, WSSIZE, A, WS, B) 
C
          ELSE
C           illegal number of buffers specified
            WRITE(0,100) MAP(MYINDEX), BUFFERS
            STOP
          ENDIF
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS1(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                   MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                   MX, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and two communication buffers. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = M/P rows of A; 
C  B(W,NL,H1,H2,M): Each processor has NL = N/P rows of B; 
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS1 requires that MAPSIZE be a power of two, and that M and N be 
C integer multiples of MAPSIZE.
C
C Communication options (COMMOPT) for LGTRNS1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRNS1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: LGTRNS_INITR, SWAP, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MX
C Local component of the array that is to be transposed.
C (organized as REAL (W,ML,H1,H2,N))
      REAL A(W*ML*H1*H2*N)
C
C     Work Space
C
C message buffers
      REAL WS(W*ML*H1*H2*N,2)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,NL,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,NL,H2), (W,MX,NL,M,H2), or (W,MX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C size (in reals) of message being swapped
      INTEGER HALFSIZE
C number of swaps in O(log P) transpose algorithm
      INTEGER MXSTEP
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX), BUFFSIZE(LGPROCSX), DIRN(LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C arrays indicating the origin offset of the message being sent and
C received, respectively
      INTEGER SORIGIN(LGPROCSX), RORIGIN(LGPROCSX)
C indices indicating which message buffer is used for sending the
C message in a swap, and which buffer is used for receiving the message
      INTEGER SNDBUF, RCVBUF
C loop indices
      INTEGER I, J, STEP
C array indices
      INTEGER ITO, IFROM
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals).
      HALFSIZE = W*ML*H1*H2*N/2
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm.
      CALL LGTRNS_INITR(MAPSIZE, MAP, MYINDEX, LGPROCSX, HALFSIZE, 
     &                  MXSTEP, SWAPNODE, ORDER, BLOCKS, BUFFSIZE, 
     &                  DIRN, SORIGIN, RORIGIN)
C
C     Set initial roles of message buffers.
      SNDBUF = 1
      RCVBUF = 2
C
C     Construct transpose using logarithmic exchange algorithm:
C     case 1) (STEP .EQ. 1)
C
C     Send half of A and receive a message into WS(*,RCVBUF).
      CALL SWAP(COMMOPT, PROTOPT, ORDER(1), ME, BASE, 
     &          SWAPNODE(1), RBYTES*HALFSIZE, A(SORIGIN(1)+1),
     &          RBYTES*HALFSIZE, WS(RORIGIN(1)+1,RCVBUF))
C
      IF (MXSTEP .GT. 1) THEN
C       Copy half of WS(*,RCVBUF) into A, for use in subsequent steps.
        IFROM = RORIGIN(1) + DIRN(2)*BUFFSIZE(2)
        ITO   = (1-DIRN(1))*BUFFSIZE(1) + DIRN(2)*BUFFSIZE(2)
        DO I = 1,BUFFSIZE(2)
          A(ITO+I) = WS(IFROM+I,RCVBUF)
        ENDDO
      ENDIF
C
C     case 2) (STEP = 2,...,MXSTEP)
      DO STEP=2,MXSTEP
C
C       Switch roles of message buffers.
        SNDBUF = RCVBUF
        RCVBUF = MOD(SNDBUF, 2) + 1
C
C       Prepare a message of size HALFSIZE in WS(*,SNDBUF). In step
C       "STEP", the message is constructed from 2**(STEP-1) pieces, 1/2
C       from A and 1/2 from previous incoming message:
C        If (DIRN(STEP-1) .EQ. 0), then overwriting WS blocks 0, 2, ... 
C        If (DIRN(STEP-1) .EQ. 1), then overwriting WS blocks 1, 3, ...
        DO J = DIRN(STEP-1)+1,BLOCKS(STEP),2
          ITO   = SORIGIN(STEP) + (J-1)*BUFFSIZE(STEP)
          IFROM = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
          DO I = 1,BUFFSIZE(STEP)
            WS(ITO+I,SNDBUF) = A(IFROM+I)
          ENDDO
        ENDDO
C
C       Send newly assembled message and receive message into
C       WS(*,RCVBUF). 
        CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP), ME, BASE, 
     &            SWAPNODE(STEP), RBYTES*HALFSIZE, 
     &            WS(SORIGIN(STEP)+1,SNDBUF), RBYTES*HALFSIZE, 
     &            WS(RORIGIN(STEP)+1,RCVBUF))
C
        IF (STEP .LT. MXSTEP) THEN
C         Copy half of WS(*,RCVBUF) into A, for use in subsequent steps.
          DO J = 1,BLOCKS(STEP)
            IFROM = RORIGIN(STEP) 
     &            + (2*J-2+DIRN(STEP+1))*BUFFSIZE(STEP+1)
            ITO   = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP) 
     &            + DIRN(STEP+1)*BUFFSIZE(STEP+1)
            DO I = 1,BUFFSIZE(STEP+1)
              A(ITO+I) = WS(IFROM+I,RCVBUF)
            ENDDO
          ENDDO
        ENDIF
C
      ENDDO
C
C     Finally, transpose each of P components from A or WS to B.
      IF (DIRN(MXSTEP) .EQ. 0) THEN
        DO I = 0,MAPSIZE-1,2
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*I+1,
     &               A(I*BUFFSIZE(MXSTEP)+1), B)
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*(I+1)+1,
     &               WS((I/2)*BUFFSIZE(MXSTEP)+1,RCVBUF), B)
        ENDDO
      ELSE
        DO I = 0,MAPSIZE-1,2
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*I+1,
     &               WS((I/2)*BUFFSIZE(MXSTEP)+1,RCVBUF), B)
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*(I+1)+1,
     &               A((I+1)*BUFFSIZE(MXSTEP)+1), B)
        ENDDO
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS2(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                   MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, ML,
     &                   NL, MX, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and more than two communication buffers. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = M/P rows of A; 
C  B(W,NL,H1,H2,M): Each processor has NL = N/P rows of B; 
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS2 requires that MAPSIZE be a power of two, and that
C M and N be integer multiples of MAPSIZE.
C
C Communication options (COMMOPT) for LGTRNS2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRNS2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5) nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5) forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: LGTRNS_INITR, SWAP1, SWAP2, SWAP3, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MX
C Local component of the array that is to be transposed.
C (organized as REAL (W,ML,H1,H2,N))
      REAL A(W*ML*H1*H2*N)
C
C     Work Space
C
C message buffers
C (messages sent from (*,1,*), messages received into (*,2,*))
      REAL WS((W*ML*H1*H2*N),BUFFERS)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,NL,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,NL,H2), (W,MX,NL,M,H2), or (W,MX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C size (in reals and in bytes) of message being swapped
      INTEGER HALFSIZE, MSGLTH
C number of swaps in O(log P) transpose algorithm
      INTEGER MXSTEP
C number of buffers to use in recv-ahead algorithm
      INTEGER MXBUF, HLFBF1, HLFBF2
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX), BUFFSIZE(LGPROCSX), DIRN(LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C arrays indicating the origin offset of the message being sent and
C received, respectively
      INTEGER SORIGIN(LGPROCSX), RORIGIN(LGPROCSX)
C loop indices
      INTEGER I, J, STEP
C array indices
      INTEGER ITO, IFROM
C indices indicating which message buffer is used for sending the
C message in a swap, which buffer is used for receiving the message.
C and which buffer is free to be reused
      INTEGER SNDBUF, RCVBUF, PREVBUF
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals and in bytes). 
      HALFSIZE = W*ML*H1*H2*N/2
      MSGLTH   = RBYTES*HALFSIZE
C
C     Precalculate swap partners and other information needed by
C     transpose algorithm.
      CALL LGTRNS_INITR(MAPSIZE, MAP, MYINDEX, LGPROCSX, HALFSIZE, 
     &                  MXSTEP, SWAPNODE, ORDER, BLOCKS, BUFFSIZE, 
     &                  DIRN, SORIGIN, RORIGIN)
C
C     Calculate number of recv-ahead buffers to use.
      MXBUF = MIN(MXSTEP, BUFFERS)
      HLFBF1 = MXBUF/2
      HLFBF2 = (MXBUF+1)/2
C
C     Post MXBUF receive requests.
      DO I=1,MXBUF
        CALL SWAP1(COMMOPT, PROTOPT, .TRUE., ORDER(I), ME,
     &             BASE, SWAPNODE(I), MSGLTH, WS(RORIGIN(I)+1,I)) 
      ENDDO
C
C     Initialize buffer pointers.
      SNDBUF  = MXBUF
      RCVBUF  = 1
      PREVBUF = 1
C
C     Construct transpose using logarithmic exchange algorithm:
C     case 1) (STEP .EQ. 1)
C
C     Send half of A and receive a message into WS(*,RCVBUF).
      CALL SWAP2(COMMOPT, PROTOPT, .TRUE., ORDER(1), ME, 
     &           BASE, SWAPNODE(1), MSGLTH, A(SORIGIN(1)+1), MSGLTH,
     &           WS(RORIGIN(1)+1,RCVBUF))
      CALL SWAP3(COMMOPT, PROTOPT, ME, BASE, SWAPNODE(1),
     &           0, WS)
C
      IF (MXSTEP .GT. 1) THEN
C       Copy half of WS(*,RCVBUF) into A, for use in subsequent steps.
        IFROM = RORIGIN(1) + DIRN(2)*BUFFSIZE(2)
        ITO   = (1-DIRN(1))*BUFFSIZE(1) + DIRN(2)*BUFFSIZE(2)
        DO I = 1,BUFFSIZE(2)
          A(ITO+I) = WS(IFROM+I,RCVBUF)
        ENDDO
      ENDIF
C
C     case 2) (STEP = 2,...,MXSTEP)
      DO STEP=2,MXSTEP
C
C       Update buffer pointers.
        SNDBUF  = RCVBUF
        RCVBUF  = MOD(RCVBUF, MXBUF) + 1
C
C       Prepare a message of size HALFSIZE in WS(*,SNDBUF). In step
C       "STEP", the message is constructed from 2**(STEP-1) pieces, 1/2
C       from A and 1/2 from previous incoming message:
C        If (DIRN(STEP-1) .EQ. 0), then overwriting WS blocks 0, 2, ... 
C        If (DIRN(STEP-1) .EQ. 1), then overwriting WS blocks 1, 3, ...
        DO J = DIRN(STEP-1)+1,BLOCKS(STEP),2
          ITO   = SORIGIN(STEP) + (J-1)*BUFFSIZE(STEP)
          IFROM = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
          DO I = 1,BUFFSIZE(STEP)
            WS(ITO+I,SNDBUF) = A(IFROM+I)
          ENDDO
        ENDDO
C
C       Delay posting more receive requests to allow some nonblocking 
C       sends to complete.
        IF (STEP .GT. HLFBF1+1) THEN
C
C         Wait until send is complete before requesting that the
C         buffer be overwritten.
          CALL SWAP3(COMMOPT, PROTOPT, ME, BASE,
     &               SWAPNODE(STEP-HLFBF1), 0, WS)
C
C         Post next recv-ahead receive request.
          IF (STEP+(HLFBF2-1) .LE. MXSTEP) THEN
            CALL SWAP1(COMMOPT, PROTOPT, .TRUE.,
     &                 ORDER(STEP+(HLFBF2-1)), ME, BASE,
     &                 SWAPNODE(STEP+(HLFBF2-1)), MSGLTH,
     &                 WS(RORIGIN(STEP+(HLFBF2-1))+1,PREVBUF)) 
            PREVBUF = MOD(PREVBUF, MXBUF) + 1
          ENDIF
C
        ENDIF
C
C       Send newly assembled message and receive message into
C       WS(*,RCVBUF).  
        CALL SWAP2(COMMOPT, PROTOPT, .TRUE., ORDER(STEP),
     &             ME, BASE, SWAPNODE(STEP), MSGLTH, 
     &             WS(SORIGIN(STEP)+1,SNDBUF), MSGLTH,
     &             WS(RORIGIN(STEP)+1,RCVBUF))  
C
        IF (STEP .LT. MXSTEP) THEN
C         Copy half of WS(*,RCVBUF) into A, for use in subsequent steps. 
          DO J = 1,BLOCKS(STEP)
            IFROM = RORIGIN(STEP) 
     &            + (J*2-2+DIRN(STEP+1))*BUFFSIZE(STEP+1)
            ITO   = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP) 
     &            + DIRN(STEP+1)*BUFFSIZE(STEP+1)
            DO I = 1,BUFFSIZE(STEP+1)
              A(ITO+I) = WS(IFROM+I,RCVBUF)
            ENDDO
          ENDDO
        ENDIF
C
      ENDDO
C
C     Transpose each of P components from A or WS to B.
      IF (DIRN(MXSTEP) .EQ. 0) THEN
        DO I = 0,MAPSIZE-1,2
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*I+1,
     &               A(I*BUFFSIZE(MXSTEP)+1), B)
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*(I+1)+1,
     &               WS((I/2)*BUFFSIZE(MXSTEP)+1,RCVBUF), B)
        ENDDO
      ELSE
        DO I = 0,MAPSIZE-1,2
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*I+1,
     &               WS((I/2)*BUFFSIZE(MXSTEP)+1,RCVBUF), B)
          CALL TRANS(DIR, W, M, H1, H2, ML, NL, MX, ML*(I+1)+1,
     &               A((I+1)*BUFFSIZE(MXSTEP)+1), B)
        ENDDO
      ENDIF
C
C     Finally, wait until outstanding send operations are complete.
      IF (MXSTEP .GT. 1) THEN
        DO I=MXSTEP-HLFBF1+1,MXSTEP
          CALL SWAP3(COMMOPT, PROTOPT, ME, BASE, 
     &             SWAPNODE(I), 0, WS)
        ENDDO
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS3(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                   MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                   ML, MX, WSSIZE, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and four communication buffers. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS3 requires that MAPSIZE be a power of two.
C
C Communication options (COMMOPT) for LGTRNS3 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRNS3 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: JOINSPLIT, LGTRNS_INITI, SWAP, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, ML, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C first dimension of work array
      INTEGER WSSIZE
C Local component of the array that is to be transposed.
C (organized as REAL (W,LM(MYINDEX),H1,H2,N))
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C message buffers
      REAL WS(WSSIZE,4)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,LN,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,LN,H2), (W,MX,LN,M,H2), or (W,MX,M,LN,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C number of bytes in a basic block being swapped
      INTEGER BLOCKSIZE
C number of bytes in message being sent and received, respectively
      INTEGER SENDSIZE, RECVSIZE
C number of swaps in O(log P) transpose algorithm
      INTEGER MXSTEP
C number of columns (last index) and rows (second index) in message
C sent in a given swap
      INTEGER SENDCOLS(LGPROCSX), SENDROWS(LGPROCSX)
C number of columns (last index) and rows (second index) in message
C received in a given swap
      INTEGER RECVCOLS(LGPROCSX), RECVROWS(LGPROCSX)
C information on how to join the kept and received data, and how to
C split it into data to be kept and data to be sent
      INTEGER DIRN(LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER SWAPORDER(LGPROCSX)
C indices indicating which message buffer is used for sending the
C message in a swap, and which buffer is used for receiving the message
      INTEGER SNDBUF, RCVBUF
C indices indicating which message buffer is used for the data being
C kept during a given step, and which buffer is used for the data being
C kept during the next step
      INTEGER CURBUF, NXTBUF
C loop index
      INTEGER STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals).
      BLOCKSIZE = RBYTES*W*H1*H2
C
C     Set initial roles of buffers.
      CURBUF = 1
      NXTBUF = 2
      SNDBUF = 3
      RCVBUF = 4
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm. (New ordering for columns in WS(1,RCVBUF);
C     workspace in WS(1,CURBUF) and WS(1,SNDBUF).)
      CALL LGTRNS_INITI(MAPSIZE, MAP, MYINDEX, LM, LN, N, LGPROCSX,
     &                  WS(1,CURBUF), WS(1,SNDBUF), MXSTEP, 
     &                  WS(1,RCVBUF), SENDCOLS, SENDROWS, RECVCOLS, 
     &                  RECVROWS, DIRN, SWAPNODE, SWAPORDER)
C
C     Reorder columns so that LN ordering is attained after transpose.
      IF (DIRN(1) .EQ. 0) THEN
        CALL LOGORDER(RECVCOLS(1), W*LM(MYINDEX)*H1*H2, N,
     &                WS(1,RCVBUF), A, WS(1,CURBUF), WS(1,SNDBUF))  
      ELSE
        CALL LOGORDER(SENDCOLS(1), W*LM(MYINDEX)*H1*H2, N,
     &                WS(1,RCVBUF), A, WS(1,SNDBUF), WS(1,CURBUF)) 
      ENDIF
C
C     Construct transpose using logarithmic exchange algorithm:
C
      DO STEP=1,MXSTEP-1
C
C       Send half of current data and receive new data.
        SENDSIZE = SENDCOLS(STEP)*SENDROWS(STEP)*BLOCKSIZE
        RECVSIZE = RECVCOLS(STEP)*RECVROWS(STEP)*BLOCKSIZE
        CALL SWAP(COMMOPT, PROTOPT, SWAPORDER(STEP), ME, 
     &            BASE, SWAPNODE(STEP), SENDSIZE, WS(1,SNDBUF), 
     &            RECVSIZE, WS(1,RCVBUF))
C
C       Combine kept half of current data with received data, then 
C       split the result into a send part and a keep part.
        IF (DIRN(STEP) .EQ. 0) THEN
          IF (DIRN(STEP+1) .EQ. 0) THEN
            CALL JOINSPLIT(W, SENDROWS(STEP), RECVROWS(STEP), H1, H2,
     &                     RECVCOLS(STEP+1), SENDCOLS(STEP+1),
     &                     WS(1,CURBUF), WS(1,RCVBUF),
     &                     WS(1,NXTBUF), WS(1,SNDBUF))
          ELSE
            CALL JOINSPLIT(W, SENDROWS(STEP), RECVROWS(STEP), H1, H2,
     &                     SENDCOLS(STEP+1), RECVCOLS(STEP+1), 
     &                     WS(1,CURBUF), WS(1,RCVBUF),
     &                     WS(1,SNDBUF), WS(1,NXTBUF))
          ENDIF
        ELSE
          IF (DIRN(STEP+1) .EQ. 0) THEN
            CALL JOINSPLIT(W, RECVROWS(STEP), SENDROWS(STEP), H1, H2,
     &                     RECVCOLS(STEP+1), SENDCOLS(STEP+1),
     &                     WS(1,RCVBUF), WS(1,CURBUF),
     &                     WS(1,NXTBUF), WS(1,SNDBUF))
          ELSE
            CALL JOINSPLIT(W, RECVROWS(STEP), SENDROWS(STEP), H1, H2,
     &                     SENDCOLS(STEP+1), RECVCOLS(STEP+1),
     &                     WS(1,RCVBUF), WS(1,CURBUF),
     &                     WS(1,SNDBUF), WS(1,NXTBUF))
          ENDIF
        ENDIF
C
C       Update buffer pointers
        CURBUF = NXTBUF
        NXTBUF = MOD(NXTBUF,2) + 1
C
      ENDDO
C
C     Send half of current data and receive new data.
      SENDSIZE = SENDCOLS(MXSTEP)*SENDROWS(MXSTEP)*BLOCKSIZE
      RECVSIZE = RECVCOLS(MXSTEP)*RECVROWS(MXSTEP)*BLOCKSIZE
      CALL SWAP(COMMOPT, PROTOPT, SWAPORDER(MXSTEP), ME, 
     &          BASE, SWAPNODE(MXSTEP), SENDSIZE, WS(1,SNDBUF), 
     &          RECVSIZE, WS(1,RCVBUF))
C
C     Transpose current and received data into B.
      IF (DIRN(MXSTEP) .EQ. 0) THEN
        CALL TRANS(DIR, W, M, H1, H2, SENDROWS(MXSTEP), LN(MYINDEX),
     &             MX, 1, WS(1,CURBUF), B)
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(MXSTEP), LN(MYINDEX), 
     &             MX, SENDROWS(MXSTEP)+1, WS(1,RCVBUF), B)
      ELSE
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(MXSTEP), LN(MYINDEX), 
     &             MX, 1, WS(1,RCVBUF), B)
        CALL TRANS(DIR, W, M, H1, H2, SENDROWS(MXSTEP), LN(MYINDEX),
     &             MX, RECVROWS(MXSTEP)+1, WS(1,CURBUF), B)
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS4(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                   MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, 
     &                   LN, ML, MX, WSSIZE, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and more than five communication buffers. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS4 requires that MAPSIZE be a power of two.
C
C Communication options (COMMOPT) for LGTRNS4 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRNS4 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: JOINSPLIT, LGTRNS_INITI, SWAP1, SWAP2, SWAP3, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of (double) communication buffers (to use in recv-ahead 
C algorithm)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, ML, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C first dimension of work array
      INTEGER WSSIZE
C Local component of the array that is to be transposed.
C (organized as REAL (W,LM(MYINDEX),H1,H2,N))
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C message buffers
      REAL WS(WSSIZE,2,BUFFERS)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,LN,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,LN,H2), (W,MX,LN,M,H2), or (W,MX,M,LN,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C number of bytes in a basic block being swapped
      INTEGER BLOCKSIZE
C number of bytes in message being sent and received, respectively
      INTEGER SENDSIZE, RECVSIZE
C number of swaps in O(log P) transpose algorithm
      INTEGER MXSTEP
C number of buffers to use in recv-ahead algorithm
      INTEGER MXBUF
C number of columns (last index) and rows (second index) in message
C sent in a given swap
      INTEGER SENDCOLS(LGPROCSX), SENDROWS(LGPROCSX)
C number of columns (last index) and rows (second index) in message
C received in a given swap
      INTEGER RECVCOLS(LGPROCSX), RECVROWS(LGPROCSX)
C information on how to join the kept and received data, and how to
C split it into data to be kept and data to be sent
      INTEGER DIRN(LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER SWAPORDER(LGPROCSX)
C indices indicating the buffers used for the current message swap
C and for the next message swap
      INTEGER CURMSG, NXTMSG
C indices indicating which message buffer is used for sending the
C message in a swap, and which buffer is used for receiving the message
      INTEGER SNDBUF, RCVBUF
C index indicating which buffers hold the local information (not being
C used in a swap during the current step)
      INTEGER LOCAL
C indices indicating which buffer is used for the data being
C kept during a given step, and which buffer is used for the data being
C kept during the next step
      INTEGER CURBUF, NXTBUF
C loop indices
      INTEGER I, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals).
      BLOCKSIZE = RBYTES*W*H1*H2
C
C     Set initial roles of buffers.
      CURMSG = 1
      NXTMSG = 2
      RCVBUF = 1
      SNDBUF = 2
      LOCAL  = BUFFERS
      CURBUF = 1
      NXTBUF = 2
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm. (New ordering for columns in 
C     WS(1,NXTBUF,LOCAL); workspace in WS(1,RCVBUF,CURMSG) and
C     WS(1,SNDBUF,CURMSG).)
      CALL LGTRNS_INITI(MAPSIZE, MAP, MYINDEX, LM, LN, N, LGPROCSX,
     &                  WS(1,RCVBUF,CURMSG), WS(1,SNDBUF,CURMSG), 
     &                  MXSTEP, WS(1,NXTBUF,LOCAL), SENDCOLS, 
     &                  SENDROWS, RECVCOLS, RECVROWS, DIRN, SWAPNODE, 
     &                  SWAPORDER)
C
C     Calculate number of recv-ahead buffers to use.
      MXBUF = MIN(MXSTEP, BUFFERS-1)
C
C     Post MXBUF receive requests.
      DO I=1,MXBUF
        RECVSIZE = RECVCOLS(I)*RECVROWS(I)*BLOCKSIZE
        CALL SWAP1(COMMOPT, PROTOPT, .TRUE., SWAPORDER(I), 
     &             ME, BASE, SWAPNODE(I), RECVSIZE, WS(1,RCVBUF,I)) 
      ENDDO
C
C     Reorder columns so that LN ordering is attained after transpose.
      IF (DIRN(1) .EQ. 0) THEN
        CALL LOGORDER(RECVCOLS(1), W*LM(MYINDEX)*H1*H2, N,
     &                WS(1,NXTBUF,LOCAL), A, WS(1,CURBUF,LOCAL), 
     &                WS(1,SNDBUF,CURMSG))  
      ELSE
        CALL LOGORDER(SENDCOLS(1), W*LM(MYINDEX)*H1*H2, N,
     &                WS(1,NXTBUF,LOCAL), A, WS(1,SNDBUF,CURMSG), 
     &                WS(1,CURBUF,LOCAL)) 
      ENDIF
C
      DO STEP=1,MXSTEP
C
C       Send half of current data and receive new data.
        SENDSIZE = SENDCOLS(STEP)*SENDROWS(STEP)*BLOCKSIZE
        RECVSIZE = RECVCOLS(STEP)*RECVROWS(STEP)*BLOCKSIZE
        CALL SWAP2(COMMOPT, PROTOPT, .TRUE., SWAPORDER(STEP), 
     &             ME, BASE, SWAPNODE(STEP), SENDSIZE, 
     &             WS(1,SNDBUF,CURMSG), RECVSIZE, WS(1,RCVBUF,CURMSG))
C
C       Wait until previous send of WS(1,SNDBUF,NXTMSG) is complete 
C       before requesting that the buffer be overwritten.
        IF (STEP .GT. MXBUF-1) THEN
          CALL SWAP3(COMMOPT, PROTOPT, ME, BASE,
     &               SWAPNODE(STEP-(MXBUF-1)), 0, WS)
        ENDIF
C
        IF (STEP .LT. MXSTEP) THEN
C         Combine kept half of current data with received data, 
C         splitting the result into a send part and a keep part.
          IF (DIRN(STEP) .EQ. 0) THEN
            IF (DIRN(STEP+1) .EQ. 0) THEN
              CALL JOINSPLIT(W, SENDROWS(STEP), RECVROWS(STEP), H1, H2,
     &                       RECVCOLS(STEP+1), SENDCOLS(STEP+1),
     &                       WS(1,CURBUF,LOCAL), WS(1,RCVBUF,CURMSG),
     &                       WS(1,NXTBUF,LOCAL), WS(1,SNDBUF,NXTMSG))
            ELSE
              CALL JOINSPLIT(W, SENDROWS(STEP), RECVROWS(STEP), H1, H2,
     &                       SENDCOLS(STEP+1), RECVCOLS(STEP+1), 
     &                       WS(1,CURBUF,LOCAL), WS(1,RCVBUF,CURMSG),
     &                       WS(1,SNDBUF,NXTMSG), WS(1,NXTBUF,LOCAL))
            ENDIF
          ELSE
            IF (DIRN(STEP+1) .EQ. 0) THEN
              CALL JOINSPLIT(W, RECVROWS(STEP), SENDROWS(STEP), H1, H2,
     &                       RECVCOLS(STEP+1), SENDCOLS(STEP+1),
     &                       WS(1,RCVBUF,CURMSG), WS(1,CURBUF,LOCAL),
     &                       WS(1,NXTBUF,LOCAL), WS(1,SNDBUF,NXTMSG))
            ELSE
              CALL JOINSPLIT(W, RECVROWS(STEP), SENDROWS(STEP), H1, H2,
     &                       SENDCOLS(STEP+1), RECVCOLS(STEP+1),
     &                       WS(1,RCVBUF,CURMSG), WS(1,CURBUF,LOCAL),
     &                       WS(1,SNDBUF,NXTMSG), WS(1,NXTBUF,LOCAL))
            ENDIF
          ENDIF
C
C         Post next recv-ahead receive request, using current receive
C         buffer.
          IF (STEP+MXBUF .LE. MXSTEP) THEN
            RECVSIZE = RECVCOLS(STEP+MXBUF)
     &               * RECVROWS(STEP+MXBUF)*BLOCKSIZE
            CALL SWAP1(COMMOPT, PROTOPT, .TRUE.,
     &                 SWAPORDER(STEP+MXBUF), ME, BASE,
     &                 SWAPNODE(STEP+MXBUF), RECVSIZE,
     &                 WS(1,RCVBUF,CURMSG)) 
          ENDIF
C
C         Update buffer pointers
          CURMSG = NXTMSG
          NXTMSG = MOD(NXTMSG,MXBUF) + 1
          CURBUF = NXTBUF
          NXTBUF = MOD(NXTBUF,2) + 1
C
        ENDIF
C
      ENDDO
C
C     Transpose current and received data into B.
      IF (DIRN(MXSTEP) .EQ. 0) THEN
        CALL TRANS(DIR, W, M, H1, H2, SENDROWS(MXSTEP), LN(MYINDEX),
     &             MX, 1, WS(1,CURBUF,LOCAL), B)
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(MXSTEP), LN(MYINDEX), 
     &             MX, SENDROWS(MXSTEP)+1, WS(1,RCVBUF,CURMSG), B)
      ELSE
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(MXSTEP), LN(MYINDEX), 
     &             MX, 1, WS(1,RCVBUF,CURMSG), B)
        CALL TRANS(DIR, W, M, H1, H2, SENDROWS(MXSTEP), LN(MYINDEX),
     &             MX, RECVROWS(MXSTEP)+1, WS(1,CURBUF,LOCAL), B)
      ENDIF
C
C     Wait until outstanding send operations are complete.
      DO I=MXSTEP-MXBUF+2,MXSTEP
        CALL SWAP3(COMMOPT, PROTOPT, ME, BASE, 
     &             SWAPNODE(I), 0, WS)
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS_INITR(MAPSIZE, MAP, MYINDEX, LGPROCSX, MSGSIZE,
     &                        MXSWAP, SWAPNODE, SWAPORDER, BLOCKS, 
     &                        BUFFSIZE, DIRN, SORIGIN, RORIGIN)
C
C This routine calculates swap partners and other information needed
C by the "regular" O(log P) exchange transpose algorithms, i.e. those
C for which all processors send and receive the same size messages.
C
C called by: LGTRNS1, LGTRNS2
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C declared length of the output arrays, at least as long as 
C LOG2(MAPSIZE)
      INTEGER LGPROCSX
C number of reals in a swap message
      INTEGER MSGSIZE
C
C     Output
C
C number of swaps in transpose algorithm
      INTEGER MXSWAP
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER SWAPORDER(LGPROCSX)
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX)
      INTEGER BUFFSIZE(LGPROCSX)
      INTEGER DIRN(LGPROCSX)
      INTEGER SORIGIN(LGPROCSX)
      INTEGER RORIGIN(LGPROCSX)
C
C---- Local Variables --------------------------------------------------
C
C distance between MYINDEX and swap partner index
      INTEGER DISTANCE
C index of swap partner
      INTEGER DEST
C loop index
      INTEGER I
C
C---- Executable Statements -------------------------------------------
C
      MXSWAP = 0                                        
      DISTANCE = MAPSIZE/2
      DO WHILE (DISTANCE .GT. 0)
C
C       Increment step index.
        MXSWAP = MXSWAP+1
C
C       Calculate swap partners, swap order, and buffer pointers. The
C       order is chosen in such a way as to minimize collisions on a
C       bidirectional grid. 
        IF (MOD(MYINDEX, 2*DISTANCE) .LT. DISTANCE) THEN
          DIRN(MXSWAP) = 0
          DEST = MYINDEX + DISTANCE
          IF (MOD(MYINDEX, 2) .EQ. 0) THEN
            SWAPORDER(MXSWAP) = 1
          ELSE
            SWAPORDER(MXSWAP) = -1
          ENDIF
        ELSE
          DIRN(MXSWAP) = 1
          DEST = MYINDEX - DISTANCE
          IF (MOD(DEST, 2) .EQ. 0) THEN
            SWAPORDER(MXSWAP) = -1
          ELSE
            SWAPORDER(MXSWAP) = 1
          ENDIF
        ENDIF
        SWAPNODE(MXSWAP) = MAP(DEST)
C
C       Update distance.
        DISTANCE = DISTANCE/2
C
      ENDDO
C
C     Calculate segment sizes and number of segments making up a given 
C     message.
      BUFFSIZE(1) = MSGSIZE
      BLOCKS(1) = 1
      DO I=2,MXSWAP
        BUFFSIZE(I) = BUFFSIZE(I-1)/2
        BLOCKS(I)   = 2*BLOCKS(I-1)
      ENDDO
C
C     Calculate index offsets for outgoing and incoming messages
      SORIGIN(1) = (1-DIRN(1))*BUFFSIZE(1)
      DO I=2,MXSWAP
C       Determine base address of next message in send buffer.
        IF (((DIRN(I-1) .EQ. 0) .AND. (DIRN(I) .EQ. 0)) .OR.
     &      ((DIRN(I-1) .EQ. 1) .AND. (DIRN(I) .EQ. 1))) THEN
C         Outgoing message has same alignment as previous incoming
C         message (origin at BUFFSIZE(I) + 1).
          SORIGIN(I) = BUFFSIZE(I)
        ELSEIF ((DIRN(I-1) .EQ. 0) .AND. (DIRN(I) .EQ. 1)) THEN 
C         Origin of outgoing message is before that of the previous incoming
C         message (origin at 1).
          SORIGIN(I) = 0
        ELSE
C         Origin of outgoing message is after that of the previous incoming
C         message (origin at 2*buffsize(step)+1).
          SORIGIN(I) = 2*BUFFSIZE(I)
        ENDIF
      ENDDO
C
      DO I=1,MXSWAP-1
        RORIGIN(I) = BUFFSIZE(I+1)
      ENDDO
      RORIGIN(MXSWAP) = 0
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS_INITI(MAPSIZE, MAP, MYINDEX, LM, LN, N,
     &                        LGPROCSX, PORDER, LNSUM, MXSWAP, 
     &                        NEWORDER, SENDCOLS, SENDROWS, RECVCOLS,
     &                        RECVROWS, DIRN, SWAPNODE, SWAPORDER) 
C
C This routine calculates swap partners and other information needed
C by the "irregular" O(log P) exchange transpose algorithms, i.e. those
C for which the size of the messages a processor sends and receives 
C varies between steps and between processors.
C
C called by: LGTRNS3, LGTRNS4
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C number of local "last" indices for input and output arrays in 
C transpose for each processor
      INTEGER LM(0:MAPSIZE-1), LN(0:MAPSIZE-1)
C total number of columns (last index) in input array
      INTEGER N
C declared length of the column and row output arrays, at least as long
C as LOG2(MAPSIZE)
      INTEGER LGPROCSX
C
C     Workspace
C
C new processor column group ordering
      INTEGER PORDER(0:MAPSIZE-1)
C processer column group cumulative sums
      INTEGER LNSUM(0:MAPSIZE-1)
C
C     Output
C
C number of swaps in transpose algorithm
      INTEGER MXSWAP
C array indicating the permuted ordering needed to obtain the desired
C column partition LN at the end of the log transpose (mapping
C new ordering back to old ordering)
      INTEGER NEWORDER(N)
C arrays indicating number of last and second indices in data sent 
C during a given swap
      INTEGER SENDCOLS(LGPROCSX), SENDROWS(LGPROCSX)
C arrays indicating number of last and second indices in data received
C during a given swap
      INTEGER RECVCOLS(LGPROCSX), RECVROWS(LGPROCSX)
C array indicating how to join the kept and received data, and how to
C split it into data to be kept and data to be sent
      INTEGER DIRN(LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER SWAPORDER(LGPROCSX)
C
C---- Local Variables --------------------------------------------------
C
C distance between swap partners
      INTEGER DISTANCE, DIST2
C index offsets
      INTEGER INCREM, INCHLF
C index for original processors column group ordering 
      INTEGER IO
C index for new column ordering
      INTEGER IN
C index of swap partner and index of "base" processor in subgroup with
C a common number of columns or rows
      INTEGER DEST, BASE
C loop indices
      INTEGER I, J
C
C---- Executable Statements -------------------------------------------
C
C     Calculate new column ordering:
C     first, new processor column group ordering
      DISTANCE  = 1
      INCHLF    = MAPSIZE
      PORDER(0) = 0
      DO WHILE (DISTANCE .LT. MAPSIZE)
        INCREM = INCHLF
        INCHLF = INCREM/2
        DO I=0,MAPSIZE-1,INCREM
          PORDER(I+INCHLF) = PORDER(I) + DISTANCE
        ENDDO
        DISTANCE = 2*DISTANCE
      ENDDO
C     second, processer column group sums;
      LNSUM(0) = 0
      DO I=1,MAPSIZE-1
        LNSUM(I) = LNSUM(I-1) + LN(I-1)
      ENDDO
C     third, reordering array.
      IN = 0
      DO I=0,MAPSIZE-1
        IO = PORDER(I)
        DO J=1,LN(IO)
          IN = IN + 1
          NEWORDER(IN) = LNSUM(IO) + J
        ENDDO
      ENDDO
C
C     Calculate swap partners, swap order, where to put swapped
C     data, and number of rows and columns of data sent or received at
C     each step. The order is chosen in such a way as to minimize
C     collisions on a bidirectional grid.   
      MXSWAP  = 0                                        
      DISTANCE = 1
      DIRN(0)  = 0
      DO WHILE (DISTANCE .LT. MAPSIZE)
C
C       Increment step index and calculate next distance
        MXSWAP = MXSWAP+1
        DIST2 = 2*DISTANCE
C
        IF (MOD(MYINDEX, 2*DISTANCE) .LT. DISTANCE) THEN
          DIRN(MXSWAP) = 0
          DEST = MYINDEX + DISTANCE
          IF (MOD(MYINDEX, 2) .EQ. 0) THEN
            SWAPORDER(MXSWAP) = 1
          ELSE
            SWAPORDER(MXSWAP) = -1
          ENDIF
        ELSE
          DIRN(MXSWAP) = 1
          DEST = MYINDEX - DISTANCE
          IF (MOD(DEST, 2) .EQ. 0) THEN
            SWAPORDER(MXSWAP) = -1
          ELSE
            SWAPORDER(MXSWAP) = 1
          ENDIF
        ENDIF
        SWAPNODE(MXSWAP) = MAP(DEST)
C
C       Determine number of rows being sent (and in current data).
        IF (MXSWAP .EQ. 1) THEN
          SENDROWS(1) = LM(MYINDEX)
        ELSE
          SENDROWS(MXSWAP) = SENDROWS(MXSWAP-1) + RECVROWS(MXSWAP-1)
        ENDIF
C
C       Determine number of rows received in new data.
        RECVROWS(MXSWAP) = 0
        BASE = (DEST/DISTANCE)*DISTANCE
        DO I=BASE,BASE+DISTANCE-1
          RECVROWS(MXSWAP) = RECVROWS(MXSWAP) + LM(I)
        ENDDO
C
C       Determine number of columns of data received at each step.
        RECVCOLS(MXSWAP) = 0
        BASE = MOD(MYINDEX,DIST2)
        DO I=BASE,MAPSIZE-1,DIST2
          RECVCOLS(MXSWAP) = RECVCOLS(MXSWAP) + LN(I)
        ENDDO
C
C       Update distance.
        DISTANCE = DIST2
C
      ENDDO
C
C     Determine number of columns of data sent at each step.
      SENDCOLS(1) = N - RECVCOLS(1)
      DO I=2,MXSWAP
        SENDCOLS(I) = RECVCOLS(I-1) - RECVCOLS(I)
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LOGORDER(SPLIT, BLKSIZ, N, ORDER, INPUT, OUTPUT1, 
     &                    OUTPUT2)    
C
C This routine reorders the array INPUT so that the desired partition
C ordering is achieved after the log tranpose is complete.
C The reorderd data is output into two arrays, with the first "half"
C (determined by SPLIT) in OUTPUT1 and the second half in OUTPUT2.
C
C called by: LGTRNS3, LGTRNS4
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C index specifying where split between OUTPUT1 and OUTPUT2 occurs
      INTEGER SPLIT
C dimensions of data array
      INTEGER BLKSIZ, N
C specification of new column ordering
      INTEGER ORDER(N)
C array to be reorderd
      REAL INPUT(BLKSIZ, N)
C
C     Output
C
C reordered array (in two pieces)
      REAL OUTPUT1(BLKSIZ, N)
      REAL OUTPUT2(BLKSIZ, N)
C
C---- Local Variables --------------------------------------------------
C
C loop indices
      INTEGER IB, IN
C
C---- Executable Statements -------------------------------------------
C
      DO IN=1,SPLIT
        DO IB=1,BLKSIZ
          OUTPUT1(IB,IN) = INPUT(IB,ORDER(IN))
        ENDDO
      ENDDO
      DO IN=SPLIT+1,N
        DO IB=1,BLKSIZ
          OUTPUT2(IB,IN-SPLIT) = INPUT(IB,ORDER(IN))
        ENDDO
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE JOINSPLIT(W, MA, MB, H1, H2, N1, N2, INA, INB, OUT1,
     &                     OUT2) 
C
C This routine concatenates the arrays INA and INB on the second index,
C interleaving blocks of the second index values as indicated by array 
C LM, and splits the output into two arrays on the last index.

C called by: LGTRNS3, LGTRNS4
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C dimensions of input and output arrays
      INTEGER W, MA, MB, H1, H2, N1, N2
C arrays to be joined
      REAL INA(W,MA,H1,H2,N1+N2)
      REAL INB(W,MB,H1,H2,N1+N2)
C
C     Output
C
C split output arrays
      REAL OUT1(W,MA+MB,H1,H2,N1)
      REAL OUT2(W,MA+MB,H1,H2,N2)
C
C---- Local Variables --------------------------------------------------
C
C loop indices
      INTEGER IW, IM, IH1, IH2, IN
C
C---- Executable Statements --------------------------------------------
C
C     First half of OUT1
      DO IN=1,N1
        DO IH2=1,H2
          DO IH1=1,H1
            DO IM=1,MA
              DO IW=1,W
                OUT1(IW,IM,IH1,IH2,IN) = INA(IW,IM,IH1,IH2,IN)
              ENDDO
            ENDDO
          ENDDO
        ENDDO
      ENDDO     
C
C
C     First half of OUT2
      DO IN=1,N2
        DO IH2=1,H2
          DO IH1=1,H1
            DO IM=1,MA
              DO IW=1,W
                OUT2(IW,IM,IH1,IH2,IN) = INA(IW,IM,IH1,IH2,N1+IN)
              ENDDO
            ENDDO
          ENDDO
        ENDDO
      ENDDO     
C
C     Second half of OUT1
      DO IN=1,N1
        DO IH2=1,H2
          DO IH1=1,H1
            DO IM=1,MB
              DO IW=1,W
                OUT1(IW,MA+IM,IH1,IH2,IN) = INB(IW,IM,IH1,IH2,IN)
              ENDDO
            ENDDO
          ENDDO
        ENDDO
      ENDDO     
C
C     Second half of OUT2
      DO IN=1,N2
        DO IH2=1,H2
          DO IH1=1,H1
            DO IM=1,MB
              DO IW=1,W
                OUT2(IW,MA+IM,IH1,IH2,IN) = INB(IW,IM,IH1,IH2,N1+IN)
              ENDDO
            ENDDO
          ENDDO
        ENDDO
      ENDDO     
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
