C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  (Stripped down PVM-only version (4/13/95), for use in ParkBench     #
C   benchmark suite)                                                   #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      SUBROUTINE TRANSPOSE(COMMOPT, BUFFERS, PROTOPT,
     &                     MAPSIZE, MAP, MYINDEX, BASE, DIR, W, M, N,
     &                     H1, H2, LM, LN, MX, A, WS, B)
C
C This routine transposes the array
C                            A(W,LM(MYINDEX),H1,H2,N)
C to an array with general form
C                            B(W,LN(MYINDEX),H1,H2,M),
C where both A and B are distributed over MAPSIZE processors. Both
C arrays are declared to be real; they hold real data if (W .EQ. 1)
C and complex data if (W .EQ. 2). The actual organization of B is
C determined by the parameter DIR; this specifies where TRANSPOSE is
C called from, allowing the routine to order B as required for
C subsequent stages in PSTSWM.  The parameters COMMOPT, PROTOPT, and
C BUFFERS select one of a variety of different transpose algorithms.
C
C 1) The basic idea:
C
C  We compute B = transpose(A), where:
C    A is a matrix of size (W,M,H1,H2,N), distributed by rows (M index);
C    B is a matrix of size (W,N,H1,H2,M), distributed by rows (N index).
C
C  Each processor has part of A and B as follows:
C      A(W,LM(MYINDEX),H1,H2,N):
C          Processor I has LM(I) rows of A;
C          LM(0) + ... + LM(MAPSIZE-1) = M.
C      B(W,LN(MYINDEX),H1,H2,M):
C          Processor I has LN(I) rows of B;
C          LN(0) + ... + LN(MAPSIZE-1) = N.
C
C 2) Specializations:
C
C  The routine TRANS incorporates additional reorganizations that allow
C  for B having a shape different than (W,N,H1,H2,M), and an
C  organization different than a simple transpose of A. TRANS is called
C  once for each incoming message and once for the local component of
C  the transpose.  It takes the data received in a message (or found
C  locally):
C                         WS(W,ML,H1,H2,LN)
C  and puts this in the correct place in the array B.
C
C  There are six different versions of TRANS, distinguished by DIR:
C
C  DIR=-1: Used after real forward transpose.  The array B is formed as
C          follows, where MX is the M value with padding used in
C          PSTSWM:
C                          B(W,MX,LN,H1,H2)
C  DIR=+1: Used after real backward transpose.  The array B is formed as
C          follows, where MX is the LN value with padding used in
C          PSTSWM:
C                          B(W,MX,M,H1,H2)
C  DIR=-2: Used after complex forward transpose following real forward
C          transpose in transpose FFT/transpose LT algorithm.
C          The array B is formed as follows, where MX is the LN value
C          with padding used in PSTSWM:
C                          B(W,MX,H1,M,H2)
C          This transpose "undistributes" the latitude dimension.
C  DIR=+2: Used after complex backward transpose preceding real forward
C          transpose in transpose FFT/transpose LT algorithm.
C          The array B is formed as follows, where MX is the M value
C          with padding used in PSTSWM:
C                          B(W,MX,H1,LN,H2)
C          This transpose "redistributes" the latitude dimension,
C          reordering the Fourier coefficients from the "load-balanced"
C          ordering to the normal "unordered" ordering that the FFT
C          expects.
C  DIR=-3: Used after complex forward transpose following distributed
C          FFT in distributed FFT/transpose LT algorithm.
C          The array B is formed as follows, where MX is the H1 value
C          with padding used in PSTSWM:
C                          B(W,MX,LN,M,H2)
C          This transpose "undistributes" the latitude dimension.
C  DIR=+3: Used after complex backward transpose preceding distributed
C          FFT in distributed FFT/transpose LT algorithm.
C          The array B is formed as follows, where MX is the H1 value
C          with padding used in PSTSWM:
C                          B(W,MX,M,LN,H2)
C          This transpose "redistributes" the latitude dimension,
C          reordering the Fourier coefficients from the "truncated"
C          ordering to the normal "unordered" ordering that the FFT
C          expects.
C
C 3) Algorithm Variants:
C
C  The code incorporates numerous variants of three different parallel
C  transpose algorithms. The algorithms and some of the variants are
C  selected by the parameter COMMOPT as follows:
C
C  a) If ((COMMOPT .GE. 0) .AND. (COMMOPT .LE. 4)), then an O(P) step
C  "send/recv" algorithm is used. At each step each processor sends a
C  message to one processor and receives a message from another.
C
C  b) If ((COMMOPT .GE. 10) .AND. (COMMOPT .LE. 14)), then an O(P) step
C  "swap" algorithm is used. At each step each processor exchanges
C  messages with another processor.
C
C  c) If ((COMMOPT .GE. 20) .AND. (COMMOPT .LE. 21)), then an O(log P)
C  step algorithm is used. At each step each processor exchanges
C  messages with another processor. For this algorithm, MAPSIZE must be
C  a power of two and LM and LN must be constant vectors, i.e. all
C  processors must have the same number of rows of the distributed
C  arrays. The decreased number of steps in this algorithm compared to
C  the first two comes at the cost of a larger amount of data moved.
C
C  Other variants for these three algorithms are described in the
C  routines SRTRANS, SWAPTRANS, and LOGTRANS.
C
C  NOTE: this routine dispatches on the decade of COMMOPT only:
C        COMMOPT .LT. 10  (including negative values) -> SRTRANS,
C                         called with COMMOPT unchanged;
C        10 .. 19         -> SWAPTRANS, called with COMMOPT-10;
C        20 .. 29         -> LOGTRANS,  called with COMMOPT-20;
C        COMMOPT .GE. 30  -> fatal error (message on unit 0, STOP).
C        Values inside a decade but outside the ranges documented in
C        a)-c) are NOT rejected here; they are passed on to the
C        selected routine.
C
C called by: RFTLON
C calls: LOGTRANS, SRTRANS, SWAPTRANS, TRACEEVENTF
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms);
C forwarded only to LOGTRANS
      INTEGER BUFFERS
C communication protocol option
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C local component of the array that is to be transposed, of size
C REAL (W,LM(MYINDEX),H1,H2,N).
C Declared assumed-size: the array is only passed through to the
C selected transpose routine and is never indexed here.
      REAL A(*)
C
C     Work Space
C
C message buffers
C (large enough for REAL WS(W,MAX(LM),H1,H2,N,BUFFERS)
C               and REAL WS(W,MAX(LN),H1,H2,M,BUFFERS) )
      REAL WS(*)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,LN,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,LN,H2), (W,MX,LN,M,H2), or (W,MX,M,LN,H2))
      REAL B(*)
C
C---- Executable Statements --------------------------------------------
C
      CALL TRACEEVENTF('entry', 10, 0, 0)
C
      IF (COMMOPT .LT. 10) THEN
C
C       Using an O(P) step "send/recv" transpose algorithm.
C       (BUFFERS is not part of the SRTRANS argument list.)
        CALL SRTRANS(COMMOPT, PROTOPT, MAPSIZE, MAP,
     &               MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN, MX,
     &               A, WS, B)
C
      ELSEIF (COMMOPT .LT. 20) THEN
C
C       Using an O(P) step "swap" transpose algorithm.
C       The decade offset is removed before the call.
        CALL SWAPTRANS(COMMOPT-10, PROTOPT, MAPSIZE, MAP,
     &                 MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                 MX, A, WS, B)
C
      ELSEIF (COMMOPT .LT. 30) THEN
C
C       Using an O(log P) step "swap" transpose algorithm.
C       The decade offset is removed before the call.
        CALL LOGTRANS(COMMOPT-20, BUFFERS, PROTOPT,
     &                MAPSIZE, MAP, MYINDEX, BASE, DIR, W, M, N, H1,
     &                H2, LM, LN, MX, A, WS, B)
C
      ELSE
C
C       Illegal communication option specified. The program stops
C       here, so the 'exit' trace event below is not recorded on this
C       path.
        WRITE(0,100) MAP(MYINDEX), COMMOPT
  100   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE TRANSPOSE ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED',/,
     &          ' PROCID = ',I4,' COMMOPT = ',I4)
        STOP
C
      ENDIF
C
      CALL TRACEEVENTF('exit', 10, 0, 0)
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS(DIR, W, M, H1, H2, ML, NL, MX, START, FROM, TO)
C
C  This subroutine transposes FROM(W,ML,H1,H2,NL) into the indicated
C  part of TO. It is a pure dispatcher: the value of DIR selects one
C  of six worker routines (TRANS1 ... TRANS6), each implementing one
C  data organization of TO:
C  DIR=-1: (TRANS1) Used after real forward transpose. Thus, both FROM
C          and TO are real (W .EQ. 1). The array TO is formed as
C          follows, where MX is the M value with padding used in
C          PSTSWM:
C                          TO(MX,NL,H1,H2)
C          Here START is an offset to the first (MX) index.
C  DIR=+1: (TRANS2) Used after real backward transpose. Thus, both FROM
C          and TO are real (W .EQ. 1). The array TO is formed as
C          follows, where MX is the NL value with padding used in
C          PSTSWM:
C                          TO(MX,M,H1,H2)
C          Here START is an offset to the second (M) index.
C  DIR=-2: (TRANS3) Used after complex forward transpose following real
C          forward transpose in transpose FFT/transpose LT algorithm.
C          Thus, both FROM and TO are complex (W .EQ. 2). The array TO
C          is formed as follows, where MX is the NL value with padding
C          used in PSTSWM:
C                          TO(MX,H1,M,H2)
C          Here START is an offset to the third (M) index.
C  DIR=+2: (TRANS4) Used after complex backward transpose preceding
C          real forward transpose in transpose FFT/transpose LT
C          algorithm. Thus, both FROM and TO are complex (W .EQ. 2).
C          The array TO is formed as follows, where MX is the M value
C          with padding used in PSTSWM:
C                          TO(MX,H1,NL,H2)
C          Here START is an offset to the first (MX) index. Also, the
C          first (MX) index is permuted through ORDINV_S to the
C          ordering expected by the FFT.
C  DIR=-3: (TRANS5) Used after complex forward transpose following
C          distributed FFT in distributed FFT/transpose LT algorithm.
C          Thus, both FROM and TO are complex (W .EQ. 2). The array TO
C          is formed as follows, where MX is the H1 value with padding
C          used in PSTSWM:
C                          TO(MX,NL,M,H2)
C          Here START is an offset to the third (M) index.
C  DIR=+3: (TRANS6) Used after complex backward transpose preceding
C          distributed FFT in distributed FFT/transpose LT algorithm.
C          Thus, both FROM and TO are complex (W .EQ. 2). The array TO
C          is formed as follows, where MX is the H1 value with padding
C          used in PSTSWM:
C                          TO(MX,M,NL,H2)
C          Here START is an offset to the second (M) index. Also, the
C          first (MX) index is permuted through ORDINV_S to the
C          ordering expected by the FFT.
C  Any other value of DIR is a fatal error (message on unit 0, STOP).
C
C  NOTE: FROM and TO are declared REAL in this routine, while TRANS3
C  through TRANS6 receive the same storage as COMPLEX arrays without
C  the leading W dimension. This relies on storage association: for
C  those calls W is expected to be 2, so each (W) pair of reals is one
C  complex element. The W argument is used here only to dimension FROM.
C
C called by: LGTRNS1, LGTRNS2, SRTRNS1, SRTRNS2, SWPTRNS1, SWPTRNS2
C calls: TRANS1, TRANS2, TRANS3, TRANS4, TRANS5, TRANS6
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Common Blocks ----------------------------------------------------
C
C domain decomposition information
C (ORDINV_S, used below, is not declared locally, so it must be
C  provided by one of the included files)
#     include "spectral.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, H1, H2, ML, NL, MX
C starting location in the index of TO selected by DIR (see above)
C where the transpose is to begin
      INTEGER START
C array that is to be transposed
      REAL FROM(W,ML,H1,H2,NL)
C
C     Output
C
C destination of transposed array
C (organized as REAL (MX,NL,H1,H2), REAL (MX,M,H1,H2),
C  COMPLEX (MX,H1,M,H2), COMPLEX (MX,H1,NL,H2), COMPLEX (MX,NL,M,H2)
C  or COMPLEX (MX,M,NL,H2)).
C Declared assumed-size: it is only passed through to the worker
C routine, which imposes the actual shape.
      REAL TO(*)
C
C---- Executable Statements --------------------------------------------
C
      IF (DIR .EQ. -1) THEN
C       real data; the two H dimensions are collapsed into one
        CALL TRANS1(H1*H2, ML, NL, MX, START, FROM, TO)
      ELSE IF (DIR .EQ. +1) THEN
C       real data; the two H dimensions are collapsed into one
        CALL TRANS2(M, H1*H2, ML, NL, MX, START, FROM, TO)
      ELSE IF (DIR .EQ. -2) THEN
        CALL TRANS3(M, H1, H2, ML, NL, MX, START, FROM, TO)
      ELSE IF (DIR .EQ. +2) THEN
        CALL TRANS4(H1, H2, ML, NL, MX, START, ORDINV_S, FROM, TO)
      ELSE IF (DIR .EQ. -3) THEN
        CALL TRANS5(M, H1, H2, ML, NL, MX, START, FROM, TO)
      ELSE IF (DIR .EQ. +3) THEN
        CALL TRANS6(M, H1, H2, ML, NL, MX, START, ORDINV_S, FROM, TO)
      ELSE
C       unknown reorganization option: report and terminate
        WRITE(0,100) DIR
  100   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE TRANS ',/,
     &          ' INVALID TRANSPOSE REORGANIZATION OPTION SPECIFIED',/,
     &          ' DIR = ',I4)
        STOP
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS1(H, ML, NL, MMX, START, FROM, TO)
C
C Copies the block FROM(ML,H,NL) into TO(MMX,NL,H), exchanging the
C roles of the second and third dimensions:
C            TO(START+JM-1,JN,JH) = FROM(JM,JH,JN)
C for JM = 1..ML, JH = 1..H, JN = 1..NL. Only rows START through
C START+ML-1 of the first dimension of TO are written; every other
C element of TO is left unchanged. MMX is the leading dimension of TO
C (the M value with padding used in PSTSWM).
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and leading dimension of the target
      INTEGER H, ML, NL, MMX
C first row (first index of TO) that receives data
      INTEGER START
C source block
      REAL FROM(ML,H,NL)
C
C     Output
C
C target array, partially overwritten
      REAL TO(MMX,NL,H)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H and ML extents
      INTEGER JN, JH, JM
C zero-based row offset into TO
      INTEGER MBASE
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH = 1, H
          DO JM = 1, ML
            TO(MBASE+JM,JN,JH) = FROM(JM,JH,JN)
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS2(M, H, ML, NL, NLMX, START, FROM, TO)
C
C Copies the block FROM(ML,H,NL) into TO(NLMX,M,H) so that the last
C index of FROM becomes the first index of TO:
C            TO(JN,START+JM-1,JH) = FROM(JM,JH,JN)
C for JM = 1..ML, JH = 1..H, JN = 1..NL. Only positions START through
C START+ML-1 of the second (M) index and 1 through NL of the first
C index are written; the rest of TO is left unchanged. NLMX is the
C leading dimension of TO (the NL value with padding used in PSTSWM).
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and of the target array
      INTEGER M, H, ML, NL, NLMX
C first position in the second (M) index of TO that receives data
      INTEGER START
C source block
      REAL FROM(ML,H,NL)
C
C     Output
C
C target array, partially overwritten
      REAL TO(NLMX,M,H)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H and ML extents
      INTEGER JN, JH, JM
C zero-based offset into the M index of TO
      INTEGER MBASE
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH = 1, H
          DO JM = 1, ML
            TO(JN,MBASE+JM,JH) = FROM(JM,JH,JN)
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS3(M, H1, H2, ML, NL, NLMX, START, FROM, TO)
C
C Copies the complex block FROM(ML,H1,H2,NL) into TO(NLMX,H1,M,H2):
C            TO(JN,JH1,START+JM-1,JH2) = FROM(JM,JH1,JH2,JN)
C for JM = 1..ML, JH1 = 1..H1, JH2 = 1..H2, JN = 1..NL. Only
C positions START through START+ML-1 of the third (M) index are
C written; the rest of TO is left unchanged. NLMX is the leading
C dimension of TO (the NL value with padding used in PSTSWM).
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and of the target array
      INTEGER M, H1, H2, ML, NL, NLMX
C first position in the third (M) index of TO that receives data
      INTEGER START
C source block
      COMPLEX FROM(ML,H1,H2,NL)
C
C     Output
C
C target array, partially overwritten
      COMPLEX TO(NLMX,H1,M,H2)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H2, H1 and ML extents
      INTEGER JN, JH2, JH1, JM
C zero-based offset into the M index of TO
      INTEGER MBASE
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH2 = 1, H2
          DO JH1 = 1, H1
            DO JM = 1, ML
              TO(JN,JH1,MBASE+JM,JH2) = FROM(JM,JH1,JH2,JN)
            END DO
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS4(H1, H2, ML, NL, MMX, START, ORDER, FROM, TO)
C
C Copies the complex block FROM(ML,H1,H2,NL) into TO(MMX,H1,NL,H2),
C redirecting the first index through the table ORDER:
C       TO(ORDER(START+JM-1),JH1,JN,JH2) = FROM(JM,JH1,JH2,JN)
C for JM = 1..ML, JH1 = 1..H1, JH2 = 1..H2, JN = 1..NL. START selects
C the first entry of ORDER that is used; the rows of TO that receive
C data are exactly ORDER(START) ... ORDER(START+ML-1). The rest of TO
C is left unchanged. MMX is the leading dimension of TO (the M value
C with padding used in PSTSWM) and also the declared length of ORDER.
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and leading dimension of the target
      INTEGER H1, H2, ML, NL, MMX
C position in ORDER of the entry that maps source row 1
      INTEGER START
C table giving the destination row for each position
      INTEGER ORDER(MMX)
C source block
      COMPLEX FROM(ML,H1,H2,NL)
C
C     Output
C
C target array, partially overwritten
      COMPLEX TO(MMX,H1,NL,H2)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H2, H1 and ML extents
      INTEGER JN, JH2, JH1, JM
C zero-based offset into ORDER
      INTEGER MBASE
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH2 = 1, H2
          DO JH1 = 1, H1
            DO JM = 1, ML
              TO(ORDER(MBASE+JM),JH1,JN,JH2) = FROM(JM,JH1,JH2,JN)
            END DO
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS5(M, H1, H2, ML, NL, H1MX, START, FROM, TO)
C
C Copies the complex block FROM(ML,H1,H2,NL) into TO(H1MX,NL,M,H2):
C            TO(JH1,JN,START+JM-1,JH2) = FROM(JM,JH1,JH2,JN)
C for JM = 1..ML, JH1 = 1..H1, JH2 = 1..H2, JN = 1..NL. Only
C positions START through START+ML-1 of the third (M) index and
C 1 through H1 of the first index are written; the rest of TO is left
C unchanged. H1MX is the leading dimension of TO (the H1 value with
C padding used in PSTSWM).
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and of the target array
      INTEGER M, H1, H2, ML, NL, H1MX
C first position in the third (M) index of TO that receives data
      INTEGER START
C source block
      COMPLEX FROM(ML,H1,H2,NL)
C
C     Output
C
C target array, partially overwritten
      COMPLEX TO(H1MX,NL,M,H2)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H2, H1 and ML extents
      INTEGER JN, JH2, JH1, JM
C zero-based offset into the M index of TO
      INTEGER MBASE
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH2 = 1, H2
          DO JH1 = 1, H1
            DO JM = 1, ML
              TO(JH1,JN,MBASE+JM,JH2) = FROM(JM,JH1,JH2,JN)
            END DO
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE TRANS6(M, H1, H2, ML, NL, H1MX, START, ORDER, FROM,
     &                  TO)
C
C Copies the complex block FROM(ML,H1,H2,NL) into TO(H1MX,M,NL,H2),
C redirecting the first index through the table ORDER:
C       TO(ORDER(JH1),START+JM-1,JN,JH2) = FROM(JM,JH1,JH2,JN)
C for JM = 1..ML, JH1 = 1..H1, JH2 = 1..H2, JN = 1..NL. START is an
C offset to the second (M) index of TO: positions START through
C START+ML-1 receive data. The first index of TO is permuted
C according to ORDER(1) ... ORDER(H1). The rest of TO is left
C unchanged. H1MX is the leading dimension of TO (the H1 value with
C padding used in PSTSWM) and also the declared length of ORDER.
C
C called by: TRANS
C calls: (none)
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C extents of the source block and of the target array
      INTEGER M, H1, H2, ML, NL, H1MX
C first position in the second (M) index of TO that receives data
      INTEGER START
C table giving the destination first index for each H1 position
      INTEGER ORDER(H1MX)
C source block
      COMPLEX FROM(ML,H1,H2,NL)
C
C     Output
C
C target array, partially overwritten
      COMPLEX TO(H1MX,M,NL,H2)
C
C---- Local Variables --------------------------------------------------
C
C loop counters over the NL, H2, H1 and ML extents
      INTEGER JN, JH2, JH1, JM
C zero-based offset into the M index of TO
      INTEGER MBASE
C permuted first index of TO for the current JH1
      INTEGER JDEST
C
C---- Executable Statements --------------------------------------------
C
      MBASE = START - 1
C
C     innermost loop runs over the contiguous first index of FROM
      DO JN = 1, NL
        DO JH2 = 1, H2
          DO JH1 = 1, H1
C           destination row does not depend on JM
            JDEST = ORDER(JH1)
            DO JM = 1, ML
              TO(JDEST,MBASE+JM,JN,JH2) = FROM(JM,JH1,JH2,JN)
            END DO
          END DO
        END DO
      END DO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

