C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      SUBROUTINE SWAPTRANS(COMMOPT, PROTOPT, MAPSIZE, MAP,
     &                     MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                     MX, A, WS, B) 
C
C This subroutine calls routines that compute B = transpose(A) using
C an O(P) "swap" transpose algorithm, where each step consists of
C swapping information between processors. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,LN,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,LN,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,LN,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,LN,H2)   (Used after complex backward transpose
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWAPTRANS include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/trans
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/trans
C  IF (COMMOPT .EQ. 2) simple swap with recv-ahead
C  IF (COMMOPT .EQ. 3) ordered swap with recv-ahead
C  IF (COMMOPT .EQ. 4) send-ahead swap with recv-ahead
C Communication protocol options (PROTOPT) for SWAPTRANS include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: TRANSPOSE
C calls: SWPTRNS1, SWPTRNS2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C local component of the array that is to be transposed, of size
C REAL (W,LM(MYINDEX),H1,H2,N)
      REAL A(1)
C
C     Work Space
C
C message buffers, of size REAL (W,LN(MYINDEX),H1,H2,M)
      REAL WS(1)
C
C     Output
C
C local component of the transposed array.
C (organized as REAL (W,MX,LN,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,LN,H2), (W,MX,LN,M,H2), or (W,MX,M,LN,H2)) 
      REAL B(1)
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Compute transpose.
C
        IF (COMMOPT .LE. 1) THEN
C         no recv-ahead algorithms
          CALL SWPTRNS1(COMMOPT, PROTOPT, MAPSIZE, MAP,
     &                  MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                  LM(MYINDEX), LN(MYINDEX), MX, A, WS, B) 
C
        ELSEIF (COMMOPT .LE. 4) THEN
C         recv-ahead algorithms
          CALL SWPTRNS2(COMMOPT-2, PROTOPT, MAPSIZE, MAP,
     &                  MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                  LM(MYINDEX), LN(MYINDEX), MX, A, WS, B) 
C
        ELSE
C         illegal communication option specified
          WRITE(0,100) MAP(MYINDEX), COMMOPT
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE SWAPTRANS ',/,
     &            ' ILLEGAL COMMUNICATION OPTION SPECIFIED',/,
     &            ' PROCID = ',I4,' COMMOPT = ',I4)
          STOP
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS1(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                    MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                    ML, NL, MX, A, WS, B) 
C
C This routine computes B = transpose(A) using an O(P) "swap" transpose
C algorithm without recv-ahead, where each step consists of swapping
C information between processors. Here 
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,LN,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,LN,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,LN,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,LN,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWPTRNS1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/trans
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/trans
C Communication protocol options (PROTOPT) for SWPTRNS1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: SWAPTRANS
C calls: SWPTRNS_INIT, SWAP, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C -1 for forward transpose; +1 for backward.
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, ML, NL, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C Local component of the array that is to be transposed.
C (organized as REAL (W,LM(MYINDEX),H1,H2,N))
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C message buffers
C (organized as REAL (W,LN(MYINDEX),H1,H2,M))
      REAL WS(W*NL*H1*H2,M)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,NL,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,NL,H2), (W,MX,NL,M,H2), or (W,MX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C number of bytes in a column segment being sent and in a row segment
C being received. 
      INTEGER COLSIZE, ROWSIZE
C size of message being sent and received during a swap
      INTEGER SENDSIZE, RECVSIZE
C arrays indicating beginning index and size of message being sent 
C during a given swap
      INTEGER SENDDEX(0:NPROCSX-1), SENDCOLS(0:NPROCSX-1)
C arrays indicating beginning index and size of message being 
C received during a given swap
      INTEGER RECVDEX(0:NPROCSX-1), RECVROWS(0:NPROCSX-1)
C arrays indicating swap partner and whether this processor sends or 
C receives first during a swap at a given step (for synchronous 
C communication)
      INTEGER SWAPNODE(NPROCSX-1), ORDER(NPROCSX-1)
C loop index
      INTEGER STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of a single column segment (for sending) and a
C     single row segment (for receiving), in bytes.
      COLSIZE = RBYTES*W*LM(MYINDEX)*H1*H2
      ROWSIZE = RBYTES*W*LN(MYINDEX)*H1*H2
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm.
      CALL SWPTRNS_INIT(MAPSIZE, MAP, LM, LN, MYINDEX, SENDDEX,
     &                  SENDCOLS, RECVDEX, RECVROWS, SWAPNODE, ORDER)
C
C     Construct transpose using O(P) swap algorithm.
      DO STEP=1,MAPSIZE-1
C
C       Swap components.
        SENDSIZE = SENDCOLS(STEP)*COLSIZE
        RECVSIZE = RECVROWS(STEP)*ROWSIZE
        CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP), ME, BASE, 
     &            SWAPNODE(STEP), SENDSIZE, A(1,SENDDEX(STEP)),
     &            RECVSIZE, WS)
C
C       Transpose received component into B.
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(STEP), LN(MYINDEX), MX,
     &             RECVDEX(STEP), WS, B) 
C
      ENDDO
C
C     Finally, transpose last component from A to B.
      CALL TRANS(DIR, W, M, H1, H2, RECVROWS(0), LN(MYINDEX), MX, 
     &           RECVDEX(0), A(1,SENDDEX(0)), B) 
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS2(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                    MYINDEX, BASE, DIR, W, M, N, H1, H2, LM, LN,
     &                    ML, NL, MX, A, WS, B) 
C
C This routine computes B = transpose(A) using an O(P) "swap" transpose
C algorithm with recv-ahead, where each step consists of swapping
C information between processors. Here 
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,LM(MYINDEX),H1,H2,N): Processor I has LM(I) rows of A; 
C                            LM(0) + ... + LM(MAPSIZE-1) = M.
C  B(W,LN(MYINDEX),H1,H2,M): Processor I has LN(I) rows of B; 
C                            LN(0) + ... + LN(MAPSIZE-1) = N.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MX,LN,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MX,H1,LN,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MX,LN,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MX,M,LN,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWPTRNS2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/trans
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/trans
C  IF (COMMOPT .EQ. 2) send-ahead swap: 
C    all sends/all recvs/trans
C Communication protocol options (PROTOPT) for SWPTRNS2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: SWAPTRANS
C calls: SWPTRNS_INIT, SWAP1, SWAP2, SWAP3, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays on processors in MAP array
      INTEGER M, N, H1, H2, ML, NL, MX
      INTEGER LM(0:MAPSIZE-1)
      INTEGER LN(0:MAPSIZE-1)
C Local component of the array that is to be transposed.
C (organized as REAL (W,LM(MYINDEX),H1,H2,N))
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C message buffers
C (organized as REAL (W,LN(MYINDEX),H1,H2,M))
      REAL WS(W*NL*H1*H2,M)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MX,NL,H1,H2), (W,MX,M,H1,H2), (W,MX,H1,M,H2),
C  (W,MX,H1,NL,H2), (W,MX,NL,M,H2), or (W,MX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C number of bytes in a column segment being sent and in a row segment
C being received. 
      INTEGER COLSIZE, ROWSIZE
C size of message being sent and received during a swap
      INTEGER SENDSIZE, RECVSIZE
C arrays indicating beginning index and size of message being sent 
C during a given swap
      INTEGER SENDDEX(0:NPROCSX-1), SENDCOLS(0:NPROCSX-1)
C arrays indicating beginning index and size of message being 
C received during a given swap
      INTEGER RECVDEX(0:NPROCSX-1), RECVROWS(0:NPROCSX-1)
C arrays indicating swap partner and whether this processor sends or 
C receives first during a swap at a given step (for synchronous 
C communication)
      INTEGER SWAPNODE(NPROCSX-1), ORDER(NPROCSX-1)
C loop index
      INTEGER STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of a single column segment (for sending) and a
C     single row segment (for receiving), in bytes.
      COLSIZE = RBYTES*W*LM(MYINDEX)*H1*H2
      ROWSIZE = RBYTES*W*LN(MYINDEX)*H1*H2
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm.
      CALL SWPTRNS_INIT(MAPSIZE, MAP, LM, LN, MYINDEX, SENDDEX,
     &                  SENDCOLS, RECVDEX, RECVROWS, SWAPNODE, ORDER)
C
C     Post receive requests.
      DO STEP=1,MAPSIZE-1
        RECVSIZE = RECVROWS(STEP)*ROWSIZE
        CALL SWAP1(COMMOPT, PROTOPT, .TRUE., ORDER(STEP), ME, 
     &             BASE, SWAPNODE(STEP), RECVSIZE, WS(1,RECVDEX(STEP)))
      ENDDO
C
C     Construct transpose using O(P) swap algorithm.
      DO STEP=1,MAPSIZE-1
C
C       Initiate swap.
        SENDSIZE = SENDCOLS(STEP)*COLSIZE
        RECVSIZE = RECVROWS(STEP)*ROWSIZE
        CALL SWAP2(COMMOPT, PROTOPT, .TRUE., ORDER(STEP), ME, 
     &             BASE, SWAPNODE(STEP), SENDSIZE, A(1,SENDDEX(STEP)),
     &             RECVSIZE, WS(1,RECVDEX(STEP)))  
C
      ENDDO
C
C     Wait until outstanding send and receive operations are complete.
      DO STEP=1,MAPSIZE-1
        RECVSIZE = RECVROWS(STEP)*ROWSIZE
        CALL SWAP3(COMMOPT, PROTOPT, ME, BASE,
     &             SWAPNODE(STEP), RECVSIZE, WS(1,RECVDEX(STEP)))  

      ENDDO
C
      DO STEP=1,MAPSIZE-1
C       Transpose received component into B.
        CALL TRANS(DIR, W, M, H1, H2, RECVROWS(STEP), LN(MYINDEX), MX,
     &             RECVDEX(STEP), WS(1,RECVDEX(STEP)), B) 
      ENDDO
C
C     Transpose last component from A to B.
      CALL TRANS(DIR, W, M, H1, H2, RECVROWS(0), LN(MYINDEX), MX, 
     &           RECVDEX(0), A(1,SENDDEX(0)), B)  
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS_INIT(MAPSIZE, MAP, LM, LN, MYINDEX, SENDDEX,
     &                        SENDCOLS, RECVDEX, RECVROWS, SWAPNODE, 
     &                        ORDER)
C
C This routine calculates swap partners and other information needed
C by the O(P) "swap" transpose algorithm. 
C
C called by: SWPTRNS1, SWPTRNS2
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C number of local "last" indices for input and output arrays in 
C transpose for each processor
      INTEGER LM(0:MAPSIZE-1), LN(0:MAPSIZE-1)
C index of "me" in map and index arrays
      INTEGER MYINDEX
C
C     Output
C
C arrays indicating beginning index and size of message being sent 
C during a given swap
      INTEGER SENDDEX(0:MAPSIZE-1), SENDCOLS(0:MAPSIZE-1)
C arrays indicating beginning index and size of message being 
C received during a given swap
      INTEGER RECVDEX(0:MAPSIZE-1), RECVROWS(0:MAPSIZE-1)
C arrays indicating swap partner and whether this processor sends or 
C receives first during a swap at a given step (for synchronous 
C communication)
      INTEGER SWAPNODE(MAPSIZE-1), ORDER(MAPSIZE-1)
C
C---- Local Variables --------------------------------------------------
C
C loop index and bound
      INTEGER I, MXPOW2
C swap step and partner indices
      INTEGER INEXT, ISWAP
C smallest power of two not greater than I
      INTEGER IPOW2
C
C---- External Functions ----------------------------------------------
C
C Exclusive OR
      INTEGER XOR
C
C---- Executable Statements -------------------------------------------
C
C     Calculate smallest power of two not smaller than MAPSIZE.
      MXPOW2 = 1
      DO WHILE (MXPOW2 .LT. MAPSIZE)
        MXPOW2 = 2*MXPOW2
      ENDDO
C
C     Calculate indices for incoming and outgoing messages
C     (using RECVROWS and SENDCOLS as temporary storage).
      RECVROWS(0) = 1
      SENDCOLS(0) = 1
      DO I=1,MAPSIZE-1
        RECVROWS(I) = RECVROWS(I-1) + LM(I-1)
        SENDCOLS(I) = SENDCOLS(I-1) + LN(I-1)
      ENDDO
C
C     Determine indices for local data.
      SENDDEX(0) = SENDCOLS(MYINDEX)
      RECVDEX(0) = RECVROWS(MYINDEX)
C
      INEXT = 0
      IPOW2  = 1
      DO I=1,MXPOW2-1
C
C       Identify potential swap partner index.
        ISWAP = XOR(MYINDEX,I)
C
C       If a legal swap partner, record it.
        IF (ISWAP .LT. MAPSIZE) THEN
          INEXT = INEXT+1
C
C         Compute source and destination indices for messages.
          SENDDEX(INEXT) = SENDCOLS(ISWAP)
          RECVDEX(INEXT) = RECVROWS(ISWAP)
C
C         Save swap partner index.
          SWAPNODE(INEXT) = ISWAP
C
C         Calculate swap order, using an order that minimizes collisions
C         on a bidirectional grid.
          IF (I .GE. IPOW2) IPOW2 = 2*IPOW2
          IF (MOD(MYINDEX, IPOW2) .LT. IPOW2/2) THEN
            IF (MOD(MYINDEX, 2) .EQ. 0) THEN
              ORDER(INEXT) = 1
            ELSE
              ORDER(INEXT) = -1
            ENDIF
          ELSE
            IF (MOD(ISWAP, 2) .EQ. 0) THEN
              ORDER(INEXT) = -1
            ELSE
              ORDER(INEXT) = 1
            ENDIF
          ENDIF
C
        ENDIF
C
      ENDDO
C
C     Determine sizes for local data.
      SENDCOLS(0) = LN(MYINDEX)
      RECVROWS(0) = LM(MYINDEX)
C
      DO I=1,INEXT
C
C       Determine source and destination sizes for messages.
        SENDCOLS(I) = LN(SWAPNODE(I))
        RECVROWS(I) = LM(SWAPNODE(I))
C
C       Save swap partner id.
        SWAPNODE(I) = MAP(SWAPNODE(I))
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

