C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  (Stripped down PVM-only version (4/13/95), for use in ParkBench     #
C   benchmark suite)                                                   #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      SUBROUTINE HALFSUM(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                   MAP, MYINDEX, BASE1, BASE2, EXCHSIZE, LTH, WS,
     &                   SUM) 
C
C HALFSUM calls routines that calculate a vector sum over a 
C specified subset of processors using a recursive halving algorithm, 
C generalized to handle nonpowers of two number of processors. As
C messages become small in the recursive halving process, the algorithm
C switches to an exchange summation (idea courtesty of R. van de Geign).
C The MAP array defines the subset and the processor ordering to use. 
C The results are duplicated across all processors in the subset.
C
C Communication options (COMMOPT) for HALFSUM include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for HALFSUM include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5) .AND. (BUFFERS .GT. 1)
C    recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: FLTSUM
C calls: HALF1, HALF2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C message size (in reals) below which an exchange sum algorithm is
C to be used instead of a recursive halving sum algorithm
      INTEGER EXCHSIZE
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH*BUFFERS)
C
C     Input/Output
C
C On entry, contains (local) data. On exit contains vector sum.
      REAL SUM(LTH)
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Calculate vector sum.
C
        IF (BUFFERS .EQ. 1) THEN
C         single buffer algorithms
          CALL HALF1(COMMOPT, PROTOPT, MAPSIZE,
     &               MAP, MYINDEX, BASE1, BASE2, EXCHSIZE, LTH, WS, SUM) 
C
        ELSEIF (BUFFERS .GE. 2) THEN
C         multiple buffer algorithms
          CALL HALF2(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &               MAP, MYINDEX, BASE1, BASE2, EXCHSIZE, LTH, WS, SUM)
C
        ELSE
C         illegal number of buffers specified
          WRITE(0,100) MAP(MYINDEX), BUFFERS
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE HALFSUM ',/,
     &            ' ILLEGAL COMMUNICATION BUFFER SIZE SPECIFIED',/,
     &            ' PROCID = ',I4,' BUFFERS = ',I4)
          STOP
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE HALF1(COMMOPT, PROTOPT, MAPSIZE,
     &                 MAP, MYINDEX, BASE1, BASE2, EXCHSIZE, LTH, WS,
     &                 SUM) 
C
C HALF1 calculates a vector sum over a specified subset of processors
C when one communication buffer is provided, using a recursive halving
C algorithm generalized to handle nonpowers of two number of processors.
C As messages become small in the recursive halving process, the
C algorithm switches to an exchange summation (idea courtesty of R. van
C de Geign). The MAP array defines the subset and the processor ordering
C to use. The results are duplicated across all processors in the
C subset. 
C
C Communication options (COMMOPT) for HALF1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for HALF1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: HALFSUM
C calls: HALF_INIT, SWAP, SWAP_RECV, SWAP_SEND
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C message size (in reals) below which an exchange sum algorithm is
C to be used instead of a recursive halving sum algorithm
      INTEGER EXCHSIZE
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH)
C
C     Input/Output
C
C On entry, SUM contains (local) data. On exit, it contains the vector 
C sum.
      REAL SUM(LTH)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in the recursive halving sum and number of swaps
C in both the recursive halving and exchange sum portions of the
C algorithm. MAXSTEP-HALFSTEP represents the number of swaps in the
C recursive doubling broadcast.
      INTEGER HALFSTEP, MAXSTEP
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C arrays indicating the size of the message being sent/received 
C during a given swap, and the corresponding index offsets
      INTEGER SENDSIZE(LGPROCSX)
      INTEGER RECVSIZE(LGPROCSX)
      INTEGER SENDINDEX(LGPROCSX)
      INTEGER RECVINDEX(LGPROCSX)
C temporaries for current message sizes and offsets
      INTEGER SNDLTH, RCVLTH, SNDDEX, RCVDEX
C loop indices
      INTEGER I, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who i am.
      ME = MAP(MYINDEX)
C
C     Precalculate swap partners and other information needed by vector
C     sum algorithm.
      CALL HALF_INIT(MAPSIZE, MAP, MYINDEX, LTH, LGPROCSX, ALIGN, 
     &               EXCHSIZE, UPPER, TWIN, MAXSTEP, HALFSTEP, ORDER, 
     &               SENDSIZE, RECVSIZE, SENDINDEX, RECVINDEX, SWAPNODE)
C
      IF (UPPER) THEN
C       In upper half cube, so send data to TWIN and wait for results.
C       - ordered send/recv, with upper half cube processor going 
C         first
C       - safe to use same buffer for send and recv because using
C         ordered swap and going first
C
        CALL SWAP(1, PROTOPT, 1, ME, BASE1, TWIN, RBYTES*LTH, 
     &            SUM, RBYTES*LTH, SUM)
C
      ELSE
C       In lower half cube, so use usual power of two algorithm.
C
        IF (TWIN .NE. -1) THEN
C
C         Get data from TWIN in upper half cube using recv part of 
C         ordered swap.
          CALL SWAP_RECV(1, PROTOPT, -1, ME, BASE1, TWIN, 
     &                   RBYTES*LTH, WS)
C
C         Sum vectors.
          DO I = 1,LTH
            SUM(I) = SUM(I) + WS(I)
          ENDDO
C
        ENDIF
C 
C       Calculate vector sum in lower half cube using recursive halving
C       and/or exchange algorithm (leaving result partially
C       distributed). 
        DO STEP=1,MAXSTEP
C
          SNDLTH = SENDSIZE(STEP)
          RCVLTH = RECVSIZE(STEP)
          SNDDEX = SENDINDEX(STEP)
          RCVDEX = RECVINDEX(STEP)
C
C         Swap vectors.
          CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP),
     &              ME, BASE1, SWAPNODE(STEP), RBYTES*SNDLTH,
     &              SUM(SNDDEX), RBYTES*RCVLTH, WS(RCVDEX))
C
C         Sum vectors.
          DO I = RCVDEX, RCVDEX+(RCVLTH-1)
            SUM(I) = SUM(I) + WS(I)
          ENDDO
C
        ENDDO
C
C       Broadcast distributed vector by reversing recursive halving part
C       of the summation algorithm (recursize doubling).
        DO STEP=HALFSTEP,1,-1
C
          RCVLTH = SENDSIZE(STEP)
          SNDLTH = RECVSIZE(STEP)
          RCVDEX = SENDINDEX(STEP)
          SNDDEX = RECVINDEX(STEP)
C
C         Swap vectors.
          CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP),
     &              ME, BASE2, SWAPNODE(STEP), RBYTES*SNDLTH,
     &              SUM(SNDDEX), RBYTES*RCVLTH, SUM(RCVDEX))
C
        ENDDO
C
        IF (TWIN .NE. -1) THEN
C
C         Send results to TWIN in upper half cube using send part of 
C         ordered swap.
          CALL SWAP_SEND(1, PROTOPT, -1, ME, BASE1, TWIN,
     &                   RBYTES*LTH, SUM, RBYTES*LTH, SUM)
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE HALF2(COMMOPT, BUFFERS, PROTOPT, MAPSIZE,
     &                 MAP, MYINDEX, BASE1, BASE2, EXCHSIZE, LTH, WS,
     &                 SUM) 
C
C HALF2 calculates a vector sum over a specified subset of processors
C when more than one communication buffer is provided, using a recursive
C halving algorithm generalized to handle nonpowers of two number of
C processors. As messages become small in the recursive halving process,
C the algorithm switches to an exchange summation (idea courtesty of R.
C van de Geign). The MAP array defines the subset and the processor
C ordering to use. The results are duplicated across all processors in
C the subset. 
C
C Communication options (COMMOPT) for HALF2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for HALF2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5) nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5) forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: HALFSUM
C calls: HALF_INIT, SWAP, SWAP1, SWAP2, SWAP3, SWAP_RECV, SWAP_RECVBEGIN, 
C        SWAP_RECVEND, SWAP_SEND
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C message size (in reals) below which an exchange sum algorithm is
C to be used instead of a recursive halving sum algorithm
      INTEGER EXCHSIZE
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH*BUFFERS)
C
C     Input/Output
C
C on entry, contains (local) data, on exit contains vector sum
      REAL SUM(LTH)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in the recursive halving sum and number of swaps
C in both the recursive halving and exchange sum portions of the
C algorithm. MAXSTEP-HALFSTEP represents the number of swaps in the
C recursive doubling broadcast.
      INTEGER HALFSTEP, MAXSTEP
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C arrays indicating the size of the message being sent/received 
C during a given swap, and the corresponding index offsets
      INTEGER SENDSIZE(LGPROCSX)
      INTEGER RECVSIZE(LGPROCSX)
      INTEGER SENDINDEX(LGPROCSX)
      INTEGER RECVINDEX(LGPROCSX)
      INTEGER WSINDEX(LGPROCSX)
C temporaries for current message sizes and offsets
      INTEGER SNDLTH, RCVLTH, SNDDEX, RCVDEX, WSDEX
C loop indices
      INTEGER I, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who i am.
      ME = MAP(MYINDEX)
C
C     Precalculate swap partners and other information needed by vector
C     sum algorithm.
      CALL HALF_INIT(MAPSIZE, MAP, MYINDEX, LTH, LGPROCSX, ALIGN, 
     &               EXCHSIZE, UPPER, TWIN, MAXSTEP, HALFSTEP, ORDER, 
     &               SENDSIZE, RECVSIZE, SENDINDEX, RECVINDEX, SWAPNODE)
C
      IF (UPPER) THEN
C       In upper half cube, so send data to TWIN and wait for results.
C       - ordered send/recv, with upper half cube processor going 
C         first
C       - safe to use same buffer for send and recv because using
C         ordered swap and going first
C
        CALL SWAP(1, PROTOPT, 1, ME, BASE1, TWIN, RBYTES*LTH,
     &            SUM, RBYTES*LTH, SUM)
C
      ELSE
C       In lower half cube, so use usual power of two algorithm.
C
        IF (TWIN .NE. -1) THEN
C
C         Get/prepare for data from TWIN.
          IF (BUFFERS .GT. 2) THEN
C
C           Space available for recv-ahead for both TWIN and lower 
C           subcube interprocessor communication, so use recvbegin part
C           of ordered swap.
            CALL SWAP_RECVBEGIN(1, PROTOPT, -1, ME, BASE1,
     &                          TWIN, RBYTES*LTH, WS(2*LTH+1)) 
C
          ELSE
C
C           Space not available for recv-ahead for both TWIN and lower 
C           subcube interprocessor communication, so use recv part of 
C           ordered swap.
            CALL SWAP_RECV(1, PROTOPT, -1, ME, BASE1, TWIN,
     &                     RBYTES*LTH, WS)
C
C           Sum vectors.
            DO I = 1,LTH
              SUM(I) = SUM(I) + WS(I)
            ENDDO
C
          ENDIF
C
        ENDIF
C
C       Calculate indices into work space to be used for recv-ahead 
C       algorithm, and post receive requests for recursive halving
C       vector sum algorithm.  
        WSINDEX(1) = 1
        DO STEP=2,HALFSTEP
          WSINDEX(STEP) = WSINDEX(STEP-1) + RECVSIZE(STEP-1)
        ENDDO
        DO STEP=1,HALFSTEP
          CALL SWAP1(COMMOPT, PROTOPT, .TRUE., ORDER(STEP),
     &               ME, BASE1, SWAPNODE(STEP), RBYTES*RECVSIZE(STEP),
     &               WS(WSINDEX(STEP))) 
        ENDDO
C
C       Post receive requests for recursive doubling broadcast algorithm
C       (since know that requests will be posted before messages sent,
C       can disable safe forcetype protocol)
        DO STEP=HALFSTEP,1,-1
          CALL SWAP1(COMMOPT, PROTOPT, .FALSE., ORDER(STEP),
     &               ME, BASE2, SWAPNODE(STEP), RBYTES*SENDSIZE(STEP),
     &               SUM(SENDINDEX(STEP))) 
        ENDDO
C
        IF ((TWIN .NE. -1) .AND. (BUFFERS .GT. 2)) THEN
C
C         Get data from TWIN in upper half cube using recvend part of 
C         ordered swap.
          CALL SWAP_RECVEND(1, PROTOPT, -1, ME, BASE1, TWIN,
     &                      RBYTES*LTH, WS(2*LTH+1))
C
C         Sum vectors.
          DO I = 1,LTH
            SUM(I) = SUM(I) + WS(2*LTH+I)
          ENDDO
C
        ENDIF
C 
C       Calculate vector sum in lower half cube using recursive halving
C       algorithm (leaving result distributed).
        DO STEP=1,HALFSTEP
C
          SNDLTH = SENDSIZE(STEP)
          RCVLTH = RECVSIZE(STEP)
          SNDDEX = SENDINDEX(STEP)
          RCVDEX = RECVINDEX(STEP)
          WSDEX = WSINDEX(STEP)
C
C         Swap vectors.
          CALL SWAP2(COMMOPT, PROTOPT, .TRUE., ORDER(STEP), 
     &               ME, BASE1, SWAPNODE(STEP), RBYTES*SNDLTH, 
     &               SUM(SNDDEX), RBYTES*RCVLTH, WS(WSDEX))
C
C         Sum vectors.
          DO I = 0, RCVLTH-1
            SUM(RCVDEX+I) = SUM(RCVDEX+I) + WS(WSDEX+I)
          ENDDO
C
        ENDDO
C
C       When message sizes get small, finish calculating vector sum
C       using exchange algorithm (using previous RCVLTH, etc.).
C       Not using recv-ahead logic for exchange, since recv-ahead
C       is unlikely to be efficient for small messages.
        DO STEP=HALFSTEP+1,MAXSTEP
C
          SNDLTH = SENDSIZE(STEP)
          RCVLTH = RECVSIZE(STEP)
          SNDDEX = SENDINDEX(STEP)
          RCVDEX = RECVINDEX(STEP)
C
C         Swap vectors.
          CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP),
     &              ME, BASE1, SWAPNODE(STEP), RBYTES*SNDLTH,
     &              SUM(SNDDEX), RBYTES*RCVLTH, WS)
C
C         Sum vectors.
          DO I = 0, RCVLTH-1
            SUM(RCVDEX+I) = SUM(RCVDEX+I) + WS(I+1)
          ENDDO
C
        ENDDO
C
C       Broadcast distributed vector by reversing recursive halving 
C       algorithm (recursize doubling).
        DO STEP=HALFSTEP,1,-1
C
          RCVLTH = SENDSIZE(STEP)
          SNDLTH = RECVSIZE(STEP)
          RCVDEX = SENDINDEX(STEP)
          SNDDEX = RECVINDEX(STEP)
C
C         Swap vectors (with safe forcetype protocol turned off, to 
C         match swapbegin).
          CALL SWAP2(COMMOPT, PROTOPT, .FALSE., ORDER(STEP),
     &               ME, BASE2, SWAPNODE(STEP), RBYTES*SNDLTH,
     &               SUM(SNDDEX), RBYTES*RCVLTH, SUM(RCVDEX)) 
C
        ENDDO
C
        IF (TWIN .NE. -1) THEN
C
C         Send results to TWIN in upper half cube using send part of 
C         ordered swap.
          CALL SWAP_SEND(1, PROTOPT, -1, ME, BASE1, TWIN,
     &                   RBYTES*LTH, SUM, RBYTES*LTH, SUM)
C
        ENDIF
C
C       Clean up outstanding sends. Note that send buffers are never in
C       danger of being overwritten. During the summation stage, 
C       send buffer space is never reused. During the broadcast stage, 
C       the processor that might overwrite a send buffer from the 
C       summation stage was the destination of the message sent, so know 
C       that send has been completed.
        DO I = 1,HALFSTEP
          CALL SWAP3(COMMOPT, PROTOPT, ME, BASE1, 
     &               SWAPNODE(I), 0, WS)
        ENDDO
        DO I = HALFSTEP,1,-1
          CALL SWAP3(COMMOPT, PROTOPT, ME, BASE2, 
     &               SWAPNODE(I), 0, WS)
        ENDDO
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE HALF_INIT(MAPSIZE, MAP, MYINDEX, LTH, LGPROCSX, ALIGN,
     &                     EXCHSIZE, UPPER, TWIN, MAXSWAP, HALFSWAP, 
     &                     ORDER, SENDSIZE, RECVSIZE, SENDINDEX, 
     &                     RECVINDEX, SWAPNODE) 
C
C This routine calculates swap partners and other information needed
C by the recursive halving combine and recursive doubling broadcast
C algorithms.
C
C called by: HALF1, HALF2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C length of vectors to be summed
      INTEGER LTH
C declared length of the output arrays, 
C at least as long as LOG2(MAPSIZE)
      INTEGER LGPROCSX
C alignment requirement for send and receive buffers (in number of 
C reals)
      INTEGER ALIGN
C message size (in reals) below which an exchange sum algorithm is
C to be used instead of a recursive halving sum algorithm
      INTEGER EXCHSIZE
C
C     Output
C
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in both recursive halving and exchange sum portions of
C the algorithm
      INTEGER MAXSWAP
C number of swaps in the recursive halving (and recursive doubling)
C portions of the algorithm
      INTEGER HALFSWAP
C array indicating whether this processor sends or receives first
C during a swap at a given step (for synchronous communication)
      INTEGER ORDER(LGPROCSX)
C array indicating the size of the message being sent during a given 
C swap
      INTEGER SENDSIZE(LGPROCSX)
C array indicating the size of the message being received during a given 
C swap
      INTEGER RECVSIZE(LGPROCSX)
C array indicating the index for the beginning of the message being 
C sent during a given swap
      INTEGER SENDINDEX(LGPROCSX)
C array indicating the index for the beginning of the message being 
C received during a given swap
      INTEGER RECVINDEX(LGPROCSX)
C array indicating the destination of the message being sent during
C a given swap
      INTEGER SWAPNODE(LGPROCSX)
C
C---- Local Variables --------------------------------------------------
C
C temporary for determining swap information
      INTEGER I
C variables indicating sizes of upper partial half cube
C and lower complete half cube
      INTEGER EXTRA, HALF
C MYINDEX for permuted ordering when MAPSIZE not a power of two
      INTEGER LOCALDEX
C temporaries for determining message lengths
      INTEGER SIZEA, SIZEB
C temporaries for determining message index offsets
      INTEGER INDEXA, INDEXB
C temporary for determining message destination
      INTEGER DEST

C
C---- Executable Statements -------------------------------------------
C                                                                              
C     Check for existence of a partial upper half cube,
      I = 1
      DO WHILE (I .LT. MAPSIZE)
        I = I*2
      ENDDO
C
      IF (MAPSIZE .EQ. I) THEN
        HALF = I
        EXTRA = 0
      ELSE
        HALF = I/2
        EXTRA = MAPSIZE - HALF
      ENDIF
C
C     For nonpowers-of-two, logical partial upper half cube
C     processors mapped to first "extra" even processors in
C     logical ring, This improves performance on a mesh
C     architecture. Performance is worse on a hypercube,
C     but using a nonpower of two processors on a hypercube 
C     is bogus anyway. Mapping courtesy of S. Seidel.
      UPPER = .FALSE.
      IF (EXTRA .GT. 0) THEN
C
C       First, identify TWIN and new MYINDEX with permuted mapping.
        IF (MYINDEX .LT. 2*EXTRA) THEN
          IF (MOD(MYINDEX, 2) .EQ. 1) THEN
            TWIN = MAP(MYINDEX-1)
            LOCALDEX = MYINDEX/2
          ELSE
            TWIN = MAP(MYINDEX+1)
            UPPER = .TRUE.
          ENDIF
        ELSE
          LOCALDEX = MYINDEX - EXTRA
          TWIN = -1
        ENDIF
C
      ELSE
C
C       Power of two, so no TWIN and no permutation.
        TWIN = -1
        LOCALDEX = MYINDEX
C
      ENDIF
C
C     Next, calculate swap partners, swap array sizes, and pointers 
C     into work arrays for each swap. Also calculate swap order for 
C     (COMMOPT .EQ. 1) protocol. The order is chosen in such a way as 
C     to minimize collisions on a bidirectional grid.
      IF (.NOT. UPPER) THEN
C
        I = 1
        MAXSWAP  = 0
        HALFSWAP = 0
        SIZEA    = LTH
        INDEXA   = 1
C
        DO WHILE (I .LT. HALF)
C
          MAXSWAP = MAXSWAP + 1
C
          IF (SIZEA .GE. EXCHSIZE) THEN
C           recursive halving sum phase: Vector length is halved at 
C           each step (modulo alignment), and the sending and receiving 
C           indices are for different halves of the previous vector.
            HALFSWAP = MAXSWAP
            SIZEB = ALIGN*((SIZEA/ALIGN)/2) + MOD(SIZEA,ALIGN)
            SIZEA = ALIGN*((SIZEA/ALIGN) - ((SIZEA/ALIGN)/2))
            INDEXB = INDEXA + SIZEA
          ELSE
C           exchange sum phase: Size and index stop changing,
C           and become the same for both receiving and sending.
            SIZEB = SIZEA
            INDEXB = INDEXA
          ENDIF
C
          IF (MOD(LOCALDEX, 2*I) .LT. I) THEN
            DEST = LOCALDEX + I
            IF (MOD(LOCALDEX, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = 1
            ELSE
              ORDER(MAXSWAP) = -1
            ENDIF
            SENDSIZE(MAXSWAP) = SIZEA
            RECVSIZE(MAXSWAP) = SIZEB
            SENDINDEX(MAXSWAP) = INDEXA
            RECVINDEX(MAXSWAP) = INDEXB
            SIZEA = SIZEB
            INDEXA = INDEXB
          ELSE
            DEST = LOCALDEX - I
            IF (MOD(DEST, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = -1
            ELSE
              ORDER(MAXSWAP) = 1
            ENDIF
            SENDSIZE(MAXSWAP) = SIZEB
            RECVSIZE(MAXSWAP) = SIZEA
            SENDINDEX(MAXSWAP) = INDEXB
            RECVINDEX(MAXSWAP) = INDEXA
          ENDIF
C
          IF (DEST .LT. EXTRA) THEN
            SWAPNODE(MAXSWAP) = MAP(2*DEST+1)           
          ELSE
            SWAPNODE(MAXSWAP) = MAP(DEST+EXTRA)           
          ENDIF
C
          I = 2*I
C
        ENDDO
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
