C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C irfft.f                                                              C
C                                                                      C
C The following routines calculate the inverse real Fourier transform  C
C of a block of distributed vectors, or are utility routines used in   C
C the inverse transform.                                               C
C                                                                      C
C IRFFT1 - calculates inverse real Fourier transforms of a block of    C
C          distributed vectors.                                        C
C IRFFT2 - calculates inverse real Fourier transforms of two blocks of C
C          distributed vectors, overlapping the butterfly calculation  C
C          phase for one block with the communication phase for the    C
C          other block.                                                C 
C FIXITY - modifies the Fourier coefficients of a real sequence so thatC
C          a subsequent reordering and an inverse complex Fourier      C
C          transform generates the real sequence.                      C
C SEQINV - calculates a sequential (local) block inverse complex       C
C          Fourier transform.                                          C
C IBTUPD - calculates an "inverse" power-of-two butterfly update for   C
C          each vector in a block, representing a factor-of-two stage  C
C          of an inverse Fourier transform.                            C
C BUFFIX - reorders the output from FIXITY and CPYFIX so that a        C
C          subsequent inverse complex Fourier transform generates the  C
C          real sequence.                                              C
C                                                                      C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE IRFFT1(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                  MYINDEX, BASE, NLOCAL, MVECS, RJUMP, NFAX,
     &                  NTRIGS, TRIGS, WS, Y)  
C
C This subroutine calculates the inverse (real) Fourier transform of M
C complex vectors over a subset of MAPSIZE processors, producing M real
C vectors of length N=NLOCAL*MAPSIZE. It uses a distributed algorithm,
C i.e., an algorithm that moves the data and partial results only as
C they are needed, and transforms all M vectors as a single block. The
C MAP array defines the subset and the processor ordering to use. 
C
C Communication options (COMMOPT) for IRFFT1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for IRFFT1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: PARRFT
C calls: BUFCPY, BUFFIX, CPYFIX, FIXITY, IBTUPD, RFFT_INIT, SEQINV, SWAP
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in map array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C the local length of each vector to be transformed
      INTEGER NLOCAL
C the number of vectors per processor to be transformed in y
      INTEGER MVECS
C the offset between the start of successive vectors.
      INTEGER RJUMP
C the factorization of N (= NLOCAL*MAPSIZE)
      INTEGER NFAX(13)
C number of trigonometric function values used by IRFFT1
      INTEGER NTRIGS
C trigonometric function values used in inverse real Fourier
C transform 
      COMPLEX TRIGS(NTRIGS)
C
C     Work Space
C
C work array
      REAL WS(MVECS*(RJUMP/2),2)
C
C     Input/Output
C
C Y contains MVECS complex vectors each of length N/2+1.
C The real and imaginary parts of these vectors are stored as 
C consecutive elements of Y. Hence the real and imaginary parts of
C the ith component of the jth complex vector originally in Y
C are stored at Y(2*I-1,J) and Y(2*I,J).
      REAL Y(RJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C number of complex values in real arrays of size NLOCAL and RJUMP,
C respectively
      INTEGER NCLOC, CJUMP
C number of complex values in each local vector segment swapped during
C a parallel stage 
      INTEGER NCOMM
C length of message swapped during each parallel stage (in bytes) 
      INTEGER BUFSIZ
C true processor id for "me"
      INTEGER ME
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(0:LGPROCSX)
      INTEGER ORDER(0:LGPROCSX)
C which half (0: low, 1: high) of the data buffer that is sent during a 
C given swap, and index offset to that half
      INTEGER DIRN(0:LGPROCSX), OFFSET
C index and true processor id of the processor whose data is needed
C in order to compute the real transform from the complex transform,
C and order in which swap with TWIN is to be performed
      INTEGER TWINDEX, TWIN, TWORDER
C index offsets into trigonometric function values array
      INTEGER ICDIST, ICSEQ, IC
C number of "computational" swaps in distributed phase of transform
C algorithm. Total number of swaps includes one before sequential
C phase, and possibly one before real fix-up phase.
      INTEGER MAXSTEP
C loop index
      INTEGER STEP
C                                                                             
C---- Executable Statements --------------------------------------------
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     initialization
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     number of complex values in real arrays of size NLOCAL and RJUMP
      NCLOC = NLOCAL/2 
      CJUMP = RJUMP/2
C     number of complex values in each local vector segment swapped
C     during a parallel stage 
      NCOMM = NCLOC/2
C     length of message swapped during each parallel stage (in
C     bytes) 
      BUFSIZ = CBYTES*NCOMM*MVECS
C
C     Calculate swap partners (and swap order for (COMMOPT .EQ. 1)
C     protocol).  
      CALL RFFT_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MAXSTEP,
     &               SWAPNODE, ORDER, DIRN, TWINDEX, TWORDER)
C
C     index offsets into the trigonometric function values array
C     corresponding to the sequential and distributed phases of the
C     algorithm  
      ICSEQ  = NCOMM + 1
      ICDIST = ICSEQ + NCLOC
C
C     Identify myself and "twin".
      ME = MAP(MYINDEX)
      TWIN = MAP(TWINDEX)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     Calculate inverse real Fourier transform.
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Combine odd and even elements of the complex vector as the first
C     step.
      CALL TRACEEVENTF('entry', 602, 0, 0)
      CALL FIXITY(MYINDEX, NCLOC, MVECS, CJUMP, TRIGS, Y)
      CALL TRACEEVENTF('exit', 602, 0, 0)
C
C     Move data around in preparation for inverse Fourier transform. 
      CALL TRACEEVENTF('entry', 603, 0, 0)
      IF ((MYINDEX .EQ. 0) .OR.(MYINDEX .EQ. TWINDEX)) THEN
        CALL CPYFIX(MYINDEX, NCLOC, MVECS, CJUMP, Y, WS(1,2))
      ELSE
        CALL CPYFIX(MYINDEX, NCLOC, MVECS, CJUMP, Y, WS(1,1))
        CALL SWAP(COMMOPT, PROTOPT, TWORDER, ME, BASE,
     &            TWIN, BUFSIZ, WS(1,1), BUFSIZ, WS(1,2))
      ENDIF
      CALL BUFFIX(NCLOC, MVECS, CJUMP, WS(1,2), Y)
      CALL TRACEEVENTF('exit', 603, 0, 0)
C
C     Calculate local inverse (complex) Fourier transform.
      CALL TRACEEVENTF('entry', 601, 0, 0)
      CALL SEQINV(NCLOC, MVECS, CJUMP, TRIGS(ICSEQ), Y)
      CALL TRACEEVENTF('exit', 601, 0, 0)
C
C     Calculate final (distributed) steps of the inverse Fourier
C     transform. 
      IC = ICDIST
      DO STEP=0,MAXSTEP
C
        IF (STEP .EQ. 0) THEN
          OFFSET = NCOMM*(1-DIRN(0))
          CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVECS, Y,
     &                WS(1,1)) 
        ENDIF
        CALL SWAP(COMMOPT, PROTOPT, ORDER(STEP), ME, BASE,
     &            SWAPNODE(STEP), BUFSIZ, WS(1,1), BUFSIZ, WS(1,2))
        IF (STEP .LT. MAXSTEP) THEN
          CALL IBTUPD(DIRN(STEP), DIRN(STEP+1), NCLOC, MVECS, CJUMP,
     &                TRIGS(IC), WS(1,2), WS(1,1), Y)
          IC = IC + NCOMM
        ELSE
          OFFSET = NCOMM*(1-DIRN(MAXSTEP))
          CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVECS,
     &                WS(1,2), Y)  
        ENDIF
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE IRFFT2(COMMOPT, PROTOPT, MAPSIZE, MAP, 
     &                  MYINDEX, BASE1, BASE2, NLOCAL, MVEC1, MVEC2,
     &                  RJUMP, NFAX, NTRIGS, TRIGS, WS1, WS2, Y1, Y2)  
C
C This subroutine calculates the inverse (real) Fourier transform of 2
C sets of complex vectors over a subset of MAPSIZE processors, producing
C real vectors of length N=NLOCAL*MAPSIZE. It uses a distributed
C algorithm, i.e., an algorithm that moves the data and partial results
C only as they are needed, and overlaps the butterfly calculation phase
C for one block with the communication phase for the other block. The
C MAP array defines the subset and the processor ordering to use. 
C
C Communication options (COMMOPT) for IRFFT2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for IRFFT2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C Since this is a "send-ahead" algorithm, only the simple exchange 
C (send/recv) communication option is invoked in SWAP_SEND and 
C SWAP_RECV. (Using odd/even swaps would prevent overlap.)
C The odd/even option is between SWAP_SEND and SWAP_RECV for different
C swaps, and is handled explicitly in this routine.
C
C called by: PARRFT
C calls: BUFCPY, BUFFIX, CPYFIX, FIXITY, RFFT_INIT, IBTUPD,
C        SEQINV, SWAP_SEND, SWAP_RECV
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in map array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C the local length of each vector to be transformed
      INTEGER NLOCAL
C the number of vectors per processor to be transformed in y1
      INTEGER MVEC1
C the number of vectors per processor to be transformed in Y2
      INTEGER MVEC2
C the offset between the start of successive vectors.
      INTEGER RJUMP
C the factorization of N (= NLOCAL*MAPSIZE)
      INTEGER NFAX(13)
C number of trigonometric function values used by IRFFT1
      INTEGER NTRIGS
C trigonometric function values used by in inverse real Fourier
C transform 
      COMPLEX TRIGS(NTRIGS)
C
C     Work Space
C
C work array 1
      REAL WS1(MVEC1*(RJUMP/2),2)
C work array 2
      REAL WS2(MVEC2*(RJUMP/2),2)
C
C     Input/Output
C
C Y1 and Y2 each contain complex vectors of length NLOCAL/2. 
C Y1 contains MVEC1 vectors and Y2 contains MVEC2 vectors. 
C The real and imaginary parts of these vectors are stored as 
C consecutive elements of Y1 and Y2. Hence the real and imaginary 
C parts of the Ith component of the Jth complex vector in Y1 are stored
C at Y1(2*I-1,J) and Y1(2*I,J). 
      REAL Y1(RJUMP,MVEC1)
      REAL Y2(RJUMP,MVEC2)
C
C---- Local Variables --------------------------------------------------
C
C number of complex values in real arrays of size NLOCAL and RJUMP,
C respectively
      INTEGER NCLOC, CJUMP
C number of complex values in each local vector segment swapped during
C a parallel stage 
      INTEGER NCOMM
C lengths of messages swapped during each parallel stage (in bytes) 
      INTEGER BUFSZ1, BUFSZ2
C true processor id for "me"
      INTEGER ME
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(0:LGPROCSX)
      INTEGER ORDER(0:LGPROCSX)
C which half (0: low, 1: high) of the data buffer that is sent during a 
C given swap, and index offset to that half
      INTEGER DIRN(0:LGPROCSX), OFFSET
C index and true processor id of the processor whose data is needed
C in order to compute the real transform from the complex transform,
C and order in which swap with TWIN is to be performed
      INTEGER TWINDEX, TWIN, TWORDER
C index offsets into trigonometric function values array
      INTEGER ICDIST, ICSEQ, IC
C number of "computational" swaps in distributed phase of transform
C algorithm. Total number of swaps includes one before sequential
C phase, and possibly one before real fix-up phase.
      INTEGER MAXSTEP
C loop index
      INTEGER STEP
C                                                                             
C---- Executable Statements --------------------------------------------
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     initialization
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     number of complex values in real arrays of size NLOCAL and RJUMP
      NCLOC = NLOCAL/2 
      CJUMP = RJUMP/2
C     number of complex values in each local vector segment swapped
C     during a parallel stage 
      NCOMM = NCLOC/2
C     lengths of messages swapped during each parallel stage (in bytes) 
      BUFSZ1 = CBYTES*NCOMM*MVEC1
      BUFSZ2 = CBYTES*NCOMM*MVEC2
C
C     Calculate swap partners (and swap order for (COMMOPT .EQ. 1)
C     protocol).  
      CALL RFFT_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MAXSTEP,
     &               SWAPNODE, ORDER, DIRN, TWINDEX, TWORDER)
C
C     If (COMMOPT .NE. 1), then not using odd/even option, so reset the
C     ORDER array.
      IF (COMMOPT .NE. 1) THEN
        DO STEP=0,MAXSTEP
          ORDER(STEP) = 1
        ENDDO
      ENDIF
C
C     index offsets into the trigonometric function values array
C     corresponding to the sequential and distributed phases of the
C     algorithm 
      ICSEQ  = NCOMM + 1
      ICDIST = ICSEQ + NCLOC
C
C     Identify myself and TWIN.
      ME = MAP(MYINDEX)
      TWIN = MAP(TWINDEX)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     Calculate inverse real Fourier transform. See IRFFT1 for a more
C     readable outline of the algorithm.
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      IF (( MYINDEX .EQ. 0) .OR. (MYINDEX .EQ. TWINDEX)) THEN
C
C       Calculate sequential real RFT for Y1.
        CALL FIXITY(MYINDEX, NCLOC, MVEC1, CJUMP, TRIGS, Y1)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC1, CJUMP, Y1, WS1(1,2))
        CALL BUFFIX(NCLOC, MVEC1, CJUMP, WS1(1,2), Y1)
        CALL SEQINV(NCLOC, MVEC1, CJUMP, TRIGS(ICSEQ), Y1)
C
        IF (MAXSTEP .GT. -1) THEN
C
C         Prime the pipeline (step 1): send Y1.
          OFFSET = NCOMM*(1-DIRN(0))
          CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC1, Y1,
     &                WS1(1,1)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(0), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2)) 
C
        ENDIF
C
C       Calculate sequential real RFT for Y2.
        CALL FIXITY(MYINDEX, NCLOC, MVEC2, CJUMP, TRIGS, Y2)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC2, CJUMP, Y2, WS2(1,2))
        CALL BUFFIX(NCLOC, MVEC2, CJUMP, WS2(1,2), Y2)
        CALL SEQINV(NCLOC, MVEC2, CJUMP, TRIGS(ICSEQ), Y2)
C
        IF (MAXSTEP .GT. -1) THEN
C
C         Prime the pipeline (step 2): send Y2, recv Y1, update Y1.
          CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC2, Y2,
     &                WS2(1,1)) 
          IF (ORDER(0) .EQ. 1) THEN
            CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                     SWAPNODE(0), BUFSZ2, WS2(1,1), BUFSZ2,
     &                     WS2(1,2)) 
            CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                     SWAPNODE(0), BUFSZ1, WS1(1,2)) 
          ELSE
            CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                     SWAPNODE(0), BUFSZ1, WS1(1,2)) 
            CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                     SWAPNODE(0), BUFSZ2, WS2(1,1), BUFSZ2,
     &                     WS2(1,2)) 
          ENDIF
C
          IC = ICDIST
          CALL IBTUPD(DIRN(0), DIRN(1), NCLOC, MVEC1, CJUMP, TRIGS(IC),
     &                WS1(1,2), WS1(1,1), Y1) 
C
        ENDIF
C          
      ELSE
C
C       Prime the pipeline (step 1): fix Y1, send Y1.
        CALL FIXITY(MYINDEX, NCLOC, MVEC1, CJUMP, TRIGS, Y1)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC1, CJUMP, Y1, WS1(1,1))
        CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                 TWIN, BUFSZ1, WS1(1,1), BUFSZ1, WS1(1,2))
C
C       Prime the pipeline (step 2): fix Y2, send Y2, recv Y1, RFT Y1. 
        CALL FIXITY(MYINDEX, NCLOC, MVEC2, CJUMP, TRIGS, Y2)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC2, CJUMP, Y2, WS2(1,1))
        IF (TWORDER .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,1), BUFSZ2, WS2(1,2))
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,1), BUFSZ2, WS2(1,2))
        ENDIF
C
        CALL BUFFIX(NCLOC, MVEC1, CJUMP, WS1(1,2), Y1)
        CALL SEQINV(NCLOC, MVEC1, CJUMP, TRIGS(ICSEQ), Y1)
C
C       Prime the pipeline (step 3): send Y1, recv Y2, RFT Y2.
        OFFSET = NCOMM*(1-DIRN(0))
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC1, Y1,
     &              WS1(1,1)) 
        IF (ORDER(0) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(0), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(0), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2)) 
        ENDIF
C
        CALL BUFFIX(NCLOC, MVEC2, CJUMP, WS2(1,2), Y2)
        CALL SEQINV(NCLOC, MVEC2, CJUMP, TRIGS(ICSEQ), Y2)
C
C       Prime the pipeline (step 4): send Y2, recv Y1, update Y1.
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC2, Y2,
     &              WS2(1,1)) 
        IF (ORDER(0) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(0), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(0), BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(0), BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(0), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
        ENDIF
C
        IC = ICDIST
        CALL IBTUPD(DIRN(0), DIRN(1), NCLOC, MVEC1, CJUMP, TRIGS(IC), 
     &              WS1(1,2), WS1(1,1), Y1) 
C
      ENDIF
C
C     Begin overlapped algorithm.
      DO STEP=1,MAXSTEP
C
C       Finish swap with old partner and start swap with new
C       partner: send Y1, recv Y2, update Y2.
        IF (ORDER(STEP) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2))  
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(STEP-1), BUFSZ2, WS2(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(STEP-1), BUFSZ2, WS2(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2))  
        ENDIF
C
        CALL IBTUPD(DIRN(STEP-1), DIRN(STEP), NCLOC, MVEC2, CJUMP, 
     &              TRIGS(IC), WS2(1,2), WS2(1,1), Y2) 
C
C       Continue swap with new partner: send Y2, recv Y1, update Y1.
        IF (ORDER(STEP) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(STEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, 1, ME, BASE2,
     &                   SWAPNODE(STEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
        ENDIF
C
        IF (STEP .LT. MAXSTEP) THEN
          IC = IC + NCOMM
          CALL IBTUPD(DIRN(STEP), DIRN(STEP+1), NCLOC, MVEC2, CJUMP,
     &                TRIGS(IC), WS1(1,2), WS1(1,1), Y1) 
        ELSE
          OFFSET = NCOMM*(1-DIRN(MAXSTEP))
          CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVEC1,
     &                WS1(1,2), Y1)  
        ENDIF
C
      ENDDO
C
C     Finish last dimensional swap: recv Y2, update Y2.
      IF (MAXSTEP .GT. -1) THEN
        CALL SWAP_RECV(0, PROTOPT, 1, ME, BASE2,
     &                 SWAPNODE(MAXSTEP), BUFSZ2, WS2(1,2)) 
        CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVEC2,
     &              WS2(1,2), Y2)  
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE FIXITY(MYINDEX, VECLTH, MVECS, CJUMP, CEXP, Y)
C
C FIXITY combines odd and even elements of each complex vector in Y as
C as the first step in computing inverse real Fourier transforms. After
C the modification, the inverse transforms can proceed by computing the
C usual inverse complex Fourier transforms of each vector. 
C 
C called by: IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C index of "me" in processor subset collaborating in distributed real
C Fourier transform 
      INTEGER MYINDEX
C length of each complex vector
      INTEGER VECLTH
C number of vectors being modified
      INTEGER MVECS
C the offset between the start of successive vectors in Y
      INTEGER CJUMP
C trigonometric function values used in the modification
      COMPLEX CEXP(VECLTH/2)
C
C     Input/Output
C
C On input, Y contains the data to be modified. On output, Y contains
C the results of the modification.
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C real one, to allow easy precision modifications
      REAL ONE
C half the length of the vectors in Y
      INTEGER HLFLTH
C loop bound
      INTEGER IST
C loop indices I, J
      INTEGER I, J
C temporaries used in computing the recombination of the data
      COMPLEX H1, H2
C                                                                             
C---- Executable Statements --------------------------------------------
C
      ONE = 1.0
C
C     Calculate half the length of the data vectors.
      HLFLTH = VECLTH/2
C
      DO J=1,MVECS
C
        IF (MYINDEX .EQ. 0) THEN
          IST = 2
	  Y(1,J) = CMPLX(ONE,ONE)*CONJG(Y(1,J))
	  Y(2,J) = 2.0*Y(2,J)
        ELSE
          IST = 1
        ENDIF
C
        DO I=IST,HLFLTH
C         Combine neighboring odd and even elements.
          H1 =  Y(2*I-1,J) + CONJG(Y(2*I,J))
          H2 = (Y(2*I-1,J) - CONJG(Y(2*I,J)))*CEXP(I)
          Y(2*I-1,J) = H1 + H2
          Y(2*I,J)   = CONJG(H1 - H2)
        ENDDO
C
      ENDDO
C       
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SEQINV(N, MVECS, CJUMP, CEXP, Y)
C
C SEQINV calculates in-place (unnormalized) inverse Fourier transforms
C of an array of complex vectors. Each vector is of length N, 
C where N is a power of two. 
C
C called by: IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C length of each complex vector
      INTEGER N
C number of vectors being transformed
      INTEGER MVECS
C the offset between the start of successive vectors
      INTEGER CJUMP
C trigonometric function values used in the transform
      COMPLEX CEXP(N)
C
C     Input/Output
C
C On input, Y contains the data to be transformed. On output, Y
C contains the results of the transforms.
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C loop limits for each stage of the power-of-two inverse fast Fourier
C transform 
       INTEGER NSTEPS, INCREM, NBLOCK
C index offset used to move through the trigonometric function values
C in the correct order
      INTEGER IC
C index offsets used to access the data needed in a given butterfly
C update 
      INTEGER IOFF1, IOFF2     
C loop indices
      INTEGER I, J, K, L
C temporaries used in computing the inverse power-of-two butterfly
C update 
      COMPLEX C
C
C---- External Functions ----------------------------------------------
C
C log base 2 function
      EXTERNAL LOG2
      INTEGER LOG2
C                                                                             
C---- Executable Statements --------------------------------------------
C
      NSTEPS = LOG2(N)
      DO L=1,MVECS
C
        INCREM = 1
        NBLOCK = N/2
        IC     = 0
        DO K=1,NSTEPS
C
          DO J=1,NBLOCK
C
            IOFF1 = 2*(J-1)*INCREM
            IOFF2 = IOFF1 + INCREM
            DO I=1,INCREM
C
C             Calculate power-of-two butterfly update:
              C = Y(IOFF2+I,L)*CEXP(IC+I)
              Y(IOFF2+I,L)   = Y(IOFF1+I,L) - C
              Y(IOFF1+I,L)   = Y(IOFF1+I,L) + C
C
            ENDDO
C
          ENDDO
C
          IC = IC + INCREM
          INCREM = 2*INCREM
          NBLOCK = NBLOCK/2
C
        ENDDO
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE IBTUPD(DIRIN, DIROUT, VECLTH, MVECS, CJUMP, CEXP, 
     &                  INBUF, OUTBUF, Y)
C
C IBTUPD calculates an "inverse" power-of-two butterfly update for each
C vector in Y, using VECLTH/2 complex data in INBUF and Y to produce a
C new complex vector of length VECLTH, VECLTH/2 in Y and VECLTH/2 in
C OUTBUF. This update represents a factor-of-two stage of an inverse
C fast Fourier transform. 
C
C called by: IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C switch indicating whether the data for vector J start in INBUF(1,J)
C and Y(1,J) (DIRIN .EQ. 0) or in Y(VECLTH/2+1,J) and INBUF(1,J) 
C (DIRIN .NE. 0) 
      INTEGER DIRIN
C switch indicating whether the results for vector J start in
C OUTBUF(1,J) and Y(1,J) (DIROUT .EQ. 0) or in Y(VECLTH/2+1,J) and
C OUTBUF(1,J) (DIROUT .NE. 0) 
      INTEGER DIROUT
C length of each complex output vector and half the length of the
C complex data vectors (and length of real data vectors)
      INTEGER VECLTH
C number of vectors being updated
      INTEGER MVECS
C the offset between the start of successive vectors in Y
      INTEGER CJUMP
C trigonometric function values used in the update
      COMPLEX CEXP(VECLTH/2)
C half of the data used in the update
      COMPLEX INBUF(VECLTH/2,MVECS)
C
C     Input/Output
C
C On input, half of the data for the update is contained in Y. On
C output, Y contains the results of the update. 
      COMPLEX Y(CJUMP,MVECS)
C half of the results produced by the update
      COMPLEX OUTBUF(VECLTH/2,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C length of data vectors
      INTEGER HLFLTH
C loop indices
      INTEGER I, J
C temporary used in computing the inverse power-of-two butterfly update 
      COMPLEX C
C                                                                             
C---- Executable Statements --------------------------------------------
C
C     length of data vectors
      HLFLTH = VECLTH/2
C
      IF (DIROUT .EQ. 0) THEN
C
        IF (DIRIN .EQ. 0) THEN
C
          DO J=1,MVECS
            DO I=1,HLFLTH
C             Calculate power-of-two butterfly update.
              C = INBUF(I,J)*CEXP(I)
              OUTBUF(I,J) = Y(I,J) - C
              Y(I,J)      = Y(I,J) + C
            ENDDO
          ENDDO
C
        ELSE
C
          DO J=1,MVECS
            DO I = 1,HLFLTH
C             Calculate power-of-two butterfly update.
              C = Y(HLFLTH+I,J)*CEXP(I)
              OUTBUF(I,J) = INBUF(I,J) - C
              Y(I,J)      = INBUF(I,J) + C
            ENDDO
          ENDDO
C
        ENDIF
C
      ELSE
C
        IF (DIRIN .EQ. 0) THEN
C
          DO J=1,MVECS
            DO I=1,HLFLTH
C             Calculate power-of-two butterfly update.
              C = INBUF(I,J)*CEXP(I)
              Y(HLFLTH+I,J) = Y(I,J) - C
              OUTBUF(I,J)   = Y(I,J) + C
            ENDDO
          ENDDO
C
        ELSE
C
          DO J=1,MVECS
            DO I = 1,HLFLTH
C             Calculate power-of-two butterfly update.
              C = Y(HLFLTH+I,J)*CEXP(I)
              Y(HLFLTH+I,J) = INBUF(I,J) - C
              OUTBUF(I,J)   = INBUF(I,J) + C
            ENDDO
          ENDDO
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE BUFFIX(VECLTH, MVECS, CJUMP, BUF, Y)
C
C BUFFIX copies the elements of each (half length) complex vector in
C buffer BUF into the even elements of the corresponding vector in Y, in
C reverse order. 
C
C called by: IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C length of the complex vectors in Y
      INTEGER VECLTH
C number of vectors in Y and BUF
      INTEGER MVECS
C the offset between the start of successive vectors in Y
      INTEGER CJUMP
C the array of half length data vectors
      COMPLEX BUF(VECLTH/2,MVECS)
C
C     Input/Output
C
C the array of vectors being partially  overwritten
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C length of the vectors in BUF
      INTEGER HLFLTH
C loop indices
      INTEGER I, J
C                                                                             
C---- Executable Statements --------------------------------------------
C
C length of the vectors in BUF
      HLFLTH = VECLTH/2
C
      DO J=1,MVECS
        DO I=1,HLFLTH
          Y(VECLTH-2*I+2,J) = BUF(I,J)
        ENDDO
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
