C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C rftfax.f                                                             C
C                                                                      C
C The following routines are used to calculate trigonometric function  C
C values and ordering information required when using the distributed  C
C real Fourier transform routine PARRFT.                               C
C                                                                      C
C RFTFAX - calculates the factorization of N, the length of the        C
C          distributed vector being transformed, and the twiddle       C
C          factors (trigonometric function values) used on this        C
C          processor in the transform.                                 C
C MDEX   - calculates the global (ordered) index of a Fourier          C
C          given its local (unordered) index.                          C
C CROOTU - returns the requested root of unity.                        C
C                                                                      C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE RFTFAX(P, MYINDEX, N, NTRIGS, TRIGS, NFAX)
C
C This routine is an initialization routine that must 
C be called before REALRFT routine for a given decomposition
C
C called by: INPUT
C calls: CROOTU
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors over which the Fourier transform is to be
C distributed
      INTEGER P
C index of "me" in the set of processors
      INTEGER MYINDEX
C The number of points to be transformed: N must be a power of 2, and
C there must be at least 4 data points per processor
      INTEGER N
C number of trigonometric function values to be generated (number needed
C by PARRFT. It should be equal to (LOG2(P)+6)*N/4 + LOG2(P).
      INTEGER NTRIGS
C
C     Output
C
C array containing twiddle factors used in PARRFT
      COMPLEX TRIGS(NTRIGS)
C the factorization of N 
      INTEGER NFAX(13)
C
C---- Local Variables --------------------------------------------------
C
C real zero and one, to allow easy precision modifications
      REAL ZERO, ONE
C base 2 logarithm of the number of processors (also, approximate number
C of steps in the distributed phase of the Fourier transform algorithm)
      INTEGER LGP
C the local (per processor) length of the vector to be transformed
      INTEGER NLOCAL
C number of complex values in a real array of size NLOCAL
      INTEGER NCLOC
C number of complex values in each local vector segment swapped during
C a parallel stage 
      INTEGER NCOMM
C base 2 logarithm of NCLOC (approximate number of steps in the local
C phase of the Fourier transform)
      INTEGER ILOC
C total number of steps in the Fourier transform algorithm
      INTEGER ITOT
C index offsets for the TRIGS array, denoting the beginning of the
C twiddle factors for the forward tranform distributed, sequential
C and fix-up phases, and the inverse transform fix-up, sequential, and
C distributed phases, respectively.
      INTEGER FWDDIST, FWDSEQ, FWDFIX, INVFIX, INVSEQ, INVDIST
C binary representation of MYINDEX, and temporary used to compute it
      INTEGER IPBITS(0:15), TEMP
C loop indices
      INTEGER I, J, K
C P/2
      INTEGER NPHALF
C bit reversal of MYINDEX
      INTEGER MYREV
C powers of two
      INTEGER IPOW, JPOW, KPOW
C left and right circular shifts of MYINDEX, and right circular shift of
C MYREV
      INTEGER ILSHFT, IRSHFT, IRSREV
C other index offset temporaries used in computing twiddle factors
      INTEGER IUPPER, INDX, ICOUNT, INCREM, MBIT, IRIND
C
C---- External Functions -----------------------------------------------
C
C integer base 2 logarithm
      EXTERNAL LOG2
      INTEGER LOG2
C
C returns Nth roots of unity
      EXTERNAL CROOTU
      COMPLEX CROOTU
C
C returns bit reversed values
      EXTERNAL BITREV
      INTEGER BITREV
C                                                                             
C---- Executable Statements --------------------------------------------
C
C Calculate various values derived from N and P. See descriptions of
C variables above.
      NPHALF = P/2
      LGP    = LOG2(P)
      NLOCAL = N/P
      NCLOC  = NLOCAL/2
      NCOMM  = NCLOC/2
      ILOC   = LOG2(NCLOC)
      ITOT   = LGP + ILOC
C
      IF (NLOCAL .LT. 4) THEN
        WRITE(0,100) N, P, NLOCAL
 100    FORMAT (/,' PSTSWM: FATAL ERROR IN ROUTINE RFTFAX:',/,
     &   ' NOT ENOUGH POINTS PER PROCESSOR FOR DISTRIBUTED',/,
     &   ' FOURIER TRANSFORM (NEED AT LEAST 4)',/,
     &   ' P :',I8,/,' N :',I8,' N/P ',I8)
        STOP
      ENDIF
C
C     Calculate index offsets for trigonometric data.
C     COMPLEX CFOR(LGP)
      FWDDIST = 1
C     COMPLEX CEXP3(NLOCAL/2)
      FWDSEQ = FWDDIST + LGP
C     COMPLEX CEXP4(NLOCAL/4)
      FWDFIX = FWDSEQ + (NLOCAL/2)
C     COMPLEX CEXP5(NLOCAL/4)
      INVFIX = FWDFIX + (NLOCAL/4)
C     COMPLEX CEXP2(NLOCAL/2)
      INVSEQ = INVFIX + (NLOCAL/4)
C     COMPLEX CEXP1(LGPFC*NLOCAL/4)
      IF (LGP .GT. 0) THEN
        INVDIST = INVSEQ + (NLOCAL/2)
      ELSE
        INVDIST = FWDDIST
      ENDIF
C
      IF (((LGP+6)*NLOCAL/4 + LGP) .GT. NTRIGS) THEN
        WRITE(0,101) NTRIGS, ((LGP+6)*N/4 + LGP)
 101    FORMAT (/,' PSTSWM: FATAL ERROR IN ROUTINE RFTFAX:',/,
     &   ' NOT ENOUGH SPACE ALLOCATED FOR TRIGONOMETRIC FUNCTION',
     &   ' VALUES',/,
     &   ' ALLOCATED SPACE :',I8,/,' REQUIRED SPACE  :',I8)
        STOP
      ENDIF
C
C     Calculate circular left shift of MYINDEX.
      IF (MYINDEX .NE. 0) THEN
        ILSHFT  = 2*MOD(MYINDEX, NPHALF) + MYINDEX/NPHALF
      ELSE
        ILSHFT = 0
      ENDIF
C
C     Calculate circular right shift of MYINDEX.
      IRSHFT = MYINDEX/2 + MOD(MYINDEX,2)*NPHALF 
C
C     Determine binary representation of MYINDEX.
      TEMP  = MYINDEX
      DO I=0,15
        IPBITS(I) = MOD(TEMP,2)
        TEMP = TEMP/2
      ENDDO
C
C     Calculate right shifted reversal of MYINDEX.
      MYREV = 0
      IPOW  = NPHALF
      DO I=0,LGP-1
        MYREV = MYREV + IPOW*IPBITS(I)
        IPOW  = IPOW/2
      ENDDO
      IRSREV = MYREV/2  + MOD(MYREV,2)*NPHALF 
C
C     Calculate twiddle factors for distributed phase of inverse real
C     transform.
      ICOUNT = 0
      KPOW   = NCLOC
      JPOW   = 2
      DO I=0,LGP-1
	IUPPER = MOD(ILSHFT,JPOW)*NCOMM
	JPOW   = JPOW*2
	KPOW   = KPOW*2
        DO J=0,NCOMM-1
	  INDX = IUPPER + J
	  TRIGS(INVDIST+ICOUNT) = CROOTU(-INDX,KPOW)
          ICOUNT = ICOUNT + 1
        ENDDO
      ENDDO
C
C     Calculate twiddle factors for sequential phase of inverse real
C     transform.
      ICOUNT = 0
      INCREM = 1
      DO I=0,ILOC-1
	KPOW   = 2*INCREM
	DO K=0,INCREM-1
          TRIGS(INVSEQ+ICOUNT) = CROOTU(-K,KPOW)
	  ICOUNT = ICOUNT + 1
        ENDDO
        INCREM = INCREM*2
      ENDDO
C
C     Calculate twiddle factors for distributed phase of forward real
C     transform.
      IPOW = NPHALF
      JPOW = 0
      INDX = 0
      DO I=LGP-1,0,-1
	KPOW = P/IPOW
	INDX = INDX + JPOW*IPBITS(I)
	TRIGS(FWDDIST+I) = CROOTU(INDX,KPOW)
	JPOW = JPOW*2
	IF (I .EQ. (LGP-1)) JPOW = 1
	IPOW = IPOW/2
      ENDDO
C
C     Calculate twiddle factors for sequential phase of forward real
C     transform.
      ICOUNT = 0
      MBIT   = 0
      IPOW   = 1
      KPOW   = 2*P
      DO I=ILOC-1,0,-1
	DO K=0,IPOW-1
          INDX   = IRSREV + P*BITREV(K,MBIT)
	  TRIGS(FWDSEQ+ICOUNT) = CROOTU(INDX,KPOW)
	  ICOUNT = ICOUNT + 1
        ENDDO
        KPOW = KPOW*2
	IPOW = IPOW*2
	MBIT = MBIT + 1
      ENDDO
C
C     Calculate twiddle factors for fix-up phases of forward (FWDFIX)
C     and inverse (INVFIX) real transform, respectively.
      ICOUNT = 0
      ZERO = 0.0
      ONE  = 1.0
      DO I=0,NCOMM-1
        IRIND  = ILSHFT*NCLOC + 2*I
	INDX   = BITREV(IRIND,ITOT)
	TRIGS(FWDFIX+ICOUNT) = CMPLX(ZERO,ONE)*CROOTU(INDX,N)
        TRIGS(INVFIX+ICOUNT) = CMPLX(ZERO,ONE)*CROOTU(-INDX,N)
        ICOUNT = ICOUNT + 1
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
       INTEGER FUNCTION MDEX(K, PID, NPROCS, NLOCAL)
C
C MDEX returns the true (global, ordered) index of a Fourier coefficient
C given its local (unordered) index K on processor PID in a subset of
C NPROCS processors. If K is not a legal local Fourier coefficient
C index, then MDEX returns -1.
C
C called by: ALGINP
C calls: LOG2, BITREV
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C local Fourier index
      INTEGER K
C (relative) processor index
      INTEGER PID
C number of processors collaborating in distributed real Fourier
C transform
      INTEGER NPROCS
C number of local Fourier coefficients
      INTEGER NLOCAL
C
C---- Local Variables --------------------------------------------------
C
C left circular shift of PID
      INTEGER ILSHFT
C total number of (global) Fourier coefficients
      INTEGER N
C log_2(N)
      INTEGER LOGN
C largest even value less than or equal to K
      INTEGER KT
C
C---- External Functions -----------------------------------------------
C
C integer base 2 logarithm
      EXTERNAL LOG2
      INTEGER LOG2
C
C returns bit reversed values
      EXTERNAL BITREV
      INTEGER BITREV
C
C---- Executable Statements --------------------------------------------
C
C     Calculate circular left shift of PID.
      IF (PID .NE. 0) THEN
        ILSHFT  = 2*MOD(PID, (NPROCS/2)) + PID/(NPROCS/2)
      ELSE
        ILSHFT = 0
      ENDIF
C
C     Calculate total number of Fourier coefficients, and its logarithm.
      N    = NLOCAL*NPROCS
      LOGN = LOG2(N)
C
      IF (K .EQ. NLOCAL) THEN
C       Unless on processor 0, NLOCAL is not a legal local index. On
C       processor 0, it represents the largest Fourier wavenumber.
        MDEX = -1
        IF (PID .EQ. 0) MDEX = N
      ELSEIF ((K .NE. 1) .OR. (PID .NE. 0)) THEN
C       Calculate MDEX for current distributed real FFT algorithm. See
C       paper by Walker, Worley, and Drake for fuller explanation of
C       permutation.
        KT   = 2*(K/2)
	MDEX = BITREV(KT+NLOCAL*ILSHFT,LOGN)
        IF (KT .NE. K) MDEX = N - MDEX
      ELSE
C       On processor 0, K=1 is the "middle" wavenumber.
        MDEX = N/2
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
       INTEGER FUNCTION BITREV(I, NBITS)
C
C This function returns the NBITS bit reversal of the integer I.
C
C called by: RFTFAX, MDEX
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C integer to be bit reversed
      INTEGER I
C number of bits over which to reverse I
      INTEGER NBITS
C
C---- Local Variables --------------------------------------------------
C
C temporary used to isolate successive bits in the binary representation
C of I 
      INTEGER TEMP
C power of two
      INTEGER IPOW
C loop index
      INTEGER J
C                                                                             
C---- Executable Statements --------------------------------------------
C
      BITREV = 0
      TEMP   = I
      IPOW   = 2**(NBITS-1)
      DO J=1,NBITS
        BITREV = BITREV + IPOW*MOD(TEMP,2)
        TEMP = TEMP/2
        IPOW = IPOW/2
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
       COMPLEX FUNCTION CROOTU(I, N)
C
C This function returns the value of the -Ith "Nth root of unity", i.e.
C the complex exponential whose argument is -2*PI*SQRT(-1)*(I/N). It is
C used in RFTFAX to generate the twiddle factors used in the real
C Fourier tranaform. 
C
C called by: RFTFAX
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C (unscaled) numerator of argument to complex exponential
      INTEGER I
C denominator of argument to complex exponential
      INTEGER N
C
C---- Local Variables --------------------------------------------------
C
C angle of the specified root
      DOUBLE PRECISION THETA
C                                                                             
C---- Executable Statements --------------------------------------------
C
C     2*pi*i/n
      THETA  = (8.0D0*ATAN(1.0D0)*I)/N
      CROOTU = CMPLX(COS(THETA), -SIN(THETA))
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

