C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      INTEGER FUNCTION ALGINP()
C                                                                              
C This subroutine inputs or determines logical machine and parallel
C algorithm parameters.
C                                                                              
C called by: INPUT
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C     
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C     
C---- Common Blocks ----------------------------------------------------
C     
C machine architecture information
#     include "machine.i"
C problem resolution information
#     include "problem.i"
C parallel algorithm information
#     include "algorithm.i"
C domain decomposition information
#     include "physical.i"
#     include "fourier.i"
#     include "spectral.i"
C
C---- Local Variables --------------------------------------------------
C
C flag indicating existence of input file
      LOGICAL ALGTEST
C communication buffers
      INTEGER IBUF(20)
C process/processor mapping option for logical ring
      INTEGER RINGOPT
C variables for calculating processor mappings
      INTEGER PX, PY, INDEX_I, INDEX_J, INDEX_ME
      INTEGER MAP2D(0:NPROCSX-1), MAP1D(0:NPROCSX-1)
C offsets used to determine mapping from local latitude and
C longitude indices to global indices
      INTEGER LATBASE, LONBASE
C half the number of local latitudes
      INTEGER NLLATH
C indices used in determining load balancing ordering of Fourier
C wavenumbers
      INTEGER NFCLIM1, NFCLIM2, FIRST, SECOND, NXTDEX, BEGDEX, ENDDEX
C array containing load balancing ordering of Fourier wavenumbers
      INTEGER NEWORDER(NFCX)
C Fourier wavenumber, local (untruncated) wavenumber index, processor
C index, and spectral coefficient index
      INTEGER M, IK, P, L
C local (truncated) indices for M=0 and M=1 Fourier coefficients
      INTEGER JM0, JM1
C MTRUE_F/S index for next wavenumber to be truncated, used when 
C determining local Fourier coefficients
      INTEGER NTMM
C variables used to calculate number of local spectral coefficients
      INTEGER NTMP1, NTMP2
C polynomial degree and wavenumber (loop) indices
      INTEGER JN, JM
C polynomial degree transition index and index value denoting beginning
C of local spectral coefficients, used when computing the partition of 
C spectral coefficients
      INTEGER JNTRNS, JNME
C work array for shifting spectral domain decomposition
      INTEGER JMTMP(NPROCSX+MMX+1)
C variables used to calculate how much space must be allocated to 
C allow physical, Fourier, and spectral arrays to fit into the
C same memory
      INTEGER PFACTOR, FFACTOR, SFACTOR
C log_2 and (2**(log_2 P)) of a given number of processors
      INTEGER LGP, IPX
C flag indicating whether to use a regular or irregular O(LG(P)) 
C transpose algorithm 
      LOGICAL REG_LGPFFT, REG_LGPIFT, REG_LGPFLT, REG_LGPILT
C number of logical buffers used in O(LG(P)) transpose algorithm.
      INTEGER MXBUF
C standard size (in REALS) of messages used in parallel algorithms
      INTEGER BLOCKSZ
C integer indicators of how much system space is required to prevent 
C various blocking send/receive patterns from deadlocking. 
      INTEGER FFTMSGS, FFTVOL, IFTMSGS, IFTVOL, FLTMSGS, FLTVOL,
     &        ILTMSGS, ILTVOL
C other loop indices and counters
      INTEGER I, J
C base for message types used in broadcasting problem parameters
      INTEGER BASE
C
C---- External Functions ----------------------------------------------
C
C base for message types
      EXTERNAL MSGBASE
      INTEGER MSGBASE
C logical machine mapping functions
      EXTERNAL RING_MAP
      INTEGER RING_MAP
C FFT reordering description
      EXTERNAL MDEX
      INTEGER MDEX
C log base 2 function
      EXTERNAL LOG2
      INTEGER LOG2
C                                                                            
C---- Executable Statements --------------------------------------------
C
      BASE = MSGBASE()
      IF (ME .EQ. 0) THEN
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C       Initialize logical machine parameters.
C       See common block /MACHINE/ (in machine.i) for definitions.
        PX = 1
        PY = 1
        MESHOPT = 1
        RINGOPT = 1
C
C       Initialize parallel algorithm parameters.
C       See common block /ALGORITHM/ (in algorithm) for definitions.
        FTOPT     = 0
        LTOPT     = 0
        COMMFFT   = 1
        COMMIFT   = 1
        COMMFLT   = 1
        COMMILT   = 1
        BUFSFFT   = 0
        BUFSIFT   = 0
        BUFSFLT   = 0
        BUFSILT   = 0
        PROTFFT   = 6
        PROTIFT   = 6
        PROTFLT   = 6
        PROTILT   = 6
        SUMOPT    = 0
        EXCHSIZE  = 1
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C                                                                              
C       Read algorithm input data.
        ALGTEST = .FALSE.
        OPEN(8, ERR=999, FILE=ALGINPUT, STATUS='OLD')
        ALGTEST = .TRUE.
  999   IF (ALGTEST) THEN
C
          WRITE(6,*) 'READING ALGORITHM PARAMETERS FROM FILE ', 
     &               ALGINPUT,':'
C
C         problem size parameters
          READ(8,*) PX
          READ(8,*) PY
          READ(8, *, END=1000) MESHOPT
          READ(8, *, END=1000) RINGOPT
C
C         algorithm parameters
          READ(8, *, END=1000) FTOPT
          READ(8,*) LTOPT
          READ(8, *, END=1000) COMMFFT
          READ(8,*) COMMIFT
          READ(8,*) COMMFLT
          READ(8,*) COMMILT
          READ(8, *, END=1000) BUFSFFT
          READ(8,*) BUFSIFT
          READ(8,*) BUFSFLT
          READ(8,*) BUFSILT
          READ(8, *, END=1000) PROTFFT
          READ(8,*) PROTIFT
          READ(8,*) PROTFLT
          READ(8,*) PROTILT
          READ(8, *, END=1000) SUMOPT
          READ(8, *, END=1000) EXCHSIZE
C
C         Close input file.
 1000     CLOSE(8)
C
        ELSE
C
          WRITE(6,*) 'USING DEFAULT ALGORITHM PARAMETERS'
C
        ENDIF
C
C       Send input parameters to other processors.
        IF (NPROCS .GT. 1) THEN
C
          IBUF(1)  = PX
          IBUF(2)  = PY
          IBUF(3)  = MESHOPT
          IBUF(4)  = RINGOPT
          IBUF(5)  = FTOPT
          IBUF(6)  = LTOPT
          IBUF(7)  = COMMFFT
          IBUF(8)  = COMMIFT
          IBUF(9)  = COMMFLT
          IBUF(10) = COMMILT
          IBUF(11) = BUFSFFT
          IBUF(12) = BUFSIFT
          IBUF(13) = BUFSFLT
          IBUF(14) = BUFSILT
          IBUF(15) = PROTFFT
          IBUF(16) = PROTIFT
          IBUF(17) = PROTFLT
          IBUF(18) = PROTILT
          IBUF(19) = SUMOPT
          IBUF(20) = EXCHSIZE
          CALL SETDATA0F('INTEGER')
          CALL BCAST0F(IBUF, IBYTES*20, BASE, 0)
C
        ENDIF
C
      ELSE
C
C       Get input parameters from node 0.
        CALL SETDATA0F('INTEGER')
        CALL BCAST0F(IBUF, IBYTES*20, BASE, 0)
        PX        = IBUF(1)
        PY        = IBUF(2)
        MESHOPT   = IBUF(3)
        RINGOPT   = IBUF(4)
        FTOPT     = IBUF(5)
        LTOPT     = IBUF(6)
        COMMFFT   = IBUF(7)
        COMMIFT   = IBUF(8)
        COMMFLT   = IBUF(9)
        COMMILT   = IBUF(10)
        BUFSFFT   = IBUF(11)
        BUFSIFT   = IBUF(12)
        BUFSFLT   = IBUF(13)
        BUFSILT   = IBUF(14)
        PROTFFT   = IBUF(15)
        PROTIFT   = IBUF(16)
        PROTFLT   = IBUF(17)
        PROTILT   = IBUF(18)
        SUMOPT    = IBUF(19)
        EXCHSIZE  = IBUF(20)
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Check consistency of logical machine parameters
C     and calculate neighbor information.
C
      IF (NPROCS .NE. PY*PX) THEN
        IF (ME .EQ. 0) WRITE(0,641) NPROCS, PX, PY
  641   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' PX*PY NOT EQUAL TO NPROCS',/,
     &          ' NPROCS = ',I4,' PX = ',I4,' PY = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
C     Initialize logical-to-physical processor mappings.
      CALL MESH_MAP_INIT(MESHOPT, PX, PY, MAP2D)
      CALL RING_MAP_INIT(RINGOPT, NPROCS, MAP1D)
C
C     Calculate i,j coordinates of processor.
      CALL MESH_INDEX(ME, PX, PY, MAP2D, INDEX_I, INDEX_J)
C
C     Calculate location in big ring.
      CALL RING_INDEX(ME, NPROCS, MAP1D, INDEX_ME)
      PREV = RING_MAP(INDEX_ME+1, NPROCS, MAP1D)
      NEXT = RING_MAP(INDEX_ME-1, NPROCS, MAP1D)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Calculate and check consistency of domain decomposition.
C
C     1) Define domain decomposition in physical space.
C
C     allocation of processors to decomposition
      NPLON_P = PX
      CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPLON_P)
C
      NPLAT_P = PY
      CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_P)
C
      NPVER_P = 1
      MAPVER_P(0) = ME
C
C     indices of current processor in this mapping
      LONDEX_P = INDEX_I
      LATDEX_P = INDEX_J
      VERDEX_P = 0
C
C     Longitude
C
C     Decomposition should assign at least one longitude per processor.
      IF (NLON .LT. NPLON_P) THEN
        IF (ME .EQ. 0) WRITE(0, 651) NLON, NPLON_P
  651   FORMAT(' NPLON_P IS LARGER THAN NLON',/,
     &         ' NLON = ',I4,' NPLON_P = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      NTMP1 = NLON/NPLON_P
      NTMP2 = MOD(NLON,NPLON_P)
      IF (NTMP2 .GT. 0) THEN
        MXLLON_P = NTMP1 + 1
      ELSE
        MXLLON_P = NTMP1
      ENDIF
      DO I=0,NTMP2-1
        NDLON_P(I) = NTMP1+1
      ENDDO
      DO I=NTMP2,NPLON_P-1
        NDLON_P(I) = NTMP1
      ENDDO
C
      NLLON_P = NDLON_P(LONDEX_P)
C
      LONBASE = 0
      DO I=0,LONDEX_P-1
        LONBASE = LONBASE + NDLON_P(I)
      ENDDO
      DO I=1,NLLON_P
         LONTRUE_P(I) = LONBASE + I
      ENDDO
C
C     Latitude
C
      IF (LTOPT .EQ. 0) THEN
C       Using a distributed Legendre transform algorithm, so
C       decomposition should assign pairs (north/south) of latitudes to 
C       processors, with at least one pair in each processor.
C
        IF (NLAT/2 .LT. NPLAT_P) THEN
          IF (ME .EQ. 0) WRITE(0, 653) NLAT, NPLAT_P
  653     FORMAT(' NPLAT_P IS LARGER THAN NLAT/2',/,
     &           ' NLAT = ',I4,' NPLAT_P = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        NLLATH = (NLAT/2)/NPLAT_P
        NTMP2 = MOD(NLAT/2,NPLAT_P)
        IF (NTMP2 .GT. 0) THEN
          MXLLAT_P = 2*(NLLATH + 1)
        ELSE
          MXLLAT_P = 2*NLLATH
        ENDIF
        IF (LATDEX_P .LT. NTMP2) THEN
          NLLATH = NLLATH+1
          LATBASE = LATDEX_P*NLLATH
        ELSE
          LATBASE = NTMP2*(NLLATH+1) + (LATDEX_P-NTMP2)*NLLATH
        ENDIF
        NLLAT_P = 2*NLLATH
C
        DO I=1,NLLATH
          LATTRUE_P(I)             = LATBASE + I
          LATTRUE_P(NLLAT_P-(I-1)) = (NLAT-LATBASE) - (I-1)
        ENDDO
C
      ELSE
C
C       Using a transpose/serial Legendre transform algorithm, so
C       decomposition should assign at least one latitude to each
C       processor.
C
        IF (NLAT .LT. NPLAT_P) THEN
          IF (ME .EQ. 0) WRITE(0, 654) NLAT, NPLAT_P
  654     FORMAT(' NPLAT_P IS LARGER THAN NLAT',/,
     &           ' NLAT = ',I4,' NPLAT_P = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        NLLAT_P  = NLAT/NPLAT_P
        NTMP2    = MOD(NLAT,NPLAT_P)
        IF (NTMP2 .GT. 0) THEN
          MXLLAT_P = NLLAT_P + 1
        ELSE
          MXLLAT_P = NLLAT_P
        ENDIF
        IF (LATDEX_P .LT. NTMP2) THEN
          NLLAT_P = NLLAT_P+1
          LATBASE = LATDEX_P*NLLAT_P
        ELSE
          LATBASE = NTMP2*(NLLAT_P+1) + (LATDEX_P-NTMP2)*NLLAT_P
        ENDIF
C
        DO I=1,NLLAT_P
           LATTRUE_P(I) = LATBASE + I
        ENDDO
C
      ENDIF
C
C     Vertical
C
C     Decomposition should assign at least one vertical layer per 
C     processor.
      IF (NVER .LT. NPVER_P) THEN
        IF (ME .EQ. 0) WRITE(0,655) NVER, NPVER_P
  655   FORMAT(' TOO FEW VERTICAL LEVELS',/,
     &         ' NVER = ',I4,' NPVER_P = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      NTMP1 = NVER/NPVER_P
      NTMP2 = MOD(NVER,NPVER_P)
      IF (NTMP2 .GT. 0) THEN
        MXLVER_P = NTMP1 + 1
      ELSE
        MXLVER_P = NTMP1
      ENDIF
      DO I=0,NTMP2-1
        NDVER_P(I) = NTMP1+1
      ENDDO
      DO I=NTMP2,NPVER_P-1
        NDVER_P(I) = NTMP1
      ENDDO
C
      NLVER_P = NDVER_P(VERDEX_P)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     2) Define domain decomposition for Fourier and 
C     physical spaces when transforming between them.
C
      IF (FTOPT .EQ. 0) THEN
C       Using distributed FFT
C
C       Check that PX is a power of two.
        LGP = LOG2(PX)
        IPX = 2**LGP
        IF (PX .NE. IPX) THEN
          IF (ME .EQ. 0) WRITE(0,644) PX
  644     FORMAT(' PX NOT A POWER OF TWO',/,
     &           ' PX = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
C       allocation of processors to decomposition
        NPFC_F = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_F)
C
        NPLAT_F = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_F)
C
        NPVER_F = 1
        MAPVER_F(0) = ME
C
C       indices of current processor in this mapping
        FCDEX_F  = INDEX_I
        LATDEX_F = INDEX_J
        VERDEX_F = 0
C
      ELSE
C       Using transpose and serial FFT
C
C       If using O(log P) tranpose, check that PX is a power of two.
        IF((COMMFFT .GE. 20) .OR. (COMMIFT .GE. 20)) THEN
          LGP = LOG2(PX)
          IPX = 2**LGP
          IF (PX .NE. IPX) THEN
            IF (ME .EQ. 0) WRITE(0,645) PX
  645       FORMAT(' PX NOT A POWER OF TWO',/,
     &             ' PX = ',I4)
            ALGINP = -1
            RETURN
          ENDIF
        ENDIF
C
C       allocation of processors to decomposition
        NPFC_F = 1
        MAPFC_F(0) = ME
C
        NPLAT_F = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_F)
C
        NPVER_F = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_F)
C
C       indices of current processor in this mapping
        FCDEX_F = 0
        LATDEX_F = INDEX_J
        VERDEX_F = INDEX_I
C
      ENDIF
C
C     Longitude
C
C     Decomposition should assign at least one longitude per processor.
      IF (NLON .LT. NPFC_F) THEN
        IF (ME .EQ. 0) WRITE(0, 660) NLON, NPFC_F
  660   FORMAT(' NPFC_F IS LARGER THAN NLON',/,
     &         ' NLON = ',I4,' NPFC_F = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      NTMP1 = NLON/NPFC_F
      NTMP2 = MOD(NLON,NPFC_F)
      DO I=0,NTMP2-1
        NDLON_F(I) = NTMP1+1
      ENDDO
      DO I=NTMP2,NPFC_F-1
        NDLON_F(I) = NTMP1
      ENDDO
C
      NLLON_F = NDLON_F(FCDEX_F)
C
C     For distributed FFT algorithm need even partition with power of
C     two longitudes and power of two number of processors. Also need
C     at least four local longitudes per processor.
      IF (FTOPT .EQ. 0) THEN
C
        IF (NLON .NE. (NPFC_F*NLLON_F)) THEN
          IF (ME .EQ. 0) WRITE(0, 661) NLON, NPFC_F
  661     FORMAT(' NPFC_F DOES NOT DIVIDE NLON EVENLY',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (NLLON_F .NE. (2*(NLLON_F/2))) THEN
          IF (ME .EQ. 0) WRITE(0, 662) NLON, NPFC_F
  662     FORMAT(' 2 DOES NOT DIVIDE NLON/NPFC_F EVENLY',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (NLLON_F .LT. 4) THEN
          IF (ME .EQ. 0) WRITE(0, 663) NLON, NPFC_F
  663     FORMAT(' NLON/NPFC_F MUST BE AT LEAST 4 FOR PARALLEL FFT',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
      ENDIF
C
C     Latitude
C
      IF (LTOPT .EQ. 0) THEN
C       Using a distributed Legendre transform algorithm, so
C       decomposition should assign pairs (north/south) of latitudes to 
C       processors, with at least one pair in each processor.
C
        IF (NLAT/2 .LT. NPLAT_F) THEN
          IF (ME .EQ. 0) WRITE(0, 664) NLAT, NPLAT_F
  664     FORMAT(' NPLAT_F IS LARGER THAN NLAT/2',/,
     &           ' NLAT = ',I4,' NPLAT_F = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        NTMP1 = (NLAT/2)/NPLAT_F
        NTMP2 = MOD(NLAT/2,NPLAT_F)
        IF (NTMP2 .GT. 0) THEN
          MXLLAT_F = 2*(NTMP1 + 1)
        ELSE
          MXLLAT_F = 2*NTMP1
        ENDIF
        DO I=0,NTMP2-1
          NDLAT_F(I) = 2*(NTMP1+1)
        ENDDO
        DO I=NTMP2,NPLAT_F-1
          NDLAT_F(I) = 2*NTMP1
        ENDDO
C
        NLLAT_F  = NDLAT_F(LATDEX_F)
C
      ELSE
C
C       Using a transpose/serial Legendre transform algorithm, so
C       decomposition should assign at least one latitude to each
C       processor.
C
        IF (NLAT .LT. NPLAT_F) THEN
          IF (ME .EQ. 0) WRITE(0, 665) NLAT, NPLAT_F
  665     FORMAT(' NPLAT_F IS LARGER THAN NLAT',/,
     &           ' NLAT = ',I4,' NPLAT_F = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        NTMP1 = NLAT/NPLAT_F
        NTMP2 = MOD(NLAT,NPLAT_F)
        IF (NTMP2 .GT. 0) THEN
          MXLLAT_F = NTMP1 + 1
        ELSE
          MXLLAT_F = NTMP1
        ENDIF
        DO I=0,NTMP2-1
          NDLAT_F(I) = NTMP1+1
        ENDDO
        DO I=NTMP2,NPLAT_F-1
          NDLAT_F(I) = NTMP1
        ENDDO
C
        NLLAT_F  = NDLAT_F(LATDEX_F)
C
      ENDIF
C
C     Vertical
C
      IF (FTOPT .EQ. 2) THEN
C       Using a double transpose FFT algorithm, so decomposing over 
C       fields and local latitudes as well as levels when undecomposing
C       longitudes, then going back to decomposition over wavenumbers 
C       before beginning parallel Legendre algorithm. For simplicity, 
C       "abusing" variable naming and using NDVER_F, NLVER_F, and 
C       MXLVER_F for the number of local independent FFTs (i.e., fields,
C       vertical levels, and local latitudes). These are calculated 
C       during the FFT since the number of species can vary. To get 
C       correct work array size declarations, need not worry about, e.g.,
C        MXLLON_P*NLVER_P*NLLAT_P .EQ. MXLLON_F*NLVER_F*NLLAT_F,
C       since decomposing over species also, so only worry about, e.g.,
C        MXLLON_P*NLVER_P*NLLAT_P*NSPECIES .GE. 
C         MXLLON_F*(NLVER_P*NLLAT_P*NSPECIES)/NPVER_F
C       so that other array blocks or field timesteps are not 
C       overwritten. To do this, need to pick NLVER_F (and MXLVER_F) so
C       that
C        NLVER_F*NLLAT_F*NSPECIES .GE. NLVER_P*NLLAT_P*NSPECIES/NPVER_F
C       for all NSPECIES, currently 1, 2, 3, 5, and 8.
        NLVER_F = MAX(
     &            ((NVER*NLLAT_P)/NPVER_F + 1)/(NLLAT_F) + 1,
     &            ((NVER*NLLAT_P*2)/NPVER_F + 1)/(NLLAT_F*2) + 1,
     &            ((NVER*NLLAT_P*3)/NPVER_F + 1)/(NLLAT_F*3) + 1,
     &            ((NVER*NLLAT_P*5)/NPVER_F + 1)/(NLLAT_F*5) + 1,
     &            ((NVER*NLLAT_P*8)/NPVER_F + 1)/(NLLAT_F*8) + 1)
C
        MXLVER_F = NLVER_F
C
      ELSE
C       Otherwise, decomposing (or not decomposing) over vertical 
C       dimension in standard way.
C
        NTMP1 = NVER/NPVER_F
        NTMP2 = MOD(NVER,NPVER_F)
        IF (NTMP2 .GT. 0) THEN
          MXLVER_F = NTMP1 + 1
        ELSE
          MXLVER_F = NTMP1
        ENDIF
        DO I=0,NTMP2-1
          NDVER_F(I) = NTMP1+1
        ENDDO
        DO I=NTMP2,NPVER_F-1
          NDVER_F(I) = NTMP1
        ENDDO
C
        NLVER_F = NDVER_F(VERDEX_F)
C
      ENDIF
C
C     Wavenumber
C
      DO M=0,NLON/2
        MTINV_F(M) = -1
      ENDDO
C     (Note: either NPFC_F .EQ. 1 (transpose FFT) or NPFC_F divides 
C     NLON/2 evenly (distributed FFT))
      NLFC_F  = ((NLON/2)/NPFC_F) + 1
      MXLFC_F = NLFC_F
      DO IK=0,NLFC_F-1
C       Get the "true" index of the "ik"th fc index on a processor
C       with fcdex index FCDEX_F (particular to this distributed FFT
C       when NPFC_F > 1, otherwise a standard unordered FFT).
        M = MDEX(IK, FCDEX_F, NPFC_F, NLFC_F-1)
        MTRUE_F(IK+1) = M
        IF (M .GE. 0) MTINV_F(M) = IK + 1
      ENDDO
C
C     NDFC_F refers to number of packed Fourier coefficients, so
C     extra space for NFC-1 in NLFC_F is not necessary.
      DO I=0,NPFC_F-1
        NDFC_F(I) = NLFC_F-1
      ENDDO
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     3) Define domain decomposition for spectral and Fourier spaces
C        when transforming between them.
C
      IF ((FTOPT .EQ. 0) .AND. (LTOPT .EQ. 0)) THEN
C       Using distributed FFT and distributed Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S   = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPSPEC_S)
C
        NPVER_S  = 1
        MAPVER_S(0) = ME
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_I
        SPECDEX_S  = INDEX_J
        VERDEX_S   = 0
C
      ELSEIF ((FTOPT .EQ. 1) .AND. (LTOPT .EQ. 0)) THEN
C       Using transpose/serial FFT and distributed Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S = 1
        MAPFC_S(0) = ME
C
        NPSPEC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPSPEC_S)
C
        NPVER_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S = 0
        SPECDEX_S = INDEX_J
        VERDEX_S = INDEX_I
C
      ELSEIF ((FTOPT .EQ. 0) .AND. (LTOPT .EQ. 1)) THEN
C       Using distributed FFT and transpose/serial Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = 1
        MAPSPEC_S(0) = ME
C
        NPVER_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_I
        SPECDEX_S  = 0
        VERDEX_S   = INDEX_J
C
      ELSEIF ((FTOPT .EQ. 1) .AND. (LTOPT .EQ. 1)) THEN
C       Using transpose/serial FFT and transpose/serial Legendre 
C       transform.
C
C       allocation of processors to decomposition
        NPFC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = 1
        MAPSPEC_S(0) = ME
C
        NPVER_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_J
        SPECDEX_S  = 0
        VERDEX_S   = INDEX_I
C
      ELSEIF ((FTOPT .EQ. 2) .AND. (LTOPT .EQ. 0)) THEN
C       Using double transpose distributed FFT and distributed
C       Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S   = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPSPEC_S)
C
        NPVER_S  = 1
        MAPVER_S(0) = ME
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_I
        SPECDEX_S  = INDEX_J
        VERDEX_S   = 0
C
      ENDIF
C
C     Differentiate between overlapped and non-overlapped 
C     distributed vector sum algorithms.
      IF ((LTOPT .EQ. 0) .AND. (COMMFLT .LT. 10)) THEN
        NLTSTEPS = NPSPEC_S
      ELSE
        NLTSTEPS = 1
      ENDIF
C
C     Wavenumber
C
C     Common initializations for determining local wavenumbers in
C     spectral decomposition
      DO M=0,NLON/2
        MTINV_S(M) = -1
      ENDDO
      JM0 = -1
      JM1 = -1
      IM0_S   = -1
      IMNFC_S = -1
C
      IF (((FTOPT .EQ. 1).AND.(LTOPT .EQ. 1).AND.(NPFC_S .GT. 1)) .OR.
     &    ((FTOPT .EQ. 2).AND.(NPFC_S .GT. 1))) THEN
C       Using transpose/serial FFT and transpose/serial Legendre 
C       transform or double transpose/serial FFT. Thus all wavenumbers 
C       are local in decomposed Fourier grid initially, but next 
C       transpose will decompose over the wavenumber dimension. Reorder
C       the wavenumbers first to load balance the subsequent Legendre 
C       transform.
C
C       Calculate ordering of untruncated wavenumbers to balance 
C       assignment of long and short columns of spectral coefficients:
C       Assign wavenumbers 0-(P-1) to processor rows 
C       (P-1)-0, wavenumbers P-(2P-1) to rows 0-(P-1), etc., 
C       changing direction of wrap map every P wavenumbers. 
C       (Ordering courtesy of S. Barros and T. Kauranne.) 
C       The last wrap is modified to "left justify" the last row
C       of wavenumbers, assigning them to lower numbered processors.
C       Note that wavenumber NFC-1 is not assigned, since it is 
C       packed with wavenumber 0 (when not truncated).
        DO I=0,NPFC_S-1
          NDFC_S(I) = 0
        ENDDO
        NXTDEX = 0
        IF (NFC-1 .LE. MM) THEN
          NFCLIM1 = NFC-1 - MOD(NFC-1,NPFC_S)
          NFCLIM2 = NFC-2
        ELSE
          NFCLIM1 = MM - MOD(MM,NPFC_S)
          NFCLIM2 = MM
        ENDIF
        DO P=0,NPFC_S-1
C
          IF (P .EQ. FCDEX_S) BEGDEX = NXTDEX
          FIRST  = (NPFC_S-1) - P
          SECOND = NPFC_S + P
          DO WHILE (FIRST .LT. NFCLIM1)
C
            NXTDEX = NXTDEX + 1
            NEWORDER(NXTDEX) = FIRST
            NDFC_S(P) = NDFC_S(P) + 1
            FIRST = FIRST + 2*NPFC_S
C
            IF (SECOND .LT. NFCLIM1) THEN
              NXTDEX = NXTDEX + 1
              NEWORDER(NXTDEX) = SECOND
              NDFC_S(P) = NDFC_S(P) + 1
              SECOND = SECOND + 2*NPFC_S
            ENDIF
C
          ENDDO
          IF (NFCLIM2 .GE. NFCLIM1) THEN
            NXTDEX = NXTDEX + 1
            NEWORDER(NXTDEX) = NFCLIM2
            NDFC_S(P) = NDFC_S(P) + 1
            NFCLIM2 = NFCLIM2 - 1
          ENDIF
          IF (P .EQ. FCDEX_S) ENDDEX = NXTDEX
C
        ENDDO
C      
C       Finish definition of NEWORDER by assigning truncated wavenumbers
C       to the end.
        IF (MM .LT. NFC-1) THEN
          DO I=MM+1,NFC-1
            NXTDEX = NXTDEX + 1
            NEWORDER(NXTDEX) = I
          ENDDO
        ENDIF
C
C       Determine untrucated local wavenumbers in spectral 
C       decomposition.
        NLFC_S  = NDFC_S(FCDEX_S)
        NLMM_S  = NLFC_S
        MXLFC_S = NDFC_S(0)
        DO I=1,NPFC_S-1
          IF (MXLFC_S .LT. NDFC_S(I)) MXLFC_S = NDFC_S(I)
        ENDDO
        DO IK=1,NLFC_S
C         Get the "true" index of the "ik"th fc index on a processor
C         with fcdex index FCDEX_S
          M = NEWORDER(BEGDEX+IK)
          MTRUE_S(IK) = M
          MTINV_S(M) = IK
          JMLTRUE_S(IK) = IK
          IF (M .EQ. 0) JM0   = IK
          IF (M .EQ. 1) JM1   = IK
          IF (M .EQ. 0) IM0_S = IK
        ENDDO
C
        IF (NFC-1 .LE. MM) THEN
C         Add wavenumber NFC-1 to the end. (Transpose works with 
C         "packed" array, where NFC-1 is packed into the imaginary part
C          of wavenumber 0. After tranpose is complete, NFC-1 is 
C         unpacked into the last location on processor NPFC_S-1.)
          NLFC_S = NLFC_S + 1
          IF (FCDEX_S .EQ. NPFC_S-1) THEN
            NLMM_S = NLFC_S
            MTRUE_S(NLFC_S) = NFC-1
            MTINV_S(NFC-1)  = NLFC_S
            IMNFC_S         = NLFC_S
          ELSE
            MTRUE_S(NLFC_S) = -1
          ENDIF
          JMLTRUE_S(NLFC_S) = NLFC_S
C
        ENDIF
C
C       Calculate how to reorder (packed) wavenumbers.
        DO NXTDEX=1,NFC-1
          ORDFWD_S(MTINV_F(NEWORDER(NXTDEX))) = NXTDEX
          ORDINV_S(NXTDEX) = MTINV_F(NEWORDER(NXTDEX))
        ENDDO
C
      ELSEIF ((FTOPT .EQ. 0).AND.(LTOPT .EQ. 1).AND.(NPVER_S .GT. 1)) 
     &                                                              THEN
C       Using distributed FFT and transpose/serial Legendre 
C       transform. The transpose decomposes over the vertical
C       dimension. By reordering the local wavenumbers first, the
C       truncated wavenumbers need not be transposed.
C
C       Determine local wavenumbers in spectral decomposition 
C       (both truncated and untruncated).
C
C       Order local wavenumbers so that all untruncated wavenumbers are
C       first, and only wavenumber NFC-1 and fictitious wavenumbers are
C       located in location NLFC-1: 
C       a) untruncated first,
        NLMM_S = 0
        DO IK=0,NLFC_F-1
C         Get the "true" index of the "ik"th fc index on a processor
C         with fcdex index FCDEX_S (particular to this distributed FFT
C         when NPFC_S > 1, otherwise a standard unordered FFT).
          M = MDEX(IK, FCDEX_S, NPFC_S, NLFC_F-1)
          IF (M .LE. MM .AND. M .GE. 0) THEN
            NLMM_S = NLMM_S + 1
            MTRUE_S(NLMM_S) = M
            MTINV_S(M) = NLMM_S
            JMLTRUE_S(NLMM_S) = NLMM_S
            IF (M .EQ. 0) THEN
              JM0   = NLMM_S
              IM0_S = IK+1
            ENDIF
            IF (M .EQ. 1) JM1 = NLMM_S
            IF (M .EQ. NFC-1) IMNFC_S = IK+1
          ENDIF
        ENDDO
C       b) truncated next,
        NTMM = NLMM_S
        DO IK=0,NLFC_F-1
          M = MDEX(IK, FCDEX_S, NPFC_S, NLFC_F-1)
          IF (M .GT. MM) THEN
            NTMM = NTMM + 1
            MTRUE_S(NTMM) = M
            MTINV_S(M) = NTMM
            JMLTRUE_S(NTMM) = NTMM
          ENDIF
        ENDDO
C       c) fictitious last
        DO IK=0,NLFC_F-1
          M = MDEX(IK, FCDEX_S, NPFC_S, NLFC_F-1)
          IF (M .LT. 0) THEN
            NTMM = NTMM + 1
            MTRUE_S(NTMM) = M
            JMLTRUE_S(NTMM) = NTMM
          ENDIF
        ENDDO
C
C       Save number of local (truncated) wavenumbers for this
C       subset of processors
        NLFC_S  = NLMM_S
        MXLFC_S = NLFC_S
        IF (IMNFC_S .EQ. -1) THEN
          DO I=0,NPFC_S-1
            NDFC_S(I) = NLFC_S
          ENDDO
        ELSE
          DO I=0,NPFC_S-1
            NDFC_S(I) = NLFC_S-1
          ENDDO
        ENDIF
C
C       Calculate how to reorder (packed) wavenumbers.
C       (Note: fictitious wavenumbers only in location NLFC_F,
C        so MTRUE_S(NXTDEX) never .EQ. -1 in following loop.)
        DO NXTDEX=1,NLFC_F-1
          ORDFWD_S(MTINV_F(MTRUE_S(NXTDEX))) = NXTDEX
          ORDINV_S(NXTDEX) = MTINV_F(MTRUE_S(NXTDEX))
        ENDDO
C
      ELSE
C       Determine local wavenumbers in spectral decomposition 
C       (both truncated and untruncated) without reordering.
        NLFC_S  = ((NLON/2)/NPFC_S) + 1
        MXLFC_S = NLFC_S
        DO I=0,NPFC_S-1
          NDFC_S(I) = NLFC_S-1
        ENDDO
        NLMM_S = 0
        NTMM   = NLFC_S
        DO IK=0,NLFC_S-1
C         Get the "true" index of the "ik"th fc index on a processor
C         with fcdex index FCDEX_S (particular to this distributed FFT
C         when NPFC_S > 1, otherwise a standard unordered FFT).
          M = MDEX(IK, FCDEX_S, NPFC_S, NLFC_S-1)
          IF (M .LE. MM .AND. M .GE. 0 ) THEN
            NLMM_S = NLMM_S + 1
            MTRUE_S(NLMM_S) = M
            MTINV_S(M) = NLMM_S
            JMLTRUE_S(NLMM_S) = IK+1
            IF (M .EQ. 0) JM0 = NLMM_S
            IF (M .EQ. 1) JM1 = NLMM_S
          ELSE
            MTRUE_S(NTMM) = M
            IF (M .GE. 0) MTINV_S(M) = NTMM
            JMLTRUE_S(NTMM) = IK+1
            NTMM = NTMM - 1
          ENDIF
          IF (M .EQ. 0)     IM0_S   = IK+1
          IF (M .EQ. NFC-1) IMNFC_S = IK+1
        ENDDO
C
      ENDIF
C
C     Latitude
C
C     Decomposition should assign pairs (north/south) of latitudes to 
C     processors, with at least one pair in each processor.
      IF (NLAT/2 .LT. NPSPEC_S) THEN
        IF (ME .EQ. 0) WRITE(0, 671) NLAT, NPSPEC_S
  671   FORMAT(' NPSPEC_S IS LARGER THAN NLAT/2',/,
     &         ' NLAT = ',I4,' NPSPEC_S = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      NTMP1 = (NLAT/2)/NPSPEC_S
      NTMP2 = MOD(NLAT/2,NPSPEC_S)
      IF (NTMP2 .GT. 0) THEN
        MXLLAT_S = 2*(NTMP1 + 1)
      ELSE
        MXLLAT_S = 2*NTMP1
      ENDIF
      DO I=0,NTMP2-1
        NDLAT_S(I) = 2*(NTMP1+1)
      ENDDO
      DO I=NTMP2,NPSPEC_S-1
        NDLAT_S(I) = 2*NTMP1
      ENDDO
C
      NLLAT_S  = NDLAT_S(SPECDEX_S)
      NLLATH_S = NLLAT_S/2
C
      LATBASE = 0
      DO I=0,SPECDEX_S-1
        LATBASE = LATBASE + NDLAT_S(I)/2
      ENDDO
      DO I=1,NLLATH_S
        LATTRUE_S(I)             = LATBASE + I
        LATTRUE_S(NLLAT_S-(I-1)) = (NLAT-LATBASE) - (I-1)
      ENDDO
C
C     Vertical
C
      NTMP1 = NVER/NPVER_S
      NTMP2 = MOD(NVER,NPVER_S)
      IF (NTMP2 .GT. 0) THEN
        MXLVER_S = NTMP1 + 1
      ELSE
        MXLVER_S = NTMP1
      ENDIF
      DO I=0,NTMP2-1
        NDVER_S(I) = NTMP1+1
      ENDDO
      DO I=NTMP2,NPVER_S-1
        NDVER_S(I) = NTMP1
      ENDDO
C
      NLVER_S = NDVER_S(VERDEX_S)
C
C     Determine LLCOL_S(1:NLMM+1,1:2), column lengths and cumulative
C     displacements for local ALP and DALP arrays.
      IF (NLMM_S .GT. 0) THEN
        LLCOL_S(1,1) = (NN+1) - MAX(MTRUE_S(1)+NN-KK, 0)
        LLCOL_S(1,2) = 0
        DO JM=2,NLMM_S
           LLCOL_S(JM,1) = (NN+1) - MAX(MTRUE_S(JM)+NN-KK, 0)
           LLCOL_S(JM,2) = LLCOL_S(JM-1,1) + LLCOL_S(JM-1,2)
        ENDDO
      ENDIF
C
C     Spectral
C
C     Calculate number of spectral coefficients associated with local 
C     Fourier wavenumbers.
      IF (NLMM_S .GT. 0) THEN
        NFSPEC_S = LLCOL_S(NLMM_S,1) + LLCOL_S(NLMM_S,2)
      ELSE
        NFSPEC_S = 0
      ENDIF
C
C     Calculate the number of local spectral coefficients associated 
C     with each processor in the set associated with the local Fourier 
C     wavenumbers.
      NTMP1 = NFSPEC_S/NLTSTEPS
      NTMP2 = MOD(NFSPEC_S,NLTSTEPS)
      DO J=NTMP2+1,NPSPEC_S
        NLSPEC_S(J) = NTMP1
      ENDDO
      NTMP1 = NTMP1 + 1
      DO J=1,NTMP2
        NLSPEC_S(J) = NTMP1
      ENDDO
      MXLSPEC_S = NLSPEC_S(1)
C     Increase MXLSPEC_S so that each set of spectral coefficients
C     will be aligned
      IF (MOD(2*MXLSPEC_S,ALIGN) .NE. 0) 
     &  MXLSPEC_S = (ALIGN*(1+((2*MXLSPEC_S)/ALIGN)))/2
C
C     Compute the partition of the spectral coefficients of the
C     state vectors in terms of JM and JN for distribution among 
C     the processors.
      JMB_S(1) = 1
      P = 1
      L = 0
      JNTRNS = 1
      JNB_S(1) = 1
      JNME = 1
      P01_S = -1
      P11_S = -1
C
      DO JM=1,NLMM_S
        DO JN=1,LLCOL_S(JM,1)
C
          L = L + 1
C
          IF ((JM .EQ. JM0) .AND. (JN .EQ. 2)) THEN
C            WAVE M=0, N=1
             P01_S = P
             L01_S = L
          ENDIF
C
          IF ((JM .EQ. JM1) .AND. (JN .EQ. 1)) THEN
C            WAVE M=1, N=1
             P11_S = P
             L11_S = L
          ENDIF
C
          IF ((L .EQ. NLSPEC_S(P)) .OR. (JN .EQ. LLCOL_S(JM,1))) THEN
C
            JNE_S(JNTRNS) = JN
            JNTRNS = JNTRNS + 1
            IF (JN .LT. LLCOL_S(JM,1)) THEN
              JNB_S(JNTRNS) = JN+1
            ELSE
              JNB_S(JNTRNS) = 1
            ENDIF
          ENDIF
C
          IF (L .EQ. NLSPEC_S(P)) THEN
C
            JME_S(P) = JM
            P = P + 1
            IF (JN .LT. LLCOL_S(JM,1)) THEN
              JMB_S(P) = JM
            ELSE
              JMB_S(P) = JM+1
            ENDIF
            IF (P .EQ. (SPECDEX_S+1)) JNME = JNTRNS
            L = 0
          ENDIF
C
        ENDDO
      ENDDO
C
      JME_S(P) = NLMM_S
      DO WHILE (P .LT. NLTSTEPS) 
        P = P + 1
        JMB_S(P) = NLMM_S
        JME_S(P) = NLMM_S-1
      ENDDO
      JNB_S(JNTRNS) = 0
      NTRNS_S = JNTRNS - 1
C
      IF (NLTSTEPS .GT. 1) THEN
C        Permute the partition so that the "ME" processor information
C        is in the first elements, and the rest are rotated around a 
C        ring of length NLTSTEPS (NPSPEC_S for overlapped algorithm).
C
         IF (P01_S .NE. -1) 
     &     P01_S = MOD((P01_S-1)+(NPSPEC_S-SPECDEX_S),NPSPEC_S) + 1
         IF (P11_S .NE. -1) 
     &     P11_S = MOD((P11_S-1)+(NPSPEC_S-SPECDEX_S),NPSPEC_S) + 1
C
         DO I=1,NPSPEC_S
           JMTMP(I) = JMB_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           JMB_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           JMB_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NPSPEC_S
           JMTMP(I) = JME_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           JME_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           JME_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NPSPEC_S
           JMTMP(I) = NLSPEC_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           NLSPEC_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           NLSPEC_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NTRNS_S
           JMTMP(I) = JNB_S(I)
         ENDDO
         DO I=JNME,NTRNS_S
           JNB_S((I+1)-JNME) = JMTMP(I) 
         ENDDO
         DO I=1,JNME-1
           JNB_S((I+1)+(NTRNS_S-JNME)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NTRNS_S
           JMTMP(I) = JNE_S(I)
         ENDDO
         DO I=JNME,NTRNS_S
           JNE_S((I+1)-JNME) = JMTMP(I) 
         ENDDO
         DO I=1,JNME-1
           JNE_S((I+1)+(NTRNS_S-JNME)) = JMTMP(I) 
         ENDDO
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Determine number of local longitude and Fourier coefficients to 
C     declare when allocating space so as to guarantee that the physical,
C     Fourier, and spectral partitions of the gridpoint fields and 
C     the Fourier coefficients will fit into the same memory, even while
C     being transposed. Also, require that array segements of arrays
C     with MXLLON_P, MXLFC_F, and MXLFC_S first dimensions will be 
C     aligned.
      PFACTOR  = NLVER_P*NLLAT_P
      IF (NLVER_F .GT. 0) THEN
        FFACTOR  = 2*NLVER_F*NLLAT_F
      ELSE
        FFACTOR  = 2*NLLAT_F
      ENDIF
      IF (NLVER_S .GT. 0) THEN
        SFACTOR  = 2*NLVER_S*NLLAT_S
      ELSE
        SFACTOR  = 2*NLLAT_S
      ENDIF
      DO WHILE ((MXLLON_P*PFACTOR .NE. MXLFC_F*FFACTOR) .OR.
     &          (MXLFC_F*FFACTOR .NE. MXLFC_S*SFACTOR) .OR.
     &          (MOD(MXLLON_P,ALIGN) .NE. 0) .OR.
     &          (MOD(2*MXLFC_F,ALIGN) .NE. 0) .OR.
     &          (MOD(2*MXLFC_S,ALIGN) .NE. 0))
C
        DO WHILE (MOD(MXLLON_P,ALIGN) .NE. 0) 
          MXLLON_P = MXLLON_P + 1
        ENDDO
C
        DO WHILE (MOD(2*MXLFC_F,ALIGN) .NE. 0)
          MXLFC_F = MXLFC_F + 1
        ENDDO
C
        DO WHILE (MOD(2*MXLFC_S,ALIGN) .NE. 0)
          MXLFC_S = MXLFC_S + 1
        ENDDO
C
        IF (MXLLON_P*PFACTOR .LT. MXLFC_F*FFACTOR) THEN
          MXLLON_P = MXLLON_P + 1
        ELSEIF (MXLLON_P*PFACTOR .GT. MXLFC_F*FFACTOR) THEN
          MXLFC_F = MXLFC_F + 1
        ENDIF
C
        IF (MXLFC_F*FFACTOR .LT. MXLFC_S*SFACTOR) THEN
          MXLFC_F = MXLFC_F + 1
        ELSEIF (MXLFC_F*FFACTOR .GT. MXLFC_S*SFACTOR) THEN
          MXLFC_S = MXLFC_S + 1
        ENDIF
C
      ENDDO
      MXLLON_F = 2*MXLFC_F
C
C     Determine maximum local grid size, to determine storage needed
C     for work space when transposing between domain decompositions.
      MXLGRID = MAX(MXLLON_P*MXLVER_P*MXLLAT_P,
     &              MXLLON_F*MXLVER_F*MXLLAT_F,
     &              2*MXLFC_S*MXLVER_S*MXLLAT_S)
C
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Check consistency of parallel and communication algorithm 
C     parameters and determine amount of work space required.
C
C     Initialize buffers size indicators (which are used in INPUT when
C     allocating work space).
      BUFSWS2 = 0
      BUFSWS3 = 0
C
C     Legendre transform algorithms
C
      IF ((LTOPT .LT. 0) .OR. (LTOPT .GT. 1)) THEN
        IF (ME .EQ. 0) WRITE(0,701) LTOPT
  701   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &        ' ILLEGAL PARALLEL ALGORITHM OPTION SPECIFIED FOR',/,
     &        ' LEGENDRE TRANSFORM',/,
     &        ' LTOPT = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      IF (LTOPT .EQ. 0) THEN
C       distributed vector sum algorithms
C
        IF ((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 41)) THEN
          IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
  702     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' FORWARD LEGENDRE TRANSFORM',/,
     &          ' LTOPT = ',I4,' COMMFLT = ',I4, ' COMMILT = ',I4)
          ALGINP = -1
          RETURN
        ENDIF

        IF (COMMFLT .LT. 10) THEN
C         ring-pipeline algorithms for forward and inverse Legendre 
C         transforms
C
C         1) SHIFTSUM algorithm for forward Legendre transform
C
C         Need at most NLTSTEPS-1 buffers in recv-ahead 
C         variants, but if specify (BUFFERS .EQ. NLTSTEPS), can 
C         request all of these buffers at one time. If 
C         (BUFFERS .EQ. NLTSTEPS-1), then, for code consistency
C         between all recv-ahead variants, must delay the last 
C         request somewhat. 
          IF (BUFSFLT .GT. NLTSTEPS) BUFSFLT = NLTSTEPS 
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 2)) THEN
            IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
            ALGINP = -1
            RETURN
          ENDIF
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
  703       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &            ' FORWARD LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMFLT = ',I4,' PROTFLT = ',I4)
            ALGINP = -1
            RETURN
          ENDIF
C
C         2) SHIFT algorithm for inverse Legendre transform
C
C         Need at most NLTSTEPS-1 buffers in recv-ahead 
C         variants, but if specify (BUFFERS .EQ. NLTSTEPS+1), can 
C         request all of these buffers at one time. If 
C         (BUFFERS .EQ. NLTSTEPS) or (BUFFFERS .EQ. NLTSTEPS-1), 
C         then, for code consistency between all recv-ahead variants, 
C         must delay the last requests somewhat. 
          IF (BUFSILT .GT. NLTSTEPS+1) BUFSILT = NLTSTEPS + 1
          IF (BUFSILT .LE. 3) BUFSILT = 2
C
          IF ((COMMILT .LT. 0) .OR. (COMMILT .GT. 2)) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
  704       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &            ' INVERSE LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMFLT = ',I4, ' COMMILT = ',I4)
            ALGINP = -1
            RETURN
          ENDIF
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
  705       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &            ' INVERSE LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMILT = ',I4,' PROTILT = ',I4)
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSEIF (COMMFLT .LT. 20) THEN
C         nonoverlapped ring algorithm for forward Legendre transform
C
C         one buffer needed in recv-ahead variants of ring algorithm
          BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            ALGINP = -1
            RETURN
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSEIF (COMMFLT .LT. 30) THEN
C         recursive halving algorithm for forward Legendre transform
C
C         number of buffers used in recv-ahead variants of recursive 
C         halving algorithm
          IF (BUFSFLT .GT. 3) BUFSFLT = 3
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            ALGINP = -1
            RETURN
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSEIF (COMMFLT .LT. 40) THEN
C         exchange algorithm for forward Legendre transform
C
C         Can use up to 2*log(P) buffers in nonblocking send and recv 
C         variants of EXCHSUM algorithm.
          LGP = LOG2(NPSPEC_S)        
          IF (BUFSFLT .GT. 2*LGP) BUFSFLT = 2*LGP
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            ALGINP = -1
            RETURN
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            ALGINP = -1
            RETURN
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        IF (SUMOPT .EQ. 0) THEN
C         "natural" summation ordering, so need less storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+1)
        ELSE
C         binary tree summation ordering, so need extra storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+NLLATH_S)
        ENDIF
        BUFSWS3 = MAX(BUFSWS3, BUFSILT)
C
      ELSE
C       transpose algorithm for Legendre transforms
C
        IF (((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 4)) .AND.
     &      ((COMMFLT .LT. 10) .OR. (COMMFLT .GT. 14)) .AND.
     &      ((COMMFLT .LT. 20) .OR. (COMMFLT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (COMMFLT .LT. 20) THEN
C         O(P) transpose algorithms for forward Legendre transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSFLT = 2
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for forward Legendre transform
C
          IF (FTOPT .EQ. 1) THEN
            LGP = LOG2(NPFC_S)
          ELSE
            LGP = LOG2(NPVER_S)        
          ENDIF
C
          IF ((((FTOPT .EQ. 0) .OR. (FTOPT .EQ. 2))
     &         .AND. (MOD(NLAT,NPVER_S) .EQ. 0)
     &         .AND. (MOD(NVER,NPVER_S) .EQ. 0))
     &    .OR.((FTOPT .EQ. 1) 
     &         .AND. (MOD(NLAT,NPFC_S) .EQ. 0)
     &         .AND. (MOD(MM+1,NPFC_S) .EQ. 0))) THEN
C           Uniform partition of NLAT and NVER or NLAT and MM+1: can use
C           up to log(P)+1 buffers in nonblocking send and recv variants
C           of LOGTRANS algorithm, and require at least 3 buffers. 
            IF (BUFSFLT .GT. LGP+1) BUFSFLT = LGP + 1
            IF (BUFSFLT .LT. 3) BUFSFLT = 3
            REG_LGPFLT = .TRUE.
          ELSE
C           Nonuniform partition of NLAT or NVER or MM+1: can use up to
C           2*log(P)+3 buffers in nonblocking send and recv variants of
C           LOGTRANS algorithm, and require at least 5 buffers.
            IF (BUFSFLT .GT. 2*LGP+3) BUFSFLT = 2*LGP + 3
            IF (BUFSFLT .LT. 5) BUFSFLT = 5
            REG_LGPFLT = .FALSE.
          ENDIF
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            ALGINP = -1
            RETURN
          ENDIF
C
        ENDIF
C
        IF (((COMMILT .LT. 0) .OR. (COMMILT .GT. 4)) .AND.
     &      ((COMMILT .LT. 10) .OR. (COMMILT .GT. 14)) .AND.
     &      ((COMMILT .LT. 20) .OR. (COMMILT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,709) LTOPT, COMMFLT, COMMILT
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (COMMILT .LT. 20) THEN
C         O(P) transpose algorithms for inverse Legendre transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSILT = 2
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for inverse Legendre transform
C
          IF (FTOPT .EQ. 1) THEN
            LGP = LOG2(NPFC_S)
          ELSE
            LGP = LOG2(NPVER_S)        
          ENDIF
C
          IF ((((FTOPT .EQ. 0) .OR. (FTOPT .EQ. 2))
     &         .AND. (MOD(NLAT,NPVER_S) .EQ. 0)
     &         .AND. (MOD(NVER,NPVER_S) .EQ. 0))
     &    .OR.((FTOPT .EQ. 1) 
     &         .AND. (MOD(NLAT,NPFC_S) .EQ. 0)
     &         .AND. (MOD(MM+1,NPFC_S) .EQ. 0))) THEN
C           Uniform partition of NLAT and NVER or NLAT and MM+1: can use
C           up to log(P)+1 buffers in nonblocking send and recv variants
C           of LOGTRANS algorithm, and require at least 3 buffers. 
            IF (BUFSILT .GT. LGP+1) BUFSILT = LGP + 1
            IF (BUFSILT .LT. 3) BUFSILT = 3
            REG_LGPILT = .TRUE.
          ELSE
C           Nonuniform partition of NLAT or NVER or MM+1: can use up to
C           2*log(P)+3 buffers in nonblocking send and recv variants of
C           LOGTRANS algorithm, and require at least 5 buffers.
            IF (BUFSILT .GT. 2*LGP+3) BUFSILT = 2*LGP + 3
            IF (BUFSILT .LT. 5) BUFSILT = 5
            REG_LGPILT = .FALSE.
          ENDIF
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
            ALGINP = -1
            RETURN
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFLT)
        BUFSWS2 = MAX(BUFSWS2, BUFSILT)
        IF (SUMOPT .EQ. 0) THEN
C         "natural" summation ordering, so need less storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+1)
        ELSE
C         binary tree summation ordering, so need extra storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+NLLATH_S)
        ENDIF
        BUFSWS3 = MAX(BUFSWS3, BUFSILT)
C
      ENDIF
C
C     Fourier transform algorithms
C
      IF ((FTOPT .LT. 0) .OR. (FTOPT .GT. 2)) THEN
        IF (ME .EQ. 0) WRITE(0,706) FTOPT
  706   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &        ' ILLEGAL PARALLEL ALGORITHM OPTION SPECIFIED FOR',/,
     &        ' FOURIER TRANSFORM',/,
     &        ' FTOPT = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      IF ((FTOPT .EQ. 2) .AND. (LTOPT .NE. 0)) THEN
        IF (ME .EQ. 0) WRITE(0,7061) LTOPT, FTOPT
 7061   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &        ' ILLEGAL PARALLEL ALGORITHM OPTIONS SPECIFIED',/,
     &        ' PARALLEL LEGENDRE TRANSFORM OPTION LTOPT = ',I4,/,
     &        ' NOT SUPPORTED WITH DOUBLE TRANSPOSE PARALLEL FFT',/,
     &        ' FTOPT = ',I4)
        ALGINP = -1
        RETURN
      ENDIF
C
      IF (FTOPT .EQ. 0) THEN
C       distributed algorithms for Fourier transform
C
C       Always need exactly one buffer for distributed algorithm,
C       (which is enough space to allow sending and receiving to proceed
C        simultaneously.)
        BUFSFFT = 1
        BUFSIFT = 1
C
        IF ((COMMFFT .LT. 0) .OR. (COMMFFT .GT. 3)) THEN
          IF (ME .EQ. 0) WRITE(0,707) FTOPT, COMMFFT, COMMIFT
  707     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' FORWARD FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' COMMIFT = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6)) THEN
          IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
  708     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &          ' FORWARD FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' PROTFFT = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        IF ((COMMIFT .LT. 0) .OR. (COMMIFT .GT. 3)) THEN
          IF (ME .EQ. 0) WRITE(0,709) FTOPT, COMMFFT, COMMIFT
  709     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' INVERSE FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' COMMIFT = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
        IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6)) THEN
          IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
  720     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &          ' INVERSE FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMIFT = ',I4,' PROTIFT = ',I4)
          ALGINP = -1
          RETURN
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFFT)
        BUFSWS2 = MAX(BUFSWS2, BUFSIFT)
C
      ELSEIF ((FTOPT .EQ. 1) .OR. (FTOPT .EQ. 2)) THEN
C       transpose algorithms for Fourier transform
C
        IF (((COMMFFT .LT. 0) .OR. (COMMFFT .GT. 4)) .AND.
     &      ((COMMFFT .LT. 10) .OR. (COMMFFT .GT. 14)) .AND.
     &      ((COMMFFT .LT. 20) .OR. (COMMFFT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,707) FTOPT, COMMFFT, COMMIFT
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (COMMFFT .LT. 20) THEN
C         O(P) transpose algorithms for forward Fourier transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSFFT = 2
C
          IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for forward Fourier transform
C
          LGP = LOG2(NPLON_P)
C
          IF ((FTOPT .EQ. 1)
     &        .AND. (MOD(NLON,NPLON_P) .EQ. 0)
     &        .AND. (MOD(NVER,NPLON_P) .EQ. 0)) THEN
C           Uniform partition of NLON and NVER: can use up to log(P)+1
C           buffers in nonblocking send and recv variants of LOGTRANS
C           algorithm, and require at least 3 buffers.  
            IF (BUFSFFT .GT. LGP+1) BUFSFFT = LGP + 1
            IF (BUFSFFT .LT. 3) BUFSFFT = 3
            REG_LGPFFT = .TRUE.
          ELSE
C           Nonuniform partition of NLON or NVER: can use up to 
C           2*log(P)+3 buffers in nonblocking send and recv variants of
C           LOGTRANS algorithm, and require at least 5 buffers. 
            IF (BUFSFFT .GT. 2*LGP+3) BUFSFFT = 2*LGP + 3
            IF (BUFSFFT .LT. 5) BUFSFFT = 5
            REG_LGPFFT = .FALSE.
          ENDIF
C
          IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
            ALGINP = -1
            RETURN
          ENDIF
C
        ENDIF
C
        IF (((COMMIFT .LT. 0) .OR. (COMMIFT .GT. 4)) .AND.
     &      ((COMMIFT .LT. 10) .OR. (COMMIFT .GT. 14)) .AND.
     &      ((COMMIFT .LT. 20) .OR. (COMMIFT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,709) FTOPT, COMMFFT, COMMIFT
          ALGINP = -1
          RETURN
        ENDIF
C
        IF (COMMIFT .LT. 20) THEN
C         O(P) transpose algorithms for inverse Fourier transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSIFT = 2
C
          IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
            ALGINP = -1
            RETURN
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for inverse Fourier transform
C
          LGP = LOG2(NPLON_P)
C
          IF ((FTOPT .EQ. 1) 
     &        .AND. (MOD(NLON,NPLON_P) .EQ. 0)
     &        .AND. (MOD(NVER,NPLON_P) .EQ. 0)) THEN
C           Uniform partition of NLON and NVER: can use up to log(P)+1
C           buffers in nonblocking send and recv variants of LOGTRANS
C           algorithm, and require at least 3 buffers.  
            IF (BUFSIFT .GT. LGP+1) BUFSIFT = LGP + 1
            IF (BUFSIFT .LT. 3) BUFSIFT = 3
            REG_LGPIFT = .TRUE.
          ELSE
C           Nonuniform partition of NLON or NVER: can use up to 
C           2*log(P)+3 buffers in nonblocking send and recv variants of
C           LOGTRANS algorithm, and require at least 5 buffers. 
            IF (BUFSIFT .GT. 2*LGP+3) BUFSIFT = 2*LGP + 3
            IF (BUFSIFT .LT. 5) BUFSIFT = 5
            REG_LGPIFT = .FALSE.
          ENDIF
C
          IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6)) THEN
            IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
            ALGINP = -1
            RETURN
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFFT)
        BUFSWS2 = MAX(BUFSWS2, BUFSIFT)
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Determine amount of system buffer space needed for blocking
C     communication calls.
C
C     Initialize system buffer size requirement indicators.
C     XXXMSGS is the number of "simultaneous" messages that the system
C     must be able to store in system buffers, and XXXVOL is the
C     number of REAL values that must be stored.
      FFTMSGS = 0
      FFTVOL = 0
      IFTMSGS = 0
      IFTVOL = 0
      FLTMSGS = 0
      FLTVOL = 0
      ILTMSGS = 0
      ILTVOL = 0
C
C     Legendre transform algorithms
C
      IF (LTOPT .EQ. 0) THEN
C       distributed vector sum algorithms
C
        IF (NPSPEC_S .GT. 1) THEN
C
C         compute basic blocksize: 3 complex spectral fields of length 
C          MXLSPEC_S for each of NLVER_S layers 
          BLOCKSZ = 2*3*MXLSPEC_S*NLVER_S
C
          IF (COMMFLT .LT. 10) THEN
C           ring-pipeline algorithms for forward and inverse Legendre 
C           transforms
C
C           1) SHIFTSUM algorithm for forward Legendre transform
C
            IF (((COMMFLT .EQ. 0) .OR. (COMMFLT .EQ. 2)) 
     &          .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF (((COMMFLT .NE. 1) .OR. (BUFSFLT .GT. 1))
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
C           2) SHIFT algorithm for inverse Legendre transform
C
            IF (((COMMILT .EQ. 0) .OR. (COMMILT .EQ. 2)) 
     &          .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*BLOCKSZ + ILTMSGS*MSGOVH
            ELSEIF (((COMMILT .NE. 1) .OR. (BUFSILT .GT. 1))
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMFLT .LT. 20) THEN
C           nonoverlapped ring algorithm for forward Legendre transform
C
            IF (((COMMFLT .EQ. 10) .OR. (COMMFLT .EQ. 12)) 
     &          .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*((BLOCKSZ/ALIGN)/NPSPEC_S + 1)*ALIGN
     &                + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .NE. 11) 
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMFLT .LT. 30) THEN
C           recursive halving algorithm for forward Legendre transform
C           (conservative estimate)
C
            IF ((COMMFLT .EQ. 20) .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((BUFSFLT .EQ. 1) 
     &       .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 2
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ELSEIF ((BUFSFLT .GT. 1) 
     &         .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = LOG2(NPSPEC_S-1) + 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMFLT .LT. 40) THEN
C           exchange algorithm for forward Legendre transform
C           (conservative estimate)
C
            IF ((COMMFLT .EQ. 30) .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((BUFSFLT .EQ. 1) 
     &       .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 2
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ELSEIF ((BUFSFLT .GT. 1) 
     &       .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = BUFSFLT/2
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ENDIF
C
        ENDIF
C
      ELSE
C       transpose algorithm for Legendre transforms
C
C       compute basic blocksize and number of processors: 
C        up to 8 complex Fourier domain fields
        IF (FTOPT .EQ. 1) THEN
          P = NPFC_S
          BLOCKSZ = 2*8*NDLAT_F(0)*NDFC_S(0)*NLVER_F
        ELSE
          P = NPVER_S
          BLOCKSZ = 2*8*NDLAT_F(0)*NDVER_S(0)*NLFC_S
        ENDIF
C
        IF (P .GT. 1) THEN
C
          IF (COMMFLT .LT. 10) THEN
C           send/receive O(P) transpose algorithms for forward Legendre 
C           transform
C
            IF (((COMMFLT .EQ. 0) .OR. (COMMFLT .EQ. 2)) 
     &          .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .EQ. 4) .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = P-1
              FLTVOL  = RBYTES*(P-1)*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .EQ. 0)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .GT. 1)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = P-1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMFLT .LT. 20) THEN
C           swap O(P) transpose algorithms for forward Legendre 
C           transform
C
            IF (((COMMFLT .EQ. 10) .OR. (COMMFLT .EQ. 12)) 
     &          .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .EQ. 14) .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = P-1
              FLTVOL  = RBYTES*(P-1)*BLOCKSZ + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .EQ. 10)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .GT. 11)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = P-1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ELSE
C           O(log(P)) transpose algorithms for forward Legendre transform
C           (conservative estimate)
C
            IF (REG_LGPFLT) THEN
              MXBUF = BUFSFLT - 1
            ELSE
              MXBUF = (BUFSFLT - 1)/2
            ENDIF
            IF ((COMMFLT .EQ. 20) .AND. (PROTFLT .EQ. 0)) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*BLOCKSZ*(P/2+1) + FLTMSGS*MSGOVH
            ELSEIF ((COMMFLT .EQ. 20) .AND. (MXBUF .LE. 2)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = 1
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ELSEIF ((MXBUF .GT. 2)
     &          .AND. ((PROTFLT .EQ. 4) .OR. (PROTFLT .EQ. 5))) THEN
              FLTMSGS = MXBUF
              FLTVOL  = RBYTES*0 + FLTMSGS*MSGOVH
            ENDIF
C
          ENDIF
C
          IF (COMMILT .LT. 10) THEN
C           send/receive O(P) transpose algorithms for inverse Legendre 
C           transform
C
            IF (((COMMILT .EQ. 0) .OR. (COMMILT .EQ. 2)) 
     &          .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*BLOCKSZ + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .EQ. 4) .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = P-1
              ILTVOL  = RBYTES*(P-1)*BLOCKSZ + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .EQ. 0)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .GT. 1)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = P-1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMILT .LT. 20) THEN
C           swap O(P) transpose algorithms for inverse Legendre 
C           transform
C
            IF (((COMMILT .EQ. 10) .OR. (COMMILT .EQ. 12)) 
     &          .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*BLOCKSZ + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .EQ. 14) .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = P-1
              ILTVOL  = RBYTES*(P-1)*BLOCKSZ + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .EQ. 10)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .GT. 11)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = P-1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ENDIF
C
          ELSE
C           O(log(P)) transpose algorithms for inverse Legendre 
C           transform
C
            IF (REG_LGPILT) THEN
              MXBUF = BUFSILT - 1
            ELSE
              MXBUF = (BUFSILT - 1)/2
            ENDIF
            IF ((COMMILT .EQ. 20) .AND. (PROTILT .EQ. 0)) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*BLOCKSZ*(P/2 + 1) + ILTMSGS*MSGOVH
            ELSEIF ((COMMILT .EQ. 20) .AND. (MXBUF .LE. 2)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = 1
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ELSEIF ((MXBUF .GT. 2)
     &          .AND. ((PROTILT .EQ. 4) .OR. (PROTILT .EQ. 5))) THEN
              ILTMSGS = MXBUF
              ILTVOL  = RBYTES*0 + ILTMSGS*MSGOVH
            ENDIF
C
          ENDIF
C
        ENDIF
C
      ENDIF
C
C     Fourier transform algorithms
C
      P = NPLON_P
C
      IF (P .GT. 1) THEN
C
        IF (FTOPT .EQ. 0) THEN
C         distributed algorithms for Fourier transform
C
C         compute basic blocksize: up to 8 real fields of length 
C          NLLON_F for each of NLLAT_F latitudes and NLVER_F layers 
          BLOCKSZ = 8*NLLAT_F*NLLON_F*NLVER_F
C
          IF ((COMMFFT .EQ. 0) .AND. (PROTFFT .EQ. 0)) THEN
            FFTMSGS = 1
            FFTVOL  = RBYTES*(BLOCKSZ/2 + 1) + FFTMSGS*MSGOVH
          ELSEIF ((COMMFFT .EQ. 0)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
            FFTMSGS = 1
            FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
          ELSEIF ((COMMFFT .EQ. 2) .AND. (PROTFFT .EQ. 0)) THEN
            FFTMSGS = 2
            FFTVOL  = RBYTES*BLOCKSZ + FFTMSGS*MSGOVH
          ELSEIF (((COMMFFT .EQ. 2) .OR. (COMMFFT .EQ. 3))
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
            FFTMSGS = 2
            FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
          ENDIF
C
          IF ((COMMIFT .EQ. 0) .AND. (PROTIFT .EQ. 0)) THEN
            IFTMSGS = 1
            IFTVOL  = RBYTES*(BLOCKSZ/2 + 1) + IFTMSGS*MSGOVH
          ELSEIF ((COMMIFT .EQ. 0)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
            IFTMSGS = 1
            IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
          ELSEIF ((COMMIFT .EQ. 2) .AND. (PROTIFT .EQ. 0)) THEN
            IFTMSGS = 2
            IFTVOL  = RBYTES*BLOCKSZ + IFTMSGS*MSGOVH
          ELSEIF (((COMMIFT .EQ. 2) .OR. (COMMIFT .EQ. 3))
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
            IFTMSGS = 2
            IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
          ENDIF
C
        ELSEIF ((FTOPT .EQ. 1) .OR. (FTOPT .EQ. 2)) THEN
C       transpose algorithms for Fourier transform
C
C         compute basic blocksize: up to 8 real fields of length 
C          NLLON_F for each of NLLAT_F latitudes and NLVER_F layers 
          BLOCKSZ = 8*NLLAT_P*NDLON_P(0)*NDVER_F(0)
C
          IF (COMMFFT .LT. 10) THEN
C           send/receive O(P) transpose algorithms for forward Legendre 
C           transform
C
            IF (((COMMFFT .EQ. 0) .OR. (COMMFFT .EQ. 2)) 
     &          .AND. (PROTFFT .EQ. 0)) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*BLOCKSZ + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .EQ. 4) .AND. (PROTFFT .EQ. 0)) THEN
              FFTMSGS = P-1
              FFTVOL  = RBYTES*(P-1)*BLOCKSZ + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .EQ. 0)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .GT. 1)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = P-1
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMFFT .LT. 20) THEN
C           swap O(P) transpose algorithms for forward Legendre 
C           transform
C
            IF (((COMMFFT .EQ. 10) .OR. (COMMFFT .EQ. 12)) 
     &          .AND. (PROTFFT .EQ. 0)) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*BLOCKSZ + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .EQ. 14) .AND. (PROTFFT .EQ. 0)) THEN
              FFTMSGS = P-1
              FFTVOL  = RBYTES*(P-1)*BLOCKSZ + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .EQ. 10)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .GT. 11)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = P-1
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ENDIF
C
          ELSE
C           O(log(P)) transpose algorithms for forward Legendre 
C           transform
C           (conservative estimate)
C
            IF (REG_LGPFFT) THEN
              MXBUF = BUFSFFT - 1
            ELSE
              MXBUF = (BUFSFFT - 1)/2
            ENDIF
            IF (((COMMFFT .EQ. 20) .OR. (COMMFFT .EQ. 22)) 
     &          .AND. (PROTFFT .EQ. 0)) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*BLOCKSZ*(P/2+1) + FFTMSGS*MSGOVH
            ELSEIF ((COMMFFT .EQ. 20) .AND. (MXBUF .LE. 2)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = 1
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ELSEIF ((MXBUF .GT. 2)
     &          .AND. ((PROTFFT .EQ. 4) .OR. (PROTFFT .EQ. 5))) THEN
              FFTMSGS = MXBUF
              FFTVOL  = RBYTES*0 + FFTMSGS*MSGOVH
            ENDIF
C
          ENDIF
C
          IF (COMMIFT .LT. 10) THEN
C           send/receive O(P) transpose algorithms for inverse Legendre 
C           transform
C
            IF (((COMMIFT .EQ. 0) .OR. (COMMIFT .EQ. 2)) 
     &          .AND. (PROTIFT .EQ. 0)) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*BLOCKSZ + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .EQ. 4) .AND. (PROTIFT .EQ. 0)) THEN
              IFTMSGS = P-1
              IFTVOL  = RBYTES*(P-1)*BLOCKSZ + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .EQ. 0)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .GT. 1)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = P-1
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ENDIF
C
          ELSEIF (COMMIFT .LT. 20) THEN
C           swap O(P) transpose algorithms for inverse Legendre 
C           transform
C
            IF (((COMMIFT .EQ. 10) .OR. (COMMIFT .EQ. 12)) 
     &          .AND. (PROTIFT .EQ. 0)) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*BLOCKSZ + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .EQ. 14) .AND. (PROTIFT .EQ. 0)) THEN
              IFTMSGS = P-1
              IFTVOL  = RBYTES*(P-1)*BLOCKSZ + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .EQ. 10)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .GT. 11)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = P-1
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ENDIF
C
          ELSE
C           O(log(P)) transpose algorithms for inverse Legendre transform
C
            IF (REG_LGPIFT) THEN
              MXBUF = BUFSIFT - 1
            ELSE
              MXBUF = (BUFSIFT - 1)/2
            ENDIF
            IF (((COMMIFT .EQ. 20) .OR. (COMMIFT .EQ. 22)) 
     &          .AND. (PROTIFT .EQ. 0)) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*BLOCKSZ*(P/2+1) + IFTMSGS*MSGOVH
            ELSEIF ((COMMIFT .EQ. 20) .AND. (MXBUF .LE. 2)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = 1
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ELSEIF ((MXBUF .GT. 2)
     &          .AND. ((PROTIFT .EQ. 4) .OR. (PROTIFT .EQ. 5))) THEN
              IFTMSGS = MXBUF
              IFTVOL  = RBYTES*0 + IFTMSGS*MSGOVH
            ENDIF
C
          ENDIF
C
        ENDIF
C
      ENDIF
C
C     Determine the maximum system buffer space requirements.
C
      SYSMSGS = MAX(FFTMSGS, IFTMSGS, FLTMSGS, ILTMSGS)
      SYSVOL  = MAX(FFTVOL, IFTVOL, FLTVOL, ILTVOL)
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Output primary algorithm and domain decomposition parameters.
C
      IF (ME .EQ. 0) THEN
        WRITE (6,201) PX, PY
  201   FORMAT (/,' NUMBER OF PROCESSORS IN PARALLELIZATION'
     &          /,' ROW PROCESSORS:    PX = ',I4,
     &          /,' COLUMN PROCESSORS: PY = ',I4,/)
C
        WRITE(6,*) 'PARALLEL ALGORITHMS'
        IF (FTOPT .EQ. 0) THEN
          WRITE(6,*) '  FT: DISTRIBUTED'
        ELSEIF (FTOPT .EQ. 1) THEN
          IF (COMMFFT .LT. 20) THEN
            WRITE(6,*) ' FFT: TRANSPOSE (O(P))'
          ELSEIF (COMMFFT .LT. 30) THEN
            WRITE(6,*) ' FFT: TRANSPOSE (O(LOG P))'
          ENDIF
          IF (COMMIFT .LT. 20) THEN
            WRITE(6,*) ' IFT: TRANSPOSE (O(P))'
          ELSEIF (COMMIFT .LT. 30) THEN
            WRITE(6,*) ' IFT: TRANSPOSE (O(LOG P))'
          ENDIF
        ELSEIF (FTOPT .EQ. 2) THEN
          IF (COMMFFT .LT. 20) THEN
            WRITE(6,*) ' FFT: DOUBLE TRANSPOSE (O(P))'
          ELSEIF (COMMFFT .LT. 30) THEN
            WRITE(6,*) ' FFT: DOUBLE TRANSPOSE (O(LOG P))'
          ENDIF
          IF (COMMIFT .LT. 20) THEN
            WRITE(6,*) ' IFT: DOUBLE TRANSPOSE (O(P))'
          ELSEIF (COMMIFT .LT. 30) THEN
            WRITE(6,*) ' IFT: DOUBLE TRANSPOSE (O(LOG P))'
          ENDIF
        ENDIF
C
        IF (LTOPT .EQ. 0) THEN
          IF (COMMFLT .LT. 10) THEN
            WRITE(6,*) '  LT: DISTRIBUTED (OVERLAPPED RING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 20) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (RING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 30) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (RECURSIVE HALVING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 40) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (EXCHANGE VECTOR SUM)'
          ENDIF
        ELSEIF (LTOPT .EQ. 1) THEN
          IF (COMMFLT .LT. 20) THEN
            WRITE(6,*) ' FLT: TRANSPOSE (O(P))'
          ELSEIF (COMMFLT .LT. 30) THEN
            WRITE(6,*) ' FLT: TRANSPOSE (O(LOG P))'
          ENDIF
          IF (COMMILT .LT. 20) THEN
            WRITE(6,*) ' ILT: TRANSPOSE (O(P))'
          ELSEIF (COMMILT .LT. 30) THEN
            WRITE(6,*) ' ILT: TRANSPOSE (O(LOG P))'
          ENDIF
        ENDIF
C
        WRITE(6,*) ' NLLAT_P =',NLLAT_P, ';   NLLON_P =',NLLON_P,
     &             ' NLVER_P =',NLVER_P
        WRITE(6,*) ' NLLAT_F =',NLLAT_F, ';   NLLON_F =',NLLON_F
        WRITE(6,*) ' NLVER_F =',NLVER_F, ';   NLFC_F  =',NLFC_F
        WRITE(6,*) ' NLLAT_S =',NLLAT_S, ';   NLFC_S  =',NLFC_S,
     &             ' NLVER_S =',NLVER_S
C
        WRITE(6,*) ' MXLLON_P=',MXLLON_P, ';   MXLLON_F=',MXLLON_F
        WRITE(6,*) ' MXLFC_F =',MXLFC_F, ';   MXLFC_S =',MXLFC_S
C
      ENDIF
C
      ALGINP = 0
C
      RETURN
      END
