      subroutine cgemul3 ( a, lda, forma, b, ldb, formb, c, ldc, m, k, n
     *, aux, naux )
c#######################################################################
c
c Double complex:   zgemul3         Fortran-77 Version
c Single complex:   cgemul3
c
c Function:     Compute C = op(A)op(B), where
c
c                       { X,                     if transX == 'N' or 'n'
c               op(X) = { X transpose,           if transX == 'T' or 't'
c                       { X conjugate transpose, if transX == 'C' or 'c'
c                       { X conjugate,           if transX == 'K' or 'k'
c
c Reference:    Craig C. Douglas, Michael Heroux, Gordon Slishman, and
c               Roger M. Smith,
c               GEMMW:  A portable Level 3 BLAS Winograd variant of
c               Strassen's  matrix--matrix multiply algorithm,
c               Yale Computer Science Report 904, New Haven, CT, 1992.
c
c Contacts:     na.cdouglas@na-net.ornl.gov, na.heroux@na-net.ornl.gov,
c               slishmn@watson.ibm.com, and smith-roger@cs.yale.edu
c
c Date last
c modified:     May 14, 1992
c
c Algorithm:    Complex arithemtic for 3 multiplies instead of 4, as in
c               "The Design and Analysis of Computer Algorithms,"
c               by Aho, Hopcroft and Ullman, 1974.
c
c Naming
c Convention:   _name, where the underscore is replaced by 
c                   c   Single precision complex
c                   d   Double precision real
c                   s   Single precision real
c                   z   Double precision complex
c
c Module:       _gemul3 computes op(A)op(B) as
c
c                   (P+Qi) * (R+Si) = (PR-QS) + (PS + QR)i.
c
c Storage:      _gemul3 requires a storage area aux of size roughly
c
c                    naux = 4/3 n**2.
c
c               The units are either double or single precision real
c               (not complex).  See gemmw.c for the exact formula,
c               which has some latitude for this adaptive routine.
c
c Intended      The intended use of this code is to terminate complex
c Use:          data Strassen- Winograd recursions from _winos.  After
c               the last recursion this routine is used rather than a
c               conventional complex matrix multiply to obtain a
c               reduction in real matrix multiplications from 4 to 3.
c               This code employs only the residual aux storage
c               obtained by _gemmw.
c
c Implementation:  The particular implementation used here follows the
c                  steps below.
c
c               Step  Operation
c               ----  ---------
c                1.   Copies P to aux(0).  Iteratively stores vertical
c                     swaths of R-S in aux(kaux), and stores swaths of
c                     P*(R-S) in the lower half of C.  The swath width
c                     is however many columns will fit in the naux-m*k 
c                     (size of P) remaining elements of aux.
c
c                2.   Similar to step 1.  P+Q goes to aux(0).  Swaths
c                     of R go to aux(kaux).  Swaths of (P+Q)R go to
c                     upper half of C.
c
c                3.   The sum of the upper and lower halfs of C gives
c                     the imaginary part of the complex product.
c                     Store in the upper half of C.
c
c                4.   Similar to step 1.  P-Q goes to aux(0).  Swaths
c                     of S go to aux(kaux).  Swaths of (P-Q)S plus the
c                     lower half of C replace the lower half of C.
c                     This is the real part of the complex matrix
c                     product.
c
c                5.   Column-wise, the real and imaginary halves of C
c                     are merged to finish the complex product.
c
c#######################################################################
      character*1 forma, formb
      integer k, lda, ldb, ldc, m, n, naux
      real a(0:*), aux(0:*), b(0:*), c(0:*)
      integer i, iac1, iar1, ibc1, ibr1, ifail, ij, j, kaux, lda2, ldb2,
     * ldc2, nb, nn
      real neg1, one, zero
      data neg1 / -1.0 /
      data one / 1.0 /
      data zero / 0.0 /
c#######################################################################
c
c kaux  size of buffer for "A" derivatives
c nb    number of "B" cols for aux remainder
c
c#######################################################################
      kaux = m * k
      nb = (naux - kaux) / k
      ifail = 0
      if(nb.lt.1) then 
          write (*,*) 'Insufficient naux for zgemul3'
          stop
      endif
      lda2 = lda * 2
      ldb2 = ldb * 2
      ldc2 = ldc * 2
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'K'.or.forma.eq.'k')
     * then 
          iar1 = 2
          iac1 = lda2
      else
          iar1 = lda2
          iac1 = 2
      endif
      if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'K'.or.forma.eq.'k')
     * then 
          ibr1 = 2
          ibc1 = ldb2
      else
          ibr1 = ldb2
          ibc1 = 2
c#######################################################################
c
c STEP 1: P(R-S) to lower half of C ...
c
c#######################################################################
c copy P
      endif
      do 2000  j = 0,k-1
          call f06eff ( m, a(j * iac1), iar1, aux(j * m), 1 )
2000  continue
      do 2002  nn = 0, n-1, nb
c nb cols of P*(R-S)
          if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'t'.or.formb.eq.
     *'T') then 
              do 2004  j = nn,min(n - 1, nn + nb - 1)
                  do 2006  i = 0,k-1
                      ij = i*ibr1 + j*ibc1
                      aux(kaux + (j-nn)*k + i) = b(ij) - b(ij + 1)
2006              continue
2004          continue
          else
              do 2008  j = nn,min(n - 1, nn + nb - 1)
                  do 2010  i = 0,k-1
                      ij = i*ibr1 + j*ibc1
                      aux(kaux + (j-nn)*k + i) = b(ij) + b(ij + 1)
2010              continue
2008          continue
          endif
          call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m, 
     *aux(kaux), k, zero, c(m + nn*ldc2), ldc2 )
c#######################################################################
c
c STEP 2: (P+Q)R to upper half of C ...
c
c#######################################################################
c compute P+Q
2002  continue
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'t'.or.forma.eq.'T')
     * then 
          do 2012  j = 0,k-1
              do 2014  i = 0,m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) + a(ij + 1)
2014          continue
2012      continue
c conjugate
      else
          do 2016  j = 0,k-1
              do 2018  i = 0,m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) - a(ij + 1)
2018          continue
2016      continue
      endif
      do 2020  nn = 0, n-1, nb
c nb cols of (P+Q)*R
          do 2022  j = nn, min(n - 1, nn + nb - 1)
              call f06eff ( k, b(j*ibc1), ibr1, aux(kaux + (j-nn)*k), 1 
     *)
2022      continue
          call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m, 
     *aux(kaux), k, zero, c(nn*ldc2), ldc2 )
c#######################################################################
c
c STEP 3: (PS + QR)i to upper half of C ...
c
c#######################################################################
2020  continue
      call f01ctf ('N', 'N', m, n, one, c, ldc2, neg1, c(m), ldc2, c,
     *ldc2, ifail )
c#######################################################################
c
c STEP 4: (P-Q)S + P(R-S) to lower half of C ...
c
c#######################################################################
c compute P-Q
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'t'.or.forma.eq.'T')
     * then 
          do 2024  j=0, k-1
              do 2026  i=0, m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) - a(ij + 1)
2026          continue
2024      continue
c conjugate
      else
          do 2028  j=0, k-1
              do 2030  i=0, m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) + a(ij + 1)
2030          continue
2028      continue
c compute nb cols of (P-Q)S
      endif
      if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'t'.or.formb.eq.'T')
     * then 
          do 2032  nn = 0, n-1, nb
c nb cols of (P-Q)*S
              do 2034  j = nn, min(n - 1, nn + nb - 1)
                  call f06eff ( k, b(j*ibc1 + 1), ibr1, aux(kaux + (j-nn
     *)*k), 1 )
2034          continue
              call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m
     *, aux(kaux), k, one, c(nn*ldc2 + m), ldc2 )
2032      continue
c conjugate
      else
          do 2036  nn = 0, n-1, nb
c nb cols of (P-Q)*S
              do 2038  j = nn, min(n - 1, nn + nb - 1)
                  call f06fdf ( k, neg1, b(j*ibc1 + 1), ibr1, aux(kaux +
     * (j-nn)*k), 1 )
2038          continue
              call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m
     *, aux(kaux), k, one, c(nn*ldc2 + m), ldc2 )
2036      continue
c#######################################################################
c
c STEP 5: Pair off real and imaginary elements, col by col of C ...
c
c#######################################################################
      endif
      do 2040  j = 0,n-1
          call f06eff ( m*2, c(j*ldc2), 1, aux, 1 )
          call f06eff ( m, aux(m), 1, c(j*ldc2), 2 )
          call f06eff ( m, aux(0), 1, c(j*ldc2+1), 2 )
2040  continue
      return
      end
      subroutine zgemul3 ( a, lda, forma, b, ldb, formb, c, ldc, m, k, n
     *, aux, naux )
c#######################################################################
c
c Double complex:   zgemul3         Fortran-77 Version
c Single complex:   cgemul3
c
c Function:     Compute C = op(A)op(B), where
c
c                       { X,                     if transX == 'N' or 'n'
c               op(X) = { X transpose,           if transX == 'T' or 't'
c                       { X conjugate transpose, if transX == 'C' or 'c'
c                       { X conjugate,           if transX == 'K' or 'k'
c
c Reference:    Craig C. Douglas, Michael Heroux, Gordon Slishman, and
c               Roger M. Smith,
c               GEMMW:  A portable Level 3 BLAS Winograd variant of
c               Strassen's  matrix--matrix multiply algorithm,
c               Yale Computer Science Report 904, New Haven, CT, 1992.
c
c Contacts:     na.cdouglas@na-net.ornl.gov, na.heroux@na-net.ornl.gov,
c               slishmn@watson.ibm.com, and smith-roger@cs.yale.edu
c
c Date last
c modified:     May 14, 1992
c
c Algorithm:    Complex arithemtic for 3 multiplies instead of 4, as in
c               "The Design and Analysis of Computer Algorithms,"
c               by Aho, Hopcroft and Ullman, 1974.
c
c Naming
c Convention:   _name, where the underscore is replaced by 
c                   c   Single precision complex
c                   d   Double precision real
c                   s   Single precision real
c                   z   Double precision complex
c
c Module:       _gemul3 computes op(A)op(B) as
c
c                   (P+Qi) * (R+Si) = (PR-QS) + (PS + QR)i.
c
c Storage:      _gemul3 requires a storage area aux of size roughly
c
c                    naux = 4/3 n**2.
c
c               The units are either double or single precision real
c               (not complex).  See gemmw.c for the exact formula,
c               which has some latitude for this adaptive routine.
c
c Intended      The intended use of this code is to terminate complex
c Use:          data Strassen- Winograd recursions from _winos.  After
c               the last recursion this routine is used rather than a
c               conventional complex matrix multiply to obtain a
c               reduction in real matrix multiplications from 4 to 3.
c               This code employs only the residual aux storage
c               obtained by _gemmw.
c
c Implementation:  The particular implementation used here follows the
c                  steps below.
c
c               Step  Operation
c               ----  ---------
c                1.   Copies P to aux(0).  Iteratively stores vertical
c                     swaths of R-S in aux(kaux), and stores swaths of
c                     P*(R-S) in the lower half of C.  The swath width
c                     is however many columns will fit in the naux-m*k 
c                     (size of P) remaining elements of aux.
c
c                2.   Similar to step 1.  P+Q goes to aux(0).  Swaths
c                     of R go to aux(kaux).  Swaths of (P+Q)R go to
c                     upper half of C.
c
c                3.   The sum of the upper and lower halfs of C gives
c                     the imaginary part of the complex product.
c                     Store in the upper half of C.
c
c                4.   Similar to step 1.  P-Q goes to aux(0).  Swaths
c                     of S go to aux(kaux).  Swaths of (P-Q)S plus the
c                     lower half of C replace the lower half of C.
c                     This is the real part of the complex matrix
c                     product.
c
c                5.   Column-wise, the real and imaginary halves of C
c                     are merged to finish the complex product.
c
c#######################################################################
      character*1 forma, formb
      integer k, lda, ldb, ldc, m, n, naux
      double precision a(0:*), aux(0:*), b(0:*), c(0:*)
      integer i, iac1, iar1, ibc1, ibr1, ifail, ij, j, kaux, lda2, ldb2,
     * ldc2, nb, nn
      double precision neg1, one, zero
      data neg1 / -1.0 /
      data one / 1.0 /
      data zero / 0.0 /
c#######################################################################
c
c kaux  size of buffer for "A" derivatives
c nb    number of "B" cols for aux remainder
c
c#######################################################################
      kaux = m * k
      nb = (naux - kaux) / k
      ifail = 0
      if(nb.lt.1) then 
          write (*,*) 'Insufficient naux for zgemul3'
          stop
      endif
      lda2 = lda * 2
      ldb2 = ldb * 2
      ldc2 = ldc * 2
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'K'.or.forma.eq.'k')
     * then 
          iar1 = 2
          iac1 = lda2
      else
          iar1 = lda2
          iac1 = 2
      endif
      if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'K'.or.forma.eq.'k')
     * then 
          ibr1 = 2
          ibc1 = ldb2
      else
          ibr1 = ldb2
          ibc1 = 2
c#######################################################################
c
c STEP 1: P(R-S) to lower half of C ...
c
c#######################################################################
c copy P
      endif
      do 2000  j = 0,k-1
          call f06eff ( m, a(j * iac1), iar1, aux(j * m), 1 )
2000  continue
      do 2002  nn = 0, n-1, nb
c nb cols of P*(R-S)
          if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'t'.or.formb.eq.
     *'T') then 
              do 2004  j = nn,min(n - 1, nn + nb - 1)
                  do 2006  i = 0,k-1
                      ij = i*ibr1 + j*ibc1
                      aux(kaux + (j-nn)*k + i) = b(ij) - b(ij + 1)
2006              continue
2004          continue
          else
              do 2008  j = nn,min(n - 1, nn + nb - 1)
                  do 2010  i = 0,k-1
                      ij = i*ibr1 + j*ibc1
                      aux(kaux + (j-nn)*k + i) = b(ij) + b(ij + 1)
2010              continue
2008          continue
          endif
          call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m, 
     *aux(kaux), k, zero, c(m + nn*ldc2), ldc2 )
c#######################################################################
c
c STEP 2: (P+Q)R to upper half of C ...
c
c#######################################################################
c compute P+Q
2002  continue
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'t'.or.forma.eq.'T')
     * then 
          do 2012  j = 0,k-1
              do 2014  i = 0,m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) + a(ij + 1)
2014          continue
2012      continue
c conjugate
      else
          do 2016  j = 0,k-1
              do 2018  i = 0,m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) - a(ij + 1)
2018          continue
2016      continue
      endif
      do 2020  nn = 0, n-1, nb
c nb cols of (P+Q)*R
          do 2022  j = nn, min(n - 1, nn + nb - 1)
              call f06eff ( k, b(j*ibc1), ibr1, aux(kaux + (j-nn)*k), 1 
     *)
2022      continue
          call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m, 
     *aux(kaux), k, zero, c(nn*ldc2), ldc2 )
c#######################################################################
c
c STEP 3: (PS + QR)i to upper half of C ...
c
c#######################################################################
2020  continue
      call f01ctf ('N', 'N', m, n, one, c, ldc2, neg1, c(m), ldc2, c,
     *ldc2, ifail )
c#######################################################################
c
c STEP 4: (P-Q)S + P(R-S) to lower half of C ...
c
c#######################################################################
c compute P-Q
      if(forma.eq.'n'.or.forma.eq.'N'.or.forma.eq.'t'.or.forma.eq.'T')
     * then 
          do 2024  j=0, k-1
              do 2026  i=0, m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) - a(ij + 1)
2026          continue
2024      continue
c conjugate
      else
          do 2028  j=0, k-1
              do 2030  i=0, m-1
                  ij = i*iar1 + j*iac1
                  aux(j*m + i) = a(ij) + a(ij + 1)
2030          continue
2028      continue
c compute nb cols of (P-Q)S
      endif
      if(formb.eq.'n'.or.formb.eq.'N'.or.formb.eq.'t'.or.formb.eq.'T')
     * then 
          do 2032  nn = 0, n-1, nb
c nb cols of (P-Q)*S
              do 2034  j = nn, min(n - 1, nn + nb - 1)
                  call f06eff ( k, b(j*ibc1 + 1), ibr1, aux(kaux + (j-nn
     *)*k), 1 )
2034          continue
              call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m
     *, aux(kaux), k, one, c(nn*ldc2 + m), ldc2 )
2032      continue
c conjugate
      else
          do 2036  nn = 0, n-1, nb
c nb cols of (P-Q)*S
              do 2038  j = nn, min(n - 1, nn + nb - 1)
                  call f06fdf ( k, neg1, b(j*ibc1 + 1), ibr1, aux(kaux +
     * (j-nn)*k), 1 )
2038          continue
              call f06yaf ( 'N', 'N', m, min(nb, n - nn), k, one, aux, m
     *, aux(kaux), k, one, c(nn*ldc2 + m), ldc2 )
2036      continue
c#######################################################################
c
c STEP 5: Pair off real and imaginary elements, col by col of C ...
c
c#######################################################################
      endif
      do 2040  j = 0,n-1
          call f06eff ( m*2, c(j*ldc2), 1, aux, 1 )
          call f06eff ( m, aux(m), 1, c(j*ldc2), 2 )
          call f06eff ( m, aux(0), 1, c(j*ldc2+1), 2 )
2040  continue
      return
      end
