      program matmult
c
CVD$G NOCONCUR
c
c         Matrix multiply timing driver.  Forms the n-by-m product of
c         the n-by-n matrix a and the n-by-m matrix b four times:
c         serially and with nproc tasks using the column algorithm
c         (colalg), then serially and with nproc tasks using the dot
c         product algorithm (dotalg).  Serial results go to c, task
c         results go to d, and the two are compared at the end.
c
c         Command line:  matmult nproc n m
c            nproc  number of tasks, 1 to 16
c            n      order of a and row count of b, c, d
c            m      column count of b, c, d
c         n must lie in nproc..NMAX and m in nproc..MMAX.
c
c         The arrays a, b, c, d, the limits NMAX and MMAX, and the
c         logging, barrier, lock and task work areas are not declared
c         here; they are expected to come from matcom.h.
c
      include 'matcom.h'
      external colalg, dotalg
      character*80 arg
c
c         iota(ip) = ip.  Each task is passed its own element of iota
c         as its iproc argument instead of the loop index ip itself,
c         presumably so the value a started task sees cannot change
c         while ip advances -- confirm against the tasking library.
c
      integer iota(16)
      data iota /1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16/
c
c         Enable the Dynamic Multiprocessing Analysis Tool (DMAT).
c
      CALL LGENABLE
      CALL LGOPEN( logbufr, LENBUF, 'trace.matmult', -1 )
      CALL LGON
c
c         Get the USER input data from the command line.  Exactly
c         three arguments are required; otherwise print the usage
c         line on unit 0 and quit.
c
      nreps = 25
      if( iargc().ne.3 ) then
         call getarg(0,arg)
         write(0,1080) arg
         call exit( 1 )
      endif
      call getarg(1,arg)
      read(arg,*) nproc
      call getarg(2,arg)
      read(arg,*) n
      call getarg(3,arg)
      read(arg,*) m
c
c         Range checks.  nproc is limited to 16 by the size of iota.
c
      if( nproc.lt.1 .or. nproc.gt.16 ) then
         write(6,1050) nproc
         call exit( 1 )
      endif
      if( n.lt.nproc .or. n.gt.NMAX ) then
         write(6,1060) n, NMAX
         call exit( 1 )
      endif
      if( m.lt.nproc .or. m.gt.MMAX ) then
         write(6,1070) m, MMAX
         call exit( 1 )
      endif
c
c         Allocate the barriers & locks.  Neither is referenced again
c         in this file until they are released at the end.
c
      call barasgn( my_barrier, nproc )
      call lockasgn( my_lock, 0 )
c
c         Set up the matrices with random values.  a needs all n of
c         its columns filled and b, c, d need m columns, so j runs to
c         max(m,n).  Stopping at m would leave columns m+1..n of a
c         unset whenever m < n, although the multiplies read them.
c
      t0 = ticktock( temp )
CVD$ NOVECTOR
      do 20 j = 1, max( m, n )
         if( j.le.n .and. j.le.m ) then
c
c              Column j exists in all four matrices.
c
CVD$ NOVECTOR
            do 10 i = 1, n
               a(i,j) = rand( 0 )
               b(i,j) = rand( 0 )
               c(i,j) = rand( 0 )
               d(i,j) = rand( 0 )
 10         continue
         else if( j.le.m ) then
c
c              j > n: past the last column of a.
c
CVD$ NOVECTOR
            do 15 i = 1, n
               b(i,j) = rand( 0 )
               c(i,j) = rand( 0 )
               d(i,j) = rand( 0 )
 15         continue
         else
c
c              m < j <= n: only a has this column.
c
CVD$ NOVECTOR
            do 17 i = 1, n
               a(i,j) = rand( 0 )
 17         continue
         endif
 20   continue
      t1 = ticktock( temp )
      write(6,1000) nproc, n, m, t1-t0
c
c         Do the column algorithm multiply serially (one pass, result
c         in c).  Column k starts from the j = 1 term and then
c         accumulates the terms for j = 2..n.
c
      t1 = ticktock( temp )
      do 60 k = 1, m
CVD$ VECTOR
         do 30 i = 1, n
            c(i,k) = a(i,1)*b(1,k)
 30      continue
         do 50 j = 2, n
CVD$ VECTOR
            do 40 i = 1, n
               c(i,k) = c(i,k) + a(i,j)*b(j,k)
 40         continue
 50      continue
 60   continue
      t2 = ticktock( temp )
      write(6,1010) t2-t1
c
c         Do the multiply in parallel with column algorithm.  Tasks
c         2..nproc are started with tskstart and this program unit
c         itself does the work of task 1.  The value 3 stored in the
c         first word of each task array is presumably the task array
c         length expected by tskstart -- confirm against the tasking
c         library documentation.
c
      t3 = ticktock( temp )
CVD$ NOVECTOR
      do 70 ip = 2, nproc
         my_tskarry(1,ip) = 3
         call tskstart( my_tskarry(1,ip), colalg,m,n,nreps,
     $        iota(ip),nproc)
 70   continue
      call colalg(m,n,nreps,iota(1),nproc)
CVD$ NOVECTOR
      do 80 ip = 2, nproc
         call tskwait( my_tskarry(1,ip) )
 80   continue
      t4 = ticktock( temp )
c
c         colalg repeats the multiply nreps times, so t5 is the
c         average time of one parallel multiply; the speed up is the
c         single serial time divided by that average.
c
      t5 = (t4-t3)/float(nreps)
      write(6,1020) t5, (t2-t1)/t5
c
c         Do the dot product algorithm multiply serially (one pass,
c         result in c, replacing the column algorithm result).
c
      t1 = ticktock( temp )
      do 61 k = 1, m
         do 51 i = 1, n
            dott = 0.0e0
CVD$ VECTOR
CVD$ ASSOC
            do 41 j = 1, n
               dott = dott + a(i,j)*b(j,k)
 41         continue
            c(i,k) = dott
 51      continue
 61   continue
      t2 = ticktock( temp )
      write(6,1010) t2-t1
c
c         Do the multiply in parallel with a dot product algorithm,
c         using the same start/wait pattern as for colalg.
c
      t3 = ticktock( temp )
CVD$ NOVECTOR
      do 71 ip = 2, nproc
         my_tskarry(1,ip) = 3
         call tskstart( my_tskarry(1,ip), dotalg,m,n,nreps,
     $        iota(ip),nproc)
 71   continue
      call dotalg(m,n,nreps,iota(1),nproc)
CVD$ NOVECTOR
      do 81 ip = 2, nproc
         call tskwait( my_tskarry(1,ip) )
 81   continue
      t4 = ticktock( temp )
      t5 = (t4-t3)/float(nreps)
      write(6,1020) t5, (t2-t1)/t5
c
c         Check the answer.  At this point c holds the serial dot
c         product result and d the dotalg result, so only the dot
c         product pair is compared; the colalg output in d has been
c         overwritten.  The relative test is written without a
c         division so that an element c(i,j) = 0 cannot cause a
c         divide by zero.
c
      do 120 j = 1, m
CVD$ NOVECTOR
         do 110 i = 1, n
            if( abs( d(i,j)-c(i,j) ) .gt. 1.0e-4*abs( c(i,j) ) )
     1           write(6,1030) i,j,c(i,j),d(i,j)
 110     continue
 120  continue
c
c         Clean up and exit
c
      call barrel( my_barrier )
      call lockrel( my_lock )
      call lgoff
      call lgclose
      call exit( 0 )
 1000 format(' Matrix setup complete for nproc,n,m =',3i5,e16.7, ' Sec')
 1010 format(' Single Serial time  =  ',e16.7)
 1020 format(' Avg  Parallel time  =  ',e16.7,' Spd Up = ',f6.3)
 1030 format(' ERROR: i,j,true,parallel = ',2i4,2(e16.7))
 1050 format(' ERROR: invalid nproc =',i5,' < 16')
 1060 format(' ERROR: invalid n =',i5,' < NMAX =',i10)
 1070 format(' ERROR: invalid m =',i5,' < MMAX =',i10)
 1080 format(1x,a,': usage: matmult nproc (1,16)',
     $     ' n (nproc,1025) m (nproc,1025)')
      end
      subroutine colalg(m,n,nreps,iproc,nproc)
c
c         Column algorithm for the matrix-matrix multiply d = a*b,
c         as executed by one of nproc cooperating tasks.
c
c         m      number of columns of b and d
c         n      number of rows of a, b, d and of columns of a
c         nreps  number of times each column is recomputed
c         iproc  index of the calling task; 1<=iproc<=nproc is assumed
c         nproc  number of tasks sharing the work
c
c         Task iproc computes columns iproc, iproc+nproc, ... of d,
c         up to column m.  The arrays a, b and d are not arguments;
c         they are expected to come from matcom.h.
c
      include 'matcom.h'
c
      do 400 k = iproc, m, nproc
         do 300 l = 1, nreps
c
c              First term (j = 1) overwrites column k of d ...
c
CVD$ VECTOR
            do 100 i = 1, n
               d(i,k) = b(1,k)*a(i,1)
 100        continue
c
c              ... and the terms for j = 2..n are added onto it.
c
            do 250 j = 2, n
CVD$ VECTOR
               do 200 i = 1, n
                  d(i,k) = b(j,k)*a(i,j) + d(i,k)
 200           continue
 250        continue
 300     continue
 400  continue
      return
      end
      subroutine dotalg(m,n,nreps,iproc,nproc)
c
c         Dot product algorithm for the matrix-matrix multiply
c         d = a*b, as executed by one of nproc cooperating tasks.
c
c         m      number of columns of b and d
c         n      number of rows of a, b, d and of columns of a
c         nreps  number of times each column is recomputed
c         iproc  index of the calling task; 1<=iproc<=nproc is assumed
c         nproc  number of tasks sharing the work
c
c         Task iproc computes columns iproc, iproc+nproc, ... of d,
c         up to column m.  Each element d(i,k) is the dot product of
c         row i of a with column k of b, accumulated in the scalar
c         dott.  The arrays a, b and d are expected to come from
c         matcom.h.
c
      include 'matcom.h'
c
      do 400 k = iproc, m, nproc
         do 300 l = 1, nreps
            do 200 i = 1, n
               dott = 0.0e0
CVD$ VECTOR
CVD$ ASSOC
               do 100 j = 1, n
                  dott = b(j,k)*a(i,j) + dott
 100           continue
               d(i,k) = dott
 200        continue
 300     continue
 400  continue
      return
      end
      real function ticktock( temp )
c
c         Timer function: returns the value delivered by etime().
c         The dummy argument temp is not referenced.
c
c         This is the Alliant FX/8 fortran version.  The variant for
c         Sequent Unix 4.2 Bsd is kept here, commented out:
c
c$$$      external _gettime
c$$$      double precision ssec, usec
c$$$c
c$$$      call _gettime( ssec, usec )
c$$$      ticktock = sngl(usec+ssec)
c
      real temp
c
c         Work array filled in by etime; its contents are not used.
c
      real timarry(2)
c
      ticktock = etime( timarry )
      return
      end
