C     This program times the Level 1 BLAS routine DDOT, the Level 2
C     BLAS routine DGEMV, and the Level 3 BLAS routine DGEMM for 
C     increasing size of order of the matrix.
C
C     It is very simple-minded!
C
C     It produces a file suitable for use directly with MATLAB.
C
C     It assumes the cache is 64K - change subroutine FLUSH if
C     your cache is bigger. 
C
C     It uses a DOUBLE PRECISION FUNCTION SECOND, and by using
C     a user-supplied average megaflop rate (see AVMFLP below),
C     calculates the number of repetitions necessary to accumulate
C     a user-specified time (see AVTIME below).
C
C     You must supply your own versions of DDOT, DGEMV and DGEMM,
C     either provided by yourself or preferably by your manufacturer.
C
C     J. Dongarra, P. Mayes, G. Radicati
C     Program grnerates data for a paper entitled:
C     ``The IBM RS/6000 and Linear Algebra Operations'', to appear.
C
C
C     .. Parameters ..
      INTEGER          MAXN, LDA
      DOUBLE PRECISION ALPHA, BETA
      PARAMETER        (MAXN=512,LDA=530,ALPHA=1.1D0,BETA=0.9D0)
C     .. Local Scalars ..
      DOUBLE PRECISION AVMFLP, AVTIME, MFLP1, MFLP2, MFLP3, OPS, S0, S1,
     +                 S2, TEMP, TFLUSH, TIME1, TIME2, TIME3
      INTEGER          I, ICOUNT, II, IREP, J
C     .. Local Arrays ..
      DOUBLE PRECISION A(LDA,MAXN), B(LDA,MAXN), C(LDA,MAXN), X(MAXN),
     +                 XX(MAXN,0:100), Y(MAXN), YY(MAXN,0:100)
C     .. External Functions ..
      DOUBLE PRECISION DDOT, SECOND
      EXTERNAL         DDOT, SECOND
C     .. External Subroutines ..
      EXTERNAL         DGEMM, DGEMV, FLUSH
C     .. Intrinsic Functions ..
      INTRINSIC        MAX, MOD, DBLE, NINT
C     .. Executable Statements ..
C
C     In the following two lines, specify the average time you
C     would like each call to take, and the average megaflop rate
C     you expect to see. The number of times a call will be
C     repeated is then calculated accordingly.
C
      AVTIME = 5.0D0
      AVMFLP = 10.0D0
C
C     Firstly calculate the time taken to call FLUSH
C
      S0 = SECOND()
      S1 = SECOND()
      DO 20 I = 1, 1000
         CALL FLUSH
   20 CONTINUE
      S2 = SECOND()
      TFLUSH = (S2-S1) - (S1-S0)
      TFLUSH = TFLUSH*1.0D-3
C
C     Initialize the data
C
      DO 60 J = 1, MAXN
         DO 40 I = 1, MAXN
            A(I,J) = 1.1D0
            B(I,J) = 1.2D0
            C(I,J) = 1.3D0
   40    CONTINUE
         X(J) = 1.01D0
         Y(J) = 1.02D0
   60 CONTINUE
C
C
C
      WRITE (6,FMT=*) '%'
      WRITE (6,FMT=*) '% This file plots the speed of the three'
      WRITE (6,FMT=*) '% BLAS routines DDOT, DGEMV and DGEMM.'
      WRITE (6,FMT=*) '%'
      WRITE (6,FMT=*) 'data = ['
C
      DO 140 I = 8, 512, 8
C
         OPS = 2.0D0*DBLE(I)
         IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6))
         IREP = MAX(IREP,1)
         CALL FLUSH
         S0 = SECOND()
         S1 = SECOND()
         DO 80 ICOUNT = 1, IREP
C
C           Here we ensure that we are doing DOT products out of
C           memory, without calling FLUSH each time (this probably
C           takes more time than DOT anyway), by using separate
C           columns of the arrays XX and YY.
C
            II = MOD(ICOUNT,100)
            TEMP = DDOT(I,XX(1,II),1,YY(1,II),1)
   80    CONTINUE
         S2 = SECOND()
         TIME1 = (S2-S1) - (S1-S0)
         MFLP1 = OPS*IREP/TIME1/1.0D6
C
         OPS = 2.0D0*DBLE(I)**2
         IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH))
         IREP = MAX(IREP,1)
         S0 = SECOND()
         S1 = SECOND()
         DO 100 ICOUNT = 1, IREP
            CALL FLUSH
C           CALL DGEMV('Transpose',I,I,ALPHA,A,LDA,X,1,BETA,Y,1)
            CALL DGEMV('No Transpose',I,I,ALPHA,A,LDA,X,1,BETA,Y,1)
  100    CONTINUE
         S2 = SECOND()
         TIME2 = (S2-S1) - (S1-S0) - IREP*TFLUSH
         MFLP2 = OPS*IREP/TIME2/1.0D6
C
         OPS = 2.0D0*DBLE(I)**3
         IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH))
         IREP = MAX(IREP,1)
         S0 = SECOND()
         S1 = SECOND()
         DO 120 ICOUNT = 1, IREP
            CALL FLUSH
C           CALL DGEMM('Transpose','No transpose',I,I,I,ALPHA,A,LDA,B,
            CALL DGEMM('No Transpose','No transpose',I,I,I,ALPHA,A,LDA,
     +                 B,LDA,BETA,C,LDA)
  120    CONTINUE
         S2 = SECOND()
         TIME3 = (S2-S1) - (S1-S0) - IREP*TFLUSH
         MFLP3 = OPS*IREP/TIME3/1.0D6
C
         WRITE (6,FMT='(I5,3F12.2)') I, MFLP1, MFLP2, MFLP3
  140 CONTINUE
      WRITE (6,FMT=*) '];'
      WRITE (6,FMT=*) 'axis([0,512,0,50])'
      WRITE (6,FMT=*) 'plot(data(:,1),data(:,2),',
     +  'data(:,1),data(:,3),data(:,1),data(:,4))'
      WRITE (6,FMT=*) 'title(''RS/6000-530: Level 1, 2 and 3 BLAS'')'
      WRITE (6,FMT=*) 'xlabel(''Order of vectors/matrices'')'
      WRITE (6,FMT=*) 'ylabel(''Speed in Megaflops'')'
      STOP
      END
      SUBROUTINE FLUSH
C
C     This subroutine causes 64KBytes to be initialized. If it
C     is called before a code fragment, it will make sure that
C     the cache is flushed completely.
C
C     Alter the dimension of the matrix if the cache size
C     is not 65536 bytes, so that FOO is exactly the size
C     of the cache.
C
C     .. Local Scalars ..
      INTEGER          I
C     .. Local Arrays ..
      DOUBLE PRECISION FOO(8192)
C     .. Executable Statements ..
      DO 20 I = 1, 8192
         FOO(I) = 1.1D0
   20 CONTINUE
      RETURN
      END

.