C This program times the Level 1 BLAS routine DDOT, the Level 2 C BLAS routine DGEMV, and the Level 3 BLAS routine DGEMM for C increasing size of order of the matrix. C C It is very simple-minded! C C It produces a file suitable for use directly with MATLAB. C C It assumes the cache is 64K - change subroutine FLUSH if C your cache is bigger. C C It uses a DOUBLE PRECISION FUNCTION SECOND, and by using C a user-supplied average megaflop rate (see AVMFLP below), C calculates the number of repetitions necessary to accumulate C a user-specified time (see AVTIME below). C C You must supply your own versions of DDOT, DGEMV and DGEMM, C either provided by yourself or preferably by your manufacturer. C C J. Dongarra, P. Mayes, G. Radicati C Program grnerates data for a paper entitled: C ``The IBM RS/6000 and Linear Algebra Operations'', to appear. C C C .. Parameters .. INTEGER MAXN, LDA DOUBLE PRECISION ALPHA, BETA PARAMETER (MAXN=512,LDA=530,ALPHA=1.1D0,BETA=0.9D0) C .. Local Scalars .. DOUBLE PRECISION AVMFLP, AVTIME, MFLP1, MFLP2, MFLP3, OPS, S0, S1, + S2, TEMP, TFLUSH, TIME1, TIME2, TIME3 INTEGER I, ICOUNT, II, IREP, J C .. Local Arrays .. DOUBLE PRECISION A(LDA,MAXN), B(LDA,MAXN), C(LDA,MAXN), X(MAXN), + XX(MAXN,0:100), Y(MAXN), YY(MAXN,0:100) C .. External Functions .. DOUBLE PRECISION DDOT, SECOND EXTERNAL DDOT, SECOND C .. External Subroutines .. EXTERNAL DGEMM, DGEMV, FLUSH C .. Intrinsic Functions .. INTRINSIC MAX, MOD, DBLE, NINT C .. Executable Statements .. C C In the following two lines, specify the average time you C would like each call to take, and the average megaflop rate C you expect to see. The number of times a call will be C repeated is then calculated accordingly. C AVTIME = 5.0D0 AVMFLP = 10.0D0 C C Firstly calculate the time taken to call FLUSH C S0 = SECOND() S1 = SECOND() DO 20 I = 1, 1000 CALL FLUSH 20 CONTINUE S2 = SECOND() TFLUSH = (S2-S1) - (S1-S0) TFLUSH = TFLUSH*1.0D-3 C C Initialize the data C DO 60 J = 1, MAXN DO 40 I = 1, MAXN A(I,J) = 1.1D0 B(I,J) = 1.2D0 C(I,J) = 1.3D0 40 CONTINUE X(J) = 1.01D0 Y(J) = 1.02D0 60 CONTINUE C C C WRITE (6,FMT=*) '%' WRITE (6,FMT=*) '% This file plots the speed of the three' WRITE (6,FMT=*) '% BLAS routines DDOT, DGEMV and DGEMM.' WRITE (6,FMT=*) '%' WRITE (6,FMT=*) 'data = [' C DO 140 I = 8, 512, 8 C OPS = 2.0D0*DBLE(I) IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6)) IREP = MAX(IREP,1) CALL FLUSH S0 = SECOND() S1 = SECOND() DO 80 ICOUNT = 1, IREP C C Here we ensure that we are doing DOT products out of C memory, without calling FLUSH each time (this probably C takes more time than DOT anyway), by using separate C columns of the arrays XX and YY. C II = MOD(ICOUNT,100) TEMP = DDOT(I,XX(1,II),1,YY(1,II),1) 80 CONTINUE S2 = SECOND() TIME1 = (S2-S1) - (S1-S0) MFLP1 = OPS*IREP/TIME1/1.0D6 C OPS = 2.0D0*DBLE(I)**2 IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)) IREP = MAX(IREP,1) S0 = SECOND() S1 = SECOND() DO 100 ICOUNT = 1, IREP CALL FLUSH C CALL DGEMV('Transpose',I,I,ALPHA,A,LDA,X,1,BETA,Y,1) CALL DGEMV('No Transpose',I,I,ALPHA,A,LDA,X,1,BETA,Y,1) 100 CONTINUE S2 = SECOND() TIME2 = (S2-S1) - (S1-S0) - IREP*TFLUSH MFLP2 = OPS*IREP/TIME2/1.0D6 C OPS = 2.0D0*DBLE(I)**3 IREP = NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)) IREP = MAX(IREP,1) S0 = SECOND() S1 = SECOND() DO 120 ICOUNT = 1, IREP CALL FLUSH C CALL DGEMM('Transpose','No transpose',I,I,I,ALPHA,A,LDA,B, CALL DGEMM('No Transpose','No transpose',I,I,I,ALPHA,A,LDA, + B,LDA,BETA,C,LDA) 120 CONTINUE S2 = SECOND() TIME3 = (S2-S1) - (S1-S0) - IREP*TFLUSH MFLP3 = OPS*IREP/TIME3/1.0D6 C WRITE (6,FMT='(I5,3F12.2)') I, MFLP1, MFLP2, MFLP3 140 CONTINUE WRITE (6,FMT=*) '];' WRITE (6,FMT=*) 'axis([0,512,0,50])' WRITE (6,FMT=*) 'plot(data(:,1),data(:,2),', + 'data(:,1),data(:,3),data(:,1),data(:,4))' WRITE (6,FMT=*) 'title(''RS/6000-530: Level 1, 2 and 3 BLAS'')' WRITE (6,FMT=*) 'xlabel(''Order of vectors/matrices'')' WRITE (6,FMT=*) 'ylabel(''Speed in Megaflops'')' STOP END SUBROUTINE FLUSH C C This subroutine causes 64KBytes to be initialized. If it C is called before a code fragment, it will make sure that C the cache is flushed completely. C C Alter the dimension of the matrix if the cache size C is not 65536 bytes, so that FOO is exactly the size C of the cache. C C .. Local Scalars .. INTEGER I C .. Local Arrays .. DOUBLE PRECISION FOO(8192) C .. Executable Statements .. DO 20 I = 1, 8192 FOO(I) = 1.1D0 20 CONTINUE RETURN END .