c lu decomposition (this is essentially your 4 at a time algorithm c with a few coding changes. job,jn=-dng2-,t=15.us=bnchbch. account,ac=865606,us=u2240,upw=u2240. fetch,dn=dong2,text=for2,mf=v3. cft,i=dong2,l=0. fetch,dn=lltx ,text=for ,mf=v3. cft,i=lltx ,l=0. fetch,dn=smxpyc,text=cal3,mf=v3. cal,i=smxpyc,l=0. fetch,dn=sgemv,text=cal,mf=v3. cal,i=sgemv,l=0. dispose,dn=$out,dc=pr,sf=p,defer. ldr,nx,ab=lrun,mm=25000:5000,stk=5000:2000. lrun. exit. /eof ***deck dong2 for2 --- test driver 1cpu real mflops,aa(1001,1000),a(1001,1000),b(1001),x(1001) real emax common /statall/ aa,a,b,x,quit logical quit integer nsize(12) data nsize / 48,49,50,51,100 ,150,200,300,400,600 ,750,1000 / quit=.false. lda = 1001 c write(6,10) 10 format(' cholesky decomposition timing') write(6,1010) 1010 format(/,1x,100(1h-),/,5x,'version with cal sgemv',/, *1x,100(1h-)) do 100 nchoose = 1,10 n=nsize(nchoose) if ( n.gt. lda-1 ) stop do 30 j = 1,n do 20 i = j,n aa(i,j) = float(-i*j) aa(j,i) = aa(i,j) 20 continue aa(j,j) = float(j*n*n+1)/2.0 x(j) = 1.0 30 continue do 32 i = 1,n b(i) = 0.0 do 31 j = 1,n b(i) = b(i) + aa(i,j)*x(j) 31 continue 32 continue c write(6,40)lda,n 40 format(/' sp size of the arrays',i5,' and order is ',i5/) c do 43 j = 1,n do 42 i = 1,n a(i,j) = aa(i,j) 42 continue 43 continue t1 = second(t1) call llt1(a,lda,n,x,ierr) t2 = second(t2) - t1 call llts(a,lda,n,x,b) emax = 0.0 do 41 i = 1,n emax = amax1(emax,abs(x(i))) 41 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,50) t2,mflops,emax 50 format(' normal matrix*vector',f9.3,' mflops=',f9.4, $ ' check=',e12.3) c go to 2222]skip lltp for now. do 73 j = 1,n do 72 i = 1,n a(i,j) = aa(i,j) 72 continue 73 continue t1 = second(t1) ii1=irtc() call lltp(a,lda,n,x,ierr) ii2=irtc() t2 = second(t2) - t1 if( ierr .ne. 0 ) write(6,*)' ierr =',ierr call llts(a,lda,n,x,b) emax = 0.0 do 71 i = 1,n emax = amax1(emax,abs(x(i))) 71 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,80) t2,mflops,emax 80 format(' parallel version time = ',e12.3,' mflops = ',f9.4, $ ' check = ',e12.3) 2222 continue c do 83 j = 1,n do 82 i = 1,n a(i,j) = aa(i,j) 82 continue 83 continue t1 = second(t1) jj1=irtc() call lltx(a,lda,n,x,ierr) jj2=irtc() t2 = second(t2) - t1 call llts(a,lda,n,x,b) emax = 0.0 do 81 i = 1,n emax = amax1(emax,abs(x(i))) 81 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,90) t2,mflops,emax 90 format(' 4-fold matrix*vector',f9.3,' mflops=',f9.4, $ ' check=',e12.3) goto 100 tpar=9.5e-9*(ii2-ii1-3) tsng=9.5e-9*(jj2-jj1-3) flp = float(n*(7+n*(3+n*2)))/(6.*tpar*1.0e6) fls = float(n*(7+n*(3+n*2)))/(6.*tsng*1.0e6) speed=tsng/tpar write(6,95) tpar,tsng,flp,fls,speed 95 format(' clock times par&sng',2g16.6,/, 1' megaflops',2g16.6,/, 2' speedup',g16.6) 100 continue quit=.true. call killcpu stop end subroutine llts (a,lda,n,x,b) real a(lda,*), x(*), b(*), xk c c purpose: c solve the symmetric positive definite system ax = b given the c cholesky factorization of a (as computed in llt). c c additional parameters (not parameters of llt): c c x real(n), solution of linear system c c b real(n), right-hand-side of linear system c c ---------------------------------------------------------------------- c do 10 k = 1, n x(k) = b(k) 10 continue c do 30 k = 1, n xk = x(k)*a(k,k) do 20 i = k+1, n x(i) = x(i) - a(i,k)*xk 20 continue x(k) = xk 30 continue c do 50 k = n, 1, -1 xk = x(k)*a(k,k) do 40 i = 1, k-1 x(i) = x(i) - a(k,i)*xk 40 continue x(k) = xk 50 continue c return end subroutine llt1(a,lda,n,rowi,info) c ** llt.f -- llt decomposition c integer n,lda,info real a(lda,n),rowi(n),t character*1 charn c charn='n' info = 0 do 30 i = 1, n do 10 j = 1, i-1 rowi(j) = -a(i,j) 10 continue c c call smxpy1(n-i+1,a(i,i),i-1,lda,rowi,a(i,1)) c charn='n' call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) c if ( infos.ne.0 ) then c write(6,77) infos,i c7 format(' ????? info ne 0',6x,'info',i5,' i',i5) c stop c endif c if (a(i,i) .le. 0) then info = i go to 40 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) do 20 j = i+1, n a(j,i) = t*a(j,i) 20 continue 30 continue 40 return end subroutine lltxo(a,lda,n,rowi,info) c ** llt.f -- llt decomposition c integer n,lda,info real a(lda,n),rowi(n),t character*1 charn c info = 0 charn='n' do 30 i = 1, n do 10 j = 1, i-1 rowi(j) = -a(i,j) 10 continue c call smxpyc(n-i+1,a(i,i),i-1,lda,rowi,a(i,1)) c charn='n' call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) c if (a(i,i) .le. 0) then info = i go to 40 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) do 20 j = i+1, n a(j,i) = t*a(j,i) 20 continue 30 continue 40 return end subroutine smxpy1 (n1,y,n2,ldm,x,m) c ** smxpy01.f -- smxpy unrolled to a depth of 1 c integer ldm,n1,n2 real y(n1),x(n2),m(ldm,n2) c do 20 j = 1, n2 do 10 i = 1, n1 y(i) = (y(i)) + x(j)*m(i,j) 10 continue 20 continue return end subroutine smxpyx(n1,y,n2,ldm,x,m) c ** smxpy16.f -- smxpy unrolled to a depth of 16 c integer ldm,n1,n2 real y(n1),x(n2),m(ldm,n2) c c cleanup odd vector c j = mod(n2,2) if (j .ge. 1) then do 10 i = 1, n1 y(i) = (y(i)) + x(j)*m(i,j) 10 continue endif c c cleanup odd group of two vectors c j = mod(n2,4) if (j .ge. 2) then do 20 i = 1, n1 y(i) = ( (y(i)) $ + x(j-1)*m(i,j-1)) + x(j)*m(i,j) 20 continue endif c c cleanup odd group of four vectors c j = mod(n2,8) if (j .ge. 4) then do 30 i = 1, n1 y(i) = ((( (y(i)) $ + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2)) $ + x(j-1)*m(i,j-1)) + x(j) *m(i,j) 30 continue endif c c cleanup odd group of eight vectors c j = mod(n2,16) if (j .ge. 8) then do 40 i = 1, n1 y(i) = ((((((( (y(i)) $ + x(j-7)*m(i,j-7)) + x(j-6)*m(i,j-6)) $ + x(j-5)*m(i,j-5)) + x(j-4)*m(i,j-4)) $ + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2)) $ + x(j-1)*m(i,j-1)) + x(j) *m(i,j) 40 continue endif c c main loop - groups of sixteen vectors c jmin = j+16 do 60 j = jmin, n2, 16 do 50 i = 1, n1 y(i) = ((((((((((((((( (y(i)) $ + x(j-15)*m(i,j-15)) + x(j-14)*m(i,j-14)) $ + x(j-13)*m(i,j-13)) + x(j-12)*m(i,j-12)) $ + x(j-11)*m(i,j-11)) + x(j-10)*m(i,j-10)) $ + x(j- 9)*m(i,j- 9)) + x(j- 8)*m(i,j- 8)) $ + x(j- 7)*m(i,j- 7)) + x(j- 6)*m(i,j- 6)) $ + x(j- 5)*m(i,j- 5)) + x(j- 4)*m(i,j- 4)) $ + x(j- 3)*m(i,j- 3)) + x(j- 2)*m(i,j- 2)) $ + x(j- 1)*m(i,j- 1)) + x(j) *m(i,j) 50 continue 60 continue return end ***deck lltx for --- decomposition routine subroutine lltx (a,lda,n,rowi,info) c ** llt.f -- llt decomposition integer n,lda,info real a(lda,n),rowi(n),t c timings on 1 cpu of an x-mp2.16 c n time seconds megaflops c 50 0.000787 54.5785 c 100 0.003107 108.9173 c 150 0.008226 138.1521 c 200 0.017643 152.2911 c 300 0.053245 169.8830 c 400 0.121649 176.0298 c 600 0.394672 182.8876 c 750 0.762169 184.8765 c 1000 1.794049 186.0788 common /qqqw/ inorml character*1 inorml real rowip1(2000) real rowip2(2000) real rowip3(2000) inorml='n' info = 0 n1=n-mod(n,4) do 30 i = 1, n1,4 do 10 j = 1, i-1 rowi (j) = -a(i ,j) rowip1(j) = -a(i+1,j) rowip2(j) = -a(i+2,j) rowip3(j) = -a(i+3,j) 10 continue call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,info ) if( i .ne. n ) then call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda * ,rowip1,1 ,a(i+1,i+1),1 ,info ) endif if( i .lt. n-1 ) then call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda * ,rowip2,1 ,a(i+2,i+2),1 ,info ) endif if( i .lt. n-2 ) then call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda * ,rowip3,1 ,a(i+3,i+3),1 ,info ) endif if (a(i,i) .le. 0) then info = i go to 70 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) j=i+1 a(j,i) = a(i,i)*a(j,i) a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21 cdir$ ivdep do 20 j = i+2, n a(j,i) = a(i,i)*a(j,i) a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21 20 continue c the missing pieces for i+1 if (a(i+1,i+1) .le. 0) then info = i+1 go to 70 endif a(i+1,i+1) = 1./sqrt(a(i+1,i+1)) c t = a(i+1,i+1) j=i+2 a(j,i+1) = a(i+1,i+1)*a(j,i+1) a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) ) $ - a(i+2,i+1)*a(j,i+1) cdir$ ivdep do 22 j = i+3, n a(j,i+1) = a(i+1,i+1)*a(j,i+1) a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) ) $ - a(i+2,i+1)*a(j,i+1) 22 continue c the missing pieces for i+2 if (a(i+2,i+2) .le. 0) then info = i+2 go to 70 endif a(i+2,i+2) = 1./sqrt(a(i+2,i+2)) c t = a(i+2,i+2) j=i+3 a(j,i+2) = a(i+2,i+2)*a(j,i+2) a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) ) $ - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2) cdir$ ivdep do 24 j = i+4, n a(j,i+2) = a(i+2,i+2)*a(j,i+2) a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) ) $ - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2) 24 continue c the missing pieces for i+3 if (a(i+3,i+3) .le. 0) then info = i+3 go to 70 endif a(i+3,i+3) = 1./sqrt(a(i+3,i+3)) c t = a(i+3,i+3) cdir$ ivdep do 26 j = i+4, n a(j,i+3) = a(i+3,i+3)*a(j,i+3) 26 continue 30 continue do 60 i = n1+1, n do 40 j = 1, i-1 rowi(j) = -a(i,j) 40 continue call sgemv (inorml,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) if (a(i,i) .le. 0) then info = i go to 70 endif a(i,i) = 1/sqrt(a(i,i)) t = a(i,i) do 50 j = i+1, n a(j,i) = t*a(j,i) 50 continue 60 continue 70 continue return end ***deck smxpyc cal3 --- probably not needed * subroutine smxpyc( n1,y,n2,ldm,x,m ) * real y(n1),x(n2),m(ldm,n2) * do 10 j=1,n2 * do 10 i=1,n1 *10 y(i)=y(i) +x(j)*m(i,j) * return * end ident smxpyc n1 defarg y defarg n2 defarg ldm defarg x defarg m defarg ofx defb temp ofxpls defb temp ofyi1 defb temp ofmi1 defb temp n2val defb temp smxpyc enter argadd a2,n2,use=a7 a2 ,a2 argadd a1,n1,argptr=a7 a1 ,a1 argadd a5,ldm,argptr=a7 argadd a3,y,argptr=a7 a5 ,a5 argadd a4,x,argptr=a7 argadd a6,m,argptr=a7 b.ofyi1 a3 b.ofx a4 b.ofmi1 a6 a0 -a2 b.n2val a2 jap return * -------------------------------- * jsize = * *** determine vect len to use this pass *** a1:jtot *** a6:scratch *** a7: vl output * --------- load initial x as soon as possible --------- a0 b.ofx a4 b.n2val vl a4 v1 ,a0,1 a3 vl no. of x in v1 * ------------------------------------------------------ a6 128 a7=64 $ if( a1 ge 128 ) goto sized a0 a1-a6 a7 64 jap sized a6 64 a7=a1 $ if( a1 le 64 ) goto sized a0 a6-a1 a7 a1 jap sized s1 a1 ( 641 a7 s1 sized vl a7 a0 b.ofyi1 v0 ,a0,1 load y(i1+) a1 a1-a7 a1: new jtot a2 b.ofx b.ofxpls a2 reset counter in b.ofxpls a4 b.n2val no. x left to process a6 b.ofmi1 addr to start loading m from a2 a6+a7 b.ofmi1 a2 update for next time around j xloaded newx = * refresh x which contains scalars vl a4 a0 b.ofxpls v1 ,a0,1 x values a3 vl no. of x in v1 xloaded a2 b.ofxpls a2 a2+a3 vl a7 reset vect length b.ofxpls a2 update addr of x a2 0 counter for x in v1 a4 a4-a3 * --------------------------------------------------------------- workloop = * a0 a6 v2 ,a0,1 load m s1 v1,a2 get x a2 a2+1 a6 a6+a5 v3 s1*rv2 a0 a2-a3 v6 v0+fv3 new y jaz xout2 * - - - - - - - - - - - - - - - - - - - - - - - - - a0 a6 v4 ,a0,1 load m s1 v1,a2 get x a2 a2+1 a6 a6+a5 v5 s1*rv4 a0 a2-a3 v0 v6+fv5 new y jan workloop * ----------------------------------------------- * xout = * x counter ran out a0 a4 check if done with all x jaz newi1 j newx newi1 = * done with all x for these values a0 b.ofyi1 of y s0 a1 a2 b.ofyi1 a2 a2+a7 b.ofyi1 a2 ,a0,1 v0 store y jsn jsize go back for new i1 return exit xout2 v0 v6 j xout end ***deck sgemv cal --- matrix vector kernal * subroutine sgemv * * linpak level2 blas * ident sgemv trans defarg m defarg n defarg alpha defarg aa defarg lda defarg x defarg incx defarg y defarg incy defarg info defarg mval defb temp value of m nval defb temp value of n ofx defb temp loc of (@)x(1) ofy defb temp @y(1) ofa1j defb temp @a( 1,j1) ofaij defb temp @a(i1,j1) mleft defb temp # of m remaining in yloop y1r defb temp temp sto for 1st resid yloop ldas = 4 lda incxs = 5 incx incys = 6 incy a7s = 7 a7 sgemv enter s2 rt ****************************** argadd a1,trans,use=a7 argadd a2,m,argptr=a7 argadd a3,n,argptr=a7 argadd a4,lda,argptr=a7 argadd a5,incx,argptr=a7 argadd a6,incy,argptr=a7 s1 ,a1 trans a2 ,a2 m a3 ,a3 n a4 ,a4 lda a5 ,a5 incx a6 ,a6 incy a1 7 load v0 with a list of the legal vl a1 values of trans a0 $tstrns v0 ,a0,1 v1 s1-v0 vm v1,z tells which one of the legal values trans (s1) is. a0 a2 test m,n,lda for illegal values. s0 +a3 jam err2 m<0 ? a0 a4-a2 lda-m jsm err3 n<0 ? jam err6 m>lda ? s0 +a5 a0 a6 b.mval a2 b.nval a3 s.ldas a4 jsp incxp find iabs(incx)&iabs(incy) a5 a0-a5 incxp jap incyp a6 a0-a6 incyp = * s1 vm a2 zs1 a1 2 s.incxs a5 (s5=incx) a0 a1-a2 * 2-a2 argadd a3,alpha,argptr=a7 s.incys a6 (s6=incy) jam transp trans not in the set (' ','n','n') ? a1 64 a2 b.mval a0 a1-a2 argadd a4,x,argptr=a7 argadd a5,y,argptr=a7 argadd a6,aa,argptr=a7 vl a2 jap $xxx1 a2 vl a2 a2+a1 s1 a2 s1 s1>1 a2 s1 vl a2 $xxx1 = * a0 b.mval a1 b.nval s0 a1 a2 vl now : a2=1st resid of m jaz null m=0 ? jsz null n=0 ? s1 ,a3 value of alpha b.y1r a2 1st y1 resid s.a7s a7 save the value of a7 to s7 b.ofx a4 b.ofy a5 b.ofa1j a6 * -------------------------------- * xloop = * get 1st vector of x values a0 b.ofx a7 b.nval vl a7 a6 s.incxs v7 ,a0,a6 a1 vl # of x values this strip a6 a1*a6 stride in memory (for x) a7 a7-a1 a4 b.ofy b.nval a7 updated # of n left a7 b.ofx a7 a7+a6 a5 b.ofa1j b.ofaij a5 b.ofx a7 updated by vector v0 s1*rv7 alpha*x a2 b.y1r initial vl for y&a loops a7 b.mval b.mleft a7 vl a0 no race (with y store & reload) a0 dummy no race (with y store & reload) ,a0,a0 v1 no race (with y store & reload) vl a2 yloop = * a0 a4 @y(i1) a6 s.incys v1 ,a0,a6 y a2 vl a7 b.mleft a7 a7-a2 a6 0 j counter for jloop b.mleft a7 a7 s.ldas jloop = * do j = 0,a1-1 * note: must be unrolled 7 times to get * max speed a0 a5 v2 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v3 s3*rv2 v4 v1+fv3 jaz jloop$1 note: this is the exit taken for a1=64 a0 a5 v5 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v6 s3*rv5 v7 v4+fv6 jaz jloop$2 a0 a5 v1 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v2 s3*rv1 v3 v2+fv7 jaz jloop$3 a0 a5 v4 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v5 s3*rv4 v6 v5+fv3 jaz jloop$4 a0 a5 v7 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v1 s3*rv7 v2 v1+fv6 jaz jloop$5 a0 a5 v3 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v4 s3*rv3 v5 v4+fv2 jaz jloop$6 a0 a5 v6 ,a0,1 a(i1+,j) s3 v0,a6 a6 a6+1 a5 a5+a7 a0 a1-a6 v7 s3*rv6 v1 v7+fv5 jan jloop * ---- used all of current x strip store y & check for a 2nd strip * ---- of y's a0 a4 a6 s.incys ,a0,a6 v1 j qqqq jloop$1 a0 a4 a6 s.incys ,a0,a6 v4 qqqq a6 a6*a2 a0 b.mleft a5 b.ofaij a5 a5+a2 a4 a4+a6 b.ofaij a5 a2 b.mleft vl a2 jan yloop a2 s.ldas a2 a1*a2 a0 b.nval a3 b.ofa1j a3 a3+a2 b.ofa1j a3 jan xloop a7 s.a7s null argadd a4,info,argptr=a7 a1 0 ,a4 a1 return = * s6 rt s1 s6-s2 exit * -------------------------------- * jloop$2 a0 a4 a6 s.incys ,a0,a6 v7 j qqqq jloop$3 a0 a4 a6 s.incys ,a0,a6 v3 j qqqq jloop$4 a0 a4 a6 s.incys ,a0,a6 v6 j qqqq jloop$5 a0 a4 a6 s.incys ,a0,a6 v2 j qqqq jloop$6 a0 a4 a6 s.incys ,a0,a6 v5 j qqqq * -------------------------------------- * * -------------------------------------- * transp = * a1 6 a0 a1-a2 jam err1 trans not legal ? * ----- ----- ----- ----- ----- ----- ----- -----* * note: below is a copy of the non-transposed case with * * some modifications. all statements which are * * different from the first case have been indicated * * by a ^ in the comment field * * ----- ----- ----- ----- ----- ----- ----- -----* a1 64 a2 b.nval ^ a0 a1-a2 argadd a4,x,argptr=a7 argadd a5,y,argptr=a7 argadd a6,aa,argptr=a7 vl a2 jap $xxx2 a2 vl a2 a2+a1 s1 a2 s1 s1>1 a2 s1 vl a2 $xxx2 = * a0 b.nval ^ a1 b.mval ^ s0 a1 a2 vl now : a2=1st resid of m jaz null m=0 ? jsz null n=0 ? s1 ,a3 value of alpha b.y1r a2 1st y1 resid s.a7s a7 save the value of a7 to s7 b.ofx a4 b.ofy a5 b.ofa1j a6 * -------------------------------- * xloopt = * get 1st vector of x values a0 b.ofx a7 b.mval ^ vl a7 a6 s.incxs v7 ,a0,a6 a1 vl # of x values this strip a6 a1*a6 stride in memory (for x) a7 a7-a1 a4 b.ofy b.mval a7 ^ updated # of n left a7 b.ofx a7 a7+a6 a5 b.ofa1j b.ofaij a5 b.ofx a7 updated by vector v0 s1*rv7 alpha*x a2 b.y1r initial vl for y&a loops a7 b.nval ^ b.mleft a7 < b.nleft > vl a0 no race (with y store & reload) a0 dummy no race (with y store & reload) ,a0,a0 v1 no race (with y store & reload) vl a2 yloopt = * a0 a4 @y(j1) a6 s.incys v1 ,a0,a6 y a2 vl a7 b.mleft < b.nleft > a7 a7-a2 a6 0 i counter for iloop b.mleft a7 < b.nleft > a7 s.ldas iloop = * do i = 0,a1-1 * note: must be unrolled 7 times to get * max speed a0 a5 v2 ,a0,a7 ^ a(i,j1+) s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v3 s3*rv2 v4 v1+fv3 jaz iloop$1 note: this is the exit taken for a1=64 a0 a5 v5 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v6 s3*rv5 v7 v4+fv6 jaz iloop$2 a0 a5 v1 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v2 s3*rv1 v3 v2+fv7 jaz iloop$3 a0 a5 v4 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v5 s3*rv4 v6 v5+fv3 jaz iloop$4 a0 a5 v7 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v1 s3*rv7 v2 v1+fv6 jaz iloop$5 a0 a5 v3 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v4 s3*rv3 v5 v4+fv2 jaz iloop$6 a0 a5 v6 ,a0,a7 ^ s3 v0,a6 a6 a6+1 a5 a5+1 ^ a0 a1-a6 v7 s3*rv6 v1 v7+fv5 jan iloop * ---- used all of current x strip store y & check for a 2nd strip * ---- of y's a0 a4 a6 s.incys ,a0,a6 v1 j rrrr iloop$1 a0 a4 a6 s.incys ,a0,a6 v4 rrrr a6 a6*a2 * not needed a7 s.ldas ^ a7 a7*a2 ^ a0 b.mleft a5 b.ofaij a5 a5+a7 ^ a4 a4+a6 b.ofaij a5 a2 b.mleft vl a2 jan yloopt a2 s.ldas * ^^^^^ a2 a1*a2 ^ a0 b.mval ^ a3 b.ofa1j a3 a3+a1 ^ b.ofa1j a3 jan xloopt a7 s.a7s * -------------------------------- * argadd a4,info,argptr=a7 a1 0 ,a4 a1 = * s6 rt s1 s6-s2 exit * -------------------------------- * iloop$2 a0 a4 a6 s.incys ,a0,a6 v7 j rrrr iloop$3 a0 a4 a6 s.incys ,a0,a6 v3 j rrrr iloop$4 a0 a4 a6 s.incys ,a0,a6 v6 j rrrr iloop$5 a0 a4 a6 s.incys ,a0,a6 v2 j rrrr iloop$6 a0 a4 a6 s.incys ,a0,a6 v5 j rrrr * -------------------------------------- * * ----------- error exits ------------ err1 = * a1 1 j errorx err2 = * * v1 s1-v0 * vm v1,z tells which one of the legal values trans (s1) is. s1 vm a2 zs1 a0 a1-a2 * 7-a2 jam err1 trans not in the set $tstrns ? a1 2 j errorx err3 = * * v1 s1-v0 * vm v1,z tells which one of the legal values trans (s1) is. s1 vm a2 zs1 a0 a1-a2 * 7-a2 jam err1 trans not in the set $tstrns ? a1 3 j errorx err6 = * * v1 s1-v0 * vm v1,z tells which one of the legal values trans (s1) is. s1 vm a2 zs1 a0 a1-a2 * 7-a2 jam err1 trans not in the set $tstrns ? a1 6 * j errorx errorx = * argadd a2,info,argptr=a7 ,a2 a1 j return endp $tstrns con ' 'l con 'n'l con 'n'l con 't'l con 'c'l con 't'l con 'c'l dummy bssz 1 end c micro-tasked version c replace dong2 for with dong2p for2p c replace lltx for with lltpx for c should get "good" cpu utilization for 1 2 or 4 cpus. ***deck dong2p job --- parrallel driver jcl job,jn=-dng2-,t=39,us=bnchbch. account,ac=865606,us=u2240,upw=u2240. access,dn=$proc,pdn=microproc,id=jmn ,own=lps . micro,level=new. fetch,dn=dong2,text=for2p,mf=v3. cft,i=dong2,l=0. fetch,dn=lltx ,text=for ,mf=v3. cft,i=lltx ,l=0. fetch,dn=lltpx,text=for ,mf=v3. *. cft,i=lltpx,l=0. premult,i=lltpx. cft,i=$multf,l=0,alloc=stack. cal,i=$multc,l=0. *. fetch,dn=smxpyc,text=cal3,mf=v3. cal,i=smxpyc,l=0. fetch,dn=sgemv,text=cal,mf=v3. cal,i=sgemv,l=0. dispose,dn=$out,dc=pr,sf=p,defer. ldr,nx,ab=lrun,mm=25000:5000,stk=5000:2000,lib=multlib. lrun. exit. /eof ***deck dong2 for2p --- test driver real mflops,aa(1001,1000),a(1001,1000),b(1001),x(1001) real emax common /statall/ aa,a,b,x,quit logical quit integer nsize(12) data nsize / 48,49,50,51,100 ,150,200,300,400,600 ,750,1000 / quit=.false. lda = 1001 call getcpus(4) c write(6,10) 10 format(' cholesky decomposition timing') write(6,1010) 1010 format(/,1x,100(1h-),/,5x,'version with cal sgemv',/, *1x,100(1h-)) call secinit do 100 nchoose = 10,12 n=nsize(nchoose) if ( n.gt. lda-1 ) stop do 30 j = 1,n do 20 i = j,n aa(i,j) = float(-i*j) aa(j,i) = aa(i,j) 20 continue aa(j,j) = float(j*n*n+1)/2.0 x(j) = 1.0 30 continue do 32 i = 1,n b(i) = 0.0 do 31 j = 1,n b(i) = b(i) + aa(i,j)*x(j) 31 continue 32 continue c write(6,40)lda,n 40 format(/' ------------------lda',i5,' and order is ',i5/) c do 43 j = 1,n do 42 i = 1,n a(i,j) = aa(i,j) 42 continue 43 continue t1 = second(t1) call llt1(a,lda,n,x,ierr) t2 = second(t2) - t1 call llts(a,lda,n,x,b) emax = 0.0 do 41 i = 1,n emax = amax1(emax,abs(x(i))) 41 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,50) t2,mflops,emax 50 format(' normal matrix*vector',f9.6,' mflops=',f9.4, $ ' check=',e12.3) c ***** go to 2222]skip lltp for now. do 73 j = 1,n do 72 i = 1,n a(i,j) = aa(i,j) 72 continue 73 continue t1 = second(t1) ii1=irtc() call lltpx(a,lda,n,x,ierr) ii2=irtc() t2 = second(t2) - t1 if( ierr .ne. 0 ) write(6,*)' ierr =',ierr call llts(a,lda,n,x,b) emax = 0.0 do 71 i = 1,n emax = amax1(emax,abs(x(i))) 71 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,80) t2,mflops,emax 80 format(' parallel',12x,f9.6,' mflops=',f9.4, $ ' check=',e12.3) 2222 continue c do 83 j = 1,n do 82 i = 1,n a(i,j) = aa(i,j) 82 continue 83 continue t1 = second(t1) jj1=irtc() call lltx(a,lda,n,x,ierr) jj2=irtc() t2 = second(t2) - t1 call llts(a,lda,n,x,b) emax = 0.0 do 81 i = 1,n emax = amax1(emax,abs(x(i))) 81 continue mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6) write(6,90) t2,mflops,emax 90 format(' 4-fold matrix*vector',f9.6,' mflops=',f9.4, $ ' check=',e12.3) goto 100 tpar=9.5e-9*(ii2-ii1-3) tsng=9.5e-9*(jj2-jj1-3) flp = float(n*(7+n*(3+n*2)))/(6.*tpar*1.0e6) fls = float(n*(7+n*(3+n*2)))/(6.*tsng*1.0e6) speed=tsng/tpar write(6,95) tpar,tsng,flp,fls,speed 95 format(' clock times par&sng',2g16.6,/, 1' megaflops',2g16.6,/, 2' speedup',g16.6) 100 continue quit=.true. call relcpus stop end subroutine llts (a,lda,n,x,b) real a(lda,*), x(*), b(*), xk c c purpose: c solve the symmetric positive definite system ax = b given the c cholesky factorization of a (as computed in llt). c c additional parameters (not parameters of llt): c c x real(n), solution of linear system c c b real(n), right-hand-side of linear system c c ---------------------------------------------------------------------- c do 10 k = 1, n x(k) = b(k) 10 continue c do 30 k = 1, n xk = x(k)*a(k,k) do 20 i = k+1, n x(i) = x(i) - a(i,k)*xk 20 continue x(k) = xk 30 continue c do 50 k = n, 1, -1 xk = x(k)*a(k,k) do 40 i = 1, k-1 x(i) = x(i) - a(k,i)*xk 40 continue x(k) = xk 50 continue c return end subroutine llt1(a,lda,n,rowi,info) c ** llt.f -- llt decomposition c integer n,lda,info real a(lda,n),rowi(n),t character*1 charn c charn='n' info = 0 do 30 i = 1, n do 10 j = 1, i-1 rowi(j) = -a(i,j) 10 continue c c call smxpy1(n-i+1,a(i,i),i-1,lda,rowi,a(i,1)) c charn='n' call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) c if ( infos.ne.0 ) then c write(6,77) infos,i c7 format(' ????? info ne 0',6x,'info',i5,' i',i5) c stop c endif c if (a(i,i) .le. 0) then info = i go to 40 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) do 20 j = i+1, n a(j,i) = t*a(j,i) 20 continue 30 continue 40 return end subroutine lltxo(a,lda,n,rowi,info) c ** llt.f -- llt decomposition c integer n,lda,info real a(lda,n),rowi(n),t character*1 charn c info = 0 charn='n' do 30 i = 1, n do 10 j = 1, i-1 rowi(j) = -a(i,j) 10 continue c call smxpyc(n-i+1,a(i,i),i-1,lda,rowi,a(i,1)) c charn='n' call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) c if (a(i,i) .le. 0) then info = i go to 40 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) do 20 j = i+1, n a(j,i) = t*a(j,i) 20 continue 30 continue 40 return end subroutine smxpy1 (n1,y,n2,ldm,x,m) c ** smxpy01.f -- smxpy unrolled to a depth of 1 c integer ldm,n1,n2 real y(n1),x(n2),m(ldm,n2) c do 20 j = 1, n2 do 10 i = 1, n1 y(i) = (y(i)) + x(j)*m(i,j) 10 continue 20 continue return end subroutine smxpyx(n1,y,n2,ldm,x,m) c ** smxpy16.f -- smxpy unrolled to a depth of 16 c integer ldm,n1,n2 real y(n1),x(n2),m(ldm,n2) c c cleanup odd vector c j = mod(n2,2) if (j .ge. 1) then do 10 i = 1, n1 y(i) = (y(i)) + x(j)*m(i,j) 10 continue endif c c cleanup odd group of two vectors c j = mod(n2,4) if (j .ge. 2) then do 20 i = 1, n1 y(i) = ( (y(i)) $ + x(j-1)*m(i,j-1)) + x(j)*m(i,j) 20 continue endif c c cleanup odd group of four vectors c j = mod(n2,8) if (j .ge. 4) then do 30 i = 1, n1 y(i) = ((( (y(i)) $ + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2)) $ + x(j-1)*m(i,j-1)) + x(j) *m(i,j) 30 continue endif c c cleanup odd group of eight vectors c j = mod(n2,16) if (j .ge. 8) then do 40 i = 1, n1 y(i) = ((((((( (y(i)) $ + x(j-7)*m(i,j-7)) + x(j-6)*m(i,j-6)) $ + x(j-5)*m(i,j-5)) + x(j-4)*m(i,j-4)) $ + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2)) $ + x(j-1)*m(i,j-1)) + x(j) *m(i,j) 40 continue endif c c main loop - groups of sixteen vectors c jmin = j+16 do 60 j = jmin, n2, 16 do 50 i = 1, n1 y(i) = ((((((((((((((( (y(i)) $ + x(j-15)*m(i,j-15)) + x(j-14)*m(i,j-14)) $ + x(j-13)*m(i,j-13)) + x(j-12)*m(i,j-12)) $ + x(j-11)*m(i,j-11)) + x(j-10)*m(i,j-10)) $ + x(j- 9)*m(i,j- 9)) + x(j- 8)*m(i,j- 8)) $ + x(j- 7)*m(i,j- 7)) + x(j- 6)*m(i,j- 6)) $ + x(j- 5)*m(i,j- 5)) + x(j- 4)*m(i,j- 4)) $ + x(j- 3)*m(i,j- 3)) + x(j- 2)*m(i,j- 2)) $ + x(j- 1)*m(i,j- 1)) + x(j) *m(i,j) 50 continue 60 continue return end subroutine secinit common /secndw/ it0 it0=irtc() return end function second( dum ) common /secndw/ it0 second=9.5e-9*(irtc()-it0) return end ***deck lltpx for --- parrallel solver subroutine lltpx (a,lda,n,rowi,info) c ** llt.f -- llt decomposition integer n,lda,info real a(lda,n),rowi(n),t c timings on 4 cpus of an x-mp48 c n time seconds megaflops c 50 0.000628 68.4028 c 100 0.001724 196.2807 c 150 0.003484 326.1563 c 200 0.006564 409.3361 c 300 0.016532 547.1537 c 400 0.034715 616.8475 c 600 0.106640 676.8635 c 750 0.201663 698.7252 c 1000 0.460960 724.2154 common /qqqw/ inorml character*1 inorml common /lltpcom/ ldav,nv,i ,rowip1,rowip2,rowip3 real rowip1(2000) real rowip2(2000) real rowip3(2000) inorml='n' ldav=lda nv=n info = 0 n1=n-mod(n,4) do 30 i = 1, n1,4 if ( min0(i,n-i).lt.16 ) then ] use 1 cpu only do 10 j = 1, i-1 rowi (j) = -a(i ,j) rowip1(j) = -a(i+1,j) rowip2(j) = -a(i+2,j) rowip3(j) = -a(i+3,j) 10 continue call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,info ) if( i .ne. n ) then call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda * ,rowip1,1 ,a(i+1,i+1),1 ,info ) endif if( i .lt. n-1 ) then call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda * ,rowip2,1 ,a(i+2,i+2),1 ,info ) endif if( i .lt. n-2 ) then call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda * ,rowip3,1 ,a(i+3,i+3),1 ,info ) endif else ] do in parrallel call lltpar( a,rowi ) endif if (a(i,i) .le. 0) then info = i go to 70 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) j=i+1 a(j,i) = a(i,i)*a(j,i) a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21 cdir$ ivdep do 20 j = i+2, n a(j,i) = a(i,i)*a(j,i) a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21 20 continue c the missing pieces for i+1 if (a(i+1,i+1) .le. 0) then info = i+1 go to 70 endif a(i+1,i+1) = 1./sqrt(a(i+1,i+1)) c t = a(i+1,i+1) j=i+2 a(j,i+1) = a(i+1,i+1)*a(j,i+1) a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) ) $ - a(i+2,i+1)*a(j,i+1) cdir$ ivdep do 22 j = i+3, n a(j,i+1) = a(i+1,i+1)*a(j,i+1) a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) ) $ - a(i+2,i+1)*a(j,i+1) 22 continue c the missing pieces for i+2 if (a(i+2,i+2) .le. 0) then info = i+2 go to 70 endif a(i+2,i+2) = 1./sqrt(a(i+2,i+2)) c t = a(i+2,i+2) j=i+3 a(j,i+2) = a(i+2,i+2)*a(j,i+2) a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) ) $ - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2) cdir$ ivdep do 24 j = i+4, n a(j,i+2) = a(i+2,i+2)*a(j,i+2) a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) ) $ - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2) 24 continue c the missing pieces for i+3 if (a(i+3,i+3) .le. 0) then info = i+3 go to 70 endif a(i+3,i+3) = 1./sqrt(a(i+3,i+3)) c t = a(i+3,i+3) cdir$ ivdep do 26 j = i+4, n a(j,i+3) = a(i+3,i+3)*a(j,i+3) 26 continue 30 continue do 60 i = n1+1, n do 40 j = 1, i-1 rowi(j) = -a(i,j) 40 continue call sgemv (inorml,n-i+1 ,i-1 ,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,infos ) if (a(i,i) .le. 0) then info = i go to 70 endif a(i,i) = 1/sqrt(a(i,i)) t = a(i,i) do 50 j = i+1, n a(j,i) = t*a(j,i) 50 continue 60 continue 70 continue * write(6,44) n,info *4 format(' lltx n,info',2i5) return end cmic$ micro subroutine lltpar ( a,rowi ) real a(lda,n),rowi(n) common /qqqw/ inorml character*1 inorml common /lltpcom/ lda ,n ,i ,rowip1,rowip2,rowip3 real rowip1(2000) real rowip2(2000) real rowip3(2000) cmic$ process do 10 j = 1, i-1 rowi (j) = -a(i ,j) 10 continue call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda * ,rowi,1 ,a(i,i),1 ,info ) cmic$ also process if( i .ne. n ) then do 11 j = 1, i-1 rowip1(j) = -a(i+1,j) 11 continue call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda * ,rowip1,1 ,a(i+1,i+1),1 ,info ) endif cmic$ also process if( i .lt. n-1 ) then do 12 j = 1, i-1 rowip2(j) = -a(i+2,j) 12 continue call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda * ,rowip2,1 ,a(i+2,i+2),1 ,info ) endif cmic$ also process if( i .lt. n-2 ) then do 13 j = 1, i-1 rowip3(j) = -a(i+3,j) 13 continue call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda * ,rowip3,1 ,a(i+3,i+3),1 ,info ) endif cmic$ end process return end .