job,jn=cftlist,t= 77.us=bnchbch. account,ac=865606,us=u2240,upw=u2240. cft,l=0. fetch,dn=bndchl,text=for. cft,i=bndchl,l=0. fetch,dn=mxmup,text=cal1,mf=v3. cal,i=mxmup,l=0. fetch,dn=banded,text=for. cft,i=banded,l=0. segldr,i. xxx. /eof real abnd(101,5000) ] sqrt of matrix * ,bbnd(201,5000) ] full matrix for bndchl * ,cbnd(201,5000) ] full matrix for spbfa * ,rhs1(5000) ] right-hand-side for bndchl * ,rhs2(5000) ] " " spbfa common /spacer/ abnd,bbnd,cbnd integer mh(21) data mh / 2,3,4,6,8 ,12,16,24,32,33 ,34,40,48 ] 13 * ,64,65,66,70,84 ,96,97,100 / ] 8 n=5000 do 200 im=1,21 m1=mh(im) m2=2*m1 write(6,1) m2,n 1 format(' problem size m,n',2i6) t0=second() do 2 j=1,101*5000 2 abnd(j,1)=0. do 3 j=1,201*5000 3 bbnd(j,1)=0. do 5 i=1,n 5 abnd(m1+1,i)=1.+ranf() s=.75 do 10 l=m1,1,-1 do 8 i=1,n 8 abnd(l,i)=s*(ranf()-.4) 10 s=.8*s do 15 i=1,m1 do 15 j=1,m1+1-i 15 abnd(j,i)=0. call symmsq( abnd,101 ,n,m1 ,bbnd,201,mc ) do 30 j=1,201*5000 30 cbnd(j,1)=bbnd(j,1) do 35 j=1,n rhs1(j)=ranf()-.5 35 rhs2(j)=rhs1(j) tims=tims+second()-t0 t1=second() call bndchl( bbnd,201,n,m2 ,infox ) t2=second() call spbfa( cbnd,201,n,m2 ,infol ) t3=second() tt=t2-t1 t3=t3-t2 speed=t3/tt write(6,38) infox,tt ,infol,t3 ,speed 38 format(' bndchl info,time',i6,f12.6,/, 1 ' spbfa info,time',i6,f12.6 ,5x,' ratio',f8.3) flops= n*( m2*(m2+1) +m2 +16 ) rate=1.e-6*flops/tt write(6,44) rate 44 format(' rate=',g16.6) call bndchls( bbnd,201,n,m2 ,rhs1,rhs1 ) call spbsl ( cbnd,201,n,m2 ,rhs2 ) siz=0. errs=0. do 45 j=1,n errs=errs+(rhs1(j)-rhs2(j))**2 45 siz=siz+rhs1(j)**2 write(6,46) errs,siz ,tims 46 format(' errs,size',2g16.7,5x,' setup time so far',f9.3 *,/,1x,35(2h-+)) 200 continue stop end subroutine getrow( a,lda ,n,m ,i ,row ) real a(lda,n) ,row(n) data lasti/-99999/ c --- gets the ith row of a symmetric banded matrix --- j1=1 j2=n if ( i.eq.lasti+1 ) then j1=max0( 1,i-2*m-10 ) j2=min0( n,i+2*m ) endif lasti=i do 1 j=j1,j2 1 row(j)=0. do 2 j=0,min0(m,i-1) ] j represents distance left of diagnol 2 row(i-j)=a(m+1-j,i) do 3 j=1,min0(m,n-i) ] j represents distance right of diagnol 3 row(i+j)=a(m+1-j,i+j) return end subroutine getcol( a,lda ,n,m ,i ,row ) real a(lda,n) ,row(n) data lasti/-99999/ c --- gets the ith col of a symmetric banded matrix --- if ( iabs(i-lasti).lt.10 ) then s=1.33*float(m) k=s+16 j1=max0(1,i-k) do 11 j=j1,i-m+5 11 row(j)=0. j2=min0(n,i+k) do 12 j=i+m-5,j2 12 row(j)=0. else do 1 j=1,n 1 row(j)=0. endif lasti=i do 2 j=0,min0(m,i-1) ] j represents distance left of diagnol 2 row(i-j)=a(m+1-j,i) do 3 j=1,min0(m,n-i) ] j represents distance right of diagnol 3 row(i+j)=a(m+1-j,i+j) return end subroutine putval( a,lda ,n,m ,i,j ,val ) real a(lda,n) c --- stores an element into a symmetric band store matrix --- if ( iabs(i-j).gt.m ) return if ( i.lt.1 .or. j.lt.1 .or. i.gt.n .or. j.gt.n ) return ii=max0(i,j) c jj=iabs(i-j) a(m+1-iabs(i-j),ii)=val return end subroutine symmsq( a,lda,n,m ,as,ldas,ms ) real a(lda,n) ,as(ldas,n) * ,r(10000),c(10000) c --- squares the symmetric matrix a --- ms=2*m do 1 j=1,ms+1 do 1 i=1,n 1 as(j,i)=0. c k1=1 c k2=n s=1.3*(m+6) kpos=s do 10 i=1,n call getrow( a,lda,n,m ,i,r ) do 10 j=max0(1,i-ms),i call getcol( a,lda,n,m ,j,c ) s=0. c --------------------------------- k1=max0( 1,min0(i,j)-kpos ) k2=min0( n,max0(i,j)+kpos ) c --------------------------------- do 5 k=k1,k2 5 s=s+r(k)*c(k) call putval( as,ldas,n,ms ,i,j ,s ) 10 continue return end /eof abs=xxx *deck bndchl for subroutine bndchl( a,lda ,n,m ,info ) real a(lda,n) c driver for high-speed banded-cholesky decomposition. c algorith is the same as dongarras matrix times vector method for dense c matrices -only this is applied to banded systems. real rowi(1000) c the first m rows we use ordinary banded decomp c (we could decompose this into 2 matrix times vector operations c where one matrix was rectangular and one was triangular) c -but i didn't bother to. do 50 i=1,m c --- get rowi --- j1=max0(m+2-i,1) do 10 j=j1,m 10 rowi(j)=-a(j,i) nrows=min0(m+1,n+1-i) do 30 jj=j1,m cdir$ ivdep do 30 ii=1,min0(jj,nrows) 30 a(m+2-ii,i+ii-1)=a(m+2-ii,i+ii-1) * +rowi(jj)*a(jj+1-ii,i-1+ii) info=i if(a(m+1,i).le.0. ) go to 60 a(m+1,i)=1./sqrt(a(m+1,i)) do 40 ii=1,nrows-1 40 a(m+1-ii,i+ii)=a(m+1-ii,i+ii)*a(m+1,i) 50 continue c central rows of the matrix we have only triangular matrix times c times vector operations j1=1 nrows= m+1 do 150 i=m+1,n-m c --- get rowi --- do 110 j=j1,m 110 rowi(j)=-a(j,i) c do 130 accomplishes the same result as call mxmup c do 130 jj=j1,m c do 130 ii=1,nrows c30 if ( jj+1-ii .gt. 0 ) c * a(m+2-ii,i+ii-1)=a(m+2-ii,i+ii-1) c * +rowi(jj)*a(jj+1-ii,i-1+ii) call mxmup( a(1,i),lda-1 ,rowi,1 ,a(m+1,i),lda-1 ,m ) info=i if(a(m+1,i).le.0. ) go to 60 a(m+1,i)=1./sqrt(a(m+1,i)) do 140 ii=1,nrows-1 140 a(m+1-ii,i+ii)=a(m+1-ii,i+ii)*a(m+1,i) 150 continue c again do 250 could be accomplished by matrix times vector operations c where we have both rectangular and triangular matrices do 250 i=n+1-m,n c --- get rowi --- j1=max0(m+2-i,1) do 210 j=j1,m 210 rowi(j)=-a(j,i) nrows=min0(m+1,n+1-i) do 230 jj=j1,m cdir$ ivdep do 230 ii=1,min0(jj,nrows) ] nrows 230 ] if ( jj+1-ii .gt. 0 ) * a(m+2-ii,i+ii-1)=a(m+2-ii,i+ii-1) * +rowi(jj)*a(jj+1-ii,i-1+ii) info=i if(a(m+1,i).le.0. ) go to 60 a(m+1,i)=1./sqrt(a(m+1,i)) do 240 ii=1,nrows-1 240 a(m+1-ii,i+ii)=a(m+1-ii,i+ii)*a(m+1,i) 250 continue info=0 60 return end c --- a fortran version (no attempt at efficiency) of the kernal subroutine mxmupf( a,lda ,r,ldr ,y,ldy ,m ) real a(lda,m),r(ldr,m),y(ldy,m) do 10 i=1,m do 10 j=1,m 10 if ( j.ge.i ) y(1,i)=y(1,i)+ r(1,j)*a(j,i) return end subroutine bndchls( a,lda ,n,m ,x,b ) real a(lda,n) ,x(n),b(n) c solve ax=b : where a is cholesky decomposition (output from bndchl) c valid for the case where x and b occupy the same memory storage c as well as the case where x ,and b as disjoint (in memory) if ( loc(x).ne.loc(b) ) then do 1 j=1,n 1 x(j)=b(j) endif do 20 k=1,n x(k)=x(k)*a(m+1,k) cdir$ ivdep do 10 i=k+1,min0(k+m,n) 10 x(i)=x(i)-x(k)*a(m+k+1-i,i) 20 continue do 40 k=n,1,-1 x(k)=x(k)*a(m+1,k) cdir$ ivdep do 30 i=1,min0(k-1,m) 30 x(k-i)=x(k-i)-x(k)*a(m+1-i,k) 40 continue return end subroutine llts (a,lda,n,x,b) real a(lda,*), x(*), b(*), xk c c purpose: c solve the symmetric positive definite system ax = b given the c cholesky factorization of a (as computed in llt). c c additional parameters (not parameters of llt): c c x real(n), solution of linear system c c b real(n), right-hand-side of linear system c c ---------------------------------------------------------------------- c do 10 k = 1, n x(k) = b(k) 10 continue c do 30 k = 1, n ]]]]]]]]]]]]]]] xk = x(k)*a(k,k) do 20 i = k+1, n x(i) = x(i) - a(i,k)*xk 20 continue x(k) = xk 30 continue c do 50 k = n, 1, -1 xk = x(k)*a(k,k) do 40 i = 1, k-1 x(i) = x(i) - a(k,i)*xk 40 continue x(k) = xk 50 continue c return end subroutine llt1(a,lda,n,rowi,info) c ** llt.f -- llt decomposition c integer n,lda,info real a(lda,n),rowi(n),t character*1 charn c charn='n' info = 0 do 30 i = 1, n ]]] should be n do 10 j = 1, i-1 rowi(j) = -a(i,j) 10 continue c call smxpy1(n-i+1,a(i,i),i-1,lda,rowi,a(i,1)) c charn='n' c call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda c * ,rowi,1 ,a(i,i),1 ,infos ) c if ( infos.ne.0 ) then c write(6,77) infos,i c7 format(' ????? info ne 0',6x,'info',i5,' i',i5) c stop c endif c if (a(i,i) .le. 0) then info = i go to 40 endif a(i,i) = 1/sqrt(a(i,i)) c t = a(i,i) do 20 j = i+1, n a(j,i) = t*a(j,i) 20 continue 30 continue 40 return end subroutine smxpy1 (n1,y,n2,ldm,x,m) c ** smxpy01.f -- smxpy unrolled to a depth of 1 c integer ldm,n1,n2 real y(n1),x(n2),m(ldm,n2) c do 20 j = 1, n2 do 10 i = 1, n1 y(i) = (y(i)) + x(j)*m(i,j) 10 continue 20 continue return end *deck mxmup cal1 * subroutine mxmup( q,ldq ,r,ldr ,y,ldy ,m ) * real q(ldq,m),r(ldr,m),y(ldy,m) * * do 10 i=1,m * do 10 j=1,m * 10 if ( j.ge.i ) y(1,i)=y(1,i)+ r(1,j)*q(j,i) * * return * end ident mxmup q defarg matrix ldq defarg leading dim. of q r defarg vector multiplier ldr defarg leading dim of r y defarg vector to be updated ldy defarg leading dim of y m defarg order of problem lmajy defb temp @y at begining of a major iteration lmajr defb temp @r at begining of a major iteration lmajq defb temp @q at begining of a major iteration * ----- note: stored @q-1 lminr defb temp @r at begining of a minor iteration nmaj defb temp # of elements left to process at the beginning of a major iteration nmin defb temp # of elements left to process at the beginning of a minor iteration dimq defb temp leading dim of q dimqp defb temp leading dim of q +1 dimy defb leading dim of y dimr defb leading dim of r lmajyx defb @y at end of a major iteration lmajrx defb @r at end of a major iteration lmajqx defb @q at end of a major iteration nmajx defb # of elements left to process at the end of a major iteration mxmup enter argadd s1,q,use=a7 argadd a1,ldq,argptr=a7 argadd a2,r,argptr=a7 argadd a3,ldr,argptr=a7 argadd a4,y,argptr=a7 argadd a5,ldy,argptr=a7 argadd a6,m,argptr=a7 a7 s1 a1 ,a1 a3 ,a3 a5 ,a5 a6 ,a6 a7 a7-1 b.lmajr a2 b.lmajq a7 b.lmajy a4 b.dimq a1 a1 a1+1 b.dimr a3 b.dimqp a1 b.dimy a5 b.nmaj a6 a0 -a6 test if m<1 jap return major = * a7 b.nmaj vl a7 a0 b.lmajr load rs a1 b.dimr v0 ,a0,a1 a6 vl a1 a1*a6 a2 b.lmajr a2 a2+a1 b.lmajrx a2 b.lminr a2 a0 b.lmajy load ys a1 b.dimy v1 ,a0,a1 a1 a1*a6 a2 b.lmajy a2 a2+a1 b.lmajyx a2 * --- book-keeping for loop ------- a7 a7-a6 a2 a6-1 counter (for r in v0) * a6 block vl a5 a6 decrementing vl a4 b.lmajq @q a4 a4+a6 a3 b.dimq ldq b.nmajx a7 s0 +a7 test after loop for end. s4 a4 save @q for minor its loopm = * a0 a4 v2 ,a0,a3 s1 v0,a2 a2 a2-1 a5 a5-1 a4 a4-1 a0 a5-1 if next vl=0 then quit v3 s1*rv2 v1 v1+fv3 vl a5 jam loopmf a0 a4 v5 ,a0,a3 s1 v0,a2 a2 a2-1 a5 a5-1 a4 a4-1 a0 a5-1 v6 s1*rv5 v1 v1+fv6 vl a5 jap loopm loopmf = * jsz done if s0=0 then we are finished. * --- prepare for minor iterations --- a4 s4 a4 a4+1 @q loop begining * --- compute minor iterations (note if no minor iterations done) minor = * ################################# a0 b.lminr a1 b.dimr vl a5 v0 ,a0,a1 load next 64 rs a5 vl =64 a1 a1*a5 vl*ldr a2 b.lminr vl a6 vl in y dir. a2 a2+a1 a7 a7-a5 # rs remaining after this minor iterat b.lminr a2 s0 +a7 done with this set of minor its? a3 0 counter for scalar rs (in v0) a2 b.dimq loop = * +++++++++++++++++++++++++++++ a0 a4 v2 ,a0,a2 s1 v0,a3 a3 a3+1 a4 a4+1 * a0 a5-a3 don't need since 64 is even v3 s1*rv2 v4 v1+fv3 * jap loop+ a0 a4 2nd copy (unrolled by 2) v5 ,a0,a2 s1 v0,a3 a3 a3+1 a4 a4+1 a0 a5-a3 v6 s1*rv5 v1 v4+fv6 jan loop +++++++++++++++++++++++++++++ jsn minor more minor its ? * --- store out y --- a0 b.lmajy a5 b.dimy ,a0,a5 v1 *** j return ---- //cludge for testing only// ---- * --- prep for next major iter --- a3 b.nmajx b.nmaj a3 a3 b.lmajyx a4 b.dimqp a4 a6*a4 b.lmajy a3 a3 b.lmajq a1 b.lmajrx a3 a3+a4 b.lmajr a1 b.lmajq a3 j major * ------------------------------------------------- done vl a6 a0 b.lmajy a5 b.dimy ,a0,a5 v1 return = * exit end *deck banded for subroutine spbfa(abd,lda,n,m,info) integer lda,n,m,info real abd(lda,1) c c spbfa factors a real symmetric positive definite matrix c stored in band form. c c spbfa is usually called by spbco, but it can be called c directly with a saving in time if rcond is not needed. c c on entry c c abd real(lda, n) c the matrix to be factored. the columns of the upper c triangle are stored in the columns of abd and the c diagonals of the upper triangle are stored in the c rows of abd . see the comments below for details. c c lda integer c the leading dimension of the array abd . c lda must be .ge. m + 1 . c c n integer c the order of the matrix a . c c m integer c the number of diagonals above the main diagonal. c 0 .le. m .lt. n . c c on return c c abd an upper triangular matrix r , stored in band c form, so that a = trans(r)*r . c c info integer c = 0 for normal return. c = k if the leading minor of order k is not c positive definite. c c band storage c c if a is a symmetric positive definite band matrix, c the following program segment will set up the input. c c m = (band width above diagonal) c do 20 j = 1, n c i1 = max0(1, j-m) c do 10 i = i1, j c k = i-j+m+1 c abd(k,j) = a(i,j) c 10 continue c 20 continue c c linpack. this version dated 08/14/78 . c cleve moler, university of new mexico, argonne national lab. c c subroutines and functions c c blas sdot c fortran max0,sqrt c c internal variables c real sdot,t real s integer ik,j,jk,k,mu c begin block with ...exits to 40 c c do 30 j = 1, n info = j s = 0.0e0 ik = m + 1 jk = max0(j-m,1) mu = max0(m+2-j,1) ccc if (m .lt. mu) go to 20 do 10 k = mu, m t = abd(k,j) - sdot(k-mu,abd(ik,jk),1,abd(mu,j),1) t = t/abd(m+1,jk) abd(k,j) = t s = s + t*t ik = ik - 1 jk = jk + 1 10 continue ccc20 continue s = abd(m+1,j) - s ] main diagnol c ......exit if (s .le. 0.0e0) go to 40 abd(m+1,j) = sqrt(s) 30 continue info = 0 40 continue return end subroutine spbsl(abd,lda,n,m,b) integer lda,n,m real abd(lda,1),b(1) c c spbsl solves the real symmetric positive definite band c system a*x = b c using the factors computed by spbco or spbfa. c c on entry c c abd real(lda, n) c the output from spbco or spbfa. c c lda integer c the leading dimension of the array abd . c c n integer c the order of the matrix a . c c m integer c the number of diagonals above the main diagonal. c c b real(n) c the right hand side vector. c c on return c c b the solution vector x . c c error condition c c a division by zero will occur if the input factor contains c a zero on the diagonal. technically this indicates c singularity but it is usually caused by improper subroutine c arguments. it will not occur if the subroutines are called c correctly and info .eq. 0 . c c to compute inverse(a) * c where c is a matrix c with p columns c call spbco(abd,lda,n,rcond,z,info) c if (rcond is too small .or. info .ne. 0) go to ... c do 10 j = 1, p c call spbsl(abd,lda,n,c(1,j)) c 10 continue c c linpack. this version dated 08/14/78 . c cleve moler, university of new mexico, argonne national lab. c c subroutines and functions c c blas saxpy,sdot c fortran min0 c c internal variables c real sdot,t integer k,kb,la,lb,lm c c solve trans(r)*y = b c do 10 k = 1, n lm = min0(k-1,m) la = m + 1 - lm lb = k - lm t = sdot(lm,abd(la,k),1,b(lb),1) b(k) = (b(k) - t)/abd(m+1,k) 10 continue c c solve r*x = y c do 20 kb = 1, n k = n + 1 - kb lm = min0(k-1,m) la = m + 1 - lm lb = k - lm b(k) = b(k)/abd(m+1,k) t = -b(k) call saxpy(lm,t,abd(la,k),1,b(lb),1) 20 continue return end .