c  lu decomposition (this is essentially your 4 at a time algorithm
c  with a few coding changes.
job,jn=-dng2-,t=15.us=bnchbch.
account,ac=865606,us=u2240,upw=u2240.
fetch,dn=dong2,text=for2,mf=v3.
cft,i=dong2,l=0.
fetch,dn=lltx ,text=for ,mf=v3.
cft,i=lltx ,l=0.
fetch,dn=smxpyc,text=cal3,mf=v3.
cal,i=smxpyc,l=0.
fetch,dn=sgemv,text=cal,mf=v3.
cal,i=sgemv,l=0.
dispose,dn=$out,dc=pr,sf=p,defer.
ldr,nx,ab=lrun,mm=25000:5000,stk=5000:2000.
lrun.
exit.
/eof
***deck dong2 for2  --- test driver 1cpu
      real mflops,aa(1001,1000),a(1001,1000),b(1001),x(1001)
      real emax
      common /statall/ aa,a,b,x,quit
      logical quit

      integer nsize(12)
      data nsize / 48,49,50,51,100 ,150,200,300,400,600 ,750,1000 /

      quit=.false.
      lda = 1001
c
      write(6,10)
   10 format('  cholesky decomposition timing')
      write(6,1010)
1010  format(/,1x,100(1h-),/,5x,'version with cal sgemv',/,
     *1x,100(1h-))
      do 100 nchoose = 1,10
      n=nsize(nchoose)
      if ( n.gt. lda-1 ) stop
         do 30 j = 1,n
            do 20 i = j,n
               aa(i,j) = float(-i*j)
               aa(j,i) = aa(i,j)
   20       continue
            aa(j,j) = float(j*n*n+1)/2.0
            x(j) = 1.0
   30    continue
         do 32 i = 1,n
            b(i) = 0.0
            do 31 j = 1,n
               b(i) = b(i) + aa(i,j)*x(j)
   31       continue
   32    continue
c
         write(6,40)lda,n
   40    format(/' sp size of the arrays',i5,' and order is ',i5/)
c
         do 43 j = 1,n
            do 42 i = 1,n
               a(i,j) = aa(i,j)
   42       continue
   43    continue
         t1 = second(t1)
         call llt1(a,lda,n,x,ierr)
         t2 = second(t2) - t1
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 41 i = 1,n
            emax = amax1(emax,abs(x(i)))
   41    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,50) t2,mflops,emax
   50    format(' normal matrix*vector',f9.3,' mflops=',f9.4,
     $          ' check=',e12.3)
c
      go to 2222]skip lltp for now.
         do 73 j = 1,n
            do 72 i = 1,n
               a(i,j) = aa(i,j)
   72       continue
   73    continue
         t1 = second(t1)
      ii1=irtc()
         call lltp(a,lda,n,x,ierr)
      ii2=irtc()
         t2 = second(t2) - t1
         if( ierr .ne. 0 ) write(6,*)' ierr =',ierr
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 71 i = 1,n
            emax = amax1(emax,abs(x(i)))
   71    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,80) t2,mflops,emax
   80    format(' parallel version   time = ',e12.3,' mflops = ',f9.4,
     $          ' check = ',e12.3)
2222  continue
c
         do 83 j = 1,n
            do 82 i = 1,n
               a(i,j) = aa(i,j)
   82       continue
   83    continue
         t1 = second(t1)
      jj1=irtc()
         call lltx(a,lda,n,x,ierr)
      jj2=irtc()
         t2 = second(t2) - t1
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 81 i = 1,n
            emax = amax1(emax,abs(x(i)))
   81    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,90) t2,mflops,emax
   90    format(' 4-fold matrix*vector',f9.3,' mflops=',f9.4,
     $          ' check=',e12.3)
      goto 100
       tpar=9.5e-9*(ii2-ii1-3)
       tsng=9.5e-9*(jj2-jj1-3)
       flp = float(n*(7+n*(3+n*2)))/(6.*tpar*1.0e6)
       fls = float(n*(7+n*(3+n*2)))/(6.*tsng*1.0e6)
      speed=tsng/tpar
       write(6,95) tpar,tsng,flp,fls,speed
95    format(' clock times par&sng',2g16.6,/,
     1' megaflops',2g16.6,/,
     2' speedup',g16.6)

  100 continue

      quit=.true.
      call killcpu

      stop
      end
      subroutine llts (a,lda,n,x,b)
      real a(lda,*), x(*), b(*), xk
c
c   purpose:
c     solve the symmetric positive definite system ax = b given the
c     cholesky factorization of a (as computed in llt).
c
c   additional parameters (not parameters of llt):
c
c     x real(n), solution of linear system
c
c     b real(n), right-hand-side of linear system
c
c ----------------------------------------------------------------------
c
      do 10 k = 1, n
       x(k) = b(k)
   10 continue
c
      do 30 k = 1, n
       xk = x(k)*a(k,k)
       do 20 i = k+1, n
          x(i) = x(i) - a(i,k)*xk
   20    continue
       x(k) = xk
   30 continue
c
      do 50 k = n, 1, -1
       xk = x(k)*a(k,k)
       do 40 i = 1, k-1
          x(i) = x(i) - a(k,i)*xk
   40    continue
       x(k) = xk
   50 continue
c
      return
      end
      subroutine llt1(a,lda,n,rowi,info)
c ** llt.f -- llt decomposition
c
      integer n,lda,info
      real a(lda,n),rowi(n),t
      character*1 charn
c
         charn='n'
      info = 0
      do 30 i = 1, n
         do 10 j = 1, i-1
            rowi(j) = -a(i,j)
   10    continue
c
c        call smxpy1(n-i+1,a(i,i),i-1,lda,rowi,a(i,1))
c        charn='n'
         call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )
c     if ( infos.ne.0 ) then
c     write(6,77) infos,i
c7    format(' ????? info ne 0',6x,'info',i5,'  i',i5)
c     stop
c     endif

c
         if (a(i,i) .le. 0) then
            info = i
            go to 40
         endif
         a(i,i) = 1/sqrt(a(i,i))
c
         t = a(i,i)
         do 20 j = i+1, n
            a(j,i) = t*a(j,i)
   20    continue
   30 continue
   40 return
      end
      subroutine lltxo(a,lda,n,rowi,info)
c ** llt.f -- llt decomposition
c
      integer n,lda,info
      real a(lda,n),rowi(n),t
      character*1 charn
c
      info = 0
      charn='n'
      do 30 i = 1, n
         do 10 j = 1, i-1
            rowi(j) = -a(i,j)
   10    continue

c        call smxpyc(n-i+1,a(i,i),i-1,lda,rowi,a(i,1))
c        charn='n'
         call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )
c
         if (a(i,i) .le. 0) then
            info = i
            go to 40
         endif
         a(i,i) = 1/sqrt(a(i,i))
c
         t = a(i,i)
         do 20 j = i+1, n
            a(j,i) = t*a(j,i)
   20    continue
   30 continue
   40 return
      end
      subroutine smxpy1 (n1,y,n2,ldm,x,m)
c ** smxpy01.f -- smxpy unrolled to a depth of 1
c
      integer ldm,n1,n2
      real y(n1),x(n2),m(ldm,n2)
c
      do 20 j = 1, n2
         do 10 i = 1, n1
            y(i) = (y(i)) + x(j)*m(i,j)
   10    continue
   20 continue
      return
      end
      subroutine smxpyx(n1,y,n2,ldm,x,m)
c ** smxpy16.f -- smxpy unrolled to a depth of 16
c
      integer ldm,n1,n2
      real y(n1),x(n2),m(ldm,n2)
c
c   cleanup odd vector
c
      j = mod(n2,2)
      if (j .ge. 1) then
         do 10 i = 1, n1
            y(i) = (y(i)) + x(j)*m(i,j)
   10    continue
      endif
c
c   cleanup odd group of two vectors
c
      j = mod(n2,4)
      if (j .ge. 2) then
         do 20 i = 1, n1
            y(i) = ( (y(i))
     $             + x(j-1)*m(i,j-1)) + x(j)*m(i,j)
   20    continue
      endif
c
c   cleanup odd group of four vectors
c
      j = mod(n2,8)
      if (j .ge. 4) then
         do 30 i = 1, n1
            y(i) = ((( (y(i))
     $             + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2))
     $             + x(j-1)*m(i,j-1)) + x(j)  *m(i,j)
   30    continue
      endif
c
c   cleanup odd group of eight vectors
c
      j = mod(n2,16)
      if (j .ge. 8) then
         do 40 i = 1, n1
            y(i) = ((((((( (y(i))
     $             + x(j-7)*m(i,j-7)) + x(j-6)*m(i,j-6))
     $             + x(j-5)*m(i,j-5)) + x(j-4)*m(i,j-4))
     $             + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2))
     $             + x(j-1)*m(i,j-1)) + x(j)  *m(i,j)
   40    continue
      endif
c
c   main loop - groups of sixteen vectors
c
      jmin = j+16
      do 60 j = jmin, n2, 16
         do 50 i = 1, n1
            y(i) = ((((((((((((((( (y(i))
     $             + x(j-15)*m(i,j-15)) + x(j-14)*m(i,j-14))
     $             + x(j-13)*m(i,j-13)) + x(j-12)*m(i,j-12))
     $             + x(j-11)*m(i,j-11)) + x(j-10)*m(i,j-10))
     $             + x(j- 9)*m(i,j- 9)) + x(j- 8)*m(i,j- 8))
     $             + x(j- 7)*m(i,j- 7)) + x(j- 6)*m(i,j- 6))
     $             + x(j- 5)*m(i,j- 5)) + x(j- 4)*m(i,j- 4))
     $             + x(j- 3)*m(i,j- 3)) + x(j- 2)*m(i,j- 2))
     $             + x(j- 1)*m(i,j- 1)) + x(j)   *m(i,j)
   50    continue
   60 continue
      return
      end
***deck lltx for    --- decomposition routine
      subroutine lltx (a,lda,n,rowi,info)
c ** llt.f -- llt decomposition

      integer n,lda,info
      real a(lda,n),rowi(n),t

c     timings on 1 cpu of an x-mp2.16

c    n    time seconds    megaflops
c   50     0.000787   54.5785
c  100     0.003107  108.9173
c  150     0.008226  138.1521
c  200     0.017643  152.2911
c  300     0.053245  169.8830
c  400     0.121649  176.0298
c  600     0.394672  182.8876
c  750     0.762169  184.8765
c 1000     1.794049  186.0788

      common /qqqw/ inorml
      character*1 inorml

      real rowip1(2000)
      real rowip2(2000)
      real rowip3(2000)
      inorml='n'

      info = 0
      n1=n-mod(n,4)
      do 30 i = 1, n1,4
         do 10 j = 1, i-1
            rowi  (j) = -a(i  ,j)
            rowip1(j) = -a(i+1,j)
            rowip2(j) = -a(i+2,j)
            rowip3(j) = -a(i+3,j)
   10    continue

         call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda
     *   ,rowi,1 ,a(i,i),1 ,info )

         if( i .ne. n ) then
           call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda
     *     ,rowip1,1 ,a(i+1,i+1),1 ,info )
         endif

         if( i .lt. n-1 ) then
           call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda
     *     ,rowip2,1 ,a(i+2,i+2),1 ,info )
         endif

         if( i .lt. n-2 ) then
           call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda
     *     ,rowip3,1 ,a(i+3,i+3),1 ,info )
         endif

         if (a(i,i) .le. 0) then
            info = i
            go to 70
         endif

         a(i,i) = 1/sqrt(a(i,i))

c        t = a(i,i)
         j=i+1
         a(j,i) = a(i,i)*a(j,i)
         a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21
cdir$ ivdep
         do 20 j = i+2, n
            a(j,i) = a(i,i)*a(j,i)
            a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21
   20    continue

c        the missing pieces for i+1


         if (a(i+1,i+1) .le. 0) then
            info = i+1
            go to 70
         endif
         a(i+1,i+1) = 1./sqrt(a(i+1,i+1))

c        t = a(i+1,i+1)
         j=i+2
         a(j,i+1) = a(i+1,i+1)*a(j,i+1)
         a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) )
     $                         - a(i+2,i+1)*a(j,i+1)
cdir$ ivdep
         do 22 j = i+3, n
            a(j,i+1) = a(i+1,i+1)*a(j,i+1)
            a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) )
     $                      - a(i+2,i+1)*a(j,i+1)
   22    continue

c        the missing pieces for i+2


         if (a(i+2,i+2) .le. 0) then
            info = i+2
            go to 70
         endif
         a(i+2,i+2) = 1./sqrt(a(i+2,i+2))

c        t = a(i+2,i+2)
         j=i+3
         a(j,i+2) = a(i+2,i+2)*a(j,i+2)
         a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) )
     $            - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2)
cdir$ ivdep
         do 24 j = i+4, n
            a(j,i+2) = a(i+2,i+2)*a(j,i+2)
            a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) )
     $            - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2)
   24    continue

c        the missing pieces for i+3


         if (a(i+3,i+3) .le. 0) then
            info = i+3
            go to 70
         endif
         a(i+3,i+3) = 1./sqrt(a(i+3,i+3))

c        t = a(i+3,i+3)
cdir$ ivdep
         do 26 j = i+4, n
            a(j,i+3) = a(i+3,i+3)*a(j,i+3)
   26    continue
   30 continue

      do 60 i = n1+1, n
         do 40 j = 1, i-1
            rowi(j) = -a(i,j)
   40    continue
         call sgemv (inorml,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )

         if (a(i,i) .le. 0) then
            info = i
            go to 70
         endif
         a(i,i) = 1/sqrt(a(i,i))
         t = a(i,i)
         do 50 j = i+1, n
            a(j,i) = t*a(j,i)
   50    continue
   60 continue
   70 continue
      return
      end
***deck smxpyc cal3 --- probably not needed
*     subroutine smxpyc( n1,y,n2,ldm,x,m )
*     real y(n1),x(n2),m(ldm,n2)
*        do 10 j=1,n2
*        do 10 i=1,n1
*10      y(i)=y(i) +x(j)*m(i,j)
*     return
*     end

          ident     smxpyc

n1        defarg
y         defarg
n2        defarg
ldm       defarg
x         defarg
m         defarg

ofx       defb      temp
ofxpls    defb      temp
ofyi1     defb      temp
ofmi1     defb      temp
n2val     defb      temp

smxpyc    enter

          argadd    a2,n2,use=a7
          a2        ,a2
          argadd    a1,n1,argptr=a7
          a1        ,a1
          argadd    a5,ldm,argptr=a7
          argadd    a3,y,argptr=a7
          a5        ,a5
          argadd    a4,x,argptr=a7
          argadd    a6,m,argptr=a7

          b.ofyi1   a3
          b.ofx     a4
          b.ofmi1   a6
          a0        -a2
          b.n2val   a2
          jap       return

* -------------------------------- *

jsize     =         *

***    determine vect len to use this pass
***    a1:jtot
***    a6:scratch
***    a7: vl output

* --------- load initial x as soon as possible ---------
          a0        b.ofx
          a4        b.n2val
          vl        a4
          v1        ,a0,1
          a3        vl             no. of x in v1
* ------------------------------------------------------


          a6        128            a7=64 $ if( a1 ge 128 ) goto sized
          a0        a1-a6
          a7        64
          jap       sized

          a6        64             a7=a1 $ if( a1 le 64 ) goto sized
          a0        a6-a1
          a7        a1
          jap       sized

          s1        a1             ( 64<a1<128 ) a7=a1/2
          s1        s1>1
          a7        s1

sized     vl        a7

          a0        b.ofyi1
          v0        ,a0,1          load y(i1+)
          a1        a1-a7          a1: new jtot

          a2        b.ofx
          b.ofxpls  a2             reset counter in b.ofxpls
          a4        b.n2val        no. x left to process

          a6        b.ofmi1        addr to start loading m from
          a2        a6+a7
          b.ofmi1   a2             update for next time around
          j         xloaded

newx      =         *              refresh x which contains scalars
          vl        a4
          a0        b.ofxpls
          v1        ,a0,1          x values
          a3        vl             no. of x in v1
xloaded   a2        b.ofxpls
          a2        a2+a3
          vl        a7             reset vect length
          b.ofxpls  a2             update addr of x
          a2        0              counter for x in v1

          a4        a4-a3

* ---------------------------------------------------------------

workloop  =         *
          a0        a6
          v2        ,a0,1          load m

          s1        v1,a2          get x
          a2        a2+1
          a6        a6+a5

          v3        s1*rv2
          a0        a2-a3
          v6        v0+fv3         new y
          jaz       xout2
* - - - - - - - - - - - - - - - - - - - - - - - - -
          a0        a6
          v4        ,a0,1          load m

          s1        v1,a2          get x
          a2        a2+1
          a6        a6+a5

          v5        s1*rv4
          a0        a2-a3
          v0        v6+fv5         new y
          jan       workloop

*  -----------------------------------------------  *

xout      =         *              x counter ran out
          a0        a4             check if done with all x
          jaz       newi1
          j         newx

newi1     =         *              done with all x for these values
          a0        b.ofyi1        of y
          s0        a1
          a2        b.ofyi1
          a2        a2+a7
          b.ofyi1   a2
          ,a0,1     v0             store y
          jsn       jsize          go back for new i1

return    exit

xout2     v0        v6
          j         xout
          end
***deck sgemv cal   --- matrix vector kernal
*     subroutine sgemv
*
*     linpak level2 blas
*
          ident     sgemv

trans     defarg
m         defarg
n         defarg
alpha     defarg
aa        defarg
lda       defarg
x         defarg
incx      defarg
y         defarg
incy      defarg
info      defarg

mval      defb      temp           value of m
nval      defb      temp           value of n
ofx       defb      temp           loc of (@)x(1)
ofy       defb      temp           @y(1)
ofa1j     defb      temp           @a( 1,j1)
ofaij     defb      temp           @a(i1,j1)
mleft     defb      temp           # of m remaining in yloop
y1r       defb      temp           temp sto for 1st resid yloop
ldas      =         4              lda
incxs     =         5              incx
incys     =         6              incy
a7s       =         7              a7

sgemv     enter

          s2        rt             ******************************
          argadd    a1,trans,use=a7
          argadd    a2,m,argptr=a7
          argadd    a3,n,argptr=a7
          argadd    a4,lda,argptr=a7
          argadd    a5,incx,argptr=a7
          argadd    a6,incy,argptr=a7

          s1        ,a1            trans
          a2        ,a2            m
          a3        ,a3            n
          a4        ,a4            lda
          a5        ,a5            incx
          a6        ,a6            incy

          a1        7              load v0 with a list of the legal
          vl        a1             values of trans
          a0        $tstrns
          v0        ,a0,1
          v1        s1-v0
          vm        v1,z           tells which one of the legal values
                                   trans (s1) is.

          a0        a2             test m,n,lda for illegal values.
          s0        +a3
          jam       err2           m<0   ?
          a0        a4-a2          lda-m
          jsm       err3           n<0   ?
          jam       err6           m>lda ?

          s0        +a5
          a0        a6

          b.mval    a2
          b.nval    a3
          s.ldas    a4

          jsp       incxp          find iabs(incx)&iabs(incy)
          a5        a0-a5
incxp     jap       incyp
          a6        a0-a6
incyp     =         *

          s1        vm
          a2        zs1
          a1        2
          s.incxs   a5             (s5=incx)
          a0        a1-a2
*                   2-a2

          argadd    a3,alpha,argptr=a7
          s.incys   a6             (s6=incy)
          jam       transp         trans not in the set (' ','n','n') ?
          a1        64
          a2        b.mval
          a0        a1-a2
          argadd    a4,x,argptr=a7
          argadd    a5,y,argptr=a7
          argadd    a6,aa,argptr=a7
          vl        a2
          jap       $xxx1
          a2        vl
          a2        a2+a1
          s1        a2
          s1        s1>1
          a2        s1
          vl        a2
$xxx1     =         *
          a0        b.mval
          a1        b.nval
          s0        a1
          a2        vl             now : a2=1st resid of m
          jaz       null           m=0 ?
          jsz       null           n=0 ?
          s1        ,a3            value of alpha

          b.y1r     a2             1st y1 resid
          s.a7s     a7             save the value of a7 to s7
          b.ofx     a4
          b.ofy     a5
          b.ofa1j   a6

* -------------------------------- *

xloop     =         *              get 1st vector of x values
          a0        b.ofx
          a7        b.nval
          vl        a7
          a6        s.incxs
          v7        ,a0,a6
          a1        vl             # of x values this strip
          a6        a1*a6          stride in memory (for x)
          a7        a7-a1
          a4        b.ofy
          b.nval    a7             updated # of n left
          a7        b.ofx
          a7        a7+a6
          a5        b.ofa1j
          b.ofaij   a5
          b.ofx     a7             updated by vector
          v0        s1*rv7         alpha*x

          a2        b.y1r          initial vl for y&a loops
          a7        b.mval
          b.mleft   a7
          vl        a0             no race (with y store & reload)
          a0        dummy          no race (with y store & reload)
          ,a0,a0    v1             no race (with y store & reload)
          vl        a2

yloop     =         *
          a0        a4             @y(i1)
          a6        s.incys
          v1        ,a0,a6         y
          a2        vl
          a7        b.mleft
          a7        a7-a2
          a6        0              j counter for jloop
          b.mleft   a7
          a7        s.ldas

jloop     =         *              do j = 0,a1-1
*                                  note: must be unrolled 7 times to get
*                                  max speed
          a0        a5
          v2        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v3        s3*rv2
          v4        v1+fv3
          jaz       jloop$1        note: this is the exit taken for a1=64

          a0        a5
          v5        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v6        s3*rv5
          v7        v4+fv6
          jaz       jloop$2

          a0        a5
          v1        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v2        s3*rv1
          v3        v2+fv7
          jaz       jloop$3

          a0        a5
          v4        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v5        s3*rv4
          v6        v5+fv3
          jaz       jloop$4

          a0        a5
          v7        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v1        s3*rv7
          v2        v1+fv6
          jaz       jloop$5

          a0        a5
          v3        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v4        s3*rv3
          v5        v4+fv2
          jaz       jloop$6

          a0        a5
          v6        ,a0,1          a(i1+,j)
          s3        v0,a6
          a6        a6+1
          a5        a5+a7
          a0        a1-a6
          v7        s3*rv6
          v1        v7+fv5
          jan       jloop


*  ---- used all of current x strip store y & check for a 2nd strip
*  ---- of y's

          a0        a4
          a6        s.incys
          ,a0,a6    v1
          j         qqqq

jloop$1   a0        a4
          a6        s.incys
          ,a0,a6    v4

qqqq      a6        a6*a2
          a0        b.mleft
          a5        b.ofaij
          a5        a5+a2
          a4        a4+a6
          b.ofaij   a5
          a2        b.mleft
          vl        a2
          jan       yloop

          a2        s.ldas
          a2        a1*a2
          a0        b.nval
          a3        b.ofa1j
          a3        a3+a2
          b.ofa1j   a3
          jan       xloop
          a7        s.a7s

null      argadd    a4,info,argptr=a7
          a1        0
          ,a4       a1


return    =         *
          s6        rt
          s1        s6-s2
          exit

* -------------------------------- *

jloop$2   a0        a4
          a6        s.incys
          ,a0,a6    v7
          j         qqqq

jloop$3   a0        a4
          a6        s.incys
          ,a0,a6    v3
          j         qqqq

jloop$4   a0        a4
          a6        s.incys
          ,a0,a6    v6
          j         qqqq

jloop$5   a0        a4
          a6        s.incys
          ,a0,a6    v2
          j         qqqq

jloop$6   a0        a4
          a6        s.incys
          ,a0,a6    v5
          j         qqqq


* -------------------------------------- *
* -------------------------------------- *
transp    =         *
          a1        6
          a0        a1-a2
          jam       err1           trans not legal ?

* -----   -----   -----   -----   -----   -----   -----   -----*
*   note: below is a copy of the non-transposed case with      *
*         some modifications. all statements which are         *
*         different from the first case have been indicated    *
*         by a ^ in the comment field                          *
* -----   -----   -----   -----   -----   -----   -----   -----*

          a1        64
          a2        b.nval         ^
          a0        a1-a2
          argadd    a4,x,argptr=a7
          argadd    a5,y,argptr=a7
          argadd    a6,aa,argptr=a7
          vl        a2
          jap       $xxx2
          a2        vl
          a2        a2+a1
          s1        a2
          s1        s1>1
          a2        s1
          vl        a2
$xxx2     =         *
          a0        b.nval         ^
          a1        b.mval         ^
          s0        a1
          a2        vl             now : a2=1st resid of m
          jaz       null           m=0 ?
          jsz       null           n=0 ?
          s1        ,a3            value of alpha

          b.y1r     a2             1st y1 resid
          s.a7s     a7             save the value of a7 to s7
          b.ofx     a4
          b.ofy     a5
          b.ofa1j   a6

* -------------------------------- *

xloopt    =         *              get 1st vector of x values
          a0        b.ofx
          a7        b.mval         ^
          vl        a7
          a6        s.incxs
          v7        ,a0,a6
          a1        vl             # of x values this strip
          a6        a1*a6          stride in memory (for x)
          a7        a7-a1
          a4        b.ofy
          b.mval    a7             ^ updated # of n left
          a7        b.ofx
          a7        a7+a6
          a5        b.ofa1j
          b.ofaij   a5
          b.ofx     a7             updated by vector
          v0        s1*rv7         alpha*x

          a2        b.y1r          initial vl for y&a loops
          a7        b.nval         ^
          b.mleft   a7             < b.nleft >
          vl        a0             no race (with y store & reload)
          a0        dummy          no race (with y store & reload)
          ,a0,a0    v1             no race (with y store & reload)
          vl        a2

yloopt    =         *
          a0        a4             @y(j1)
          a6        s.incys
          v1        ,a0,a6         y
          a2        vl
          a7        b.mleft        < b.nleft >
          a7        a7-a2
          a6        0              i counter for iloop
          b.mleft   a7             < b.nleft >
          a7        s.ldas

iloop     =         *              do i = 0,a1-1
*                                  note: must be unrolled 7 times to get
*                                  max speed
          a0        a5
          v2        ,a0,a7         ^ a(i,j1+)
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v3        s3*rv2
          v4        v1+fv3
          jaz       iloop$1        note: this is the exit taken for a1=64

          a0        a5
          v5        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v6        s3*rv5
          v7        v4+fv6
          jaz       iloop$2

          a0        a5
          v1        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v2        s3*rv1
          v3        v2+fv7
          jaz       iloop$3

          a0        a5
          v4        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v5        s3*rv4
          v6        v5+fv3
          jaz       iloop$4

          a0        a5
          v7        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v1        s3*rv7
          v2        v1+fv6
          jaz       iloop$5

          a0        a5
          v3        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v4        s3*rv3
          v5        v4+fv2
          jaz       iloop$6

          a0        a5
          v6        ,a0,a7         ^
          s3        v0,a6
          a6        a6+1
          a5        a5+1           ^
          a0        a1-a6
          v7        s3*rv6
          v1        v7+fv5
          jan       iloop


*  ---- used all of current x strip store y & check for a 2nd strip
*  ---- of y's

          a0        a4
          a6        s.incys
          ,a0,a6    v1
          j         rrrr

iloop$1   a0        a4
          a6        s.incys
          ,a0,a6    v4

rrrr      a6        a6*a2
* not needed   a7   s.ldas         ^
          a7        a7*a2          ^
          a0        b.mleft
          a5        b.ofaij
          a5        a5+a7          ^
          a4        a4+a6
          b.ofaij   a5
          a2        b.mleft
          vl        a2
          jan       yloopt

          a2        s.ldas
* ^^^^^   a2        a1*a2          ^
          a0        b.mval         ^
          a3        b.ofa1j
          a3        a3+a1          ^
          b.ofa1j   a3
          jan       xloopt
          a7        s.a7s
* -------------------------------- *

          argadd    a4,info,argptr=a7
          a1        0
          ,a4       a1


          =         *
          s6        rt
          s1        s6-s2
          exit

* -------------------------------- *
iloop$2   a0        a4
          a6        s.incys
          ,a0,a6    v7
          j         rrrr

iloop$3   a0        a4
          a6        s.incys
          ,a0,a6    v3
          j         rrrr

iloop$4   a0        a4
          a6        s.incys
          ,a0,a6    v6
          j         rrrr

iloop$5   a0        a4
          a6        s.incys
          ,a0,a6    v2
          j         rrrr

iloop$6   a0        a4
          a6        s.incys
          ,a0,a6    v5
          j         rrrr


* -------------------------------------- *
* ----------- error exits ------------

err1      =         *
          a1        1
          j         errorx

err2      =         *

*         v1        s1-v0
*         vm        v1,z           tells which one of the legal values
                                   trans (s1) is.

          s1        vm
          a2        zs1
          a0        a1-a2
*                   7-a2
          jam       err1           trans not in the set $tstrns ?
          a1        2
          j         errorx

err3      =         *

*         v1        s1-v0
*         vm        v1,z           tells which one of the legal values
                                   trans (s1) is.

          s1        vm
          a2        zs1
          a0        a1-a2
*                   7-a2
          jam       err1           trans not in the set $tstrns ?
          a1        3
          j         errorx

err6      =         *

*         v1        s1-v0
*         vm        v1,z           tells which one of the legal values
                                   trans (s1) is.

          s1        vm
          a2        zs1
          a0        a1-a2
*                   7-a2
          jam       err1           trans not in the set $tstrns ?
          a1        6
*         j         errorx

errorx    =         *
          argadd    a2,info,argptr=a7
          ,a2       a1
          j         return

          endp
$tstrns   con       ' 'l
          con       'n'l
          con       'n'l
          con       't'l
          con       'c'l
          con       't'l
          con       'c'l
dummy     bssz      1

          end

c  micro-tasked version
c  replace dong2 for with dong2p for2p
c  replace lltx for with lltpx for
c  should get "good" cpu utilization for 1 2 or 4 cpus.
***deck dong2p job  --- parrallel driver jcl
job,jn=-dng2-,t=39,us=bnchbch.
account,ac=865606,us=u2240,upw=u2240.
access,dn=$proc,pdn=microproc,id=jmn    ,own=lps    .
micro,level=new.
fetch,dn=dong2,text=for2p,mf=v3.
cft,i=dong2,l=0.
fetch,dn=lltx ,text=for ,mf=v3.
cft,i=lltx ,l=0.
fetch,dn=lltpx,text=for ,mf=v3.
*. cft,i=lltpx,l=0.
premult,i=lltpx.
cft,i=$multf,l=0,alloc=stack.
cal,i=$multc,l=0.
*.
fetch,dn=smxpyc,text=cal3,mf=v3.
cal,i=smxpyc,l=0.
fetch,dn=sgemv,text=cal,mf=v3.
cal,i=sgemv,l=0.
dispose,dn=$out,dc=pr,sf=p,defer.
ldr,nx,ab=lrun,mm=25000:5000,stk=5000:2000,lib=multlib.
lrun.
exit.
/eof
***deck dong2 for2p --- test driver
      real mflops,aa(1001,1000),a(1001,1000),b(1001),x(1001)
      real emax
      common /statall/ aa,a,b,x,quit
      logical quit

      integer nsize(12)
      data nsize / 48,49,50,51,100 ,150,200,300,400,600 ,750,1000 /

      quit=.false.
      lda = 1001
      call getcpus(4)
c
      write(6,10)
   10 format('  cholesky decomposition timing')
      write(6,1010)
1010  format(/,1x,100(1h-),/,5x,'version with cal sgemv',/,
     *1x,100(1h-))
      call secinit
      do 100 nchoose = 10,12
      n=nsize(nchoose)
      if ( n.gt. lda-1 ) stop
         do 30 j = 1,n
            do 20 i = j,n
               aa(i,j) = float(-i*j)
               aa(j,i) = aa(i,j)
   20       continue
            aa(j,j) = float(j*n*n+1)/2.0
            x(j) = 1.0
   30    continue
         do 32 i = 1,n
            b(i) = 0.0
            do 31 j = 1,n
               b(i) = b(i) + aa(i,j)*x(j)
   31       continue
   32    continue
c
         write(6,40)lda,n
   40    format(/' ------------------lda',i5,' and order is ',i5/)
c
         do 43 j = 1,n
            do 42 i = 1,n
               a(i,j) = aa(i,j)
   42       continue
   43    continue
         t1 = second(t1)
         call llt1(a,lda,n,x,ierr)
         t2 = second(t2) - t1
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 41 i = 1,n
            emax = amax1(emax,abs(x(i)))
   41    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,50) t2,mflops,emax
   50    format(' normal matrix*vector',f9.6,' mflops=',f9.4,
     $          ' check=',e12.3)
c
***** go to 2222]skip lltp for now.
         do 73 j = 1,n
            do 72 i = 1,n
               a(i,j) = aa(i,j)
   72       continue
   73    continue
         t1 = second(t1)
      ii1=irtc()
         call lltpx(a,lda,n,x,ierr)
      ii2=irtc()
         t2 = second(t2) - t1
         if( ierr .ne. 0 ) write(6,*)' ierr =',ierr
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 71 i = 1,n
            emax = amax1(emax,abs(x(i)))
   71    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,80) t2,mflops,emax
   80    format(' parallel',12x,f9.6,' mflops=',f9.4,
     $          ' check=',e12.3)
2222  continue
c
         do 83 j = 1,n
            do 82 i = 1,n
               a(i,j) = aa(i,j)
   82       continue
   83    continue
         t1 = second(t1)
      jj1=irtc()
         call lltx(a,lda,n,x,ierr)
      jj2=irtc()
         t2 = second(t2) - t1
         call llts(a,lda,n,x,b)
         emax = 0.0
         do 81 i = 1,n
            emax = amax1(emax,abs(x(i)))
   81    continue
         mflops = float(n*(7+n*(3+n*2)))/(6.*t2*1.0e6)
         write(6,90) t2,mflops,emax
   90    format(' 4-fold matrix*vector',f9.6,' mflops=',f9.4,
     $          ' check=',e12.3)
      goto 100
       tpar=9.5e-9*(ii2-ii1-3)
       tsng=9.5e-9*(jj2-jj1-3)
       flp = float(n*(7+n*(3+n*2)))/(6.*tpar*1.0e6)
       fls = float(n*(7+n*(3+n*2)))/(6.*tsng*1.0e6)
      speed=tsng/tpar
       write(6,95) tpar,tsng,flp,fls,speed
95    format(' clock times par&sng',2g16.6,/,
     1' megaflops',2g16.6,/,
     2' speedup',g16.6)

  100 continue

      quit=.true.
      call relcpus

      stop
      end
      subroutine llts (a,lda,n,x,b)
      real a(lda,*), x(*), b(*), xk
c
c   purpose:
c     solve the symmetric positive definite system ax = b given the
c     cholesky factorization of a (as computed in llt).
c
c   additional parameters (not parameters of llt):
c
c     x real(n), solution of linear system
c
c     b real(n), right-hand-side of linear system
c
c ----------------------------------------------------------------------
c
      do 10 k = 1, n
       x(k) = b(k)
   10 continue
c
      do 30 k = 1, n
       xk = x(k)*a(k,k)
       do 20 i = k+1, n
          x(i) = x(i) - a(i,k)*xk
   20    continue
       x(k) = xk
   30 continue
c
      do 50 k = n, 1, -1
       xk = x(k)*a(k,k)
       do 40 i = 1, k-1
          x(i) = x(i) - a(k,i)*xk
   40    continue
       x(k) = xk
   50 continue
c
      return
      end
      subroutine llt1(a,lda,n,rowi,info)
c ** llt.f -- llt decomposition
c
      integer n,lda,info
      real a(lda,n),rowi(n),t
      character*1 charn
c
         charn='n'
      info = 0
      do 30 i = 1, n
         do 10 j = 1, i-1
            rowi(j) = -a(i,j)
   10    continue
c
c        call smxpy1(n-i+1,a(i,i),i-1,lda,rowi,a(i,1))
c        charn='n'
         call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )
c     if ( infos.ne.0 ) then
c     write(6,77) infos,i
c7    format(' ????? info ne 0',6x,'info',i5,'  i',i5)
c     stop
c     endif

c
         if (a(i,i) .le. 0) then
            info = i
            go to 40
         endif
         a(i,i) = 1/sqrt(a(i,i))
c
         t = a(i,i)
         do 20 j = i+1, n
            a(j,i) = t*a(j,i)
   20    continue
   30 continue
   40 return
      end
      subroutine lltxo(a,lda,n,rowi,info)
c ** llt.f -- llt decomposition
c
      integer n,lda,info
      real a(lda,n),rowi(n),t
      character*1 charn
c
      info = 0
      charn='n'
      do 30 i = 1, n
         do 10 j = 1, i-1
            rowi(j) = -a(i,j)
   10    continue

c        call smxpyc(n-i+1,a(i,i),i-1,lda,rowi,a(i,1))
c        charn='n'
         call sgemv (charn ,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )
c
         if (a(i,i) .le. 0) then
            info = i
            go to 40
         endif
         a(i,i) = 1/sqrt(a(i,i))
c
         t = a(i,i)
         do 20 j = i+1, n
            a(j,i) = t*a(j,i)
   20    continue
   30 continue
   40 return
      end
      subroutine smxpy1 (n1,y,n2,ldm,x,m)
c ** smxpy01.f -- smxpy unrolled to a depth of 1
c
      integer ldm,n1,n2
      real y(n1),x(n2),m(ldm,n2)
c
      do 20 j = 1, n2
         do 10 i = 1, n1
            y(i) = (y(i)) + x(j)*m(i,j)
   10    continue
   20 continue
      return
      end
      subroutine smxpyx(n1,y,n2,ldm,x,m)
c ** smxpy16.f -- smxpy unrolled to a depth of 16
c
      integer ldm,n1,n2
      real y(n1),x(n2),m(ldm,n2)
c
c   cleanup odd vector
c
      j = mod(n2,2)
      if (j .ge. 1) then
         do 10 i = 1, n1
            y(i) = (y(i)) + x(j)*m(i,j)
   10    continue
      endif
c
c   cleanup odd group of two vectors
c
      j = mod(n2,4)
      if (j .ge. 2) then
         do 20 i = 1, n1
            y(i) = ( (y(i))
     $             + x(j-1)*m(i,j-1)) + x(j)*m(i,j)
   20    continue
      endif
c
c   cleanup odd group of four vectors
c
      j = mod(n2,8)
      if (j .ge. 4) then
         do 30 i = 1, n1
            y(i) = ((( (y(i))
     $             + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2))
     $             + x(j-1)*m(i,j-1)) + x(j)  *m(i,j)
   30    continue
      endif
c
c   cleanup odd group of eight vectors
c
      j = mod(n2,16)
      if (j .ge. 8) then
         do 40 i = 1, n1
            y(i) = ((((((( (y(i))
     $             + x(j-7)*m(i,j-7)) + x(j-6)*m(i,j-6))
     $             + x(j-5)*m(i,j-5)) + x(j-4)*m(i,j-4))
     $             + x(j-3)*m(i,j-3)) + x(j-2)*m(i,j-2))
     $             + x(j-1)*m(i,j-1)) + x(j)  *m(i,j)
   40    continue
      endif
c
c   main loop - groups of sixteen vectors
c
      jmin = j+16
      do 60 j = jmin, n2, 16
         do 50 i = 1, n1
            y(i) = ((((((((((((((( (y(i))
     $             + x(j-15)*m(i,j-15)) + x(j-14)*m(i,j-14))
     $             + x(j-13)*m(i,j-13)) + x(j-12)*m(i,j-12))
     $             + x(j-11)*m(i,j-11)) + x(j-10)*m(i,j-10))
     $             + x(j- 9)*m(i,j- 9)) + x(j- 8)*m(i,j- 8))
     $             + x(j- 7)*m(i,j- 7)) + x(j- 6)*m(i,j- 6))
     $             + x(j- 5)*m(i,j- 5)) + x(j- 4)*m(i,j- 4))
     $             + x(j- 3)*m(i,j- 3)) + x(j- 2)*m(i,j- 2))
     $             + x(j- 1)*m(i,j- 1)) + x(j)   *m(i,j)
   50    continue
   60 continue
      return
      end
      subroutine secinit
      common /secndw/ it0
      it0=irtc()
      return
      end
      function second( dum )
      common /secndw/ it0
      second=9.5e-9*(irtc()-it0)
      return
      end
***deck lltpx for   --- parrallel solver
      subroutine lltpx (a,lda,n,rowi,info)
c ** llt.f -- llt decomposition

      integer n,lda,info
      real a(lda,n),rowi(n),t

c     timings on 4 cpus of an x-mp48

c    n    time seconds    megaflops
c   50        0.000628      68.4028
c  100        0.001724     196.2807
c  150        0.003484     326.1563
c  200        0.006564     409.3361
c  300        0.016532     547.1537
c  400        0.034715     616.8475
c  600        0.106640     676.8635
c  750        0.201663     698.7252
c 1000        0.460960     724.2154

      common /qqqw/ inorml
      character*1 inorml
      common /lltpcom/ ldav,nv,i ,rowip1,rowip2,rowip3
      real rowip1(2000)
      real rowip2(2000)
      real rowip3(2000)

      inorml='n'
      ldav=lda
      nv=n

      info = 0
      n1=n-mod(n,4)
      do 30 i = 1, n1,4

      if ( min0(i,n-i).lt.16 ) then ] use 1 cpu only
         do 10 j = 1, i-1
            rowi  (j) = -a(i  ,j)
            rowip1(j) = -a(i+1,j)
            rowip2(j) = -a(i+2,j)
            rowip3(j) = -a(i+3,j)
   10    continue

         call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda
     *   ,rowi,1 ,a(i,i),1 ,info )

         if( i .ne. n ) then
           call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda
     *     ,rowip1,1 ,a(i+1,i+1),1 ,info )
         endif

         if( i .lt. n-1 ) then
           call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda
     *     ,rowip2,1 ,a(i+2,i+2),1 ,info )
         endif

         if( i .lt. n-2 ) then
           call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda
     *     ,rowip3,1 ,a(i+3,i+3),1 ,info )
         endif
         else ] do in parrallel
            call lltpar( a,rowi )
         endif

         if (a(i,i) .le. 0) then
            info = i
            go to 70
         endif

         a(i,i) = 1/sqrt(a(i,i))

c        t = a(i,i)
         j=i+1
         a(j,i) = a(i,i)*a(j,i)
         a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21
cdir$ ivdep
         do 20 j = i+2, n
            a(j,i) = a(i,i)*a(j,i)
            a(j,i+1) =((a(j,i+1))) - a(i+1,i)*a(j,i)] from do 21
   20    continue

c        the missing pieces for i+1


         if (a(i+1,i+1) .le. 0) then
            info = i+1
            go to 70
         endif
         a(i+1,i+1) = 1./sqrt(a(i+1,i+1))

c        t = a(i+1,i+1)
         j=i+2
         a(j,i+1) = a(i+1,i+1)*a(j,i+1)
         a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) )
     $                         - a(i+2,i+1)*a(j,i+1)
cdir$ ivdep
         do 22 j = i+3, n
            a(j,i+1) = a(i+1,i+1)*a(j,i+1)
            a(j,i+2) = ( (a(j,i+2)) - a(i+2,i)*a(j,i) )
     $                      - a(i+2,i+1)*a(j,i+1)
   22    continue

c        the missing pieces for i+2


         if (a(i+2,i+2) .le. 0) then
            info = i+2
            go to 70
         endif
         a(i+2,i+2) = 1./sqrt(a(i+2,i+2))

c        t = a(i+2,i+2)
         j=i+3
         a(j,i+2) = a(i+2,i+2)*a(j,i+2)
         a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) )
     $            - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2)
cdir$ ivdep
         do 24 j = i+4, n
            a(j,i+2) = a(i+2,i+2)*a(j,i+2)
            a(j,i+3) = ( (a(j,i+3)) - a(i+3,i)*a(j,i) )
     $            - a(i+3,i+1)*a(j,i+1) - a(i+3,i+2)*a(j,i+2)
   24    continue

c        the missing pieces for i+3


         if (a(i+3,i+3) .le. 0) then
            info = i+3
            go to 70
         endif
         a(i+3,i+3) = 1./sqrt(a(i+3,i+3))

c        t = a(i+3,i+3)
cdir$ ivdep
         do 26 j = i+4, n
            a(j,i+3) = a(i+3,i+3)*a(j,i+3)
   26    continue
   30 continue

      do 60 i = n1+1, n
         do 40 j = 1, i-1
            rowi(j) = -a(i,j)
   40    continue
         call sgemv (inorml,n-i+1 ,i-1 ,1. ,a(i,1),lda
     *              ,rowi,1 ,a(i,i),1 ,infos )

         if (a(i,i) .le. 0) then
            info = i
            go to 70
         endif
         a(i,i) = 1/sqrt(a(i,i))
         t = a(i,i)
         do 50 j = i+1, n
            a(j,i) = t*a(j,i)
   50    continue
   60 continue
   70 continue
*     write(6,44) n,info
*4    format(' lltx n,info',2i5)
      return
      end

cmic$ micro
      subroutine lltpar ( a,rowi )
      real a(lda,n),rowi(n)

      common /qqqw/ inorml
      character*1 inorml
      common /lltpcom/ lda ,n ,i ,rowip1,rowip2,rowip3
      real rowip1(2000)
      real rowip2(2000)
      real rowip3(2000)

cmic$ process
         do 10 j = 1, i-1
            rowi  (j) = -a(i  ,j)
   10    continue
         call sgemv( inorml,n-i+1,i-1,1. ,a(i,1),lda
     *   ,rowi,1 ,a(i,i),1 ,info )

cmic$ also process
         if( i .ne. n ) then
         do 11 j = 1, i-1
            rowip1(j) = -a(i+1,j)
   11    continue
           call sgemv( inorml,n-i,i-1,1. ,a(i+1,1),lda
     *     ,rowip1,1 ,a(i+1,i+1),1 ,info )
         endif

cmic$ also process
         if( i .lt. n-1 ) then
         do 12 j = 1, i-1
            rowip2(j) = -a(i+2,j)
   12    continue
           call sgemv( inorml,n-i-1,i-1,1. ,a(i+2,1),lda
     *     ,rowip2,1 ,a(i+2,i+2),1 ,info )
         endif

cmic$ also process
         if( i .lt. n-2 ) then
         do 13 j = 1, i-1
            rowip3(j) = -a(i+3,j)
   13    continue
           call sgemv( inorml,n-i-2,i-1,1. ,a(i+3,1),lda
     *     ,rowip3,1 ,a(i+3,i+3),1 ,info )
         endif
cmic$ end process

      return
      end


.