/*   Upd.c  */

#include "../DA2.h"

#define MYDEBUG 0

/*--------------------------------------------------------------------*/
/*
   ----------------------------------------------
   compute a nonsymmetric dense update A -= B * C

   created -- 96oct18, cca
   ---------------------------------------------
*/
void
DA2_ndUpd (
   DA2   *A,   /* target, entries are overwritten with A - B*C */
   DA2   *B,   /* left factor, only read                        */
   DA2   *C    /* right factor, only read                       */
) {
/*
   ------------------------------------------------------------------
   dense update A -= B * C, carried out in place on the entries of A.

   one of three code paths is taken, selected from the strides:
     (1) inc2B == 1 and inc1C == 1
            --> blocked dot products, up to 3 x 3 entries of A a time
     (2) inc1A == 1, inc1B == 1 and inc1C == 1
            --> blocked column updates, up to 3 x 3 coefficients
     (3) everything else
            --> one extracted row, one extracted column and one
                dot product per entry of A

   no consistency check is made between the dimensions of A, B and C.
   ------------------------------------------------------------------
*/
int      nrowA = A->n1,   ncolA = A->n2 ;
int      inc1A = A->inc1, inc2A = A->inc2 ;
int      nrowB = B->n1,   ncolB = B->n2 ;
int      inc1B = B->inc1, inc2B = B->inc2 ;
int      ncolC = C->n2 ;
int      inc1C = C->inc1, inc2C = C->inc2 ;
double   *entA = A->entries ;
double   *entB = B->entries ;
double   *entC = C->entries ;

if ( inc2B == 1 && inc1C == 1 ) {
/*
   ------------------------------------------------------------
   path (1) : rows of B and columns of C are both unit stride.
   i0, i1, i2 are row offsets into entA, j0, j1, j2 are column
   offsets.  the mdot{r}x{c}() kernels live elsewhere; judging
   from how sums[] is consumed here they presumably return the
   r*c dot products ordered row by row.
   ------------------------------------------------------------
*/
   double   sums[9] ;
   double   *bRow0, *bRow1, *bRow2, *cCol0, *cCol1, *cCol2 ;
   int      ib, jc, i0, i1, i2, j0, j1, j2 ;

   cCol0 = entC ;
   j0    = 0 ;
   for ( jc = 0 ; jc < ncolC - 2 ; jc += 3 ) {
/*
      -------------------------------
      a full triple of columns of C
      -------------------------------
*/
      cCol1 = cCol0 + inc2C ;
      cCol2 = cCol1 + inc2C ;
      j1    = j0 + inc2A ;
      j2    = j1 + inc2A ;
      bRow0 = entB ;
      i0    = 0 ;
      for ( ib = 0 ; ib < nrowB - 2 ; ib += 3 ) {
         bRow1 = bRow0 + inc1B ;
         bRow2 = bRow1 + inc1B ;
         mdot3x3(sums, ncolB, bRow0, bRow1, bRow2, cCol0, cCol1, cCol2) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         bRow0 = bRow2 + inc1B ;
         i0    = i2 + inc1A ;
      }
/*
      leftover rows of B : two, one or none
*/
      switch ( nrowB - ib ) {
      case 2 :
         bRow1 = bRow0 + inc1B ;
         mdot2x3(sums, ncolB, bRow0, bRow1, cCol0, cCol1, cCol2) ;
         i1 = i0 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         break ;
      case 1 :
         mdot1x3(sums, ncolB, bRow0, cCol0, cCol1, cCol2) ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         break ;
      default :
         break ;
      }
      cCol0 = cCol2 + inc2C ;
      j0    = j2 + inc2A ;
   }
/*
   leftover columns of C : two, one or none
*/
   switch ( ncolC - jc ) {
   case 2 :
      cCol1 = cCol0 + inc2C ;
      j1    = j0 + inc2A ;
      bRow0 = entB ;
      i0    = 0 ;
      for ( ib = 0 ; ib < nrowB - 2 ; ib += 3 ) {
         bRow1 = bRow0 + inc1B ;
         bRow2 = bRow1 + inc1B ;
         mdot3x2(sums, ncolB, bRow0, bRow1, bRow2, cCol0, cCol1) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         bRow0 = bRow2 + inc1B ;
         i0    = i2 + inc1A ;
      }
      switch ( nrowB - ib ) {
      case 2 :
         bRow1 = bRow0 + inc1B ;
         mdot2x2(sums, ncolB, bRow0, bRow1, cCol0, cCol1) ;
         i1 = i0 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         break ;
      case 1 :
         mdot1x2(sums, ncolB, bRow0, cCol0, cCol1) ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         break ;
      default :
         break ;
      }
      break ;
   case 1 :
      bRow0 = entB ;
      i0    = 0 ;
      for ( ib = 0 ; ib < nrowB - 2 ; ib += 3 ) {
         bRow1 = bRow0 + inc1B ;
         bRow2 = bRow1 + inc1B ;
         mdot3x1(sums, ncolB, bRow0, bRow1, bRow2, cCol0) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         bRow0 = bRow2 + inc1B ;
         i0    = i2 + inc1A ;
      }
      switch ( nrowB - ib ) {
      case 2 :
         bRow1 = bRow0 + inc1B ;
         mdot2x1(sums, ncolB, bRow0, bRow1, cCol0) ;
         i1 = i0 + inc1A ;
#if MYDEBUG > 0
fprintf(stdout, "\n before, entA[%d] = %20.12e", 
        i0 + j0, entA[i0 + j0]) ;
fprintf(stdout, "\n before, entA[%d] = %20.12e", 
        i1 + j0, entA[i1 + j0]) ;
#endif
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
#if MYDEBUG > 0
fprintf(stdout, "\n sums[0] = %20.12e", sums[0]) ;
fprintf(stdout, "\n sums[1] = %20.12e", sums[1]) ;
fprintf(stdout, "\n i0 = %d, i1 = %d, j0 = %d, inc1A = %d, inc2A = %d",
        i0, i1, j0, inc1A, inc2A) ;
fprintf(stdout, "\n entA[%d] = %20.12e", i0 + j0, entA[i0 + j0]) ;
fprintf(stdout, "\n entA[%d] = %20.12e", i1 + j0, entA[i1 + j0]) ;
#endif
         break ;
      case 1 :
         mdot1x1(sums, ncolB, bRow0, cCol0) ;
         entA[i0 + j0] -= sums[0] ;
         break ;
      default :
         break ;
      }
      break ;
   default :
      break ;
   }
} else if ( inc1A == 1 && inc1B == 1 && inc1C == 1 ) {
/*
   ------------------------------------------------------------
   path (2) : the columns of A, B and C are all unit stride.
   up to three columns of A are updated at once; for each group
   of up to three columns of B the matching coefficients of C
   are loaded into scalars and one sweep is made down the rows.
   ------------------------------------------------------------
*/
   double   b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22 ;
   double   *aCol0, *aCol1, *aCol2, *bCol0, *bCol1, *bCol2,
            *cCol0, *cCol1, *cCol2 ;
   int      ii, ja, kb ;

   aCol0 = entA ;
   cCol0 = entC ;
   for ( ja = 0 ; ja < ncolA - 2 ; ja += 3 ) {
/*
      ------------------------------------
      a full triple of columns of A and C
      ------------------------------------
*/
      aCol1 = aCol0 + inc2A ;
      aCol2 = aCol1 + inc2A ;
      cCol1 = cCol0 + inc2C ;
      cCol2 = cCol1 + inc2C ;
      bCol0 = entB ;
      for ( kb = 0 ; kb < ncolB - 2 ; kb += 3 ) {
         bCol1 = bCol0 + inc2B ;
         bCol2 = bCol1 + inc2B ;
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         c02 = cCol2[kb] ;
         c10 = cCol0[kb+1] ;
         c11 = cCol1[kb+1] ;
         c12 = cCol2[kb+1] ;
         c20 = cCol0[kb+2] ;
         c21 = cCol1[kb+2] ;
         c22 = cCol2[kb+2] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            b2 = bCol2[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 + b2 * c20 ;
            aCol1[ii] -= b0 * c01 + b1 * c11 + b2 * c21 ;
            aCol2[ii] -= b0 * c02 + b1 * c12 + b2 * c22 ;
         }
         bCol0 = bCol2 + inc2B ;
      }
      switch ( ncolB - kb ) {
      case 2 :
         bCol1 = bCol0 + inc2B ;
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         c02 = cCol2[kb] ;
         c10 = cCol0[kb+1] ;
         c11 = cCol1[kb+1] ;
         c12 = cCol2[kb+1] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 ;
            aCol1[ii] -= b0 * c01 + b1 * c11 ;
            aCol2[ii] -= b0 * c02 + b1 * c12 ;
         }
         break ;
      case 1 :
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         c02 = cCol2[kb] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            aCol0[ii] -= b0 * c00 ;
            aCol1[ii] -= b0 * c01 ;
            aCol2[ii] -= b0 * c02 ;
         }
         break ;
      default :
         break ;
      }
      aCol0 = aCol2 + inc2A ;
      cCol0 = cCol2 + inc2C ;
   }
/*
   leftover columns of A and C : two, one or none
*/
   switch ( ncolA - ja ) {
   case 2 :
      aCol1 = aCol0 + inc2A ;
      cCol1 = cCol0 + inc2C ;
      bCol0 = entB ;
      for ( kb = 0 ; kb < ncolB - 2 ; kb += 3 ) {
         bCol1 = bCol0 + inc2B ;
         bCol2 = bCol1 + inc2B ;
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         c10 = cCol0[kb+1] ;
         c11 = cCol1[kb+1] ;
         c20 = cCol0[kb+2] ;
         c21 = cCol1[kb+2] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            b2 = bCol2[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 + b2 * c20 ;
            aCol1[ii] -= b0 * c01 + b1 * c11 + b2 * c21 ;
         }
         bCol0 = bCol2 + inc2B ;
      }
      switch ( ncolB - kb ) {
      case 2 :
         bCol1 = bCol0 + inc2B ;
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         c10 = cCol0[kb+1] ;
         c11 = cCol1[kb+1] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 ;
            aCol1[ii] -= b0 * c01 + b1 * c11 ;
         }
         break ;
      case 1 :
         c00 = cCol0[kb] ;
         c01 = cCol1[kb] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            aCol0[ii] -= b0 * c00 ;
            aCol1[ii] -= b0 * c01 ;
         }
         break ;
      default :
         break ;
      }
      break ;
   case 1 :
      bCol0 = entB ;
      for ( kb = 0 ; kb < ncolB - 2 ; kb += 3 ) {
         bCol1 = bCol0 + inc2B ;
         bCol2 = bCol1 + inc2B ;
         c00 = cCol0[kb] ;
         c10 = cCol0[kb+1] ;
         c20 = cCol0[kb+2] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            b2 = bCol2[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 + b2 * c20 ;
         }
         bCol0 = bCol2 + inc2B ;
      }
      switch ( ncolB - kb ) {
      case 2 :
         bCol1 = bCol0 + inc2B ;
         c00 = cCol0[kb] ;
         c10 = cCol0[kb+1] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            b1 = bCol1[ii] ;
            aCol0[ii] -= b0 * c00 + b1 * c10 ;
         }
         break ;
      case 1 :
         c00 = cCol0[kb] ;
         for ( ii = 0 ; ii < nrowA ; ii++ ) {
            b0 = bCol0[ii] ;
            aCol0[ii] -= b0 * c00 ;
         }
         break ;
      default :
         break ;
      }
      break ;
   default :
      break ;
   }
} else {
/*
   ------------------------------------------------------------
   path (3) : arbitrary strides.  column jj of C and row ii of B
   are copied into scratch vectors and their dot product, with
   the sign flipped, is added to entry (ii,jj) of A.
   ------------------------------------------------------------
*/
   DV       colDV, rowDV ;
   double   *col, *row ;
   int      ii, jj ;

   DV_setDefaultFields(&colDV) ;
   DV_setDefaultFields(&rowDV) ;
   DV_init(&colDV, ncolB, NULL) ;
   DV_init(&rowDV, ncolB, NULL) ;
   col = DV_entries(&colDV) ;
   row = DV_entries(&rowDV) ;
   for ( jj = 0 ; jj < ncolA ; jj++ ) {
      DA2_extractColumnDV(C, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowA ; ii++ ) {
         DA2_extractRowDV(B, &rowDV, ii) ;
         DA2_addEntry(A, ii, jj, - DVdot(ncolB, row, col)) ;
      }
   }
/*
   release the scratch vectors
*/
   DV_clearData(&colDV) ;
   DV_clearData(&rowDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
/*
   -----------------------------------------------
   compute a nonsymmetric sparse update A -= B * C

   created -- 96oct18, cca
   ----------------------------------------------
*/
void
DA2_nsUpd (
   DA2   *A,         /* target, entries are modified in place        */
   int   rowloc[],   /* rowloc[i] = row of A that receives row i     */
   int   colloc[],   /* colloc[j] = column of A that receives col j  */
   DA2   *B,         /* left factor, only read                       */
   DA2   *C          /* right factor, only read                      */
) {
/*
   ------------------------------------------------------------------
   sparse (scattered) update A -= B * C.

   entry (i,j) of the product B * C is subtracted from entry
   (rowloc[i], colloc[j]) of A, for 0 <= i < nrowB and 0 <= j < ncolC.

   before any work is done every colloc[j] is checked to lie in
   [0, ncolA) and every rowloc[i] in [0, nrowA); on a violation the
   matrices and both index vectors are written to stderr and the
   program exits with status -1.

   two code paths:
     (1) inc2B == 1 and inc1C == 1 : blocked dot products, up to
         3 x 3 entries at a time
     (2) anything else : one dot product per entry
   ------------------------------------------------------------------
*/
int      inc1A, inc1B, inc1C, inc2A, inc2B, inc2C, 
         ncolA, ncolB, ncolC, nrowA, nrowB ;
double   *entA, *entB, *entC ;
/*
   --------------------------------
   pull out dimensions and pointers
   --------------------------------
*/
nrowA = A->n1      ;
ncolA = A->n2      ;
inc1A = A->inc1    ;
inc2A = A->inc2    ;
entA  = A->entries ;
nrowB = B->n1      ;
ncolB = B->n2      ;
inc1B = B->inc1    ;
inc2B = B->inc2    ;
entB  = B->entries ;
ncolC = C->n2      ;
inc1C = C->inc1    ;
inc2C = C->inc2    ;
entC  = C->entries ;
/*
   ----------------------------------------------------
   validate the scatter maps before touching any entry
   ----------------------------------------------------
*/
{
int   error = 0, ierr, irow, jcol ;
for ( jcol = 0 ; jcol < ncolC ; jcol++ ) {
   if ( colloc[jcol] < 0 || colloc[jcol] >= ncolA ) {
      error = 1 ;
   }
}
for ( irow = 0 ; irow < nrowB ; irow++ ) {
   if ( rowloc[irow] < 0 || rowloc[irow] >= nrowA ) {
      error = 1 ;
   }
}
if ( error == 1 ) {
/*
   %p requires a void * argument, so every pointer is cast
*/
   fprintf(stderr, "\n fatal error in DA2_nsUpd(%p,%p,%p,%p,%p)"
           "\n bad colloc[] or rowloc[]", 
           (void *) A, (void *) rowloc, (void *) colloc, 
           (void *) B, (void *) C) ;
   fprintf(stderr, "\n A") ;
   DA2_writeStats(A, stderr) ;
   fprintf(stderr, "\n colloc :") ;
   IVfp80(stderr, ncolC, colloc, 10, &ierr) ;
   fprintf(stderr, "\n rowloc :") ;
   IVfp80(stderr, nrowB, rowloc, 10, &ierr) ;
   fprintf(stderr, "\n B") ;
   DA2_writeStats(B, stderr) ;
   fprintf(stderr, "\n C") ;
   DA2_writeStats(C, stderr) ;
   exit(-1) ;
}
}
/*
   ------------------------------
   check the kernel's orientation
   ------------------------------
*/
if ( inc2B == 1 && inc1C == 1 ) {
/*
   ------------------------------------------------------------
   B row major, C column major.
   i0, i1, i2 are row offsets into entA taken from rowloc[],
   j0, j1, j2 are column offsets taken from colloc[].
   the mdot{r}x{c}() kernels are defined elsewhere; from the
   way sums[] is consumed they presumably return the r*c dot
   products ordered row by row.
   ------------------------------------------------------------
*/
   double   *colC0, *colC1, *colC2, *rowB0, *rowB1, *rowB2 ;
   double   sums[9] ;
   int      i0, i1, i2, irowB, jcolC, j0, j1, j2 ;
/* 
   -------------------------------------------
   loop over the columns of C, three at a time
   -------------------------------------------
*/
   colC0 = entC ;
   for ( jcolC = 0 ; jcolC < ncolC - 2 ; jcolC += 3 ) {
      colC1 = colC0 + inc2C ;
      colC2 = colC1 + inc2C ;
      j0    = colloc[jcolC]*inc2A ;
      j1    = colloc[jcolC+1]*inc2A ;
      j2    = colloc[jcolC+2]*inc2A ;
      rowB0 = entB ;
      for ( irowB = 0 ; irowB < nrowB - 2 ; irowB += 3 ) {
         rowB1 = rowB0 + inc1B ;
         rowB2 = rowB1 + inc1B ;
         mdot3x3(sums, ncolB, rowB0, rowB1, rowB2, colC0, colC1, colC2);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         i2 = rowloc[irowB+2]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         rowB0 = rowB2 + inc1B ;
      }
/*
      two or one leftover rows of B
*/
      if ( irowB == nrowB - 2 ) {
         rowB1 = rowB0 + inc1B ;
         mdot2x3(sums, ncolB, rowB0, rowB1, colC0, colC1, colC2);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
      } else if ( irowB == nrowB - 1 ) {
         mdot1x3(sums, ncolB, rowB0, colC0, colC1, colC2);
         i0 = rowloc[irowB]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
      }
      colC0 = colC2 + inc2C ;
   }
   if ( jcolC == ncolC - 2 ) {
/*
      ---------------------------
      two leftover columns of C
      ---------------------------
*/
      colC1 = colC0 + inc2C ;
      j0    = colloc[jcolC]*inc2A ;
      j1    = colloc[jcolC+1]*inc2A ;
      rowB0 = entB ;
      for ( irowB = 0 ; irowB < nrowB - 2 ; irowB += 3 ) {
         rowB1 = rowB0 + inc1B ;
         rowB2 = rowB1 + inc1B ;
         mdot3x2(sums, ncolB, rowB0, rowB1, rowB2, colC0, colC1);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         i2 = rowloc[irowB+2]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         rowB0 = rowB2 + inc1B ;
      }
      if ( irowB == nrowB - 2 ) {
         rowB1 = rowB0 + inc1B ;
         mdot2x2(sums, ncolB, rowB0, rowB1, colC0, colC1);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
      } else if ( irowB == nrowB - 1 ) {
         mdot1x2(sums, ncolB, rowB0, colC0, colC1);
         i0 = rowloc[irowB]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
      }
   } else if ( jcolC == ncolC - 1 ) {
/*
      --------------------------
      one leftover column of C
      --------------------------
*/
      j0    = colloc[jcolC]*inc2A ;
      rowB0 = entB ;
      for ( irowB = 0 ; irowB < nrowB - 2 ; irowB += 3 ) {
         rowB1 = rowB0 + inc1B ;
         rowB2 = rowB1 + inc1B ;
         mdot3x1(sums, ncolB, rowB0, rowB1, rowB2, colC0);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         i2 = rowloc[irowB+2]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         rowB0 = rowB2 + inc1B ;
      }
      if ( irowB == nrowB - 2 ) {
         rowB1 = rowB0 + inc1B ;
         mdot2x1(sums, ncolB, rowB0, rowB1, colC0);
         i0 = rowloc[irowB]*inc1A ;
         i1 = rowloc[irowB+1]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
      } else if ( irowB == nrowB - 1 ) {
         mdot1x1(sums, ncolB, rowB0, colC0);
         i0 = rowloc[irowB]*inc1A ;
         entA[i0 + j0] -= sums[0] ;
      }
   }
} else {
/*
   ------------------------------------------------------------
   fallback case, slow but right.
   column jj of C and row ii of B are copied into scratch
   vectors; their negated dot product is added to entry
   (rowloc[ii], colloc[jj]) of A.
   ------------------------------------------------------------
*/
   DV       colDV, rowDV ;
   double   value ;
   double   *col, *row ;
   int      ii, jj ;

   DV_setDefaultFields(&colDV) ;
   DV_setDefaultFields(&rowDV) ;
   DV_init(&colDV, ncolB, NULL) ;
   DV_init(&rowDV, ncolB, NULL) ;
   col = DV_entries(&colDV) ;
   row = DV_entries(&rowDV) ;
   for ( jj = 0 ; jj < ncolC ; jj++ ) {
      DA2_extractColumnDV(C, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowB ; ii++ ) {
         DA2_extractRowDV(B, &rowDV, ii) ;
         value = - DVdot(ncolB, row, col) ;
         DA2_addEntry(A, rowloc[ii], colloc[jj], value) ;
      }
   }
   DV_clearData(&colDV) ;
   DV_clearData(&rowDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
/*
   -------------------------------------------------
   nonsymmetric dense 3 term update A -= X * Y * Z^T

   created -- 96oct18, cca
   -------------------------------------------------
*/
void
DA2_nd3Upd (
   DA2   *A,      /* target, entries are modified in place           */
   DA2   *X,      /* left factor, only read                          */
   DA2   *Y,      /* middle factor, transposed and restored in (2)   */
   DA2   *Z,      /* right factor, used as Z^T, only read            */
   DV    *tmpDV   /* workspace for paths (1) and (2), may be resized */
) {
/*
   ------------------------------------------------------------------
   dense three term update A -= X * Y * Z^T.

   one of three code paths is taken, selected from the strides:
     (1) inc2X == 1, inc2Y == 1 and inc2Z == 1
            for each group of up to three rows of Z the products
            Y * z are formed in tmpDV (3*nrowY entries), then
            dotted against the rows of X.
     (2) inc2X == 1, inc1Y == 1 and inc2Z == 1
            for each group of up to three rows of X the products
            x * Y are formed in tmpDV (3*ncolY entries), then
            dotted against the rows of Z.
     (3) everything else
            X * Y is formed explicitly in a temporary matrix and
            A is updated one dot product per entry.

   tmpDV is enlarged with DV_setSize() when it is too small; it is
   not touched by path (3).  no consistency check is made between
   the dimensions of A, X, Y and Z.

   the mdot{r}x{c}() kernels are defined elsewhere; from the way
   sums[] is consumed they presumably return the r*c dot products
   ordered row by row.
   ------------------------------------------------------------------
*/
int      inc1A, inc1X, inc1Y, inc1Z,
         inc2A, inc2X, inc2Y, inc2Z,
         ncolA, ncolX, ncolY,
         nrowA, nrowX, nrowY, nrowZ ;
double   *entA, *entX, *entY, *entZ ;
/*
   --------------------------------
   pull out dimensions and pointers
   --------------------------------
*/
nrowA = A->n1      ;
ncolA = A->n2      ;
inc1A = A->inc1    ;
inc2A = A->inc2    ;
entA  = A->entries ;
nrowX = X->n1      ;
ncolX = X->n2      ;
inc1X = X->inc1    ;
inc2X = X->inc2    ;
entX  = X->entries ;
nrowY = Y->n1      ;
ncolY = Y->n2      ;
inc1Y = Y->inc1    ;
inc2Y = Y->inc2    ;
entY  = Y->entries ;
nrowZ = Z->n1      ;
inc1Z = Z->inc1    ;
inc2Z = Z->inc2    ;
entZ  = Z->entries ;
#if MYDEBUG > 0
fprintf(stdout, "\n entA = %p", (void *) entA) ;
fprintf(stdout, "\n entX = %p", (void *) entX) ;
fprintf(stdout, "\n entY = %p", (void *) entY) ;
fprintf(stdout, "\n entZ = %p", (void *) entZ) ;
#endif
if ( inc2X == 1 && inc2Y == 1 && inc2Z == 1 ) {
/*
   ----------------------------------------
   first case, X, Y and Z are all row major
   ----------------------------------------
*/
   double   *colZT0, *colZT1, *colZT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      i0, i1, i2, irowX, jcolZT, j0, j1, j2 ;
/*
   ---------------------------------------------------------
   make sure the workspace holds three vectors of size nrowY
   ---------------------------------------------------------
*/
   if ( DV_size(tmpDV) < 3*nrowY ) {
      DV_setSize(tmpDV, 3*nrowY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + nrowY ;
   tmp2 = tmp1 + nrowY ;
/* 
   ---------------------------------------
   loop over the columns of Z^T by triples
   (a column of Z^T is a row of Z, stride inc1Z apart)
   ---------------------------------------
*/
   colZT0 = entZ ;
   j0     =   0  ;
   for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
      colZT1 = colZT0 + inc1Z ;
      colZT2 = colZT1 + inc1Z ;
      j1     = j0     + inc2A ;
      j2     = j1     + inc2A ;
/*
      ---------------------------------------------------------
      compute [ tmp0 tmp1 tmp2 ] = Y * [ colZT0 colZT1 colZT2 ]
      the workspace is zeroed first, the product is then added
      ---------------------------------------------------------
*/
      DVzero(3*nrowY, tmp0) ;
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, colZT0, colZT1, colZT2) ;
#if MYDEBUG > 0
      fprintf(stdout, "\n after DA2_mvm_33") ;
      fprintf(stdout, "\n tmp0") ;
      DVfprintf(stdout, nrowY, tmp0) ;
      fprintf(stdout, "\n tmp1") ;
      DVfprintf(stdout, nrowY, tmp1) ;
      fprintf(stdout, "\n tmp2") ;
      DVfprintf(stdout, nrowY, tmp2) ;
#endif
/*
      ------------------------------------------------------
      now update three columns of A, rows taken by triples
      ------------------------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x3(sums, nrowY, rowX0, rowX1, tmp0, tmp1, tmp2) ;
         i1 = i0 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x3(sums, nrowY, rowX0, tmp0, tmp1, tmp2) ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
      }
      colZT0 = colZT2 + inc1Z ;
      j0     = j2     + inc2A ;
   }
   if ( jcolZT == nrowZ - 2 ) {
      colZT1 = colZT0 + inc1Z ;
      j1     = j0     + inc2A ;
/*
      ---------------------------------------------
      compute [ tmp0 tmp1 ] = Y * [ colZT0 colZT1 ]
      ---------------------------------------------
*/
      DVzero(2*nrowY, tmp0) ;
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, colZT0, colZT1) ;
/*
      ----------------------------------------------------
      now update two columns of A, rows taken by triples
      ----------------------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x2(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x2(sums, nrowY, rowX0, rowX1, tmp0, tmp1) ;
         i1 = i0 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x2(sums, nrowY, rowX0, tmp0, tmp1) ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
      }
   } else if ( jcolZT == nrowZ - 1 ) {
/*
      ---------------------------------
      compute [ tmp0 ] = Y * [ colZT0 ]
      ---------------------------------
*/
      DVzero(nrowY, tmp0) ;
      DA2_mvm1vec(Y, tmp0, 1.0, colZT0) ;
/*
      ---------------------------------------------------
      now update one column of A, rows taken by triples
      ---------------------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x1(sums, nrowY, rowX0, rowX1, rowX2, tmp0) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x1(sums, nrowY, rowX0, rowX1, tmp0) ;
         i1 = i0 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x1(sums, nrowY, rowX0, tmp0) ;
         entA[i0 + j0] -= sums[0] ;
      }
   }
} else if ( inc2X == 1 && inc1Y == 1 && inc2Z == 1 ) {
/*
   -----------------------------------------------------
   second case, X and Z are row major, Y is column major
   -----------------------------------------------------
*/
   double   *colZT0, *colZT1, *colZT2, 
            *rowX0, *rowX1, *rowX2, *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolZT, j0, j1, j2 ;
/*
   ---------------------------------------------------------
   make sure the workspace holds three vectors of size ncolY
   ---------------------------------------------------------
*/
   if ( DV_size(tmpDV) < 3*ncolY ) {
      DV_setSize(tmpDV, 3*ncolY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + ncolY ;
   tmp2 = tmp1 + ncolY ;
/* 
   -----------------------
   loop over the rows of X
   -----------------------
*/
   rowX0 = entX ;
   i0    =   0  ;
   for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      i1    = i0    + inc1A ;
      i2    = i1    + inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
              [ tmp2 ] = [ rowX2 ] 
      Y is transposed around the call and transposed back after,
      which presumably restores its original orientation
      --------------------------------
*/
      DVzero(3*ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
      DA2_transpose(Y) ;
/*
      ------------------------------------------------------
      now update three rows of A, columns taken by triples
      ------------------------------------------------------
*/
      colZT0 = entZ ;
      j0     =   0  ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colZT0, colZT1, colZT2);
         j1 = j0 + inc2A ;
         j2 = j1 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         colZT0 = colZT2 + inc1Z ;
         j0     = j2     + inc2A ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot3x2(sums, ncolY, tmp0, tmp1, tmp2, colZT0, colZT1);
         j1 = j0 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot3x1(sums, ncolY, tmp0, tmp1, tmp2, colZT0);
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
      }
      rowX0 = rowX2 + inc1X ;
      i0    = i2    + inc1A ;
   }
   if ( irowX == nrowX - 2 ) {
      rowX1 = rowX0 + inc1X ;
      i1    = i0    + inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
      --------------------------------
*/
      DVzero(2*ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
      DA2_transpose(Y) ;
/*
      ----------------------------------------------------
      now update two rows of A, columns taken by triples
      ----------------------------------------------------
*/
      colZT0 = entZ ;
      j0     =   0  ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot2x3(sums, ncolY, tmp0, tmp1, colZT0, colZT1, colZT2) ;
         j1 = j0 + inc2A ;
         j2 = j1 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         colZT0 = colZT2 + inc1Z ;
         j0     = j2     + inc2A ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot2x2(sums, ncolY, tmp0, tmp1, colZT0, colZT1) ;
         j1 = j0 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot2x1(sums, ncolY, tmp0, tmp1, colZT0) ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
      }
   } else if ( irowX == nrowX - 1 ) {
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
      --------------------------------
*/
      DVzero(ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm1vec(Y, tmp0, 1.0, rowX0) ;
      DA2_transpose(Y) ;
/*
      --------------------------------------------------
      now update one row of A, columns taken by triples
      --------------------------------------------------
*/
      colZT0 = entZ ;
      j0     =   0  ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot1x3(sums, ncolY, tmp0, colZT0, colZT1, colZT2) ;
         j1 = j0 + inc2A ;
         j2 = j1 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         colZT0 = colZT2 + inc1Z ;
         j0     = j2     + inc2A ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot1x2(sums, ncolY, tmp0, colZT0, colZT1) ;
         j1 = j0 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot1x1(sums, ncolY, tmp0, colZT0) ;
         entA[i0 + j0] -= sums[0] ;
      }
   }
} else {
/*
   -----------------------------
   fallback case, slow but right
   -----------------------------
*/
   DA2      tmpDA2 ;
   double   value ;
   double   *col, *row ;
   DV       colDV, rowDV ;
   int      ii, jj, maxlen ;
/*
   ---------------------------------------------------------------
   initialize the temporary data objects.
   X * Y has nrowX rows and ncolY columns, so tmpDA2 is sized with
   ncolY (not ncolX) to stay correct when Y is rectangular.
   the scratch vectors hold vectors of length ncolX (rows of X,
   columns of Y) and of length ncolY (rows of X*Y, rows of Z), so
   they are allocated with the larger of the two.
   NOTE(review): row and col are fetched once; this assumes the
   extract helpers leave a large enough DV's storage in place
   -- confirm against DA2_extractRowDV / DA2_extractColumnDV.
   ---------------------------------------------------------------
*/   
   maxlen = ( ncolX >= ncolY ) ? ncolX : ncolY ;
   DA2_setDefaultFields(&tmpDA2) ;
   DV_setDefaultFields(&rowDV) ;
   DV_setDefaultFields(&colDV) ;
   DA2_init(&tmpDA2, nrowX, ncolY, 1, nrowX, NULL) ;
   DV_init(&rowDV, maxlen, NULL) ;
   DV_init(&colDV, maxlen, NULL) ;
   row = DV_entries(&rowDV) ;
   col = DV_entries(&colDV) ;
/*
   ------------------------------------------------------
   compute tmpDA2 = X * Y, 
   entry (ii,jj) = (row ii of X) . (column jj of Y),
   a dot product of length ncolX
   ------------------------------------------------------
*/
   for ( jj = 0 ; jj < ncolY ; jj++ ) {
      DA2_extractColumnDV(Y, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowX ; ii++ ) {
         DA2_extractRowDV(X, &rowDV, ii) ;
         value = DVdot(ncolX, row, col) ;
         DA2_setEntry(&tmpDA2, ii, jj, value) ;
      }
   }
/*
   ------------------------------------------------------
   compute A -= tmpDA2 * Z^T, 
   entry (ii,jj) = (row ii of tmpDA2) . (row jj of Z),
   a dot product of length ncolY
   ------------------------------------------------------
*/
   for ( jj = 0 ; jj < ncolA ; jj++ ) {
      DA2_extractRowDV(Z, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowA ; ii++ ) {
         DA2_extractRowDV(&tmpDA2, &rowDV, ii) ;
         value = - DVdot(ncolY, row, col) ;
         DA2_addEntry(A, ii, jj, value) ;
      }
   }
/*
   ------------------------
   free the working storage
   ------------------------
*/
   DA2_clearData(&tmpDA2) ;
   DV_clearData(&rowDV) ;
   DV_clearData(&colDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
/*
   ------------------------------------------------------------------
   nonsymmetric sparse 3 term update A -= X * Y * Z^T

   entry (i,j) of the product X * Y * Z^T is subtracted from
   entry (rowloc[i], colloc[j]) of A.

   A       -- matrix that is updated in place
   rowloc  -- rowloc[i] is the row of A that receives row i of the
              product, read for i = 0, ..., X->n1 - 1
   colloc  -- colloc[j] is the column of A that receives column j of
              the product, read for j = 0, ..., Z->n1 - 1
   X, Y, Z -- the three factors. the vector lengths handed to the
              kernels imply X->n2 = Y->n1 and Y->n2 = Z->n2,
              nothing is checked here.
   tmpDV   -- work vector, used only by the two fast cases.
              DV_setSize() is called on it when DV_size() reports
              fewer than 3*nrowY (first case) or 3*ncolY (second
              case) entries.

   the code path is chosen from the strides of X, Y and Z
      (1) inc2X == 1, inc2Y == 1, inc2Z == 1
      (2) inc2X == 1, inc1Y == 1, inc2Z == 1
      (3) anything else, entry by entry fallback

   note: in case (2) DA2_transpose(Y) is called before and after
   every matrix-vector multiply -- presumably the second call
   restores Y, confirm against DA2_transpose().

   created -- 96oct18, cca
   ------------------------------------------------------------------
*/
void
DA2_ns3Upd (
   DA2   *A,
   int   rowloc[],
   int   colloc[],
   DA2   *X,
   DA2   *Y,
   DA2   *Z,
   DV    *tmpDV
) {
int      inc1A, inc1X, inc1Y, inc1Z,
         inc2A, inc2X, inc2Y, inc2Z,
         ncolX, ncolY, nrowX, nrowY, nrowZ ;
double   *entA, *entX, *entY, *entZ ;
/*
   --------------------------------
   pull out dimensions and pointers
   (entY is only read by the MYDEBUG output)
   --------------------------------
*/
inc1A = A->inc1    ;
inc2A = A->inc2    ;
entA  = A->entries ;
nrowX = X->n1      ;
ncolX = X->n2      ;
inc1X = X->inc1    ;
inc2X = X->inc2    ;
entX  = X->entries ;
nrowY = Y->n1      ;
ncolY = Y->n2      ;
inc1Y = Y->inc1    ;
inc2Y = Y->inc2    ;
entY  = Y->entries ;
nrowZ = Z->n1      ;
inc1Z = Z->inc1    ;
inc2Z = Z->inc2    ;
entZ  = Z->entries ;
#if MYDEBUG > 0
fprintf(stdout, "\n entA = %p", entA) ;
fprintf(stdout, "\n entX = %p", entX) ;
fprintf(stdout, "\n entY = %p", entY) ;
fprintf(stdout, "\n entZ = %p", entZ) ;
#endif
if ( inc2X == 1 && inc2Y == 1 && inc2Z == 1 ) {
/*
   ----------------------------------------
   first case, X, Y and Z are all row major

   for each group of up to three rows of Z (columns of Z^T)
      tmp = Y * (those columns)   -- nrowY entries per column
      every row of X is dotted with the tmp columns and the
      result is scattered into A through rowloc[] and colloc[]
   ----------------------------------------
*/
   double   *colZT0, *colZT1, *colZT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      i0, i1, i2, irowX, jcolZT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   tmp0, tmp1 and tmp2 are three consecutive
   pieces of length nrowY
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*nrowY ) {
      DV_setSize(tmpDV, 3*nrowY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + nrowY ;
   tmp2 = tmp1 + nrowY ;
/* 
   ---------------------------------------
   loop over the columns of Z^T by triples
   ---------------------------------------
*/
   colZT0 = entZ ;
   for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
      colZT1 = colZT0 + inc1Z ;
      colZT2 = colZT1 + inc1Z ;
      j0     = colloc[jcolZT] * inc2A ;
      j1     = colloc[jcolZT+1] * inc2A ;
      j2     = colloc[jcolZT+2] * inc2A ;
/*
      ---------------------------------------------------------
      compute [ tmp0 tmp1 tmp2 ] = Y * [ colZT0 colZT1 colZT2 ]
      ---------------------------------------------------------
*/
      DVzero(3*nrowY, tmp0) ;
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, colZT0, colZT1, colZT2) ;
#if MYDEBUG > 0
      fprintf(stdout, "\n after DA2_mvm_33") ;
      fprintf(stdout, "\n tmp0") ;
      DVfprintf(stdout, nrowY, tmp0) ;
      fprintf(stdout, "\n tmp1") ;
      DVfprintf(stdout, nrowY, tmp1) ;
      fprintf(stdout, "\n tmp2") ;
      DVfprintf(stdout, nrowY, tmp2) ;
#endif
/*
      -----------------------------------------------
      update three columns of A, rows of X by triples
      sums[] is used as a 3 x 3 block stored by rows
      -----------------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         i2 = rowloc[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         rowX0 = rowX2 + inc1X ;
      }
/*
      ----------------------------------
      two or one leftover rows of X,
      rowX0 already points to the first
      ----------------------------------
*/
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x3(sums, nrowY, rowX0, rowX1, tmp0, tmp1, tmp2) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x3(sums, nrowY, rowX0, tmp0, tmp1, tmp2) ;
         i0 = rowloc[irowX] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
      }
      colZT0 = colZT2 + inc1Z ;
   }
   if ( jcolZT == nrowZ - 2 ) {
/*
      ---------------------------------------------------
      two leftover columns of Z^T, colZT0 is the first
      ---------------------------------------------------
*/
      colZT1 = colZT0 + inc1Z ;
      j0     = colloc[jcolZT] * inc2A ;
      j1     = colloc[jcolZT+1] * inc2A ;
/*
      ---------------------------------------------
      compute [ tmp0 tmp1 ] = Y * [ colZT0 colZT1 ]
      ---------------------------------------------
*/
      DVzero(2*nrowY, tmp0) ;
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, colZT0, colZT1) ;
/*
      ---------------------------------------------
      update two columns of A, rows of X by triples
      sums[] is used as a 3 x 2 block stored by rows
      ---------------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x2(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         i2 = rowloc[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         rowX0 = rowX2 + inc1X ;
      }
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x2(sums, nrowY, rowX0, rowX1, tmp0, tmp1) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x2(sums, nrowY, rowX0, tmp0, tmp1) ;
         i0 = rowloc[irowX] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
      }
   } else if ( jcolZT == nrowZ - 1 ) {
/*
      -------------------------------------------
      one leftover column of Z^T, it is colZT0
      -------------------------------------------
*/
      j0 = colloc[jcolZT] * inc2A ;
/*
      ---------------------------------
      compute [ tmp0 ] = Y * [ colZT0 ]
      ---------------------------------
*/
      DVzero(nrowY, tmp0) ;
      DA2_mvm1vec(Y, tmp0, 1.0, colZT0) ;
/*
      --------------------------------------------
      update one column of A, rows of X by triples
      --------------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x1(sums, nrowY, rowX0, rowX1, rowX2, tmp0) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         i2 = rowloc[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         rowX0 = rowX2 + inc1X ;
      }
      if ( irowX == nrowX - 2 ) {
         rowX1 = rowX0 + inc1X ;
         mdot2x1(sums, nrowY, rowX0, rowX1, tmp0) ;
         i0 = rowloc[irowX] * inc1A ;
         i1 = rowloc[irowX+1] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
      } else if ( irowX == nrowX - 1 ) {
         mdot1x1(sums, nrowY, rowX0, tmp0) ;
         i0 = rowloc[irowX] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
      }
   }
} else if ( inc2X == 1 && inc1Y == 1 && inc2Z == 1 ) {
/*
   -----------------------------------------------------
   second case, X and Z are row major, Y is column major

   for each group of up to three rows of X
      tmp = (those rows) * Y   -- ncolY entries per row
      the tmp rows are dotted with every row of Z and the
      result is scattered into A through rowloc[] and colloc[]
   -----------------------------------------------------
*/
   double   *colZT0, *colZT1, *colZT2, 
            *rowX0, *rowX1, *rowX2, *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolZT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   tmp0, tmp1 and tmp2 are three consecutive
   pieces of length ncolY
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*ncolY ) {
      DV_setSize(tmpDV, 3*ncolY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + ncolY ;
   tmp2 = tmp1 + ncolY ;
/* 
   ----------------------------------
   loop over the rows of X by triples
   ----------------------------------
*/
   rowX0 = entX ;
   for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      i0     = rowloc[irowX] * inc1A ;
      i1     = rowloc[irowX+1] * inc1A ;
      i2     = rowloc[irowX+2] * inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
              [ tmp2 ] = [ rowX2 ] 
      Y is transposed for the multiply
      and transposed again afterwards
      --------------------------------
*/
      DVzero(3*ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
      DA2_transpose(Y) ;
/*
      -------------------------------------------------
      update three rows of A, columns of Z^T by triples
      -------------------------------------------------
*/
      colZT0 = entZ ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colZT0, colZT1, colZT2);
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         j2 = colloc[jcolZT+2] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         colZT0 = colZT2 + inc1Z ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot3x2(sums, ncolY, tmp0, tmp1, tmp2, colZT0, colZT1);
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot3x1(sums, ncolY, tmp0, tmp1, tmp2, colZT0);
         j0 = colloc[jcolZT] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
      }
      rowX0 = rowX2 + inc1X ;
   }
   if ( irowX == nrowX - 2 ) {
/*
      ---------------------------------------------
      two leftover rows of X, rowX0 is the first
      ---------------------------------------------
*/
      rowX1 = rowX0 + inc1X ;
      i0    = rowloc[irowX] * inc1A ;
      i1    = rowloc[irowX+1] * inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
      --------------------------------
*/
      DVzero(2*ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------------------
      update two rows of A, columns of Z^T by triples
      -----------------------------------------------
*/
      colZT0 = entZ ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot2x3(sums, ncolY, tmp0, tmp1, colZT0, colZT1, colZT2) ;
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         j2 = colloc[jcolZT+2] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         colZT0 = colZT2 + inc1Z ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot2x2(sums, ncolY, tmp0, tmp1, colZT0, colZT1) ;
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot2x1(sums, ncolY, tmp0, tmp1, colZT0) ;
         j0 = colloc[jcolZT] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
      }
   } else if ( irowX == nrowX - 1 ) {
/*
      -------------------------------------
      one leftover row of X, it is rowX0
      -------------------------------------
*/
      i0 = rowloc[irowX] * inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
      --------------------------------
*/
      DVzero(ncolY, tmp0) ;
      DA2_transpose(Y) ;
      DA2_mvm1vec(Y, tmp0, 1.0, rowX0) ;
      DA2_transpose(Y) ;
/*
      ----------------------------------------------
      update one row of A, columns of Z^T by triples
      ----------------------------------------------
*/
      colZT0 = entZ ;
      for ( jcolZT = 0 ; jcolZT < nrowZ - 2 ; jcolZT += 3 ) {
         colZT1 = colZT0 + inc1Z ;
         colZT2 = colZT1 + inc1Z ;
         mdot1x3(sums, ncolY, tmp0, colZT0, colZT1, colZT2) ;
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         j2 = colloc[jcolZT+2] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         colZT0 = colZT2 + inc1Z ;
      }
      if ( jcolZT == nrowZ - 2 ) {
         colZT1 = colZT0 + inc1Z ;
         mdot1x2(sums, ncolY, tmp0, colZT0, colZT1) ;
         j0 = colloc[jcolZT] * inc2A ;
         j1 = colloc[jcolZT+1] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
      } else if ( jcolZT == nrowZ - 1 ) {
         mdot1x1(sums, ncolY, tmp0, colZT0) ;
         j0 = colloc[jcolZT] * inc2A ;
         entA[i0 + j0] -= sums[0] ;
      }
   }
} else {
/*
   -----------------------------
   fallback case, slow but right
   tmpDV is not used here
   -----------------------------
*/
   DA2      tmpDA2 ;
   double   value ;
   double   *col, *row ;
   DV       colDV, rowDV ;
   int      ii, jj, maxlen ;
/*
   -------------------------------------------------------------
   initialize the temporary data objects

   tmpDA2 = X * Y has nrowX rows and ncolY columns.
   rowDV holds a row of X (ncolX entries), later a row of tmpDA2
   (ncolY entries). colDV holds a column of Y (ncolX entries),
   later a row of Z (ncolY entries). both are created with the
   larger of the two lengths so that the row and col pointers
   fetched once below stay usable for either content.
   NOTE(review): this assumes the extract routines do not
   reallocate a DV that is already large enough -- confirm.
   -------------------------------------------------------------
*/   
   maxlen = ( ncolX >= ncolY ) ? ncolX : ncolY ;
   DA2_setDefaultFields(&tmpDA2) ;
   DV_setDefaultFields(&rowDV) ;
   DV_setDefaultFields(&colDV) ;
   DA2_init(&tmpDA2, nrowX, ncolY, 1, nrowX, NULL) ;
   DV_init(&rowDV, maxlen, NULL) ;
   DV_init(&colDV, maxlen, NULL) ;
   row = DV_entries(&rowDV) ;
   col = DV_entries(&colDV) ;
/*
   ------------------------------------------
   compute tmpDA2 = X * Y
   the inner dimension of this product is ncolX
   ------------------------------------------
*/
   for ( jj = 0 ; jj < ncolY ; jj++ ) {
      DA2_extractColumnDV(Y, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowX ; ii++ ) {
         DA2_extractRowDV(X, &rowDV, ii) ;
         value = DVdot(ncolX, row, col) ;
         DA2_setEntry(&tmpDA2, ii, jj, value) ;
      }
   }
/*
   ------------------------------------------
   compute A -= tmpDA2 * Z^T
   the inner dimension of this product is ncolY
   ------------------------------------------
*/
   for ( jj = 0 ; jj < nrowZ ; jj++ ) {
      DA2_extractRowDV(Z, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowX ; ii++ ) {
         DA2_extractRowDV(&tmpDA2, &rowDV, ii) ;
         value = - DVdot(ncolY, row, col) ;
         DA2_addEntry(A, rowloc[ii], colloc[jj], value) ;
      }
   }
/*
   ------------------------
   free the working storage
   ------------------------
*/
   DA2_clearData(&tmpDA2) ;
   DV_clearData(&rowDV) ;
   DV_clearData(&colDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
/*
   ------------------------------------------------------------------
   symmetric dense 3 term update A -= X * Y * X^T
   only the upper triangle of A is updated

   entry (i,j) of the product, i <= j, is subtracted from entry
   (i,j) of A, there is no index map.

   A     -- matrix that is updated in place
   X, Y  -- the factors. the vector lengths handed to the kernels
            imply X->n2 = Y->n1 = Y->n2, nothing is checked here.
   tmpDV -- work vector, used only by the two fast cases.
            DV_setSize() is called on it when DV_size() reports
            fewer than 3*nrowY (first case) or 3*ncolY (second
            case) entries.

   the code path is chosen from the strides of X and Y
      (1) inc2X == 1, inc2Y == 1
      (2) inc2X == 1, inc1Y == 1
      (3) anything else, entry by entry fallback

   note: in case (2) DA2_transpose(Y) is called before and after
   every matrix-vector multiply -- presumably the second call
   restores Y, confirm against DA2_transpose().

   created -- 96oct18, cca
   ------------------------------------------------------------------
*/
void
DA2_sd3Upd (
   DA2   *A,
   DA2   *X,
   DA2   *Y,
   DV    *tmpDV
) {
int      inc1A, inc1X, inc1Y, inc2A, inc2X, inc2Y, 
         ncolA, ncolX, ncolY, nrowA, nrowX, nrowY ;
double   *entA, *entX, *entY ;
/*
   --------------------------------
   pull out dimensions and pointers
   (nrowA and ncolA are not read afterwards,
    entY is only read by the MYDEBUG output)
   --------------------------------
*/
nrowA = A->n1      ;
ncolA = A->n2      ;
inc1A = A->inc1    ;
inc2A = A->inc2    ;
entA  = A->entries ;
nrowX = X->n1      ;
ncolX = X->n2      ;
inc1X = X->inc1    ;
inc2X = X->inc2    ;
entX  = X->entries ;
nrowY = Y->n1      ;
ncolY = Y->n2      ;
inc1Y = Y->inc1    ;
inc2Y = Y->inc2    ;
entY  = Y->entries ;
#if MYDEBUG > 0
fprintf(stdout, "\n entA = %p", entA) ;
fprintf(stdout, "\n entX = %p", entX) ;
fprintf(stdout, "\n entY = %p", entY) ;
#endif
if ( inc2X == 1 && inc2Y == 1 ) {
/*
   ---------------------------------
   first case, X and Y are row major

   for each group of up to three rows of X (columns of X^T)
      tmp = Y * (those columns)   -- nrowY entries per column
      rows of X above the group are dotted with tmp and give
      full blocks of A, the group itself gives the diagonal
      block of which only the upper triangle is written
   ---------------------------------
*/
   double   *colXT0, *colXT1, *colXT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolXT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   tmp0, tmp1 and tmp2 are three consecutive
   pieces of length nrowY
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*nrowY ) {
      DV_setSize(tmpDV, 3*nrowY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + nrowY ;
   tmp2 = tmp1 + nrowY ;
/* 
   ---------------------------------------
   loop over the columns of X^T by triples
   colXT0 and j0 are running values, they are
   advanced at the bottom of the loop and are
   still needed by the remainder code below
   ---------------------------------------
*/
   colXT0 = entX ;
   j0     =   0  ;
   for ( jcolXT = 0 ; jcolXT < nrowX - 2 ; jcolXT += 3 ) {
      colXT1 = colXT0 + inc1X ;
      colXT2 = colXT1 + inc1X ;
      j1     = j0     + inc2A ;
      j2     = j1     + inc2A ;
/*
      ---------------------------------------------------------
      compute [ tmp0 tmp1 tmp2 ] = Y * [ colXT0 colXT1 colXT2 ]
      ---------------------------------------------------------
*/
      DVzero(3*nrowY, tmp0) ;
/*
      DA2_mvm_33(Y, tmp0, tmp1, tmp2, 1.0, colXT0, colXT1, colXT2) ;
*/
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, colXT0, colXT1, colXT2) ;
#if MYDEBUG > 0
      fprintf(stdout, "\n after DA2_mvm_33") ;
      fprintf(stdout, "\n tmp0") ;
      DVfprintf(stdout, nrowY, tmp0) ;
      fprintf(stdout, "\n tmp1") ;
      DVfprintf(stdout, nrowY, tmp1) ;
      fprintf(stdout, "\n tmp2") ;
      DVfprintf(stdout, nrowY, tmp2) ;
#endif
/*
      --------------------------------------
      now update the columns of A by triples
      rows 0 .. jcolXT-1 give full 3 x 3 blocks,
      jcolXT is a multiple of 3 so the loop
      leaves irowX == jcolXT with rowX0 and i0
      at the first row of the diagonal block
      --------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < jcolXT - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
/*
      ------------------------------------------------
      diagonal 3 x 3 block, all nine sums are computed
      but only the six upper triangular ones are used
      ------------------------------------------------
*/
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
      i1 = i0 + inc1A ;
      i2 = i1 + inc1A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i0 + j2] -= sums[2] ;
      entA[i1 + j1] -= sums[4] ;
      entA[i1 + j2] -= sums[5] ;
      entA[i2 + j2] -= sums[8] ;
      colXT0 = colXT2 + inc1X ;
      j0     = j2     + inc2A ;
   }
   if ( jcolXT == nrowX - 2 ) {
/*
      --------------------------------------------------
      two leftover columns, colXT0 and j0 come from the
      loop above (or from its initialization)
      --------------------------------------------------
*/
      colXT1 = colXT0 + inc1X ;
      j1     = j0     + inc2A ;
/*
      ---------------------------------------------
      compute [ tmp0 tmp1 ] = Y * [ colXT0 colXT1 ]
      ---------------------------------------------
*/
      DVzero(2*nrowY, tmp0) ;
/*
      DA2_mvm_32(Y, tmp0, tmp1, 1.0, colXT0, colXT1) ;
*/
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, colXT0, colXT1) ;
/*
      --------------------------------------
      now update the columns of A by triples
      here nrowX - 2 == jcolXT, a multiple of 3,
      so the loop covers rows 0 .. jcolXT-1
      --------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x2(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
/*
      -------------------------------------------
      diagonal 2 x 2 block, sums[2] is not used
      -------------------------------------------
*/
      rowX1 = rowX0 + inc1X ;
      mdot2x2(sums, nrowY, rowX0, rowX1, tmp0, tmp1) ;
      i1 = i0 + inc1A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i1 + j1] -= sums[3] ;
   } else if ( jcolXT == nrowX - 1 ) {
/*
      ---------------------------------
      one leftover column, colXT0 and j0
      come from the loop above
      compute [ tmp0 ] = Y * [ colXT0 ]
      ---------------------------------
*/
      DVzero(nrowY, tmp0) ;
/*
      DA2_mvm_3(Y, tmp0, 1.0, colXT0) ;
*/
      DA2_mvm1vec(Y, tmp0, 1.0, colXT0) ;
/*
      --------------------------------------
      now update the columns of A by triples
      here nrowX - 1 == jcolXT, a multiple of 3,
      so the loop covers rows 0 .. jcolXT-1
      --------------------------------------
*/
      rowX0 = entX ;
      i0    =   0  ;
      for ( irowX = 0 ; irowX < nrowX - 1 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x1(sums, nrowY, rowX0, rowX1, rowX2, tmp0) ;
         i1 = i0 + inc1A ;
         i2 = i1 + inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         rowX0 = rowX2 + inc1X ;
         i0    = i2    + inc1A ;
      }
/*
      ------------------
      the diagonal entry
      ------------------
*/
      mdot1x1(sums, nrowY, rowX0, tmp0) ;
      entA[i0 + j0] -= sums[0] ;
   }
} else if ( inc2X == 1 && inc1Y == 1 ) {
/*
   ----------------------------------------------
   second case, X is row major, Y is column major

   for each group of up to three rows of X
      tmp = (those rows) * Y   -- ncolY entries per row
      the diagonal block is done first (upper triangle only),
      then the tmp rows are dotted with the rows of X that
      follow the group to give the blocks to its right
   ----------------------------------------------
*/
   double   *colXT0, *colXT1, *colXT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolXT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   tmp0, tmp1 and tmp2 are three consecutive
   pieces of length ncolY
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*ncolY ) {
      DV_setSize(tmpDV, 3*ncolY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + ncolY ;
   tmp2 = tmp1 + ncolY ;
/* 
   -----------------------
   loop over the rows of X
   rowX0 and i0 are running values, they are
   advanced at the bottom of the loop and are
   still needed by the remainder code below
   -----------------------
*/
   rowX0 = entX ;
   i0    =   0  ;
   for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      i1    = i0    + inc1A ;
      i2    = i1    + inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
              [ tmp2 ] = [ rowX2 ] 
      Y is transposed for the multiply
      and transposed again afterwards
      --------------------------------
*/
      DVzero(3*ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_33(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
*/
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------
      now update the rows of A by triples
      first the diagonal 3 x 3 block, the
      columns of X^T are rows irowX .. irowX+2
      of X, only the upper triangle is written
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      colXT1 = colXT0 + inc1X ;
      colXT2 = colXT1 + inc1X ;
      mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1, colXT2);
      j0 = irowX*inc2A ;
      j1 = j0  + inc2A ;
      j2 = j1  + inc2A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i0 + j2] -= sums[2] ;
      entA[i1 + j1] -= sums[4] ;
      entA[i1 + j2] -= sums[5] ;
      entA[i2 + j2] -= sums[8] ;
/*
      ------------------------------------------
      then the full blocks to the right of it,
      colXT0 and j0 step on three at a time
      ------------------------------------------
*/
      colXT0 = colXT2 + inc1X ;
      j0     = j2 + inc2A ;
      for ( jcolXT = irowX + 3 ; jcolXT < nrowX - 2 ; jcolXT += 3 ) {
         colXT1 = colXT0 + inc1X ;
         colXT2 = colXT1 + inc1X ;
         mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1, colXT2);
         j1 = j0 + inc2A ;
         j2 = j1 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         colXT0 = colXT2 + inc1X ;
         j0     = j2     + inc2A ;
      }
/*
      ------------------------------
      two or one leftover columns,
      colXT0 and j0 are already set
      ------------------------------
*/
      if ( jcolXT == nrowX - 2 ) {
         colXT1 = colXT0 + inc1X ;
         mdot3x2(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1);
         j1 = j0 + inc2A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
      } else if ( jcolXT == nrowX - 1 ) {
         mdot3x1(sums, ncolY, tmp0, tmp1, tmp2, colXT0);
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
      }
      rowX0 = rowX2 + inc1X ;
      i0    = i2    + inc1A ;
   }
   if ( irowX == nrowX - 2 ) {
/*
      ------------------------------------------------
      two leftover rows, rowX0 and i0 come from the
      loop above (or from its initialization). only
      the diagonal 2 x 2 block remains to be updated.
      ------------------------------------------------
*/
      rowX1 = rowX0 + inc1X ;
      i1    = i0    + inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
      --------------------------------
*/
      DVzero(2*ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_32(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
*/
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------
      now update the rows of A by triples
      diagonal 2 x 2 block, sums[2] is not used
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      colXT1 = colXT0 + inc1X ;
      mdot2x2(sums, ncolY, tmp0, tmp1, colXT0, colXT1);
      j0 = irowX*inc2A ;
      j1 = j0  + inc2A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i1 + j1] -= sums[3] ;
   } else if ( irowX == nrowX - 1 ) {
/*
      --------------------------------
      one leftover row, rowX0 and i0
      come from the loop above
      compute [ tmp0 ] = [ rowX0 ] * Y
      --------------------------------
*/
      DVzero(ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_3(Y, tmp0, 1.0, rowX0) ;
*/
      DA2_mvm1vec(Y, tmp0, 1.0, rowX0) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------
      now update the rows of A by triples
      only the diagonal entry remains
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      mdot1x1(sums, ncolY, tmp0, colXT0);
      j0 = irowX*inc2A ;
      entA[i0 + j0] -= sums[0] ;
   }
} else {
/*
   -----------------------------
   fallback case, slow but right
   tmpDV is not used here
   -----------------------------
*/
   DA2      tmpDA2 ;
   double   value ;
   double   *col, *row ;
   DV       colDV, rowDV ;
   int      ii, jj ;
/*
   -------------------------------------
   initialize the temporary data objects
   tmpDA2 is created as nrowX x ncolX and
   both work vectors with ncolX entries,
   i.e. Y is treated as ncolX x ncolX.
   row and col are fetched once and reused
   -- NOTE(review): assumes the extract
   routines do not reallocate the vectors.
   -------------------------------------
*/   
   DA2_setDefaultFields(&tmpDA2) ;
   DV_setDefaultFields(&rowDV) ;
   DV_setDefaultFields(&colDV) ;
   DA2_init(&tmpDA2, nrowX, ncolX, 1, nrowX, NULL) ;
   DV_init(&rowDV, ncolX, NULL) ;
   DV_init(&colDV, ncolX, NULL) ;
   row = DV_entries(&rowDV) ;
   col = DV_entries(&colDV) ;
/*
   ----------------------
   compute tmpDA2 = X * Y
   one dot product per entry
   ----------------------
*/
   for ( jj = 0 ; jj < ncolX ; jj++ ) {
      DA2_extractColumnDV(Y, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowX ; ii++ ) {
         DA2_extractRowDV(X, &rowDV, ii) ;
         value = DVdot(ncolX, row, col) ;
         DA2_setEntry(&tmpDA2, ii, jj, value) ;
      }
   }
/*
   -------------------------
   compute A -= tmpDA2 * X^T
   ii <= jj restricts the update
   to the upper triangle of A
   -------------------------
*/
   for ( jj = 0 ; jj < nrowX ; jj++ ) {
      DA2_extractRowDV(X, &colDV, jj) ;
      for ( ii = 0 ; ii <= jj ; ii++ ) {
         DA2_extractRowDV(&tmpDA2, &rowDV, ii) ;
         value = - DVdot(ncolX, row, col) ;
         DA2_addEntry(A, ii, jj, value) ;
      }
   }
/*
   ------------------------
   free the working storage
   ------------------------
*/
   DA2_clearData(&tmpDA2) ;
   DV_clearData(&rowDV) ;
   DV_clearData(&colDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
/*
   -------------------------------------------------
   symmetric sparse 3 term update A -= X * Y * X^T
   only the upper triangle of A is updated

   created -- 96oct18, cca
   -------------------------------------------------
*/
void
DA2_ss3Upd (
   DA2   *A,
   int   locind[],
   DA2   *X,
   DA2   *Y,
   DV    *tmpDV
) {
int      inc1A, inc1X, inc1Y, inc2A, inc2X, inc2Y, 
         ncolA, ncolX, ncolY, nrowA, nrowX, nrowY ;
double   *entA, *entX, *entY ;
/*
   --------------------------------
   pull out dimensions and pointers
   --------------------------------
*/
nrowA = A->n1      ;
ncolA = A->n2      ;
inc1A = A->inc1    ;
inc2A = A->inc2    ;
entA  = A->entries ;
nrowX = X->n1      ;
ncolX = X->n2      ;
inc1X = X->inc1    ;
inc2X = X->inc2    ;
entX  = X->entries ;
nrowY = Y->n1      ;
ncolY = Y->n2      ;
inc1Y = Y->inc1    ;
inc2Y = Y->inc2    ;
entY  = Y->entries ;
#if MYDEBUG > 0
fprintf(stdout, "\n entA = %p", entA) ;
fprintf(stdout, "\n entX = %p", entX) ;
fprintf(stdout, "\n entY = %p", entY) ;
#endif
if ( inc2X == 1 && inc2Y == 1 ) {
/*
   ---------------------------------
   first case, X and Y are row major
   ---------------------------------
*/
   double   *colXT0, *colXT1, *colXT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolXT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*nrowY ) {
      DV_setSize(tmpDV, 3*nrowY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + nrowY ;
   tmp2 = tmp1 + nrowY ;
/* 
   ---------------------------------------
   loop over the columns of X^T by triples
   ---------------------------------------
*/
   colXT0 = entX ;
   for ( jcolXT = 0 ; jcolXT < nrowX - 2 ; jcolXT += 3 ) {
      colXT1 = colXT0 + inc1X ;
      colXT2 = colXT1 + inc1X ;
      j0     = locind[jcolXT] * inc2A ;
      j1     = locind[jcolXT+1] * inc2A ;
      j2     = locind[jcolXT+2] * inc2A ;
/*
      ---------------------------------------------------------
      compute [ tmp0 tmp1 tmp2 ] = Y * [ colXT0 colXT1 colXT2 ]
      ---------------------------------------------------------
*/
      DVzero(3*nrowY, tmp0) ;
/*
      DA2_mvm_33(Y, tmp0, tmp1, tmp2, 1.0, colXT0, colXT1, colXT2) ;
*/
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, colXT0, colXT1, colXT2) ;
#if MYDEBUG > 0
      fprintf(stdout, "\n after DA2_mvm_33") ;
      fprintf(stdout, "\n tmp0") ;
      DVfprintf(stdout, nrowY, tmp0) ;
      fprintf(stdout, "\n tmp1") ;
      DVfprintf(stdout, nrowY, tmp1) ;
      fprintf(stdout, "\n tmp2") ;
      DVfprintf(stdout, nrowY, tmp2) ;
#endif
/*
      --------------------------------------
      now update the columns of A by triples
      --------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < jcolXT - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
         i0 = locind[irowX] * inc1A ;
         i1 = locind[irowX+1] * inc1A ;
         i2 = locind[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
         rowX0 = rowX2 + inc1X ;
      }
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      mdot3x3(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1, tmp2) ;
      i0 = locind[irowX] * inc1A ;
      i1 = locind[irowX+1] * inc1A ;
      i2 = locind[irowX+2] * inc1A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i0 + j2] -= sums[2] ;
      entA[i1 + j1] -= sums[4] ;
      entA[i1 + j2] -= sums[5] ;
      entA[i2 + j2] -= sums[8] ;
      colXT0 = colXT2 + inc1X ;
   }
   if ( jcolXT == nrowX - 2 ) {
      colXT1 = colXT0 + inc1X ;
      j0     = locind[jcolXT] * inc2A ;
      j1     = locind[jcolXT+1] * inc2A ;
/*
      ---------------------------------------------
      compute [ tmp0 tmp1 ] = Y * [ colXT0 colXT1 ]
      ---------------------------------------------
*/
      DVzero(2*nrowY, tmp0) ;
/*
      DA2_mvm_32(Y, tmp0, tmp1, 1.0, colXT0, colXT1) ;
*/
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, colXT0, colXT1) ;
/*
      --------------------------------------
      now update the columns of A by triples
      --------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x2(sums, nrowY, rowX0, rowX1, rowX2, tmp0, tmp1) ;
         i0 = locind[irowX] * inc1A ;
         i1 = locind[irowX+1] * inc1A ;
         i2 = locind[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
         rowX0 = rowX2 + inc1X ;
      }
      rowX1 = rowX0 + inc1X ;
      mdot2x2(sums, nrowY, rowX0, rowX1, tmp0, tmp1) ;
      i0 = locind[irowX] * inc1A ;
      i1 = locind[irowX+1] * inc1A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i1 + j1] -= sums[3] ;
   } else if ( jcolXT == nrowX - 1 ) {
      j0 = locind[jcolXT] * inc2A ;
/*
      ---------------------------------
      compute [ tmp0 ] = Y * [ colXT0 ]
      ---------------------------------
*/
      DVzero(nrowY, tmp0) ;
/*
      DA2_mvm_3(Y, tmp0, 1.0, colXT0) ;
*/
      DA2_mvm1vec(Y, tmp0, 1.0, colXT0) ;
/*
      --------------------------------------
      now update the columns of A by triples
      --------------------------------------
*/
      rowX0 = entX ;
      for ( irowX = 0 ; irowX < nrowX - 1 ; irowX += 3 ) {
         rowX1 = rowX0 + inc1X ;
         rowX2 = rowX1 + inc1X ;
         mdot3x1(sums, nrowY, rowX0, rowX1, rowX2, tmp0) ;
         i0 = locind[irowX] * inc1A ;
         i1 = locind[irowX+1] * inc1A ;
         i2 = locind[irowX+2] * inc1A ;
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
         rowX0 = rowX2 + inc1X ;
      }
      mdot1x1(sums, nrowY, rowX0, tmp0) ;
      i0 = locind[irowX] * inc1A ;
      entA[i0 + j0] -= sums[0] ;
   }
} else if ( inc2X == 1 && inc1Y == 1 ) {
/*
   ----------------------------------------------
   second case, X is row major, Y is column major
   ----------------------------------------------
*/
   double   *colXT0, *colXT1, *colXT2, *rowX0, *rowX1, *rowX2, 
            *tmp0, *tmp1, *tmp2 ;
   double   sums[9] ;
   int      irowX, i0, i1, i2, jcolXT, j0, j1, j2 ;
/*
   -----------------------------------------
   check the dimensions of the tmp DV object
   -----------------------------------------
*/
   if ( DV_size(tmpDV) < 3*ncolY ) {
      DV_setSize(tmpDV, 3*ncolY) ;
   }
   tmp0 = DV_entries(tmpDV) ;
   tmp1 = tmp0 + ncolY ;
   tmp2 = tmp1 + ncolY ;
/* 
   -----------------------
   loop over the rows of X
   -----------------------
*/
   rowX0 = entX ;
   for ( irowX = 0 ; irowX < nrowX - 2 ; irowX += 3 ) {
      rowX1 = rowX0 + inc1X ;
      rowX2 = rowX1 + inc1X ;
      i0    = locind[irowX] * inc1A ;
      i1    = locind[irowX+1] * inc1A ;
      i2    = locind[irowX+2] * inc1A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n irowX = %d, i0 = %d, i1 = %d, i2 = %d", 
              irowX, i0, i1, i2) ;
      fprintf(stdout, "\n rowX0") ;
      DVfprintf(stdout, ncolY, rowX0) ;
      fprintf(stdout, "\n rowX1") ;
      DVfprintf(stdout, ncolY, rowX1) ;
      fprintf(stdout, "\n rowX2") ;
      DVfprintf(stdout, ncolY, rowX2) ;
#endif
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
              [ tmp2 ] = [ rowX2 ] 
      --------------------------------
*/
      DVzero(3*ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_33(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
*/
      DA2_mvm3vec(Y, tmp0, tmp1, tmp2, 1.0, rowX0, rowX1, rowX2) ;
      DA2_transpose(Y) ;
#if MYDEBUG > 0
      fprintf(stdout, "\n tmp0") ;
      DVfprintf(stdout, ncolY, tmp0) ;
      fprintf(stdout, "\n tmp1") ;
      DVfprintf(stdout, ncolY, tmp1) ;
      fprintf(stdout, "\n tmp2") ;
      DVfprintf(stdout, ncolY, tmp2) ;
#endif
/*
      -----------------------------------
      now update the rows of A by triples
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      colXT1 = colXT0 + inc1X ;
      colXT2 = colXT1 + inc1X ;
#if MYDEBUG > 0
      fprintf(stdout, "\n colXT0") ;
      DVfprintf(stdout, ncolY, colXT0) ;
      fprintf(stdout, "\n colXT1") ;
      DVfprintf(stdout, ncolY, colXT1) ;
      fprintf(stdout, "\n colXT2") ;
      DVfprintf(stdout, ncolY, colXT2) ;
#endif
      mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1, colXT2);
      j0 = locind[irowX] * inc2A ;
      j1 = locind[irowX+1] * inc2A ;
      j2 = locind[irowX+2] * inc2A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n j0 = %d, j1 = %d, j2 = %d", 
              j0, j1, j2) ;
      fprintf(stdout, "\n sums") ;
      DVfprintf(stdout, 9, sums) ;
#endif
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i0 + j2] -= sums[2] ;
      entA[i1 + j1] -= sums[4] ;
      entA[i1 + j2] -= sums[5] ;
      entA[i2 + j2] -= sums[8] ;
#if MYDEBUG > 0
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j1, entA[i0 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j2, entA[i0 + j2]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j1, entA[i1 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j2, entA[i1 + j2]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i2 + j2, entA[i2 + j2]) ;
#endif
      colXT0 = colXT2 + inc1X ;
      for ( jcolXT = irowX + 3 ; jcolXT < nrowX - 2 ; jcolXT += 3 ) {
         colXT1 = colXT0 + inc1X ;
         colXT2 = colXT1 + inc1X ;
#if MYDEBUG > 0
         fprintf(stdout, "\n jcolXT = %d", jcolXT) ;
         fprintf(stdout, "\n colXT0") ;
         DVfprintf(stdout, ncolY, colXT0) ;
         fprintf(stdout, "\n colXT1") ;
         DVfprintf(stdout, ncolY, colXT1) ;
         fprintf(stdout, "\n colXT2") ;
         DVfprintf(stdout, ncolY, colXT2) ;
#endif
         mdot3x3(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1, colXT2);
         j0 = locind[jcolXT] * inc2A ;
         j1 = locind[jcolXT+1] * inc2A ;
         j2 = locind[jcolXT+2] * inc2A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n j0 = %d, j1 = %d, j2 = %d", 
              j0, j1, j2) ;
#endif
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i0 + j2] -= sums[2] ;
         entA[i1 + j0] -= sums[3] ;
         entA[i1 + j1] -= sums[4] ;
         entA[i1 + j2] -= sums[5] ;
         entA[i2 + j0] -= sums[6] ;
         entA[i2 + j1] -= sums[7] ;
         entA[i2 + j2] -= sums[8] ;
#if MYDEBUG > 0
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j1, entA[i0 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j2, entA[i0 + j2]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j0, entA[i1 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j1, entA[i1 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j2, entA[i1 + j2]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i2 + j0, entA[i2 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i2 + j1, entA[i2 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i2 + j2, entA[i2 + j2]) ;
#endif
         colXT0 = colXT2 + inc1X ;
      }
      if ( jcolXT == nrowX - 2 ) {
         colXT1 = colXT0 + inc1X ;
         mdot3x2(sums, ncolY, tmp0, tmp1, tmp2, colXT0, colXT1);
         j0 = locind[jcolXT] * inc2A ;
         j1 = locind[jcolXT+1] * inc2A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n j0 = %d, j1 = %d", j0, j1) ;
#endif
         entA[i0 + j0] -= sums[0] ;
         entA[i0 + j1] -= sums[1] ;
         entA[i1 + j0] -= sums[2] ;
         entA[i1 + j1] -= sums[3] ;
         entA[i2 + j0] -= sums[4] ;
         entA[i2 + j1] -= sums[5] ;
#if MYDEBUG > 0
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i1 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i1 + j1]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i2 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i2 + j1]) ;
#endif
      } else if ( jcolXT == nrowX - 1 ) {
         mdot3x1(sums, ncolY, tmp0, tmp1, tmp2, colXT0);
         j0 = locind[jcolXT] * inc2A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n j0 = %d", j0) ;
#endif
         entA[i0 + j0] -= sums[0] ;
         entA[i1 + j0] -= sums[1] ;
         entA[i2 + j0] -= sums[2] ;
#if MYDEBUG > 0
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i1 + j0, entA[i1 + j0]) ;
      fprintf(stdout, "\n entA[%d] = %12.4e", i2 + j0, entA[i2 + j0]) ;
#endif
      }
      rowX0 = rowX2 + inc1X ;
   }
   if ( irowX == nrowX - 2 ) {
      rowX1 = rowX0 + inc1X ;
      i0    = locind[irowX] * inc1A ;
      i1    = locind[irowX+1] * inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
              [ tmp1 ] = [ rowX1 ] 
      --------------------------------
*/
      DVzero(2*ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_32(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
*/
      DA2_mvm2vec(Y, tmp0, tmp1, 1.0, rowX0, rowX1) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------
      now update the rows of A by triples
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      colXT1 = colXT0 + inc1X ;
      mdot2x2(sums, ncolY, tmp0, tmp1, colXT0, colXT1);
      j0 = locind[irowX] * inc2A ;
      j1 = locind[irowX+1] * inc2A ;
      entA[i0 + j0] -= sums[0] ;
      entA[i0 + j1] -= sums[1] ;
      entA[i1 + j1] -= sums[3] ;
   } else if ( irowX == nrowX - 1 ) {
      i0 = locind[irowX] * inc1A ;
/*
      --------------------------------
      compute [ tmp0 ] = [ rowX0 ] * Y
      --------------------------------
*/
      DVzero(ncolY, tmp0) ;
      DA2_transpose(Y) ;
/*
      DA2_mvm_3(Y, tmp0, 1.0, rowX0) ;
*/
      DA2_mvm1vec(Y, tmp0, 1.0, rowX0) ;
      DA2_transpose(Y) ;
/*
      -----------------------------------
      now update the rows of A by triples
      -----------------------------------
*/
      colXT0 = entX + irowX*inc1X ;
      mdot1x1(sums, ncolY, tmp0, colXT0);
      j0 = locind[irowX] * inc2A ;
#if MYDEBUG > 0
      fprintf(stdout, "\n j0 = %d", j0) ;
#endif
      entA[i0 + j0] -= sums[0] ;
#if MYDEBUG > 0
      fprintf(stdout, "\n entA[%d] = %12.4e", i0 + j0, entA[i0 + j0]) ;
#endif
   }
} else {
/*
   -----------------------------
   fallback case, slow but right
   -----------------------------
*/
   DA2      tmpDA2 ;
   double   value ;
   double   *col, *row ;
   DV       colDV, rowDV ;
   int      ii, jj ;
/*
   -------------------------------------
   initialize the temporary data objects
   -------------------------------------
*/   
   DA2_setDefaultFields(&tmpDA2) ;
   DV_setDefaultFields(&rowDV) ;
   DV_setDefaultFields(&colDV) ;
   DA2_init(&tmpDA2, nrowX, ncolX, 1, nrowX, NULL) ;
   DV_init(&rowDV, ncolX, NULL) ;
   DV_init(&colDV, ncolX, NULL) ;
   row = DV_entries(&rowDV) ;
   col = DV_entries(&colDV) ;
/*
   ----------------------
   compute tmpDA2 = X * Y
   ----------------------
*/
   for ( jj = 0 ; jj < ncolX ; jj++ ) {
      DA2_extractColumnDV(Y, &colDV, jj) ;
      for ( ii = 0 ; ii < nrowX ; ii++ ) {
         DA2_extractRowDV(X, &rowDV, ii) ;
         value = DVdot(ncolX, row, col) ;
         DA2_setEntry(&tmpDA2, ii, jj, value) ;
      }
   }
/*
   -------------------------
   compute A -= tmpDA2 * X^T
   -------------------------
*/
   for ( jj = 0 ; jj < nrowX ; jj++ ) {
      DA2_extractRowDV(X, &colDV, jj) ;
      for ( ii = 0 ; ii <= jj ; ii++ ) {
         DA2_extractRowDV(&tmpDA2, &rowDV, ii) ;
         value = - DVdot(ncolX, row, col) ;
         DA2_addEntry(A, locind[ii], locind[jj], value) ;
      }
   }
/*
   ------------------------
   free the working storage
   ------------------------
*/
   DA2_clearData(&tmpDA2) ;
   DV_clearData(&rowDV) ;
   DV_clearData(&colDV) ;
}

return ; }

/*--------------------------------------------------------------------*/
