#include <stdio.h>
#include <math.h>
#define MATRIXDIM 16
#define ULP 3.0E-16

/*
 * Various and sundry routines needed for sequential version of lu
 * decomposition.
 */


/* The following three macros emulate FORTRAN style 2-dimensional arrays
 * in 1-dimensional C arrays.
 */

/* The following macro depends on the variables a and nca */
#define A(i,j) (*(a + (i)*nca + (j)))

/* The following macro depends on the variables b and ncb */
#define B(i,j) (*(b + (i)*ncb + (j)))

/* The following macro depends on the variables c and ncc */
#define C(i,j) (*(c + (i)*ncc + (j)))

#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))

/*
 * trisolve(int m, int n, double *a, int nca, double *b, int ncb)
 *
 * Solve the matrix equation
 *
 *     a*x=b
 * 
 * where x and b are m by n matrices, and a is an m by m unit lower triangular
 * matrix.  b is overwritten by x on output.
 */

/*
in HeNCE:
    trisolve(nb, n-j-nb, a(j:j+nb-1,j:j+nb-1), nb, a(j,j+nb-1:n-1), n-j-nb);
or
fanout for l = 0; l < (n/nb) - (j/nb) - 1 ; l++
    jl = (l+1)*nb + j;
    trisolve(nb, nb, a(j:j+nb-1,j:j+nb-1), nb, a(j:j+nb-1,jl:jl+nb-1), nb);
end fanout
*/

void
trisolve(m, n, a, nca, b, ncb)
int m, n;
double *a, *b;
int nca, ncb;
{
    int i, j, k;

    for(j = 0; j < n; j++) {
	for(k = 0; k < m; k++) {
	    for(i = k+1; i < m; i++) {
                B(i,j) -= B(k,j)*A(i,k);
	    }
	}
    }
}

/*
 * XXX added by Weicheng Jiang, Nov. 1992
 *
 * This function solves the matrix equation Ax = b. The vector b is
 * overwritten on output.
 *
 * Algorithm: (p.64, "Applied Numerical Linear Algebra", - W. W. Hager)
 *	j = 1 to (n-1)
 *          bj <-> bp(j)     # pivot, including the multipliers
 *	j = 1 to (n-1)
 *	    i = j+1 to n
 *		bi <- bi - Aij bj
 *	    next i
 *	next j
 *	j = n to 1
 *	    bj <- bj/Ajj
 *	    i = 1 to (j-1)
 *		bi <- bi - Aij bj
 *	    next i
 *	next j
 */
 
void
solve(n, a, b, y, piv)
int    n;
double *a, *b, *y;
int    piv[];
{
    int    i, j, nca = n;

    bcopy(b, y, n*sizeof(double));
    /* pivot */
    for (j = 0; j < n; j++) {
	double tmp;
        int    k = piv[j];

        tmp = y[j];
        y[j] = y[k];
	y[k] = tmp;
    }

    /* forward substitution */
    for (j = 0; j < n; j++)
	for (i = j+1; i < n; i++)
	    y[i] -= A(i,j)*y[j];

    /* backward substitution */
    for (j = n-1; j >= 0; j--) {
	y[j] /= A(j,j);
	for (i = 0; i < j; i++)
	    y[i] -= A(i,j)*y[j];
    }
}

/*
 * XXX added by Weicheng Jiang, Nov. 1992
 *
 * This function computes ||Ax-b||/p|x| to verify the solution.
 */
verify(n, a, x, b)
int     n;	/* matrix dimension */
double  *a;	/* matrix A */
double  *x, *b; /* vectors */
{
    double  diff = 0, xnorm = 0, *ax;
    int     i, k, nca = n;

    ax = (double *) calloc((unsigned)n, sizeof(double));
    for (i = 0; i < n; i++) {
	xnorm += fabs(x[i]);
	for (k = 0; k < n; k++)
	    ax[i] += A(i,k)*x[k];
	diff += fabs(ax[i]-b[i]);
    }
    printf("\t||Ax-b||\n\t-------- = %0.4g\n\tulp||x||\n", diff/(xnorm*ULP));
}

/* b = Ax, where x = {1, 2, ... n-1}. */
initb(n, a, b)
int     n;
double  *a, *b;
{
    int  i, j, nca = n;

    for (i = 0; i < n; i++) {
	b[i] = 0.0;
	for (j = 0; j < n; j++)
	    b[i] += A(i,j)*j;
    }
}
	
/*
 * update2(int m, int n, int k, double *a, int nca, double *b, int ncb,
 *     double *c, int ncc)
 *
 * Performs the matrix operation
 *
 * c -= a*b
 *
 * where a is a m by k matrix, b is a k by n matrix and c is an m by n matrix.
 */

/*
    update2(n-j-nb, n-j-nb, nb, &A(j+nb,j), nca, &A(j,j+nb), nca,
	&A(j+nb,j+nb),nca);

in HeNCE:

fanout for q = 0; q < (n/nb) - (j/nb) - 1; q++
    jq = (q+1)*nb + j;
    fanout for r = 0; r < (n/nb) - (j/nb) - ; r++
        jr = (r+1)*nb + j;
	update2(nb, nb, nb, a(jq:jq+nb-1,j:j+nb-1), nb,
	                    a(j:j+nb-1,jr:jr+nb-1), nb,
			    a(jq:jq+nb-1,jr:jr+nb-1), nb);
    end fanout
end fanout
*/

void
update2(m, n, k, a, nca, b, ncb, c, ncc)
int m, n, k;
double *a, *b, *c;
int nca, ncb, ncc;
{
    int i, j, l;

    for(j = 0; j < n; j++) {
	for(l = 0; l < k; l++) {
	    for(i = 0; i < m; i++) {
		C(i,j) -= B(l,j)*A(i,l);
	    }
	}
    }
}

/*
 * special case of above for 2 vectors
 */

void
update(m, n, b, incb, c, incc, a, nca)
int m, n;
double *b, *c, *a;
int incb, incc, nca;
{
    int ib, jc;
    int i, j;

    jc = 0;
    for(j = 0; j < n; j++) {
	ib = 0;
	for(i = 0; i < m; i++) {
	    A(i,j) -= b[ib] * c[jc];
	    ib += incb;
	}
	jc += incc;
    }
}

/* Print the m by n matrix c. */

void
printmatrix(m, n, c, ncc)
int m, n;
double *c;
int ncc;
{
    int i, j;

    for(i = 0; i < m; i++) {
	for(j = 0; j < n; j++) {
	    (void)printf("%8.2f", C(i,j));
	}
	(void)printf("\n");
    }
}

/* swap two vectors */

void
swapvects(n, a, b)
int n;
double *a, *b;
{
    int i;
    double tmp;

    for(i = 0; i < n; i++) {
	tmp = a[i];
	a[i] = b[i];
	b[i] = tmp;
    }
}

/*
 * Perform a series of row interchanges on matrix a.
 */

void
swaprows(nc, a, nca, k1, k2, piv)
int nc;
double *a;
int nca, k1, k2;
int *piv;
{
    int i;

    for(i = k1; i < k2; i++) {
	swapvects(nc, &A(i,0), &A(piv[i],0));
    }
}

#define dabs(x) (((x)>=0.0)?(x):-(x))

int
findmax(n, a, inc)
int n;
double *a;
int inc;
{
    int i, ix, maxi;
    double dmax;

    dmax = a[0];
    maxi = 0;
    ix = 0;
    for(i = 0; i < n; i++) {
	if(dabs(a[ix]) > dmax) {
	    dmax = dabs(a[ix]);
	    maxi = i;
	}
	ix += inc;
    }
    return maxi;
}

void
scale(n, d, a, inc)
int n;
double d, *a;
int inc;
{
    int i, ix;

    ix = 0;
    for(i = 0; i < n; i++) {
	a[ix] *= d;
	ix += inc;
    }
}

/*
 * Level 2 version of lu decomposion algorithm.
 *
 * Adapted from DGETF2.
 */

void
lu2(m, n, a, nca, piv)
int m, n;
double *a;
int nca;
int *piv;
{
    int j, jp;

    for(j = 0; j < min(m, n); j++) {
	jp = j + findmax(m-j, &A(j,j), nca);
	piv[j] = jp;
	swapvects(n, &A(j,0), &A(jp,0));
/*
	swapvects(n-j, &A(j,j), &A(jp,j));
*/

	if(j+1 < m) {
	    scale(m-j-1, 1.0/A(j,j), &A(j+1, j), nca);
	}
	if(j+1 < n) {
	    update(m-j-1,n-j-1,&A(j+1,j),nca,&A(j,j+1),1,&A(j+1,j+1), nca);
	}
    }
}

/*
 * Level 3 version of the lu decomposition algorithm.
 *
 * Adapted from DGETRF
 */

void
lu3(n, nb, a, nca, piv)
int n, nb;
double *a;
int nca;
int *piv;
{
    int i, j;

    for(j = 0; j < n; j += nb) {
	lu2(n-j, nb, &A(j,j), nca, &piv[j]);
	for(i = j; i < j+nb; i++) {
	    piv[i] += j;
	}
	swaprows(j, a, nca, j, j+nb, piv);
	swaprows(n-j-nb, &A(0,j+nb), nca, j, j+nb, piv);
	trisolve(nb, n-j-nb, &A(j,j), nca, &A(j,j+nb), nca);
	update2(n-j-nb, n-j-nb, nb, &A(j+nb,j), nca, &A(j,j+nb), nca,
		&A(j+nb,j+nb),nca);
    }
}

void
pivot(j, n, nb, a, nca, piv)
int j, n, nb;
double *a;
int nca;
int *piv;
{
    int i;

    lu2(n-j, nb, &A(j,j), nca, &piv[j]);
    for(i = j; i < j+nb; i++) {
	piv[i] += j;
    }
    swaprows(j, a, nca, j, j+nb, piv);
    swaprows(n-j-nb, &A(0,j+nb), nca, j, j+nb, piv);
}

/*
 * This procedure mimics the HeNCE parallel version of the lu decomposition
 * algorithm as closely as possible.
 */

void
lup(n, nb, a, nca, piv)
int n, nb;
double *a;
int nca;
int *piv;
{
    int j, l, q, r;

    for(j = 0; j < n; j += nb) {			/* loop */
	pivot(j, n, nb, a, nca, piv);
	for(l = 0; l <= ((n/nb-j/nb)-1)-1; l++) {	/* fanout */
	    int jl;

	    jl = (l+1)*nb+j;
	    trisolve(nb, nb, &A(j,j), nca, &A(j,jl), nca);
	}
	for(q = 0; q <= ((n/nb-j/nb)-1)-1; q++) {	/* fanout */
	    for(r = 0; r <= ((n/nb-j/nb)-1)-1; r++) {	/* fanout */
		int jq, jr;

		jq = (q+1)*nb+j;
		jr = (r+1)*nb+j;
		update2(nb, nb, nb, &A(jq,j),nca, &A(j,jr),nca, &A(jq,jr),nca);
	    }
	}
    }
}

int
main()
{
    static double a[] = {
	3,  18,  10,  1,  3,  17,  10,  1,  3,  17,  10,  1,  3,  17,  10,  1,
	2,   4 , -4,  2,  2,   4 , -2,  2,  2,   4 , -2,  2,  2,   4 , -2,  2,
	6,  18, -12,  5,  6,  18, -12,  3,  6,  18, -12,  3,  6,  18, -12,  3,
	4,   3,   2,  4,  3,   3,   2,  4,  4,   3,   2,  4,  4,   3,   2,  4,
	3,  17,  10,  3,  3,   7,  10,  1,  3,  17,  10,  1,  3,  17,  10,  1,
	2,   4 , -2,  2,  2,   4 ,  2,  2,  2,   4 , -2,  2,  2,   4 , -2,  2,
	6,  18, -12,  3,  6,  18, -12, 13,  6,  18, -12,  3,  6,  18, -12,  3,
	4,   3,   2,  4,  4,   3,   2,  4, -4,   3,   2,  4,  4,   3,   2,  4,
	3,  17,  10,  1,  3,  17,  10,  1,  3,  -7,  10,  1,  3,  17,  10,  1,
	2,   4 , -2,  2,  2,   4 , -2,  2,  2,   4 , 27,  2,  2,   4 , -2,  2,
	6,  18, -12,  3,  6,  18, -12,  3,  6,  18, -12, 13,  6,  18, -12,  3,
	4,   3,   2,  4,  4,   3,   2,  4,  4,   3,   2,  4,-14,   3,   2,  4,
	3,  17,  10,  1,  3,  17,  10,  1,  3,  17,  10,  1,  3,  -3,  10,  1,
	2,   4 ,  2,  2,  2,   4 , -2,  2,  2,   4 , -2,  2,  2,   4 , 20,  2,
	6,  18, -12,  3,  6,  18, -12,  3,  6,  18, -12,  3,  6,  18, -12, 30,
	4,   3,  12,  4,  4,   3,   2,  4,  4,   3,   2,  4,  4,   3,   2,  4,
    };
/*
    static double a[] = {
	3, 17, 10, 1,
	2, 4, -2, 2,
	6, 18, -12, 3,
	4, 3, 2, 4,
    };
   6.0000  18.0000 -12.0000   3.0000
   0.6667  -9.0000  10.0000   2.0000
   0.5000  -0.8889  24.8889   1.2778
   0.3333   0.2222  -0.0089   0.5670
*/
/*
    static double a[] = {
	2, 4, 2,
	4, 7, 7,
	-2, -7, 5
    };
    static double b[] = {4, 13, 7};
    static double a[] = {
	 2,  2, -2,
	-4, -2,  2,
	-2,  3,  9,
    };
    static double b[] = {8, -14, 9};
*/

    int piv[MATRIXDIM];
    int m = MATRIXDIM, n = m, nca = m;
    int i;
    int nb = 4;
    /* XXX Weicheng Jiang */
    double a_orig[MATRIXDIM*MATRIXDIM], b[MATRIXDIM], x[MATRIXDIM];

    /* XXX save the original matrix A.  -Weicheng Jiang */
    bcopy(a, a_orig, n*n*sizeof(double));
    initb(n, a, b);

/*
    printmatrix(m, n, a, nca);
    (void)printf("\n");
*/

/*    lu2(m, n, a, nca, piv);
    lu3(n, nb, a, nca, piv);   */
    lup(n, nb, a, nca, piv);

/*
    printmatrix(m, n, a, nca);
*/

    /* XXX Weicheng Jiang */
    solve(m, a, b, x, piv);
    printmatrix(m, 1, x, 1);
    verify(m, a_orig, x, b);

/*
    for(i = 0; i < n; i++) {
	(void)printf("piv[%d] = %d\n", i, piv[i]);
    }
*/
    return 0;
}
