#include	<stdio.h>
#include		"cnd_stats.h"
/*---------------------------------------------------------------*/
/*     	CAPSS: A Cartesian Parallel Sparse Solver                */
/*     	Beta Release                                             */
/*      Author: Padma Raghavan                                   */
/*---------------------------------------------------------------*/

/*
 *  dist_fanin_chol: fan-in multifrontal Cholesky, without compute-ahead.
 *
 *  Columns 0..n-1 are processed in order; column k has m = n-k entries.
 *  For every k each process builds a length-m vector r:
 *    - the owner (map[k] == me) works in place on col[j], where j counts
 *      the columns this process has owned so far;
 *    - every other process uses the scratch buffer colk, seeded either
 *      from contrib_nonz[have_contrib[k]] (that buffer is freed right
 *      after the copy) or with zeros when have_contrib[k] == -1.
 *  The first `limit` local columns (those already scaled) are then applied
 *  to r through daxpy1_ with multiplier -(q[0]), q = col[i]+k-mycols[i].
 *  NOTE(review): daxpy1_ is assumed to compute y += alpha*x with unit
 *  stride on y; dfill/copy_to/dscal_ are assumed to be fill/copy/scale
 *  helpers -- confirm against their definitions.
 *
 *  The owner then receives nprocs-1 messages of type k+msg_type_offset,
 *  adds each one into r and, when k < fact_n, scales r by 1/sqrt(r[0]).
 *  Non-owners send their r to map[k] and add m to stats[n_d_c].
 *
 *  start_proc, ncols, blksize, have_contrib_next and contrib_nonz_sizes
 *  are not used here; they are kept so existing callers stay valid.
 *
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int dist_fanin_chol(int me, int nprocs, int start_proc, int fact_n,
		int n, int ncols, int *map, int *mycols, double **col,
		int blksize, int have_contrib_next,
		int *have_contrib, double **contrib_nonz,
		int *contrib_nonz_sizes, int msg_type_offset)
{
	int	double_size, max_bytes, i, j, k, m, inc, limit, count;
	double	zero, t, *q, *r, flt_one;
	extern	double	*colk;
	extern	double	stats[], sqrt();

	inc = 1;
	j = limit = 0;
	zero = 0.0;
	flt_one = 1.0;
	double_size = sizeof(double);
	/* receive buffer bound: a full column plus two spare doubles */
	max_bytes = (n + 2) * double_size;

	for (k = 0; k < n; k++) {
		m = n - k;

		/* pick the accumulation target for column k */
		if (map[k] == me) {
			r = col[j];
		} else {
			r = colk;
			if (have_contrib[k] != -1) {
				copy_to(r, contrib_nonz[have_contrib[k]], m);
				free((char *) contrib_nonz[have_contrib[k]]);
			} else {
				dfill(&m, &zero, r, &inc);
			}
		}

		/* apply every local column that has already been scaled */
		for (i = 0; i < limit; i++) {
			q = col[i] + k - mycols[i];
			t = -(*q);
			daxpy1_(&m, &t, q, &inc, r);
		}

		if (map[k] == me) {
			/* fan-in: one partial update from every other process */
			for (count = 0; count < nprocs - 1; count++) {
				recv0(colk, max_bytes, k + msg_type_offset);
				daxpy1_(&m, &flt_one, colk, &inc, r);
			}
			/* only the first fact_n columns are factored */
			if (k < fact_n) {
				t = 1.0 / sqrt(*r);
				dscal_(&m, &t, r, &inc);
				limit++;
			}
			j++;
		} else {
			send0(r, m * double_size, k + msg_type_offset, map[k]);
			stats[n_d_c] += m;
		}
	}
	return 0;
}

/*
 *  dist_fanboth_invert: invert the triangular part of a trapezoidal
 *  matrix.  Each column needs (1) a one-to-all and (2) an all-to-one
 *  communication.  The remaining non-triangular part keeps the entries
 *  the matrix had before inversion.
 *
 *  Phase 1 (local): for every local column whose global index
 *  mycols[i] is < fact_n, the leading entry d is replaced by 1/d and the
 *  next fact_n-mycols[i]-1 entries are scaled by -1/d.
 *
 *  Phase 2: for k = fact_n-2 down to 0, with m = fact_n-k:
 *    - the owner copies its column into colk and sends it to every other
 *      process (ids start_proc .. start_proc+nprocs-1); the others
 *      receive it into colk.  stats[n_d_c] is bumped by m once per loop
 *      slot, including the slot of the owner itself.
 *    - each process forms an update vector u (the owner's own column
 *      with all but its first entry zeroed, or the zeroed scratch colj)
 *      and accumulates into it the local columns to the right of k.
 *    - the non-owners send u to the owner, which adds the nprocs-1
 *      partial vectors into its column.
 *  Two message types are consumed per column, starting at
 *  msg_type_offset.
 *  NOTE(review): daxpy1_ is assumed to compute y += alpha*x with unit
 *  stride on y -- confirm against its definition.
 *
 *  blksize is unused; it is kept for interface compatibility.
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int dist_fanboth_invert(int me, int nprocs, int start_proc, int fact_n,
		int n, int ncols, int *map, int *mycols, double **col,
		int blksize, int msg_type_offset)
{
	int	double_size, max_bytes, proc,
		i, j, k, m, inc, limit, count, next_msg_type;
	double	zero, t, *q, *r, *u, flt_one;
	extern	double	*colk, *colj;
	extern	double	stats[];

	inc = 1;
	zero = 0.0;
	flt_one = 1.0;
	double_size = sizeof(double);
	max_bytes = (n + 2) * double_size;
	next_msg_type = msg_type_offset;

	/*
	 * Scale all values.  The bound i < ncols is tested first so that
	 * mycols[ncols] is never read.
	 */
	for (i = 0; (i < ncols) && (mycols[i] < fact_n); i++) {
		m = fact_n - mycols[i] - 1;
		q = col[i];
		t = (*q);
		*q = 1.0 / t;
		t = -(*q);
		q++;
		dscal_(&m, &t, q, &inc);
	}
	i--;
	limit = i;
	/*
	 * The last triangular column needs no update; skip it.  i is -1
	 * when this process owns no triangular column, so guard the read.
	 */
	if ((i >= 0) && (mycols[i] == (fact_n - 1)))
		i--;

	for (k = fact_n - 2; k >= 0; k--) {
		m = fact_n - k;
		if (map[k] == me) {
			r = col[i];
			copy_to(colk, r, m);
			/* proc advances on every slot, not only after a send */
			for (count = 0, proc = start_proc; count < nprocs; count++) {
				if (proc != me) {
					send0(r, m * double_size, next_msg_type, proc);
				}
				proc++;
				stats[n_d_c] += m;
			}
		} else {
			recv0(colk, max_bytes, next_msg_type);
		}
		r = colk;
		next_msg_type++;

		/* u is the update vector */
		if (map[k] != me) {
			u = colj;
			dfill(&m, &zero, u, &inc);
		} else {
			/* keep the first entry, clear the remaining m-1 */
			u = col[i];
			u++;
			m--;
			dfill(&m, &zero, u, &inc);
			u = col[i];
		}

		/* u = u + t * q for every local column right of k */
		for (j = limit; j > i; j--) {
			q = col[j];
			t = *(r + mycols[j] - k);
			m = fact_n - mycols[j];
			daxpy1_(&m, &t, q, &inc, u + mycols[j] - k);
		}

		m = fact_n - k;
		if (map[k] == me) {
			for (count = 0; count < nprocs - 1; count++) {
				recv0(colk, max_bytes, next_msg_type);
				daxpy1_(&m, &flt_one, colk, &inc, u);
			}
			i--;
		} else {
			send0(u, m * double_size, next_msg_type, map[k]);
			stats[n_d_c] += m;
		}
		next_msg_type++;
	}
	return 0;
}	/* end dist_fanboth_invert */

/*
 *  dist_fanout_invert: invert the triangular part of a trapezoidal
 *  matrix using one one-to-all communication per column.  The remaining
 *  non-triangular part keeps the entries the matrix had before
 *  inversion.
 *
 *  Phase 1 (local): for every local column whose global index
 *  mycols[k] is < fact_n, the leading entry d is replaced by 1/d and the
 *  next fact_n-mycols[k]-1 entries are scaled by -1/d.
 *
 *  Phase 2: for k = fact_n-1 down to 1, with m = fact_n-k:
 *    - the owner takes its last not-yet-broadcast column (col[limit]),
 *      retires it (limit--) and sends its first m entries to every other
 *      process, visiting process ids cyclically from map[k-1];
 *      stats[n_d_c] is bumped by m once per slot, including its own.
 *    - every other process receives the column into colk.
 *    - each remaining local column j takes its entry in row k as
 *      multiplier t, clears that entry, and has t times the broadcast
 *      column added starting at that row.
 *  One message type is consumed per column, starting at msg_type_offset.
 *  NOTE(review): daxpy1_ is assumed to compute y += alpha*x with unit
 *  stride on y -- confirm against its definition.
 *
 *  blksize is unused; it is kept for interface compatibility.
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int dist_fanout_invert(int me, int nprocs, int start_proc, int fact_n,
		int n, int ncols, int *map, int *mycols, double **col,
		int blksize, int msg_type_offset)
{
	int	double_size, max_bytes, proc, last_proc,
		j, k, m, inc, limit, count, next_msg_type;
	double	t, *q, *r;
	extern	double	*colk;
	extern	double	stats[];

	inc = 1;
	double_size = sizeof(double);
	max_bytes = (n + 2) * double_size;
	last_proc = start_proc + nprocs - 1;
	next_msg_type = msg_type_offset;

	/*
	 * Scale all values.  The bound k < ncols is tested first so that
	 * mycols[ncols] is never read.
	 */
	for (k = 0; (k < ncols) && (mycols[k] < fact_n); k++) {
		m = fact_n - mycols[k] - 1;
		q = col[k];
		t = (*q);
		*q = 1.0 / t;
		t = -(*q);
		q++;
		dscal_(&m, &t, q, &inc);
	}
	/* index of the last local triangular column, -1 if there is none */
	limit = k - 1;

	for (k = fact_n - 1; k > 0; k--) {
		m = fact_n - k;
		if (map[k] == me) {
			r = col[limit];
			limit--;
			/* walk the process ids cyclically, starting at map[k-1] */
			proc = map[k - 1];
			for (count = 0; count < nprocs; count++) {
				if (proc != me) {
					send0(r, m * double_size, next_msg_type, proc);
				}
				if (proc < last_proc)
					proc++;
				else
					proc = start_proc;
				stats[n_d_c] += m;
			}
		} else {
			recv0(colk, max_bytes, next_msg_type);
			r = colk;
		}

		next_msg_type++;

		/* fold column k into every remaining local column */
		for (j = limit; j >= 0; j--) {
			q = col[j] + k - mycols[j];
			t = *q;
			*q = 0.0;
			daxpy1_(&m, &t, r, &inc, q);
		}
	}
	return 0;
}	/* end dist_fanout_invert */

/*
 *  dist_fanring_invert: invert the triangular part of a trapezoidal
 *  matrix; each column is passed from process to process instead of
 *  being sent by its owner to everyone.  The remaining non-triangular
 *  part keeps the entries the matrix had before inversion.
 *
 *  Phase 1 (local): for every local column whose global index
 *  mycols[k] is < fact_n, the leading entry d is replaced by 1/d and the
 *  next fact_n-mycols[k]-1 entries are scaled by -1/d.
 *
 *  Phase 2: for k = fact_n-1 down to 1, with m = fact_n-k:
 *    - the owner takes its last not-yet-sent column (col[limit]),
 *      retires it (limit--) and sends its first m entries to map[k-1];
 *    - every other process receives the column into colk and forwards it
 *      to its predecessor (me-1, wrapping to the last process id) unless
 *      that predecessor is the owner; it then adds m to stats[n_d_c]
 *      whether or not it forwarded.
 *    - each remaining local column j takes its entry in row k as
 *      multiplier t, clears that entry, and has t times the received
 *      column added starting at that row.
 *  One message type is consumed per column, starting at msg_type_offset.
 *  NOTE(review): the forwarding chain only reaches every process if
 *  map[k-1] is the owner's predecessor -- confirm the column mapping
 *  used by callers.  daxpy1_ is assumed to compute y += alpha*x with
 *  unit stride on y -- confirm against its definition.
 *
 *  blksize is unused; it is kept for interface compatibility.
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int dist_fanring_invert(int me, int nprocs, int start_proc, int fact_n,
		int n, int ncols, int *map, int *mycols, double **col,
		int blksize, int msg_type_offset)
{
	int	double_size, max_bytes, proc, last_proc,
		j, k, m, inc, limit, next_msg_type;
	double	t, *q, *r;
	extern	double	*colk;
	extern	double	stats[];

	inc = 1;
	double_size = sizeof(double);
	max_bytes = (n + 2) * double_size;
	last_proc = start_proc + nprocs - 1;
	next_msg_type = msg_type_offset;

	/*
	 * Scale all values.  The bound k < ncols is tested first so that
	 * mycols[ncols] is never read.
	 */
	for (k = 0; (k < ncols) && (mycols[k] < fact_n); k++) {
		m = fact_n - mycols[k] - 1;
		q = col[k];
		t = (*q);
		*q = 1.0 / t;
		t = -(*q);
		q++;
		dscal_(&m, &t, q, &inc);
	}
	/* index of the last local triangular column, -1 if there is none */
	limit = k - 1;

	for (k = fact_n - 1; k > 0; k--) {
		m = fact_n - k;
		if (map[k] == me) {
			r = col[limit];
			limit--;
			proc = map[k - 1];
			/*
			 * Do not post a message to ourselves: this branch never
			 * receives, so it would stay queued forever (this is the
			 * same guard dist_fanout_invert applies).
			 */
			if (proc != me) {
				send0(r, m * double_size, next_msg_type, proc);
			}
		} else {
			recv0(colk, max_bytes, next_msg_type);
			/* predecessor on the ring, wrapping below start_proc */
			if ((me - 1) < start_proc)
				proc = last_proc;
			else
				proc = me - 1;
			/* stop forwarding once the column would reach its owner */
			if ((proc != map[k]) && (proc != me)) {
				send0(colk, m * double_size, next_msg_type, proc);
			}
			stats[n_d_c] += m;
			r = colk;
		}

		next_msg_type++;

		/* fold column k into every remaining local column */
		for (j = limit; j >= 0; j--) {
			q = col[j] + k - mycols[j];
			t = *q;
			*q = 0.0;
			daxpy1_(&m, &t, r, &inc, q);
		}
	}
	return 0;
}	/* end dist_fanring_invert */


/*
 *  add_vec: element-wise accumulation, to[i] += from[i] for
 *  i = 0 .. count-1.  A count <= 0 leaves `to` untouched; `from` is
 *  never modified.
 *
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int add_vec(double *to, double *from, int count)
{
	int	i;

	for (i = 0; i < count; i++)
		to[i] += from[i];
	return 0;
}	/* add_vec */

/*
 *  local_chol: local (single-process) fan-in Cholesky factorization.
 *
 *  col[j] points at column j, stored from its first processed row
 *  downward: entry col[k] + (j - k) is the row-j element of column k.
 *  Columns 0..size-1 are visited in order; column j has m = size-j
 *  entries.  Each column first receives the updates from the already
 *  factored columns k < min(j, fact_size), each applied through daxpy1_
 *  with multiplier -(q[0]), q = col[k] + j - k.  Only the first
 *  fact_size columns are then scaled by 1/sqrt(leading entry); later
 *  columns are updated but left unscaled.
 *  NOTE(review): daxpy1_ is assumed to compute y += alpha*x with unit
 *  stride on y, and dscal_ to scale in place -- confirm against their
 *  definitions.  No check is made that the leading entry is positive.
 *
 *  Returns 0 always (the original was implicitly int and returned no
 *  value).
 */
int local_chol(double **col, int fact_size, int size)
{
	extern	double	sqrt();
	double	*q, t;
	int	inc, j, k, m, till;

	inc = 1;
	for (j = 0; j < size; j++) {
		m = size - j;

		/* cmod(j,k): apply every factored column left of j */
		till = (j < fact_size) ? j : fact_size;
		for (k = 0; k < till; k++) {
			q = col[k] + j - k;
			t = -(*q);
			daxpy1_(&m, &t, q, &inc, col[j]);
		}

		/* scale the column when j is in the factored part */
		if (j < fact_size) {
			t = 1.0 / sqrt(*col[j]);
			dscal_(&m, &t, col[j], &inc);
		}
	}
	return 0;
}
