#include "BSprivate.h"

/*@ BSbackward1 - Backward triangular matrix multiplication on a 
                  single vector

    Input Parameters:
.   A - The sparse matrix
.   x - The input vector
.   comm - The communication structure for A
.   procinfo - the usual processor information

    Output Parameters:
.   b - on exit the off-diagonal products of A with x have been
        added to the values b held on entry

    Returns:
    void

    Notes:
    We assume that A has no i-nodes or cliques.

    The diagonal is ASSUMED to be all 1's and to have been applied
    before this routine is called; only the remaining entries of each
    row are multiplied here, so b is accumulated into, not overwritten.

    The roles of comm->to_msg and comm->from_msg are swapped with
    respect to their names: partial sums are sent with comm->from_msg
    and received with comm->to_msg.

 @*/
void BSbackward1(A,x,b,comm,procinfo)
BSpar_mat *A;
FLOAT *x;
FLOAT *b;
BScomm *comm;
BSprocinfo *procinfo;
{
	BMcomp_msg *from_msg, *to_msg;
	BMphase *to_phase, *from_phase;
	BMmsg *msg;
	int	i, j, k;
	int	cl_ind, in_ind;
	int	count, size, ind;
	int *row;
	FLOAT *nz;
	BScl_2_inode *clique2inode;
	BSnumbering *color2clique;
	BSinode *inodes;
	int	*data_ptr, msg_len;
	FLOAT *msg_buf;
	FLOAT	t;

	color2clique = A->color2clique;
	clique2inode = A->clique2inode;
	inodes = A->inodes->list;

	/* REMEMBER, the to and from phase are switched here */
	from_msg = comm->to_msg;
	to_msg = comm->from_msg;

	/* post for all messages */
	BMinit_comp_msg(from_msg,procinfo); CHKERR(0);

	/* REMEMBER, the diagonal has already been taken care */

	/* now do this phase by phase, from the last color to the first */
	for (i=color2clique->length-2;i>=0;i--) {
		/* first send my messages */
		/* this will involve computing partial sums */
		to_phase = BMget_phase(to_msg,i); CHKERR(0);
		msg = NULL;
		while ((msg = BMnext_msg(to_phase,msg)) != NULL) {
			CHKERR(0);
			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
			/* data_ptr[0]..data_ptr[1] is the inclusive range of */
			/* clique indices whose partial sums go in this message */
			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
			count = 0;
			for (cl_ind=data_ptr[0];cl_ind<=data_ptr[1];cl_ind++) {
				in_ind=clique2inode->inode_index[cl_ind];
				size = inodes[in_ind].length;
				/* an empty row contributes exactly zero; the slot */
				/* is always written so that the receiver, which adds */
				/* every slot into b, never sees stale buffer contents */
				t = 0.0;
				if (size > 0) {
					row = inodes[in_ind].row_num;
					nz = inodes[in_ind].nz;
					for (k=0;k<size;k++) t += x[row[k]]*nz[k];
				}
				msg_buf[count] = t;
				count++;
			}
			BMsendf_msg(msg); CHKERR(0);
		}
		/* catches an error raised by the BMnext_msg call that ended the loop */
		CHKERR(0);
	}

	/* do some local work */
	for (i=color2clique->length-2;i>=0;i--) {
		for (cl_ind=color2clique->numbers[i];
			cl_ind<color2clique->numbers[i+1];cl_ind++) {
			if (procinfo->my_id == clique2inode->proc[cl_ind]) {
				/* only do the strictly upper triangular part */
				/* we ASSUME the diagonal is all 1's */
				/* which is taken care prior to this */
				/* now, multiply the inodes */
				in_ind=clique2inode->inode_index[cl_ind];
				size = inodes[in_ind].length;
				if (size > 0) {
					ind = clique2inode->d_mats[cl_ind].local_ind;
					row = inodes[in_ind].row_num;
					nz = inodes[in_ind].nz;
					t = 0.0;
					for (k=0;k<size;k++) t += x[row[k]]*nz[k];
					b[ind] += t;
				}
			}
		}
	}

	/* receive my messages and update my rhs */
	for (i=color2clique->length-2;i>=0;i--) {
		from_phase = BMget_phase(from_msg,i); CHKERR(0);
		while ((msg = BMrecv_msg(from_phase)) != NULL) {
			CHKERR(0);
			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
			/* here data_ptr[j] is the index into b that receives */
			/* the j-th value of the message */
			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
			/* the length reported by BMget_user is not used; the */
			/* number of values is derived from the message size */
			msg_len = BMget_msg_size(msg); CHKERR(0);
			msg_len /= sizeof(FLOAT);
			for (j=0;j<msg_len;j++) b[data_ptr[j]] += msg_buf[j];
			BMfree_msg(msg); CHKERR(0);
		}
		/* catches an error raised by the BMrecv_msg call that ended the loop */
		CHKERR(0);
	}

	/* wait for all of the sent messages to finish */
	BMfinish_comp_msg(to_msg,procinfo); CHKERR(0);
	MLOG_flop((2*A->local_nnz));

}
