#include "BSprivate.h"

/*+ BSpar_bcg - conjugate gradient on a block of vectors; reverse communication

    Input Parameters:
.   num_cols - the length of the vector on this processor
.   rhs - the contiguous block of rhs vectors on this processor
          length = num_cols * nBS
.   x - the contiguous block of solution vectors on this processor
          length = num_cols * nBS
.   resid - the contiguous block of residual vectors on this processor
          length = num_cols * nBS
.   z - the contiguous block of work vectors on this processor
          length = num_cols * nBS
.   p - the contiguous block of work vectors on this processor
          length = num_cols * nBS
.   beta - the vector of cg coefficients
          length = nBS
.   alpha - the vector of cg coefficients
          length = nBS
.   cur_step - the current cg step
.   cur_phase - the current cg phase
.   nBS - the number of columns in the block
.   procinfo - the usual processor stuff

    Output Parameters:
.   many of the input parameters are changed, it is just the usual
    CG algorithm

    Returns:
    The action of the calling program depends on the return value
    CG_MATVECR: multiply A*x and put the results in resid
    CG_MATVECZ: multiply A*p and put the results in z
    CG_MSOLVE: multiply A(-1)*resid and put the results in z
    CG_ERROR: Error during CG, indefinite matrix encountered

    Notes:
    This is NOT the block conjugate gradient algorithm.  It is
    a separate conjugate gradient run on each vector, but the
    operations are blocked for efficiency.

    The first time that BSpar_bcg is called, cur_step and
    cur_phase should be 0.

 +*/
int BSpar_bcg(num_cols,rhs,x,resid,z,p,beta,alpha,cur_step,cur_phase,nBS,
	procinfo) 
int	num_cols;
FLOAT	*rhs;
FLOAT	*x;
FLOAT	*resid;
FLOAT	*z;
FLOAT	*p;
FLOAT	*beta;
FLOAT	*alpha;
int	*cur_step;
int	*cur_phase;
int	nBS;
BSprocinfo *procinfo;
{
	int	i, j;
	FLOAT	bk, *bknum;
	FLOAT	ak;
	FLOAT	*t_z, *t_p, *t_resid, *t_x;

	switch((*cur_phase)) {
		case 0: {
			(*cur_phase) = 1;
			return(CG_MATVECR);
		}
		case 1: {
			for (i=0;i<num_cols*nBS;i++) {
				resid[i] = rhs[i] - resid[i];
			}
			(*cur_phase) = 2;
			(*cur_step) = 1;
			return(CG_MSOLVE);
		}
		case 2: {
			MY_MALLOC(bknum,(FLOAT *),sizeof(FLOAT)*nBS,1);
			BSpar_bip(num_cols,z,resid,nBS,bknum,procinfo); CHKERR(0);
			for (j=0;j<nBS;j++) {
				t_z = &(z[j*num_cols]);
				t_p = &(p[j*num_cols]);
				if (bknum[j] < 0.0) return(CG_ERROR);
				if ((*cur_step) == 1) {
					for (i=0;i<num_cols;i++) {
						t_p[i] = t_z[i];
					}
				} else {
					bk = bknum[j]/beta[j];
					for (i=0;i<num_cols;i++) {
						t_p[i] = t_z[i] + bk*t_p[i];
					}
				}
				beta[j] = bknum[j];
			}
			MY_FREE(bknum);
			(*cur_phase) = 3;
			return(CG_MATVECZ);
		}
		case 3: {
			BSpar_bip(num_cols,p,z,nBS,alpha,procinfo); CHKERR(0);
			for (j=0;j<nBS;j++) {
				t_z = &(z[j*num_cols]);
				t_resid = &(resid[j*num_cols]);
				t_p = &(p[j*num_cols]);
				t_x = &(x[j*num_cols]);
				if (alpha[j] < 0.0) return(CG_ERROR);
				ak = beta[j]/alpha[j];
				for (i=0;i<num_cols;i++) {
					t_x[i] += ak*t_p[i];
					t_resid[i] -= ak*t_z[i];
				}
			}
			(*cur_phase) = 2;
			(*cur_step)++;
			return(CG_MSOLVE);
		}
	}
}
