#include "petsc.h"
#include "parpre_mat.h"
#include "parpre_vec.h"
#include "blockmat_impl.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "src/vec/impls/blockvec_impl.h"

extern int IntSort(int *ar,int l);

#undef __FUNC__
#define __FUNC__ "BlockMatMult"
/*
  BlockMatMult - apply a block matrix to every system held in a block vector.

  For each of the bi->n systems the local part A->A is applied first; when
  the communicator has more than one process, the contribution of A->B
  applied to the scattered border values is added on top.

  Input:
    A  - the block matrix
    bi - block vector to multiply; system isys occupies entries
         [isys*local_size,(isys+1)*local_size) of its array
  Output:
    bo - block vector receiving the product, laid out like bi

  Returns 0 on success, otherwise the error code of the failing call.
*/
int BlockMatMult(BlockMat A,BlockVec bi,BlockVec bo)
{
  Scalar *x,*y,*border;
  int local_size = A->local_size,border_size = A->border_size;
  int ntids,isys,i,ierr;

  PetscFunctionBegin;
  MPI_Comm_size(A->comm,&ntids);

  /* array data switcheroo */
  ierr = BlockVecGetArray(bi,&x); CHKERRQ(ierr);
  ierr = BlockVecGetArray(bo,&y); CHKERRQ(ierr);

  /* start communication of the boundary values */
  ierr = VecPlaceArray(A->global_vec,x); CHKERRQ(ierr);
  ierr = VecScatterBegin
    (A->global_vec,A->border_vec,INSERT_VALUES,SCATTER_FORWARD,
     A->Mvctx); CHKERRQ(ierr);

  /* product with the A part, one system at a time; this runs between
     VecScatterBegin and VecScatterEnd */
  for (isys=0; isys<bi->n; isys++) {
    ierr = VecPlaceArray(A->in,x+isys*local_size); CHKERRQ(ierr);
    ierr = VecPlaceArray(A->out,y+isys*local_size); CHKERRQ(ierr);
    ierr = MatMult(A->A,A->in,A->out); CHKERRQ(ierr);
  }

  /* end boundary communication */
  ierr = VecScatterEnd
    (A->global_vec,A->border_vec,INSERT_VALUES,SCATTER_FORWARD,
     A->Mvctx); CHKERRQ(ierr);

  if (ntids>1) {
    /* the border vector holds block_size*border_size values; refuse to
       read beyond that */
    if (bi->n>A->block_size)
      SETERRQ(1,1,"Block vector has more systems than the block matrix");

    /* read the border values from the scatter destination itself, so the
       product always sees the data that was just communicated */
    ierr = VecGetArray(A->border_vec,&border); CHKERRQ(ierr);

    /* product with the B part goes into scratch storage and is then
       accumulated into the result */
    ierr = VecPlaceArray(A->out,A->out_save); CHKERRQ(ierr);
    for (isys=0; isys<bi->n; isys++) {
      ierr = VecPlaceArray
        (A->bin,border+isys*border_size); CHKERRQ(ierr);
      ierr = MatMult(A->B,A->bin,A->out); CHKERRQ(ierr);
      for (i=0; i<local_size; i++)
        y[isys*local_size+i] += A->out_save[i];
    }
    ierr = VecRestoreArray(A->border_vec,&border); CHKERRQ(ierr);
  }

  /* switcheroo back */
  ierr = BlockVecRestoreArray(bi,&x); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(bo,&y); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockMatCreate"
/*
  BlockMatCreate - build a block matrix that applies an existing parallel
  matrix to nrhs systems at once.

  Input:
    mat  - the original matrix; its data is interpreted as Mat_MPIAIJ
           without a type check, so the caller must pass an MPIAIJ matrix
    nrhs - number of systems stored in the block vectors this matrix
           will be applied to
  Output:
    bA   - the new block matrix; release it with BlockMatDestroy()

  The diagonal and off-diagonal parts of mat are referenced, not copied.

  Returns 0 on success, otherwise a nonzero error code.
*/
int BlockMatCreate(Mat mat,int nrhs,BlockMat *bA)
{
  MPI_Comm comm;
  BlockMat b;
  int ntids,ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm); CHKERRQ(ierr);
  MPI_Comm_size(comm,&ntids);

  b = (BlockMat) PetscNew(struct _p_BlockMat); CHKPTRQ(b);
  b->block_size = nrhs; b->comm = comm;

  /* Set up the distributed structure of this matrix,
     based on the original matrix. */
  {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *) mat->data;
    Mat_SeqAIJ *B = (Mat_SeqAIJ *) (aij->B->data);
    IS from,to;
    int ib,ip=0,ir,ec,border_size,*garray,*splits=aij->rowners;

    b->border_size = border_size = B->n;
    b->A = aij->A; b->B = aij->B;
    /* input vector for the B product; its array is placed per system */
    ierr = VecCreateSeqWithArray
      (MPI_COMM_SELF,border_size,0,&b->bin); CHKERRQ(ierr);

    /* one block-vector index per (system, border column) pair */
    ec = nrhs*border_size;
    garray = (int *) PetscMalloc( (ec+1)*sizeof(int) ); CHKPTRQ(garray);
    b->garray = garray;

    for (ib=0; ib<border_size; ib++) {
      int i = aij->garray[ib],local_size;

      /* find the process owning column i; the search never rewinds, so it
         relies on aij->garray being ascending
         (assumes splits has ntids+1 entries -- TODO confirm) */
      while (ip<ntids && i>=splits[ip+1]) ip++;
      if (ip>=ntids || i<splits[ip])
        SETERRQ(1,1,"Border column not owned by any processor");
      local_size = splits[ip+1]-splits[ip];

      /* In the block vector, process ip's part starts at nrhs*splits[ip]
         and system ir sits at offset ir*local_size inside it.  The values
         are stored system by system (all border columns of system 0, then
         system 1, ...) because BlockMatMult reads the border of system
         isys at offset isys*border_size; the indices are therefore kept
         in this order and not sorted. */
      for (ir=0; ir<nrhs; ir++)
        garray[ir*border_size+ib] = i+(nrhs-1)*splits[ip]+ir*local_size;
    }

    /* from vector and index set:
       really long vector, which we save for later use */
    ierr = VecCreateMPIWithArray
      (comm,nrhs*aij->n,nrhs*aij->N,0,&b->global_vec); CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,ec,garray,&from); CHKERRQ(ierr);
    /* to vector and index set:
       the border vector lives on top of border_save, so the scattered
       values land in the storage that is kept in the block matrix */
    b->border_save = (Scalar *) PetscMalloc
      ((ec+1)*sizeof(Scalar)); CHKPTRQ(b->border_save);
    ierr = VecCreateSeqWithArray
      (PETSC_COMM_SELF,ec,b->border_save,&b->border_vec); CHKERRQ(ierr);
    ierr = ISCreateStride(PETSC_COMM_SELF,ec,0,1,&to); CHKERRQ(ierr);
    /* scatter context */
    ierr = VecScatterCreate
      (b->global_vec,from,b->border_vec,to,&b->Mvctx); CHKERRQ(ierr);
    /* VecScatterView(b->Mvctx,0); */
    ierr = ISDestroy(from); CHKERRQ(ierr);
    ierr = ISDestroy(to); CHKERRQ(ierr);
  }

  /* create the in/out vectors for the ->A matmult */
  {
    int first,last,local_size;

    ierr = MatGetOwnershipRange(mat,&first,&last); CHKERRQ(ierr);
    b->local_size = local_size = last-first;
    ierr = VecCreateSeqWithArray
      (MPI_COMM_SELF,local_size,0,&(b->in)); CHKERRQ(ierr);
    ierr = VecCreateSeq(MPI_COMM_SELF,local_size,&(b->out)); CHKERRQ(ierr);
    /* scratch storage for the result of the B product */
    b->out_save = (Scalar *) PetscMalloc
      ((local_size+1)*sizeof(Scalar)); CHKPTRQ(b->out_save);
  }

  *bA = b;
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockMatDestroy"
/*
  BlockMatDestroy - release everything a block matrix created by
  BlockMatCreate() owns.

  Input:
    bA - the block matrix to destroy; it must not be used afterwards

  The A and B matrices are only referenced by the block matrix and are
  left untouched.

  Returns 0 on success, otherwise the error code of the failing call.
*/
int BlockMatDestroy(BlockMat bA)
{
  int ierr;

  PetscFunctionBegin;

  ierr = VecDestroy(bA->border_vec); CHKERRQ(ierr);
  ierr = VecScatterDestroy(bA->Mvctx); CHKERRQ(ierr);

  /* these vectors were created without storage of their own; only the
     vector objects themselves are released here */
  ierr = VecDestroy(bA->global_vec); CHKERRQ(ierr);
  ierr = VecDestroy(bA->bin); CHKERRQ(ierr);
  ierr = VecDestroy(bA->in); CHKERRQ(ierr);

  /* plain allocations made by BlockMatCreate */
  PetscFree(bA->border_save);
  PetscFree(bA->garray);

  /* make sure that the right Scalar array gets deallocated: after a
     multiply the out vector may still point into a caller's array.
     NOTE(review): whether VecDestroy releases out_save or the array that
     VecCreateSeq allocated depends on the PETSc version -- confirm, and
     free out_save explicitly if VecDestroy does not. */
  ierr = VecPlaceArray(bA->out,bA->out_save); CHKERRQ(ierr);
  ierr = VecDestroy(bA->out); CHKERRQ(ierr);

  PetscFree(bA);

  PetscFunctionReturn(0);
}

