#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "src/vec/vecimpl.h"
#include "src/pc/pcextra.h"
#include "src/vec/utils/vpipe.h"
#include "parpre_pc.h"
#include "./mpixtra.h"

/* Matrix-matrix product routine defined outside this file.
 * MatMatMult_MPIAIJ below calls it on the local diagonal and
 * off-diagonal blocks of a parallel matrix; the product is returned in *c. */
extern int MatMatMult_AIJ(Mat a, Mat b, Mat *c);

/* VE functions added from here on down */
/* Report the number of stored entries in one locally owned row of an
 * MPIAIJ matrix, i.e. the row length in the diagonal block plus the row
 * length in the off-diagonal block.
 *
 *   mat - parallel matrix; its data pointer is read as Mat_MPIAIJ
 *   row - global row number, must satisfy rstart <= row < rend
 *   len - output: combined number of stored entries in that row
 *
 * Returns 0 on success. A row outside the local ownership range raises
 * an error through SETERRQ. */
int MatGetRowLen_MPIAIJ(Mat mat,int row,int *len)
{
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *) mat->data;
  Mat dia = Aij->A, off = Aij->B;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *) dia->data, *bij = (Mat_SeqAIJ *) off->data;

  /* logical (short-circuit) OR: both operands are truth values */
  if (row<Aij->rstart || row>=Aij->rend) {
    SETERRQ(1,0,"GetRowLen only for local rows");
  }

  /* the row-pointer arrays of both blocks are indexed by local row number */
  row -= Aij->rstart;
  *len = aij->i[row+1] - aij->i[row] + bij->i[row+1] - bij->i[row];
  return 0;
}
/* Find the largest number of stored entries in any local row of an
 * MPIAIJ matrix, counting diagonal-block and off-diagonal-block entries
 * together.
 *
 *   A      - parallel matrix; its data pointer is read as Mat_MPIAIJ
 *   rowlen - output: maximum combined row length (0 if there are no rows)
 *
 * Always returns 0. */
int MatMaxRowLen_MPIAIJ(Mat A, int *rowlen)
{
  Mat_MPIAIJ *mpi  = (Mat_MPIAIJ *) A->data;
  Mat_SeqAIJ *diag = (Mat_SeqAIJ *) mpi->A->data;
  Mat_SeqAIJ *offd = (Mat_SeqAIJ *) mpi->B->data;
  int longest = 0;
  int r = 0;

  /* the diagonal block's row count drives the scan over local rows */
  while (r < diag->m) {
    int here = (diag->i[r+1] - diag->i[r]) + (offd->i[r+1] - offd->i[r]);
    longest = (here > longest) ? here : longest;
    r++;
  }

  *rowlen = longest;
  return 0;
}

/* For every locally owned row of A, store in e the largest absolute value
 * among the entries whose column index differs from the row index.
 * Rows with no such entries get 0.0.
 *
 *   A - matrix whose local rows are scanned with MatGetRow
 *   e - vector receiving one value per local row (global row number is
 *       the index), inserted with INSERT_VALUES and then assembled
 *
 * Returns 0 on success; PETSc errors are propagated through CHKERRQ. */
int MatMaxRowOffDiagElement_MPIAIJ(Mat A,Vec e)
{
  int first,last,grow,ierr;

  ierr = MatGetOwnershipRange(A,&first,&last); CHKERRQ(ierr);

  grow = first;
  while (grow<last) {
    int nz,*colidx,k;
    Scalar *rowvals,biggest=0.0;

    ierr = MatGetRow(A,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
    for (k=0; k<nz; k++) {
      Scalar mag;
      if (colidx[k]==grow) continue;        /* skip the diagonal entry */
      mag = fabs(rowvals[k]);
      if (mag>biggest) biggest = mag;
    }
    ierr = VecSetValues(e,1,&grow,&biggest,INSERT_VALUES); CHKERRQ(ierr);
    ierr = MatRestoreRow(A,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
    grow++;
  }

  ierr = VecAssemblyBegin(e); CHKERRQ(ierr);
  ierr = VecAssemblyEnd(e); CHKERRQ(ierr);

  return 0;
}

/* Column-wise counterpart of MatMaxRowOffDiagElement_MPIAIJ.
 *
 * Every off-diagonal entry of a locally owned row contributes its absolute
 * value to the entry of e indexed by its column. Contributions are combined
 * with ADD_VALUES, so despite the routine's name each entry of e ends up
 * holding whatever ADD_VALUES accumulates on top of the prior contents of
 * e -- not a maximum.
 * NOTE(review): the original author tagged the insert mode with "MAX",
 * which suggests a max-combining mode was intended but not used here.
 *
 *   A - matrix whose local rows are scanned with MatGetRow
 *   e - vector receiving the accumulated values, assembled before return
 *
 * Returns 0 on success; PETSc errors are propagated through CHKERRQ. */
int MatMaxColOffDiagElement_MPIAIJ(Mat A,Vec e)
{
  int first,last,grow,ierr;

  ierr = MatGetOwnershipRange(A,&first,&last); CHKERRQ(ierr);

  for (grow=first; grow<last; grow++) {
    int nz,*colidx,k;
    Scalar *rowvals;

    ierr = MatGetRow(A,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
    for (k=0; k<nz; k++) {
      Scalar mag;
      if (colidx[k]==grow) continue;        /* skip the diagonal entry */
      mag = fabs(rowvals[k]);
      ierr = VecSetValues(e,1,&colidx[k],&mag,ADD_VALUES); CHKERRQ(ierr);
    }
    ierr = MatRestoreRow(A,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
  }

  ierr = VecAssemblyBegin(e); CHKERRQ(ierr);
  ierr = VecAssemblyEnd(e); CHKERRQ(ierr);

  return 0;
}

/* Form the product of two parallel matrices, returning a new matrix in *C.
 *
 * The local part of A consists of a diagonal block (Aij->A) and an
 * off-diagonal block (Aij->B). Two row-strips of B are extracted with
 * MatGetSubMatrices -- one for the column range [cstart,cend) of A and one
 * for the columns listed in Aij->garray -- each block of A is multiplied
 * with its strip by MatMatMult_AIJ, and the two products are added row by
 * row into the result.
 *
 *   A, B - factors; the global column count of A must equal the global
 *          row count of B, otherwise an error is raised
 *   C    - output: newly created MPIAIJ matrix, assembled on return
 *
 * Returns 0 on success; PETSc errors are propagated through CHKERRQ. */
int MatMatMult_MPIAIJ(Mat A,Mat B,Mat *C)
{
  Mat_MPIAIJ  *Aij = (Mat_MPIAIJ *) A->data;
  MPI_Comm    comm = A->comm;
  Mat *sub,tmp1,tmp2,res;
  int iRow,ierr;
  int A_start,A_local,A_global,B_global;
  /* column count of A's off-diagonal block; used as the length of garray */
  int A_off = ((Mat_SeqAIJ*) Aij->B->data)->n;

  {/* consistency check: columns of A against rows of B */
    int A_dum,B_dum;
    ierr = MatGetSize(A,&A_global,&A_dum); CHKERRQ(ierr);
    ierr = MatGetSize(B,&B_dum,&B_global); CHKERRQ(ierr);
    if (A_dum!=B_dum) SETERRQ(1,0,"MatMatMult: Global size mismatch");
  }

  /* get various sizes, and allocate the result matrix */
  {
    int A_jlocal,B_local,B_jlocal,idum;
    ierr = MatGetOwnershipRange(A,&A_start,&idum); CHKERRQ(ierr);
    ierr = MatGetLocalSize(A,&A_local,&A_jlocal); CHKERRQ(ierr);
    ierr = MatGetLocalSize(B,&B_local,&B_jlocal); CHKERRQ(ierr);
    if (A_global==B_global) {
      /* square result: reuse A's local row count for both local sizes */
      ierr = MatCreateMPIAIJ(comm,A_local,A_local,PETSC_DECIDE,PETSC_DECIDE,
                             0,0,0,0,&res); CHKERRQ(ierr);
    } else {/* is this case ever appropriate? */
      ierr = MatCreateMPIAIJ(comm,PETSC_DECIDE,PETSC_DECIDE,A_global,B_global,
                             0,0,0,0,&res); CHKERRQ(ierr);
    }
  }

  /* Get the two subblocks from B.
   * NOTE(review): with MAT_INITIAL_MATRIX, MatGetSubMatrices may allocate
   * the Mat array itself and overwrite `sub`; if so this allocation is
   * lost. Confirm against the PETSc version in use. */
  sub = (Mat *) PetscMalloc(2*sizeof(Mat)); CHKPTRQ(sub);
  {
    IS A_home,A_away,B_total,*IS_A,*IS_B;
    /* index sets in A and B */
    IS_A = (IS *) PetscMalloc(2*sizeof(IS)); CHKPTRQ(IS_A);
    IS_B = (IS *) PetscMalloc(2*sizeof(IS)); CHKPTRQ(IS_B);
    ierr = ISCreateStride(comm,B_global,0,1,&B_total); CHKERRQ(ierr);
    ierr = ISCreateGeneral(comm,A_off,Aij->garray,&A_away); CHKERRQ(ierr);
    ierr = ISCreateStride(comm,Aij->cend-Aij->cstart,Aij->cstart,1,&A_home);
    CHKERRQ(ierr);
    IS_A[0] = A_home; IS_A[1] = A_away; IS_B[0] = IS_B[1] = B_total;

    /* get the B block corresponding to the A diagonal/offdiagonal block */
    ierr = MatGetSubMatrices(B,2,IS_A,IS_B,MAT_INITIAL_MATRIX,&sub);
    CHKERRQ(ierr);

    /* index sets are no longer needed; check the destroy calls like the
     * rest of this file does */
    ierr = ISDestroy(A_home); CHKERRQ(ierr);
    ierr = ISDestroy(A_away); CHKERRQ(ierr);
    ierr = ISDestroy(B_total); CHKERRQ(ierr);
    PetscFree(IS_A); PetscFree(IS_B);
    {
      Mat diag_factor = sub[0], off_factor = sub[1];
      ierr = MatMatMult_AIJ(Aij->A,diag_factor,&tmp1); CHKERRQ(ierr);
      ierr = MatDestroy(diag_factor); CHKERRQ(ierr);
      ierr = MatMatMult_AIJ(Aij->B,off_factor,&tmp2); CHKERRQ(ierr);
      ierr = MatDestroy(off_factor); CHKERRQ(ierr);
      PetscFree(sub);
    }

    /* sum the two partial products into the global rows owned here */
    for (iRow=0; iRow<A_local; iRow++) {
      int Row=A_start+iRow,ncols,*cols; Scalar *vals;

      ierr = MatGetRow(tmp1,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
      ierr = MatSetValues(res,1,&Row,ncols,cols,vals,ADD_VALUES);
      CHKERRQ(ierr);
      ierr = MatRestoreRow(tmp1,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);

      ierr = MatGetRow(tmp2,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
      ierr = MatSetValues(res,1,&Row,ncols,cols,vals,ADD_VALUES);
      CHKERRQ(ierr);
      ierr = MatRestoreRow(tmp2,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
    }
  }
  ierr = MatAssemblyBegin(res,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(res,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatDestroy(tmp1); CHKERRQ(ierr);
  ierr = MatDestroy(tmp2); CHKERRQ(ierr);

  *C = res;

  return 0;
}

/* VE from here on down probably superfluous */
/* MPI message tags. In the code visible in this part of the file only
 * MBLOCK_TAG is used: it marks the packed row messages exchanged by
 * MatGatherRowsSend and MatGatherRowsReceive. */
#define IBLOCK_TAG 11
#define RBLOCK_TAG 12
#define LBLOCK_TAG 13
#define MBLOCK_TAG 14

/* Collect the rows whose (global) numbers are in "wanted"
 * into a sequential matrix. There are separate creation
 * and collection routines. The case where the index set "wanted"
 * is null on one or more processors is handled correctly.
 *
 * This is the creation routine. It sorts the requested row numbers,
 * splits them by owning processor using the row ownership table of the
 * matrix, and then exchanges the requests so that every processor also
 * knows which of its own rows the others want. The result is stored in a
 * newly allocated context that the collection routines reuse.
 *
 *   mat    - parallel matrix; its data pointer is read as Mat_MPIAIJ
 *   wanted - global row numbers requested by this processor, or null
 *   rgs    - output: the new gather context
 *
 * The routine uses MPI_Gather/MPI_Gatherv with every processor as root in
 * turn, so it has to be called by all processors of the communicator.
 * Returns 0 on success; errors are propagated through CHKERRQ/CHKPTRQ.
 *
 * TODO (from the original author): check header, cookie = MATMPIAIJ */
int MatGatherCtxCreate(Mat mat,IS wanted,MatGatherCtx *rgs)
{
  Mat_MPIAIJ  *Aij = (Mat_MPIAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int numtids,mytid, *owners = Aij->rowners;
  int *request_rows, request_nrows_tot;
  int nrecvs_all,nsends_all;
  int *recv_procs_all,*send_procs_all;
  int *request_nrows,*request_row_ptrs;
  int *send_rows_ptrs,*send_rows,*send_nrows;
  int i,ierr,p,send_nrows_tot;
  MatGatherCtx gs;

  MPI_Comm_size(comm,&numtids);
  MPI_Comm_rank(comm,&mytid);

  /* take a private, sortable copy of the requested row numbers */
  if (wanted) {
    int *request_tmp;

    ierr = ISGetSize(wanted,&request_nrows_tot); CHKERRQ(ierr);
    if (request_nrows_tot) {
      request_rows = (int *) PetscMalloc( request_nrows_tot*sizeof(int) );
      CHKPTRQ(request_rows);
      ierr = ISGetIndices(wanted,&request_tmp); CHKERRQ(ierr);
      PetscMemcpy(request_rows,request_tmp,request_nrows_tot*sizeof(int));
      ierr = ISRestoreIndices(wanted,&request_tmp); CHKERRQ(ierr);
    } else request_rows = 0;
  } else {
    request_rows = 0; request_nrows_tot = 0;
  }

  request_row_ptrs = (int *) PetscMalloc( (numtids+1)*sizeof(int) );
  CHKPTRQ(request_row_ptrs);
  request_nrows = (int *) PetscMalloc( numtids*sizeof(int) );
  CHKPTRQ(request_nrows);

  /* sort requested numbers, and establish pointers for processors:
   * request_row_ptrs[p] is the position of the first requested row that
   * is >= owners[p]. The p<numtids guard keeps an out-of-range row number
   * from running the pointer past the end of request_row_ptrs. */
  ierr = IntSort(request_rows,request_nrows_tot); CHKERRQ(ierr);
  p = 0;
  for (i=0; i<request_nrows_tot; i++) {
    while (p<numtids && request_rows[i]>=owners[p]) request_row_ptrs[p++] = i;
  }
  for (i=p; i<=numtids; i++) request_row_ptrs[i] = request_nrows_tot;
  for (p=0; p<numtids; p++)
    request_nrows[p] = request_row_ptrs[p+1]-request_row_ptrs[p];

  /* every processor gathers how much is wanted from it from each proc */
  send_nrows = (int *) PetscMalloc( (numtids+1)*sizeof(int) );
  CHKPTRQ(send_nrows); PetscMemzero(send_nrows,(numtids+1)*sizeof(int));
  for (p=0; p<numtids; p++) {
    int len;
    len = request_nrows[p];
    ierr = MPI_Gather
      ((void *)(&len),1,MPI_INT,(void *)send_nrows,1,MPI_INT,p,comm);
    CHKERRQ(ierr);
  }

  /* list the processors we receive rows from and send rows to */
  nrecvs_all = nsends_all = 0;
  for (p=0; p<numtids; p++) {
    if (request_nrows[p]) nrecvs_all++;
    if (send_nrows[p]) nsends_all++;
  }
  recv_procs_all = (int *) PetscMalloc( (nrecvs_all+1)*sizeof(int) );
  CHKPTRQ(recv_procs_all);
  send_procs_all = (int *) PetscMalloc( (nsends_all+1)*sizeof(int) );
  CHKPTRQ(send_procs_all);
  {
    int nr=0,ns=0;
    for (p=0; p<numtids; p++) {
      if (request_nrows[p]) recv_procs_all[nr++] = p;
      if (send_nrows[p])    send_procs_all[ns++] = p;
    }
  }

  /* set up pointers to receive info on what exactly is wanted */
  send_nrows_tot = 0;
  for (p=0; p<numtids; p++) send_nrows_tot += send_nrows[p];
  send_rows = (int *) PetscMalloc( (send_nrows_tot+1)*sizeof(int) );
  CHKPTRQ(send_rows);
  send_rows_ptrs = (int *) PetscMalloc( (numtids+1)*sizeof(int) );
  CHKPTRQ(send_rows_ptrs);
  send_rows_ptrs[0] = 0;
  for (p=0; p<numtids; p++)
    send_rows_ptrs[p+1] = send_rows_ptrs[p]+send_nrows[p];

  /* now actually receive that info: processor p collects, from everyone,
   * the row numbers requested of it */
  for (p=0; p<numtids; p++) {
    int len;
    len=request_row_ptrs[p+1]-request_row_ptrs[p];
    ierr = MPI_Gatherv
      ((void *)&(request_rows[request_row_ptrs[p]]),len,MPI_INT,
       (void *)send_rows,send_nrows,send_rows_ptrs,MPI_INT,p,comm);
    CHKERRQ(ierr);
  }

  /* save reusable information */
  gs = PetscNew(_MatGatherCtx); CHKPTRQ(gs);
  ierr = VecScatterCopy(Aij->Mvctx,(VecScatter*)&(gs->vs)); CHKERRQ(ierr);
  gs->comm = comm; gs->N = Aij->N;
  gs->mytid = mytid; gs->numtids = numtids;
  gs->request_nrows_tot = request_nrows_tot;
  gs->send_nrows_tot = send_nrows_tot;
  gs->nsends_all = nsends_all;
  gs->nrecvs_all = nrecvs_all;
  gs->custom_sends = gs->custom_recvs = 0;
  gs->send_procs_all = send_procs_all;
  gs->recv_procs_all = recv_procs_all;
  gs->send_rows = send_rows;
  gs->send_rows_ptrs = send_rows_ptrs;
  gs->request_row_ptrs = request_row_ptrs;
  gs->nsends_outstanding = 0;
  gs->send_requests = 0;
  gs->send_buffers = 0;
  *rgs = gs;

  /* free temporaries; the sorted request list is not kept in the context,
   * so it has to be released here as well */
  PetscFree(send_nrows); PetscFree(request_nrows);
  if (request_rows) PetscFree(request_rows);

  return 0;
}

/* Release the index arrays owned by a gather context: the send/receive
 * processor lists, the list of rows to send with its per-processor
 * pointers, and the per-processor pointers into the request list.
 *
 * NOTE(review): the context structure itself and its gs->vs member (filled
 * by VecScatterCopy in MatGatherCtxCreate) are not released here; confirm
 * whether callers are expected to do that or whether this is a leak.
 *
 * Always returns 0. */
int MatGatherCtxDestroy(MatGatherCtx gs)
{
  PetscFree(gs->send_procs_all);
  PetscFree(gs->recv_procs_all);
  PetscFree(gs->send_rows);
  PetscFree(gs->send_rows_ptrs);
  PetscFree(gs->request_row_ptrs);
  return 0;
}

/* Receive the requested matrix rows and assemble them into a new
 * sequential matrix.
 *
 *   gs         - gather context built by MatGatherCtxCreate
 *   pf, pl     - optional selection callback and its data: when pf is
 *                non-null only processors p with (*pf)(p,pl) true are
 *                received from; when null all processors in
 *                gs->recv_procs_all are used
 *   return_mat - output: sequential matrix with gs->request_nrows_tot rows
 *                and gs->N columns; row k holds the k-th requested row in
 *                the sorted order established at context creation
 *
 * Each incoming message is a packed buffer: one int (row count), then per
 * row one int (length), the column indices and the values. After all
 * messages are in, any nonblocking sends posted by MatGatherRowsSend are
 * completed and their buffers and request array released.
 *
 * NOTE(review): values are unpacked as MPI_DOUBLE, which assumes Scalar is
 * double -- confirm.
 *
 * Returns 0 on success; errors are propagated through CHKERRQ/CHKPTRQ. */
static int MatGatherRowsReceive
(MatGatherCtx gs,PipelineFunction pf,PetscObject pl,Mat *return_mat)
{
  Mat catch_mat;
  MPI_Comm comm = gs->comm;
  int ierr,count,ip;
  int *recv_procs,nrecvs;

  gs->custom_recvs = pf ? 1 : 0;

  /* Determine what processors we are going to receive from */
  recv_procs = (int *) PetscMalloc( (gs->nrecvs_all+1)*sizeof(int) );
  CHKPTRQ(recv_procs);
  nrecvs = 0;
  for (ip=0; ip<gs->nrecvs_all; ip++) {
    int p = gs->recv_procs_all[ip];
    if (!pf || (*pf)(p,pl)) recv_procs[nrecvs++] = p;
  }

  ierr = MatCreateSeqAIJ
    (MPI_COMM_SELF,gs->request_nrows_tot,gs->N,0,0,&catch_mat);
  CHKERRQ(ierr);

  /* one message is expected from every selected processor */
  for (count=nrecvs; count>0; count--) {
    int p,buflen,nrows,nrows_test,irow,unpacloc=0,flag=0; void *buf;
    MPI_Status recv_status;

    /* poll the selected processors in turn until a message is pending */
    while (!flag) {
      for (ip=0; ip<nrecvs && !flag; ip++) {
        ierr = MPI_Iprobe(recv_procs[ip],MBLOCK_TAG,
                          comm,&flag,&recv_status);
        CHKERRQ(ierr);
      }
    }

    p = recv_status.MPI_SOURCE;
    MPI_Get_count(&recv_status,MPI_BYTE,&buflen);
    /* buflen is already a byte count */
    buf = (void *) PetscMalloc( (buflen+1)*sizeof(char) );
    CHKPTRQ(buf);
    ierr = MPI_Recv(buf,buflen,MPI_PACKED,p,MBLOCK_TAG,comm,&recv_status);
    CHKERRQ(ierr);

    /* the sender's row count must agree with what was requested of it */
    nrows = gs->request_row_ptrs[p+1]-gs->request_row_ptrs[p];
    MPI_Unpack(buf,buflen,&unpacloc,&nrows_test,1,MPI_INT,comm);
    if (nrows!=nrows_test) {
      PetscFree(buf);
      SETERRQ(p+1,0,"Nrows mismatch");
    }

    for (irow=0; irow<nrows; irow++) {
      int rlen,*idx, row=gs->request_row_ptrs[p]+irow;
      Scalar *val;
      MPI_Unpack(buf,buflen,&unpacloc,&rlen,1,MPI_INT,comm);
      idx = (int *) PetscMalloc( (1+rlen)*sizeof(int) );
      CHKPTRQ(idx);
      MPI_Unpack(buf,buflen,&unpacloc,idx,rlen,MPI_INT,comm);
      val = (Scalar *) PetscMalloc( (1+rlen)*sizeof(Scalar) );
      CHKPTRQ(val);
      MPI_Unpack(buf,buflen,&unpacloc,val,rlen,MPI_DOUBLE,comm);
      ierr = MatSetValues(catch_mat,1,&row,rlen,idx,val,INSERT_VALUES);
      CHKERRQ(ierr);
      PetscFree(idx); PetscFree(val);
    }
    PetscFree(buf);
  }

  ierr = MatAssemblyBegin(catch_mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(catch_mat,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);

  /* complete the nonblocking sends posted by MatGatherRowsSend and give
   * back everything that was allocated for them */
  if (gs->nsends_outstanding) {
    MPI_Status *send_status;
    send_status = (MPI_Status *) PetscMalloc
      (gs->nsends_outstanding*sizeof(MPI_Status)); CHKPTRQ(send_status);
    ierr = MPI_Waitall(gs->nsends_outstanding,gs->send_requests,send_status);
    CHKERRQ(ierr);
    PetscFree(send_status);
    for (ip=0; ip<gs->nsends_outstanding; ip++)
      PetscFree(gs->send_buffers[ip]);
    PetscFree(gs->send_buffers);
    PetscFree(gs->send_requests);
    gs->send_buffers = 0;
    gs->send_requests = 0;
    gs->nsends_outstanding = 0;
  }

  *return_mat = catch_mat;

  PetscFree(recv_procs);

  return 0;
}

/* Pack and send the locally owned rows that other processors requested.
 *
 *   gs     - gather context built by MatGatherCtxCreate
 *   mat    - matrix the rows are read from (MatGetRow / MatGetRowLen_MPIAIJ)
 *   pf, pl - optional selection callback and its data: when pf is non-null
 *            only processors p with (*pf)(p,pl) true are sent to
 *
 * Without a callback the messages go out with MPI_Isend; the requests and
 * buffers are parked in the context and gs->nsends_outstanding records how
 * many there are. With a callback each message is sent with a blocking
 * MPI_Send and its buffer is freed immediately.
 *
 * Returns 0 on success; errors are propagated through CHKERRQ/CHKPTRQ. */
static int MatGatherRowsSend
(MatGatherCtx gs,Mat mat,PipelineFunction pf,PetscObject pl)
{
  MPI_Comm comm = gs->comm;
  int *targets,ntargets = 0;
  int k;

  gs->custom_sends = pf ? 1 : 0;

  /* Determine what processors we are going to send to */
  targets = (int *) PetscMalloc( (gs->nsends_all+1)*sizeof(int) );
  CHKPTRQ(targets);
  for (k=0; k<gs->nsends_all; k++) {
    int dest = gs->send_procs_all[k];
    if (!pf || (*pf)(dest,pl)) targets[ntargets++] = dest;
  }

  /* bookkeeping for nonblocking sends is only needed in the default mode */
  if (!gs->custom_sends && ntargets) {
    gs->nsends_outstanding = ntargets;
    gs->send_requests = (MPI_Request *) PetscMalloc
      (ntargets*sizeof(MPI_Request)); CHKPTRQ(gs->send_requests);
    gs->send_buffers = (void **) PetscMalloc
      (ntargets*sizeof(void*)); CHKPTRQ(gs->send_buffers);
  } else {
    gs->nsends_outstanding = 0;
  }

  /* One message per destination. The packed buffer contains 1 int (number
   * of rows to follow), and for each row 1 int (the length), the column
   * indices and the values. A first pass over the rows sizes the buffer,
   * a second pass fills it. */
  for (k=0; k<ntargets; k++) {
    void *packbuf;
    int ierr,r,dest = targets[k],bufsize = 0,piece,position = 0;
    int *rows = gs->send_rows + gs->send_rows_ptrs[dest];
    int nrows = gs->send_rows_ptrs[dest+1] - gs->send_rows_ptrs[dest];

    /* pass 1: upper bound on the packed size */
    MPI_Pack_size(1,MPI_INT,comm,&piece); bufsize += piece;
    for (r=0; r<nrows; r++) {
      int rowlen;
      ierr = MatGetRowLen_MPIAIJ(mat,rows[r],&rowlen); CHKERRQ(ierr);
      MPI_Pack_size(1,MPI_INT,comm,&piece); bufsize += piece;
      MPI_Pack_size(rowlen,MPI_INT,comm,&piece); bufsize += piece;
      MPI_Pack_size(rowlen,MPI_DOUBLE,comm,&piece); bufsize += piece;
    }
    packbuf = (void *) PetscMalloc(bufsize);
    CHKPTRQ(packbuf);

    /* pass 2: row count first, then every row */
    MPI_Pack((void *)&nrows,1,MPI_INT,packbuf,bufsize,&position,comm);
    for (r=0; r<nrows; r++) {
      int grow = rows[r],nz,*colidx; Scalar *rowvals;

      ierr = MatGetRow(mat,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
      MPI_Pack((void *)&nz,1,MPI_INT,packbuf,bufsize,&position,comm);
      MPI_Pack((void *)colidx,nz,MPI_INT,packbuf,bufsize,&position,comm);
      MPI_Pack((void *)rowvals,nz,MPI_DOUBLE,packbuf,bufsize,&position,comm);
      ierr = MatRestoreRow(mat,grow,&nz,&colidx,&rowvals); CHKERRQ(ierr);
    }
    if (position>bufsize) SETERRQ(k+1,0,"Pack overflow");

    if (!gs->custom_sends) {
      /* buffer must stay alive until the receive side waits on the request */
      ierr = MPI_Isend
        (packbuf,position,MPI_PACKED,dest,MBLOCK_TAG,comm,
         gs->send_requests+k); CHKERRQ(ierr);
      gs->send_buffers[k] = packbuf;
    } else {
      ierr = MPI_Send
        (packbuf,position,MPI_PACKED,dest,MBLOCK_TAG,comm);
      CHKERRQ(ierr);
      PetscFree(packbuf);
    }
  }

  PetscFree(targets);
  return 0;
}

/* Gather the rows described by a gather context into a sequential matrix.
 * Runs the send phase and then the receive phase, both without a
 * processor-selection callback.
 *
 *   mat     - matrix the rows are taken from
 *   gs      - gather context built by MatGatherCtxCreate
 *   res_mat - output: the sequential matrix produced by the receive phase
 *
 * Returns 0 on success; errors are propagated through CHKERRQ.
 *
 * TODO (from the original author): check header, cookie = MATMPIAIJ */
int MatGatherRows(Mat mat,MatGatherCtx gs, Mat *res_mat)
{
  int ierr;

  ierr = MatGatherRowsSend(gs,mat,(PipelineFunction)0,(PetscObject)0);
  CHKERRQ(ierr);

  ierr = MatGatherRowsReceive(gs,(PipelineFunction)0,(PetscObject)0,res_mat);
  CHKERRQ(ierr);

  return 0;
}

/* First half of a pipelined row gather.
 *
 * The selection callback is taken from the pipeline stored in the context:
 * dn_fun when ptype is PIPELINE_CUSTOM_UP, up_fun otherwise. If there is no
 * callback, a complete gather (send followed by receive) is done here.
 * With a callback, only the receive phase runs here, restricted to the
 * processors the callback selects; the matching sends are issued by
 * MatGatherRowsPipelineEnd.
 *
 *   mat     - matrix the rows are taken from (only used without callback)
 *   ptype   - pipeline direction
 *   gs      - gather context; gs->vs supplies callbacks and callback data
 *   ret_mat - output: sequential matrix with the received rows
 *
 * Returns 0 on success; errors from either phase are propagated through
 * CHKERRQ. */
int MatGatherRowsPipelineBegin
(Mat mat,PipelineMode ptype, MatGatherCtx gs, Mat *ret_mat)
{
  VecPipeline pipe = gs->vs; PipelineFunction pf;
  PetscObject pl = pipe->custom_pipe_data;
  int ierr;

  if (ptype == PIPELINE_CUSTOM_UP)
    pf = pipe->dn_fun;
  else
    pf = pipe->up_fun;

  if (!pf) {
    ierr = MatGatherRowsSend(gs,mat,0,0); CHKERRQ(ierr);
    ierr = MatGatherRowsReceive(gs,0,0,ret_mat); CHKERRQ(ierr);
  } else {
    ierr = MatGatherRowsReceive(gs,pf,pl,ret_mat); CHKERRQ(ierr);
  }
  return 0;
}

/* Second half of a pipelined row gather.
 *
 * The selection callback is taken from the pipeline stored in the context:
 * up_fun when ptype is PIPELINE_CUSTOM_UP, dn_fun otherwise. If a callback
 * exists, the rows are sent to the processors it selects; without one
 * nothing is done.
 *
 *   mat     - matrix the rows are taken from
 *   ptype   - pipeline direction
 *   gs      - gather context; gs->vs supplies callbacks and callback data
 *   ret_mat - not used by this routine
 *
 * Returns 0 on success; errors are propagated through CHKERRQ. */
int MatGatherRowsPipelineEnd
(Mat mat,PipelineMode ptype, MatGatherCtx gs, Mat *ret_mat)
{
  VecPipeline pipe = gs->vs;
  PetscObject pl = pipe->custom_pipe_data;
  PipelineFunction pf =
    (ptype == PIPELINE_CUSTOM_UP) ? pipe->up_fun : pipe->dn_fun;
  int ierr;

  if (!pf) return 0;

  ierr = MatGatherRowsSend(gs,mat,pf,pl); CHKERRQ(ierr);
  return 0;
}

/* VE !!! not removed */
/* Extract a sequential submatrix of a parallel matrix.
 *
 * The rows listed in isrow are first gathered, with all their columns,
 * into a sequential "wide" matrix; the columns in iscol are then selected
 * from it with MatGetSubMatrices.
 *
 *   A     - parallel matrix
 *   isrow - global row numbers wanted on this processor; may be null, in
 *           which case the gather still runs and *B is set to 0
 *   iscol - columns to keep
 *   scall - accepted for interface compatibility but not consulted; the
 *           extraction always uses MAT_INITIAL_MATRIX
 *   B     - output: the submatrix, or 0 when isrow is null
 *
 * Returns 0 on success; errors are propagated through CHKERRQ/CHKPTRQ. */
int MatGetSubMatrix_MPIAIJ(Mat A,IS isrow,IS iscol,MatGetSubMatrixCall scall,Mat *B)
{
  MatGatherCtx get_strip;
  Mat wide_mat;
  int ierr;

  ierr = MatGatherCtxCreate(A,isrow,&get_strip); CHKERRQ(ierr);
  ierr = MatGatherRows(A,get_strip,&wide_mat); CHKERRQ(ierr);

  if (isrow) {
    Mat *xmat;
    IS all_rows;
    int nrows;

    /* keep every row of the gathered strip, restrict the columns */
    ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
    ierr = ISCreateStride(MPI_COMM_SELF,nrows,0,1,&all_rows); CHKERRQ(ierr);
    /* NOTE(review): with MAT_INITIAL_MATRIX, MatGetSubMatrices may allocate
     * the Mat array itself and overwrite xmat; if so this allocation is
     * lost. Confirm against the PETSc version in use. */
    xmat = (Mat *) PetscMalloc(sizeof(Mat)); CHKPTRQ(xmat);
    ierr = MatGetSubMatrices
      (wide_mat,1,&all_rows,&iscol,MAT_INITIAL_MATRIX,&xmat);
    CHKERRQ(ierr);
    *B = *xmat; PetscFree(xmat);
    ierr = ISDestroy(all_rows); CHKERRQ(ierr);
  } else *B = 0;

  ierr = MatGatherCtxDestroy(get_strip); CHKERRQ(ierr);

  /* the gathered strip is created by MatGatherRows whether or not rows
   * were requested here, so it is always released */
  ierr = MatDestroy(wide_mat); CHKERRQ(ierr);

  return 0;
}
