/*
    Defines parallel vector scatters.
*/

#include "sys.h"
#include "src/is/isimpl.h"
#include "src/vec/vecimpl.h"                     /*I "vec.h" I*/
#include "src/vec/impls/dvecimpl.h"
#include "src/vec/impls/mpi/pvecimpl.h"
#include "vpipe.h"
#include "parpre_vec.h"
#include "parpre_pc.h"
#include "src/pc/pcimpl.h"
#include "src/pc/pcparallel.h"
#include "pinclude/pviewer.h"

/* stolen from vscat.c */
/*
   VecScatterBegin_MPI_ToAll - Moves values between a parallel (Vec_MPI)
   vector and a sequential (Vec_Seq) vector whose length is the sum of all
   local pieces.

   Forward (SCATTER_REVERSE not set): x is the parallel vector, y the
   sequential one.
     INSERT_VALUES : every process gathers all pieces of x straight into y.
     otherwise     : the pieces are gathered into the cached buffer work1
                     and added entry by entry into y.

   Reverse (SCATTER_REVERSE set): x is the sequential vector, y the parallel
   one.
     INSERT_VALUES : each process copies the slice of x that starts at
                     ownership[rank] and has its local length into y; no
                     communication takes place.
     otherwise     : rank 0 gathers the current pieces of y (into work2),
                     receives the sum of all sequential vectors (into work1),
                     adds the two and scatters the result back into y.

   The work buffers are allocated on first use and kept in the context.
   Always returns 0 unless an allocation check (CHKPTRQ) fails.
*/
#undef __FUNC__
#define __FUNC__ "VecScatterBegin_MPI_ToAll"
static int VecScatterBegin_MPI_ToAll(Vec x,Vec y,InsertMode addv,int mode,VecScatter ctx)
{
  VecScatter_MPI_ToAll *toall = (VecScatter_MPI_ToAll *) ctx->todata;
  int                  k;

  if (!(mode & SCATTER_REVERSE)) {
    /* ---- forward: parallel x -> full-length sequential y ---- */
    Vec_MPI *par   = (Vec_MPI *) x->data;
    Vec_Seq *seq   = (Vec_Seq *) y->data;
    Scalar  *local = par->array, *full = seq->array, *gathered;
    int     nfull  = seq->n;

    if (addv == INSERT_VALUES) {
      MPI_Allgatherv(local,par->n,MPIU_SCALAR,full,toall->count,par->ownership,MPIU_SCALAR,ctx->comm);
      return 0;
    }

    /* gather into scratch space first, then accumulate */
    gathered = toall->work1;
    if (!gathered) {
      gathered     = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
      toall->work1 = gathered;
      CHKPTRQ(gathered);
    }
    MPI_Allgatherv(local,par->n,MPIU_SCALAR,gathered,toall->count,par->ownership,MPIU_SCALAR,ctx->comm);
    for ( k=0; k<nfull; k++ ) {
      full[k] += gathered[k];
    }
    return 0;
  }

  /* ---- reverse: full-length sequential x -> parallel y ---- */
  {
    Vec_MPI *par     = (Vec_MPI *) y->data;
    Vec_Seq *seq     = (Vec_Seq *) x->data;
    Scalar  *full    = seq->array, *local = par->array, *summed, *pieces = 0;
    int     nfull    = seq->n;
    int     is_root  = !par->rank;

    if (addv == INSERT_VALUES) {
      /*
         Only meaningful when all sequential vectors hold the same values:
         each process simply keeps its own slice.
      */
      PetscMemcpy(local,full+par->ownership[par->rank],par->n*sizeof(Scalar));
      return 0;
    }

    summed = toall->work1;
    if (!summed) {
      summed       = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
      toall->work1 = summed;
      CHKPTRQ(summed);
    }

    /* step 1: collect the current contents of y on rank 0 */
    if (is_root) {
      pieces = toall->work2;
      if (!pieces) {
        pieces       = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
        toall->work2 = pieces;
        CHKPTRQ(pieces);
      }
      MPI_Gatherv(local,par->n,MPIU_SCALAR,pieces,toall->count,par->ownership,MPIU_SCALAR,0,ctx->comm);
    } else {
      MPI_Gatherv(local,par->n,MPIU_SCALAR,0,0,0,MPIU_SCALAR,0,ctx->comm);
    }

    /* step 2: sum all sequential vectors onto rank 0 */
#if defined(PETSC_COMPLEX)
    MPI_Reduce(full,summed,2*nfull,MPI_DOUBLE,MPI_SUM,0,ctx->comm);
#else
    MPI_Reduce(full,summed,nfull,MPIU_SCALAR,MPI_SUM,0,ctx->comm);
#endif

    /* step 3: rank 0 combines both and hands every process its piece */
    if (is_root) {
      for ( k=0; k<nfull; k++ ) {
        summed[k] += pieces[k];
      }
      MPI_Scatterv(summed,toall->count,par->ownership,MPIU_SCALAR,local,par->n,MPIU_SCALAR,0,ctx->comm);
    } else {
      MPI_Scatterv(0,toall->count,par->ownership,MPIU_SCALAR,local,par->n,MPIU_SCALAR,0,ctx->comm);
    }
  }
  return 0;
}

/*
   VecPipelineBegin_MPI_ToAll - Pipeline counterpart of the to-all scatter:
   moves values between a parallel (Vec_MPI) vector and a full-length
   sequential (Vec_Seq) vector.

   The direction is chosen by testing mode against SCATTER_REVERSE.
     not set, INSERT_VALUES : all pieces of x are gathered directly into y.
     not set, otherwise     : the pieces are gathered into work1 and added
                              into y.
     set, INSERT_VALUES     : the locally owned slice of x is copied into y.
     set, otherwise         : rank 0 gathers y, receives the sum of all x,
                              adds both and scatters the result back to y.

   Scratch buffers work1/work2 are created on first use and stored in the
   context. Returns 0 unless an allocation check (CHKPTRQ) fails.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineBegin_MPI_ToAll"
static int VecPipelineBegin_MPI_ToAll(Vec x,Vec y,InsertMode addv,PipelineMode mode,VecPipeline ctx)
{
  VecPipeline_MPI_ToAll *toall = (VecPipeline_MPI_ToAll *) ctx->todata;
  int                   k;

  if (!(mode & SCATTER_REVERSE)) {
    /* ---- parallel x -> full-length sequential y ---- */
    Vec_MPI *par   = (Vec_MPI *) x->data;
    Vec_Seq *seq   = (Vec_Seq *) y->data;
    Scalar  *local = par->array, *full = seq->array, *gathered;
    int     nfull  = seq->n;

    if (addv == INSERT_VALUES) {
      MPI_Allgatherv(local,par->n,MPIU_SCALAR,full,toall->count,par->ownership,MPIU_SCALAR,ctx->comm);
      return 0;
    }

    gathered = toall->work1;
    if (!gathered) {
      gathered     = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
      toall->work1 = gathered;
      CHKPTRQ(gathered);
    }
    MPI_Allgatherv(local,par->n,MPIU_SCALAR,gathered,toall->count,par->ownership,MPIU_SCALAR,ctx->comm);
    for ( k=0; k<nfull; k++ ) {
      full[k] += gathered[k];
    }
    return 0;
  }

  /* ---- full-length sequential x -> parallel y ---- */
  {
    Vec_MPI *par    = (Vec_MPI *) y->data;
    Vec_Seq *seq    = (Vec_Seq *) x->data;
    Scalar  *full   = seq->array, *local = par->array, *summed, *pieces = 0;
    int     nfull   = seq->n;
    int     is_root = !par->rank;

    if (addv == INSERT_VALUES) {
      /*
         Each process keeps only its own slice; this makes sense only if
         all sequential vectors carry identical values.
      */
      PetscMemcpy(local,full+par->ownership[par->rank],par->n*sizeof(Scalar));
      return 0;
    }

    summed = toall->work1;
    if (!summed) {
      summed       = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
      toall->work1 = summed;
      CHKPTRQ(summed);
    }

    /* collect the present contents of y on rank 0 */
    if (is_root) {
      pieces = toall->work2;
      if (!pieces) {
        pieces       = (Scalar *) PetscMalloc(nfull*sizeof(Scalar));
        toall->work2 = pieces;
        CHKPTRQ(pieces);
      }
      MPI_Gatherv(local,par->n,MPIU_SCALAR,pieces,toall->count,par->ownership,MPIU_SCALAR,0,ctx->comm);
    } else {
      MPI_Gatherv(local,par->n,MPIU_SCALAR,0,0,0,MPIU_SCALAR,0,ctx->comm);
    }

    /* add up the sequential vectors of all processes on rank 0 */
#if defined(PETSC_COMPLEX)
    MPI_Reduce(full,summed,2*nfull,MPI_DOUBLE,MPI_SUM,0,ctx->comm);
#else
    MPI_Reduce(full,summed,nfull,MPIU_SCALAR,MPI_SUM,0,ctx->comm);
#endif

    /* rank 0 merges both contributions and distributes the pieces */
    if (is_root) {
      for ( k=0; k<nfull; k++ ) {
        summed[k] += pieces[k];
      }
      MPI_Scatterv(summed,toall->count,par->ownership,MPIU_SCALAR,local,par->n,MPIU_SCALAR,0,ctx->comm);
    } else {
      MPI_Scatterv(0,toall->count,par->ownership,MPIU_SCALAR,local,par->n,MPIU_SCALAR,0,ctx->comm);
    }
  }
  return 0;
}

/*
   VecPipelineDestroy_MPI_ToAll - Releases a to-all pipeline context.

   Frees the count array, whichever of the two work buffers were ever
   allocated, the to-all data structure itself, and finally the object
   header. Always returns 0.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineDestroy_MPI_ToAll"
static int VecPipelineDestroy_MPI_ToAll(PetscObject obj)
{
  VecPipeline           pipe   = (VecPipeline) obj;
  VecPipeline_MPI_ToAll *toall = (VecPipeline_MPI_ToAll *) pipe->todata;

  PetscFree(toall->count);
  /* the work buffers only exist if an accumulating transfer was performed */
  if (toall->work1) {
    PetscFree(toall->work1);
  }
  if (toall->work2) {
    PetscFree(toall->work2);
  }
  PetscFree(pipe->todata);

  PLogObjectDestroy(pipe);
  PetscHeaderDestroy(pipe);
  return 0;
}

/*
   VecPipelineCopy_MPI_ToAll - Duplicates a to-all pipeline context.

   Input:   in  - the context to copy
   Output:  out - receives the function table of in and a fresh to-all data
                  structure holding a private copy of the per-process counts

   The work buffers of the copy start out empty (0), and out->fromdata is
   set to 0. The number of counts copied is the size of out->comm.
   Returns 0 unless an allocation check (CHKPTRQ) fails.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineCopy_MPI_ToAll"
static int VecPipelineCopy_MPI_ToAll(VecPipeline in,VecPipeline out)
{
  VecPipeline_MPI_ToAll *src = (VecPipeline_MPI_ToAll *) in->todata;
  VecPipeline_MPI_ToAll *dup;
  int                   nproc, p;

  /* share the operations of the original */
  out->pipelinebegin = in->pipelinebegin;
  out->pipelineend   = in->pipelineend;
  out->copy          = in->copy;
  out->destroy       = in->destroy;
  out->view          = in->view;

  dup       = PetscNew(VecPipeline_MPI_ToAll); CHKPTRQ(dup);
  dup->type = VEC_SCATTER_MPI_TOALL;

  /* one count per process of the new context's communicator */
  MPI_Comm_size(out->comm,&nproc);
  dup->count = (int *) PetscMalloc(nproc*sizeof(int)); CHKPTRQ(dup->count);
  p = 0;
  while (p < nproc) {
    dup->count[p] = src->count[p];
    p++;
  }

  /* scratch space is allocated lazily by the transfer routine */
  dup->work1 = 0;
  dup->work2 = 0;
  PLogObjectMemory(out,sizeof(VecPipeline_MPI_ToAll)+nproc*sizeof(int));

  out->todata   = (void *) dup;
  out->fromdata = (void *) 0;
  return 0;
}

/*
   VecPipelineView_MPI_ToAll - Prints the message layout of a pipeline.

   Although named "ToAll", the routine interprets both ctx->todata and
   ctx->fromdata as VecPipeline_MPI_General structures. For every process
   it prints the number of sends/receives, the "below" and "self" counters
   and, per message, its length and partner process.

   Only ASCII_FILE_VIEWER and ASCII_FILES_VIEWER viewers are handled; for
   any other viewer type the routine returns 0 without printing. Output is
   bracketed by PetscSequentialPhaseBegin/End on the context communicator.
   The index lists themselves are not printed.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineView_MPI_ToAll"
int VecPipelineView_MPI_ToAll(PetscObject obj,Viewer viewer)
{
  VecPipeline             pipe   = (VecPipeline) obj;
  VecPipeline_MPI_General *sends = (VecPipeline_MPI_General *) pipe->todata;
  VecPipeline_MPI_General *recvs = (VecPipeline_MPI_General *) pipe->fromdata;
  int                     m, me, length, ierr;
  FILE                    *out;
  ViewerType              kind;

  ierr = ViewerGetType(viewer,&kind); CHKERRQ(ierr);
  if (!(kind == ASCII_FILE_VIEWER || kind == ASCII_FILES_VIEWER)) {
    return 0;
  }

  MPI_Comm_rank(pipe->comm,&me);
  ierr = ViewerASCIIGetPointer(viewer,&out); CHKERRQ(ierr);

  PetscSequentialPhaseBegin(pipe->comm,1);

  /* outgoing messages */
  fprintf(out,"[%d] Number sends %d below %d self %d\n",me,sends->n,sends->nbelow,sends->nself);
  for ( m=0; m<sends->n; m++ ) {
    length = sends->starts[m+1]-sends->starts[m];
    fprintf(out,"[%d]   %d length %d to whom %d\n",me,m,length,sends->procs[m]);
  }

  /* incoming messages */
  fprintf(out,"[%d]Number receives %d below %d self %d\n",me,recvs->n,recvs->nbelow,recvs->nself);
  for ( m=0; m<recvs->n; m++ ) {
    length = recvs->starts[m+1]-recvs->starts[m];
    fprintf(out,"[%d] %d length %d to whom %d\n",me,m,length,recvs->procs[m]);
  }

  fflush(out);
  PetscSequentialPhaseEnd(pipe->comm,1);
  return 0;
}

/*
    VecScatterLocalOptimize_Private - Determines which part of the local
  part of a scatter is an exact copy of values into their current location
  (to slot equal to from slot). Those entries need not be moved when the
  scatter is applied in place.

  Input:
    gen_to   - local send description; n and slots are read
    gen_from - local receive description; slots is read

  Result, stored in the two structures:
    all entries match  : gen_to->nonmatching_computed = 1 and n_nonmatching
                         is 0 in both structures
    no entry matches   : gen_to->nonmatching_computed = -1; nothing else is
                         stored, the caller has to use the full slot lists
    some entries match : gen_to->nonmatching_computed = 1, n_nonmatching is
                         set in both structures and slots_nonmatching in
                         each receives a newly allocated list holding only
                         the entries that really move

  Returns 0 unless an allocation check (CHKPTRQ) fails.
*/
#undef __FUNC__
#define __FUNC__ "VecScatterLocalOptimize_Private"
static int VecScatterLocalOptimize_Private(VecScatter_Seq_General *gen_to,
                                           VecScatter_Seq_General *gen_from)
{
  int n = gen_to->n,n_nonmatching = 0,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
  int *nto_slots, *nfrom_slots,j = 0;
  
  /* first pass: count the entries that actually change location */
  for ( i=0; i<n; i++ ) {
    if (to_slots[i] != from_slots[i]) n_nonmatching++;
  }

  if (!n_nonmatching) {
    gen_to->nonmatching_computed = 1;
    gen_to->n_nonmatching        = gen_from->n_nonmatching = 0;
    /* the format has one %d, so the original count must be passed */
    PLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to 0\n",n);
  } else if (n_nonmatching == n) {
    gen_to->nonmatching_computed = -1;
    PLogInfo(0,"VecScatterLocalOptimize_Private:All values non-matching\n");
  } else {
    gen_to->nonmatching_computed = 1;
    gen_to->n_nonmatching        = gen_from->n_nonmatching = n_nonmatching;
    nto_slots                    = (int *) PetscMalloc(n_nonmatching*sizeof(int));CHKPTRQ(nto_slots);
    gen_to->slots_nonmatching    = nto_slots;
    nfrom_slots                  = (int *) PetscMalloc(n_nonmatching*sizeof(int));CHKPTRQ(nfrom_slots);
    gen_from->slots_nonmatching  = nfrom_slots;
    /* second pass: keep only the pairs that differ, in their original order */
    for ( i=0; i<n; i++ ) {
      if (to_slots[i] != from_slots[i]) {
        nto_slots[j]   = to_slots[i];
        nfrom_slots[j] = from_slots[i];
        j++;
      }
    }
    PLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to %d\n",n,n_nonmatching);
  } 

  return 0;
}

/*
     Even though the next routines are written with parallel 
  vectors, either xin or yin (but not both) may be Seq
  vectors, one for each processor.

     Note: since nsends, nrecvs and nx may be zero but are used as
  malloc sizes, we always malloc the quantity plus one. This is not 
  an ideal solution, but it ensures that we never try to malloc and 
  then free a zero-size block.
  
     gen_from indices indicate where arriving stuff is stashed
     gen_to   indices indicate where departing stuff came from. 
     the naming can be a little confusing.

*/

/*
   PtoPCopy - Duplicates a general (process to process) pipeline context.

   Input:   in  - context to copy; todata and fromdata are read as
                  VecPipeline_MPI_General
   Output:  out - receives the function table of in plus newly allocated
                  send (todata) and receive (fromdata) descriptions

   For each direction one allocation holds, in this order, the value
   buffer, the request array, the index list, the starts array and the
   process list; indices, starts and procs are copied from the original.
   The local slot lists are copied when local.n is nonzero.

   The send status array is sized for the larger of the send and receive
   counts, because the reverse scatter waits on fromdata->n sends while
   using the status array stored in todata.

   NOTE(review): only local.n and local.slots of the local part are copied;
   any further cached local-scatter state is left as PetscMalloc returned it
   - confirm whether it has to be reset for the copy.

   Returns 0 unless an allocation check (CHKPTRQ) fails.
*/
#undef __FUNC__
#define __FUNC__ "PtoPCopy"
static int PtoPCopy(VecPipeline in,VecPipeline out)
{
  VecPipeline_MPI_General *in_to   = (VecPipeline_MPI_General *) in->todata;
  VecPipeline_MPI_General *in_from = (VecPipeline_MPI_General *) in->fromdata,*out_to,*out_from;
  int            len, ny, nstatus;

  out->scatterbegin    = in->scatterbegin;
  out->scatterend      = in->scatterend;
  out->pipelinebegin    = in->pipelinebegin;
  out->pipelineend      = in->pipelineend;
  out->copy             = in->copy;
  out->destroy          = in->destroy;
  out->view             = in->view;

  /* allocate entire send scatter context */
  out_to           = (VecPipeline_MPI_General *) PetscMalloc(sizeof(VecPipeline_MPI_General));CHKPTRQ(out_to);
  PLogObjectMemory(out,sizeof(VecPipeline_MPI_General));
  ny               = in_to->starts[in_to->n];
  len              = ny*(sizeof(int) + sizeof(Scalar)) +
                     (in_to->n+1)*sizeof(int) +
                     (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
  out_to->n        = in_to->n; 
  out_to->nbelow   = in_to->nbelow;
  out_to->nself    = in_to->nself;
  out_to->type     = in_to->type;

  out_to->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(out_to->values);
  PLogObjectMemory(out,len); 
  /* carve the remaining arrays out of the single allocation */
  out_to->requests = (MPI_Request *) (out_to->values + ny);
  out_to->indices  = (int *) (out_to->requests + out_to->n); 
  out_to->starts   = (int *) (out_to->indices + ny);
  out_to->procs    = (int *) (out_to->starts + out_to->n + 1);
  PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
  PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
  PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
  /* statuses must cover the sends of either scatter direction */
  nstatus          = (in_to->n > in_from->n) ? in_to->n : in_from->n;
  out_to->sstatus  = (MPI_Status *) PetscMalloc((nstatus+1)*sizeof(MPI_Status));
                     CHKPTRQ(out_to->sstatus);
  out->todata      = (void *) out_to;
  out_to->local.n  = in_to->local.n;
  if (in_to->local.n) {
    out_to->local.slots = (int *) PetscMalloc(in_to->local.n*sizeof(int));
    CHKPTRQ(out_to->local.slots);
    PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
    PLogObjectMemory(out,in_to->local.n*sizeof(int));
  }
  else {out_to->local.slots = 0;}

  /* allocate entire receive context */
  out_from           = (VecPipeline_MPI_General *) PetscMalloc(sizeof(VecPipeline_MPI_General));CHKPTRQ(out_from);
  out_from->type     = in_from->type;
  PLogObjectMemory(out,sizeof(VecPipeline_MPI_General));
  ny                 = in_from->starts[in_from->n];
  len                = ny*(sizeof(int) + sizeof(Scalar)) +
                       (in_from->n+1)*sizeof(int) +
                       (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
  out_from->n        = in_from->n; 
  out_from->nbelow   = in_from->nbelow;
  out_from->nself    = in_from->nself;
  out_from->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(out_from->values); 
  PLogObjectMemory(out,len);
  out_from->requests = (MPI_Request *) (out_from->values + ny);
  out_from->indices  = (int *) (out_from->requests + out_from->n); 
  out_from->starts   = (int *) (out_from->indices + ny);
  out_from->procs    = (int *) (out_from->starts + out_from->n + 1);
  PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
  PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
  PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
  /* the receive side owns no status array; do not leave the pointer undefined */
  out_from->sstatus  = 0;
  out->fromdata      = (void *) out_from;
  out_from->local.n  = in_from->local.n;
  if (in_from->local.n) {
    out_from->local.slots = (int *) PetscMalloc(in_from->local.n*sizeof(int));
    CHKPTRQ(out_from->local.slots);
    PLogObjectMemory(out,in_from->local.n*sizeof(int));
    PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
  }
  else {out_from->local.slots = 0;}
  return 0;
}
/* --------------------------------------------------------------------*/
/*
   VecScatterBegin_PtoP - Starts a general process to process scatter from
   xin to yin.

   With SCATTER_REVERSE set in mode the roles of ctx->todata and
   ctx->fromdata are exchanged. The routine
     1) posts one nonblocking receive per incoming message,
     2) packs the values of xin selected by the send index list into the
        send buffer and posts one nonblocking send per outgoing message,
     3) performs the purely local part of the scatter right away.

   Local part: with INSERT_VALUES and both vectors sharing the same array,
   VecScatterLocalOptimize_Private is consulted (once) so that entries
   copied onto themselves can be skipped. For any other addv the local
   values are added into yin.

   The messages are completed by VecScatterEnd_PtoP. Returns 0 unless the
   local optimization reports an error.
*/
#undef __FUNC__
#define __FUNC__ "VecScatterBegin_PtoP"
static int VecScatterBegin_PtoP(Vec xin,Vec yin,InsertMode addv,int mode,VecScatter ctx)
{
  VecScatter_MPI_General *sender, *receiver;
  Scalar                 *src = ((Vec_MPI *) xin->data)->array;
  Scalar                 *dst = ((Vec_MPI *) yin->data)->array;
  Scalar                 *packed;
  int                    *gather, *take, *put;
  int                    m, k, len, cnt, ierr;

  if (mode & SCATTER_REVERSE) {
    sender   = (VecScatter_MPI_General *) ctx->fromdata;
    receiver = (VecScatter_MPI_General *) ctx->todata;
  } else {
    sender   = (VecScatter_MPI_General *) ctx->todata;
    receiver = (VecScatter_MPI_General *) ctx->fromdata;
  }

  /* post receives */
  for ( m=0; m<receiver->n; m++ ) {
    MPI_Irecv(receiver->values+receiver->starts[m],receiver->starts[m+1]-receiver->starts[m],
              MPIU_SCALAR,receiver->procs[m],ctx->tag,ctx->comm,receiver->requests+m);
  }

  /* pack and send; the index list is consumed consecutively over all messages */
  gather = sender->indices;
  for ( m=0; m<sender->n; m++ ) {
    packed = sender->values + sender->starts[m];
    len    = sender->starts[m+1] - sender->starts[m];
    for ( k=0; k<len; k++ ) {
      packed[k] = src[*gather++];
    }
    MPI_Isend(packed,len,MPIU_SCALAR,sender->procs[m],ctx->tag,ctx->comm,sender->requests+m);
  }

  /* nothing stays on this process: done */
  if (!sender->local.n) return 0;

  if (addv != INSERT_VALUES) {
    /* accumulate the local contributions */
    take = sender->local.slots;
    put  = receiver->local.slots;
    cnt  = sender->local.n;
    for ( k=0; k<cnt; k++ ) {
      dst[put[k]] += src[take[k]];
    }
    return 0;
  }

  /* in-place insert: find out (once) which entries really move */
  if (dst == src && !sender->local.nonmatching_computed) {
    ierr = VecScatterLocalOptimize_Private(&sender->local,&receiver->local);CHKERRQ(ierr);
  }

  if (dst == src && sender->local.nonmatching_computed != -1) {
    /* entries copied onto themselves were filtered out; move only the rest */
    take = sender->local.slots_nonmatching;
    put  = receiver->local.slots_nonmatching;
    cnt  = sender->local.n_nonmatching;
  } else {
    take = sender->local.slots;
    put  = receiver->local.slots;
    cnt  = sender->local.n;
  }
  for ( k=0; k<cnt; k++ ) {
    dst[put[k]] = src[take[k]];
  }

  return 0;
}

/*
   VecScatterEnd_PtoP - Completes a scatter started by VecScatterBegin_PtoP.

   With SCATTER_REVERSE set in mode the roles of ctx->todata and
   ctx->fromdata are exchanged. Incoming messages are taken in whatever
   order they complete; each one is unpacked into yin through the receive
   index list, overwriting for INSERT_VALUES and adding otherwise.
   Afterwards all outstanding sends are waited for.

   The status array handed to MPI_Waitall is the one stored in ctx->todata
   in both directions. xin is not used. Always returns 0.
*/
#undef __FUNC__
#define __FUNC__ "VecScatterEnd_PtoP"
static int VecScatterEnd_PtoP(Vec xin,Vec yin,InsertMode addv,int mode,VecScatter ctx)
{
  VecScatter_MPI_General *sender, *receiver;
  Scalar                 *dst = ((Vec_MPI *) yin->data)->array;
  Scalar                 *msg;
  MPI_Status             one_status;
  MPI_Status             *send_statuses = ((VecScatter_MPI_General *) ctx->todata)->sstatus;
  int                    *where;
  int                    remaining, which, len, k;

  if (mode & SCATTER_REVERSE) {
    sender   = (VecScatter_MPI_General *) ctx->fromdata;
    receiver = (VecScatter_MPI_General *) ctx->todata;
  } else {
    sender   = (VecScatter_MPI_General *) ctx->todata;
    receiver = (VecScatter_MPI_General *) ctx->fromdata;
  }

  /* take the receives as they arrive and unpack them into our local space */
  for ( remaining=receiver->n; remaining; remaining-- ) {
    MPI_Waitany(receiver->n,receiver->requests,&which,&one_status);
    msg   = receiver->values  + receiver->starts[which];
    where = receiver->indices + receiver->starts[which];
    len   = receiver->starts[which+1] - receiver->starts[which];
    if (addv == INSERT_VALUES) {
      for ( k=0; k<len; k++ ) {
        dst[where[k]] = msg[k];
      }
    } else {
      for ( k=0; k<len; k++ ) {
        dst[where[k]] += msg[k];
      }
    }
  }

  /* make sure the send buffers may be reused */
  if (sender->n) {
    MPI_Waitall(sender->n,sender->requests,send_statuses);
  }
  return 0;
}
/*
   PtoPPipelinebegin - Receiving half of a pipelined transfer: blocks until
   the messages selected by mode have arrived and unpacks them into yin.

   Input:
     xin  - not used by this routine
     yin  - parallel vector that receives the values
     addv - INSERT_VALUES overwrites, MAX_VALUES keeps the larger of the
            old and the received value, anything else adds
            (the custom modes always overwrite)
     mode - PIPELINE_CUSTOM_UP / PIPELINE_CUSTOM_DOWN: receive from the
              processes accepted by the pick function (ctx->dn_fun for
              CUSTOM_UP, ctx->up_fun for CUSTOM_DOWN), which is called with
              the process number and ctx->custom_pipe_data
            PIPELINE_DOWN: receive the first gen_from->nbelow messages
            otherwise    : receive the remaining messages
     ctx  - pipeline context; fromdata describes the receives

   Errors: missing pick function, a pipeline to self (non-custom modes), or
   a message whose length differs from the expected one.
*/
#undef __FUNC__
#define __FUNC__ "PtoPPipelinebegin"
static int PtoPPipelinebegin(Vec xin,Vec yin,InsertMode addv,PipelineMode mode,
                             VecPipeline ctx)
{
  VecPipeline_MPI_General *gen_to = (VecPipeline_MPI_General *) ctx->todata;
  VecPipeline_MPI_General *gen_from = (VecPipeline_MPI_General *) ctx->fromdata;
  Vec_MPI        *y = (Vec_MPI *)yin->data;
  MPI_Comm       comm = ctx->comm;
  MPI_Request    *rwaits = gen_from->requests;
  int            nbelow = gen_from->nbelow,tag = ctx->tag, i,*indices = gen_from->indices;
  int            *rstarts = gen_from->starts,*rprocs = gen_from->procs,count,imdex,n;
  int            first,nrecvs;
  MPI_Status     rstatus;
  Scalar         *yv = y->array,*val,*rvalues = gen_from->values;

  if (mode == PIPELINE_CUSTOM_UP || mode == PIPELINE_CUSTOM_DOWN) { /* VE */
    PipelineFunction PickFunc;
    MPI_Request *twaits;
    int nwaits;
    if (mode == PIPELINE_CUSTOM_UP) PickFunc = *(ctx->dn_fun);
    else PickFunc = *(ctx->up_fun);
    if (!PickFunc) SETERRQ(1,0,"No custom pick function\n");
    /* post receives from selected processors */
    nwaits = 0;
    twaits = (MPI_Request *) PetscMalloc( (gen_from->n+1) * sizeof(MPI_Request) );
    CHKPTRQ(twaits);
    for ( i=0; i<gen_from->n ; i++ ) {
      if ( PickFunc(rprocs[i],ctx->custom_pipe_data) ) {
        MPI_Irecv((rvalues+rstarts[i]),rstarts[i+1] - rstarts[i],
                  MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
        /* collect the selected requests contiguously for MPI_Waitany */
        twaits[nwaits++] = rwaits[i]; 
      }
    }
    /*  wait on completion of receives */
    count = nwaits;
    while (count) {
      int other;
      MPI_Waitany(nwaits,twaits,&imdex,&rstatus); /* imdex points in twaits */
      /* translate back to the message number by looking up the sender */
      other = rstatus.MPI_SOURCE;
      for (i=0; i<gen_from->n; i++) 
        if (rprocs[i]==other)  {imdex = i; break;} /* imdex points in rwaits */
      val = rvalues + rstarts[imdex];
      MPI_Get_count(&rstatus,MPIU_SCALAR,&n);
      /* unpack receives into our local space */
      if (n != rstarts[imdex+1] - rstarts[imdex]) {
        /* do not leak the temporary request list on the error path */
        PetscFree(twaits);
        SETERRQ(1,0,"Bad message");
      }
      for ( i=0; i<n; i++ ) {
        yv[indices[i+rstarts[imdex]]] = *val++;
      }
      count--;
    }
    PetscFree(twaits);
    return 0;
  }

  if (gen_to->nself || gen_from->nself) SETERRQ(1,0,"PtoPPipelinebegin:No pipeline to self");

  /*
     Both directions run the same receive loop; they differ only in which
     contiguous range [first,first+nrecvs) of the messages is used.
  */
  if (mode == PIPELINE_DOWN) {
    first  = 0;
    nrecvs = nbelow;
  } else { /* Pipeline up */
    first  = nbelow;
    nrecvs = gen_from->n - nbelow;
  }

  /* post receives:   */
  for ( i=first; i<first+nrecvs; i++ ) {
    MPI_Irecv((rvalues+rstarts[i]),rstarts[i+1] - rstarts[i],
                              MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
  }
  /*  wait on receives */
  count = nrecvs;
  while (count) {
    MPI_Waitany(nrecvs,rwaits+first,&imdex,&rstatus);
    /* MPI_Waitany numbers relative to rwaits+first; shift to the message number */
    imdex += first;
    /* unpack receives into our local space */
    val = rvalues + rstarts[imdex];
    MPI_Get_count(&rstatus,MPIU_SCALAR,&n);
    if (n != rstarts[imdex+1] - rstarts[imdex]) 
      SETERRQ(1,0,"PtoPPipelinebegin:Bad message");
    if (addv == INSERT_VALUES) {
      for ( i=0; i<n; i++ ) {
        yv[indices[i+rstarts[imdex]]] = *val++;
      }
    }
    else if (addv == MAX_VALUES) { /* VE */
      for ( i=0; i<n; i++ ) {
        Scalar v = *val++;
        if (yv[indices[i+rstarts[imdex]]]<v) yv[indices[i+rstarts[imdex]]] = v;
      }
    }
    else {
      for ( i=0; i<n; i++ ) {
        yv[indices[i+rstarts[imdex]]] += *val++;
      }
    }
    count--;
  }
  return 0;
}

/*
   PtoPPipelineend - Sending half of a pipelined transfer: packs values of
   xin, sends them to the processes selected by mode and waits until all of
   these sends have completed.

   Input:
     xin  - parallel vector supplying the values
     yin  - not used by this routine
     addv - not used by this routine
     mode - PIPELINE_CUSTOM_UP / PIPELINE_CUSTOM_DOWN: send to the processes
              accepted by the pick function (ctx->up_fun for CUSTOM_UP,
              ctx->dn_fun for CUSTOM_DOWN), which is called with the process
              number and ctx->custom_pipe_data
            PIPELINE_DOWN: send messages gen_to->nbelow .. gen_to->n-1
            otherwise    : send messages 0 .. gen_to->nbelow-1
     ctx  - pipeline context; todata describes the sends

   Errors: missing pick function or a failed allocation check.
*/
#undef __FUNC__
#define __FUNC__ "PtoPPipelineend"
static int PtoPPipelineend(Vec xin,Vec yin,InsertMode addv, PipelineMode mode,
                           VecPipeline ctx)
{
  VecPipeline_MPI_General *gen_to = (VecPipeline_MPI_General *) ctx->todata;
  Vec_MPI        *x = (Vec_MPI *)xin->data;
  MPI_Comm       comm = ctx->comm;
  MPI_Request    *swaits = gen_to->requests;
  MPI_Status     *sstatus;
  int            nsends = gen_to->n,tag = ctx->tag, i,j,*indices = gen_to->indices;
  int            *sstarts = gen_to->starts,*sprocs = gen_to->procs;
  Scalar         *xv = x->array,*val,*svalues = gen_to->values;

  if (mode == PIPELINE_CUSTOM_UP || mode == PIPELINE_CUSTOM_DOWN) { /* VE */
    PipelineFunction PickFunc;
    MPI_Request *twaits;
    int nwaits,count;
    int *index_base=indices;
    if (mode == PIPELINE_CUSTOM_UP) PickFunc = *(ctx->up_fun);
    else PickFunc = *(ctx->dn_fun);
    if (!PickFunc) SETERRQ(1,0,"No custom pick function\n");
    /* do sends:  */
    nwaits = 0;
    twaits = (MPI_Request *) PetscMalloc( (gen_to->n+1) * sizeof(MPI_Request) );
    CHKPTRQ(twaits);
    for ( i=0; i<gen_to->n; i++ ) {
      if ( PickFunc(sprocs[i],ctx->custom_pipe_data) ) {
        /* messages may be skipped, so position the index pointer explicitly */
        indices = index_base+sstarts[i]; /* shift indices to match i */
        val = svalues + sstarts[i];
        for ( j=0; j<sstarts[i+1]-sstarts[i]; j++ ) {
          val[j] = xv[*indices++];
        }
        MPI_Isend((void*)val,sstarts[i+1] - sstarts[i],
                  MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
        /* collect the selected requests contiguously for MPI_Waitall */
        twaits[nwaits++] = swaits[i];
      }
    }
    /* wait for completion of sends */
    count = nwaits;
    if (count) {
      sstatus = (MPI_Status *) PetscMalloc(nwaits*sizeof(MPI_Status));
      CHKPTRQ(sstatus);
      MPI_Waitall(nwaits,twaits,sstatus);
      PetscFree(sstatus);
    }
    PetscFree(twaits);
  }
  else if (mode == PIPELINE_DOWN) {
    /* do sends:  */
    indices += sstarts[gen_to->nbelow]; /* shift indices to match first i */
    for ( i=gen_to->nbelow; i<nsends; i++ ) {
      val = svalues + sstarts[i];
      for ( j=0; j<sstarts[i+1]-sstarts[i]; j++ ) {
        val[j] = xv[*indices++];
      }
      /* requests are stored from the start of swaits so they can be waited on as one block */
      MPI_Isend(val,sstarts[i+1] - sstarts[i],
                   MPIU_SCALAR,sprocs[i],tag,comm,swaits+i-gen_to->nbelow);
    }
    /* wait on sends */
    if (nsends-gen_to->nbelow>0) {
      sstatus = (MPI_Status *)PetscMalloc((nsends-gen_to->nbelow)*sizeof(MPI_Status));
      CHKPTRQ(sstatus);
      MPI_Waitall(nsends-gen_to->nbelow,swaits,sstatus);
      PetscFree(sstatus);
    }
  }
  else {
    /* do sends:  */
    for ( i=0; i<gen_to->nbelow; i++ ) {
      val = svalues + sstarts[i];
      for ( j=0; j<sstarts[i+1]-sstarts[i]; j++ ) {
        val[j] = xv[*indices++];
      }
      MPI_Isend(val,sstarts[i+1]-sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
    }
    /* wait on sends */
    if (gen_to->nbelow>0) {
      sstatus = (MPI_Status *)PetscMalloc((gen_to->nbelow)*sizeof(MPI_Status));
      CHKPTRQ(sstatus);
      MPI_Waitall(gen_to->nbelow,swaits,sstatus);
      PetscFree(sstatus);
    }
  }
  return 0;
}

/*
   PtoPPipelineDestroy - Releases a general (process to process) pipeline
   context.

   Frees the local slot lists of both directions (when present), the send
   status array, the value allocation and the description structure of each
   direction, and finally the object header. Always returns 0.
*/
#undef __FUNC__
#define __FUNC__ "PtoPPipelineDestroy"
static int PtoPPipelineDestroy(PetscObject obj)
{
  VecPipeline             pipe   = (VecPipeline) obj;
  VecPipeline_MPI_General *sends = (VecPipeline_MPI_General *) pipe->todata;
  VecPipeline_MPI_General *recvs = (VecPipeline_MPI_General *) pipe->fromdata;

  /* local (same process) part */
  if (sends->local.slots) {
    PetscFree(sends->local.slots);
  }
  if (recvs->local.slots) {
    PetscFree(recvs->local.slots);
  }

  /* send side; only this side carries a status array */
  PetscFree(sends->sstatus);
  PetscFree(sends->values);
  PetscFree(sends);

  /* receive side */
  PetscFree(recvs->values);
  PetscFree(recvs);

  PLogObjectDestroy(pipe);
  PetscHeaderDestroy(pipe);
  return 0;
}

/* --------------------------------------------------------------*/
/* create parallel to sequential scatter context */
/*
   VecPipelineCreate_PtoS - Builds the communication pattern that moves
   entries of the parallel vector xin, addressed by global index, to
   locations of a per-process vector.

   Input:
     nx    - number of entries in inidx (and of entries read from inidy)
     inidx - global indices into xin; each must lie inside the ownership
             range of some process, otherwise an error is raised
     ny    - used to size the receive buffer and receive index list
             NOTE(review): the code fills these from the nx entries of
             inidy, so ny is presumably expected to equal nx - confirm
     inidy - destination index for each entry of inidx
     xin   - parallel vector providing ownership ranges, tag and communicator
   Output:
     ctx   - todata/fromdata and the function table are filled in

   Outline:
     1) determine the owner of every requested index,
     2) tell each owner which of its entries are wanted (indices are sent
        as integer messages),
     3) from the requests received build the send description (todata),
        sorted by requesting process, with indices relative to the local
        ownership start,
     4) from the own requests build the receive description (fromdata),
     5) entries owned by this process itself go into the local slot lists.

   Returns 0 on success; allocation failures and out-of-range indices are
   reported through CHKPTRQ/SETERRQ.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineCreate_PtoS"
int VecPipelineCreate_PtoS(int nx,int *inidx,int ny,int *inidy,Vec xin,VecPipeline ctx)
{
  Vec_MPI        *x = (Vec_MPI *)xin->data;
  VecPipeline_MPI_General *from,*to;
  int            *source,*lens,rank = x->rank, *owners = x->ownership;
  int            size = x->size,*lowner,*start,found;
  int            *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
  int            *owner,*starts,count,tag = xin->tag,slen;
  int            *rvalues,*svalues,base,imdex,nmax,*values,len,*indx,nprocslocal;
  MPI_Comm       comm = xin->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((nx+1)*sizeof(int)); CHKPTRQ(owner);
  for ( i=0; i<nx; i++ ) {
    idx = inidx[i];
    found = 0;
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) {
      printf("index %d (of %d)->%d not found\n",i,nx,idx);
      SETERRQ(1,0,"PtoSPipelineCreate:Index out of range");
    }
  }
  /* entries owned by this process are handled locally, not by messages */
  nprocslocal = nprocs[rank]; 
  nprocs[rank] = procs[rank] = 0; 
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];} 

  /* inform other processors of number of messages and max length*/
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
  nrecvs = work[rank]; 
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv((rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to 
         the ith processor
  */
  svalues = (int *) PetscMalloc( (nx+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *)PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0; 
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  for ( i=0; i<nx; i++ ) {
    if (owner[i] != rank) {
      svalues[starts[owner[i]]++] = inidx[i];
    }
  }

  /* the packing above advanced starts[]; rebuild it before sending */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,
                comm,send_waits+count++);
    }
  }
  PetscFree(starts);

  base = owners[rank];

  /*  wait on receives */
  lens = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* unpack receives into our local space */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits); 
  
  /* allocate entire send scatter context */
  to = (VecPipeline_MPI_General *) PetscMalloc( sizeof(VecPipeline_MPI_General) ); CHKPTRQ(to);
  /* give every field that is not assigned below a defined (zero) value */
  PetscMemzero(to,sizeof(VecPipeline_MPI_General));
  PLogObjectMemory(ctx,sizeof(VecPipeline_MPI_General));
  len = slen*(sizeof(int) + sizeof(Scalar)) + (nrecvs+1)*sizeof(int) +
        nrecvs*(sizeof(int) + sizeof(MPI_Request));
  to->n        = nrecvs; 
  to->nbelow   = 0;
  to->nself    = 0;
  to->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(to->values);
  PLogObjectMemory(ctx,len);
  /* carve the remaining arrays out of the single allocation */
  to->requests = (MPI_Request *) (to->values + slen);
  to->indices  = (int *) (to->requests + nrecvs); 
  to->starts   = (int *) (to->indices + slen);
  to->procs    = (int *) (to->starts + nrecvs + 1);
  /*
     The reverse scatter waits on nsends messages using this status array,
     the forward scatter on nrecvs; size it for the larger of the two.
  */
  to->sstatus  = (MPI_Status *) PetscMalloc((1+(nrecvs > nsends ? nrecvs : nsends))*sizeof(MPI_Status));
                 CHKPTRQ(to->sstatus);
  ctx->todata  = (void *) to;
  to->starts[0] = 0;

  if (nrecvs) {
    indx = (int *) PetscMalloc( nrecvs*sizeof(int) ); CHKPTRQ(indx);
    for ( i=0; i<nrecvs; i++ ) indx[i] = i;
    PetscSortIntWithPermutation(nrecvs,source,indx);

    /* move the data into the send scatter */
    for ( i=0; i<nrecvs; i++ ) {
      to->starts[i+1] = to->starts[i] + lens[indx[i]];
      to->procs[i]    = source[indx[i]];
      if (source[indx[i]] < rank) to->nbelow++;
      if (source[indx[i]] == rank) to->nself = 1;
      values = rvalues + indx[i]*nmax;
      /* convert the requested global indices to local ones */
      for ( j=0; j<lens[indx[i]]; j++ ) {
        to->indices[to->starts[i] + j] = values[j] - base;
      }
    }
    PetscFree(indx);
  }
  PetscFree(rvalues); PetscFree(lens);
 
  /* allocate entire receive scatter context */
  from = (VecPipeline_MPI_General *) PetscMalloc( sizeof(VecPipeline_MPI_General) ); CHKPTRQ(from);
  /* give every field that is not assigned below a defined (zero) value */
  PetscMemzero(from,sizeof(VecPipeline_MPI_General));
  PLogObjectMemory(ctx,sizeof(VecPipeline_MPI_General));
  len = ny*(sizeof(int) + sizeof(Scalar)) + (nsends+1)*sizeof(int) +
        nsends*(sizeof(int) + sizeof(MPI_Request));
  from->n        = nsends;
  from->nbelow   = 0; 
  from->nself    = 0; 
  from->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(from->values);
  PLogObjectMemory(ctx,len);
  from->requests = (MPI_Request *) (from->values + ny);
  from->indices  = (int *) (from->requests + nsends); 
  from->starts   = (int *) (from->indices + ny);
  from->procs    = (int *) (from->starts + nsends + 1);
  ctx->fromdata  = (void *) from;

  /* move data into receive Pipeline */
  lowner = (int *) PetscMalloc( (size+nsends+1)*sizeof(int) ); CHKPTRQ(lowner);
  start = lowner + size;
  count = 0; from->starts[0] = start[0] = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      if (i < rank) from->nbelow++;
      if (i == rank) from->nself = 1;
      /* lowner maps a process number to its message number */
      lowner[i]            = count;
      from->procs[count++] = i;
      from->starts[count]  = start[count] = start[count-1] + nprocs[i];
    }
  }
  for ( i=0; i<nx; i++ ) {
    if (owner[i] != rank) {
      from->indices[start[lowner[owner[i]]]++] = inidy[i];
    }
  }
  PetscFree(lowner); PetscFree(owner); PetscFree(nprocs);
    
  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *)PetscMalloc( nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  if (nprocslocal) {
    int nt;
    /* we have a Pipeline to ourselves */
    from->local.n = to->local.n = nt = nprocslocal;    
    from->local.slots = (int *) PetscMalloc(nt*sizeof(int));CHKPTRQ(from->local.slots);
    to->local.slots   = (int *) PetscMalloc(nt*sizeof(int));CHKPTRQ(to->local.slots);
    PLogObjectMemory(ctx,2*nt*sizeof(int));
    nt = 0;
    for ( i=0; i<nx; i++ ) {
      idx = inidx[i];
      if (idx >= owners[rank] && idx < owners[rank+1]) {
        to->local.slots[nt] = idx - owners[rank];        
        from->local.slots[nt++] = inidy[i];        
      }
    }
  }
  else { 
    from->local.n = 0; from->local.slots = 0;
    to->local.n   = 0; to->local.slots   = 0;
  } 

  to->type = VEC_SCATTER_MPI_GENERAL; from->type = VEC_SCATTER_MPI_GENERAL;

  ctx->destroy        = PtoPPipelineDestroy;
  ctx->scatterbegin  = VecScatterBegin_PtoP;
  ctx->scatterend    = VecScatterEnd_PtoP;
  ctx->pipelinebegin  = PtoPPipelinebegin;
  ctx->pipelineend    = PtoPPipelineend;
  ctx->copy           = PtoPCopy;
  ctx->view           = VecPipelineView_MPI_ToAll;
  return 0;
}

/* ----------------------------------------------------------------*/
/*
    VecPipelineCreate_StoP - Sets up a pipeline from local Seq vectors to a
    parallel vector.

    Input Parameters:
.   nx    - number of (inidx,inidy) index pairs that are classified and packed
.   inidx - indices into the local (sequential) source vector
.   ny    - number of target indices; sizes the send buffers and bounds the
            loop that builds the on-processor part (the callers in this file
            pass ny == nx)
.   inidy - global indices into the parallel target vector yin
.   yin   - the parallel target vector; supplies ownership ranges, rank, size,
            message tag and communicator
.   ctx   - pipeline context whose todata, fromdata and function pointers are
            filled in

    Returns 0 on success.  Allocation failures and out-of-range indices are
    reported through the CHKPTRQ()/SETERRQ() macros.

    Notes:
    Every process tells the owners of its target indices which entries of yin
    it is going to contribute to.  The "to" context describes what this
    process sends (entries of the local vector, grouped by destination); the
    "from" context describes where incoming values land in the locally owned
    part of yin.  Pairs whose target is owned by this process are kept out of
    the messages and are stored in the local.slots arrays of both contexts.
 */
#undef __FUNC__
#define __FUNC__ "VecPipelineCreate_StoP"
int VecPipelineCreate_StoP(int nx,int *inidx,int ny,int *inidy,Vec yin,VecPipeline ctx)
{
  Vec_MPI        *y = (Vec_MPI *)yin->data;
  VecPipeline_MPI_General *from,*to;
  int            *source,nprocslocal,*lens,rank = y->rank, *owners = y->ownership;
  int            size = y->size,*lowner,*start;
  int            *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
  int            *owner,*starts,count,tag = yin->tag,slen;
  int            *rvalues,*svalues,base,imdex,nmax,*values,len,found;
  MPI_Comm       comm = yin->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;

  /*  first count number of contributors to each processor:
      nprocs[j] = number of target indices owned by process j,
      procs[j]  = 1 if at least one target index is owned by process j,
      owner[i]  = owning process of inidy[i]                              */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((nx+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<nx; i++ ) {
    idx = inidy[i];
    found = 0;
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) {
      /* release the work arrays before bailing out so the error path does not leak */
      PetscFree(owner); PetscFree(nprocs);
      SETERRQ(1,0,"StoPPipelineCreate:Index out of range");
    }
  }
  /* entries owned by this process are handled without messages: remember how
     many there are and remove this process from the list of destinations */
  nprocslocal = nprocs[rank];
  nprocs[rank] = procs[rank] = 0; 
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];} 

  /* inform other processors of number of messages and max length*/
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
  nrecvs = work[rank]; 
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives: one slot of nmax ints per expected message */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to 
         the ith processor
  */
  svalues = (int *) PetscMalloc( (nx+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *)PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0; 
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  /* pack the global target indices, grouped by destination process */
  for ( i=0; i<nx; i++ ) {
    if (owner[i] != rank) {
      svalues[starts[owner[i]]++] = inidy[i];
    }
  }

  /* the packing loop advanced starts[]; rebuild the offsets before sending */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
    }
  }
  PetscFree(starts);

  /* allocate entire send Pipeline context: values, requests, indices, starts
     and procs are carved out of one allocation of len bytes */
  to = (VecPipeline_MPI_General *) PetscMalloc( sizeof(VecPipeline_MPI_General) ); CHKPTRQ(to);
  PLogObjectMemory(ctx,sizeof(VecPipeline_MPI_General));
  len = ny*(sizeof(int) + sizeof(Scalar)) + (nsends+1)*sizeof(int) +
        nsends*(sizeof(int) + sizeof(MPI_Request));
  to->n        = nsends; 
  to->nbelow   = 0;
  to->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(to->values); 
  PLogObjectMemory(ctx,len);
  to->requests = (MPI_Request *) (to->values + ny);
  to->indices  = (int *) (to->requests + nsends); 
  to->starts   = (int *) (to->indices + ny);
  to->procs    = (int *) (to->starts + nsends + 1);
  to->sstatus  = (MPI_Status *) PetscMalloc((1+nsends)*sizeof(MPI_Status));
                 CHKPTRQ(to->sstatus);
  ctx->todata  = (void *) to;

  /* move data into send scatter context:
     lowner[p] = position of process p in the compressed destination list,
     start[k]  = running insertion offset for the k-th destination          */
  lowner = (int *) PetscMalloc( (size+nsends+1)*sizeof(int) ); CHKPTRQ(lowner);
  start = lowner + size;
  count = 0; to->starts[0] = start[0] = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      lowner[i]          = count;
      to->procs[count++] = i;
      to->starts[count]  = start[count] = start[count-1] + nprocs[i];
    }
  }
  /* the send side stores the local source indices, in the same order in which
     the matching target indices were packed into svalues */
  for ( i=0; i<nx; i++ ) {
    if (owner[i] != rank) {
      to->indices[start[lowner[owner[i]]]++] = inidx[i];
    }
  }
  PetscFree(lowner); PetscFree(owner); PetscFree(nprocs);

  /* first global index owned by this process; used to localize received indices */
  base = owners[rank];

  /*  wait on receives */
  lens = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* record length and sender of the message that just completed */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;
    count--;
  }
  PetscFree(recv_waits); 
 
  /* allocate entire receive scatter context (same single-allocation layout as
     the send context, sized by the total number of received indices) */
  from = (VecPipeline_MPI_General *) PetscMalloc( sizeof(VecPipeline_MPI_General) ); CHKPTRQ(from);
  PLogObjectMemory(ctx,sizeof(VecPipeline_MPI_General));
  len = slen*(sizeof(int) + sizeof(Scalar)) + (nrecvs+1)*sizeof(int) +
        nrecvs*(sizeof(int) + sizeof(MPI_Request));
  from->n        = nrecvs; 
  from->nbelow   = 0;
  from->values   = (Scalar *) PetscMalloc( len ); CHKPTRQ(from->values);
  PLogObjectMemory(ctx,len);
  from->requests = (MPI_Request *) (from->values + slen);
  from->indices  = (int *) (from->requests + nrecvs); 
  from->starts   = (int *) (from->indices + slen);
  from->procs    = (int *) (from->starts + nrecvs + 1);
  ctx->fromdata  = (void *) from;

  /* move the data into the receive scatter context; received indices are
     global, so subtract base to obtain offsets into the local part of yin */
  from->starts[0] = 0;
  for ( i=0; i<nrecvs; i++ ) {
    from->starts[i+1] = from->starts[i] + lens[i];
    from->procs[i]    = source[i];
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      from->indices[from->starts[i] + j] = values[j] - base;
    }
  }
  PetscFree(rvalues); PetscFree(lens);
    
  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  if (nprocslocal) {
    int nt;
    /* we have a scatter to ourselves */
    from->local.n = to->local.n = nt = nprocslocal;    
    from->local.slots = (int *) PetscMalloc(nt*sizeof(int));CHKPTRQ(from->local.slots);
    to->local.slots = (int *) PetscMalloc(nt*sizeof(int));CHKPTRQ(to->local.slots);
    PLogObjectMemory(ctx,2*nt*sizeof(int));
    nt = 0;
    for ( i=0; i<ny; i++ ) {
      idx = inidy[i];
      if (idx >= owners[rank] && idx < owners[rank+1]) {
        from->local.slots[nt] = idx - owners[rank];        
        to->local.slots[nt++] = inidx[i];        
      }
    }
  }
  else {
    from->local.n = 0; from->local.slots = 0;
    to->local.n = 0; to->local.slots = 0;
  }

  to->type = VEC_SCATTER_MPI_GENERAL; from->type = VEC_SCATTER_MPI_GENERAL;

  /* NOTE(review): scatterbegin/scatterend are not assigned here - confirm
     whether that is intended for this kind of pipeline */
  ctx->destroy           = PtoPPipelineDestroy;
  ctx->pipelinebegin     = PtoPPipelinebegin;
  ctx->pipelineend       = PtoPPipelineend;
  ctx->copy              = 0;
  ctx->view              = VecPipelineView_MPI_ToAll;
  return 0;
}

/* ---------------------------------------------------------------------------------*/
/*
    VecPipelineCreate_PtoP - Sets up a pipeline between two parallel vectors.

    nx,inidx - number of index pairs and the global source indices into xin
    ny,inidy - inidy holds the global target indices into yin; ny itself is
               not read by this routine
    xin      - parallel source vector; supplies ownership ranges, rank, size,
               message tag and communicator
    yin      - parallel target vector, handed on to VecPipelineCreate_StoP()
    ctx      - pipeline context, filled in by VecPipelineCreate_StoP()

    Returns 0 on success; errors are reported through the CHKPTRQ(),
    SETERRQ() and CHKERRQ() macros.

    The (source,target) index pairs are first shipped to the process that
    owns the source entry (this includes pairs that stay on the sending
    process, which are sent as ordinary messages as well).  Each process
    then holds pairs whose source index is local and builds the actual
    pipeline with VecPipelineCreate_StoP().
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineCreate_PtoP"
int VecPipelineCreate_PtoP(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,VecPipeline ctx)
{
  Vec_MPI        *x = (Vec_MPI *)xin->data;
  int            *lens,rank = x->rank, *owners = x->ownership,size = x->size,found;
  int            *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work,*local_inidx,*local_inidy;
  int            *owner,*starts,count,tag = xin->tag,slen,ierr;
  int            *rvalues,*svalues,base,imdex,nmax,*values;
  MPI_Comm       comm = xin->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status;

  /*
  Each processor ships off its inidx[j] and inidy[j] to the processor that
  owns entry inidx[j] of xin.  The receivers then call
  VecPipelineCreate_StoP() with the pairs they collected.
  */
  /*  first count number of contributors to each processor:
      nprocs[j] = number of pairs going to process j,
      procs[j]  = 1 if anything goes to process j,
      owner[i]  = owning process of inidx[i]                */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((nx+1)*sizeof(int)); CHKPTRQ(owner);
  for ( i=0; i<nx; i++ ) {
    idx = inidx[i];
    found = 0;
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j] && idx < owners[j+1]) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(1,0,"PtoPPipelineCreate:Index out of range");
  }
  /* number of destination processes (this process is not excluded) */
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];} 

  /* inform other processors of number of messages and max length*/
  work = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);
  nrecvs = work[rank]; 
  MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);
  nmax = work[rank];
  PetscFree(work);

  /* post receives: every message carries pairs, hence 2*nmax ints per slot */
  rvalues = (int *) PetscMalloc(2*(nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
  }

  /* do sends:
      1) starts[i] gives the starting index (counted in pairs) in svalues
         for stuff going to the ith processor
  */
  svalues = (int *) PetscMalloc( 2*(nx+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *)PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0; 
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  /* pack interleaved (inidx,inidy) pairs, grouped by destination process */
  for ( i=0; i<nx; i++ ) {
    svalues[2*starts[owner[i]]]       = inidx[i];
    svalues[1 + 2*starts[owner[i]]++] = inidy[i];
  }
  PetscFree(owner);

  /* the packing loop advanced starts[]; rebuild the offsets before sending */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];} 
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
    }
  }
  PetscFree(starts);
  PetscFree(nprocs);

  /* first global index of xin owned by this process */
  base = owners[rank];

  /*  wait on receives */
  lens = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  count = nrecvs; slen = 0;
  while (count) {
    MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
    /* record how many pairs the completed message carried (n counts ints) */
    MPI_Get_count(&recv_status,MPI_INT,&n);
    lens[imdex]  =  n/2;
    slen         += n/2;
    count--;
  }
  PetscFree(recv_waits); 
  
  /* one allocation holds both index lists: local_inidx first, local_inidy after it */
  local_inidx = (int *) PetscMalloc( 2*(slen+1)*sizeof(int) ); CHKPTRQ(local_inidx);
  local_inidy = local_inidx + slen;

  /* unpack the pairs: source indices become offsets into the local part of
     xin (global index minus base), target indices are kept as received */
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + 2*i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      local_inidx[count]   = values[2*j] - base;
      local_inidy[count++] = values[2*j+1];
    }
  }
  PetscFree(rvalues); 
  PetscFree(lens);
 
  /* wait on sends */
  if (nsends) {
    MPI_Status *send_status;
    send_status = (MPI_Status *)PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    MPI_Waitall(nsends,send_waits,send_status);
    PetscFree(send_status);
  }
  PetscFree(send_waits);
  PetscFree(svalues);

  /*
     should sort and remove duplicates from local_inidx,local_inidy 
  */
  /* NOTE(review): local_inidx is not freed if the call below returns an error */
  ierr = VecPipelineCreate_StoP(slen,local_inidx,slen,local_inidy,yin,ctx); CHKERRQ(ierr);
  PetscFree(local_inidx);

  return 0;
}

/* --------------------------------------------------------------*/
/*@C
   VecPipelineCreate - Creates a vector pipeline context.

   Input Parameters:
.  xin - the vector from which we pipeline
.  ix - the indices of xin to pipeline
.  yin - the vector to which we pipeline
.  iy - the indices of yin to hold results

   Output Parameter:
.  newctx - location to store the new pipeline context

   Notes:
   A VecPipeline context CANNOT be used in two or more simultaneous pipelines.
   In this case a separate VecPipeline is needed for each concurrent pipeline.

   The kind of context that is built is selected from the types of xin and
   yin (VECSEQ or VECMPI) and from the types of the index sets ix and iy.
   Combinations that are not handled produce an error.  ix and iy must
   contain the same number of indices.

.keywords: vector, pipeline, context, create

.seealso: VecPipelineDestroy()
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineCreate"
int VecPipelineCreate(Vec xin,IS ix,Vec yin,IS iy,VecPipeline *newctx)
{
  VecPipeline ctx;
  int        len,size,cando,islocal,totalv,ierr; 
  MPI_Comm   comm = xin->comm;
  PetscTruth ixblock,iyblock;

  /* next 2 lines insure that we use parallel comm if it exists */
  MPI_Comm_size(yin->comm,&size);
  if (size > 1) comm = yin->comm; 

  /* generate the Pipeline context */
  PetscHeaderCreate(ctx,_VecPipeline,VEC_PIPELINE_COOKIE,0,comm);
  PLogObjectCreate(ctx);
  PLogObjectMemory(ctx,sizeof(struct _VecPipeline));
  ctx->inuse = 0;

  /* remember the local sizes of xin and yin; VecPipelineBegin() compares the
     vectors it is given against these when PETSC_DEBUG is defined */
  VecGetLocalSize_Fast(xin,ctx->to_n);
  VecGetLocalSize_Fast(yin,ctx->from_n);

  /* ---------------------------------------------------------------------------*/
  /* sequential vector to sequential vector */
  /* NOTE(review): in all four sub-cases below the assignments of
     ctx->pipelinebegin and ctx->destroy are commented out, so the returned
     context has neither routine set here - confirm this is intended */
  if (xin->type == VECSEQ && yin->type == VECSEQ) {
    if (ix->type == IS_GENERAL && iy->type == IS_GENERAL){
      int                    nx,ny,*idx,*idy;
      VecPipeline_Seq_General *to,*from;

      ISGetSize(ix,&nx); ISGetIndices(ix,&idx);
      ISGetSize(iy,&ny); ISGetIndices(iy,&idy);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      /* the slot array is stored directly behind the struct in one allocation */
      len               = sizeof(VecPipeline_Seq_General) + nx*sizeof(int);
      to                = (VecPipeline_Seq_General *) PetscMalloc(len); CHKPTRQ(to)
      PLogObjectMemory(ctx,len);
      to->slots         = (int *) (to + 1); 
      to->n             = nx; 
      PetscMemcpy(to->slots,idy,nx*sizeof(int));
      from              = (VecPipeline_Seq_General *) PetscMalloc(len); CHKPTRQ(from);
      from->slots       = (int *) (from + 1);
      from->n           = nx; 
      PetscMemcpy(from->slots,idx,nx*sizeof(int));
      to->type          = VEC_SCATTER_SEQ_GENERAL; 
      from->type        = VEC_SCATTER_SEQ_GENERAL; 
      ctx->todata       = (void *) to; 
      ctx->fromdata     = (void *) from;
/*      ctx->pipelinebegin = VecPipelineBegin_SGtoSG; 
      ctx->destroy      = VecPipelineDestroy_SGtoSG;*/
      ctx->pipelineend   = 0; 
      ctx->copy         = 0;
      *newctx           = ctx;
      return 0;
    }
    else if (ix->type == IS_STRIDE &&  iy->type == IS_STRIDE){
      int                    nx,ny,to_first,to_step,from_first,from_step;
      VecPipeline_Seq_Stride  *from,*to;

      ISGetSize(ix,&nx); ISStrideGetInfo(ix,&from_first,&from_step);
      ISGetSize(iy,&ny); ISStrideGetInfo(iy,&to_first,&to_step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      to                = PetscNew(VecPipeline_Seq_Stride); CHKPTRQ(to);
      to->n             = nx; 
      to->first         = to_first; 
      to->step          = to_step;
      from              = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(from);
      PLogObjectMemory(ctx,2*sizeof(VecPipeline_Seq_Stride));
      from->n           = nx;
      from->first       = from_first; 
      from->step        = from_step;
      to->type          = VEC_SCATTER_SEQ_STRIDE; 
      from->type        = VEC_SCATTER_SEQ_STRIDE; 
      ctx->todata       = (void *) to; 
      ctx->fromdata     = (void *) from;
/*      ctx->pipelinebegin = VecPipelineBegin_SStoSS; 
      ctx->destroy      = VecPipelineDestroy_SGtoSG;*/
      ctx->pipelineend   = 0; 
      ctx->copy         = 0;
      *newctx           = ctx;
      return 0;
    }
    else if (ix->type == IS_GENERAL && iy->type == IS_STRIDE){
      int                    nx,ny,*idx,first,step;
      VecPipeline_Seq_General *from;
      VecPipeline_Seq_Stride  *to;

      ISGetSize(ix,&nx); ISGetIndices(ix,&idx);
      ISGetSize(iy,&ny); ISStrideGetInfo(iy,&first,&step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      to              = PetscNew(VecPipeline_Seq_Stride); CHKPTRQ(to);
      to->n           = nx; 
      to->first       = first; 
      to->step        = step;
      len             = sizeof(VecPipeline_Seq_General) + nx*sizeof(int);
      from            = (VecPipeline_Seq_General *) PetscMalloc(len); CHKPTRQ(from);
      PLogObjectMemory(ctx,len + sizeof(VecPipeline_Seq_Stride));
      from->slots     = (int *) (from + 1); 
      from->n         = nx; 
      PetscMemcpy(from->slots,idx,nx*sizeof(int));
      ctx->todata     = (void *) to; ctx->fromdata = (void *) from;
/*      if (step == 1)  ctx->pipelinebegin = VecPipelineBegin_SGtoSS_Stride1;
      else            ctx->pipelinebegin = VecPipelineBegin_SGtoSS;
      ctx->destroy    = VecPipelineDestroy_SGtoSG;*/
      ctx->pipelineend = 0; 
      ctx->copy       = 0;
      to->type        = VEC_SCATTER_SEQ_STRIDE; 
      from->type      = VEC_SCATTER_SEQ_GENERAL;
      *newctx         = ctx;
      return 0;
    }
    else if (ix->type == IS_STRIDE && iy->type == IS_GENERAL){
      int                    nx,ny,*idx,first,step;
      VecPipeline_Seq_General *to;
      VecPipeline_Seq_Stride  *from;

      /* here idx holds the indices of iy (the general set); the stride
         information is taken from ix */
      ISGetSize(ix,&nx); ISGetIndices(iy,&idx);
      ISGetSize(iy,&ny); ISStrideGetInfo(ix,&first,&step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      from            = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(from);
      from->n         = nx; 
      from->first     = first; 
      from->step      = step;
      len             = sizeof(VecPipeline_Seq_General) + nx*sizeof(int);
      to              = (VecPipeline_Seq_General *) PetscMalloc(len); CHKPTRQ(to);
      PLogObjectMemory(ctx,len + sizeof(VecPipeline_Seq_Stride));
      to->slots       = (int *) (to + 1); 
      to->n           = nx; 
      PetscMemcpy(to->slots,idx,nx*sizeof(int));
      ctx->todata     = (void *) to; 
      ctx->fromdata   = (void *) from;
/*      if (step == 1) ctx->pipelinebegin = VecPipelineBegin_SStoSG_Stride1; 
      else           ctx->pipelinebegin = VecPipelineBegin_SStoSG; 
      ctx->destroy    = VecPipelineDestroy_SGtoSG;*/
      ctx->pipelineend = 0; 
      ctx->copy       = 0;
      to->type        = VEC_SCATTER_SEQ_GENERAL; 
      from->type      = VEC_SCATTER_SEQ_STRIDE; 
      *newctx         = ctx;
      return 0;
    }
    else {
      SETERRQ(1,0,"VecPipelineCreate:Cannot generate such a pipeline context yet");
    }
  }
  /* ---------------------------------------------------------------------------*/
  /* parallel vector to sequential vector */
  if (xin->type == VECMPI && yin->type == VECSEQ) {
    islocal = 0;
    /* special case extracting (subset of) local portion */ 
    if (ix->type == IS_STRIDE && iy->type == IS_STRIDE){
      Vec_MPI               *x = (Vec_MPI *)xin->data;
      int                   nx,ny,to_first,to_step,from_first,from_step;
      int                   start = x->ownership[x->rank], end = x->ownership[x->rank+1];
      VecPipeline_Seq_Stride *from,*to;

      ISGetSize(ix,&nx); ISStrideGetInfo(ix,&from_first,&from_step);
      ISGetSize(iy,&ny); ISStrideGetInfo(iy,&to_first,&to_step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      /* local only if all requested source indices lie in [start,end); the
         shortcut is taken only when this holds on every process */
      if (ix->min >= start && ix->max < end ) islocal = 1; else islocal = 0;
      MPI_Allreduce( &islocal, &cando,1,MPI_INT,MPI_LAND,xin->comm);
      if (cando) {
        to                = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(to);
        to->n             = nx; 
        to->first         = to_first;
        to->step          = to_step;
        from              = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(from);
        PLogObjectMemory(ctx,2*sizeof(VecPipeline_Seq_Stride));
        from->n           = nx; 
        /* shift the global start index to an offset into the local part */
        from->first       = from_first-start; 
        from->step        = from_step;
        to->type          = VEC_SCATTER_SEQ_STRIDE; 
        from->type        = VEC_SCATTER_SEQ_STRIDE; 
        ctx->todata       = (void *) to; 
        ctx->fromdata     = (void *) from;
        /* NOTE(review): all four function pointer assignments below are
           commented out, so none of them is set in this branch - confirm */
/*        ctx->pipelinebegin = VecPipelineBegin_SStoSS; 
        ctx->destroy      = VecPipelineDestroy_SGtoSG;
        ctx->pipelineend   = 0; 
        ctx->copy         = VecPipelineCopy_PStoSS;*/
        *newctx           = ctx;
        return 0;
      }
    }
    else {
      /* take part in the same reduction as the stride branch above, with
         islocal == 0; the result is overwritten by the next reduction */
      MPI_Allreduce( &islocal, &cando,1,MPI_INT,MPI_LAND,xin->comm);
    }
    /* test for special case of all processors getting entire vector */
    totalv = 0;
    if (ix->type == IS_STRIDE && iy->type == IS_STRIDE){
      Vec_MPI              *x = (Vec_MPI *)xin->data;
      int                  i,nx,ny,to_first,to_step,from_first,from_step,*count;
      VecPipeline_MPI_ToAll *sto;

      ISGetSize(ix,&nx); ISStrideGetInfo(ix,&from_first,&from_step);
      ISGetSize(iy,&ny); ISStrideGetInfo(iy,&to_first,&to_step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      /* qualifies when nx equals x->N and both strides start at 0 with step 1 */
      if (nx != x->N) {
        totalv = 0;
      } else if (from_first == 0        && from_step == 1 && 
                 from_first == to_first && from_step == to_step){
        totalv = 1; 
      } else totalv = 0;
      MPI_Allreduce(&totalv,&cando,1,MPI_INT,MPI_LAND,xin->comm);

      if (cando) {
        MPI_Comm_size(ctx->comm,&size);
        sto   = PetscNew(VecPipeline_MPI_ToAll);CHKPTRQ(sto);
        /* count[i] = number of entries owned by process i */
        count = (int *) PetscMalloc(size*sizeof(int)); CHKPTRQ(count);
        for ( i=0; i<size; i++ ) {
	  count[i] = x->ownership[i+1]-x->ownership[i];
        }
        sto->count        = count;
        sto->work1        = 0;
        sto->work2        = 0;
        sto->type         = VEC_SCATTER_MPI_TOALL;
        PLogObjectMemory(ctx,sizeof(VecPipeline_MPI_ToAll)+size*sizeof(int));
        ctx->todata       = (void *) sto;
        ctx->fromdata     = 0;
        ctx->scatterbegin = VecScatterBegin_MPI_ToAll;   
        ctx->pipelinebegin = VecPipelineBegin_MPI_ToAll;   
        ctx->destroy      = VecPipelineDestroy_MPI_ToAll;
        ctx->pipelineend   = 0;
        ctx->scatterend   = 0;
        ctx->copy         = VecPipelineCopy_MPI_ToAll;
        *newctx           = ctx;
        return 0;
      }
    }
    else {
      /* take part in the same reduction as the stride branch above, with
         totalv == 0; the result is not used afterwards */
      MPI_Allreduce( &totalv, &cando,1,MPI_INT,MPI_LAND,xin->comm);
    }
    ierr = ISBlock(ix,&ixblock); CHKERRQ(ierr);
    ierr = ISBlock(iy,&iyblock); CHKERRQ(ierr);
    /* test for blocked indices */
    if (ixblock && iyblock) {
      int nx, ny, *idx, *idy, bsx, bsy;
      ierr = ISBlockGetBlockSize(ix,&bsx); CHKERRQ(ierr);
      ierr = ISBlockGetBlockSize(iy,&bsy); CHKERRQ(ierr);
      /* NOTE(review): only block size 5 is accepted, and the block indices
         are passed to the same routine as the general case - confirm */
      if (bsx == bsy && bsx == 5) {
        ISBlockGetSize(ix,&nx); ISBlockGetIndices(ix,&idx);
        ISBlockGetSize(iy,&ny); ISBlockGetIndices(iy,&idy);
        if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
        ierr = VecPipelineCreate_PtoS(nx,idx,ny,idy,xin,ctx); CHKERRQ(ierr);
        ISBlockRestoreIndices(ix,&idx);
        ISBlockRestoreIndices(iy,&idy);
        *newctx = ctx;
        return 0;
      }
    }
    /* left over general case */
    {
      int nx,ny,*idx,*idy;
      ISGetSize(ix,&nx); ISGetIndices(ix,&idx);
      ISGetSize(iy,&ny); ISGetIndices(iy,&idy);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      ierr = VecPipelineCreate_PtoS(nx,idx,ny,idy,xin,ctx); CHKERRQ(ierr);
      ISRestoreIndices(ix,&idx);
      ISRestoreIndices(iy,&idy);
      *newctx = ctx;
      return 0;
    }
  }
  /* ---------------------------------------------------------------------------*/
  /* sequential vector to parallel vector */
  if (xin->type == VECSEQ && yin->type == VECMPI) {
    /* special case local copy portion */ 
    islocal = 0;
    if (ix->type == IS_STRIDE && iy->type == IS_STRIDE){
      Vec_MPI               *y = (Vec_MPI *)yin->data;
      int                   nx,ny,to_first,to_step,from_step,start=y->ownership[y->rank];
      int                   end = y->ownership[y->rank+1],from_first;
      VecPipeline_Seq_Stride *from,*to;

      ISGetSize(ix,&nx); ISStrideGetInfo(ix,&from_first,&from_step);
      ISGetSize(iy,&ny); ISStrideGetInfo(iy,&to_first,&to_step);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      /* local only if all target indices lie in [start,end); the shortcut is
         taken only when this holds on every process */
      if (iy->min >= start && iy->max < end ) islocal = 1; else islocal = 0;
      MPI_Allreduce( &islocal, &cando,1,MPI_INT,MPI_LAND,yin->comm);
      if (cando) {
        to                = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(to);
        to->n             = nx; 
        /* shift the global start index to an offset into the local part */
        to->first         = to_first-start; 
        to->step          = to_step;
        from              = PetscNew(VecPipeline_Seq_Stride);CHKPTRQ(from);
        PLogObjectMemory(ctx,2*sizeof(VecPipeline_Seq_Stride));
        from->n           = nx; 
        from->first       = from_first; 
        from->step        = from_step;
        to->type          = VEC_SCATTER_SEQ_STRIDE; 
        from->type        = VEC_SCATTER_SEQ_STRIDE;
        ctx->todata       = (void *) to;
        ctx->fromdata     = (void *) from;
        /* NOTE(review): pipelinebegin and destroy are not set here either */
/*        ctx->pipelinebegin = VecPipelineBegin_SStoSS; 
        ctx->destroy      = VecPipelineDestroy_SGtoSG;*/
        ctx->pipelineend   = 0;  
        ctx->copy         = 0;
        *newctx           = ctx;
        return 0;
      }
    }
    else {
      /* take part in the same reduction as the stride branch above; the
         result is not used afterwards */
      MPI_Allreduce( &islocal, &cando,1,MPI_INT,MPI_LAND,yin->comm);
    }
    /* general case */
    {
      int nx,ny,*idx,*idy;
      ISGetSize(ix,&nx); ISGetIndices(ix,&idx);
      ISGetSize(iy,&ny); ISGetIndices(iy,&idy);
      if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
      ierr = VecPipelineCreate_StoP(nx,idx,ny,idy,yin,ctx); CHKERRQ(ierr);
      ISRestoreIndices(ix,&idx); ISRestoreIndices(iy,&idy);
      *newctx = ctx;
      return 0;
    }
  }
  /* ---------------------------------------------------------------------------*/
  /* parallel vector to parallel vector */
  if (xin->type == VECMPI && yin->type == VECMPI) {
    /* no special cases for now */
    int nx,ny,*idx,*idy;
    ISGetSize(ix,&nx); ISGetIndices(ix,&idx);
    ISGetSize(iy,&ny); ISGetIndices(iy,&idy);
    if (nx != ny) SETERRQ(1,0,"VecPipelineCreate:Local pipeline sizes don't match");
    ierr    = VecPipelineCreate_PtoP(nx,idx,ny,idy,xin,yin,ctx); CHKERRQ(ierr);
    ISRestoreIndices(ix,&idx); 
    ISRestoreIndices(iy,&idy);
    *newctx = ctx;
    return 0;
  }
  /* none of the vector type combinations above matched */
  SETERRQ(1,0,"VecPipelineCreate:Cannot generate such Pipeline Context yet");
}

/* ------------------------------------------------------------------*/
/*@
   VecPipelineBegin - Begins a generalized pipeline from one vector to
   another. Complete the pipelining phase with VecPipelineEnd().

   Input Parameters:
.  x - the vector from which we pipeline
.  y - the vector to which we pipeline
.  addv - either ADD_VALUES or INSERT_VALUES, depending whether values are
   added or set
.  mode - the pipelining mode, usually PIPELINE_ALL.  The available modes are:
$    PIPELINE_ALL, PIPELINE_REVERSE
.  inctx - pipeline context generated by VecPipelineCreate()

   Output Parameter:
.  y - the vector to which we pipeline

   Notes:
   y[iy[i]] = x[ix[i]], for i=0,...,ni-1

   This pipeline is far more general than the conventional
   pipeline, since it can be a gather or a pipeline or a combination,
   depending on the indices ix and iy.  If x is a parallel vector and y
   is sequential, VecPipelineBegin() can serve to gather values to a
   single processor.  Similarly, if y is parallel and x sequential, the
   routine can pipeline from one processor to many processors.

   An error is generated if the context is still in use, that is if a
   previous VecPipelineBegin() has not been completed with VecPipelineEnd(),
   or if the context has no begin routine installed.  When PETSC_DEBUG is
   defined, the local sizes of x and y are also checked against the sizes
   recorded in the context (swapped for PIPELINE_REVERSE).

.keywords: vector, pipeline, gather, begin

.seealso: VecPipelineCreate(), VecPipelineEnd()
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineBegin"
int VecPipelineBegin(Vec x,Vec y,InsertMode addv,PipelineMode mode,VecPipeline inctx)
{
  int ierr;
  PetscValidHeaderSpecific(x,VEC_COOKIE); PetscValidHeaderSpecific(y,VEC_COOKIE);
  PetscValidHeaderSpecific(inctx,VEC_PIPELINE_COOKIE);
  if (inctx->inuse) SETERRQ(1,0,"VecPipelineBegin: Pipeline ctx already in use");
  /* not every creation path installs a begin routine; fail with an error
     instead of calling through a null function pointer */
  if (!inctx->pipelinebegin) SETERRQ(1,0,"VecPipelineBegin: Pipeline ctx has no begin routine");
#if defined(PETSC_DEBUG)
  {
    int to_n,from_n;
    VecGetLocalSize_Fast(x,to_n);
    VecGetLocalSize_Fast(y,from_n);
    if (mode == PIPELINE_REVERSE) {
      /* in reverse mode the roles of the two recorded sizes are swapped */
      if (to_n != inctx->from_n) SETERRQ(1,0,"VecPipelineBegin:Vector wrong size for pipeline");
      if (from_n != inctx->to_n) SETERRQ(1,0,"VecPipelineBegin:Vector wrong size for pipeline");
    } else {
      if (to_n != inctx->to_n) SETERRQ(1,0,"VecPipelineBegin:Vector wrong size for pipeline");
      if (from_n != inctx->from_n) SETERRQ(1,0,"VecPipelineBegin:Vector wrong size for pipeline");
    }
  }
#endif

  /* mark the context busy; VecPipelineEnd() clears the flag again */
  inctx->inuse = 1;
  PLogEventBegin(VEC_PipelineBegin,inctx,x,y,0);
  ierr = (*inctx->pipelinebegin)(x,y,addv,mode,inctx); CHKERRQ(ierr);
  PLogEventEnd(VEC_PipelineBegin,inctx,x,y,0);
  return 0;
}

/* --------------------------------------------------------------------*/
/*@
   VecPipelineEnd - Completes a generalized pipeline from one vector to
   another that was started with VecPipelineBegin().

   Input Parameters:
.  x - the vector from which we pipeline
.  y - the vector to which we pipeline
.  addv - either ADD_VALUES or INSERT_VALUES, depending whether values are
   added or set
.  mode - the pipelining mode, usually PIPELINE_ALL.  The available modes are:
$    PIPELINE_ALL, PIPELINE_REVERSE
.  ctx - pipeline context generated by VecPipelineCreate()

   Output Parameter:
.  y - the vector to which we pipeline

   Notes:
   y[iy[i]] = x[ix[i]], for i=0,...,ni-1

   The context is marked as no longer in use.  If the context has no end
   routine, nothing else is done.

.keywords: vector, pipeline, gather, end

.seealso: VecPipelineBegin(), VecPipelineCreate()
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineEnd"
int VecPipelineEnd(Vec x,Vec y,InsertMode addv,PipelineMode mode, VecPipeline ctx)
{
  int ierr;

  PetscValidHeaderSpecific(x,VEC_COOKIE);
  PetscValidHeaderSpecific(y,VEC_COOKIE);
  PetscValidHeaderSpecific(ctx,VEC_PIPELINE_COOKIE);

  /* release the context before anything else, even if there is no end routine */
  ctx->inuse = 0;

  if (ctx->pipelineend) {
    PLogEventBegin(VEC_PipelineEnd,ctx,x,y,0);
    ierr = (*ctx->pipelineend)(x,y,addv,mode,ctx); CHKERRQ(ierr);
    PLogEventEnd(VEC_PipelineEnd,ctx,x,y,0);
  }
  return 0;
}

/*@C
   VecPipelineDestroy - Destroys a pipeline context that was created with
   VecPipelineCreate(), using the destroy routine stored in the context.

   Input Parameter:
.  ctx - the pipeline context

   Notes:
   The return value is that of the destroy routine of the context.

.keywords: vector, pipeline, context, destroy

.seealso: VecPipelineCreate(), VecPipelineCopy()
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineDestroy"
int VecPipelineDestroy( VecPipeline ctx )
{
  int ierr;

  PetscValidHeaderSpecific(ctx,VEC_PIPELINE_COOKIE);
  ierr = (*ctx->destroy)((PetscObject)ctx);
  return ierr;
}

/*@C
   VecPipelineCopy - Creates a new pipeline context as a copy of an
   existing one.

   Input Parameter:
.  sctx - the pipeline context to copy

   Output Parameter:
.  ctx - location that receives the new context

   Notes:
   An error is generated if the source context has no copy routine.
   Otherwise a new header is created on the communicator of sctx, the
   recorded vector sizes are taken over, and the copy routine of sctx
   fills in the rest; its return value is returned.

.keywords: vector, pipeline, copy, context

.seealso: VecPipelineCreate(), VecPipelineDestroy()
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineCopy"
int VecPipelineCopy( VecPipeline sctx,VecPipeline *ctx )
{
  VecPipeline dup;

  PetscValidHeaderSpecific(sctx,VEC_PIPELINE_COOKIE);
  PetscValidPointer(ctx);
  if (!sctx->copy) SETERRQ(1,0,"VecPipelineCopy: cannot copy this type");

  /* create the new object directly in the caller's location */
  PetscHeaderCreate(*ctx,_VecPipeline,VEC_PIPELINE_COOKIE,0,sctx->comm);
  dup = *ctx;
  PLogObjectCreate(dup);
  PLogObjectMemory(dup,sizeof(struct _VecPipeline));

  dup->to_n   = sctx->to_n;
  dup->from_n = sctx->from_n;

  /* the type-specific routine copies everything else */
  return (*sctx->copy)(sctx,dup);
}


/* ------------------------------------------------------------------*/
/*@
   VecPipelineView - Views a vector pipeline context.

   Input Parameters:
.  ctx - the pipeline context
.  viewer - the viewer for displaying the context

   Notes:
   Contexts without a view routine are silently accepted and 0 is
   returned; otherwise the result of the view routine is returned.

.keywords: vector, pipeline, view
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineView"
int VecPipelineView(VecPipeline ctx, Viewer viewer)
{
  PetscValidHeaderSpecific(ctx,VEC_PIPELINE_COOKIE);
  PetscValidHeaderSpecific(viewer,VIEWER_COOKIE);

  /* nothing to show for contexts that did not install a view routine */
  if (!ctx->view) return 0;
  return (*ctx->view)((PetscObject)ctx,viewer);
}

/*@
   VecPipelineRemap - Remaps the "to" indices in a vector pipeline
   context. FOR EXPERTS ONLY!

  Input Parameters:
.   scat  - vector pipeline context
.   rto   - remapping for "to" indices (may be PETSC_NULL); every stored
            index i is replaced by rto[i]
.   rfrom - remapping for "from" indices; must be PETSC_NULL, since
            remapping the "from" side generates an error

   Notes:
   Only contexts whose "to" side is of type VEC_SCATTER_SEQ_GENERAL or
   VEC_SCATTER_MPI_GENERAL can be remapped.  Contexts of type
   VEC_SCATTER_MPI_TOALL are always rejected.

.keywords: Vec, pipeline, remap
@*/
#undef __FUNC__
#define __FUNC__ "VecPipelineRemap"
int VecPipelineRemap(VecPipeline scat,int *rto,int *rfrom)
{
  VecPipeline_Seq_General *seq;
  VecPipeline_MPI_General *par;
  int                     k,nremote;

  PetscValidHeaderSpecific(scat,VEC_PIPELINE_COOKIE);
  if (rto)   {PetscValidIntPointer(rto);}
  if (rfrom) {PetscValidIntPointer(rfrom);}

  /* two views of the same "to" data; the type field selects the valid one */
  seq = (VecPipeline_Seq_General *)scat->todata;
  par = (VecPipeline_MPI_General *)scat->todata;

  if (seq->type == VEC_SCATTER_MPI_TOALL) SETERRQ(1,0,"VecPipelineRemap:not for all copy pipelines");

  if (rto) {
    if (seq->type == VEC_SCATTER_MPI_GENERAL) {
      /* handle off processor parts */
      nremote = par->starts[par->n];
      for ( k=0; k<nremote; k++ ) {
        par->indices[k] = rto[par->indices[k]];
      }
      /* handle local part */
      seq = &par->local;
      for ( k=0; k<seq->n; k++ ) {
        seq->slots[k] = rto[seq->slots[k]];
      }
    } else if (seq->type == VEC_SCATTER_SEQ_GENERAL) {
      for ( k=0; k<seq->n; k++ ) {
        seq->slots[k] = rto[seq->slots[k]];
      }
    } else SETERRQ(1,0,"VecPipelineRemap:Unable to remap such pipelines");
  }
  if (rfrom) {
    SETERRQ(1,0,"VecPipelineRemap:Unable to remap the FROM in pipelines yet");
  }
  return 0;
}


#undef __FUNC__
#define __FUNC__ "VecPipelineSetCustomPipeline"
/*
   VecPipelineSetCustomPipeline - Stores the user supplied pipeline
   ordering in a pipeline context.

   Input Parameters:
.  ctx  - the pipeline context
.  up   - stored as the up_fun of ctx
.  dn   - stored as the dn_fun of ctx
.  info - stored as the custom_pipe_data of ctx

   The three values are stored as given; nothing is copied or checked.
   Always returns 0.
*/
int VecPipelineSetCustomPipeline(VecPipeline ctx,PipelineFunction up,
                                 PipelineFunction dn,PetscObject info)
{
  ctx->custom_pipe_data = info;
  ctx->up_fun           = up;
  ctx->dn_fun           = dn;
  return 0;
}

/* stuff moved from pcpset.c */
/* >>>> Routines for sequential ordering of processors <<<< */

/* Data handed to ProcUp/ProcDown for the sequential processor ordering */
typedef struct {
  int mytid;   /* rank of this process, filled in by PipelineSequentialSetup */
} Pipeline_sequential_info;

#undef __FUNC__
#define __FUNC__ "ProcUp"
/*
   ProcUp - Sequential ordering test: returns 1 when the mytid recorded
   in pipe_info is strictly smaller than proc, and 0 otherwise.
   pipe_info must point to a Pipeline_sequential_info.
*/
static int ProcUp(int proc,PetscObject pipe_info)
{
  Pipeline_sequential_info *seq = (Pipeline_sequential_info *) pipe_info;

  return (seq->mytid < proc) ? 1 : 0;
}
/*
   ProcDown - Sequential ordering test: returns 1 when the mytid recorded
   in pipe_info is strictly larger than proc, and 0 otherwise.
   pipe_info must point to a Pipeline_sequential_info.
*/
static int ProcDown(int proc,PetscObject pipe_info)
{
  Pipeline_sequential_info *seq = (Pipeline_sequential_info *) pipe_info;

  return (seq->mytid > proc) ? 1 : 0;
}

#undef __FUNC__
#define __FUNC__ "PipelineSequentialSetup"
/*
   PipelineSequentialSetup - Allocates the data used by ProcUp/ProcDown.

   Input Parameter:
.  vs - the pipeline context; its communicator supplies the rank

   Output Parameter:
.  obj - the newly allocated Pipeline_sequential_info, cast to PetscObject

   Returns 0 on success; an error is generated if the allocation fails.
   Nothing in this routine releases the allocation again.
*/
static int PipelineSequentialSetup(VecPipeline vs,PetscObject *obj)
{
  Pipeline_sequential_info *info;

  /* check the allocation before the rank is written through it */
  info = PetscNew(Pipeline_sequential_info); CHKPTRQ(info);
  MPI_Comm_rank(vs->comm,&(info->mytid));
  *obj = (PetscObject) info;

  return 0;
}

/* >>>> Routines for multicolour ordering of processors <<<< */

/* Data handed to ProcColourUp/ProcColourDown for coloured processor orderings */
typedef struct {
  int mytid;          /* number of this process; used as index into proc_colours */
  int numtids;        /* number of processes; PCColourProcessors uses it as the array length */
  int *proc_colours;  /* proc_colours[p] is the colour assigned to process p */
} Pipeline_coloured_info;

/*
   ProcColourUp - Coloured ordering test: returns 1 when the colour of this
   process (proc_colours[mytid]) is strictly smaller than the colour of
   proc, and 0 otherwise.  pipe_info must point to a Pipeline_coloured_info.
*/
static int ProcColourUp(int proc,PetscObject pipe_info)
{
  Pipeline_coloured_info *info    = (Pipeline_coloured_info *) pipe_info;
  int                    *colours = info->proc_colours;

  return (colours[info->mytid] < colours[proc]) ? 1 : 0;
}
/*
   ProcColourDown - Coloured ordering test: returns 1 when the colour of this
   process (proc_colours[mytid]) is strictly larger than the colour of
   proc, and 0 otherwise.  pipe_info must point to a Pipeline_coloured_info.
*/
static int ProcColourDown(int proc,PetscObject pipe_info)
{
  Pipeline_coloured_info *info    = (Pipeline_coloured_info *) pipe_info;
  int                    *colours = info->proc_colours;

  return (colours[info->mytid] > colours[proc]) ? 1 : 0;
}
/*
   PipelineRedblackSetup - Allocates the data used by
   ProcColourUp/ProcColourDown and colours the processes alternately:
   process i receives colour i%2.

   Input Parameter:
.  vs - the pipeline context; its communicator supplies rank and size

   Output Parameter:
.  obj - the newly allocated Pipeline_coloured_info, cast to PetscObject

   Returns 0 on success; an error is generated if an allocation fails.
   Nothing in this routine releases the allocations again.
*/
#undef __FUNC__
#define __FUNC__ "PipelineRedblackSetup"
static int PipelineRedblackSetup(VecPipeline vs,PetscObject *obj)
{
  Pipeline_coloured_info *info;
  int                    numtids,i;

  /* check the allocation before any field is written through it */
  info = PetscNew(Pipeline_coloured_info); CHKPTRQ(info);
  MPI_Comm_rank(vs->comm,&(info->mytid));
  MPI_Comm_size(vs->comm,&numtids);
  /* record the size so the struct is fully initialised; other code
     (PCColourProcessors) reads numtids as the length of proc_colours */
  info->numtids = numtids;
  info->proc_colours = (int *) PetscMalloc(numtids*sizeof(int));
  CHKPTRQ(info->proc_colours);
  for (i=0; i<numtids; i++) {info->proc_colours[i] = i%2;}
  *obj = (PetscObject) info;

  return 0;
}

/*
   PCColourProcessors - Colours the processes so that no two processes
   that are connected in the pipeline receive the same colour.

   Input Parameters:
.  ctx       - the pipeline context; its todata and fromdata are read as
               VecPipeline_MPI_General and list the connected processes
.  pipe_info - a Pipeline_coloured_info whose mytid and numtids are
               already set; proc_colours is allocated and filled in here

   Notes:
   Each process marks its connections in its own row of a
   numtids x numtids matrix, the rows are merged with an MPI_Allreduce
   over the communicator of ctx, and a greedy colouring is then computed
   from the merged matrix: process i gets the smallest colour not used by
   a connected process with a lower number.  The merged matrix is the same
   on every process, so all processes compute the same colouring.

   The proc_colours array is newly allocated on every call; a previously
   stored array is not freed here.
*/
#undef __FUNC__
#define __FUNC__ "PCColourProcessors"
int PCColourProcessors(VecPipeline ctx,PetscObject pipe_info)
{
  Pipeline_coloured_info *comm_info = (Pipeline_coloured_info *) pipe_info;
  VecPipeline_MPI_General *to = (VecPipeline_MPI_General *) ctx->todata;
  VecPipeline_MPI_General *from = (VecPipeline_MPI_General *) ctx->fromdata;
  int *comm_matrix,*comm_contribution,*my_row,*test_colours;
  int i,col,ierr, len = comm_info->numtids;

  /* allocate permanent space for colours */
  comm_info->proc_colours = (int *) PetscMalloc( len*sizeof(int) );
  CHKPTRQ(comm_info->proc_colours);
  PetscMemzero(comm_info->proc_colours,len*sizeof(int));

  /* allocate temporary space for connectivity matrix */
  comm_matrix = (int *) PetscMalloc( len*len*sizeof(int) );
  CHKPTRQ(comm_matrix);
  PetscMemzero(comm_matrix,len*len*sizeof(int));
  comm_contribution = (int *) PetscMalloc( len*len*sizeof(int) );
  CHKPTRQ(comm_contribution);
  PetscMemzero(comm_contribution,len*len*sizeof(int));
  test_colours = (int *) PetscMalloc( len*sizeof(int) );
  CHKPTRQ(test_colours);

  /* fill in my row of the connectivity matrix */
  my_row = comm_contribution +comm_info->mytid*len;
  for (i=0; i<to->n; i++) my_row[to->procs[i]] = 1;
  for (i=0; i<from->n; i++) my_row[from->procs[i]] = 1;

  /* construct the full matrix: all other rows of my contribution are
     zero, so the elementwise maximum assembles every process's row */
  ierr = MPI_Allreduce
    (comm_contribution,comm_matrix,len*len,MPI_INT,MPI_MAX,ctx->comm);
  CHKERRQ(ierr);

  /* simply greedy colouring */
  my_row = comm_matrix;
  for (i=0; i<len; i++) {
    /* mark the colours already taken by connected processes numbered below i */
    PetscMemzero(test_colours,len*sizeof(int));
    for (col=0; col<i; col++) {
      if (my_row[col]==1)
        test_colours[comm_info->proc_colours[col]]=1;
    }
    /* take the lowest colour that is still free */
    for (col=0; col<len; col++) {
      if (test_colours[col]==0) {
        comm_info->proc_colours[i] = col;
        break;
      }
    }
    my_row += len;
  }

  PetscFree(comm_matrix); PetscFree(comm_contribution); PetscFree(test_colours);
  return 0;
}

/*
   PCCustomPipelineSetFromOptions - Selects the custom pipeline type of a
   preconditioner from the options database entry -pc_pipeline.

   Input Parameter:
.  pc - the preconditioner context

   Notes:
   Only the first three characters of the option value are examined:
   "seq" selects PIPELINE_CUSTOM_SEQUENTIAL and "red" selects
   PIPELINE_CUSTOM_REDBLACK; any other value generates an error.
   When the option is not present the preconditioner is left unchanged
   and 0 is returned.
*/
#undef __FUNC__
#define __FUNC__ "PCCustomPipelineSetFromOptions"
int PCCustomPipelineSetFromOptions(PC pc)
{
  char *prefix,value[20];
  int  flag,ierr;

  ierr = PCGetOptionsPrefix(pc,&prefix); CHKERRQ(ierr);
  ierr = OptionsGetString(prefix,"-pc_pipeline",value,20,&flag); CHKERRQ(ierr);

  /* flag reports whether the option was found; without it value may never
     have been written, so it must not be inspected */
  if (!flag) return 0;

  /* compare on the first three characters only */
  value[3]='\0';
  if (strcmp(value,"seq")==0) {
    ierr = PCParallelSetCustomPipeline(pc,PIPELINE_CUSTOM_SEQUENTIAL);
    CHKERRQ(ierr);
  } else if (strcmp(value,"red")==0) {
    ierr = PCParallelSetCustomPipeline(pc,PIPELINE_CUSTOM_REDBLACK);
    CHKERRQ(ierr);
  } else SETERRQ(1,0,"unknown custom pipeline option");

  return 0;
}

/*
   PCParallelSetCustomPipeline - Records the custom pipeline type in the
   preconditioner data and installs the matching ordering routines.

   Input Parameters:
.  pc        - the preconditioner context; pc->data is used as a PCPstruct
.  pipe_type - the pipeline type

   Notes:
   PIPELINE_CUSTOM_SEQUENTIAL installs ProcUp/ProcDown with
   PipelineSequentialSetup; PIPELINE_CUSTOM_REDBLACK installs
   ProcColourUp/ProcColourDown with PipelineRedblackSetup;
   PIPELINE_CUSTOM_MULTICOLOUR installs ProcColourUp/ProcColourDown
   without a setup routine.  Any other type clears all four routines.
   The destroy routine is cleared in every case.  Always returns 0.
*/
#undef __FUNC__
#define __FUNC__ "PCParallelSetCustomPipeline"
int PCParallelSetCustomPipeline(PC pc,CustomPipelineType pipe_type)
{
  PCPstruct *pcp = (PCPstruct *) pc->data;

  pcp->pipe_type = pipe_type;

  /* start from "no custom pipeline"; the recognised types override below */
  pcp->up_fun           = 0;
  pcp->dn_fun           = 0;
  pcp->pipeline_setup   = 0;
  pcp->pipeline_destroy = 0;

  if (pipe_type == PIPELINE_CUSTOM_SEQUENTIAL) {
    pcp->up_fun         = &ProcUp;
    pcp->dn_fun         = &ProcDown;
    pcp->pipeline_setup = &PipelineSequentialSetup;
  } else if (pipe_type == PIPELINE_CUSTOM_REDBLACK ||
             pipe_type == PIPELINE_CUSTOM_MULTICOLOUR) {
    /* both coloured orderings share the comparison routines; only
       red-black has a setup routine */
    pcp->up_fun = &ProcColourUp;
    pcp->dn_fun = &ProcColourDown;
    if (pipe_type == PIPELINE_CUSTOM_REDBLACK) {
      pcp->pipeline_setup = &PipelineRedblackSetup;
    }
  }
  return 0;
}

/*
   VecScatterCopyToPipeline_PtoP - Copies a vector scatter context with
   VecScatterCopy() and installs the PtoP pipeline begin/end routines
   on the copy.

   Input Parameter:
.  in - the scatter context to copy

   Output Parameter:
.  out - the copy, returned as a VecPipeline
*/
#undef __FUNC__
#define __FUNC__ "VecScatterCopyToPipeline_PtoP"
int VecScatterCopyToPipeline_PtoP(VecScatter in,VecPipeline *out)
{
  VecPipeline pipe;
  int         ierr;

  /* the copy is written through out, viewed as a VecScatter slot */
  ierr = VecScatterCopy(in,(VecScatter*)out); CHKERRQ(ierr);

  pipe = *out;
  pipe->pipelinebegin = PtoPPipelinebegin;
  pipe->pipelineend   = PtoPPipelineend;

  return 0;
}

/*
   VecPipelineSetCustomPipelineFromPCPstruct - Installs the custom pipeline
   ordering described by a PCPstruct in a pipeline context.

   Input Parameters:
.  ctx     - the pipeline context
.  pc_data - supplies up_fun, dn_fun and the optional pipeline_setup routine

   Notes:
   When pc_data has a pipeline_setup routine it is called to create the
   pipe data; otherwise a null pipe data pointer is stored in ctx.
*/
#undef __FUNC__
#define __FUNC__ "VecPipelineSetCustomPipelineFromPCPstruct"
int VecPipelineSetCustomPipelineFromPCPstruct(VecPipeline ctx,PCPstruct* pc_data)
{
  PetscObject obj = 0;  /* stays null when there is no setup routine */
  int         ierr;

  if (pc_data->pipeline_setup) {
    ierr = (pc_data->pipeline_setup)(ctx,&obj); CHKERRQ(ierr);
  }
  ierr = VecPipelineSetCustomPipeline(ctx,pc_data->up_fun,pc_data->dn_fun,obj);
  CHKERRQ(ierr);
  return 0;
}

