/*
   Defines a Domain Decomposition preconditioner
*/
#include "sles.h"
#include "src/sles/slesimpl.h"
#include "src/pc/pcimpl.h"
#include "vec.h"
#include "src/vec/vecimpl.h"
#include "src/vec/utils/vpipe.h"
#include "parpre_pc.h"
#include "src/pc/pcparallel.h"
#include "src/pc/pcextra.h"
#include "options.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"

/* NOTE(review): CHUNCKSIZE is not referenced anywhere in the visible
   source -- confirm it is unused before removing it. */
#define CHUNCKSIZE   100

/* Only referenced from commented-out code in PCApply_DomainDecomp in the
   visible source; defined in another file. */
extern int MatMultAXBY_AIJ
(Scalar a, Mat aijin,Vec xx, Scalar b, Vec yy,Vec zz);

/* Helpers defined in other files.  In the visible source only
   LocalSolveSetFromOptions (from PCSetFromOptions_DomainDecomp) and
   PCParallelInitCommStruct (from PCCreate_DomainDecomp) are called. */
extern int LocalSolveSetFromOptions(PC pc);
extern int PCParallelInitLocalMethod(PCPstruct *pc_data, Mat mat, Vec vec);
extern int PCParallelInitCommStruct(PC pc);
extern int PCParallelCreateSubSLES(SLES *solve);

/* Private context of the domain decomposition preconditioner; an instance
   is allocated in PCCreate_DomainDecomp and stored in pc->data. */
typedef struct {
  /* shared parallel-PC data; par_info.local_method is the SLES that
     solves with C11 */
  PCPstruct par_info;
  /* SLES that solves with interface_system */
  SLES interface_method;
  /* C11: submatrix of the diagonal block on the internal variables.
     C12, C21: zeroed at creation, otherwise only used in commented-out code.
     C12_big, C21_big: matrices built by BigOffMats with the layout of the
     base matrix, holding the internal-row/edge-column and
     edge-row/internal-column entries respectively. */
  Mat C11,C12,C21,C12_big,C21_big;
  /* distributed matrix on the edge variables, assembled in PCSetup */
  Mat interface_system; 
  /* edge_vec, intl_vec: sequential work vectors of length n_edge_vars and
     n_intl_vars; interface_vector: distributed vector with one local
     entry per edge variable */
  Vec edge_vec,intl_vec; Vec interface_vector;
  /* get_*: scatter selected entries of a full-length vector into the short
     edge/internal vector; put_*: the reverse direction */
  VecScatter get_edge,put_edge,get_intl,put_intl;
} PC_DDecomp_struct;

/****************************************************************
 * User Interface
 ****************************************************************/
#undef __FUNC__
#define __FUNC__ "PCDomainDecompGetInterfacePC"
/*
   PCDomainDecompGetInterfacePC - Returns the preconditioner context of
   the SLES solver that is used on the interface system.

   Input:
     pc    - the domain decomposition preconditioner
   Output:
     intpc - the PC obtained from the interface SLES through SLESGetPC()

   Returns 0 on success and 1 if pc does not carry PC_COOKIE; a failure
   of SLESGetPC() is propagated through CHKERRQ.
*/
int PCDomainDecompGetInterfacePC(PC pc,PC *intpc)
{
  PC_DDecomp_struct *data;
  int ierr;

  /* Validate the object before its private data is dereferenced; the
     private data of a non-PC object cannot be trusted. */
  if (((PetscObject)pc)->cookie != PC_COOKIE) return 1;

  data = (PC_DDecomp_struct *) pc->data;
  ierr = SLESGetPC(data->interface_method,intpc); CHKERRQ(ierr);

  return 0;
}
/****************************************************************/

#undef __FUNC__
#define __FUNC__ "PCApply_DomainDecomp"
/*
   PCApply_DomainDecomp - Applies the preconditioner to x, result in y.

   The sequence of operations is
     1. solve on the internal variables with par_info.local_method,
     2. subtract C21_big times that solution from the edge part of x,
     3. solve on the edge variables with interface_method; store in y,
     4. subtract C12_big times the edge solution from the internal part
        of x,
     5. solve on the internal variables again; store in y.

   Input:
     pc - preconditioner context (pc->data is a PC_DDecomp_struct)
     x  - vector to be preconditioned
   Output:
     y  - result vector

   Returns 0 on success; errors of the called routines are propagated
   through CHKERRQ.  All six work vectors created here are destroyed
   before a successful return.
*/
static int PCApply_DomainDecomp(PC pc,Vec x,Vec y)
{
  PC_DDecomp_struct *pc_data = (PC_DDecomp_struct *) pc->data;
  int ierr,its;
  Vec tmp_i/*internal*/, tmp_e/*edge*/, tmp_ii/*interface, distributed*/;
  Vec tmp_p,tmp_q/*parallel*/;
  Scalar zero = 0.0, mone = -1.0;

  ierr = VecDuplicate(x,&tmp_p); CHKERRQ(ierr);
  ierr = VecDuplicate(x,&tmp_q); CHKERRQ(ierr);
  ierr = VecDuplicate(pc_data->intl_vec,&tmp_i); CHKERRQ(ierr);
  ierr = VecDuplicate(pc_data->edge_vec,&tmp_e); CHKERRQ(ierr);
  ierr = VecDuplicate(pc_data->interface_vector,&tmp_ii); CHKERRQ(ierr);

  ierr = VecSet(&zero,y); CHKERRQ(ierr);
  /* tmp_p must be zero outside the internal entries filled in below */
  ierr = VecSet(&zero,tmp_p); CHKERRQ(ierr);

  /*>>>> Parallel solve of internal variables <<<<*/

  ierr = VecScatterBegin
    (x,pc_data->intl_vec,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_intl);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (x,pc_data->intl_vec,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_intl);
  CHKERRQ(ierr);
  ierr = SLESSolve
    (pc_data->par_info.local_method,pc_data->intl_vec,tmp_i,&its);
  CHKERRQ(ierr);
  ierr = VecScatterBegin
    (tmp_i,tmp_p,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_i,tmp_p,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);

  /*>>>> Move internal data to edge in forward sweep <<<<*/

  ierr = VecScatterBegin
    (x,pc_data->edge_vec,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (x,pc_data->edge_vec,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);
  /* tmp_q = C21_big * tmp_p; only its edge entries are picked up */
  ierr = MatMult(pc_data->C21_big,tmp_p,tmp_q); CHKERRQ(ierr);
  ierr = VecScatterBegin
    (tmp_q,tmp_e,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_q,tmp_e,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);
  /* presumably edge_vec <- edge_vec - tmp_e (VecAXPY with factor -1);
     confirm against the VecAXPY argument convention of this PETSc version */
  ierr = VecAXPY(&mone,tmp_e,pc_data->edge_vec); CHKERRQ(ierr);

  /*>>>> Solve on the edge, data is on pc_data->edge_vec <<<<*/

  ierr = VecCopy(pc_data->edge_vec,pc_data->interface_vector); CHKERRQ(ierr);
  ierr = SLESSolve(pc_data->interface_method,pc_data->interface_vector,
                   tmp_ii,&its); CHKERRQ(ierr);
  ierr = VecCopy(tmp_ii,tmp_e); CHKERRQ(ierr);

  ierr = VecScatterBegin
    (tmp_e,y,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_edge);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_e,y,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_edge);
  CHKERRQ(ierr);

  /*>>>> Move edge to internal; the intl_vec contains a copy of x <<<<*/
  /* intl_vec was filled from x by the first scatter and has not been
     overwritten since, so no second scatter of x is needed here. */
  ierr = MatMult(pc_data->C12_big,y,tmp_p); CHKERRQ(ierr);
  ierr = VecScatterBegin
    (tmp_p,tmp_i,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_intl);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_p,tmp_i,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_intl);
  CHKERRQ(ierr);
  /* presumably tmp_i <- intl_vec - tmp_i (VecAYPX with factor -1);
     confirm against the VecAYPX argument convention of this PETSc version */
  ierr = VecAYPX(&mone,pc_data->intl_vec,tmp_i); CHKERRQ(ierr);

  /*>>>> Solve internal <<<<*/
  ierr = SLESSolve
    (pc_data->par_info.local_method,tmp_i,pc_data->intl_vec,&its);
  CHKERRQ(ierr);

  ierr = VecScatterBegin
    (pc_data->intl_vec,y,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (pc_data->intl_vec,y,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);

  /* release every work vector created at the top of this routine */
  ierr = VecDestroy(tmp_p); CHKERRQ(ierr);
  ierr = VecDestroy(tmp_q); CHKERRQ(ierr);
  ierr = VecDestroy(tmp_i); CHKERRQ(ierr);
  ierr = VecDestroy(tmp_e); CHKERRQ(ierr);
  ierr = VecDestroy(tmp_ii); CHKERRQ(ierr);
  return 0;

}

#undef __FUNC__
#define __FUNC__ "LocalInterface"
/*
   LocalInterface - Flags the local variables that appear in the index
   list of the "todata" side of the matrix-vector scatter of base_mat.

   Input:
     base_mat - matrix whose data is accessed as Mat_MPIAIJ
   Output:
     test - newly PetscMalloc'ed array with one entry per column of the
            diagonal block; 1 for a flagged variable, 0 otherwise.
            The caller is handed the pointer.
     n_in - number of flagged variables

   An earlier, now disabled, variant additionally inspected the rows of
   the off-diagonal block before accepting a variable; at present every
   listed index is accepted.
*/
static int LocalInterface(int **test,int *n_in,Mat base_mat)
{
  Mat_MPIAIJ *mpi_mat = (Mat_MPIAIJ *) base_mat->data;
  VecScatter mv_scatter = mpi_mat->Mvctx;
  VecScatter_MPI_General *send =
    (VecScatter_MPI_General *) mv_scatter->todata;
  Mat_SeqAIJ *diag_block = (Mat_SeqAIJ *) mpi_mat->A->data;
  int n_local = diag_block->n;
  int k,n_flagged,*flags;

  flags = (int *) PetscMalloc( n_local*sizeof(int) ); CHKPTRQ(flags);
  PetscMemzero(flags,n_local*sizeof(int));

  /* an index may be listed more than once; the flag is simply set again */
  k = 0;
  while (k < send->starts[send->n]) {
    flags[send->indices[k]] = 1;
    k++;
  }

  n_flagged = 0;
  for ( k=0; k<n_local; k++ ) {
    n_flagged += flags[k];
  }

  *n_in = n_flagged;
  *test = flags;
  return 0;
}

#undef __FUNC__
#define __FUNC__ "IntEdgBorDivide"
/*
   IntEdgBorDivide - Splits the local variables into edge and internal
   variables and collects the border variables.

   Input:
     intface_test - one flag per local variable, nonzero for edge variables
     tot_size     - number of local variables
     loc_size     - expected number of edge variables; sizes the buffers
     off_mat      - off-diagonal block whose rows are inspected
     garray       - maps a column of off_mat to the index that is stashed
   Output:
     edge_vars, n_edge_vars - local indices with a nonzero flag
     intl_vars, n_intl_vars - local indices with a zero flag
     bord_vars, n_bord_vars - copy of the index stash filled with
                              garray[col] for every column col in the
                              off_mat rows of the edge variables
   Side effect: on return intface_test[v] holds, for every edge variable
   v, its position in edge_vars.

   The three output arrays are PetscMalloc'ed here and handed to the caller.
*/
static int IntEdgBorDivide
(int **edge_vars,int *n_edge_vars,int **intl_vars,int *n_intl_vars,
 int **bord_vars,int *n_bord_vars,
 int *intface_test,int tot_size,int loc_size,Mat off_mat,int *garray)
{
  IndStash border_stash;
  int *edges,*intls,*bords;
  int n_edges = 0, n_intls = 0, n_bords;
  int v,k,ierr;

  edges = (int *) PetscMalloc( (loc_size+1)*sizeof(int) );
  CHKPTRQ(edges);
  intls = (int *) PetscMalloc( (tot_size-loc_size+1)*sizeof(int) );
  CHKPTRQ(intls);

  /* distribute the local variables over the two lists */
  for ( v=0; v<tot_size; v++ ) {
    if (intface_test[v]) {
      edges[n_edges] = v; n_edges++;
    } else {
      intls[n_intls] = v; n_intls++;
    }
  }
  /* turn the flag array into a map: edge variable -> position in edges */
  for ( k=0; k<n_edges; k++ ) {
    intface_test[edges[k]] = k;
  }

  /* stash the mapped column indices found in the edge rows of off_mat */
  ierr = NewIndexStash(&border_stash); CHKERRQ(ierr);
  for ( k=0; k<n_edges; k++ ) {
    int row_len,*row_cols,c;
    ierr = MatGetRow(off_mat,edges[k],&row_len,&row_cols,0); CHKERRQ(ierr);
    for ( c=0; c<row_len; c++ ) {
      ierr = StashIndex(border_stash,1,garray+row_cols[c]); CHKERRQ(ierr);
    }
    ierr = MatRestoreRow(off_mat,edges[k],&row_len,&row_cols,0);
    CHKERRQ(ierr);
  }

  /* copy the stash contents out before the stash is destroyed */
  n_bords = border_stash->n;
  bords = (int *) PetscMalloc( (n_bords+1)*sizeof(int) ); CHKPTRQ(bords);
  PetscMemcpy(bords,border_stash->array,n_bords*sizeof(int));
  ierr = DestroyIndexStash(border_stash); CHKERRQ(ierr);

  *edge_vars = edges; *n_edge_vars = n_edges;
  *intl_vars = intls; *n_intl_vars = n_intls;
  *bord_vars = bords; *n_bord_vars = n_bords;

  return 0;
}

#undef __FUNC__
#define __FUNC__ "BorderRenum"
/*
   BorderRenum - Renumbers the border variables in place, from the
   numbering of base_mat to the numbering of interface_system.

   Input:
     bord_vars, n_bord_vars - border variables; overwritten on return
     intface_test     - indexed by (variable - Aij->rstart); supplies the
                        offset that is added to Cij->rstart
     base_mat         - matrix accessed as Mat_MPIAIJ (rowners, rstart)
     interface_system - matrix accessed as Mat_MPIAIJ (rstart)

   Each processor sends every border variable to the processor whose
   rowners range contains it, that processor renumbers it, and the result
   is sent back.  All communication is done with loops of rooted
   MPI_Gather/MPI_Gatherv calls over the communicator of base_mat, so
   every processor of that communicator must call this routine.

   NOTE(review): the partitioning walk below yields contiguous
   per-processor ranges only if bord_vars is sorted in ascending order --
   confirm against the index stash that produces it.
   NOTE(review): the return codes of the MPI calls are not checked.
*/
static int BorderRenum
(int *bord_vars,int n_bord_vars,int *intface_test,
 Mat base_mat,Mat interface_system)
{
  MPI_Comm    comm = base_mat->comm;
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)(base_mat->data);
  Mat_MPIAIJ *Cij = (Mat_MPIAIJ *)(interface_system->data);
  int nprocs,k,proc,root;
  int *incoming,*out_start,*out_count,*in_count,*in_start;

  MPI_Comm_size(comm,&nprocs);

  /* out_start[q]: first position in bord_vars that goes to processor q */
  out_start = (int *) PetscMalloc( (nprocs+1)*sizeof(int) );
  CHKPTRQ(out_start);
  proc = -1;
  for (k=0; k<n_bord_vars; k++) {
    while (bord_vars[k] >= Aij->rowners[proc+1]) {
      proc++;
      out_start[proc] = k;
    }
  }
  /* processors beyond the last one reached get an empty range */
  for (k=proc+1; k<=nprocs; k++) out_start[k] = n_bord_vars;

  /* out_count[q]: number of variables requested from processor q */
  out_count = (int *) PetscMalloc( nprocs*sizeof(int) );
  CHKPTRQ(out_count);
  for (proc=0; proc<nprocs; proc++) {
    out_count[proc] = out_start[proc+1]-out_start[proc];
  }

  /* in_count[q]: number of variables processor q requests from here */
  in_count = (int *) PetscMalloc( nprocs*sizeof(int) );
  CHKPTRQ(in_count);
  for (root=0; root<nprocs; root++) {
    MPI_Gather((void *)(out_count+root),1,MPI_INT,
               (void *)in_count,1,MPI_INT,root,comm);
  }

  /* in_start: offsets of the incoming requests in the receive buffer */
  in_start = (int *) PetscMalloc( (nprocs+1)*sizeof(int) );
  CHKPTRQ(in_start);
  in_start[0] = 0;
  for (proc=0; proc<nprocs; proc++) {
    in_start[proc+1] = in_start[proc]+in_count[proc];
  }

  /* collect the variables the other processors want renumbered */
  incoming = (int *) PetscMalloc( (in_start[nprocs]+1)*sizeof(int) );
  CHKPTRQ(incoming);
  for (root=0; root<nprocs; root++) {
    MPI_Gatherv
      ((void *)(bord_vars+out_start[root]),out_count[root],MPI_INT,
       (void *)incoming,in_count,in_start,MPI_INT, root,comm);
  }

  /* renumber the collected variables */
  for (k=0; k<in_start[nprocs]; k++) {
    int local = incoming[k]-Aij->rstart;
    incoming[k] = intface_test[local] + Cij->rstart;
  }

  /* return the renumbered values to the requesting processors */
  for (root=0; root<nprocs; root++) {
    MPI_Gatherv
      ((void *)(incoming+in_start[root]),in_count[root],MPI_INT,
       (void *)bord_vars,out_count,out_start,MPI_INT, root,comm);
  }

  PetscFree(out_start); PetscFree(out_count); PetscFree(incoming);
  PetscFree(in_start); PetscFree(in_count);

  return 0;
}

#undef __FUNC__
#define __FUNC__ "BigOffMats_CopyBlock"
/*
   BigOffMats_CopyBlock - Creates a matrix with the layout of base_mat
   that holds only the entries of base_mat in the given local rows and
   local columns, and assembles it.

   Input:
     base_mat        - source matrix, accessed as Mat_MPIAIJ
     n_rows, rows    - local row numbers (Aij->rstart is added)
     n_cols, cols    - local column numbers (Aij->rstart is added)
   Output:
     block           - the new, assembled matrix

   NOTE(review): the single-pass merge assumes that cols and the column
   indices returned by MatGetRow are both in ascending order -- confirm.
*/
static int BigOffMats_CopyBlock(Mat base_mat,int n_rows,int *rows,
                                int n_cols,int *cols,Mat *block)
{
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *) base_mat->data;
  Mat sub;
  int ierr,r,k,nz,*colidx; Scalar *vals;

  ierr = MatCreateMPIAIJ(base_mat->comm,Aij->m,Aij->n,Aij->M,Aij->N,
                         0,0,0,0,&sub); CHKERRQ(ierr);
  for (r=0; r<n_rows; r++) {
    int grow = Aij->rstart+rows[r], cursor = 0;
    ierr = MatGetRow(base_mat,grow,&nz,&colidx,&vals);
    CHKERRQ(ierr);
    for (k=0; k<nz; k++) {
      int gcol = colidx[k];
      /* advance in the wanted columns until gcol is reached or passed */
      while (cursor < n_cols && Aij->rstart+cols[cursor] < gcol) cursor++;
      /* no wanted columns left: the rest of this row is not copied */
      if (cursor >= n_cols) break;
      if (Aij->rstart+cols[cursor] == gcol) {
        Scalar v = vals[k];
        ierr = MatSetValues(sub,1,&grow,1,&gcol,&v,INSERT_VALUES);
        CHKERRQ(ierr);
      }
    }
    ierr = MatRestoreRow(base_mat,grow,&nz,&colidx,&vals);
    CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(sub,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(sub,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);

  *block = sub;
  return 0;
}

#undef __FUNC__
#define __FUNC__ "BigOffMats"
/*
   BigOffMats - Builds the two coupling matrices between internal and
   edge variables, each with the same layout as base_mat.

   Input:
     base_mat - source matrix
     n_i, ii  - local numbers of the internal variables
     n_e, ee  - local numbers of the edge variables
   Output:
     C12 - entries of base_mat in internal rows and edge columns
     C21 - entries of base_mat in edge rows and internal columns

   The outputs are written only after both matrices have been built.
*/
static int BigOffMats(Mat *C12,Mat *C21,Mat base_mat,
		      int n_i,int *ii,int n_e,int *ee)
{
  Mat c12,c21;
  int ierr;

  ierr = BigOffMats_CopyBlock(base_mat,n_i,ii,n_e,ee,&c12); CHKERRQ(ierr);
  ierr = BigOffMats_CopyBlock(base_mat,n_e,ee,n_i,ii,&c21); CHKERRQ(ierr);

  *C12 = c12; *C21 = c21;
  return 0;
}

#undef __FUNC__
#define __FUNC__ "PCSetup_DomainDecomp"
/*
   PCSetup_DomainDecomp - Builds everything PCApply_DomainDecomp needs
   from the matrix pc->mat, which must be of type MATMPIAIJ.

   Steps:
     - split the local variables into edge and internal variables and
       collect/renumber the border variables,
     - build C12_big and C21_big,
     - create the work vectors intl_vec/edge_vec and the four scatters,
     - extract C11 and C22 from the diagonal block,
     - assemble interface_system from C22 and the off-diagonal block,
     - hand the operators to the interface and local SLES and set up
       their preconditioners.

   Returns 0 on success; errors of the called routines are propagated
   through CHKERRQ.
*/
static int PCSetup_DomainDecomp(PC pc)
{
  Mat base_mat = pc->mat;
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *) base_mat->data;
  MPI_Comm    comm = base_mat->comm;
  Mat dia_mat = Aij->A, off_mat = Aij->B, C22;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *) dia_mat->data;
  PC_DDecomp_struct *pc_data = (PC_DDecomp_struct *) pc->data;
  int n_edge_vars,n_intl_vars,n_bord_vars;
  int *edge_vars,*intl_vars,*bord_vars;
  int         ierr;
  IS edge_set;

  /* make sure you're only called for parallel execution */
  if (!(base_mat->type==MATMPIAIJ)) {
    SETERRQ(1,0,"Domain Decomp preconditioner only implemented for AIJMPI\n");
    return -1;
  }

  /* set the pc and interface comm; the interface solver was created on
     MPI_COMM_SELF because pc->comm did not exist at creation time */
  pc->comm = comm;
  pc_data->interface_method->comm = comm;

  /* >>>> Find edge and internal variables <<<< */
  {
    int *intface_test;

    ierr = LocalInterface(&intface_test,&n_edge_vars,base_mat);
    CHKERRQ(ierr);
    
    ierr = IntEdgBorDivide
      (&edge_vars,&n_edge_vars,&intl_vars,&n_intl_vars,
       &bord_vars,&n_bord_vars,
       intface_test,aij->n,n_edge_vars,off_mat,Aij->garray); CHKERRQ(ierr);

    ierr = MatCreateMPIAIJ
      (comm,n_edge_vars,n_edge_vars,PETSC_DECIDE,PETSC_DECIDE,
       0,0,0,0,&pc_data->interface_system); CHKERRQ(ierr);

    ierr = BorderRenum(bord_vars,n_bord_vars,intface_test,
		       base_mat,pc_data->interface_system);
    CHKERRQ(ierr);
    PetscFree(intface_test);

  }
  ierr = BigOffMats(&pc_data->C12_big,&pc_data->C21_big,base_mat,
		    n_intl_vars,intl_vars,n_edge_vars,edge_vars);
  CHKERRQ(ierr);
  {
    IS edge_src,edge_tar,intl_src,intl_tar; Vec local_vec;

    /* index sets for internal & edge vars */
    ierr = ISCreateGeneral(MPI_COMM_SELF,n_intl_vars,intl_vars,&intl_src);
    CHKERRQ(ierr);
    ierr = ISCreateStride(MPI_COMM_SELF,n_intl_vars,0,1,&intl_tar);
    CHKERRQ(ierr);
    ierr = ISCreateGeneral(comm,n_edge_vars,edge_vars,&edge_src);
    CHKERRQ(ierr);
    ierr = ISCreateStride(comm,n_edge_vars,0,1,&edge_tar);
    CHKERRQ(ierr);

    /* second index set on the edge variables; it outlives this block and
       is used when the interface system is formed */
    ierr = ISCreateGeneral(comm,n_edge_vars,edge_vars,&edge_set);
    CHKERRQ(ierr);

    ierr = VecCreateSeq
      (MPI_COMM_SELF,n_intl_vars,&pc_data->intl_vec);
    CHKERRQ(ierr);
    ierr = VecCreateSeq
      (MPI_COMM_SELF,n_edge_vars,&pc_data->edge_vec);
    CHKERRQ(ierr);

    /* template vector of local length, only used to create the scatters */
    ierr = VecCreateSeq(MPI_COMM_SELF,aij->n,&local_vec); CHKERRQ(ierr);

    ierr = VecScatterCreate(local_vec,intl_src,pc_data->intl_vec,intl_tar,
			       &pc_data->get_intl); CHKERRQ(ierr);
    ierr = VecScatterCreate(pc_data->intl_vec,intl_tar,local_vec,intl_src,
			       &pc_data->put_intl); CHKERRQ(ierr);

    ierr = VecScatterCreate(local_vec,edge_src,pc_data->edge_vec,edge_tar,
			       &pc_data->get_edge); CHKERRQ(ierr);
    ierr = VecScatterCreate(pc_data->edge_vec,edge_tar,local_vec,edge_src,
			       &pc_data->put_edge); CHKERRQ(ierr);

    /* extract the internal-internal (C11) and edge-edge (C22) blocks of
       the diagonal block in one call */
    {
      IS *i_sets,*j_sets; Mat *res_mat;
      i_sets = (IS*) PetscMalloc(2*sizeof(IS)); CHKPTRQ(i_sets);
      j_sets = (IS*) PetscMalloc(2*sizeof(IS)); CHKPTRQ(j_sets);
      i_sets[0] = intl_src; j_sets[0] = intl_src;
      i_sets[1] = edge_src; j_sets[1] = edge_src;
      
      ierr = MatGetSubMatrices(dia_mat,2,i_sets,j_sets,
			       MAT_INITIAL_MATRIX,&res_mat);
      CHKERRQ(ierr);
      pc_data->C11 = res_mat[0]; C22 = res_mat[1];
      PetscFree(res_mat); PetscFree(i_sets); PetscFree(j_sets);
    }
    ierr = VecDestroy(local_vec); CHKERRQ(ierr);
    ierr = ISDestroy(edge_src); CHKERRQ(ierr);
    ierr = ISDestroy(edge_tar); CHKERRQ(ierr);
    ierr = ISDestroy(intl_src); CHKERRQ(ierr);
    ierr = ISDestroy(intl_tar); CHKERRQ(ierr);

    PetscFree(intl_vars); PetscFree(edge_vars);
    /* pc_data->C12 and pc_data->C21 are not built; C12_big and C21_big
       take their place */
  }

  /* Form the interface system */
  {
    Mat_MPIAIJ *Cij = (Mat_MPIAIJ *)(pc_data->interface_system->data);
    int irow,edge_size,*edge_idxs;

    ierr = ISGetSize(edge_set,&edge_size); CHKERRQ(ierr);
    ierr = ISGetIndices(edge_set,&edge_idxs); CHKERRQ(ierr);
    for (irow=0; irow<edge_size; irow++) {
      int i,j,row,col,ncols,*cols; Scalar v,*vals;

      /* local coupling: row irow of C22, shifted by the first row this
         processor owns in the interface system */
      row = irow;
      ierr = MatGetRow(C22,row,&ncols,&cols,&vals); CHKERRQ(ierr);
      for (col=0; col<ncols; col++) {
	i = irow+Cij->rstart; j = cols[col]+Cij->rstart; v = vals[col];
	ierr = MatSetValues(pc_data->interface_system,1,&i,1,&j,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      }
      ierr = MatRestoreRow(C22,row,&ncols,&cols,&vals); CHKERRQ(ierr);

      /* coupling to other processors: row of the off-diagonal block,
         columns translated through the renumbered border variables.
         NOTE(review): bord_vars is indexed with the column number of
         off_mat; this presumes the stash kept one entry per off_mat
         column in the same order -- confirm. */
      row = edge_idxs[irow];
      ierr = MatGetRow(off_mat,row,&ncols,&cols,&vals); CHKERRQ(ierr);
      for (col=0; col<ncols; col++) {
	i = irow+Cij->rstart; j = bord_vars[cols[col]]; v = vals[col];
	ierr = MatSetValues
	  (pc_data->interface_system,1,&i,1,&j,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      }
      ierr = MatRestoreRow(off_mat,row,&ncols,&cols,&vals); CHKERRQ(ierr);
    }
    ierr = ISRestoreIndices(edge_set,&edge_idxs); CHKERRQ(ierr);
    /* edge_set is not needed any more; edge_size was read out above */
    ierr = ISDestroy(edge_set); CHKERRQ(ierr);

    ierr = MatDestroy(C22); CHKERRQ(ierr); PetscFree(bord_vars);
    ierr = MatAssemblyBegin(pc_data->interface_system,MAT_FINAL_ASSEMBLY);
    CHKERRQ(ierr);
    ierr = MatAssemblyEnd  (pc_data->interface_system,MAT_FINAL_ASSEMBLY);
    CHKERRQ(ierr);
    ierr = VecCreateMPI
      (comm,edge_size,PETSC_DECIDE,&pc_data->interface_vector);
    CHKERRQ(ierr);
    {
      PC local_pc;

      ierr = SLESSetOperators
	(pc_data->interface_method,pc_data->interface_system,
	 pc_data->interface_system,0); CHKERRQ(ierr);
      ierr = SLESGetPC(pc_data->interface_method,&local_pc); CHKERRQ(ierr);
      ierr = PCSetVector(local_pc,pc_data->interface_vector); CHKERRQ(ierr);
      ierr = PCSetUp(local_pc); CHKERRQ(ierr);
    }
    
  }

  /* Initialise the local solution method */
  ierr = SLESSetOperators
    (pc_data->par_info.local_method,pc_data->C11,pc_data->C11,0);
  CHKERRQ(ierr);
  {
    PC local_pc;
    /* NOTE(review): the result of PCParallelGetLocalPC is not checked;
       its declaration is not visible here */
    PCParallelGetLocalPC(pc,&local_pc);
    ierr = PCSetVector(local_pc,pc_data->intl_vec); CHKERRQ(ierr);
    ierr = PCSetUp(local_pc); CHKERRQ(ierr);
  }
   ierr = VecPipelineSetCustomPipelineFromPCPstruct
     ((VecPipeline)Aij->Mvctx,&(pc_data->par_info)); CHKERRQ(ierr);

  return 0;

}

#undef __FUNC__
#define __FUNC__ "PCSetFromOptions_DomainDecomp"
/*
   PCSetFromOptions_DomainDecomp - Configures the sub-solvers of the
   preconditioner from the options database.

   The interface solver gets the options prefix of pc with "interface_"
   appended and is then configured with SLESSetFromOptions(); the local
   solver is configured by LocalSolveSetFromOptions().

   Returns 0 on success; errors are propagated through CHKERRQ.
*/
static int PCSetFromOptions_DomainDecomp(PC pc)
{
  PC_DDecomp_struct *ctx = (PC_DDecomp_struct *) pc->data;
  SLES interface_solver = ctx->interface_method;
  char *pc_prefix;
  int ierr;

  /* interface solver: <prefix of pc> followed by "interface_" */
  ierr = PCGetOptionsPrefix(pc,&pc_prefix);
  CHKERRQ(ierr);
  ierr = SLESSetOptionsPrefix(interface_solver,pc_prefix);
  CHKERRQ(ierr);
  ierr = SLESAppendOptionsPrefix(interface_solver,"interface_");
  CHKERRQ(ierr);
  ierr = SLESSetFromOptions(interface_solver);
  CHKERRQ(ierr);

  /* local solver */
  ierr = LocalSolveSetFromOptions(pc);
  CHKERRQ(ierr);

  return 0;
}

#undef __FUNC__
#define __FUNC__ "PCDestroy_DDecomp"
/*
   PCDestroy_DDecomp - Releases everything owned by the private context
   of the preconditioner, and the context itself.

   Input:
     obj - the PC, passed as a PetscObject

   Members that are still 0 (the value PCCreate_DomainDecomp assigns, and
   the state of most of them when the PC is destroyed without ever having
   been set up) are skipped.  pc_data->C12 and pc_data->C21 are never
   built in this file and therefore not destroyed.

   NOTE(review): the context is freed here on the understanding that the
   generic PCDestroy leaves pc->data to the type-specific destroy routine
   whenever one is installed -- confirm for the PETSc version in use.

   Returns 0 on success; errors are propagated through CHKERRQ.
*/
static int PCDestroy_DDecomp(PetscObject obj)
{
  PC pc = (PC) obj;
  PC_DDecomp_struct *pc_data = (PC_DDecomp_struct *) pc->data;
  int ierr;

  /* solvers */
  if (pc_data->par_info.local_method) {
    ierr = SLESDestroy(pc_data->par_info.local_method); CHKERRQ(ierr);
  }
  if (pc_data->interface_method) {
    ierr = SLESDestroy(pc_data->interface_method); CHKERRQ(ierr);
  }

  /* matrices */
  if (pc_data->C11) {
    ierr = MatDestroy(pc_data->C11); CHKERRQ(ierr);
  }
  if (pc_data->C12_big) {
    ierr = MatDestroy(pc_data->C12_big); CHKERRQ(ierr);
  }
  if (pc_data->C21_big) {
    ierr = MatDestroy(pc_data->C21_big); CHKERRQ(ierr);
  }
  if (pc_data->interface_system) {
    ierr = MatDestroy(pc_data->interface_system); CHKERRQ(ierr);
  }

  /* vectors */
  if (pc_data->edge_vec) {
    ierr = VecDestroy(pc_data->edge_vec); CHKERRQ(ierr);
  }
  if (pc_data->intl_vec) {
    ierr = VecDestroy(pc_data->intl_vec); CHKERRQ(ierr);
  }
  if (pc_data->interface_vector) {
    ierr = VecDestroy(pc_data->interface_vector); CHKERRQ(ierr);
  }

  /* scatters */
  if (pc_data->get_edge) {
    ierr = VecScatterDestroy(pc_data->get_edge); CHKERRQ(ierr);
  }
  if (pc_data->put_edge) {
    ierr = VecScatterDestroy(pc_data->put_edge); CHKERRQ(ierr);
  }
  if (pc_data->get_intl) {
    ierr = VecScatterDestroy(pc_data->get_intl); CHKERRQ(ierr);
  }
  if (pc_data->put_intl) {
    ierr = VecScatterDestroy(pc_data->put_intl); CHKERRQ(ierr);
  }

  /* the context itself was PetscMalloc'ed in PCCreate_DomainDecomp */
  PetscFree(pc_data);
  pc->data = 0;

  return 0;
}

#undef __FUNC__
#define __FUNC__ "PCCreate_DomainDecomp"
/*
   PCCreate_DomainDecomp - Installs the domain decomposition methods in
   pc and allocates its private context.

   The context is zeroed completely before use, so that every matrix,
   vector, scatter and solver member holds 0 until it is actually
   created.  Two sub-solvers are installed: the local solver and the
   interface solver.  Both are created on MPI_COMM_SELF here, because
   pc->comm is not available yet; PCSetup_DomainDecomp later overwrites
   the communicator of the interface solver.

   Returns 0 on success; errors are propagated through CHKPTRQ/CHKERRQ.
*/
int PCCreate_DomainDecomp(PC pc)
{
  int ierr;
  PC_DDecomp_struct *bij;

  pc->apply     = PCApply_DomainDecomp;
  pc->applyrich = 0;
  pc->destroy   = PCDestroy_DDecomp;
  pc->setfrom   = PCSetFromOptions_DomainDecomp;
  pc->printhelp = 0;
  pc->setup     = PCSetup_DomainDecomp;
  pc->type      = PCDomainDecomp;

  bij = (PC_DDecomp_struct *) PetscMalloc( sizeof(PC_DDecomp_struct) );
  CHKPTRQ(bij);
  /* clears C11..C21_big, the work vectors and the scatters as before, and
     in addition interface_system, interface_vector, interface_method and
     par_info, which used to be left uninitialised */
  PetscMemzero(bij,sizeof(PC_DDecomp_struct));

  /* create subsolvers for the interface system and interior */
  ierr = PCParallelInstallSubSolve
    (MPI_COMM_SELF,&(bij->par_info.local_method));
  CHKERRQ(ierr);
  ierr = PCParallelInstallSubSolve
    (/*pc->comm: does not exist yet*/MPI_COMM_SELF/* CRUTCH !!! */,
     &(bij->interface_method));
  CHKERRQ(ierr);

  pc->data      = (void *) bij;

  ierr = PCParallelInitCommStruct(pc); CHKERRQ(ierr);

  return 0;
}

