  /* copy interior into tmp_p */
  ierr = VecScatterBegin
    (tmp_i,tmp_p,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_i,tmp_p,INSERT_VALUES,SCATTER_FORWARD,pc_data->put_intl);
  CHKERRQ(ierr);

  ierr = MatMult(pc_data->C21_big,tmp_p,tmp_q); CHKERRQ(ierr);

  ierr = VecScatterBegin
    (tmp_q,tmp_e,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);
  ierr = VecScatterEnd
    (tmp_q,tmp_e,INSERT_VALUES,SCATTER_FORWARD,pc_data->get_edge);
  CHKERRQ(ierr);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
old attempt to form interface system.

    goto skip;
    ierr = ISGetSize(edge_set,&edge_size); CHKERRQ(ierr);
    ierr = ISGetIndices(edge_set,&edge_vars); CHKERRQ(ierr);
    ierr = ISGetIndices(bord_set,&bord_vars); CHKERRQ(ierr);
    for (irow=0; irow<edge_size; irow++) {
      int i,j,row,col,ncols,*cols; Scalar v,*vals;
      /* get elements coupling interior to itself */
      row = irow;
      ierr = MatGetRow(pc_data->C22,row,&ncols,&cols,&vals); CHKERRQ(ierr);
      for (col=0; col<ncols; col++) {
	i = row+Cij->rstart; j = cols[col]+Cij->rstart; v = vals[col];
	ierr = MatSetValues
	  (pc_data->interface_system,1,&i,1,&j,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      }
      ierr = MatRestoreRow(pc_data->C22,row,&ncols,&cols,&vals); CHKERRQ(ierr);
      /* get elements coupling interior to other processors */
      row = edge_vars[irow];
      ierr = MatGetRow(off_mat,row,&ncols,&cols,&vals);
      for (col=0; col<ncols; col++) {
	i = irow+Cij->rstart; j = bord_vars[cols[col]]; v = vals[col];
	ierr = MatSetValues
	  (pc_data->interface_system,1,&i,1,&j,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      }
      ierr = MatRestoreRow(off_mat,row,&ncols,&cols,&vals); CHKERRQ(ierr);
    }
    ierr = ISRestoreIndices(edge_set,&edge_vars); CHKERRQ(ierr);
    ierr = ISRestoreIndices(bord_set,&bord_vars); CHKERRQ(ierr);
/*MatView(C22,0);*/
  skip:

#undef __FUNC__
#define __FUNC__ "BigOffMats"
/* Extract the coupling block internal <-> interface.
   These are global matrices. This should really be small blocks. */
static int BigOffMats(Mat *C12,Mat *C21,Mat base_mat,IS intl_set,IS edge_set)
{
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *) base_mat->data;
  Mat c12,c21;
  int n_i,*ii,n_e,*ee;
  int ierr,irow,icol,ncol,*idx; Scalar *val;

  ierr = ISGetSize(edge_set,&n_e); CHKERRQ(ierr);
  ierr = ISGetIndices(edge_set,&ee); CHKERRQ(ierr);
  ierr = ISGetSize(intl_set,&n_i); CHKERRQ(ierr);
  ierr = ISGetIndices(intl_set,&ii); CHKERRQ(ierr);

  ierr = MatCreateMPIAIJ(base_mat->comm,Aij->m,Aij->n,Aij->M,Aij->N,
			 0,0,0,0,&c12); CHKERRQ(ierr);
  for (irow=0; irow<n_i; irow++) {
    int row = Aij->rstart+ii[irow], i_target_col = 0;
    ierr = MatGetRow(base_mat,row,&ncol,&idx,&val);
    CHKERRQ(ierr); 
    for (icol=0; icol<ncol; icol++) {
      int col = idx[icol], target_col;
    loopback12:
      if (i_target_col >= n_e) break;
      target_col = Aij->rstart+ee[i_target_col];
      if (target_col<col) {i_target_col++; goto loopback12;}
      if (target_col==col) {
	Scalar v = val[icol];
	ierr = MatSetValues(c12,1,&row,1,&col,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      } else if (target_col>col) continue;
    }
    ierr = MatRestoreRow(base_mat,row,&ncol,&idx,&val);
    CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(c12,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(c12,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);

  ierr = MatCreateMPIAIJ(base_mat->comm,Aij->m,Aij->n,Aij->M,Aij->N,
			 0,0,0,0,&c21); CHKERRQ(ierr);
  for (irow=0; irow<n_e; irow++) {
    int row = Aij->rstart+ee[irow], i_target_col = 0;
    ierr = MatGetRow(base_mat,row,&ncol,&idx,&val);
    CHKERRQ(ierr); 
    for (icol=0; icol<ncol; icol++) {
      int col = idx[icol], target_col;
    loopback21:
      if (i_target_col >= n_i) break;
      target_col = Aij->rstart+ii[i_target_col];
      if (target_col<col) {i_target_col++; goto loopback21;}
      if (target_col==col) {
	Scalar v = val[icol];
	ierr = MatSetValues(c21,1,&row,1,&col,&v,INSERT_VALUES);
	CHKERRQ(ierr);
      } else if (target_col>col) continue;
    }
    ierr = MatRestoreRow(base_mat,row,&ncol,&idx,&val);
    CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(c21,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(c21,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);

  ierr = ISRestoreIndices(edge_set,&ee); CHKERRQ(ierr);
  ierr = ISRestoreIndices(intl_set,&ii); CHKERRQ(ierr);

  *C12 = c12; *C21 = c21;
  return 0;
}

#undef __FUNC__
#define __FUNC__ "BorderRenum"
static int BorderRenum
(IS bord_set,int *intface_test,
 Mat base_mat,Mat interface_system)
{/* this rather messy lump of code takes the interface variables  *
  * and renumbers them from the global numbering to the interface */
  MPI_Comm    comm = base_mat->comm;
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)(base_mat->data);
  Mat_MPIAIJ *Cij = (Mat_MPIAIJ *)(interface_system->data);
  int numtids,idx,p,ierr;
  int *bord_vars,n_bord_vars;
  int *ren_bord_map,*req_ptrs,*req_size,*ren_size,*ren_ptrs;

  ierr = ISGetSize(bord_set,&n_bord_vars); CHKERRQ(ierr);
  ierr = ISGetIndices(bord_set,&bord_vars); CHKERRQ(ierr);

  MPI_Comm_size(comm,&numtids);
  /* set processor pointers in bord_vars */
  req_ptrs = (int *) PetscMalloc( (numtids+1)*sizeof(int) );
  CHKPTRQ(req_ptrs);
  p = -1;
  for (idx=0; idx<n_bord_vars; idx++) {
  loopback:
    if (bord_vars[idx]>=Aij->rowners[p+1]) {
      p++; req_ptrs[p]=idx; goto loopback; }
  }
  for (idx=p+1; idx<=numtids; idx++) req_ptrs[idx] = n_bord_vars;
  /* find the size of the request to each processor */
  req_size = (int *) PetscMalloc( numtids*sizeof(int) );
  CHKPTRQ(req_size);
  for (p=0; p<numtids; p++) req_size[p] = req_ptrs[p+1]-req_ptrs[p];
/*printf("Request sizes:");
for (idx=0; idx<numtids; idx++) printf(" %d",req_size[idx]); printf("\n");*/
  /* get the size of the request made by each processor */
  ren_size = (int *) PetscMalloc( numtids*sizeof(int) );
  CHKPTRQ(ren_size);
  for (p=0; p<numtids; p++)
    MPI_Gather((void *)(req_size+p),1,MPI_INT,
	       (void *)ren_size,1,MPI_INT,p,comm);
  /* set pointers to the incoming requests */
  ren_ptrs = (int *) PetscMalloc( (numtids+1)*sizeof(int) );
  CHKPTRQ(ren_ptrs); ren_ptrs[0] = 0;
  for (p=0; p<numtids; p++) ren_ptrs[p+1] = ren_ptrs[p]+ren_size[p];
  /* gather the variables to be renumbered for the other procs */
  ren_bord_map = (int *) PetscMalloc( (ren_ptrs[numtids]+1)*sizeof(int) );
  CHKPTRQ(ren_bord_map);
  for (p=0; p<numtids; p++)	MPI_Gatherv
    ((void *)(bord_vars+req_ptrs[p]),req_size[p],MPI_INT,
     (void *)ren_bord_map,ren_size,ren_ptrs,MPI_INT, p,comm);
/*printf("Requested to renumber the following:");
for (i=0; i<n_bord_vars; i++) printf(" %d",ren_bord_map[i]); printf("\n");*/
  /* perform the actual renumbering */
  for (idx=0; idx<ren_ptrs[numtids]; idx++)
    ren_bord_map[idx] =
      intface_test[ren_bord_map[idx]-Aij->rstart] + Cij->rstart;
/*printf("Renumbered:");
for (i=0; i<n_bord_vars; i++) printf(" %d",ren_bord_map[i]); printf("\n");*/
  /* scatter back the whole caboodle */
  for (p=0; p<numtids; p++)	MPI_Gatherv
    ((void *)(ren_bord_map+ren_ptrs[p]),ren_size[p],MPI_INT,
     (void *)bord_vars,req_size,req_ptrs,MPI_INT, p,comm);
/*printf("Received my variables back:");
for (i=0; i<n_bord_vars; i++) printf(" %d",bord_vars[i]); printf("\n");*/
  ierr = ISRestoreIndices(bord_set,&bord_vars); CHKERRQ(ierr);
  PetscFree(req_ptrs); PetscFree(req_size); PetscFree(ren_bord_map);
  PetscFree(ren_ptrs); PetscFree(ren_size);

  return 0;
}

