/* ---------------------------------------------------------------------
*
*  -- PSSBLAS routine (version 1.0) --
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "psblas.h"

/* #define DEBUG_SWAP_ */

/* Internal Functions */
static void Update_Data(double Y[], double X[], int point_to_proc, int desc_ovrlap[]);
static void Extract(double Y[], double X[], int point_to_proc, int desc_ovrlap[]);

#define Send_Data(X,ndim,proc,ictxt)	Cdgesd2d(ictxt,ndim,1,X,ndim,proc,0)
#define Receive_Data(X,ndim,proc,ictxt) Cdgerv2d(ictxt,ndim,1,X,ndim,proc,0)

void PSI_dSwapOverlap(Y, Sum_Ovrlap, desc_data, desc_ovrlap, work, lwork, ierror)
     
     /* ..
      *  .. Scalar Arguments ..
      */
     int *lwork, *ierror;
     /* ..
      *  .. Array Arguments ..
      */
     int         desc_data[], desc_ovrlap[];
     double       Y[], Sum_Ovrlap[], work[];

{
  /*
   *  Purpose
   *
   *************************
   *  =======
   *  Parameters
   *  ==========
   *
   *  Y       (local input/local output) REAL array
   *          containing the local pieces of a distributed dense matrix of
   *          dimension DIM_MATRX_A x ncol
   *          This array contains the entries of the distributed vector
   *          sub( Y ).  On exit, sub( Y ) is overwritten by the updated
   *          distributed vector sub( Y ).
   *
   *  IY      (global input) pointer to INTEGER
   *          The global row index of the submatrix of the distributed
   *          matrix Y to operate on.
   *
   *  JY      (global input) pointer to INTEGER
   *          The global column index of the submatrix of the distributed
   *          matrix Y to operate on.
   *
   *  INCY    (global input) pointer to INTEGER
   *          The global increment for the elements of Y. Only two values
   *          of INCY are supported in this version, namely 1 and DIM_MATRX_A.
   *
   *  DESCDATA (global and local input) INTEGER array of dimension 8.
   *          The array descriptor of the decomposition.
   *
   *  DESCDOMAINS (local input) INTEGER array. The array descriptor of
   *		  local domains.
   *
   *  DESCTORECV (local input) INTEGER array. The array descriptor of the
   *		  elemnts to receive from other domains.
   *
   *  DESCTOSEND (local input) INTEGER array. The array descriptor of the
   *		 element to send to other domains.
   *
   *  DOMAINSMAP (global input) INTEGER array. The map array of the domains
   *		 on processor.
   *
   *  WORK       (local input/output) REAL array. Work area to memorize intermediate 
   *		 results.
   * 
   *  =====================================================================
   *
   *  .. Local Scalars ..
   */
  int domain;
  int ictxt;
  int nprow, npcol, myrow, mycol;
  int point_to_proc;		/* Pointer to the current process to communicate to */
  int proc_to_comm;		/* Processor to communicate to */
  int ndim;			/* Dimension of vector to send/receive */
  double *X;			/* Work Array */
  int WorkSize;			/* Dimension of Work area */
  int i, loc_dom;
  int err, info=0;
  int int_err[5];
  double real_err[5];

  ictxt = desc_data[CTXT_];
  Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );

#ifdef DEBUG_SWAP_
  printf("Sono entrato in PSI_dSwapOverlap\n");
#endif

  /* Initialize Work Area */
  point_to_proc = 0;
  
  proc_to_comm = desc_ovrlap[PROC_ID_];

  WorkSize = 0;

  /* For each processor to send/receive to/from */
  while (proc_to_comm != -1) {
    /* Update size of Worka Area */
    WorkSize = max(WorkSize, desc_ovrlap[point_to_proc + N_OVRLP_ELEM_]);
    
    /* Update proc_to_send and proc_to_recv */
    point_to_proc += desc_ovrlap[point_to_proc + N_OVRLP_ELEM_]*2+3;
    
    proc_to_comm = desc_ovrlap[point_to_proc + PROC_ID_];
  }
  
  *ierror = 0;
  /* Alloc memory */
  if (WorkSize > *lwork)  {
    if ((X = (double *)malloc(WorkSize*sizeof(double))) == NULL)
      *ierror = WorkSize*sizeof(double);
#ifdef PS_CONTROL_LEVEL
    printf("Alloc memory in PSI_dSwapOverlap: %d elements",WorkSize);
#endif
  } else
    X = (double *)work;
  
  /* Check for ERRORS */
  err = *ierror;
  Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, X, X, -1 ,-1,-1);
  
  if (err)
    return;

  point_to_proc = 0;
  
  proc_to_comm = desc_ovrlap[PROC_ID_];
  
  /* For each processor to send/receive to/from */
  while (proc_to_comm != -1) {
    if (proc_to_comm < myrow) {
      /* First, I send */
      Extract(Y, X, point_to_proc, desc_ovrlap);
      Send_Data(X,desc_ovrlap[point_to_proc+N_OVRLP_ELEM_],proc_to_comm,ictxt);
      /* then, I receive */
      Receive_Data(X,desc_ovrlap[point_to_proc+N_OVRLP_ELEM_],proc_to_comm,ictxt);
      Update_Data(Sum_Ovrlap, X, point_to_proc, desc_ovrlap);
    } else if (proc_to_comm > myrow) {
      /* First, I receive */
      Receive_Data(X,desc_ovrlap[point_to_proc+N_OVRLP_ELEM_],proc_to_comm,ictxt);
      Update_Data(Sum_Ovrlap, X, point_to_proc, desc_ovrlap);
      /* then, I send */
      Extract(Y, X, point_to_proc, desc_ovrlap);
      Send_Data(X,desc_ovrlap[point_to_proc+N_OVRLP_ELEM_],proc_to_comm,ictxt);
    } else {
      /* I send to myproc */
      Extract(Y, X, point_to_proc, desc_ovrlap);
      /* Update Data */
      /*	  point_to_proc += desc_ovrlap[point_to_proc + N_OVRLP_ELEM_]+2;
		  if (desc_ovrlap[point_to_proc + PROC_ID_] != myrow)
		  {
		  psberror("Error in desc_ovrlap: Receive indices for internal exchange are incorrect",&desc_ovrlap[point_to_proc + PROC_ID_],"%d",&myrow,"%d");
		  exit(1);
		  }
		  else
      */
      Update_Data(Sum_Ovrlap, X, point_to_proc, desc_ovrlap);
    }
    
    /* Update proc_to_comm and proc_to_comm */
    point_to_proc += desc_ovrlap[point_to_proc + N_OVRLP_ELEM_]*2+3;
    
    proc_to_comm = desc_ovrlap[point_to_proc + PROC_ID_];
  }
  
  /* Free memory if necessary */
  if (WorkSize > *lwork)
    free(X);
}

static void Update_Data(Sum_Ovrlap, X, point_to_proc, desc_ovrlap)
     double Sum_Ovrlap[]; /* (input/output) Local array of sum of Overlapped elements */
     double X[]; /* (input) Array of elements to insert in Y */
     int point_to_proc; /* (input) Pointer to desc_ovrlap array.
			    In input it points to the block of indexs of elements
			    to update.
			    */
     int desc_ovrlap[]; /* (input) Array of blocks of indexes */
{
  /* PURPOSE: Updates elements in Y with values in X */
  /* Local scalars */
  int i;

  for (i = 0; i < desc_ovrlap[point_to_proc + N_OVRLP_ELEM_]; i++)
    Sum_Ovrlap[desc_ovrlap[point_to_proc + ELEM_RECV_ + i]-1] += X[i];
}

static void Extract(Y, X, point_to_proc, desc_ovrlap)
     double Y[]; /* (input) Local array of elements */
     double X[]; /* (output) Array of elements extracted from Y */
     int point_to_proc; /* (input) Pointer to the desc_ovrlap array.
			    It points to the block of indexes of elements
			    to extract. */
     int desc_ovrlap[]; /* (input) Array of blocks of indexes */
{
  /* PURPOSE: Extracts elements to send from array Y, to array X */
  
  /* Local scalars */
  int i, ndim;
  
  ndim = desc_ovrlap[point_to_proc + N_OVRLP_ELEM_];
  /* Extract elements to send */
  for (i = 0 ; i < ndim ; i++)
    X[i] = Y[desc_ovrlap[point_to_proc + ndim + ELEM_SEND_ + i]-1] ;
}
