/* ---------------------------------------------------------------------
*
*  -- PSBLAS routine (version 1.0) --
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "psblas.h"
#include <stdlib.h>
#include <memory.h>

/* #define DEBUGSPMM__ */

void psdspmm_( trans, m, n, k, alpha, pr, A, ia1, ia2, fida, descra, infoa,
	      pc, ia, ja, X, ix, jx, lldx, beta, Y, iy, jy, lldy,
	      desc_data, desc_halo, desc_ovrlap, desc_ovrlap_elem, 
	      work, lwork)
     /*
      *  .. Scalar Arguments ..
      */
     F_CHAR      trans;
     int         * ia, * ix, * iy, * ja, * jx, * jy, * m, * lldx, * lldy,
     * n, * k, * lwork;
     double       * alpha, * beta;
     /* ..
      *  .. Array Arguments ..
      */
     int         desc_data[], desc_halo[], desc_ovrlap[], desc_ovrlap_elem[] ;
     int	 ia1[], ia2[], infoa[], pr[], pc[];
     char	 fida[5], descra[11];
     double       A[], X[], Y[], work[];
{
  /*
   *  Purpose
   *  =======
   *
   *  PSDPSMM performs one of the distributed matrix-vector operations
   *
   *     sub( Y ) := alpha*Pr*sub( A )*Pc* sub( X )  + beta*sub( Y ),  or
   *     sub( Y ) := alpha*Pr*sub( A )'*Pr* sub( X )  + beta*sub( Y ),
   *
   *  where sub( A ) denotes A(IA:IA+M-1,JA:JA+N-1),
   *
   *        sub( X ) denotes if TRANS = 'N',
   *                       X(IX:IX+N-1,JX:JX+K-1),
   *                     else
   *                       X(IX:IX+M-1,JX:JX+K-1).
   *                     end if
   *
   *        sub( Y ) denotes if trans = 'N',
   *                       Y(IY:IY+M-1,JY:JY),
   *                     else
   *                       Y(IY:IY+N-1,JY:JY)
   *                     end if
   *
   *  alpha and beta are scalars, and sub( X ) and sub( Y ) are distributed
   *  vectors and sub( A ) is a M-by-N distributed submatrix.
   *
   *  Notes
   *  =====
   *
   *  Some description vectors are associated with each distributed sparse
   *  matrix. These vectors stores the information required to the
   *  communication needed to perform distributed operations.
   *
   *  They are:
   *
   *  NAME		 EXPLANATION
   *  ------------------ -------------------------------------------------------
   *  MATRIX_DATA	 Array of INTEGER that contains some local and global
   *			 information of matrix.
   *
   *  DESC_HALO          Array of INTEGER that contains informations for local 
   *                     halo points.
   *
   *  DESC_OVRLAP        Array of INTEGER that contains a list of local overlap 
   *                     elements.
   *                     
   *  OVRLAP_ELEM        Array of INTEGER that contains a list of local overlap
   *                     elements with their multiplicity.
   *
   *  Now we explain each of the above vectors.
   *
   *  Let A be a generic sparse matrix. We denote with MATDATA_A the MATRIX_DATA
   *  array for matrix A.
   *  Data stored in MATRIX_DATA array are:
   *
   *  NOTATION        STORED IN		     EXPLANATION
   *  --------------- ---------------------- -------------------------------------
   *  DEC_TYPE        MATDATA_A[DEC_TYPE_]   Decomposition type
   *  M 	      MATDATA_A[M_]          Total number of equations
   *  N 	      MATDATA_A[N_]          Total number of variables
   *  N_ROW           MATDATA_A[N_ROW_]      Number of local equations
   *  N_COL           MATDATA_A[N_COL_]      Number of local variables
   *  CTXT_A          MATDATA_A[CTXT_]       The BLACS context handle, indicating
   *					     the global context of the operation
   *					     on the matrix.
   *					     The context itself is global.
   *
   *  Let DESCHALO_P be the array DESC_HALO for local process.
   *  This is composed of variable dimension blocks for each process to 
   *  communicate to.
   *  Each block contain indexes of local halo elements to exchange with other 
   *  process.
   *  Let P be the pointer to the first element of a block in DESCHALO_P.
   *  This block is stored in DESCHALO_P as :
   *
   *  NOTATION        STORED IN		          EXPLANATION
   *  --------------- --------------------------- -----------------------------------
   *  PROCESS_ID      DESCHALO_P[P+PROC_ID_]      Identifier of process which exchange 
   *						  data with.
   *  N_ELEMENTS_RECV DESCHALO_P[P+N_ELEM_RECV_]  Number of elements to receive.
   *  ELEMENTS_RECV   DESCHALO_P[P+ELEM_RECV_+i]  Indexes of local elements to
   *					          receive. These are stored in the
   *					          array from location P+ELEM_RECV_ to
   *					          location P+ELEM_RECV_+
   *						  DESCHALO_P[P+N_ELEM_RECV_]-1.
   *  N_ELEMENTS_SEND DESCHALO_P[P+N_ELEM_SEND_]  Number of elements to send.
   *  ELEMENTS_SEND   DESCHALO_P[P+ELEM_SEND_+i]  Indexes of local elements to
   *					          send. These are stored in the
   *					          array from location P+ELEM_SEND_ to
   *					          location P+ELEM_SEND_+
   *						  DESCHALO_P[P+N_ELEM_SEND_]-1.
   *
   *
   *  Let DESCOVRLP_P be the array DESC_OVRLAP for local process.
   *  This is composed of variable dimension blocks for each process to 
   *  communicate to.
   *  Each block contain indexes of local overlap elements to exchange with
   *  other process.
   *  Let P be the pointer to the first element of a block in DESCOVRLP_P.
   *  This block is stored in DESCOVRLP_P as :
   *
   *  NOTATION        STORED IN		            EXPLANATION
   *  ------------- ------------------------------- -----------------------------------
   *  PROCESS_ID    DESCOVRLP_P[P+PROC_ID_]         Identifier of process which exchange
   *						    data with.
   *  N_OVRLAP_ELEM DESCOVRLP_P[P+N_OVRLP_ELEM_]    Number of elements to exchange.
   *  OVRLAP_ELEM   DESCOVRLP_P[P+OVRLP_ELEM_TO_+i] Indexes of local elements to
   *					            exchange. These are stored in the
   *					            array from location P+OVRLP_ELEM_ to
   *					            location P+OVRLP_ELEM_+
   *						    DESCOVRLP_P[P+N_OVRLP_ELEM_]-1.
   *
   *
   *  Let OVR_ELEM_P be the array OVERLAP_ELEM for local process.
   *  This is composed of blocks of two elements. The block
   *  corresponding to the i-th overlapped elements, begin at index 
   *  P = i*2 in array OVR_ELEM_P.
   *  This block is stored in OVR_ELEM_P as :
   *
   *  NOTATION      STORED IN		       EXPLANATION
   *  ------------- -------------------------- ----------------------------------
   *  OVRLAP_ELEM   OVR_ELEM_P[P+OVRLP_ELEM_]  The index of local overlapped 
   *					       element.
   *  N_DOMAINS     OVR_ELEM_P[P+N_DOM_OVR_]   The number of copies of
   *					       local overlapped element.
   *
   *  Parameters
   *  ==========
   *
   *  TRANS   (global input) pointer to CHARACTER
   *          On entry, TRANS specifies the operation to be performed as
   *          follows:
   *
   *          if TRANS = 'N' or 'n',
   *          sub( Y ) := alpha*sub( A )  * sub( X ) + beta*sub( Y ),
   *
   *          else if TRANS = 'T' or 't',
   *          sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ),
   *
   *          else if TRANS = 'C' or 'c',
   *          sub( Y ) := alpha*sub( A )' * sub( X ) + beta*sub( Y ).
   *
   *  M       (global input) pointer to INTEGER
   *          The number of rows to be operated on i.e the number of rows
   *          of the distributed submatrix sub( A ). M >= 0.
   *
   *  N       (global input) pointer to INTEGER
   *          The number of columns to be operated on i.e the number of
   *          columns of the distributed submatrix sub( A ). N >= 0.
   *
   *  K       (global input) pointer to INTEGER
   *          The number of columns of dense matrix Y to operate on.
   *
   *  ALPHA   (global input) pointer to REAL
   *          On entry, ALPHA specifies the scalar alpha.
   *
   *  PR      (local input) array of INTEGER
   *          Permutation matrix.
   *
   *  A       (local input) REAL pointer into the local memory
   *          to an array containing the values of
   *          local pieces of the distributed sparse matrix sub( A ).
   *
   *  IA1     (local input) INTEGER array containing the column
   *          coordinates of the values of local pieces of the
   *          distributed sparse matrix sub( A ).
   *
   *  IA2     (local input) INTEGER array containing the coordinates
   *          of the first element of a row of the
   *          distributed sparse matrix sub( A ) in the array of values 'A'.
   *          The coordinates of the first element of row i of the
   *          distributed sparse matrix sub( A ) in the array of values 'A' 
   *          is IA2[i].
   *
   *  FIDA    (global input) pointer to CHARACTER containing a string
   *          that indicates the type of sparse matrix representation.
   *
   *  DESCRA  (global input) pointer to CHARACTER containing the type of
   *          sparse matrix.
   *             DESCRA[1] = 'G' General
   *             DESCRA[1] = 'S' Symmetric
   *             DESCRA[1] = 'H' Hermitian
   *             DESCRA[1] = 'T' Triangular
   *             DESCRA[1] = 'A' Anti-symmetric
   *             DESCRA[1] = 'D' Diagonal
   *             DESCRA[2] = 'U' Upper
   *             DESCRA[2] = 'L' Lower
   *             DESCRA[3] = 'U' Unit diagonal
   *             DESCRA[3] = 'N' Non Unit diagonal
   *
   *  INFOA   (global output) INTEGER array containing user defined 
   *          information.
   *
   *  PC      (local input) array of INTEGER
   *          Permutation matrix.
   *
   *  IA      (global input) pointer to INTEGER
   *          The global row index of the submatrix of the distributed
   *          sparse matrix A to operate on.
   *
   *  JA      (global input) pointer to INTEGER
   *          The global column index of the submatrix of the distributed
   *          sparse matrix A to operate on.
   *
   *  X       (local input) REAL array containing the local
   *          pieces of a distributed matrix.
   *          This array contains the entries of the distributed vector
   *          sub( X ).
   *
   *  IX      (global input) pointer to INTEGER
   *          The global row index of the submatrix of the distributed
   *          matrix X to operate on.
   *
   *  JX      (global input) pointer to INTEGER
   *          The global column index of the submatrix of the distributed
   *          matrix X to operate on.
   *
   *  LLDX    (local input) pointer to INTEGER
   *          The leading dimension of local dense matrix X.
   *
   *  BETA    (global input) pointer to REAL
   *          On entry,  BETA  specifies the scalar  beta.  When  BETA  is
   *          supplied as zero then sub( Y ) need not be set on input.
   *
   *  Y       (local input/local output) REAL array
   *          containing the local pieces of a distributed matrix.
   *          This array contains the entries of the distributed vector
   *          sub( Y ).  On exit, sub( Y ) is overwritten by the updated
   *          distributed vector sub( Y ).
   *
   *  IY      (global input) pointer to INTEGER
   *          The global row index of the submatrix of the distributed
   *          matrix Y to operate on.
   *
   *  JY      (global input) pointer to INTEGER
   *          The global column index of the submatrix of the distributed
   *          matrix Y to operate on.
   *
   *  LLDX    (local input) pointer to INTEGER
   *          The leading dimension of local dense matrix Y.
   *
   *  DESCDATA (global and local input) INTEGER array. Is the MATRIX_DATA
   *           array.
   *
   *  DESCHALO (local input) INTEGER array. Is the DESC_HALO array.
   *
   *  DESCOVRLAP (local input) INTEGER array. Is the DESC_OVRLAP array.
   *
   *  DESCOVRLAPELEM (local input) INTEGER array. Is the OVRLAP_ELEM array.
   *
   *  WORK       (local input) REAL array. Work area to memorize intermediate 
   *		 results.
   *
   *  LWORK      (local input) pointer to INTEGER. Dimension of Work area.
   *
   *  =====================================================================
   *
   *  .. Local Scalars ..
   */
  char        TrA;
  int         ictxt, iia, iix, iiy;
  int         jja, jjx, jjy, lwork1, Size;
  int         mycol, myrow, nota, nprow, npcol, i, j;
  int         ovr_points;

  /* Error variables */
  int         info=0, ierror=0,err;
  int         int_err[5];
  double      real_err[5];

  /*
   * .. Local Array
   */
  double *tmpx, *tmpy, *work1, *Xcopy, *tmpXcopy, *Ycopy, *tmpYcopy;

  void Update_Ovrlap_points(int k, double *tmpy, int lldy, double *Xcopy, int lldx,
			    int desc_ovrlap_elem[]) ;

  /* ..
   *  .. External Functions ..
   */
  void        blacs_gridinfo_();
  void        pbchkmat();
  void        pbchkvect();
  void        pberror_();
  void        dcsmm();
  void        psberror();
  /* ..
   *  .. Executable Statements ..
   *
   *  Get grid parameters
   */
  ictxt = desc_data[CTXT_];
  Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
  /*
   *  Test the input parameters
   */
  info = 0;
  if( nprow == -1 )
    info = 2010;
  else {
    TrA = Mupcase( F2C_CHAR( trans )[0] );
    nota = ( TrA == 'N' );
    
    if ((*lldx != desc_data[N_COL_])||(*lldy != desc_data[N_COL_]))
      info = 3010;
    
    if( (TrA != 'N') && (TrA != 'T') && (TrA != 'C')) {
      info = 70;	        /* Error Code */
      int_err[0] = 1;	/* Argument position */
    } else if( TrA == 'C') {
      info = 3020;	/* Error Code */
      int_err[0] = 1;	/* Argument position */
    }
      
    pbchkmat( *m, 2, *n, 3, *ia, *ja, desc_data, 25, 7, &iia, &jja,
	      &info, int_err);
    
    if( nota ) {
      pbchkvect( *n, *k, 3, *ix, *jx, *lldx, desc_data, 25, 16, &iix, &jjx,
		 &info, int_err);
      pbchkvect( *m, *k, 2, *iy, *jy, *lldy, desc_data, 25, 21, &iiy, &jjy,
		 &info, int_err);
    } else {
      pbchkvect( *m, *k, 2, *ix, *jx, *lldx, desc_data, 25, 16, &iix, &jjx,
		 &info, int_err);
      pbchkvect( *n, *k, 3, *iy, *jy, *lldy, desc_data, 25, 21, &iiy, &jjy,
		 &info, int_err);
    }
      
    err = info;
    Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, tmpx, tmpx, -1 ,-1,-1);
    
    if( err == 0 ) {
      if( nota )  {
	if ((*ja == *ix)&&(*ia == *iy))	/* No communications */ {
	  /* Initialize work array */
	  if ((iix == 1)&&(iiy == 1))		/* Columns vectors */
	    if (desc_data[N_ROW_] > *lwork)  {
	      /* Allocate memory */
	      if ((work1 =
		   (double *)malloc(desc_data[N_ROW_]*sizeof(double))) == NULL) {
		info = 2020;
		int_err[0] = desc_data[N_ROW_]*sizeof(double);
	      }
	      
	      work1[0] = 0;
	      lwork1 = desc_data[N_ROW_];
#ifdef PS_CONTROL_LEVEL
	      printf("Alloc %d elements in psdspmm",desc_data[N_ROW_]);
#endif
	    } else {
	      work1 = work;
	      work1[0] = 0;
	      lwork1 = *lwork;
	    }
	  else
	    info = 3040;
	  
	  /* Check for ERRORS */
	  err = info;
	  Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, tmpx, tmpx, -1 ,-1,-1);
	  
	  if (err != 0) {
	    psderror_( &ictxt, &info, "PSDSPMM\0", int_err, real_err );
	    return;
	  }
	  
	  /* Matrix-Vector Product */
	  if ((iix == 1)&&(iiy == 1)) {
	    tmpx = &X[(jjx-1)*(*lldx)+iix-1]; /* Pointer to the local X vector */
	    tmpy = &Y[(jjy-1)*(*lldy)+iiy-1]; /* Pointer to the local Y vector */
		      
#ifdef DEBUGSPMM__
	    printf("DSPMM:Vector X\n");
	    for (i = 0; i < desc_data[N_COL_]; i++)
	      printf ("X[%d] = %lf\n", i+1, tmpx[i]);
	    
	    printf("Vector Y\n");
	    for (i = 0; i < desc_data[N_COL_]; i++)
	      printf ("Y[%d] = %lf\n", i+1, tmpy[i]);
#endif
	    dcsmm(&TrA, &desc_data[N_ROW_], k, &desc_data[N_COL_],
		  alpha, pr, fida, descra, A, ia1, ia2, infoa, pc,
		  tmpx, lldx, beta, tmpy, lldy, work1, &lwork1, &ierror);
	  }
	  
	  /* Check for ERRORS */
	  if (ierror != 0) {
	    info = 2010;
	    int_err[0] = ierror;
	  }
	  
	  err = info;
	  Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, tmpx, tmpx, -1 ,-1,-1);
	  
	  if (err != 0) {
	    psderror_( &ictxt, &info, "PSDSPMM\0", int_err, real_err );
	    return;
	  }
	  
#ifdef DEBUGSPMM__
	  printf("DSPMM:After local computations\n");
	  printf("Vector X\n");
	  for (i = 0; i < desc_data[N_COL_]; i++)
	    printf ("X[%d] = %lf\n", i+1, tmpx[i]);
	  
	  printf("Vector Y\n");
	  for (i = 0; i < desc_data[N_COL_]; i++)
	    printf ("Y[%d] = %lf\n", i+1, tmpy[i]);
#endif
	  /* Update Overlap */
	  if ((iix == 1)&&(iiy == 1)) /* Columns vectors */ {
#ifdef DEBUGSPMM__
	    printf("DSPMM:After local computations\n");
	    printf("Vector X\n");
	    for (i = 0; i < desc_data[N_COL_]; i++)
	      printf ("X[%d] = %lf\n", i+1, tmpx[i]);
	    
	    printf("Vector Y\n");
	    for (i = 0; i < desc_data[N_COL_]; i++)
	      printf ("Y[%d] = %lf\n", i+1, tmpy[i]);
#endif
	    /* Update Overlap and Halo */
	    tmpy = &Y[(jjy-1)*(*lldy)+iiy-1]; /* Pointer to the local Y vector */
	    PSI_dSwapData(*k, tmpy, *lldy, desc_data, desc_halo, work1, &lwork1, &info) ;
	    /* Check for allocation error in PSI_dSwapData */
	    if (info != 0) {
	      err = 1;
	      if (info < 0)
		info = 0 ;
	      else {
		int_err[0] = info;
		info = 2020;
	      }
	      
	    }
	    
	    /* Free Work area */
	    if (desc_data[N_ROW_] > *lwork)
	      free(work1);
	  }
	}
	else
	  info = 3030;
      } else { /* TRANS = T */
	if ((*ja == *iy)&&(*ia == *ix)) /* No communications */
	  if (iia <= desc_data[N_ROW_]) 
	    /* I have the portion of vector Y to operate on */
	    if (jja <= desc_data[N_COL_]) {
	      /* I have the portion of vector X to operate on */	      
	      ovr_points = (desc_ovrlap_elem[0] != -1);
	      
	      /* Initialize work array */
	      if ((iix == 1)&&(iiy == 1)) /* Columns vectors */
		if (ovr_points)  {
		  Size = desc_data[N_ROW_]*(*k);
		  
		  if (desc_data[N_ROW_] > *lwork) /* Work area too small */  {
				/* Allocate memory for work1 */
		    if ((work1 = (double *)
			 malloc(desc_data[N_ROW_]*sizeof(double))) == NULL) {
		      info = 2020;
		      int_err[0] = desc_data[N_ROW_]*sizeof(double);
		    }
		    
		    work1[0] = 0;
		    lwork1 = desc_data[N_ROW_];
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",desc_data[N_ROW_]);
#endif
		    
				/* Allocate memory for Xcopy */
		    if ((Xcopy = (double *)
			 malloc(Size*sizeof(double)))
			== NULL)  {
		      info = 2020;
		      int_err[0] = Size*sizeof(double);
		    }
		    
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",Size);
#endif
		    
		    /* Allocate memory for Ycopy */
		    if ((Ycopy = (double *)
			 malloc(Size*sizeof(double)))
			== NULL) {
		      info = 2020;
		      int_err[0] = Size*sizeof(double);
		    }
		    
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",Size);
#endif
		  } else if (desc_data[N_ROW_]+Size > *lwork) {
		/* Initialize array work1 */
		    work1 = work;
		    work1[0] = 0;
		    lwork1 = *lwork;
		    /* Allocate memory for Xcopy */
		    if ((Xcopy = (double *)
			 malloc(Size*sizeof(double)))
			== NULL) {
		      info = 2020;
		      int_err[0] = Size*sizeof(double);
		    }
		    
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",Size);
#endif
		    
		    /* Allocate memory for Ycopy */
		    if ((Ycopy = (double *)
			 malloc(Size*sizeof(double)))
			== NULL) {
		      info = 2020;
		      int_err[0] = Size*sizeof(double);
		    }
		    
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",Size);
#endif
		  } else if (desc_data[N_ROW_]+2*Size > *lwork) {
		    /* Initialize array Xcopy */
		    Xcopy = work;
		    /* Initialize array work1 */
		    work1 = &work[Size];
		    work1[0] = 0;
		    lwork1 = *lwork - Size;
		    
		    /* Allocate memory for Ycopy */
		    if ((Ycopy = (double *)
			 malloc(Size*sizeof(double)))
			== NULL) {
		      info = 2020;
		      int_err[0] = Size*sizeof(double);
		    }
		    
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",Size);
#endif
		  } else /* Work area is big enough */  {
		    /* Initialize array Xcopy */
		    Xcopy = work;
		    /* Initialize array Xcopy */
		    Ycopy = &work[Size];
		    /* Initialize array work1 */
		    work1 = &work[2*Size];
		    work1[0] = 0;
		    lwork1 = *lwork - 2*Size;
		  }
		} else	/* No OVERLAP */
		  if (desc_data[N_ROW_] > *lwork) /* Work area too small */ {
		    /* Allocate memory for work1 */
		    if ((work1 = (double *)
			 malloc(desc_data[N_ROW_]*sizeof(double))) == NULL) {
		      info = 2020;
		      int_err[0] = desc_data[N_ROW_]*sizeof(double);
		    }
		    
		    work1[0] = 0;
		    lwork1 = desc_data[N_ROW_];
#ifdef PS_CONTROL_LEVEL
		    printf("PSDSPMM: Alloc %d elements\n",desc_data[N_ROW_]);
#endif
		  } else  {
		    /* Initialize array work1 */
		    work1 = work;
		    work1[0] = 0;
		    lwork1 = *lwork;
		  }
	      else
		info = 3040;
	      
	      /* Check for ERRORS */
	      err = info;
	      Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, tmpx, tmpx, -1 ,-1,-1);
	      
	      if (err != 0) {
		psderror_( &ictxt, &info, "PSDSPMM\0", int_err, real_err );
		return;
	      }
	      
	      /* Initialize Xcopy, Ycopy and Y vectors */
	      if ((iix == 1)&&(iiy == 1)) /* Columns vectors */
		if (ovr_points)
		  for (j = 0; j < *k; j++) {
		    /* Pointer to the local X vector */
		    tmpx = &X[(jjx+j-1)*(*lldx)+iix-1];
		    
		    /* Pointer to the local vector */
		    tmpy = &Y[(jjy+j-1)*(*lldy)+iiy-1];
		    tmpXcopy = &Xcopy[j*desc_data[N_ROW_]];		    
		    tmpYcopy = &Ycopy[j*desc_data[N_ROW_]];
		    
		    /* Update Xcopy */
		    memcpy(tmpXcopy, tmpx, 
			   desc_data[N_ROW_]*sizeof(double));
		    
		    i = 0;
		    while (desc_ovrlap_elem[i] != -1) {
		      tmpXcopy[desc_ovrlap_elem[i+OVRLP_ELEM_]-1] /= 
			desc_ovrlap_elem[i+N_DOM_OVR_];
		      i += 2;
		    }
		    
		    /* Update Ycopy */
		    memcpy(tmpYcopy, tmpy, 
			   desc_data[N_ROW_]*sizeof(double));
		    
		    for (i = desc_data[N_ROW_]; i < desc_data[N_COL_]; i++)
		      tmpy[i] = 0.0;
		  } else	/* No OVERLAP */   {
		    for (j = 0; j < *k; j++)  {
				/* Pointer to the local vector */
		      tmpy = &Y[(jjy+j-1)*(*lldy)+iiy-1];
		      
		      for (i = desc_data[N_ROW_]; i < desc_data[N_COL_]; i++)
			tmpy[i] = 0.0;
		    }
		    
		    /* Pointer to the local X vector */
		    tmpx = &X[(jjx-1)*(*lldx)+iix-1];
		    Xcopy = tmpx;
		  }
	      
	      if ((iix == 1)&&(iiy == 1)) /* Columns vectors */	{
		/* Pointer to the local vector */
		tmpy = &Y[(jjy-1)*(*lldy)+iiy-1];
		
		/* Matrix-Vector Product */
#ifdef DEBUGSPMM__
		printf("DSPMM:Vector X\n");
		for (i = 0; i < desc_data[N_COL_]; i++)
		  printf ("X[%d] = %lf\n", i+1, tmpXcopy[i]);
		
		printf("Vector Y\n");
		for (i = 0; i < desc_data[N_COL_]; i++)
		  printf ("Y[%d] = %lf\n", i+1, tmpy[i]);
#endif
		
		dcsmm(&TrA, &desc_data[N_COL_], k, &desc_data[N_ROW_],
		      alpha, pr, fida, descra, A, ia1, ia2, infoa, pc,
		      Xcopy, &desc_data[N_ROW_], beta, tmpy, lldy,
		      work1, &lwork1, &ierror);
		
		/* Check for ERRORS */
		if (ierror != 0)   {
		  info = 2010;
		  int_err[0] = ierror;
		}
	      }
	      
	      err = info;
	      Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, tmpx, tmpx, -1 ,-1,-1);
	      
	      if (err != 0) {
		psderror_( &ictxt, &info, "PSDSPMM\0", int_err, real_err );
		return;
	      }
#ifdef DEBUGSPMM__
	      printf("DSPMM:After local computations\n");
	      printf("Vector X\n");
	      for (i = 0; i < desc_data[N_COL_]; i++)
		printf ("X[%d] = %lf\n", i+1, tmpXcopy[i]);
	      
	      printf("Vector Y\n");
	      for (i = 0; i < desc_data[N_COL_]; i++)
		printf ("Y[%d] = %lf\n", i+1, tmpy[i]);
#endif
	      
	      
	      if ((iix == 1)&&(iiy == 1)) /* Columns vectors */	{
		if (ovr_points) {
		  /* Pointer to the local X vector */
		  tmpx = &X[(jjx-1)*(*lldx)+iix-1];		  
		  dcsmm(&TrA, &desc_data[N_ROW_], k, &desc_data[N_ROW_],
			alpha, pr, fida, descra, A, ia1, ia2, infoa, pc,
			tmpx, &desc_data[N_ROW_], beta, Ycopy, 
			&desc_data[N_ROW_], 
			work1, &lwork1, &ierror);
		  
		  
		  Update_Ovrlap_points(*k, tmpy, *lldy, Ycopy, 
				       desc_data[N_ROW_], 
				       desc_ovrlap_elem);
		  
#ifdef DEBUGSPMM__
		  printf("DSPMM:After Update second prod\n");
		  printf("Vector Ycopy\n");
		  for (i = 0; i < desc_data[N_COL_]; i++)
		    printf ("Ycopy[%d] = %lf\n", i+1, Ycopy[i]);
		  
		  printf("Vector Y\n");
		  for (i = 0; i < desc_data[N_COL_]; i++)
		    printf ("Y[%d] = %lf\n", i+1, Y[i]);
#endif
		}
	      }
	      
	      /* Update Halo points */
	      
	      tmpy = &Y[(jjy-1)*(*lldy)+iiy-1]; 
	      PSI_dSwapTran(*k, tmpy, *lldy, desc_data, desc_halo, desc_ovrlap,
			    desc_ovrlap_elem, work1, &lwork1);
	      
	      PSI_dSwapData(*k, tmpy, *lldy, desc_data, desc_halo, work1, &lwork1, &info);
	      
	      /* Check for allocation error in PSI_dSwapData */
	      if (info != 0)  {
		err = 1;
		if (info < 0)
		  info = 0 ;
		else {
		  int_err[0] = info;
		  info = 2020;
		}
	      }
	      

	      	      
	      /* Free Memory */
	      if (ovr_points)
		if (desc_data[N_ROW_] > *lwork) /* Work area too small */ {
		  free(work1);
		  free(Xcopy);
		  free(Ycopy);
		} else if (desc_data[N_ROW_]+Size > *lwork) {
		  free(Xcopy);
		  free(Ycopy);
		} else if (desc_data[N_ROW_]+2*Size > *lwork) {
		  free(Ycopy);
		}
		else ;
	      else	/* No OVERLAP */
		if (desc_data[N_ROW_] > *lwork) /* Work area too small */
		  /* Allocate memory for work1 */
		  free(work1);
	    }
      }
    }
  }
  if( (info != 0) || (err != 0)) {
    psderror_( &ictxt, &info, "PSDSPMM\0", int_err, real_err );
    return;
  }
  
}

void Update_Ovrlap_points(k, tmpy, lldy, Xcopy, lldx, desc_ovrlap_elem)
     int k, lldy, lldx;
     double tmpy[], Xcopy[];
     int desc_ovrlap_elem[];
{
  int i, point, indx;
  
  point = 0;
  indx = desc_ovrlap_elem[point+OVRLP_ELEM_]-1;

  while (indx != -1)
    {
      for (i = 0; i < k; i++)
	tmpy[i*lldy+indx] = Xcopy[i*lldx+indx];

      point += 2;

      indx = desc_ovrlap_elem[point+OVRLP_ELEM_]-1;
    }
}









