/* ---------------------------------------------------------------------
*
*  -- PSBLAS routine (version 1.0) --
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "psblas.h"
#include <stdio.h>
#include <stdlib.h>


/* This here selects another, and better, algorithm, in that: 
   1. It works well, whereas ours has to be fully debugged
   2. It  delegates the collective operation optimization to the BLACS, 
      so we don't have to do it ourselves. 
   However this means that globX has to be used on all processors, and 
   it will be dirty on all processors different from root (unless root == -1)
   We should probably cook up something with MPI_allgatherv. 
*/
#define LARGE_MEMORY

void psdgatherm_(m, n, globX,  iglobX, jglobX, lda_globX, 
		 locX, ilocX, jlocX,  lda_locX, 
		 root, desc_data, desc_halo, desc_overlap, overlap_elem,
		 loc_to_glob, work, lwork)
     /*
      *  .. Scalar Arguments ..
      */
     
     int          * iglobX, * jglobX, * lda_globX, * n, * m, * lwork;
     int          * ilocX, * jlocX, * lda_locX, *root;
     double       * globX, *locX;

     /* ..
      *  .. Array Arguments ..
      */
     int         desc_data[],desc_halo[],desc_overlap[],
                 overlap_elem[],loc_to_glob[];
     double         work[];
{
  /*
   *  Purpose
   *  =======
   *
   * psdgatherm collects dense matrix distributed among
   * processes in a global dense matrix belonging to root process.
  */
  
 
/*   *
   *  Notes
   *  =====
   *
   *  Some description vectors are associated with each distributed sparse
   *  matrix. These vectors stores the information required to the
   *  communication needed to perform distributed operations.
   *
   *  They are:
   *
   *  NAME		 EXPLANATION
   *  ------------------ -------------------------------------------------------
   *  MATRIX_DATA	 Array of INTEGER that contains some local and global
   *			 information of matrix.
   *
   *  DESC_HALO          Array of INTEGER that contains informations for local 
   *                     halo points.
   *
   *  DESC_OVERLAP        Array of INTEGER that contains a list of local overlap 
   *                     elements.
   *                     
   *  OVRLAP_ELEM        Array of INTEGER that contains a list of local overlap
   *                     elements with their multiplicity.
   *
   *  Now we explain each of the above vectors.
   *
   *  Let A be a generic sparse matrix. We denote with MATDATA_A the MATRIX_DATA
   *  array for matrix A.
   *  Data stored in MATRIX_DATA array are:
   *
   *  NOTATION        STORED IN		     EXPLANATION
   *  --------------- ---------------------- -------------------------------------
   *  DEC_TYPE        MATDATA_A[DEC_TYPE_]   Decomposition type
   *  M 	      MATDATA_A[M_]          Total number of equations
   *  N 	      MATDATA_A[N_]          Total number of variables
   *  N_ROW           MATDATA_A[N_ROW_]      Number of local equations
   *  N_COL           MATDATA_A[N_COL_]      Number of local variables
   *  CTXT_A          MATDATA_A[CTXT_]       The BLACS context handle, indicating
   *					     the global context of the operation
   *					     on the matrix.
   *					     The context itself is global.
   *
   *  Let DESCHALO_P be the array DESC_HALO for local process.
   *  This is composed of variable dimension blocks for each process to 
   *  communicate to.
   *  Each block contain indexes of local halo elements to exchange with other 
   *  process.
   *  Let P be the pointer to the first element of a block in DESCHALO_P.
   *  This block is stored in DESCHALO_P as :
   *
   *  NOTATION        STORED IN		          EXPLANATION
   *  --------------- --------------------------- -----------------------------------
   *  PROCESS_ID      DESCHALO_P[P+PROC_ID_]      Identifier of process which exchange 
   *						  data with.
   *  N_ELEMENTS_RECV DESCHALO_P[P+N_ELEM_RECV_]  Number of elements to receive.
   *  ELEMENTS_RECV   DESCHALO_P[P+ELEM_RECV_+i]  Indexes of local elements to
   *					          receive. These are stored in the
   *					          array from location P+ELEM_RECV_ to
   *					          location P+ELEM_RECV_+
   *						  DESCHALO_P[P+N_ELEM_RECV_]-1.
   *  N_ELEMENTS_SEND DESCHALO_P[P+N_ELEM_SEND_]  Number of elements to send.
   *  ELEMENTS_SEND   DESCHALO_P[P+ELEM_SEND_+i]  Indexes of local elements to
   *					          send. These are stored in the
   *					          array from location P+ELEM_SEND_ to
   *					          location P+ELEM_SEND_+
   *						  DESCHALO_P[P+N_ELEM_SEND_]-1.
   *
   *
   *  Let DESCOVRLP_P be the array DESC_OVERLAP for local process.
   *  This is composed of variable dimension blocks for each process to 
   *  communicate to.
   *  Each block contain indexes of local overlap elements to exchange with
   *  other process.
   *  Let P be the pointer to the first element of a block in DESCOVRLP_P.
   *  This block is stored in DESCOVRLP_P as :
   *
   *  NOTATION        STORED IN		            EXPLANATION
   *  ------------- ------------------------------- -----------------------------------
   *  PROCESS_ID    DESCOVRLP_P[P+PROC_ID_]         Identifier of process which exchange
   *						    data with.
   *  N_OVRLAP_ELEM DESCOVRLP_P[P+N_OVRLP_ELEM_]    Number of elements to exchange.
   *  OVRLAP_ELEM   DESCOVRLP_P[P+OVRLP_ELEM_TO_+i] Indexes of local elements to
   *					            exchange. These are stored in the
   *					            array from location P+OVRLP_ELEM_ to
   *					            location P+OVRLP_ELEM_+
   *						    DESCOVRLP_P[P+N_OVRLP_ELEM_]-1.
   *
   *
   *  Let OVR_ELEM_P be the array OVERLAP_ELEM for local process.
   *  This is composed of blocks of two elements. The block
   *  corresponding to the i-th overlapped elements, begin at index 
   *  P = i*2 in array OVR_ELEM_P.
   *  This block is stored in OVR_ELEM_P as :
   *
   *  NOTATION      STORED IN		       EXPLANATION
   *  ------------- -------------------------- ----------------------------------
   *  OVRLAP_ELEM   OVR_ELEM_P[P+OVRLP_ELEM_]  The index of local overlapped 
   *					       element.
   *  N_DOMAINS     OVR_ELEM_P[P+N_DOM_OVR_]   The number of copies of
   *					       local overlapped element.
   *
   *
   *  Parameters
   *  ==========
   *
   *  M       (global input) pointer to INTEGER.
   *          The number of rows of X to consider. M >= 0.
   *
   *  N	      (global input) pointer to INTEGER.
   *	      The number of column of X to consider.
   *
   *  globX   (local output valid only for root process) REAL array containing 
   *          all entries of global matrix.
   *
   *  iglobX  (local output valid only for root process) pointer to INTEGER
   *          The global row index of the array Globx
   *
   *  jglobX  (local output  valid only for root process) pointer to INTEGER
   *          The global column index  of the array Globx
   *
   *  lda_globX(local input) pointer to INTEGER
   *          The leading dimension of the array Globx.
   *
   *  locX    (local input) REAL array containing the local pieces of 
   *          a distributed matrix.
   *          This array contains the entries of the distributed vector
   *          sub( X ).
   *
   *  ilocX   (global input) pointer to INTEGER
   *          The global row index of the subvector of the distributed
   *          matrix X, to operate on.
   *
   *  JlocX   (global input) pointer to INTEGER
   *          The global column index of the subvector of the distributed
   *          matrix X to operate on.
   *
   *  lda_locX(local input) pointer to INTEGER
   *          The leading dimension of local dense matrix locX.
   *
   *  DESCDATA (global and local input) INTEGER array. Is the MATRIX_DATA
   *           array.
   *
   *  DESCHALO (local input) INTEGER array. Is the DESC_HALO array.
   *
   *  DESCOVRLAP (local input) INTEGER array. Is the DESC_OVERLAP array.
   *
   *  DESCOVRLAPELEM (local input) INTEGER array. Is the OVRLAP_ELEM array.
   *
   *  loc_to_glob (local input) INTEGER array. It contains for each local element i
   *               the coresponding global index. 
   *
   *  WORK       (local input) REAL array. Work area to memorize intermediate 
   *		 results.
   *
   *  LWORK      (local input) pointer to INTEGER. Dimension of Work area.
   *
   *  =====================================================================
*/
     /*
   *  .. Local Scalars ..
   */
  int nDim, ictxt, iilocX, jjlocX, tmpx, tmpy ,lwork1,proc,iroot,
    mycol, myrow, nprow, npcol, i, j, k,incx=1,max_size,Worksize,pnt_X3;
  int ione=1;
  
  /*
   * .. Local Array ..
    */
  double *tmplocX,*Result,*tmpglobX, *tmpRes;
  double *dwork1;
  int *X, *tmpindx, nrpi;
  /* ..
   *  .. External Functions ..
   */
  void        blacs_gridinfo();
  void        pbchkvect();
  void        pberror_();
  
  /* Error variables */
  int         info=0, ierror=0, err;
  int         int_err[5], llda, ln, llw;
  double      real_err[5];
  
  /* ..
   *  .. Executable Statements ..
   *
   *  Get grid parameters
   */
/*    fprintf(stderr,"PSGATHERM: Check:"); */
/*      for (i=0; i<=CTXT_; i++) fprintf(stderr," %d",desc_data[i]); */
/*    fprintf(stderr,"\n");  */
  ictxt = desc_data[CTXT_];
  Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
/*      fprintf(stderr,"PSDGATHERM: %d %d %d %d %d %d\n",ictxt, nprow, npcol, myrow, mycol, *root);  */
  /*
   *  Test the input parameters
   */
  info = 0;
  
  if ( nprow == -1 ) {
    info = 2010;
  } else  {
    pbchkglobvect( *m, *n, 1, *iglobX, *jglobX, *lda_globX, desc_data, 11, 3,
		   &info, int_err);
    
    if (info == 0) {
      pbchkvect( *m, *n, 1, *ilocX, *jlocX, *lda_locX, desc_data, 11, 6, 
		 &iilocX, &jjlocX, &info, int_err);
      iroot=*root;
      if (info == 0) {
	if ((iroot<-1)||(iroot>=nprow)) {
	  info=70;
	  int_err[0]=1;
	}      
      }
    }
    err = info;
    Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &err, 1, &tmpx, &tmpy, -1 ,-1,-1);
    info = err;   
 
    
    if (err == 0) {
      if ((*n != 0) && (*m != 0)) {
	if (iilocX == 1) {		/* Column vector */
#if defined(LARGE_MEMORY) 
	  /*  	    iroot=0; */
	  for (j=0; j<*n; j++) {
	    tmpglobX = &globX[j*(*lda_globX)]; 
	    for (i=0; i<*lda_globX; i++) {
	      tmpglobX[i] = 0.0;
	    }
	  }
	  for (j=0; j<*n; j++) {
	    tmpglobX = &globX[j*(*lda_globX)]; 
	    tmplocX = &locX[j*(*lda_locX)]; 
	    for (i=0; i<desc_data[N_ROW_]; i++) {
	      k = loc_to_glob[i] -1;
	      tmpglobX[k] = tmplocX[i];
	    }
	    i = 0;
	    while (overlap_elem[i] != -1)  {
	      k = loc_to_glob[overlap_elem[i+OVRLP_ELEM_]] -1;
	      tmpglobX[k] /= (double) overlap_elem[i+N_DOM_OVR_];
	      i += 2;
	    } 	      
	  }
	  Cdgsum2d( ictxt, ALL, TOPDEF, *m, *n, globX,
		    *lda_globX, iroot, mycol );	   
#else 	  
	  if (*root==-1) {
	    iroot = 0;
	  } else {
	    iroot = *root;
	  }

	  /* Update Overlap and Halo */
	  llda = *lda_locX;
	  ln = *n;
	  Cigamx2d(ictxt, ALL, TOPDEF, 1, 1, &llda, 1, &tmpx, &tmpy, -1 ,-1,-1);
	  
	  llw=(llda*ln);
	  if (llw>*lwork) {
	    Result = (double *)malloc(llw*sizeof(double));
	  } else {
	    Result = work;
	  }
	  tmpindx = (int*) malloc(llda*sizeof(int));
/*  	  fprintf(stderr,"myrow %d llda: %d ln: %d \n",myrow,llda,ln); */
	  for (j=0; j < *n; j++) {
	    tmplocX = &locX[(jjlocX+j-1)*(*lda_locX)+iilocX-1]; 
	    tmpRes  = &Result[j*llda];
	    for (i=0; i< desc_data[N_ROW_] ; i ++) {
	      tmpRes[i] = tmplocX[i];
	    }
	    i = 0;
	    while (overlap_elem[i] != -1) {
	      tmpRes[overlap_elem[i+OVRLP_ELEM_]-1] /= 
		overlap_elem[i+N_DOM_OVR_]; 
	      i += 2;
	    }
	  }
	  
	  if (iroot == myrow) {
	    max_size = 0;
	    nrpi     = desc_data[N_ROW_];
	    for (j=0; j < ln; j++)  {
	      tmpglobX = &globX[j*(*lda_globX)]; 
	      tmpRes   = &Result[j*llda];
	      for (i=0; i< *m; i++) {
		tmpglobX[i] = 0.0;
	      }
	      for (i=0; i<nrpi; i++) {
		tmpglobX[loc_to_glob[i]-1] += tmpRes[i];
	      }
	    }	    	    	   
		
	    for (proc=0; proc < nprow ; proc ++) {
	      if (proc!=myrow) {
/*  		fprintf(stderr,"myrow %d exchanging with : %d \n", */
/*  			myrow,proc); */
		Cigerv2d(ictxt,ione,ione,&nrpi,ione,proc,mycol);
/*  		fprintf(stderr,"myrow %d exchanging with : %d nrpi: %d \n", */
/*  			myrow,proc,nrpi); */
		Cigerv2d(ictxt,nrpi,ione,tmpindx,nrpi,proc,mycol);
/*  		fprintf(stderr,"myrow %d received tmpindx \n", */
/*  			myrow);	 */
		Cdgerv2d(ictxt,nrpi,ln,Result,llda,proc,mycol);
/*  		fprintf(stderr,"myrow %d received Result \n", */
/*  			myrow);	 */
		for (j=0; j < ln; j++)  {
		  tmpglobX = &globX[j*(*lda_globX)]; 
		  tmpRes   = &Result[j*llda];
		  for (i=0; i<nrpi; i++) {
		    tmpglobX[tmpindx[i] -1] += tmpRes[i];
		  }
		}	    	    	   		
	      }
	    }
	    /* for each column vector */
#ifdef DEBUGCOLLECT__
	    printf("\nPSDGATHER: Receiving from proc: %d %d\n",proc,*n);
#endif
	    if (*root == -1) {
	      /*  	      fprintf(stderr,"GATHER: Final broadcast\n"); */
	      Cdgebs2d(ictxt,"All"," ", *m, *n, globX, *lda_globX);
	    }
	  }  else {
	    /* send number of local rows*/
	    nrpi = desc_data[N_ROW_];
	    Cigesd2d(ictxt,ione,ione,&nrpi,ione,iroot,mycol);
	    Cigesd2d(ictxt,nrpi,ione,loc_to_glob,nrpi,iroot,mycol);
	    Cdgesd2d(ictxt,nrpi,ln,Result,llda,iroot,mycol);
	    if (*root == -1) {
	      Cdgebr2d(ictxt,"All"," ", *m, *n, globX, *lda_globX,iroot,mycol);
	    }
	  } 	  
	  if (llw>*lwork) {
	    free(Result); 
	  }
	  free(tmpindx);

#endif
	} else {			/* Case: x is not Column vector */
	  info = 3060 ;
	}     
      }
    } else {
      info = err;
    }
  }
  if( (err != 0) || (info != 0) )  {
    psderror_( &ictxt, &info, "PSDGATHERM", int_err, real_err );
    return;
  }
  
  
}
