#include "superlu_ddefs.h"
/*#include "./superlu_impl.h"*/
#include "netsolve_superlu.h"

/*
 * Turn a list of per-process counts into starting offsets:
 * offsets[i] = counts[0] + ... + counts[i-1], with offsets[0] = 0.
 */
static void superlu_dist_offsets(const int *counts, int *offsets, int len)
{
  int i, running = 0;

  for (i = 0; i < len; i++) {
    offsets[i] = running;
    running += counts[i];
  }
}

/*
 * Solve a sparse linear system with SuperLU_DIST (pdgssvx_ABglobal).
 *
 * Each process in `comm` contributes a slice of the matrix and of the
 * right-hand side.  The slices are concatenated in rank order on EVERY
 * process, so each process ends up holding the whole matrix and the whole
 * right-hand side before the solver is called.
 *
 *   comm            communicator; its Np processes form an Np x 1 grid
 *   slu_vals        the `nnzeros` local nonzero values
 *   slu_idx         the `nnzeros` local indices belonging to slu_vals
 *   cptr            `local_size` local pointers into slu_vals/slu_idx;
 *                   each one is shifted by the number of nonzeros owned by
 *                   lower ranks (presumably they are relative to the start
 *                   of the local arrays -- confirm against the caller)
 *   nnzeros         number of local nonzeros
 *   local_size      number of local pointers / right-hand-side entries
 *   info_block      not used by this routine
 *   rhs_vector      the `local_size` local right-hand-side entries
 *   solution_vector receives the `local_size` solution entries that
 *                   correspond to this process's slice
 *
 * Returns the `info` value set by pdgssvx_ABglobal (0 on success), or 0 on
 * a process that is not part of the SuperLU grid.  The solution slice is
 * copied out regardless of `info`, so callers must check the return value
 * before trusting solution_vector.
 */
int superlu_dist
(MPI_Comm comm,
 double *slu_vals,int *slu_idx,int *cptr,
 int nnzeros,int local_size,direct_info_block info_block,
 double *rhs_vector,double *solution_vector)
{
  int Np,np,my_first;
  int *m_nnzeros,*m_offsets,*ptr_sizes,*ptr_offsets;
  int total_size,total_nnzeros;
  int ip,ir,row;
  size_t nnz_alloc;
  int status = 0;

  superlu_options_t options;
  SuperLUStat_t stat;
  SuperMatrix A;
  ScalePermstruct_t ScalePermstruct;
  LUstruct_t LUstruct;
  gridinfo_t grid;
  double   *berr;
  double   *a, *b;
  int_t    *asub, *xa;
  int_t    m, n, nnz;
  int_t    nprow, npcol;
  int      iam, ldb, nrhs;
  int      info = 0;

  (void) info_block;

  MPI_Comm_size(comm,&Np);
  MPI_Comm_rank(comm,&np);

  /* ------------------------------------------------------------
     INITIALIZE THE SUPERLU PROCESS GRID.
     ------------------------------------------------------------*/
  nprow = Np; /* Use every process of comm as one process row. */
  npcol = 1;  /* Single process column.                        */
  nrhs = 1;   /* Number of right-hand sides.                   */

  superlu_gridinit(comm, nprow, npcol, &grid);

  /* Bail out if I do not belong in the grid. */
  iam = grid.iam;
  if ( iam >= nprow * npcol )
    goto out;

#if ( VAMPIR>=1 )
  VT_traceoff();
#endif

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter main()");
#endif

  /* ------------------------------------------------------------
     ASSEMBLE THE GLOBAL MATRIX AND RIGHT-HAND SIDE ON EVERY PROCESS.
     The matrix arrives distributed, so instead of process 0 reading
     and broadcasting it, all processes gather each other's pieces.
     ------------------------------------------------------------*/
  MPI_Allreduce(&local_size,&total_size,1,MPI_INT,MPI_SUM,comm);
  MPI_Allreduce(&nnzeros,&total_nnzeros,1,MPI_INT,MPI_SUM,comm);

  m_nnzeros   = malloc((Np+1)*sizeof *m_nnzeros);
  m_offsets   = malloc((Np+1)*sizeof *m_offsets);
  ptr_sizes   = malloc((Np+1)*sizeof *ptr_sizes);
  ptr_offsets = malloc((Np+1)*sizeof *ptr_offsets);
  if ( !m_nnzeros || !m_offsets || !ptr_sizes || !ptr_offsets )
    ABORT("Malloc fails for gather bookkeeping arrays.");

  /* Per-process counts, then where each process's piece starts. */
  MPI_Allgather(&nnzeros,1,MPI_INT,m_nnzeros,1,MPI_INT,comm);
  MPI_Allgather(&local_size,1,MPI_INT,ptr_sizes,1,MPI_INT,comm);
  superlu_dist_offsets(m_nnzeros,m_offsets,Np);
  superlu_dist_offsets(ptr_sizes,ptr_offsets,Np);

  m = n = total_size; nnz = total_nnzeros;

  /* Ask for at least one element so that an empty matrix cannot be
     mistaken for an allocation failure (malloc(0) may return NULL). */
  nnz_alloc = total_nnzeros > 0 ? (size_t) total_nnzeros : 1;
  a    = malloc(nnz_alloc*sizeof *a);
  asub = malloc(nnz_alloc*sizeof *asub);
  xa   = malloc(((size_t) total_size+1)*sizeof *xa);
  if ( !a || !asub || !xa )
    ABORT("Malloc fails for a[], asub[] or xa[].");

  /* NOTE(review): the index data is exchanged as MPI_INT into int_t
     buffers, which is only correct when int_t is plain int -- confirm
     the SuperLU build does not use long indices. */
  MPI_Allgatherv(slu_vals,nnzeros,MPI_DOUBLE,
		 a,m_nnzeros,m_offsets,MPI_DOUBLE,comm);
  MPI_Allgatherv(slu_idx,nnzeros,MPI_INT,
		 asub,m_nnzeros,m_offsets,MPI_INT,comm);
  MPI_Allgatherv(cptr,local_size,MPI_INT,
		 xa,ptr_sizes,ptr_offsets,MPI_INT,comm);

  /* Position of this process's slice in the gathered vectors. */
  my_first = ptr_offsets[np];

  /* Shift each process's pointers by the number of nonzeros that precede
     its piece, then close the pointer array with the total count. */
  row = 0;
  for (ip=0; ip<Np; ip++)
    for (ir=0; ir<ptr_sizes[ip]; ir++)
      xa[row++] += m_offsets[ip];
  xa[total_size] = total_nnzeros;

  free(m_nnzeros); free(m_offsets);

  /* Gather the full right-hand side. */
  if ( !(b = doubleMalloc(m)) ) ABORT("Malloc fails for b[]");
  MPI_Allgatherv(rhs_vector,local_size,MPI_DOUBLE,
		 b,ptr_sizes,ptr_offsets,MPI_DOUBLE,comm);
  free(ptr_sizes); free(ptr_offsets);

  /* Create compressed column matrix for A.
     NOTE(review): a/asub/xa come from plain malloc and are presumably
     released by Destroy_CompCol_Matrix below -- confirm that SuperLU's
     free is compatible with malloc in this build. */
  dCreate_CompCol_Matrix(&A, m, n, nnz, a, asub, xa, NC, _D, GE);

  ldb = m;

  if ( !(berr = doubleMalloc(nrhs)) )
    ABORT("Malloc fails for berr[].");

  /* ------------------------------------------------------------
     NOW WE SOLVE THE LINEAR SYSTEM.
     ------------------------------------------------------------*/

  /* Set the default input options. */
  set_default_options(&options);
#if 0
  options.Equil = NO;
  options.ColPerm = NATURAL;
  options.ColPerm = MMD_AT_PLUS_A;
  options.RowPerm = NOROWPERM;
#endif

  /* Initialize ScalePermstruct and LUstruct. */
  ScalePermstructInit(m, n, &ScalePermstruct);
  LUstructInit(m, n, &LUstruct);

  /* Initialize the statistics variables. */
  PStatInit(&stat);

  /* Call the linear equation solver. */
  pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
		   &LUstruct, berr, &stat, &info);

  /* Report the solver's status to the caller instead of dropping it. */
  status = info;

  /* Hand back only this process's slice of b. */
  memcpy(solution_vector,b+my_first,local_size*sizeof(double));

  PStatPrint(&stat, &grid);        /* Print the statistics. */

  /* ------------------------------------------------------------
     DEALLOCATE STORAGE.
     ------------------------------------------------------------*/
  PStatFree(&stat);
  Destroy_CompCol_Matrix(&A);
  Destroy_LU(n, &grid, &LUstruct);
  ScalePermstructFree(&ScalePermstruct);
  LUstructFree(&LUstruct);
  SUPERLU_FREE(b);
  SUPERLU_FREE(berr);

  /* ------------------------------------------------------------
     RELEASE THE SUPERLU PROCESS GRID.
     ------------------------------------------------------------*/
 out:
  superlu_gridexit(&grid);

  return status;
}
