/*
 * SuperLU Performance Measurement
 *
 * $Id: sluperf.c,v 1.2 1999/03/08 08:23:05 jakob Exp $
 *
 */

#include "dsp_defs.h"
#include "util.h"

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Value of the Hilbert-style matrix entry at 0-based row p, column q:
 * 1/(p+q+1).  Used both when packing the matrix and when re-evaluating
 * A*x for the residual check. */
#define ELEM_A(p,q) (1.0/((p)+(q)+1.0))
/* Absolute per-row residual above which main() prints an "ERR:" line. */
#define MAXERR (1E-6)
/* Number of iterations of the benchmark loop in main(); every iteration
 * factorizes and solves two systems. */
#define nruns 10000

/* Pseudo-random offset in the range (-5, 5].  Every expansion calls rand()
 * once, so the number and order of uses affects the generated sequence. */
#define RAND_OFS (5.0-(10.0*rand()/(RAND_MAX+1.0)))


/* Number of rows (= columns) of the system; set by read_system(). */
int dimension = 0;
/* dimension*dimension flags in the order they were read from the file;
 * a nonzero flag marks a nonzero matrix entry.  Allocated by read_system(). */
int* structure = 0;
/* Count of nonzero flags in structure; accumulated by read_system(). */
int nnz = 0;
/* Packed nonzero values of the matrix, column by column; filled by packit(). */
double *a = 0;
/* asub: row index of every packed value; xa: offset of the first packed
 * value of each column, with xa[dimension] holding the total count. */
int    *asub = 0, *xa = 0;
/* Right hand side vector handed to SuperLU as the storage of B. */
double *rhs = 0;
/* Copy of the initial right hand side. */
double *backup_rhs = 0;
/* Alternate right hand side: backup_rhs plus a RAND_OFS per element. */
double *backup_rhs1 = 0;
/* Copy of the initial packed matrix values. */
double *backup_sys = 0;
/* Alternate matrix values: a plus a RAND_OFS per element. */
double *backup_sys1 = 0;

/* A (sparse matrix) and B (dense right hand side) are created in packit();
 * L and U are passed to dgstrf() to receive the factors. */
SuperMatrix A, L, U, B;

/* Reads the sparsity structure from the named file into the globals above. */
void read_system(char*);
/* Builds A, B and the backup arrays from the structure that was read. */
void packit(void);

int main(int argc, char *argv[])
{
  int      *perm_r; /* row permutations from partial pivoting */
  int      *perm_c; /* column permutation vector */
  int      info;
  int      p, q;
  DNformat *xform, *aform;
  double *xarray, *aarray;
  double residual = 0;
  double maxerr = 0;
  SuperMatrix AC;
  int     *etree;
  int relax;
  int panel_size;
  int c;

  /* Read structure */    
  read_system("regress.system");

  /* Pack stuff */
  packit();

  /* Allocate permutation vectors and more */
  perm_r = intMalloc(dimension);
  perm_c = intMalloc(dimension);
  etree  = intMalloc(dimension);
  if(!perm_r || !perm_c || !etree) {
    printf("Out of memory allocating permutation vectors and more\n");
    exit(1);
  }

  /* I wish I understood this */
  panel_size = sp_ienv(1);
  relax = sp_ienv(2);

  /* Calculate column permutation and apply to A -> AC */
  get_perm_c(1, &A, perm_c);
  StatInit(panel_size, relax);
  sp_preorder("N", &A, perm_c, etree, &AC);

  /* Factorize first time */
  dgstrf("N", &AC, 1.0, 0.0, relax, panel_size, 
	 etree, NULL, 0, perm_r, perm_c, &L, &U, &info);
  assert(!info);

  /* Now solve the system (perm_c ought to be held constant here) */
  //  dgssv(&A, perm_c, perm_r, &L, &U, &B, &info);
  dgstrs("N", &L, &U, perm_r, perm_c, &B, &info);
  assert(!info);

  /* B now holds X */
  assert(B.Stype == DN);
  assert(B.Dtype == _D);
  assert(B.Mtype == GE);
  assert(B.nrow == dimension);
  assert(B.ncol == 1);
  xform = B.Store;
  assert(xform->lda == dimension);
  xarray = xform->nzval;

  /* Fetch AC's values as well */
  assert(AC.Stype == NCP);
  assert(AC.Dtype == _D);
  assert(AC.Mtype == GE);
  assert(AC.nrow == dimension);
  assert(AC.ncol == dimension);
  aform = AC.Store;
  assert(aform->lda == nnz);
  aarray = aform->nzval;

  /* Calculate |A*x-b| residual */
  for(p = 0; p < dimension; p++) {
    /* Calculate sum(A(p,:) * x(p)) - b(p) */
    double sum = 0;
    for(q = 0; q < dimension; q++)
      if(structure[p + q*dimension]) /* if A(p,q) nonzero */
	{
	  sum += ELEM_A(p,q) * xarray[q];
	}
    sum -= backup_rhs[p];
    if(fabs(sum) > MAXERR)
      printf("ERR: A(%i,:)*x(:) - b(%i) = %6.6g\n", p, p, sum);
    residual += fabs(sum)/((double)dimension*dimension);
    if(fabs(sum) > maxerr) maxerr = fabs(sum);
  }

  printf("Max error = %6.6g\n", maxerr);
  printf("Mean error = %6.6g\n", residual);

  printf("Benchmarking...\n"); 
  for(c = 0; c < nruns; c++) {   
    /* Get rhs backup */
    memcpy(xarray, backup_rhs, sizeof(double)*dimension);
    /* Get system backup */
/*     memcpy(aarray, backup_sys, sizeof(double)*nnz);  */
    aform->nzval = backup_sys;
    /* Factorize non-first time */
    dgstrf("Y", &AC, 1.0, 0.0, relax, panel_size, 
	   etree, NULL, 0, perm_r, perm_c, &L, &U, &info);
    /* Now solve the system (perm_c ought to be held constant here) */
    dgstrs("N", &L, &U, perm_r, perm_c, &B, &info);

    /* Get rhs backup */
    memcpy(xarray, backup_rhs1, sizeof(double)*dimension);
    /* Get system backup */
/*     memcpy(aarray, backup_sys1, sizeof(double)*nnz); */
    aform->nzval = backup_sys1;
    /* Factorize non-first time */
    dgstrf("Y", &AC, 1.0, 0.0, relax, panel_size, 
	   etree, NULL, 0, perm_r, perm_c, &L, &U, &info);
    /* Now solve the system (perm_c ought to be held constant here) */
    dgstrs("N", &L, &U, perm_r, perm_c, &B, &info);
  }

  StatFree();

  exit(0);
}


/*
 * Read the sparsity structure of the system from a text file.
 *
 * The file starts with two integers giving the dimensions, which must be
 * equal and positive, followed by dimension*dimension integer flags.  The
 * flags are stored in `structure` in the order they are read; a nonzero
 * flag marks a nonzero matrix entry.
 *
 * fname: path of the structure file.
 *
 * On return the globals `dimension`, `structure` and `nnz` describe the
 * system.  Any failure (file can't be opened, malformed contents, system
 * not square or too large, out of memory) prints a message and terminates
 * the process with exit(1).
 */
void read_system(char* fname)
{
  FILE* fp = fopen(fname, "r");
  int dim1, dim2;
  int p, q, ndx = 0;
  size_t cells;

  if(!fp) {
    printf("Couldn't open structure file\n");
    exit(1);
  }
  /* Read dimension; without both values dim1/dim2 would be uninitialized */
  if(2 != fscanf(fp, "%i %i\n", &dim1, &dim2)) {
    printf("Failed to read system dimension\n");
    exit(1);
  }
  printf("System dimension is %ix%i\n", dim1, dim2);
  if(dim1 != dim2) {
    printf("Need square system!\n");
    exit(1);
  }
  if(dim1 <= 0) {
    printf("Need positive system dimension!\n");
    exit(1);
  }
  /* The flag index and the nonzero count are ints that can reach
   * dimension*dimension, so that product has to fit into an int */
  if(dim1 > INT_MAX / dim1) {
    printf("System too large\n");
    exit(1);
  }
  dimension = dim1;
  /* Alloc. space; the size is computed in size_t, not in int */
  cells = (size_t)dimension * (size_t)dimension;
  if(cells > (size_t)-1 / sizeof *structure) {
    printf("System too large\n");
    exit(1);
  }
  structure = malloc(cells * sizeof *structure);
  if(!structure) {
    printf("Out of memory allocating structure buffer\n");
    exit(1);
  }
  /* Read data */
  nnz = 0;
  for(q = 0; q < dimension; q++) {
    for(p = 0; p < dimension; p++) {
      int tmp;
      if(1 != fscanf(fp, "%i", &tmp)) {
        printf("Failed to read structure\n");
        exit(1);
      }
      structure[ndx++] = tmp;
      if(tmp) nnz++;
    }
  }
  fclose(fp);
  /* Statistics */
  printf("System has %i nonzeros\n", nnz);
}


/*
 * Build the SuperLU input data from the structure read by read_system().
 *
 * Creates
 *   - the sparse matrix A in compressed column form (values `a`, row
 *     indices `asub`, column offsets `xa`), holding ELEM_A(p,q) wherever
 *     the structure flag is nonzero,
 *   - the dense right hand side B on top of the pseudo-random vector `rhs`,
 *   - backup_rhs / backup_sys, copies of the initial rhs and matrix values,
 *   - backup_rhs1 / backup_sys1, alternates offset by RAND_OFS per element.
 *
 * Reads the globals `dimension`, `nnz` and `structure`.  Prints a message
 * and calls exit(1) when an allocation fails.  Asserts that no column of
 * the structure is entirely zero.
 */
void packit(void)
{
  int p, q, sndx = 0, andx = 0;

  /* Allocate stuff */
  a = doubleMalloc(nnz);
  asub = intMalloc(nnz);
  xa = intMalloc(dimension+1);
  rhs = doubleMalloc(dimension);
  if(!a || !asub || !xa || !rhs) {
    printf("Out of memory allocating matrix buffers\n");
    exit(1);
  }

  /* Build Hilbert matrix with read structure */
  /* A(i,j) = S(i,j) * 1/(i+j-1) for 1-based indices where S in {0,1}  */
  /* This should be packed into Harwell-Boeing format, to keep SuperLU happy */
  for(q = 0; q < dimension; q++) {
    xa[q] = andx;
    for(p = 0; p < dimension; p++) {
      /* Is this entry non-zero ? */
      if(structure[sndx++]) {
        a[andx] = ELEM_A(p,q);
        asub[andx++] = p;
      }
    }
    /* We can't have a strictly zero column: this column must have added at
     * least one entry.  Testing andx alone would only catch an empty first
     * column. */
    assert(andx > xa[q]);
  }
  xa[dimension] = andx;

  /* Have SuperLU build its own internal representation */
  dCreate_CompCol_Matrix(&A, dimension, dimension, nnz, a, asub, xa, NC, _D, GE);

  /* Create pseudo-random right hand side vector */
  for(p = 0; p < dimension; p++) {
    rhs[p] = (10.0*rand()/(RAND_MAX+1.0));
    if(1.0*rand()/(RAND_MAX+1.0) > 0.6) rhs[p] = -rhs[p];
  }
  dCreate_Dense_Matrix(&B, dimension, 1, rhs, dimension, DN, _D, GE);

  /* Allocate all backups up front so that none is used unchecked */
  backup_rhs  = malloc(sizeof(double)*dimension);
  backup_sys  = malloc(sizeof(double)*nnz);
  backup_rhs1 = malloc(sizeof(double)*dimension);
  backup_sys1 = malloc(sizeof(double)*nnz);
  if(!backup_rhs || !backup_sys || !backup_rhs1 || !backup_sys1) {
    printf("Out of memory allocating backup buffers\n");
    exit(1);
  }

  /* Keep backups */
  memcpy(backup_rhs, rhs, sizeof(double)*dimension);
  memcpy(backup_sys, a, sizeof(double)*nnz);

  /* Construct alternate rhs (every RAND_OFS draws one rand() value) */
  for(p = 0; p < dimension; p++)
    backup_rhs1[p] = backup_rhs[p] + RAND_OFS;

  /* Construct alternate system */
  for(p = 0; p < nnz; p++)
    backup_sys1[p] = a[p] + RAND_OFS;
}
