/* 
   COPYRIGHT U.S. GOVERNMENT 
   
   This software is distributed without charge and comes with
   no warranty.

   Please feel free to send questions, comments, and problem reports
   to prism@super.org. 
*/

/* PURPOSE
   =======
   Tests prism_v_bimmer.c for various mappings and mesh configurations.
   It uses some special matrices A and B whose product C is known.
   It also collects some timings for BiMMeR.
 */

/* INCLUDE FILES */
#include "stdeig.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include <math.h>
#include "mm.h" 
#include "mm_test.h"       /* testing header file */

/* GLOBAL CONSTANTS */

/*
 * Scalars of the update C = alpha*A*B + beta*C performed by the test.
 * Setting d_alpha to 1 and d_beta to 0 instead gives the plain product
 * C = A*B.
 */
double d_alpha = 1.0 / 3.0;
double d_beta  = 1.0 / 4.0;

/* Number of dimensions of the Cartesian process topology. */
#define PRISM_NUM_DIMS 2

main(int argc, char **argv)
{
  /* --------------- */
  /* Local variables */
  /* --------------- */
  int 
    p_i_dims[PRISM_NUM_DIMS],
    p_i_periods[PRISM_NUM_DIMS],
    p_i_coords[PRISM_NUM_DIMS],
    p_i_remain[PRISM_NUM_DIMS],
    i_2dcomm_rows,     /* # rows in 2D comm used for multiply */
    i_2dcomm_cols,     /* # cols in 2D comm used for multiply */
    i_2d_row,          /* row coord in 2D comm used for multiply */
    i_2d_col,          /* column coord in 2D comm used for multiply */
    i_comm_size,       /* holds # nodes */
    i_world_id,        /* rank in MPI_COMM_WORLD */
    i_node_id,         /* rank in 2D topology */
    i_use_rows,	       /* number of rows to assign to 2D topology */
    i_use_cols,	       /* number of columns to assign to 2D topology */
    i_bcbuf_rows,      /* # of rows in broadcast buffer */
    i_bcbuf_cols,      /* # of cols in broadcast buffer */
    i_rollbuf_rows,    /* # of rows in roll buffer */
    i_rollbuf_cols,    /* # of cols in roll buffer */
    i, j,
    index,
    i_aux,
    i_aux1, 
    i_mdim, i_ndim,    /* prism_i_kdim defined as global variable */
    i_blk_row_nw,      /* row coordinate of nw-corner of block */
    i_blk_col_nw,      /* col coordinate of nw-corner of block */
    i_v_dim,           /* size of v-msh */
    i_sbmsh_nw_x,      /* row idx of nw-corner of the sbmsh */
    i_sbmsh_nw_y,      /* col idx of nw-corner of the sbmsh */
    i_sbmsh_rows,      /* num of rows in the sbmsh */
    i_sbmsh_cols,      /* num of cols in the sbmsh */
    i_m_panelwidth,    /* panel width for torus wrap of rows of op(A) and of C */
    i_n_panelwidth,    /* panel width for torus wrap of cols of op(B) and of C */
    i_k_panelwidth,    /* panel width for torus wrap of cols of op(A) and rows of op(B) */
    i_panel_spc,       /* stride of consecutive panels across v-node */
    i_m_offset,        /* row offset for size of first logical block of op(A) and of C */
    i_n_offset,        /* col offset for size of first logical block of op(B) and of C */
    i_k_offset,        /* offset for size of first logical col block of op(A) and row 
			  block of op(B) */
    i_rows_p_node_a,   /* max rows per node for A */
    i_rows_p_node_c,   /* max rows per node for C */
    i_rows_p_node_b,   /* max rows per node for B */
    i_cols_p_node_a,   /* max cols per node for A */
    i_cols_p_node_b,   /* max cols per node for B */
    i_cols_p_node_c;   /* max cols per node for C */

MPI_Comm
   comm_2d,      /* communicator with 2D topology */
   comm_row,     /* rows of comm_2d */
   comm_col      /* columns of comm_2d */
      ;

  double
    bmax_time, bmin_time,
    rmax_time, rmin_time,
    mmax_time, mmin_time,
    d_e_time, d_s_time, d_max_time, d_min_time,
    d_avg_time, d_aux_time, 
    d_mflops, d_mflops_node, d_maxerr
      ; 

E_LAYOUT               /* enumerate type defined in global.h */
  e_layout;            /* wrap, block, wrap1d, block1d */

  P_R_MATRIX	       /* double pointer type defined in global.h */
    m_r_a,
    m_r_b,
    m_r_c,
    m_r_bcbuf,
    m_r_rollbuf
      ;
  char 
    c_outfile[127],    /* holds input file name for output */
    c_error[120],      /* holds error message for generror */
    c_transa[10],      /* holds true or false for A^t */
    c_transb[10],      /* holds true or false for B^t */
    *c_myname
      ;
  FILE
    *outfile
      ;
  /* --------- */
  /* Functions */
  /* --------- */
  double 
    aij(int i, int j),
    bij(int i, int j),
    cij(int i, int j),
    cij_result(int i, int j)
      ;

  /* initialize some global variables */
  prism_d_b_time = prism_d_r_time = prism_d_m_time = 0.0;

  MPI_Init(&argc, &argv);

  prism_v_init_var();

  MPI_Comm_rank(MPI_COMM_WORLD, &i_world_id);

  if (i_world_id == 0) {
    /* Check on node 0 */
    c_myname = argv[0];
    if (argc != 17) {
      sprintf(c_error, "Usage: %s M N K lambda vdim nw_x nw_y rows cols m_width n_width k_width spc transa transb outfile\n", c_myname);
      prism_v_generror(c_error, brief);
    }
  }

  /* Do for p4, should work for others too */
  if (i_world_id == 0) {
    /* only read args on node 0 */
    i_mdim = atoi(argv[1]); 
    i_ndim = atoi(argv[2]); 
    prism_i_kdim = atoi(argv[3]); 
    prism_d_lambda = strtod(argv[4], NULL);
    prism_d_gamma = - prism_d_lambda;  /* prism_d_gamma can be other values */
    i_v_dim = atoi(argv[5]);
    i_sbmsh_nw_x = atoi(argv[6]);
    i_sbmsh_nw_y = atoi(argv[7]);
    i_sbmsh_rows = atoi(argv[8]);
    i_sbmsh_cols = atoi(argv[9]);
    i_m_panelwidth = atoi(argv[10]);
    i_n_panelwidth = atoi(argv[11]);
    i_k_panelwidth = atoi(argv[12]);
    i_panel_spc =  atoi(argv[13]);
    strcpy(c_transa,argv[14]);
    strcpy(c_transb,argv[15]);
    strcpy(c_outfile,argv[16]);
  }
  /* Interpret character true/false as boolean value 1/0 */
  if (strcmp(c_transa,"true") == 0)
    prism_e_transa = true;
  else
    prism_e_transa = false;
  if (strcmp(c_transb,"true") == 0)
    prism_e_transb = true;
  else
    prism_e_transb = false;
  /* Broadcast args on node 0 to all other nodes */
  MPI_Bcast(&i_mdim, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_ndim, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&prism_i_kdim, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&prism_d_lambda, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  prism_d_gamma = - prism_d_lambda;  /* prism_d_gamma can be other values */
  MPI_Bcast(&i_v_dim, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_sbmsh_nw_x, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_sbmsh_nw_y, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_sbmsh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_sbmsh_cols, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_m_panelwidth, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_n_panelwidth, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_k_panelwidth, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&i_panel_spc, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&prism_e_transa, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&prism_e_transb, 1, MPI_INT, 0, MPI_COMM_WORLD);

  i_use_rows = i_sbmsh_rows;
  i_use_cols = i_sbmsh_cols;

  MPI_Comm_size(MPI_COMM_WORLD, &i_comm_size);
  /* make sure that total number of nodes asked for is same as
     actual number of nodes allocated */
  if (i_use_rows * i_use_cols == i_comm_size) {
    p_i_dims[0] = i_use_rows;
    p_i_dims[1] = i_use_cols;
  }
  else {
    sprintf(c_error, "prism_bimmer_test: request # rows %d * # cols %d != # nodes %d\n",
	    i_use_rows, i_use_cols, i_comm_size);
    prism_v_generror(c_error, brief);
  }
  i_2dcomm_rows = p_i_dims[0];
  i_2dcomm_cols = p_i_dims[1];
  /* don't really need now since same value.  leave in case change */
  i_sbmsh_rows = i_2dcomm_rows;
  i_sbmsh_cols = i_2dcomm_cols;

  /* Make into 2D grid/torus.
     The first dimension is logically the rows and the second
     dimension is the columns.  Want to get a wrap (torus) when
     going down a column which means marking the first dimension
     as periodic.  This is because we roll in the columns of the
     2D topology. */
  p_i_periods[0] = 1;
  p_i_periods[1] = 0;
  MPI_Cart_create(MPI_COMM_WORLD, (int)PRISM_NUM_DIMS, p_i_dims, p_i_periods,
		  (int)1, &comm_2d);

  MPI_Comm_rank(comm_2d, &i_node_id);
  MPI_Cart_coords(comm_2d, i_node_id, (int)PRISM_NUM_DIMS, p_i_coords);
  i_2d_row = p_i_coords[0];
  i_2d_col = p_i_coords[1];

  /* Make comm of rows of 2D topology */
  p_i_remain[0] = 0;
  p_i_remain[1] = 1;
  MPI_Cart_sub(comm_2d, p_i_remain, &comm_row);

  /* Make comm of columns of 2D topology */
  p_i_remain[0] = 1;
  p_i_remain[1] = 0;
  MPI_Cart_sub(comm_2d, p_i_remain, &comm_col);
    
  if (i_node_id == 0) {
    if(strcmp(c_outfile,"stderr") == 0) {
      outfile = stderr;
    }
    else if (strcmp(c_outfile,"stdout") == 0) {
      outfile = stdout;
    }
    else {
      outfile = fopen(c_outfile, "w");
      if (outfile == NULL ) {
	sprintf(c_error, "prism_bimmer_test: cannot open file %s", c_outfile);
	prism_v_generror(c_error, brief);
      }
    }
  }

  if (i_node_id == 0) {
#if PRISM_MPI_COLL
    fprintf(outfile, "prog = \'MC\';\n");
#else
    fprintf(outfile, "prog = \'PC\';\n");
#endif

#if PRISM_BC_READY
    fprintf(outfile, "bc_mode = \'BC_READY\';\n");
#else
    fprintf(outfile, "bc_mode = \'BC_REGULAR\';\n");
#endif

#if PRISM_SKEW_READY
    fprintf(outfile, "skew_mode = \'SKEW_READY\';\n");
#else
    fprintf(outfile, "skew_mode = \'SKEW_REGULAR\';\n");
#endif

    fprintf(outfile,"mach = \'%s\';\n", Prism_STR_MACHINE);
    fprintf(outfile,"params = [%d %d %d %e %e %d %d %d %d %d %d %d %d %d %s %s];\n",
	   i_mdim, i_ndim, prism_i_kdim, prism_d_lambda, prism_d_gamma,
	   i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y, i_sbmsh_rows, i_sbmsh_cols,
	   i_m_panelwidth, i_n_panelwidth, i_k_panelwidth, i_panel_spc, c_transa, c_transb);
  }

  /* -------------- */
  /* Initialization */ 
  /* -------------- */
  i_blk_row_nw = i_blk_col_nw = 0;
  i_m_offset = 0; 
  i_n_offset = 0; 
  i_k_offset = 0; 
  e_layout = wrap; 

  /* -------------------------- */
  /* Allocate memory for matrix */
  /* -------------------------- */
  /* calculate number of rows/cols per node for A, B and C */
  
  i_rows_p_node_c = prism_i_ldwk(i_v_dim, i_mdim, 0, i_m_panelwidth, i_m_offset,
				 i_panel_spc, i_v_dim/i_sbmsh_rows, i_sbmsh_rows);

  i_cols_p_node_c = prism_i_ldwk(i_v_dim, i_ndim, 0, i_n_panelwidth, i_n_offset,
				  i_panel_spc, i_v_dim/i_sbmsh_cols,
				  i_sbmsh_cols);

  i_rows_p_node_a = (prism_e_transa == true) ?
    prism_i_ldwk(i_v_dim, prism_i_kdim, 0, i_k_panelwidth, i_k_offset, i_panel_spc,
		 i_v_dim/i_sbmsh_rows, i_sbmsh_rows)
      : i_rows_p_node_c;
  
  i_rows_p_node_b = (prism_e_transb == true) ? 
    prism_i_ldwk(i_v_dim, i_ndim, 0, i_n_panelwidth, i_n_offset, i_panel_spc, 
		 i_v_dim/i_sbmsh_rows, i_sbmsh_rows) 
      : prism_i_ldwk(i_v_dim, prism_i_kdim, 0, i_k_panelwidth, i_k_offset, i_panel_spc, 
		     i_v_dim/i_sbmsh_rows, i_sbmsh_rows);

  i_cols_p_node_a = (prism_e_transa == true) ?
    prism_i_ldwk(i_v_dim, i_mdim, 0, i_m_panelwidth, i_m_offset, i_panel_spc,
		 i_v_dim/i_sbmsh_cols, i_sbmsh_cols)
      : prism_i_ldwk(i_v_dim, prism_i_kdim, 0, i_k_panelwidth, i_k_offset,
		     i_panel_spc, i_v_dim/i_sbmsh_cols, i_sbmsh_cols);
  
  i_cols_p_node_b = (prism_e_transb == true) ? 
    prism_i_ldwk(i_v_dim, prism_i_kdim, 0, i_k_panelwidth, i_k_offset, i_panel_spc, 
		 i_v_dim/i_sbmsh_cols, i_sbmsh_cols)
      : i_cols_p_node_c;

  /* Allocate memory for matrices */
  m_r_a = prism_m_d_alloc_matrix(i_rows_p_node_a, i_cols_p_node_a); 
  m_r_b = prism_m_d_alloc_matrix(i_rows_p_node_b, i_cols_p_node_b);
  m_r_c = prism_m_d_alloc_matrix(i_rows_p_node_c, i_cols_p_node_c);

  /* Compute size & allocate memory for broadcast & roll buffers */
  if (prism_e_transa == false) {
    if (prism_e_transb == false) {
      if (i_2dcomm_rows >= i_2dcomm_cols) {
	/* broadcast A, roll B */
	i_bcbuf_rows = i_rows_p_node_a;
	i_bcbuf_cols = i_cols_p_node_a;
	i_rollbuf_rows = i_rows_p_node_b;
	i_rollbuf_cols = i_cols_p_node_b;
      }
      else {
	/* roll A, broadcast B */
	i_bcbuf_rows = i_rows_p_node_b;
	i_bcbuf_cols = i_cols_p_node_b;
	i_rollbuf_rows = i_rows_p_node_a;
	i_rollbuf_cols = i_cols_p_node_a;
      }
    }
    else {
      if (i_2dcomm_rows >= i_2dcomm_cols) {
	/* roll B, broadcast C */
	i_bcbuf_rows = i_rows_p_node_c;
	i_bcbuf_cols = i_cols_p_node_c;
	i_rollbuf_rows = i_rows_p_node_b;
	i_rollbuf_cols = i_cols_p_node_b;
      }
      else {
	/* broadcast B, roll C */
	i_bcbuf_rows = i_rows_p_node_b;
	i_bcbuf_cols = i_cols_p_node_b;
	i_rollbuf_rows = i_rows_p_node_c;
	i_rollbuf_cols = i_cols_p_node_c;
      }
    }
  }
  else {
    if (prism_e_transb == false) {
      if (i_2dcomm_rows >= i_2dcomm_cols) {
	/* broadcast A, roll C */
	i_bcbuf_rows = i_rows_p_node_a;
	i_bcbuf_cols = i_cols_p_node_a;
	i_rollbuf_rows = i_rows_p_node_c;
	i_rollbuf_cols = i_cols_p_node_c;
      }
      else {
	/* roll A, broadcast C */
	i_bcbuf_rows = i_rows_p_node_c;
	i_bcbuf_cols = i_cols_p_node_c;
	i_rollbuf_rows = i_rows_p_node_a;
	i_rollbuf_cols = i_cols_p_node_a;
      }
    }
    else {
      /* don't allow A'B' */
      prism_v_generror("prism_bimmer_test: A'B' not implemented yet", brief);
    }
  }
  m_r_bcbuf   = prism_m_d_alloc_matrix(i_bcbuf_rows, i_bcbuf_cols);
  m_r_rollbuf = prism_m_d_alloc_matrix(i_rollbuf_rows, i_rollbuf_cols);

  if (i_node_id == 0) {
/*    fprintf(outfile,"dims for A, B, C = %dx%d, %dx%d, %dx%d\n", 
	   i_rows_p_node_a, i_cols_p_node_a, i_rows_p_node_b, i_cols_p_node_b,
	   i_rows_p_node_c, i_cols_p_node_c);*/
    fprintf(outfile,"a_dims = [%d %d];\n", i_rows_p_node_a, i_cols_p_node_a);
    fprintf(outfile,"b_dims = [%d %d];\n", i_rows_p_node_b, i_cols_p_node_b);
    fprintf(outfile,"c_dims = [%d %d];\n", i_rows_p_node_c, i_cols_p_node_c);
    fprintf(outfile,"bcbuf_dims = [%d %d];\n", i_bcbuf_rows, i_bcbuf_cols);
    fprintf(outfile,"rollbuf_dims = [%d %d];\n", i_rollbuf_rows, i_rollbuf_cols);
  }

  /* Generate matrices a, b */
  if (prism_e_transa == false) {
    prism_v_layout(i_mdim, prism_i_kdim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		   i_sbmsh_rows, i_sbmsh_cols, i_m_panelwidth, i_k_panelwidth,
		   i_panel_spc, aij, m_r_a[0], i_rows_p_node_a, e_layout,
		   i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);
  }
  else {
    prism_v_layout(prism_i_kdim, i_mdim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		   i_sbmsh_rows, i_sbmsh_cols, i_k_panelwidth, i_m_panelwidth,
		   i_panel_spc, aij, m_r_a[0], i_rows_p_node_a, e_layout,
		   i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);
  }
  if (prism_e_transb == false) {
    prism_v_layout(prism_i_kdim, i_ndim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		   i_sbmsh_rows, i_sbmsh_cols, i_k_panelwidth, i_n_panelwidth, 
		   i_panel_spc, bij, m_r_b[0], i_rows_p_node_b, e_layout,
		   i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);
  }
  else {
    prism_v_layout(i_ndim, prism_i_kdim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		   i_sbmsh_rows, i_sbmsh_cols, i_n_panelwidth, i_k_panelwidth,
		   i_panel_spc, bij, m_r_b[0], i_rows_p_node_b, e_layout,
		   i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);
  }

  /* Fill matrix C with data to check that user-supplied C is added in correctly */

  prism_v_layout(i_mdim, i_ndim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		 i_sbmsh_rows, i_sbmsh_cols, i_m_panelwidth, i_n_panelwidth,
		 i_panel_spc, cij, m_r_c[0], i_rows_p_node_c, e_layout,
		 i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);

  /* Synchronize processors to make sure they start together */
  MPI_Barrier(comm_2d);

  /* --------------------- */
  /* Matrix multiplication */
  /* --------------------- */
  d_s_time = MPI_Wtime();
  d_e_time = MPI_Wtime();

  prism_v_bimmer(i_v_dim, i_blk_row_nw, i_blk_col_nw, i_mdim, i_ndim,
		 prism_i_kdim, i_m_panelwidth, i_n_panelwidth, i_k_panelwidth,
		 i_m_offset, i_n_offset, i_k_offset, i_panel_spc, prism_e_transa,
		 prism_e_transb, d_alpha, m_r_a, i_rows_p_node_a, m_r_b, 
		 i_rows_p_node_b, d_beta, m_r_c, i_rows_p_node_c, m_r_bcbuf,
		 i_bcbuf_rows*i_bcbuf_cols, m_r_rollbuf,
		 i_rollbuf_rows*i_rollbuf_cols, comm_row,
		 comm_col, prism_i_ldim);

  d_max_time = d_min_time = d_avg_time
    = MPI_Wtime() - d_e_time - (d_e_time - d_s_time);

  /* -------- */
  /* Clean-up */
  /* -------- */
  /* Check the result, the product is overwritten by the difference */
  prism_v_error_chk(i_mdim, i_ndim, i_v_dim, i_sbmsh_nw_x, i_sbmsh_nw_y,
		    i_sbmsh_rows, i_sbmsh_cols, i_m_panelwidth, i_n_panelwidth, 
		    i_panel_spc, cij_result, m_r_c[0], i_rows_p_node_c, e_layout,
		    i_2dcomm_rows, i_2dcomm_cols, i_2d_row, i_2d_col);

  /* Search for max error */
  index = i_rows_p_node_c*i_cols_p_node_c;
  /* idamax_ is Fortran function*/
  index = idamax_((fint *)&index, m_r_c[0], (fint *)&i_one);
  d_maxerr = *(m_r_c[0] + index-1);

  MPI_Reduce(&d_maxerr, m_r_bcbuf[0], (int)1, MPI_DOUBLE, MPI_MAX, (int)0,
	     comm_2d);
  d_maxerr = m_r_bcbuf[0][0];

  /* Collect timing results */
  MPI_Reduce(&d_max_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MAX,
	     (int)0, comm_2d);
  d_max_time = d_aux_time;
  MPI_Reduce(&d_min_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MIN,
	     (int)0, comm_2d);
  d_min_time = d_aux_time;
  MPI_Reduce(&d_avg_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_SUM,
	     (int)0, comm_2d);
  d_avg_time = d_aux_time;

  /* Save away b, r and m times so can ave, max and min */
  bmax_time = bmin_time = prism_d_b_time;
  mmax_time = mmin_time = prism_d_m_time;
  rmax_time = rmin_time = prism_d_r_time;

  MPI_Reduce(&prism_d_b_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_SUM,
	     (int)0, comm_2d);
  prism_d_b_time = d_aux_time;
  MPI_Reduce(&prism_d_r_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_SUM,
	     (int)0, comm_2d);
  prism_d_r_time = d_aux_time;
  MPI_Reduce(&prism_d_m_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_SUM,
	     (int)0, comm_2d);
  prism_d_m_time = d_aux_time;

  MPI_Reduce(&bmax_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MAX,
	     (int)0, comm_2d);
  bmax_time = d_aux_time;
  MPI_Reduce(&rmax_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MAX,
	     (int)0, comm_2d);
  rmax_time = d_aux_time;
  MPI_Reduce(&mmax_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MAX,
	     (int)0, comm_2d);
  mmax_time = d_aux_time;

  MPI_Reduce(&bmin_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MIN,
	     (int)0, comm_2d);
  bmin_time = d_aux_time;
  MPI_Reduce(&rmin_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MIN,
	     (int)0, comm_2d);
  rmin_time = d_aux_time;
  MPI_Reduce(&mmin_time, &d_aux_time, (int)1, MPI_DOUBLE, MPI_MIN,
	     (int)0, comm_2d);
  mmin_time = d_aux_time;

  d_avg_time /= (i_sbmsh_rows * i_sbmsh_cols);
  prism_d_b_time /= (i_sbmsh_rows * i_sbmsh_cols);
  prism_d_r_time /= (i_sbmsh_rows * i_sbmsh_cols);
  prism_d_m_time /= (i_sbmsh_rows * i_sbmsh_cols);
  d_mflops = 2.0*i_mdim*i_ndim*prism_i_kdim /(1.0e6*d_max_time);
  d_mflops_node = d_mflops / (i_sbmsh_rows * i_sbmsh_cols);

  /* Output through stdout */
  if(i_node_id == 0) {
/*
    fprintf(outfile,"on 2D topology (%d,%d)\n", i_2dcomm_rows, i_2dcomm_cols);
    fprintf(outfile,"M\t N\t K\t v_dim\t nw_x\t nw_y\t sb_row\t sb_col\n");
    fprintf(outfile,"%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n", 
	   i_mdim,i_ndim,prism_i_kdim,i_v_dim,i_sbmsh_nw_x,i_sbmsh_nw_y,
	   i_sbmsh_rows,i_sbmsh_cols);
    fprintf(outfile,"m_pnl_wth  n_pnl_wth  k_pnl_wth  pnl_spc  m_offset  n_offset  k_offset\n");
    fprintf(outfile,"   %d\t\t%d\t%d\t    %d\t     %d\t\t%d\t  %d\n", i_m_panelwidth, i_n_panelwidth, i_k_panelwidth, i_panel_spc, i_m_offset, i_n_offset, i_k_offset);
    fprintf(outfile,"max error = %10.6e\n", d_maxerr);

    fprintf(outfile,"max_time\t min_time\t avg_time\t mflops\t\tmflops_node \n");
    fprintf(outfile,"%8.4f\t %8.4f\t %8.4f\t%8.4f\t%8.4f\n",
	   d_max_time, d_min_time, d_avg_time, d_mflops,
	   d_mflops_node);

    fprintf(outfile,"broadcast:\n");
    fprintf(outfile,"%8.4f\t %8.4f\t %8.4f\n",
	   bmax_time, bmin_time, prism_d_b_time);

    fprintf(outfile,"multiply:\n");
    fprintf(outfile,"%8.4f\t %8.4f\t %8.4f\n",
	   mmax_time, mmin_time, prism_d_m_time);

    fprintf(outfile,"roll:\n");
    fprintf(outfile,"%8.4f\t %8.4f\t %8.4f\n",
	   rmax_time, rmin_time, prism_d_r_time);
*/
    fprintf(outfile,"mesh = [%d %d];\n", i_2dcomm_rows, i_2dcomm_cols);
    fprintf(outfile,"max_error = %e;\n", d_maxerr);
    fprintf(outfile,"%%mflops:      total         per_node\n");
    fprintf(outfile,"mflops =    [%e %e];\n", d_mflops,
	    d_mflops_node);
    fprintf(outfile,"%%time:         max           min          avg\n");
    fprintf(outfile,"tot_time =  [%e %e %e];\n",
	   d_max_time, d_min_time, d_avg_time);
    fprintf(outfile,"bc_time =   [%e %e %e];\n", bmax_time, bmin_time,
	    prism_d_b_time);
    fprintf(outfile,"mm_time =   [%e %e %e];\n", mmax_time, mmin_time,
	    prism_d_m_time);
    fprintf(outfile,"roll_time = [%e %e %e];\n", rmax_time, rmin_time,
	    prism_d_r_time);
  }

  MPI_Finalize();
  
  if( i_node_id == 0) {
    fprintf(outfile,"mm_stats\n");
    fprintf(outfile,"case=case+1;\n");
    fprintf(outfile,"%% All nodes (%d) done!\n", (i_2dcomm_rows * i_2dcomm_cols)); 
  }

  /* --- */
  /* End */
  /* --- */
}

/*
 * ---------------------------------------------------------------- *
 * Functions to generate a(i,j), b(i,j) and c(i,j). These functions *
 * use the global variables prism_d_lambda and prism_d_gamma as     *
 * parameters.                                                      *
 * ---------------------------------------------------------------- *
 */
  
/*
 * Entry (i,j) of the test matrix A, with zero-based i and j:
 *   a(i,j) = (i+1) + prism_d_lambda*(j+1)
 */
double aij(int i, int j)
{
  const double d_row = (double)(i + 1);
  const double d_col = (double)(j + 1);

  return d_row + prism_d_lambda * d_col;
}

/*
 * Entry (i,j) of the test matrix B, with zero-based i and j:
 *   b(i,j) = (i+1) + prism_d_gamma*(j+1)
 */
double bij(int i, int j)
{
  const double d_row = (double)(i + 1);
  const double d_col = (double)(j + 1);

  return d_row + prism_d_gamma * d_col;
}

/*
 * Initial entry (i,j) of C, with zero-based i and j:
 *   c(i,j) = i + prism_d_lambda*j
 *
 * C is pre-filled with these values so the test can verify that a
 * user-supplied C is added into the result properly.
 *
 * If this formula is changed, the last line of cij_result must be
 * changed to match.
 */
double cij(int i, int j)
{
  return (double)i + prism_d_lambda * (double)j;
}

/*
 * Expected entry (i,j) of C after C = d_alpha*op(A)*op(B) + d_beta*C,
 * with zero-based i and j.
 *
 * The closed forms below are the sums over m = 1..K (K = prism_i_kdim) of
 * op(A)(i,m)*op(B)(m,j) for the entries produced by aij and bij, using
 *   sum m   = K*(K+1)/2
 *   sum m^2 = K*(K+1)*(2K+1)/6 = K*(2K^2 + 3K + 1)/6
 * The branch is selected by the global flags prism_e_transa and
 * prism_e_transb.
 *
 * All arithmetic is carried out in double so that K*(K+1) cannot overflow
 * int for large K.  The order of operations of each term is kept unchanged.
 *
 * Note: uses global variables prism_i_kdim, prism_d_lambda, prism_d_gamma,
 * d_alpha and d_beta.
 */
double cij_result(int i, int j)
{
  const double d_k   = prism_i_kdim;
  const double d_row = i + 1.0;
  const double d_col = j + 1.0;
  double temp;

  if (prism_e_transa == true && prism_e_transb == true)
    /* op(A)(i,m) = m + lambda*(i+1),  op(B)(m,j) = (j+1) + gamma*m */
    temp = (prism_d_lambda*prism_d_gamma*d_k*(d_k+1.0)*d_row/2.0
            + d_k*(d_k+1.0)*d_col/2.0
            + prism_d_lambda*d_k*d_row*d_col
            + prism_d_gamma*d_k*(2.0*d_k*d_k + 3.0*d_k+1.0)/6.0);
  else if (prism_e_transa == true)
    /* op(A)(i,m) = m + lambda*(i+1),  op(B)(m,j) = m + gamma*(j+1) */
    temp = (d_k*(d_k+1.0)*(2.0*d_k+1.0))/6.0
            + d_k*(d_k+1.0)
            *(prism_d_lambda*d_row+prism_d_gamma*d_col)/2.0
            + prism_d_lambda*prism_d_gamma*d_k*d_row*d_col;
  else if (prism_e_transb == true)
    /* op(A)(i,m) = (i+1) + lambda*m,  op(B)(m,j) = (j+1) + gamma*m */
    temp = (prism_d_gamma*d_k*(d_k+1.0)*d_row/2.0
            + prism_d_lambda*d_k*(d_k+1.0)*d_col/2.0
            + d_k*d_row*d_col
            + prism_d_lambda*prism_d_gamma*d_k*(2.0*d_k*d_k
            + 3.0*d_k+1.0)/6.0);
  else
    /* op(A)(i,m) = (i+1) + lambda*m,  op(B)(m,j) = m + gamma*(j+1) */
    temp = (d_k*(d_k+1.0)*d_row/2.0
            + prism_d_lambda*prism_d_gamma*d_k*(d_k+1.0)
            *d_col/2.0 + prism_d_gamma*d_k*d_row*d_col
            + prism_d_lambda*d_k
            *(2.0*d_k*d_k + 3.0*d_k+1.0)/6.0);

  /*
   * Since C = alpha*A*B + beta*C, need to add in original C (from function cij
   * above). Global variable d_alpha and d_beta are defined at top of this file.
   * The expression multiplied by d_beta must stay identical to the one in cij.
   */
  temp = d_beta*(1.0*i + prism_d_lambda*j) + d_alpha*temp;

  return temp;
}
