/*
 * iterwrite.c like the read version has been modified here to read the 
 * data from a processor in increments of at most processor rows by column
 * blocking size.  This reduces the potential for running out of memory to 
 * buffer the sends because each processor sends but does not necessarily 
 * block on the send.  So done the previous way I could try to communicate
 * the entire matrix at once.  Not a good thing.
 *
 * this also needs the assorted ifdefs and multiple signatures for 
 * portable compilation.
 */

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include "core.h"
#include "scalapackservice.h"

int iterwrite(outfile,data,desca)
char *outfile;
double *data;
int *desca;
{

  int i;
  int maxrow,maxcol;			/* largest possoble rows/cols */
  int prow,pcol;			/* current process */
  int size=sizeof(double);
  int num;				/* is there actually something ?*/
  int seekval;				/* file position */
  int output_fd;        		/* output fd's */
  int *rows;				/* array by process or num rows */
  int *cols;				/*  ditto columns */
  int writeblock;			/* default size to write to file */
  int nrows,ncols;			/* number of row/col blocks */
  int rowblock,colblock;		/* iterators over blocks */
  int row_extra,col_extra;		/* excess rows/columns */

  double *buf;				/* the buffer to read from BLACS */
  double *ptr;				/* pointer to access the buffer*/
  double *myptr;			/* pointer to access the buffer*/


  if((MYROW==0) &&(MYCOL==0)) {

      output_fd=open(outfile,O_RDWR);
      if(output_fd <0 ) {
	perror("output file");
	exit(1);
	}

      rows=(int *)malloc(sizeof(int)*PROC_ROWS);
      cols=(int *)malloc(sizeof(int)*PROC_COLS);
      if((rows==NULL) || (cols==NULL) ) {
	perror("row/col arrays ");
	exit(1);
	}


      maxrow=0;
      for(i=0;i<PROC_ROWS;i++) {
	rows[i]=HPC_numroc(desca[M_],ROW_BLOCK,i,0,PROC_ROWS);
	if(rows[i]>maxrow) {
	  maxrow=rows[i];
	  }
	}
      
      maxcol=0;
      for(i=0;i<PROC_COLS;i++) {
	cols[i]=HPC_numroc(desca[N_],COL_BLOCK,i,0,PROC_COLS);
	if(cols[i]>maxcol) {
	  maxcol=cols[i];
	  }
	}
      /*
       * change this to something a lot smaller because I think the
       *  problem is too much memory tied up.
       * so handle one block at a time.
       */
      
      buf=(double *)malloc(size*maxrow*COL_BLOCK);
      if(buf==NULL) {
	perror("buffer");
	exit(1);
	}
      writeblock=ROW_BLOCK*8;

      nrows=desca[M_] / ROW_BLOCK;
      ncols=desca[N_] / COL_BLOCK;
      row_extra=desca[M_] % ROW_BLOCK;
      col_extra=desca[N_] % COL_BLOCK;

      myptr=data;
      for(prow=0;prow<PROC_ROWS;prow++) {
	for(pcol=0;pcol<PROC_COLS;pcol++) {

	  /*if this test is true then the current */
	  /*processor actually has data */

	  num=rows[prow]*cols[pcol];
	  if(num > 0) {
	    if((MYROW == prow) && (MYCOL == pcol)) {
	    /*
	     * first handle myself.  nothing special here
	     * data already in memory and no Blacs calls
	     */
#ifdef DEBUG
	    sprintf(logbuf,"Handling myself" );
	    append_to_log(logbuf);
#endif

	    /*seek from start for this block */
	    /* the 5 ints are for the matrix information */

	    seekval=(pcol*desca[M_]*COL_BLOCK);
	    lseek(output_fd,(seekval*size)+(5*sizeof(int)),SEEK_SET);
	    seekval=0;

	    for(colblock=pcol;colblock<ncols;colblock++) {
	      if((colblock%PROC_COLS)==pcol) {
	      for(i=0;i<COL_BLOCK;i++) {
		for(rowblock=0;rowblock<nrows;rowblock++) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,myptr,writeblock);
		    myptr+=ROW_BLOCK;
		    }
		  else {
		    seekval+=ROW_BLOCK;
		    }
		  }
		if(row_extra > 0) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,myptr,row_extra*size);
		    myptr+=row_extra;
		    }
		  else {
		    seekval+=row_extra;
		    }
		  }
		}
	      }
	    else {
	      seekval+=desca[M_]*COL_BLOCK;
	      }
	    }

	    if(col_extra > 0) {
	      if((colblock%PROC_COLS)==pcol) {
	      for(i=0;i<col_extra;i++) {
		for(rowblock=0;rowblock<nrows;rowblock++) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,myptr,writeblock);
		    myptr+=ROW_BLOCK;
		    }
		  else {
		    seekval+=ROW_BLOCK;
		    }
		  }
		if(row_extra > 0) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,myptr,row_extra*size);
		    myptr+=row_extra;
		    }
		  else {
		    seekval+=row_extra;
		    }
		  }
		}
	      }
	      }
	    } /* end of me */
	else {
	    /*
	     *  read from processor prowxpcol
	     *  now I break up the receives by COL_BLOCK chunks
	     *  to keep from overloading things I hope
	     *  The point is all other processors send immediately
	     *  which is 'locally blocking'  that is until their
	     *  data can be written to a buffer but globally I think it
	     *  is wiping all the available memory for big problems.
	     *  If this doesn't fix it I will try barriers and only
	     *  allow processors to send one at a time.  That is not
	     *  efficient but may be necessary because of reality.
	     */
#ifdef DEBUG
	    sprintf(logbuf,"Handling %d x %d",prow,pcol);
	    append_to_log(logbuf);
#endif

	    /*seek from start for this block */
	    /* the 5 ints are for the matrix information */

	    seekval=(pcol*desca[M_]*COL_BLOCK);
	    lseek(output_fd,(seekval*size)+(5*sizeof(int)),SEEK_SET);
	    seekval=0;

	    for(colblock=pcol;colblock<ncols;colblock++) {
	      if((colblock%PROC_COLS)==pcol) {
		/*Cdgerv2d(desca[CTXT_],rows[prow],COL_BLOCK,buf,
			    rows[prow],prow,pcol);  */
		ptr=buf;
		for(i=0;i<COL_BLOCK;i++) {
		for(rowblock=0;rowblock<nrows;rowblock++) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,ptr,writeblock);
		    ptr+=ROW_BLOCK;
		    }
		  else {
		    seekval+=ROW_BLOCK;
		    }
		  }
		if(row_extra > 0) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,ptr,row_extra*size);
		    ptr+=row_extra;
		    }
		  else {
		    seekval+=row_extra;
		    }
		  }
		}
	      }
	    else {
	      seekval+=desca[M_]*COL_BLOCK;
	      }
	    }

	    if(col_extra > 0) {
	      if((colblock%PROC_COLS)==pcol) {
		/*Cdgerv2d(desca[CTXT_],rows[prow],col_extra,buf,
			    rows[prow],prow,pcol);  */
		ptr=buf;
	      for(i=0;i<col_extra;i++) {
		for(rowblock=0;rowblock<nrows;rowblock++) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,ptr,writeblock);
		    ptr+=ROW_BLOCK;
		    }
		  else {
		    seekval+=ROW_BLOCK;
		    }
		  }
		if(row_extra > 0) {
		  if((rowblock % PROC_ROWS)==prow) {
		    if(seekval>0) {
		      lseek(output_fd,seekval*size,SEEK_CUR);
		      seekval=0;
		      }
		    write(output_fd,ptr,row_extra*size);
		    ptr+=row_extra;
		    }
		  else {
		    seekval+=row_extra;
		    }
		  }
		}
	      }
	    }
	  } /* end of not me */
	  } /* end of num > 0 bytes to process */
	}
      }
      
    close(output_fd);

    /* just don't want a seg fault */

    if(buf) free(buf);
    if(rows) free(rows);
    if(cols) free(cols);
    }
  else {
    maxrow=HPC_numroc(desca[M_],desca[MB_],MYROW,0,PROC_ROWS);
    maxcol=HPC_numroc(desca[N_],desca[NB_],MYCOL,0,PROC_COLS);
    if((maxrow==0) || (maxcol==0)) {
      return(0);
      }
    col_extra=maxcol%COL_BLOCK;
    myptr=data;
    for(i=COL_BLOCK;i<=maxcol;i+=COL_BLOCK) {
      /*Cdgesd2d(desca[CTXT_],maxrow,COL_BLOCK,myptr,maxrow,0,0); */
      myptr=myptr+(COL_BLOCK*maxrow);
      }
    if(col_extra>0){
      /*Cdgesd2d(desca[CTXT_],maxrow,col_extra,myptr,maxrow,0,0); */
      }
    return(0);
    }
   return(0);
}
