/* bmmc_mpi.c

   BMMC-MPI permutation functions.
   */

/* $Id: bmmc_mpi.c,v 1.3 1997/05/06 22:01:39 thc Exp $
   $Log: bmmc_mpi.c,v $
   Revision 1.3  1997/05/06 22:01:39  thc
   Added copyright notice.

   Revision 1.2  1997/04/28 03:53:08  james
   Changed comments in declaration of Q and Q_inv in factor_BMMC_MPI to
   reflect actual usage.

   Revision 1.1  1997/04/18 18:23:59  thc
   Initial revision

   */

/*
 * Copyright (C) 1997, Thomas H. Cormen, thc@cs.dartmouth.edu
 *
 * This software may be freely copied, modified, and redistributed,
 * provided that this copyright notice is preserved on all copies.
 *
 * There is no warranty or other guarantee of fitness for this
 * software, and it is provided solely "as is".  Bug reports or fixes
 * may be sent to the author, who may or may not act on them as he
 * desires.
 *
 * Rights are granted to use this software in any non-commercial
 * enterprise.  For commercial rights to this software, please contact
 * the author.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bit_matrix_types.h"
#include "bit_matrix_fns.h"
#include "mpi.h"
#include "bmmc_mpi.h"

/* Prototypes for private functions. */

/* Fills the cols x cols matrix T so that the trailing rows x rows
   submatrix of A T is nonsingular.  T must already be allocated. */
static void get_trailer(bit_matrix T,
			bit_matrix A,
			int rows,
			int cols);

/* Fills the cols x cols reducer matrix R and bit permutation matrix
   Pi (both already allocated) and returns |S|, the number of basis
   columns among the leftmost cols-rows columns of A. */
static int get_reducer(bit_matrix R,
		       bit_matrix Pi,
		       bit_matrix A,
		       int rows,
		       int cols);


/* Release every matrix owned by a BMMC_MPI_factor_info.

   Input:
     info: factor information previously filled in by factor_BMMC_MPI.
           A NULL info is ignored.

   Each of the seven matrix pointers is freed and then reset to NULL,
   so calling this function a second time on the same structure is
   harmless, and a later perform_BMMC_MPI on the freed structure is
   rejected by its NULL checks instead of touching freed memory.  The
   scalar fields (card_S, c_offset, c_proc) are left untouched.

   Fields that are already NULL (for example because factor_BMMC_MPI
   returned early) are fine: free(NULL) is a no-op. */
void free_BMMC_MPI_factor_info(BMMC_MPI_factor_info *info)
{
  /* Free one field and forget the stale pointer.  Wrapped in
     do/while so it behaves as a single statement. */
#define FREEIT(what)				\
  do						\
    {						\
      free(info->what);				\
      info->what = NULL;			\
    }						\
  while (0)

  if (info == NULL)
    return;

  FREEIT(alpha_S);
  FREEIT(alpha_Sbar);
  FREEIT(beta);
  FREEIT(gamma_S);
  FREEIT(delta);
  FREEIT(delta_inv);
  FREEIT(in_proc_perm);

  /* The helper refers to the local name "info"; don't let it leak
     into the rest of the file. */
#undef FREEIT
}


/* Create the factored information.
   Inputs:
     A: matrix to factor (not modified; a private copy is adjusted)
     c: complement vector
     n: log of problem size (= rows and columns of A)
     p: log of number of processors
     f: bits f through f+p-1 of an index indicate which processor
        the element is on.  Bit numbers run 0 to n-1.

   Output:
     Fields of info are allocated and filled in.
     Return value is BMMC_MPI_OK (0) if successful,
     BMMC_MPI_BAD_FACTOR if n, p, f are inconsistent, or
     BMMC_MPI_SINGULAR if A is not invertible.

   Note:
     The caller should call free_BMMC_MPI_factor_info later on, though
     it is not necessary if factor_BMMC_MPI returns an error code
     other than 0: on both error paths every pointer in info is still
     NULL and nothing has been allocated into it.

     All working matrices allocated here, including the private copy
     of A, are released before returning.
*/
int factor_BMMC_MPI(bit_matrix A,
		    matrix_column c,
		    int n,
		    int p,
		    int f,
		    BMMC_MPI_factor_info *info)
{
  bit_matrix A_inv;		/* inverse of A */
  bit_matrix A_temp;		/* temp copy of A */
  bit_matrix A_proc;		/* processor rows of A */
  bit_matrix T;			/* trailer matrix */
  bit_matrix R;			/* reducer matrix */
  bit_matrix Pi;		/* bit permutation matrix */
  bit_matrix T_R_Pi;		/* matrix product */
  bit_matrix A_prime;		/* working matrix */
  bit_matrix Q;			/* maps actual bit layout to proc-major */
  bit_matrix Q_inv;		/* maps proc-major bit layout to actual */
  int nonsingular;		/* is A nonsingular? */
  int card_Sbar;		/* n - p - card_S */
  int j;			/* column index */

  /* Set all pointers in info to NULL, just in case we leave
     factor_BMMC_MPI early and free_BMMC_MPI_factor_info is called
     later on. */
  info->alpha_S = NULL;
  info->alpha_Sbar = NULL;
  info->beta = NULL;
  info->gamma_S = NULL;
  info->delta = NULL;
  info->delta_inv = NULL;
  info->in_proc_perm = NULL;

  /* Check that the parameters are reasonable. */
  if (n < 0 || p < 0 || p > n || f < 0 || f > n-p)
    return BMMC_MPI_BAD_FACTOR;

  /* All bets are off if A is singular (not invertible).  The inverse
     itself is not needed, only the yes/no answer. */
  A_inv = allocate_bit_matrix(n);
  nonsingular = invert_bit_matrix(A_inv, A, n);
  free_bit_matrix(A_inv);
  if (!nonsingular)
    return BMMC_MPI_SINGULAR;

  /* Adjust A and c according to where the processor bits really are.
     A_temp is a copy of A, and then we set A to point to the same
     matrix as A_temp.  Since A is just a local pointer, the caller
     doesn't see this change, which is as it should be.

     perform_BMMC_MPI assumes a processor-major layout, in which the
     processor number occupies bits n-p through n-1.  Q maps an index
     in the actual layout to the corresponding processor-major index:
     column f+j of Q (processor bit j of the actual layout) has its
     single 1 in row n-p+j, and column f+p+j has its single 1 in row
     f+j.  Q_inv is the inverse map, from processor-major back to the
     actual layout.

     The adjusted matrix is the product Q A Q_inv (go from
     processor-major to actual, apply A, come back to
     processor-major), and the adjusted complement vector is Q c.

     Q and Q_inv are block matrices of the following forms:

            f     p   n-p-f                      f   n-p-f   p
         +-----+-----+-----+                  +-----+-----+-----+
         |  I  |  0  |  0  | f                |  I  |  0  |  0  | f
         +-----+-----+-----+                  +-----+-----+-----+
     Q = |  0  |  0  |  I  | n-p-f    Q_inv = |  0  |  0  |  I  | p
         +-----+-----+-----+                  +-----+-----+-----+
         |  0  |  I  |  0  | p                |  0  |  I  |  0  | n-p-f
         +-----+-----+-----+                  +-----+-----+-----+

     I denotes a square submatrix that is the identity matrix.  Note
     that if the actual layout is already processor-major, then f = n-p,
     and so n-p-f = 0 and Q = Q_inv = I.
     */

  A_temp = dup_bit_matrix(A, n);
  A = A_temp;
  Q = allocate_bit_matrix(n);
  Q_inv = allocate_bit_matrix(n);
  identity_matrix(Q, n);
  identity_matrix(Q_inv, n);
  for (j = 0; j < p; j++)
    {
      Q_inv[n-p+j] = ((matrix_column) 1) << (f+j);
      Q[f+j] = ((matrix_column) 1) << (n-p+j);
    }
  for (j = 0; j < n-p-f; j++)
    {
      Q_inv[f+j] = ((matrix_column) 1) << (f+p+j);
      Q[f+p+j] = ((matrix_column) 1) << (f+j);
    }

#ifdef DEBUG
  {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0)
      {
	print_bit_matrix(Q, n, n, "Q");
	print_bit_matrix(Q_inv, n, n, "Q_inv");
      }
  }
#endif

  /* c = Q c, then A = Q (A Q_inv).  Q_inv is reused as scratch space
     for the intermediate product A Q_inv. */
  c = bit_matrix_vector_multiply(Q, c, n);
  bit_matrix_multiply(Q_inv, A, Q_inv, n);
  bit_matrix_multiply(A, Q, Q_inv, n);
  free_bit_matrix(Q_inv);
  free_bit_matrix(Q);

#ifdef DEBUG
  {
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0)
      {
	print_bit_matrix(A, n, n, "A");
	print_bit_matrix(&c, n, 1, "c");
      }
  }
#endif

  /* Make T be the trailer matrix such that A T has a nonsingular
     trailing p x p submatrix.  A_proc is the processor rows of A,
     i.e., the bottom p rows. */
  A_proc = allocate_bit_matrix(n);
  extract_bit_submatrix(A_proc, A, n-p, 0, p, n);
  T = allocate_bit_matrix(n);
  get_trailer(T, A_proc, p, n);
  
  /* Make R be the reducer matrix such that the only nonzero columns
     in the gamma portion of (A T) R (i.e., the lower left p x (n-p)
     submatrix) form a basis S (i.e., are linearly independent).  The
     call to get_reducer will return the cardinality of this basis.
     Also, fill in the bit permutation Pi that moves nonbasis columns
     of gamma to the left and basis columns of gamma to the right. */
  R = allocate_bit_matrix(n);
  Pi = allocate_bit_matrix(n);
  info->card_S = get_reducer(R, Pi, A_proc, p, n);
  card_Sbar = n - p - info->card_S;
  free_bit_matrix(A_proc);

  /* A_prime = A T R Pi.  And the in-processor permutation is (T R
     Pi)^{-1}. */
  T_R_Pi = allocate_bit_matrix(n);
  A_prime = allocate_bit_matrix(n);
  bit_matrix_multiply(T_R_Pi, R, Pi, n);
  bit_matrix_multiply(T_R_Pi, T, T_R_Pi, n);
  info->in_proc_perm = allocate_bit_matrix(n);
  invert_bit_matrix(info->in_proc_perm, T_R_Pi, n);
  bit_matrix_multiply(A_prime, A, T_R_Pi, n);

  /* The adjusted copy of A has served its purpose once A_prime has
     been formed.  Release it here; nothing else owns A_temp, so
     forgetting this would leak one n-column matrix per call. */
  free_bit_matrix(A_temp);
  free_bit_matrix(T_R_Pi);
  free_bit_matrix(Pi);
  free_bit_matrix(R);
  free_bit_matrix(T);

  /* Allocate and extract our favorite submatrices.  A_prime is
     viewed as the block matrix

               card_Sbar   card_S      p
             +-----------+--------+--------+
             | alpha_Sbar| alpha_S|  beta  | n-p
             +-----------+--------+--------+
             |           | gamma_S|  delta | p
             +-----------+--------+--------+

     and each piece is allocated with as many columns as it has in
     this picture. */
  info->alpha_S = allocate_bit_matrix(info->card_S);
  info->alpha_Sbar = allocate_bit_matrix(card_Sbar);
  info->beta = allocate_bit_matrix(p);
  info->gamma_S = allocate_bit_matrix(info->card_S);
  info->delta = allocate_bit_matrix(p);
  info->delta_inv = allocate_bit_matrix(p);

  extract_bit_submatrix(info->alpha_S, A_prime,
			0, card_Sbar, n-p, info->card_S);
  extract_bit_submatrix(info->alpha_Sbar, A_prime, 0, 0, n-p, card_Sbar);
  extract_bit_submatrix(info->beta, A_prime, 0, n-p, n-p, p);
  extract_bit_submatrix(info->gamma_S, A_prime,
			n-p, card_Sbar, p, info->card_S);
  extract_bit_submatrix(info->delta, A_prime, n-p, n-p, p, p);
  invert_bit_matrix(info->delta_inv, info->delta, p);

  /* The final piece of real work is to extract the pieces of the
     complement vector: c_offset is the low n-p bits of the adjusted
     c, and c_proc is the p bits above them.
     NOTE(review): the shifts by n-p and p are undefined if either
     equals the bit width of matrix_column -- confirm that callers
     keep n below that width. */
  info->c_offset = c & ~((~((matrix_column) 0)) << (n-p));
  info->c_proc = (c >> (n-p)) & ~((~((matrix_column) 0)) << p);

  /* Finish up by freeing the remaining matrix that we no longer
     need. */
  free_bit_matrix(A_prime);  

  return BMMC_MPI_OK;
}


/* Perform the BMMC permutation, given the factored information.
   Inputs:
     info: factored information
     n:    log of problem size (= rows and columns of A)
     p:    log of number of processors
     rank: rank of this processor, in the range 0 to 2^p-1
     comm: MPI communicator
     size: size of each record, in bytes
     data: pointer to array of 2^(n-p) records, of above size, to permute
     temp: pointer to array of same size as data, for temp use;
           must not overlap data

   Outputs:
     data is overwritten with the result of the permutation.
     Return value is BMMC_MPI_OK (0) if successful,
     BMMC_MPI_BAD_FACTOR if info is NULL or incomplete or n, p,
     card_S are inconsistent, or BMMC_MPI_MPI_ERROR if an MPI call
     reports failure.

   It is the caller's responsibility to call
   free_BMMC_MPI_factor_info(info) and to deallocate temp and any
   other parameters given to this function afterward.  */
int perform_BMMC_MPI(BMMC_MPI_factor_info *info,
		     int n,
		     int p,
		     int rank,
		     MPI_Comm comm,
		     int size,
		     void *data,
		     void *temp)
{

  /* Macro to determine the next bit that flips in a Gray-code
     sequence, i.e., the index of the lowest 1 bit of value.  It works
     a nibble at a time; value must be nonzero.  Entry 0 of the table
     is never used, because the scan stops at a nonzero nibble. */
#define GRAY_BIT(type,value,flip)					     \
  do									     \
    {									     \
      static const int flips[16] = { 4, 0, 1, 0, 2, 0, 1, 0,		     \
				     3, 0, 1, 0, 2, 0, 1, 0 };		     \
      type x;								     \
      int nibble;							     \
      for (x = (value), (flip) = 0;					     \
	   (nibble = (int) (x & 0xF)) == 0;				     \
	   x >>= 4, (flip) += 4)					     \
	;								     \
      (flip) += flips[nibble];						     \
    }									     \
  while (0)

  /* If you don't like this magic number, replace it with your favorite
     magic number.  MPI requires that we use something, but we really
     don't care what. */
#define BMMC_MPI_MSG_TAG 86

  long int vpr;			/* N/P */
  long int source_index;	/* source index for a permutation */
  long int target_index;	/* target index for a permutation */
  long int ordinal;		/* ordinal for a Gray code */
  int flip;			/* bit that flips in a Gray code */
  int target_procs;		/* how many target processors */
  long int target_recs;		/* how many records per target processor */
  long int S_bits;		/* basis bits in leftmost n-p columns */
  long int Sbar_bits;		/* non-basis bits in leftmost n-p columns */
  char *data_bytes = (char *) data; /* byte view of data, for arithmetic */
  char *temp_bytes = (char *) temp; /* byte view of temp, for arithmetic */
  char *buffer_addr;		/* address of buffer to send/receive */
  int target_proc;		/* processor number to send to */
  int source_proc;		/* processor number sending to us */

  /* Submatrix-vector subproducts. */
  long int offset_alpha_S_part, offset_beta_part,
    target_proc_gamma_part, target_proc_delta_part;

  /* Bad news if any of the factors are missing, if n and p don't
     describe a sensible problem, or if card_S is out of range.
     delta_inv is checked too, since it is needed below to find out
     who is sending to us. */
  if (info == NULL ||
      n < 0 || p < 0 || p > n ||
      info->alpha_S == NULL ||
      info->alpha_Sbar == NULL ||
      info->beta == NULL ||
      info->gamma_S == NULL ||
      info->delta == NULL ||
      info->delta_inv == NULL ||
      info->in_proc_perm == NULL ||
      info->card_S < 0 || info->card_S > n-p || info->card_S > p)
    return BMMC_MPI_BAD_FACTOR;

  vpr = ((long int) 1) << (n-p);

  /* First perform the in-processor permutation given by
     info->in_proc_perm.  Perform it in Gray-code order for speed.
     However, make sure that in the target index, all bits above n-p
     are 0. */

#ifdef DEBUG
  {
    long int offset;
    printf("Proc %d: start with data buffer\n", rank);
    for (offset = 0; offset < vpr; offset++)
      printf("%ld: %lx\n", offset, *(((long int *) data) + offset));

    print_bit_matrix(info->in_proc_perm, n, n, "in_proc_perm");
  }
#endif

#ifdef VERBOSE
  if (rank == 0)
    printf("Performing the in-processor permutation...\n");
#endif

  /* Source index 0 on this processor has the processor bits equal to
     rank, so its image is the last p columns of in_proc_perm applied
     to rank, truncated to an offset within the processor. */
  source_index = 0;
  target_index = bit_matrix_vector_multiply(info->in_proc_perm+n-p,
					    (matrix_column) rank, p) &
                   ~((~((matrix_column) 0)) << (n-p));

  for (ordinal = 0; ordinal < vpr; ordinal++)
    {
#ifdef DEBUG
      printf("Proc %d: copy from %ld to %ld\n",
	     rank, source_index, target_index);
#endif
      /* Copy the record from position source_index of the data buffer
	 to position target_index of the temp buffer.  The two buffers
	 are distinct, so memcpy is safe. */
      memcpy(temp_bytes + (target_index * size),
	     data_bytes + (source_index * size),
	     (size_t) size);

      /* Find which bit flips in the Gray-code ordering.  Then flip it
	 in the source index and add the appropriate column of
	 in_proc_perm into target_index.  Skip this after the last
	 record: the "next" flip would be bit n-p, which is not an
	 offset bit, and for p == 0 would index past the last column
	 of in_proc_perm. */
      if (ordinal + 1 < vpr)
	{
	  GRAY_BIT(long int, ordinal+1, flip);
	  source_index ^= ((long int) 1) << flip;
	  target_index ^= info->in_proc_perm[flip];
	}
    }

#ifdef DEBUG
  {
    long int offset;
    printf("Proc %d: after in-proc perm, temp buffer is\n", rank);
    for (offset = 0; offset < vpr; offset++)
      printf("%ld: %lx\n", offset, *(((long int *) temp) + offset));
  }
#endif

  /* Now for the interesting part: the interprocessor communication.
     After performing the in-processor permutation, the temp buffer is
     comprised of 2^card_S consecutive sections, each with
     (N/(P*2^card_S)) records.  Each section is destined for a
     different target processor.  Send it off, and simultaneously
     receive a section of the same size from another processor.  Then
     distribute the received records into the data buffer. */

  target_procs = 1 << info->card_S;
  target_recs = ((long int) 1) << (n - p - info->card_S);

  /* Precompute the submatrix-vector product that doesn't change. */
  target_proc_delta_part = bit_matrix_vector_multiply(info->delta,
			     (matrix_column) rank, p);

  /* The target processor will differ for each combination of S_bits. */
  for (S_bits = 0, buffer_addr = temp_bytes;
       S_bits < target_procs;
       S_bits++, buffer_addr += target_recs * size)
    {
      /* Precompute more submatrix-vector products that don't change. */
      offset_alpha_S_part = bit_matrix_vector_multiply(info->alpha_S,
			      (matrix_column) S_bits, info->card_S);
      target_proc_gamma_part = bit_matrix_vector_multiply(info->gamma_S,
				 (matrix_column) S_bits, info->card_S);

      /* We now know enough to compute the target processor that we
	 are sending to and the source processor that is sending to us.
	 The source is the processor whose own target, for this same
	 S_bits, is rank; solving
	   rank = gamma_part ^ delta * source ^ c_proc
	 for source gives the delta_inv product below. */
      target_proc = target_proc_gamma_part ^ target_proc_delta_part ^
	              info->c_proc;
      source_proc = bit_matrix_vector_multiply(info->delta_inv,
		      (matrix_column) (target_proc_gamma_part ^ rank ^
				       info->c_proc),
		      p);
      offset_beta_part = bit_matrix_vector_multiply(info->beta,
			   (matrix_column) source_proc, p);

      /* If I'm sending to myself, then there's no need to send.
	 Otherwise, yes, there's a need to send.  Of course, if I'm
	 sending to myself, then I'm receiving from myself, and
	 there's no need to receive, either. */
      if (target_proc != rank)
	{
	  MPI_Status status;
	  int mpi_rc;		/* return code of the MPI call */

#ifdef DEBUG
	  long int offset;
	  printf("Proc %d: sending buffer to proc %d\n", rank, target_proc);
	  for (offset = 0; offset < target_recs; offset++)
	    printf("%ld: %lx\n",
		   offset, *(((long int *) buffer_addr) + offset));
#endif

#ifdef VERBOSE
	  if (rank == 0)
	    printf("Processor 0 sending to %d, receiving from %d...\n",
		   target_proc, source_proc);
#endif

	  /* NOTE(review): the count parameter is an int, so
	     target_recs * size must fit in an int -- confirm callers
	     stay within that limit. */
	  mpi_rc = MPI_Sendrecv_replace(buffer_addr, target_recs * size,
					MPI_BYTE, target_proc,
					BMMC_MPI_MSG_TAG,
					source_proc, BMMC_MPI_MSG_TAG, comm,
					&status);

	  /* Judge success by the return code.  The MPI_ERROR field of
	     a status is only required to be meaningful for calls that
	     return multiple statuses, so testing it here could report
	     a bogus failure or miss a real one. */
	  if (mpi_rc != MPI_SUCCESS)
	    return BMMC_MPI_MPI_ERROR;

#ifdef DEBUG
	  printf("Proc %d: got buffer from proc %d\n", rank, source_proc);
	  for (offset = 0; offset < target_recs; offset++)
	    printf("%ld: %lx\n",
		   offset, *(((long int *) buffer_addr) + offset));
#endif
	}
#ifdef VERBOSE
      else
	{
	  if (rank == 0)
	    printf("Processor 0 sending to itself...\n");
	}

      if (rank == 0)
	printf("Processor 0 distributing...\n");
#endif

      /* At this point, there are target_recs records starting at
	 buffer_addr to be distributed into the data buffer.  We
	 compute the source index of each (in Gray-code order) and
	 from that, the target index, which is all we really need to
	 know. */
      Sbar_bits = 0;
      target_index = offset_alpha_S_part ^ offset_beta_part ^
	               info->c_offset;

      for (ordinal = 0; ordinal < target_recs; ordinal++)
	{
	  /* Copy the record from position Sbar_bits of the current
	     section of the temp buffer to position target_index of
	     the data buffer. */
	  memcpy(data_bytes + (target_index * size),
		 buffer_addr + (Sbar_bits * size),
		 (size_t) size);

	  /* Find which bit flips in the Gray-code ordering.  Then
	     flip it in Sbar_bits and add the appropriate column of
	     info->alpha_Sbar into target_index.  Skip this after the
	     last record: the "next" flip would be bit card_Sbar, and
	     alpha_Sbar has only card_Sbar columns, so that column
	     does not exist. */
	  if (ordinal + 1 < target_recs)
	    {
	      GRAY_BIT(long int, ordinal+1, flip);
	      Sbar_bits ^= ((long int) 1) << flip;
	      target_index ^= info->alpha_Sbar[flip];
	    }
	}
    }

  /* Elvis has left the building...in a good way. */
  return BMMC_MPI_OK;
}


/* Build the trailer matrix T: a cols x cols matrix for which the
   trailing rows x rows submatrix of the product A T is nonsingular.
   Inputs:
     A: a matrix with full row rank
     rows, cols: A is rows x cols, T is cols x cols

   Output:
     T, which must already be allocated, is filled in.  It is the
     identity matrix, except that some of its rightmost rows columns
     additionally contain one column taken from the left part.
     */
static void get_trailer(bit_matrix T,
			bit_matrix A,
			int rows,
			int cols)
{
  matrix_column in_basis;	/* bit j set: column j of A is in the basis */
  int donor = 0;		/* next candidate basis column on the left */
  int col;			/* column being repaired on the right */

  /* T begins life as the identity. */
  identity_matrix(T, cols);

  /* Ask for a column basis of A, chosen from right to left. */
  find_bit_matrix_basis(&in_basis, A, rows, cols);

  /* Walk the rightmost rows columns.  Every one of them that missed
     the basis is paired with the next not-yet-used basis column
     further left, and that pairing is recorded in T. */
  for (col = cols - rows; col < cols; col++)
    {
      /* Columns already in the basis need no help. */
      if (((in_basis >> col) & 1) != 0)
	continue;

      /* Advance to the next basis column that has not been handed
	 out yet. */
      while (((in_basis >> donor) & 1) == 0)
	donor++;

      /* Fold the donor column into this one, then retire the donor
	 so it is not used twice. */
      T[col] |= T[donor];
      donor++;
    }
}


/* Build the reducer matrix R and the bit permutation matrix Pi.

   R is chosen so that, in the product A R, the nonzero columns of
   the lower left rows x (cols-rows) submatrix are linearly
   independent, i.e., they form a basis.  Call the set of indices of
   those basis columns S.  Pi then reorders columns so that in
   A R Pi the leftmost cols-rows-|S| columns of that submatrix are
   all 0 and the next |S| columns are the basis columns.
   Inputs:
     A: a matrix with full row rank
     rows, cols: A is rows x cols, R and Pi are cols x cols

   Outputs:
     R and Pi, which are already allocated, are filled in.
     Return value is |S|.  */
static int get_reducer(bit_matrix R,
		       bit_matrix Pi,
		       bit_matrix A,
		       int rows,
		       int cols)
{
  const int left_cols = cols - rows; /* width of the left part of A */
  bit_matrix deps;		/* column dependency matrix for A */
  matrix_column basis_mask;	/* bit j set: left column j is in S */
  int basis_count = 0;		/* |S| */
  int next_basis;		/* next slot of Pi for a column in S */
  int next_nonbasis = 0;	/* next slot of Pi for a column not in S */
  int col;			/* column index */

  /* Find the dependencies among the leftmost cols-rows columns of A.
     Entry (i,j) of deps is 1 if column j depends on column i; a
     basis column has an all-zero column in deps.  The return value
     flags which of those columns are in the basis. */
  deps = allocate_bit_matrix(cols);
  basis_mask = find_dependencies(deps, A, rows, left_cols);

  /* R is the identity with the dependency columns ORed into its
     first cols-rows columns.  Count the members of S on the way. */
  identity_matrix(R, cols);
  for (col = 0; col < left_cols; col++)
    {
      R[col] |= deps[col];
      if (((basis_mask >> col) & 1) != 0)
	basis_count++;
    }

  free_bit_matrix(deps);

  /* Lay out Pi.  Non-basis columns are packed, in order, into slots
     0 .. left_cols-|S|-1; basis columns are packed, in order, into
     the slots that follow. */
  next_basis = left_cols - basis_count;
  for (col = 0; col < left_cols; col++)
    {
      const matrix_column unit = ((matrix_column) 1) << col;

      if (((basis_mask >> col) & 1) != 0)
	Pi[next_basis++] = unit;
      else
	Pi[next_nonbasis++] = unit;
    }

  /* The rightmost rows columns stay where they are. */
  for (col = left_cols; col < cols; col++)
    Pi[col] = ((matrix_column) 1) << col;

  return basis_count;
}

/**********************************************************************/

/* Convenience wrappers. */

/* Factor and perform a BMMC permutation in one call.
   Inputs:
     A:    matrix to factor
     c:    complement vector
     n:    log of problem size (= rows and columns of A)
     p:    log of number of processors
     f:    bits f through f+p-1 of an index indicate which processor
           the element is on.  Bit numbers run 0 to n-1.
     rank: rank of this processor, in the range 0 to 2^p-1
     comm: MPI communicator
     size: size of each record, in bytes
     data: pointer to array of 2^(n-p) records, of above size, to permute
     temp: pointer to array of same size as data, for temp use

   Outputs:
     data is overwritten with the result of the permutation.
     Return value is 0 if successful, nonzero if error: either the
     error from factoring (in which case nothing was permuted) or
     the error from performing the permutation.

   The factor information is created and released internally.  It is
   the caller's responsibility to deallocate temp and any other
   parameters given to this function afterward.
   */
int BMMC_MPI(bit_matrix A,
	     matrix_column c,
	     int n,
	     int p,
	     int f,
	     int rank,
	     MPI_Comm comm,
	     int size,
	     void *data,
	     void *temp)
{
  BMMC_MPI_factor_info factors;
  int result = factor_BMMC_MPI(A, c, n, p, f, &factors);

  /* Only a successful factorization leaves anything to perform or
     to free. */
  if (result == BMMC_MPI_OK)
    {
      result = perform_BMMC_MPI(&factors, n, p, rank, comm,
				size, data, temp);
      free_BMMC_MPI_factor_info(&factors);
    }

  return result;
}

/* The following functions are the same as factor_BMMC_MPI and
   BMMC_MPI except that they do not take the f parameter.

   For factor_BMMC_MPI_proc_major and BMMC_MPI_proc_major, data
   is assumed to be in processor-major order, so that the most
   significant p bits contain the processor number and the least
   significant n-p bits contain the offset within the processor.  The
   processor number of element i is floor(i/(N/P)), where N = 2^n and
   P = 2^p.

   For factor_BMMC_MPI_proc_minor and BMMC_MPI_proc_minor, data
   is assumed to be in processor-minor order, so that the least
   significant p bits contain the processor number and the most
   significant n-p bits contain the offset within the processor.  The
   processor number of element i is i mod P.
   */

/* factor_BMMC_MPI for a processor-major layout (f = n-p). */
int factor_BMMC_MPI_proc_major(bit_matrix A,
			       matrix_column c,
			       int n,
			       int p,
			       BMMC_MPI_factor_info *info)
{
  /* The processor number sits in the top p bits, so its lowest bit
     is bit n-p. */
  const int first_proc_bit = n - p;

  return factor_BMMC_MPI(A, c, n, p, first_proc_bit, info);
}

/* factor_BMMC_MPI for a processor-minor layout (f = 0). */
int factor_BMMC_MPI_proc_minor(bit_matrix A,
			       matrix_column c,
			       int n,
			       int p,
			       BMMC_MPI_factor_info *info)
{
  /* The processor number sits in the bottom p bits, so its lowest
     bit is bit 0. */
  const int first_proc_bit = 0;

  return factor_BMMC_MPI(A, c, n, p, first_proc_bit, info);
}

/* BMMC_MPI for a processor-major layout (f = n-p). */
int BMMC_MPI_proc_major(bit_matrix A,
			matrix_column c,
			int n,
			int p,
			int rank,
			MPI_Comm comm,
			int size,
			void *data,
			void *temp)
{
  /* The processor number sits in the top p bits, so its lowest bit
     is bit n-p. */
  const int first_proc_bit = n - p;

  return BMMC_MPI(A, c, n, p, first_proc_bit,
		  rank, comm, size, data, temp);
}

/* BMMC_MPI for a processor-minor layout (f = 0). */
int BMMC_MPI_proc_minor(bit_matrix A,
			matrix_column c,
			int n,
			int p,
			int rank,
			MPI_Comm comm,
			int size,
			void *data,
			void *temp)
{
  /* The processor number sits in the bottom p bits, so its lowest
     bit is bit 0. */
  const int first_proc_bit = 0;

  return BMMC_MPI(A, c, n, p, first_proc_bit,
		  rank, comm, size, data, temp);
}
