#include "Bdef.h"

/*
 *  The bidirectional exchange topology (BE) is specialized for dealing with
 *  case where all nodes participating in the operation need to
 *  receive the answer.  It works best when # of nodes is some even
 *  power of two.  This topology is based on an algorithm presented by
 *  Robert van de Geijn, et al.
 */
void BE_comb(ctxt, scope, N, length, bp, bp2, Xvvop)
BLACSCONTEXT  *ctxt;
char  scope;
int  N;
int  length;
BLACBUFF  *bp;
BLACBUFF  *bp2;
VVFUNPTR  Xvvop;
/*
 *  -- V1.0 BLACS routine --
 *  University of Tennessee, February 28, 1995
 *  Written by Clint Whaley.
 *
 *  Purpose
 *  =======
 *  Perform a element-by-element combine on vectors.
 *  The answer will be left on all participating processes.  Since this method
 *  uses a hypercube communication pattern, the number of nodes participating
 *  in the operation must be a power of 2 for it to perform efficiently.
 *
 *  Arguments
 *  =========
 *  CTXT    (input) pointer to BLACSCONTEXT
 *          The BLACS context where operation is taking place.
 *
 *  SCOPE   (input) char
 *          Limit the scope of the operation.
 *          = 'r' :   Operation is performed by a process row.
 *          = 'c' :   Operation is performed by a process column.
 *          = 'a' :   Operation is performed by all processes in grid.
 *
 *  N       (input) int
 *          The number of elements in the vector to be combined.
 *
 *  LENGTH  (input) int
 *          The length, in bytes, of the vector to be combined.
 *
 *  BP      (input/output) pointer to BLACBUFF.
 *          BLACBUFF is a special data type used by the BLACS to control
 *          buffers and the asynchronous operations coming out of them.
 *          This BLACBUFF should have a buffer who's first N elements
 *          contain the data to be combined. Additional space may be
 *          required, depending upon what combine is being performed.
 *          In any case, the buffer's length should be LENGTH.
 *
 *  BP2     (workspace) pointer to BLACBUFF.
 *          This BLACBUFF is used to receive information for combining with
 *          this process's information.  It should point to a buffer of size
 *          LENGTH.
 *
 *  Xvvop   (input) pointer to typed operation function
 *          Points to a typed function which performs the required operation
 *          (e.g. summation) on the two N-element vectors.
 *
 * ------------------------------------------------------------------------
 */
{
   void Ssend2d00();
   void Srecv2d00();
   int BuffIsFree();
   void Asend2d00();
   void Arecv2d00();
   char *getbuff();

   char *buff, *buff2;
   int msgid, np2msgid, nnodes, rdest, cdest, mydist, np2, bit;
   int FTmsgid, FTnp2msgid;

   switch (scope)
   {
   case 'r':
      nnodes = ctxt->npcol;
      mydist = ctxt->mycol;
      break;
   case 'c':
      nnodes = ctxt->nprow;
      mydist = ctxt->myrow;
      break;
   case 'a':
      nnodes = ctxt->Ng;
      mydist = ctxt->vIam;
      break;
   default :
      return;
   }
   if (nnodes < 2) return;
   rdest = ctxt->myrow;
   cdest = ctxt->mycol;

   buff = bp->Buff;
   buff2 = bp2->Buff;

   for (np2=4; np2 < nnodes; np2 <<= 1);
   if (np2 > nnodes) np2 >>= 1;

   if (np2 != nnodes)
   {
      switch(scope)
      {
      case 'r':
#if (BeginForceType < EndForceType)
            np2msgid = Mrid(ctxt);
            FTnp2msgid = MFTrid();
#else
            np2msgid = FTnp2msgid = Mrid(ctxt);
#endif
         cdest = mydist ^ np2;
         break;
      case 'c':
#if (BeginForceType < EndForceType)
            np2msgid = Mcid(ctxt);
            FTnp2msgid = MFTcid();
#else
            np2msgid = FTnp2msgid = Mcid(ctxt);
#endif
         rdest = mydist ^ np2;
         break;
      case 'a':
#if (BeginForceType < EndForceType)
            np2msgid = Maid(ctxt);
            FTnp2msgid = MFTaid();
#else
            np2msgid = FTnp2msgid = Maid(ctxt);
#endif
         Mvpcoord(ctxt, ctxt->vIam^np2, rdest, cdest);
         break;
      }
      if (mydist >= np2)		/* I am node beyond power of 2 */
      {
         Arecv2d00(ctxt, bp, length, FTnp2msgid);
         Ssend2d00(ctxt, buff, length, rdest, cdest, np2msgid);
/*
 *       Update my message ID's to match those who participate in BE
 */
         for (bit=1; (bit ^ np2); bit <<= 1)
	 {
	    switch(scope)
	    {
	    case 'r':
	       Mrid(ctxt);
#if (BeginForceType < EndForceType)
                  MFTrid();
#endif
	       break;
            case 'c':
	       Mcid(ctxt);
#if (BeginForceType < EndForceType)
                  MFTcid();
#endif
	       break;
            case 'a':
	       Maid(ctxt);
#if (BeginForceType < EndForceType)
                  MFTaid();
#endif
	       break;
	    }
	 }
	 BuffIsFree(bp, 1);
      }
      else if (mydist < (nnodes^np2))  /* need to fan in contents of */
      {                                /* non-power of 2 nodes */
         Srecv2d00(ctxt, buff2, length, np2msgid);
         Xvvop(N, buff, buff2);
      }
   }

   if (mydist < np2)
   {
      for(bit=1; (bit^np2); bit <<= 1)
      {
         switch(scope)
         {
         case 'r':
            msgid = Mrid(ctxt);
#if (BeginForceType < EndForceType)
               FTmsgid = MFTrid();
#endif
            cdest = mydist ^ bit;
            break;
         case 'c':
            msgid = Mcid(ctxt);
#if (BeginForceType < EndForceType)
               FTmsgid = MFTcid();
#endif
            rdest = mydist ^ bit;
            break;
         case 'a':
            msgid = Maid(ctxt);
#if (BeginForceType < EndForceType)
               FTmsgid = MFTaid();
#endif
            Mvpcoord(ctxt, ctxt->vIam ^ bit, rdest, cdest);
            break;
         }
#if (BeginForceType < EndForceType)
            Arecv2d00(ctxt, bp2, length, FTmsgid);
	    Ssend2d00(ctxt, buff, 0, rdest, cdest, msgid);
	    Srecv2d00(ctxt, buff, 0, msgid);
	    Ssend2d00(ctxt, buff, length, rdest, cdest, FTmsgid);
	    BuffIsFree(bp2, 1);
#else
           Arecv2d00(ctxt, bp2, length, msgid);
           Ssend2d00(ctxt, buff, length, rdest, cdest, msgid);
           BuffIsFree(bp2, 1);
#endif
         Xvvop(N, buff, buff2);
      }  /* end for */
/*
 *  For nodes that are not part of the hypercube proper, we must
 *  send data back.
 */
      if (mydist < (nnodes^np2))
      {
         switch(scope)
         {
         case 'r':
            cdest = mydist ^ np2;
            break;
         case 'c':
            rdest = mydist ^ np2;
            break;
         case 'a':
            Mvpcoord(ctxt, mydist ^ np2, rdest, cdest);
            break;
         }
         Ssend2d00(ctxt, buff, length, rdest, cdest, FTnp2msgid);
      }
   }  /* end if (nodes inside power of 2) */
}
