
/*
 * Find the token on gmpi.used_stoken that was recorded for `buf`, unlink
 * it, clear it, push it onto gmpi.free_stoken and return the send handle
 * that was stored in it (0 when the send was recorded with a null handle).
 *
 * Aborts through gm_abort() when no token on the used list matches `buf`.
 */
static gm_inline  MPIR_SHANDLE * lookup_sent_evt(void *buf)
{
  struct gmpi_stoken **tokp;

  /* Walk via a pointer-to-link so the match can be unlinked in place;
     stop at the end of the list instead of running off it. */
  for (tokp = &gmpi.used_stoken; *tokp; tokp = &(*tokp)->next) {
    struct gmpi_stoken *t = *tokp;
    MPIR_SHANDLE *shandle;

    /* every token on the used list must carry a buffer */
    gm_assert_p(t->buf);
    if (t->buf != buf)
      continue;

    shandle = t->shandle;
    t->buf = 0;
    t->shandle = 0;
    /* unlink from the used list, then push on the free list */
    *tokp = t->next;
    t->next = gmpi.free_stoken;
    gmpi.free_stoken = t;
    return shandle;
  }

  /* a completion was reported for a buffer that was never remembered */
  gm_abort();
  return 0;
}
  

/*
 * Record an in-flight send: take the first token from gmpi.free_stoken,
 * store `buf` and `shandle` in it and push it on gmpi.used_stoken, where
 * lookup_sent_evt() searches for it by buffer address.
 *
 * The free list must not be empty and its head must be cleared; both are
 * only checked by assertions.
 */
static void gm_inline remember_send(void *buf, MPIR_SHANDLE *shandle)
{
  struct gmpi_stoken *tok;

  /* pop the head of the free list */
  tok = gmpi.free_stoken;
  gm_assert_p(tok);
  gm_assert_p(!tok->buf);
  gm_assert_p(!tok->shandle);
  gmpi.free_stoken = tok->next;

  /* fill it in */
  tok->buf = buf;
  tok->shandle = shandle;

  /* push it on the used list */
  tok->next = gmpi.used_stoken;
  gmpi.used_stoken = tok;
}

/*
 * Send `length` bytes at `buf` to rank `dest` as a data message and
 * remember the (buf, shandle) pair so the matching sent event can be
 * resolved by lookup_sent_evt().
 *
 *   dest    - index into gmpi.node_ids[] / gmpi.port_ids[]
 *   buf     - start of the data to send
 *   length  - number of bytes to send; asserted to be > 0
 *   tag     - added to GMPI_FIRST_TAG to form the third gm_send() argument
 *   shandle - send handle; when shandle->gm.copy is zero the buffer range
 *             is passed to gmpi_use_interval() before sending
 */
void gmpi_net_datasend(int dest,void *buf,int length,int tag,MPIR_SHANDLE *shandle)
{
  /* unsigned so that bytes >= 0x80 are not sign-extended in the trace */
  const unsigned char *p = buf;

  gm_assert_p(length > 0);
  /* The trace shows up to the first four bytes; never read past `length`. */
  GM_PRINT(2,("netsend:0x%p,%d,tag(%d),%s-copy, \"%2x:%2x:%2x:%2x\"\n",
	      buf,length,tag,shandle->gm.copy?"memory":"zero",
	      length > 0 ? p[0] : 0,
	      length > 1 ? p[1] : 0,
	      length > 2 ? p[2] : 0,
	      length > 3 ? p[3] : 0));
  if (!shandle->gm.copy)
    gmpi_use_interval((unsigned long)buf,length);
  gm_send(gmpi.port, buf, tag+GMPI_FIRST_TAG,
	  length, GM_LOW_PRIORITY, gmpi.node_ids[dest], gmpi.port_ids[dest]);
  remember_send(buf,shandle);
}

/*
 * Send a control packet of `length` bytes at `buf` to rank `dest`, using
 * GMPI_CONTROL_TAG as the third gm_send() argument.  The buffer is then
 * remembered with a null send handle.
 */
void gmpi_net_ctlsend(int dest,void *buf,int length)
{
  gm_send(gmpi.port,
	  buf,
	  GMPI_CONTROL_TAG,
	  length,
	  GM_LOW_PRIORITY,
	  gmpi.node_ids[dest],
	  gmpi.port_ids[dest]);

  /* no send handle is associated with a control packet */
  remember_send(buf, 0);
}

/*
 * Hand `buf` to GM as a low-priority receive buffer.
 *
 *   buf     - receive buffer
 *   length  - buffer length; only used for tracing and gmpi_use_interval()
 *   tag     - non-negative: the buffer is posted under tag + GMPI_FIRST_TAG;
 *             negative: it is posted under GMPI_CONTROL_TAG
 *   rhandle - not used by this function
 *   copy    - when zero (and length is non-zero) the buffer range is
 *             passed to gmpi_use_interval() first
 */
void gmpi_netpostrecv(void *buf,int length,int tag,MPIR_RHANDLE *rhandle,int copy)
{
  int gm_size;

  GM_PRINT(2,("postrecv:0x%p,%d,tag(%d),%s-copy\n",
	  buf,length,tag,copy?"memory":"zero"));

  if (!copy && length)
    gmpi_use_interval((unsigned long)buf,length);

  if (tag < 0)
    gm_size = GMPI_CONTROL_TAG;
  else
    gm_size = tag + GMPI_FIRST_TAG;

  gm_provide_receive_buffer(gmpi.port, buf, gm_size, GM_LOW_PRIORITY);
}


/* Forward declaration: gm_init_sync() is defined further down in this
   file and is called from gmpi_subinit(). */
static void gm_init_sync(void);

/* Copy of gmpi.port, assigned in gmpi_subinit() after gm_open() succeeds. */
struct gm_port *gmpi_gm_port;

/*
 * Open and configure the GM port for this MPI process.
 *
 * Steps, in order:
 *   - read the configuration (gmpi_getconf) and open the GM port;
 *   - check compile-time size assumptions with assertions;
 *   - derive the send/receive token budgets from the port;
 *   - resolve every rank's host name to a GM node id (one retry after
 *     10 seconds);
 *   - set the acceptable receive sizes;
 *   - run the start-up synchronisation when there is more than one rank;
 *   - pre-post control receive buffers and initialise the registration
 *     cache.
 *
 * Any failure that prevents the port from being usable prints a message
 * on stderr and terminates the process with exit(1).
 */
void gmpi_subinit(void)
{
  gm_status_t status;
  int i;

  gmpi_getconf();
  status = gm_open(&gmpi.port, gmpi.board_ids[MPID_MyWorldRank], gmpi.port_ids[MPID_MyWorldRank],
		   "gm-mpi", GM_API_VERSION_1_0);
  if (status != GM_SUCCESS) {
    gm_perror("GM-error in gm_open",status);
    fprintf(stderr,"ERROR: mpi node %d, cannot open GM board %d gm_port %d\n", 
	    MPID_MyWorldRank, gmpi.board_ids[MPID_MyWorldRank],gmpi.port_ids[MPID_MyWorldRank]);
    fflush(stderr);
    exit(1);
  }
  gmpi_gm_port = gmpi.port;
  gm_assert(sizeof(MPID_PKT_SHORT_T) <= gm_max_length_for_size(GMPI_CONTROL_TAG));
  gm_assert(GMPI_MAX_FRAG <= gm_max_length_for_size(GMPI_FIRST_TAG));
  /* hack 4 is "nose" value, it is a reasonable minimum, and I am confident we can prove */
  /* the absence of deadlock starting with 2, although the resource counting is always very accurate */ 
  gm_assert(GMPI_MAX_FRAG <= 4*GMPI_MAX_DMA_BYTES);

  /* send budget: half of the port's send tokens, capped at GMPI_NSTOKEN;
     receive budget: a quarter of the port's receive tokens */
  gmpi.stoken = gmpi.max_stoken = MIN(gm_num_send_tokens(gmpi.port)/2,GMPI_NSTOKEN);
  gmpi.rtoken = gm_num_receive_tokens(gmpi.port)/4;

  /* resolve every rank's host name, retrying once after a pause */
  for (i=0;i<MPID_MyWorldSize;i++) {
    gmpi.node_ids[i] = gm_host_name_to_node_id(gmpi.port,gmpi.node_names[i]);
    if (gmpi.node_ids[i] == GM_NO_SUCH_NODE_ID) {
      unsigned int myid=0;
      /* only used for the diagnostics below; myid stays 0 on failure */
      gm_get_node_id (gmpi.port, &myid);
      fprintf(stderr,"ERROR:mpi node %d gm_id %d (%s) doesn't know host %s (mpid %d)\n",
	      MPID_MyWorldRank, myid,gmpi.node_names[MPID_MyWorldRank],
	      gmpi.node_names[i],i);
      fprintf(stderr,"WAITING 10 seconds to try to find host %s again...\n",gmpi.node_names[i]); 
      fflush(stderr);
      sleep(10);
      gmpi.node_ids[i] = gm_host_name_to_node_id(gmpi.port,gmpi.node_names[i]);
      if (gmpi.node_ids[i] == GM_NO_SUCH_NODE_ID) {
        fprintf(stderr,"ERROR: mpi node %d gm_id %d (%s) STILL doesn't know host %s (mpid %d)\n",
                 MPID_MyWorldRank, myid,gmpi.node_names[MPID_MyWorldRank],
                    gmpi.node_names[i],i);
        fflush(stderr);
        exit(1);
      }
      else {
        fprintf(stderr,"FOUND host %s\n",gmpi.node_names[i]);       
        fflush(stderr);
      }
    }
  }

  /* Check the status explicitly: an assertion alone would let a build
     without assertions continue with an unset gmpi.my_node_id. */
  status = gm_get_node_id (gmpi.port, &gmpi.my_node_id);
  if (status != GM_SUCCESS) {
    gm_perror("GM-error in gm_get_node_id",status);
    fprintf(stderr,"ERROR: mpi node %d, cannot get GM node id\n", MPID_MyWorldRank);
    fflush(stderr);
    exit(1);
  }
  /*gm_assert(gmpi.my_node_id == gmpi.node_ids[MPID_MyWorldRank]);*/

  /* a mismatch between our own id and the configured one is reported but
     is not fatal (the assertion above was deliberately disabled) */
  if (!(gmpi.my_node_id == gmpi.node_ids[MPID_MyWorldRank])){
    printf("ERROR: gmpi.my_node_id != gmpi.node_ids[MPID_MyWorldRank]\n");
    printf("  gm_id=%d, my rank = %d\n", gmpi.my_node_id,MPID_MyWorldRank);
    for (i=0;i<MPID_MyWorldSize;i++) {
      printf("  node_id = %d, port_id = %d, board_id = %d\n",gmpi.node_ids[i],gmpi.port_ids[i],gmpi.board_ids[i]);
    }
  }

#ifdef GM_SEE_SYNC
  fprintf(stderr,"gmpi: node %d: opened GM board %d (gm_id=%d gm_port=%d)\n", 
		MPID_MyWorldRank, gmpi.board_ids[MPID_MyWorldRank],
	  gmpi.my_node_id,
	  gmpi.port_ids[MPID_MyWorldRank]);
  fflush(stderr);
#endif
  
  status = gm_set_acceptable_sizes (gmpi.port, GM_LOW_PRIORITY,
                                    (1 << (GMPI_FIRST_TAG + GMPI_NTAGS)) - 1);
  if (status != GM_SUCCESS) {
    fprintf(stderr,"gmpi: mpi node %d  ERROR initializing GM port in gm_set_acceptable_sizes: maybe firmware dead?\n",MPID_MyWorldRank);
    fflush(stderr);
    exit(1);
  }
  if (MPID_MyWorldSize > 1) {
    gm_init_sync();
  }

  /* pre-post control receive buffers: one per two receive tokens */
  for (i=0;i<gm_num_receive_tokens(gmpi.port)/2;i++) {
    void * p = DMA_ALLOC(gmpi.port, gm_max_length_for_size(GMPI_CONTROL_TAG), GMPI_RDMA);
    if (!p) {
      /* never hand a null buffer to GM */
      fprintf(stderr,"ERROR: mpi node %d, cannot allocate control receive buffer %d\n",
	      MPID_MyWorldRank, i);
      fflush(stderr);
      exit(1);
    }
    gm_provide_receive_buffer(gmpi.port, p , 
			      GMPI_CONTROL_TAG, GM_LOW_PRIORITY);
    /* NOTE(review): the GMPI_RDMA byte counter is reset after every
       allocation - presumably so these permanent buffers are not
       accounted for; confirm against DMA_ALLOC. */
    gmpi.dma_bytes[GMPI_RDMA] = 0;
  }
  gmpi_regcache_init();
}




/*
 * Drain the GM event queue of gmpi.port and dispatch every event.
 *
 *   dev      - device passed through to gmpi_ctlrecv_event()
 *   blocking - currently ignored; the queue is always polled with
 *              gm_receive()
 *
 * Returns when gm_receive() reports GM_NO_RECV_EVENT.  The result is
 * MPI_SUCCESS when at least one sent or receive event was handled during
 * this call and -1 otherwise (events passed to gm_unknown() do not count).
 */
int gmpi_net_lookup(MPID_Device *dev,int blocking)
{
  gm_recv_event_t *event;
  int result = -1;

  for (event = gm_receive(gmpi.port); ; event = gm_receive(gmpi.port)) {
    /* set for the FAST variants, whose payload is at recv.message */
    int is_fast = 0;

    switch (GM_RECV_EVENT_TYPE (event)) {
    case GM_NO_RECV_EVENT:
      /* queue is empty */
      return result;

    case GM_SENT_EVENT:
      {
	/* message_list is an array of sent buffers ended by a null entry */
	void **entry = (void **)gm_ntohp (event->sent.message_list);

	gm_assert (entry);
	do {
	  void *sent_buf = (void *) gm_ntohp ((gm_up_t)*entry);
	  MPIR_SHANDLE *shandle;

	  gm_assert (sent_buf);
	  /* a null handle means the buffer was remembered without one */
	  shandle = lookup_sent_evt(sent_buf);
	  if (!shandle)
	    gmpi_ctlsent_event(GMPI_SEND_HEAD(sent_buf));
	  else
	    gmpi_datasent_event(shandle,GMPI_SEND_HEAD(sent_buf));
	  result = MPI_SUCCESS;
	  entry++;
	} while (gm_ntohp((gm_up_t)*entry));
      }
      break;

    case GM_FAST_RECV_EVENT:
    case GM_FAST_HIGH_RECV_EVENT:
    case GM_FAST_PEER_RECV_EVENT:
    case GM_FAST_HIGH_PEER_RECV_EVENT:
      is_fast = 1;
      /* fall through */
    case GM_RECV_EVENT:
    case GM_HIGH_RECV_EVENT:
    case GM_PEER_RECV_EVENT:
    case GM_HIGH_PEER_RECV_EVENT:
      {
	void *data;
	int size;

	data = is_fast ? gm_ntohp (event->recv.message)
		       : gm_ntohp (event->recv.buffer);
	size = gm_ntohc(event->recv.size);

	if (size != GMPI_CONTROL_TAG) {
	  /* we received a real message */
	  int tag;
	  void *buf = gm_ntohp(event->recv.buffer);

	  /* fast events: copy the payload into the posted buffer first */
	  if (is_fast)
	    gm_memorize_message(gm_ntohp(event->recv.message),buf,gm_ntohl(event->recv.length));

	  tag = size - GMPI_FIRST_TAG;
	  gmpi_datarecv_event(tag, buf, gm_ntohl(event->recv.length));
	} else {
	  /* control block: handle it, then give the buffer back to GM */
	  gmpi_ctlrecv_event(dev,data);
#if GM_DEBUG
	  gm_bzero(data,gm_ntohl(event->recv.length));
	  gm_bzero(gm_ntohp(event->recv.buffer),gm_max_length_for_size(GMPI_CONTROL_TAG));
#endif
	  gm_provide_receive_buffer(gmpi.port,gm_ntohp(event->recv.buffer), 
				    GMPI_CONTROL_TAG, GM_LOW_PRIORITY);
	}
	result = MPI_SUCCESS;
      }
      break;

    /* these events are created by GM, so we need to pass them to gm_unknown */
    /* see gm.h for a list of events */
/*  add these in when we don't have to work with versions before gm1.1
    case GM_ALARM_EVENT:
    case GM_SENT_TOKENS_EVENT:
    case GM_IGNORE_RECV_EVENT:
      gm_unknown (gmpi.port, event);
      break;
*/

    default:
      /* NOTE: these may be messages generated by gm (version 1.1 or
               newer) like GM_SENT_TOKENS_EVENT or GM_IGNORE_RECV_EVENT,
               in which case they should be passed to gm_unknown and
               they do not indicate that anything is amiss */
      GM_PRINT(5, ("gmpi: warning - unexpected event %d\n",
		GM_RECV_EVENT_TYPE (event)));
      gm_unknown (gmpi.port, event);
      break;
    }
  }
}


/* MPI_Wtime() timestamps: startwtime is taken by rank 0 in gm_init_sync(),
   endwtime is taken in gmpi_init_loop() when a message is received; their
   difference is stored in time_alive[]. */
double startwtime, endwtime;

/* Overlay used by gmpi_init_loop() to read the sender's id from the
   start of a received message. */
typedef struct msg_sender
{
  int send_id;  /* converted with gm_ntohl() by the reader */
} msg_sender_t;


/* This function either waits for completion of a send and reception
   of the answer or, if forward is set, waits for a message and
   forwards it. */
/*
 * Parameters:
 *   forward - non-zero: on reception copy rbuf[0] to sbuf[0] and send it
 *             to the next rank, then return once that send completes
 *   sbuf    - send buffer (GMPI_INIT_LEN bytes are sent from it)
 *   rbuf    - the receive buffer the caller posted; the received event is
 *             asserted to refer to it
 *   no_sent - non-zero: no send is outstanding, treat it as already done
 *   no_recv - non-zero: no receive is expected, treat it as already done
 *
 * When called with no_sent == 1 and no_recv == 0, the sender id found at
 * the start of the received message is used to record the elapsed time
 * since startwtime in time_alive[].
 */
static void gmpi_init_loop(int forward,gm_u32_t *sbuf,gm_u32_t *rbuf,int no_sent, int no_recv)
{
  int send_node;
  int recv_done = 0 + no_recv;
  int send_done = 0 + no_sent;
  int next = (MPID_MyWorldRank + 1) % MPID_MyWorldSize;
  msg_sender_t * myptr;

  union gm_recv_event *event;

  while (1) {
    event = gm_receive(gmpi.port);
    switch (GM_RECV_EVENT_TYPE (event)) {
    case GM_NO_RECV_EVENT:
      /* nothing pending: let other processes run while we spin */
      sched_yield();
      break;
    case GM_SENT_EVENT:
      gm_assert(send_done == 0);
      send_done = 1;
      if (forward) {
        /* we cannot finish sending if we did not receive and forward */
        gm_assert(recv_done);
        goto end;
      } else {
        if (recv_done) {
          goto end;
        }
      }
      break;
    case GM_FAST_RECV_EVENT:
    case GM_FAST_HIGH_RECV_EVENT:
    case GM_FAST_PEER_RECV_EVENT:
    case GM_FAST_HIGH_PEER_RECV_EVENT:
      /* fast events: copy the payload into the posted buffer */
      gm_memorize_message(gm_ntohp(event->recv.message),gm_ntohp(event->recv.buffer),
				gm_ntohl(event->recv.length));
      /* fall through */
    case GM_RECV_EVENT:
    case GM_HIGH_RECV_EVENT:
    case GM_PEER_RECV_EVENT:
    case GM_HIGH_PEER_RECV_EVENT:
      gm_assert(recv_done == 0);
      recv_done = 1;
      gm_assert(gm_ntohl(event->recv.length) == GMPI_INIT_LEN && 
		gm_ntohp(event->recv.buffer) == rbuf);

      if (no_sent == 1 && no_recv == 0){
         /* Read the sender id from the receive buffer: it holds the
            payload for both fast events (copied just above) and normal
            events, and it is what the rest of this function reads. */
         myptr = gm_ntohp(event->recv.buffer);
	 send_node = gm_ntohl(myptr->send_id); 

         endwtime = MPI_Wtime();
         /* the id comes off the wire: never index outside the rank range */
         if (send_node >= 0 && send_node < MPID_MyWorldSize)
           time_alive[send_node]= endwtime-startwtime;
      }
      if (forward) {
        gm_assert(send_done == 0);
        sbuf[0] = rbuf[0];
        gm_send(gmpi.port, sbuf, gm_min_size_for_length(GMPI_INIT_LEN), GMPI_INIT_LEN,
                         GM_LOW_PRIORITY, gmpi.node_ids[next], gmpi.port_ids[next]);
      } else {
        if (send_done)
          goto end;
        GM_PRINT(5,("gmpi: gmpi_init_loop:warning receiving answer before send confirmation\n"));
	fflush(stderr);
      }
      break;

    /* these events are created by GM, so we need to pass them to gm_unknown */
    /* see gm.h for a list of events */
      /*  add these in when we don't have to work with versions before gm1.1
      case GM_SENT_TOKENS_EVENT:
      case GM_IGNORE_RECV_EVENT:
      */
    case _GM_SLEEP_EVENT:
    case GM_ALARM_EVENT:
    case _GM_FLUSHED_ALARM_EVENT:
      gm_unknown(gmpi.port,event);
      break;

    default:
      /* NOTE: these may be messages generated by gm (version 1.1 or
               newer) like GM_SENT_TOKENS_EVENT or GM_IGNORE_RECV_EVENT,
               in which case they should be passed to gm_unknown and
               they do not indicate that anything is amiss */
      GM_PRINT(5, ("gmpi: warning gmpi_init_loop: unknown event %d\n",
		GM_RECV_EVENT_TYPE (event)));
      
      gm_unknown(gmpi.port,event);
      break;
    }
  }
 end:
  gm_assert(send_done && recv_done);
}
/* Elapsed times indexed by sender id: set to the array allocated in
   gm_init_sync() and written by gmpi_init_loop(). */
double* time_alive;

/*
 * Start-up synchronisation over GM; must only be called with more than
 * one rank (asserted below).
 *
 * Rank 0 receives one message per other rank, then sends an even-valued
 * magic word to rank 1 and waits for a message to come back.  Every other
 * rank sends its rank number to rank 0, then receives and forwards
 * messages to the next rank until the received word is even.
 *
 * Also allocates the time_alive[] array and publishes it through the
 * global pointer; this function does not free it.
 */
static void gm_init_sync(void)
{
  /* synchronize at beginning to check the network looks OK */
  /* first try to forward a odd value along the ring of nodes */
  /* when we succeed we forward an even value to tell everyone to really start */

  /* 11.1.00 changed this to :
1. first all nodes send to node 0
2. do second (even valued) loop as described above 
  */

  /* DMA-able one-message buffers; allocation failure is only caught by
     the assertions below */
  gm_u32_t *sbuf = gm_dma_malloc(gmpi.port,GMPI_INIT_LEN);
  volatile gm_u32_t *rbuf = gm_dma_malloc(gmpi.port,GMPI_INIT_LEN);
  int wait_count;
  double* time_alive_buff = calloc(GMPI_MAX_NUMNODES, sizeof(double));

  time_alive = time_alive_buff;
  gm_assert(time_alive_buff);
  gm_assert(sbuf && rbuf);
  gm_assert(MPID_MyWorldSize > 1);
#ifdef GM_SEE_SYNC
  fprintf(stderr,"gmpi: node %d out of %d waiting for sync\n", MPID_MyWorldRank, MPID_MyWorldSize);
  fflush(stderr);
#endif
  if (MPID_MyWorldRank == 0) {
    startwtime = MPI_Wtime();
    gm_srand(time(NULL)+getpid()+gethostid());
    /* one message is expected from every other rank */
    wait_count = MPID_MyWorldSize - 1;
    do {
      /* odd magic value; without OLD_LOOP it is not sent inside this loop */
      sbuf[0] = gm_rand() | 1;

#ifdef GM_SEE_SYNC
      fprintf(stderr,"gmpi: node %d starting ring comm for sync,magic=0x%x\n",
			MPID_MyWorldRank,sbuf[0]);
      fflush(stderr);
#endif
      gm_provide_receive_buffer(gmpi.port, (void *)rbuf, gm_min_size_for_length(GMPI_INIT_LEN), GM_LOW_PRIORITY);
#if 0
      printf("provide recv buffer %p\n",rbuf );
      fflush(stdout);
#endif

#ifdef OLD_LOOP
      gm_send(gmpi.port,sbuf,gm_min_size_for_length(GMPI_INIT_LEN),
              GMPI_INIT_LEN, GM_LOW_PRIORITY, gmpi.node_ids[1],gmpi.port_ids[1]);
#endif

#if 0
      printf("sent from buffer %p %s:%d\n",sbuf, __FILE__, __LINE__ );
      fflush(stdout);
#endif
      /* no_sent=1: wait for a receive only */
      gmpi_init_loop(0,sbuf,(gm_u32_t *)rbuf, 1, 0);
      wait_count --;

#if 0
      printf("received response, %d more to go %s:%d\n",wait_count, __FILE__, __LINE__ );
      fflush(stdout);
#endif

      /* NOTE(review): with OLD_LOOP defined the line below already closes
         the do-loop, leaving the unconditional "} while" after it with an
         unmatched brace - OLD_LOOP looks unbuildable as written; confirm. */
#ifdef OLD_LOOP
    } while (rbuf[0] != sbuf[0]);
#endif
    } while (wait_count > 0);

    /* clear the low bit: the final word sent around is even */
    sbuf[0] &= ~1;
#ifdef GM_SEE_SYNC
    fprintf(stderr,"gmpi: node %d starting final sync,magic=0x%x\n",
		MPID_MyWorldRank, sbuf[0]);
    fflush(stderr);
#endif
    gm_provide_receive_buffer(gmpi.port,(void *) rbuf, gm_min_size_for_length(GMPI_INIT_LEN), GM_LOW_PRIORITY);
#if 0
    printf("provide recv buffer %p\n",rbuf );
    fflush(stdout);
#endif

    gm_send(gmpi.port,sbuf,gm_min_size_for_length(GMPI_INIT_LEN),
            GMPI_INIT_LEN, GM_LOW_PRIORITY, gmpi.node_ids[1],gmpi.port_ids[1]);
#if 0
    printf("sent from buffer %p %s:%d\n",sbuf, __FILE__, __LINE__ );
    fflush(stdout);
#endif
    /* wait for both the send completion and one received message */
    gmpi_init_loop(0,sbuf,(gm_u32_t *)rbuf,0,0);
#ifdef GM_SEE_SYNC
    fprintf(stderr,"node: %d Synchronization done\n",MPID_MyWorldRank);
    fflush(stderr);
#endif


  } else {
    /************ id is not 0 ****************/
#ifdef OLD_LOOP
    do { 
#endif

      /* NOTE(review): rbuf is posted here and again at the top of the
         do-loop below, while without OLD_LOOP the next gmpi_init_loop()
         call is made with no_recv=1.  Confirm that leaving an extra
         posted receive buffer is intended, since rbuf is freed at the
         end of this function. */
      gm_provide_receive_buffer(gmpi.port, (void *)rbuf, gm_min_size_for_length(GMPI_INIT_LEN), GM_LOW_PRIORITY);
      
#ifdef OLD_LOOP  
#if 0
      printf("provide recv buffer %p\n",rbuf );
      fflush(stdout);
#endif
      gmpi_init_loop(1,sbuf,(gm_u32_t *)rbuf,0,0);
#ifdef GM_SEE_SYNC
      fprintf(stderr,"gmpi: node %d recv sync 0x%x\n",
		MPID_MyWorldRank, rbuf[0]);
      fflush(stderr);
#endif
#endif

      /* everyone send to 0 */
      /*      sbuf_old = gm_rand() | 1;*/
      /* payload is this rank's number, which gmpi_init_loop() on the
         receiving side reads as the sender id */
      sbuf[0] = gm_htonl(MPID_MyWorldRank);
      gm_send(gmpi.port,sbuf,gm_min_size_for_length(GMPI_INIT_LEN),
              GMPI_INIT_LEN, GM_LOW_PRIORITY, gmpi.node_ids[0],gmpi.port_ids[0]);

#if 0
      printf("gmpi: node %d,sent from buffer %p %s:%d\n",MPID_MyWorldRank,sbuf, __FILE__, __LINE__ );
      fflush(stdout);
#endif
      /* no_recv=1: wait for the send completion only */
      gmpi_init_loop(0,sbuf,(gm_u32_t *)rbuf,0,1);

#ifdef OLD_LOOP
    } while (!(rbuf[0] & 1)); 
#endif

    /* receive and forward to the next rank until the received word is even */
    do {
      gm_provide_receive_buffer(gmpi.port, (void *)rbuf, gm_min_size_for_length(GMPI_INIT_LEN), GM_LOW_PRIORITY);
#if 0
      printf("provide recv buffer %p\n",rbuf );
      fflush(stdout);
#endif
      gmpi_init_loop(1,sbuf,(gm_u32_t *)rbuf,0,0);
    } while (rbuf[0] & 1);
  }
  /* the DMA buffers are released; time_alive_buff stays allocated and
     reachable through the global time_alive */
  gm_dma_free(gmpi.port,sbuf);
  gm_dma_free(gmpi.port,(void *)rbuf);
}
