#include <stdlib.h>

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#include "packets.h"
#include "smpi.h"
#include "mpid_debug.h"
#include "mpiddev.h"
#include "gm.h"
#include "gm_stbar.h"

/* When no dedicated write/read barrier macro has been defined by the
   headers included above, fall back to GM_STBAR() for both. */
#ifndef GM_WRITEBAR
#define GM_WRITEBAR() GM_STBAR()
#endif
#ifndef GM_READBAR
#define GM_READBAR() GM_STBAR()
#endif

/* Per-process state of the SMP device; this file uses its my_local_id,
   num_local_nodes, available_queue_length, send_fifo_head, fd and
   mmap_ptr fields. */
struct smpi_var smpi;
/* Base of the shared memory area; set from smpi.mmap_ptr in smpi_init(). */
struct shared_mem * smpi_shmem;

/* Flow control (unit = byte): tell whether the shared memory receive queue
   going from this process to `dest` has room for a message of `len` bytes.
   The amount in flight is total_in - total_out; when total_in is
   numerically below total_out the distance is taken through SMPI_MAX_INT.
   Returns 1 if the in-flight amount plus the aligned length stays below
   smpi.available_queue_length, 0 otherwise. */
gm_inline unsigned int smpi_able_to_send(int dest, int len)
{
  /* total_in is not below total_out: plain difference */
  if ((SMPI_TOTALIN(smpi.my_local_id,dest) >= SMPI_TOTALOUT(smpi.my_local_id,dest))
      && (SMPI_TOTALIN(smpi.my_local_id,dest)
          - SMPI_TOTALOUT(smpi.my_local_id,dest)
          + SMPI_ALIGN(len) < smpi.available_queue_length))
    return 1;

  /* total_in is below total_out: measure the distance through SMPI_MAX_INT */
  if ((SMPI_TOTALIN(smpi.my_local_id,dest) < SMPI_TOTALOUT(smpi.my_local_id,dest))
      && (SMPI_MAX_INT-SMPI_TOTALOUT(smpi.my_local_id,dest)
          + SMPI_TOTALIN(smpi.my_local_id,dest)
          + SMPI_ALIGN(len) < smpi.available_queue_length))
    return 1;

  return 0;
}

/* Sender side of the flow control: account for a packet of `length` bytes
   that the caller has just written into the queue my_id -> destination.
   The statement order matters: the write offset and the packet contents
   come first, then the barrier, and only then the total_in counter that
   smpi_net_lookup() compares against total_out to detect a new packet. */
gm_inline void smpi_complete_send(unsigned int my_id, 
				  unsigned int destination, 
				  unsigned int length)
{
  /* advance the write offset by the aligned packet size */
  SMPI_NEXT(my_id,destination) += SMPI_ALIGN(length);
  /* beyond the last offset of this queue: restart from its first offset */
  if (SMPI_NEXT(my_id,destination) > SMPI_LAST(my_id,destination))
    SMPI_NEXT(my_id,destination) = SMPI_FIRST(my_id,destination);
  /* presumably orders the stores above before the counter update below
     (GM_WRITEBAR is defined outside this file or falls back to GM_STBAR) */
  GM_WRITEBAR();
  /* publish: the receiver now sees total_in != total_out */
  SMPI_TOTALIN(my_id,destination) += SMPI_ALIGN(length);
}

/* Receiver side of the flow control: account for a packet of `length`
   bytes that has been consumed from the queue from_grank -> my_id.
   Mirrors smpi_complete_send(): move the read offset, issue the barrier,
   then bump total_out, which smpi_able_to_send() on the sender uses to
   compute the free room. Callers must be done reading the packet before
   calling this, since the counter update makes the room reusable. */
gm_inline void smpi_complete_recv(unsigned int from_grank, 
				  unsigned int my_id, 
				  unsigned int length)
{
  /* advance the read offset by the aligned packet size */
  SMPI_CURRENT(from_grank,my_id) += SMPI_ALIGN(length);
  /* beyond the last offset of this queue: restart from its first offset */
  if (SMPI_CURRENT(from_grank,my_id) > SMPI_LAST(from_grank,my_id))
    SMPI_CURRENT(from_grank,my_id) = SMPI_FIRST(from_grank,my_id);
  /* barrier between the offset update and the counter update */
  GM_WRITEBAR();
  /* give the consumed bytes back to the sender */
  SMPI_TOTALOUT(from_grank,my_id) += SMPI_ALIGN(length);
}


/* Drop the use of the interval [start, start + bytes_as_contig) of a send
   handle through gmpi_unuse_interval(), then restore the finish callback
   saved in shandle->gm.orig_finish and run it. When no callback was saved,
   shandle->finish is simply cleared.
   Always returns MPI_SUCCESS. */
int deregister_smp_buffer(MPIR_SHANDLE  *shandle)
{
  gmpi_unuse_interval((unsigned long)shandle->start, shandle->bytes_as_contig);

  if (!shandle->gm.orig_finish) {
    /* nothing to chain to */
    shandle->finish = 0;
    return MPI_SUCCESS;
  }

  /* put the saved callback back in place and invoke it */
  shandle->finish = shandle->gm.orig_finish;
  shandle->finish(shandle);
  return MPI_SUCCESS;
}

/* Record a send request in the smpi.send_fifo_head list so that it can be
   posted later (see MPID_SMP_Check_incoming).
   data/shandle/len/grank are stored as given in the new list element.
   prepend != 0 inserts the element at the head of the list, prepend == 0
   appends it at the tail. */
gm_inline void smpi_queue_send(void *data, MPIR_SHANDLE *shandle, int len, int grank, int prepend)
{
  struct smpi_send_fifo_req *req;
  struct smpi_send_fifo_req **link = &smpi.send_fifo_head;

  req = (struct smpi_send_fifo_req *)gmpi_xmalloc(sizeof(*req),"MPI/smp_plug:smpi_queue_send");
  smpi_assert(req);

  /* appending: walk to the link that terminates the list */
  if (!prepend) {
    for (; *link; link = &(*link)->next)
      ;
  }

  req->data = data;
  req->shandle = shandle;
  req->len = len;
  req->grank = grank;

  /* splice the element in front of whatever `link` currently points to */
  req->next = *link;
  *link = req;
}

/* Send an eager message: build a MPID_PKT_SHORT packet directly in the
   shared memory queue towards `destination`, copy `len` bytes of payload
   from the user buffer right after the packet header, then publish the
   packet through the flow control.
   The caller is expected to have checked that the queue has room
   (this function does not call smpi_able_to_send).
   `shandle` is not used here. */
gm_inline void smpi_post_send_bufferinplace(void * buf, int len ,int src_lrank, int tag, int context_id, int destination, MPIR_SHANDLE * shandle)
{
  unsigned int my_id = smpi.my_local_id;
  SMPI_PKT_SHORT_T * pkt;

  smpi_assert(len <= SMPI_PKT_MAX_LOCALDATA_SIZE);
  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  /* header, written at the next free offset of our queue */
  pkt = (SMPI_PKT_SHORT_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode = MPID_PKT_SHORT;
  pkt->context_id = context_id;
  pkt->lrank = src_lrank;
  pkt->tag = tag;
  pkt->len = len;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* payload goes immediately behind the header */
  if (len > 0) {
    memcpy((void *)((char *)pkt + sizeof(SMPI_PKT_HEAD_T)), buf, len);
    DEBUG_PRINT_PKT_DATA("S Getting data from buf",pkt);
  }

  /* flow control: header + payload are now visible to the receiver */
  smpi_complete_send(my_id, destination, (len + sizeof(SMPI_PKT_HEAD_T)));

  DEBUG_PRINT_MSG("S Sent message in a single packet");
}

/* Process a send request that was previously queued with smpi_queue_send().
   data        : heap copy of the already built packet (allocated with
                 gmpi_xmalloc by the function that queued it); ownership is
                 taken here and the buffer is released once copied
   shandle     : optional send handle, marked complete when non-NULL
   len         : size in bytes of the packet stored in `data`
   destination : local id of the receiving process
   The caller is expected to have checked the flow control
   (see MPID_SMP_Check_incoming). */
gm_inline void smpi_post_send_queued(void *data, MPIR_SHANDLE *shandle, int len, int destination)
{
  volatile void * ptr_volatile;
  void * ptr;
  int my_id = smpi.my_local_id; 

  /* the packet is already built: copy it at the next free offset of the
     shared memory queue */
  ptr_volatile = (void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  ptr = (void *)ptr_volatile;
  if (len > 0)
    memcpy(ptr, data, len);
  
  DEBUG_PRINT_SMP_SEND_PKT("S Sending queued msg", (SMPI_PKT_HEAD_T *)ptr);

  /* release the staging copy; `ptr` points into the mmap'ed shared pool
     and must never be handed to free() */
  free(data);
 
  /* flow control: publish the packet */
  smpi_complete_send(my_id, destination, len);

  if (shandle)
    shandle->is_complete = 1;
}

/* Rendez-vous, "get" flavour: post a MPID_PKT_DO_GET request packet in the
   shared memory queue towards `destination`. The packet carries the
   envelope (context_id, lrank, tag, len), the send handle and the address
   of the user buffer. The buffer address and length are also recorded in
   the send handle (start / bytes_as_contig).
   The caller is expected to have checked the flow control. */
gm_inline void smpi_post_send_get(void * buf, int len ,int src_lrank, int tag, int context_id, int destination, MPIR_SHANDLE  *shandle)
{
  int my_id = smpi.my_local_id;
  SMPI_PKT_GET_T * pkt;

  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  /* build the request in place */
  pkt = (SMPI_PKT_GET_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode = MPID_PKT_DO_GET;
  pkt->context_id = context_id;
  pkt->lrank = src_lrank;
  pkt->tag = tag;
  pkt->len = len;
  pkt->send_id = shandle;
  pkt->address = buf;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* flow control */
  smpi_complete_send(my_id, destination, sizeof(SMPI_PKT_GET_T));

  /* remember which buffer this handle covers */
  shandle->start = buf;
  shandle->bytes_as_contig = len;
}

/* Rendez-vous, copy flavour: post a MPID_PKT_REQUEST_SEND packet in the
   shared memory queue towards `destination`. Only the envelope
   (context_id, lrank, tag, len) and the send handle travel in the packet;
   `buf` is not referenced here.
   The caller is expected to have checked the flow control. */
gm_inline void smpi_post_send_rndv(void * buf, int len ,int src_lrank, int tag, int context_id, int destination, MPIR_SHANDLE  *shandle)
{
  int my_id = smpi.my_local_id;
  SMPI_PKT_RNDV_T * pkt;

  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  /* build the request in place */
  pkt = (SMPI_PKT_RNDV_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode = MPID_PKT_REQUEST_SEND;
  pkt->context_id = context_id;
  pkt->lrank = src_lrank;
  pkt->tag = tag;
  pkt->len = len;
  pkt->send_id = shandle;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* flow control */
  smpi_complete_send(my_id, destination, sizeof(SMPI_PKT_RNDV_T));
}

/* Acknowledge a rendez-vous copy: post a MPID_PKT_DONE_GET packet carrying
   `send_id` (the sender's handle) in the shared memory queue towards
   `destination`. Only the mode and send_id fields are filled.
   The caller is expected to have checked the flow control. */
gm_inline void smpi_post_send_done_get(int destination, void * send_id)
{
  int my_id = smpi.my_local_id;
  SMPI_PKT_GET_T * pkt;

  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  /* build the acknowledgement in place */
  pkt = (SMPI_PKT_GET_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode = MPID_PKT_DONE_GET;
  pkt->send_id = send_id;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* flow control */
  smpi_complete_send(my_id, destination, sizeof(SMPI_PKT_GET_T));
}

/* Answer a REQUEST_SEND rendez-vous packet: prepare the receive handle
   (netbuf = start of the user buffer, frag = bytes still expected,
   from = peer) and send a MPID_PKT_OK_TO_SEND packet to `destination`.
   The packet is written directly in shared memory when nothing is pending
   in the send fifo and the flow control allows it; otherwise a heap copy
   of the packet is queued with smpi_queue_send() to be posted later. */
gm_inline void smpi_post_send_ok_to_send(int destination, MPIR_RHANDLE * rhandle)
{
  int my_id = smpi.my_local_id;
  SMPI_PKT_RNDV_T * pkt;

  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  rhandle->gm.netbuf = rhandle->buf;
  rhandle->gm.frag = rhandle->len;
  rhandle->s.MPI_ERROR = 0;
  rhandle->from = destination;

  if (smpi.send_fifo_head || !smpi_able_to_send(destination, sizeof(SMPI_PKT_RNDV_T))) {
    /* not enough room (or older requests are waiting): build the packet
       on the heap and defer it */
    pkt = (SMPI_PKT_RNDV_T *)gmpi_xmalloc(sizeof(SMPI_PKT_RNDV_T),"MPI/smp_plug:post_send_ok_to_send");
    smpi_assert(pkt);

    pkt->mode = MPID_PKT_OK_TO_SEND;
    pkt->len = rhandle->gm.frag;
    pkt->send_id = rhandle->send_id;
    pkt->recv_id = rhandle;

    DEBUG_PRINT_SMP_SEND_PKT("S Sending in the send_queue", pkt);
    DEBUG_PRINT_MSG("S Sending message in the send_queue");
    smpi_queue_send(pkt, 0, sizeof(SMPI_PKT_RNDV_T), destination, 0);
    DEBUG_PRINT_MSG("S Sent message in the send_queue");
    return;
  }

  /* room available: build the packet in place */
  pkt = (SMPI_PKT_RNDV_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode = MPID_PKT_OK_TO_SEND;
  pkt->len = rhandle->gm.frag;
  pkt->send_id = rhandle->send_id;
  pkt->recv_id = rhandle;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* flow control */
  smpi_complete_send(my_id, destination, sizeof(SMPI_PKT_RNDV_T));
}

/* Send a further MPID_PKT_OK_TO_SEND packet for a rendez-vous that is
   already in progress (called from smpi_recv_get() when a FLOW packet is
   received). Unlike smpi_post_send_ok_to_send(), the receive handle is
   left untouched: the packet advertises the current rhandle->gm.frag.
   The packet is written directly in shared memory when nothing is pending
   in the send fifo and the flow control allows it; otherwise a heap copy
   is queued with smpi_queue_send(). */
gm_inline void smpi_post_send_ok_to_send_cont(int destination, MPIR_RHANDLE * rhandle)
{
  int my_id = smpi.my_local_id;
  SMPI_PKT_RNDV_T * pkt;

  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);

  if (smpi.send_fifo_head || !smpi_able_to_send(destination, sizeof(SMPI_PKT_RNDV_T))) {
    /* not enough room (or older requests are waiting): build the packet
       on the heap and defer it */
    pkt = (SMPI_PKT_RNDV_T *)gmpi_xmalloc(sizeof(SMPI_PKT_RNDV_T),"MPI/smp_plug:post_ok_to_send");
    smpi_assert(pkt);

    pkt->mode = MPID_PKT_OK_TO_SEND;
    pkt->len = rhandle->gm.frag;
    pkt->send_id = rhandle->send_id;
    pkt->recv_id = rhandle;

    DEBUG_PRINT_SMP_SEND_PKT("S Sending in the send_queue", pkt);
    DEBUG_PRINT_MSG("S Sending message in the send_queue");
    smpi_queue_send(pkt, 0, sizeof(SMPI_PKT_RNDV_T), destination, 0);
    DEBUG_PRINT_MSG("S Sent message in the send_queue");
    return;
  }

  /* room available: build the packet in place */
  pkt = (SMPI_PKT_RNDV_T *)(void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
  pkt->mode   = MPID_PKT_OK_TO_SEND;
  pkt->len = rhandle->gm.frag;
  pkt->send_id = rhandle->send_id;
  pkt->recv_id = rhandle;

  DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);

  /* flow control */
  smpi_complete_send(my_id, destination, sizeof(SMPI_PKT_RNDV_T));
}

/* Sender side: a DONE_GET packet came back from `from`, so the rendez-vous
   send identified by `send_id` (a MPIR_SHANDLE pointer) is over.
   Consumes the packet from the queue, decrements MPID_n_pending, marks the
   handle complete and runs its finish callback when one is set.
   Always returns MPI_SUCCESS. */
gm_inline int smpi_recv_done_get(int from, int my_id, void * send_id)
{
  MPIR_SHANDLE * shandle = (MPIR_SHANDLE *)send_id;

  /* flow control: the DONE_GET packet has been consumed */
  smpi_complete_recv(from, my_id, sizeof(SMPI_PKT_GET_T));

  MPID_n_pending--;
  shandle->is_complete = 1;
  if (shandle->finish)
    (shandle->finish)(shandle);

  return MPI_SUCCESS;
}

/* Sender side of the copy rendez-vous: an OK_TO_SEND packet came back from
   `from`. Push as much of the remaining data (shandle->gm.frag bytes,
   starting at shandle->gm.copy) as the flow control allows, in CONT_GET
   packets followed by a final DONE_GET packet.
   When the queue fills up before everything is sent, a zero-length FLOW
   packet is sent (or queued when even that does not fit) and
   sent_evt_expected is incremented, so that the receiver answers with a
   new OK_TO_SEND and this function is entered again.
   from    : local id of the receiver
   my_id   : local id of this process
   send_id : MPIR_SHANDLE pointer of the send being served
   recv_id : opaque receive handle of the peer, echoed in every packet
   Always returns MPI_SUCCESS. */
gm_inline int smpi_recv_ok_to_send(int from, int my_id, void * send_id, void * recv_id)
{ 
  volatile void * ptr_volatile;
  void * ptr;
  SMPI_PKT_CONT_GET_T * pkt;
  int destination = from;
  MPIR_SHANDLE * shandle;
  void * recv_handle_id, * send_handle_id;
  int length_to_send, pkt_len, pkt_payload, flow_control_ok, last;

  shandle = (MPIR_SHANDLE *)send_id;
  send_handle_id = send_id;
  recv_handle_id = recv_id;

  /* flow control: the OK_TO_SEND packet has been consumed */
  smpi_complete_recv(from, my_id, sizeof(SMPI_PKT_RNDV_T));
  
  smpi_assert(destination > -1);
  smpi_assert(destination < smpi.num_local_nodes);
  
  /* one of the expected OK_TO_SEND answers has arrived */
  shandle->gm.sent_evt_expected--;
  smpi_assert(shandle->gm.sent_evt_expected >= 0);
  length_to_send = shandle->gm.frag;
  flow_control_ok = 1;
  last = 0;
  
  /* loop while there is data left, or until at least one packet has been
     posted for this send */
  while((flow_control_ok) && ((length_to_send > 0) || (shandle->gm.posted == 0))) {
    if (length_to_send >= SMPI_PKT_LONG_MAX_LOCALDATA_SIZE) { 
      /* full packet */
      pkt_len = sizeof(SMPI_PKT_CONT_GET_T);
      pkt_payload = SMPI_PKT_LONG_MAX_LOCALDATA_SIZE;
    } else {
      /* partial packet: header, the two handle ids, then the remaining data */
      pkt_len = sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *)+length_to_send;
      pkt_payload = length_to_send;
    }
    
    /* this packet carries everything that is left */
    if (length_to_send == pkt_payload)
      last = 1;

    /* flow control */ 
    if (!smpi.send_fifo_head && smpi_able_to_send(destination,pkt_len)) {
      /* build the packet */
      ptr_volatile = (void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
      ptr = (void *)ptr_volatile;
      pkt = (SMPI_PKT_CONT_GET_T *)ptr;
      if (last)
        pkt->mode = MPID_PKT_DONE_GET;
      else
        pkt->mode = MPID_PKT_CONT_GET;
      pkt->send_id = send_handle_id;
      pkt->recv_id = recv_handle_id;
      /* always set the length: the receiver derives the packet size from
         it, and the queue slot may hold a stale value from an older
         packet */
      pkt->len = pkt_payload;
      
      /* copy the data from user buffer */
      if (pkt_payload > 0) {
        memcpy(pkt->buffer, shandle->gm.copy, pkt_payload);
        shandle->gm.copy = (void *)((long)shandle->gm.copy + pkt_payload);
        length_to_send -= pkt_payload;
      }
      shandle->gm.posted = 1;
      
      DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);
      
      /* update flow control */  
      smpi_complete_send(my_id, destination, pkt_len);
    } else {
      flow_control_ok = 0;
      DEBUG_PRINT_MSG("S Pipeline stopped because of flow control");
      if (!smpi.send_fifo_head && smpi_able_to_send(destination,sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *))) {
        /* build the packet */
        ptr_volatile = (void *)((&smpi_shmem->pool)+ SMPI_NEXT(my_id,destination));
        ptr = (void *)ptr_volatile;
        pkt = (SMPI_PKT_CONT_GET_T *)ptr;
        pkt->mode = MPID_PKT_FLOW;
        pkt->send_id = send_handle_id;
        pkt->recv_id = recv_handle_id;
        pkt->len = 0;
        shandle->gm.sent_evt_expected++;
        
        DEBUG_PRINT_SMP_SEND_PKT("S Sending", pkt);
        
        /* update flow control */
        smpi_complete_send(my_id, destination, 
                           sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *));
      } else {
        /* we need to keep the circuit full */
        SMPI_PKT_CONT_GET_T * pkt_p = (SMPI_PKT_CONT_GET_T *)gmpi_xmalloc(sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *),
                                                                          "MPI/smp_plug:recv_ok_to_send");
        smpi_assert(pkt_p);
        
        pkt_p->mode = MPID_PKT_FLOW;
        pkt_p->send_id = send_handle_id;
        pkt_p->recv_id = recv_handle_id;
        /* a FLOW packet carries no data; the length must be set in the
           packet itself (as in the in-place branch above), otherwise the
           receiver reads an uninitialised value */
        pkt_p->len = 0;
        shandle->gm.sent_evt_expected++;
        
        DEBUG_PRINT_SMP_SEND_PKT("S Sending in the send_queue", pkt_p);
        DEBUG_PRINT_MSG("S Sending message in the send_queue");
        smpi_queue_send(pkt_p, 0, sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *), destination, 0);
        DEBUG_PRINT_MSG("S Sent message in the send_queue");
      }
    }
  }

  /* remember what is left for the next OK_TO_SEND */
  shandle->gm.frag = length_to_send;
  
  /* everything posted and no answer outstanding: the send is complete */
  if ((length_to_send == 0) && (shandle->gm.posted == 1) && (shandle->gm.sent_evt_expected == 0)) {
    MPID_n_pending--;
    shandle->is_complete  = 1;
    if (shandle->finish)
      shandle->finish(shandle);
  }
  return MPI_SUCCESS;
}

/* Receiver side of the copy rendez-vous: handle a CONT_GET, DONE_GET or
   FLOW packet coming from `from`.
   The payload (pkt->len bytes) is appended to the user buffer tracked by
   rhandle->gm.netbuf, the packet is consumed from the queue, and then:
     - FLOW     : a new OK_TO_SEND is sent so that the sender resumes
     - DONE_GET : the receive handle is completed
   in_pkt points to the packet inside the shared memory queue; everything
   needed from it is read before the flow control releases it.
   Always returns MPI_SUCCESS. */
gm_inline int smpi_recv_get(int from, int my_id, void * in_pkt)
{ 
  SMPI_PKT_CONT_GET_T *pkt = (SMPI_PKT_CONT_GET_T *)in_pkt;
  MPIR_RHANDLE * rhandle = (MPIR_RHANDLE *)pkt->recv_id;
  int msglen = pkt->len;
  int pkt_len = sizeof(SMPI_PKT_HEAD_T)+2*sizeof(void *)+msglen;
  int is_last = (pkt->mode == MPID_PKT_DONE_GET);
  int wants_progress = !is_last && (pkt->mode == MPID_PKT_FLOW);

  /* append the payload to the receive buffer */
  if (msglen > 0) {
    memcpy( rhandle->gm.netbuf, pkt->buffer, msglen );
    rhandle->gm.netbuf += msglen;
    rhandle->gm.frag -= msglen;
  }

  /* flow control: the packet is no longer needed */
  smpi_complete_recv(from, my_id, pkt_len);

  /* the sender stalled on flow control: tell it to go on */
  if (wants_progress)
    smpi_post_send_ok_to_send_cont(from, rhandle);

  smpi_assert(rhandle->gm.frag >= 0);

  if (is_last) {
    rhandle->is_complete  = 1;
    if (rhandle->finish)
      (rhandle->finish)(rhandle);
  }
  return MPI_SUCCESS;
}

/* GM call to use the kernel to "directcopy" the send buffer into the
   receive buffer: kernel overhead but only one copy.
   from   : local id of the process owning `source`; used to look up its
            board and port ids in the shared memory area
   source : address of the data in the sender
   target : address of the receive buffer in this process
   length : number of bytes to copy
   Does not return on failure: an error is printed and the process exits.
   When directcopy support is compiled out, calling this function is
   always a fatal error. */
gm_inline void smpi_do_get(int from, void * source, void * target, unsigned int length)
{ 
#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
  gm_status_t status;

  /* `from` indexes board_id[] / port_id[]: valid local ids are
     0 .. num_local_nodes-1 */
  smpi_assert((0 <= from) && (from < smpi.num_local_nodes));
  status = gm_directcopy_get(gmpi.port, source, target, length, smpi_shmem->board_id[from], smpi_shmem->port_id[from]);
		   
  /* something went wrong */
  if (status != GM_SUCCESS) {
    fprintf(stderr,"ERROR : [%d], gm_directcopy failed (status=%d), maybe the Directcopy support should be enabled in GM\n",
	    MPID_MyWorldRank, status);
    gm_perror ("gm_directcopy: ", status);
    fflush(stderr);
    exit(1);
  }
#else
  fprintf(stderr,"ERROR : [%d], directcopy is disabled or GM cannot register memory\n", MPID_MyWorldRank);
  fflush(stderr);
  exit(1);
#endif
}

/* Check for new messages in the shared memory receive queues and process
   them: one pass over all the other local processes, handling at most one
   packet per peer.
   dev      : device whose short_msg / rndv protocol handlers are invoked
   blocking : not used in this function
   Returns the status of the last packet handled, or -1 when no packet was
   handled (err is also left unchanged by the "discarded" / "unknown mode"
   branches). */
gm_inline int smpi_net_lookup(MPID_Device *dev,int blocking)
{
  volatile void * ptr;
  SMPI_PKT_T * pkt;
  int from, j;
  MPIR_RHANDLE *rhandle;
  int is_posted;
  int err = -1;
  
  /* visit the peers starting with the one after us, wrapping around */
  for (j=1; j<smpi.num_local_nodes; j++) {
    from = (smpi.my_local_id+j)%smpi.num_local_nodes;
    /* counters differ: the sender has published something we have not
       consumed yet */
    if (SMPI_TOTALIN(from,smpi.my_local_id) != SMPI_TOTALOUT(from,smpi.my_local_id))
      {
	/* barrier between reading the counter and reading the packet */
	GM_READBAR();
	/* the oldest unread packet sits at our read offset in that queue */
	ptr = (void *)((&smpi_shmem->pool)+SMPI_CURRENT(from,smpi.my_local_id));
	pkt = (SMPI_PKT_T *)ptr;
	
	DEBUG_PRINT_SMP_PKT("R receiving msg",pkt);

	/* Separate the incoming messages from control messages */
	if (MPID_PKT_IS_MSG(pkt->head.mode)) {
	    
	  /* Is the message expected or not? 
	     This routine RETURNS a rhandle, creating one if the message 
	     is unexpected (is_posted == 0) */
	  MPID_Msg_arrived( pkt->head.lrank, pkt->head.tag, pkt->head.context_id, &rhandle, &is_posted );

#ifdef MPID_DEBUG_ALL   /* #DEBUG_START# */
	  if (MPID_DebugFlag) {
	    FPRINTF( MPID_DEBUG_FILE, "[%d]R msg was %s (%s:%d)\n", MPID_MyWorldRank, is_posted ? "posted" : "unexpected", __FILE__, __LINE__ );
	  }
#endif                  /* #DEBUG_END# */
	  if (is_posted) {
	    /* a matching receive was already posted */
	    switch (pkt->head.mode) {
	    case MPID_PKT_SHORT:
	      /* NOTE(review): no smpi_complete_recv() here; presumably the
		 handler consumes the packet -- confirm */
	      DEBUG_TEST_FCN(dev->short_msg->recv,"dev->short->recv");
	      err = (*dev->short_msg->recv)( rhandle, from, pkt );
	      break;
	      
#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
            case MPID_PKT_DO_GET:
	      DEBUG_TEST_FCN(dev->rndv->irecv,"dev->rndv->irecv");
	      err = (*dev->rndv->irecv)( rhandle, from, pkt );
	      break;
#else	      
	    case MPID_PKT_REQUEST_SEND:
	      /* copy the envelope into the receive handle before the packet
		 is released */
	      rhandle->s.MPI_TAG = pkt->rndv_pkt.tag;
	      rhandle->s.MPI_SOURCE = pkt->rndv_pkt.lrank;
	      rhandle->s.count = pkt->rndv_pkt.len;
	      rhandle->send_id = pkt->rndv_pkt.send_id;
	      /* flow control to receive the REQUEST_SEND */
	      smpi_complete_recv(from, smpi.my_local_id, sizeof(SMPI_PKT_RNDV_T));

	      /* tell the sender it can start pushing the data */
	      smpi_post_send_ok_to_send(from, rhandle);
	      err = MPI_SUCCESS;
	      break;
#endif	      
	    
	    default:
	      /* only reported: the packet is not consumed and err keeps its
		 previous value */
	      fprintf( stderr, 
		       "[%d] Internal error: msg packet discarded (%s:%d)\n",
		       MPID_MyWorldRank, __FILE__, __LINE__ );
	      fflush( stderr );
	    }
	  } else {
	    /* unexpected message: rhandle was created by MPID_Msg_arrived */
	    switch (pkt->head.mode) {
	    case MPID_PKT_SHORT:
	      rhandle->send_id = 0;
	      DEBUG_TEST_FCN(dev->short_msg->unex,"dev->short->unex");
	      err = (*dev->short_msg->unex)( rhandle, from, pkt );
	      break;
#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
	    case MPID_PKT_DO_GET:
	      /* Need the send handle address in order to cancel a send */
	      rhandle->send_id = pkt->get_pkt.send_id;
	      DEBUG_TEST_FCN(dev->rndv->unex,"dev->rndv->unex");
	      err = (*dev->rndv->unex)( rhandle, from, pkt );
	      break;
#else
	    case MPID_PKT_REQUEST_SEND:
	      /* Need the send handle address in order to cancel a send */
	      rhandle->send_id = pkt->rndv_pkt.send_id;
	      DEBUG_TEST_FCN(dev->rndv->unex,"dev->rndv->unex");
	      err = (*dev->rndv->unex)( rhandle, from, pkt );
	      break;
#endif
	    default:
	      /* only reported, as in the posted case (stderr is not flushed
		 here) */
	      fprintf( stderr, 
		       "[%d] Internal error: msg packet discarded (%s:%d)\n",
		       MPID_MyWorldRank, __FILE__, __LINE__ );
	    }
	  }
	} else {
	  /* control packets */
	  switch (pkt->head.mode) {
#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
	  case MPID_PKT_DONE_GET:
	    /* directcopy build: DONE_GET acknowledges a DO_GET on the sender */
	    err = smpi_recv_done_get(from, smpi.my_local_id, pkt->get_pkt.send_id);
	    break;
#else
	  case MPID_PKT_OK_TO_SEND:
	    /* sender side: the receiver is ready, push the data */
	    err = smpi_recv_ok_to_send(from, smpi.my_local_id, pkt->rndv_pkt.send_id, pkt->rndv_pkt.recv_id);
	    break;

	  /* receiver side: data / flow packets of the copy rendez-vous */
	  case MPID_PKT_FLOW:
	  case MPID_PKT_CONT_GET:
	  case MPID_PKT_DONE_GET:
	    err = smpi_recv_get(from, smpi.my_local_id, pkt);
	    break;
#endif
	  case MPID_PKT_ANTI_SEND:
	    MPID_SendCancelOkPacket( pkt, from );
	    /* flow control to receive the ANTI_SEND */
	    smpi_complete_recv(from, smpi.my_local_id, sizeof(SMPI_PKT_ANTI_SEND_T));

	    err = MPI_SUCCESS;
	    break;
            
	  case MPID_PKT_ANTI_SEND_OK:
	    MPID_RecvCancelOkPacket( pkt, from ); 
	    /* flow control to receive the ANTI_SEND_OK */
	    smpi_complete_recv(from, smpi.my_local_id, sizeof(SMPI_PKT_ANTI_SEND_T));

	    err = MPI_SUCCESS; 
	    break;
	      
	  default:
	    /* reported on stdout (not stderr); the packet is not consumed */
	    fprintf( stdout, "[%d] Mode %d (0x%x) is unknown (internal error) %s:%d!\n", 
		     MPID_MyWorldRank, pkt->head.mode, pkt->head.mode,
		     __FILE__, __LINE__ );
	  }
	}
      }
  }
  /* DEBUG_PRINT_MSG("Exiting check_incoming"); */
  return err;
}

/* Main progress routine of the SMP device.
   First post, in fifo order, every queued send request the flow control
   now accepts (stopping at the first one that does not fit, so the order
   is kept), then poll the shared memory queues for incoming packets.
   Returns whatever smpi_net_lookup() returns. */
gm_inline int MPID_SMP_Check_incoming(MPID_Device *dev,MPID_BLOCKING_TYPE blocking)
{
  struct smpi_send_fifo_req *head;

  for (;;) {
    head = smpi.send_fifo_head;
    if (!head || !smpi_able_to_send(head->grank, head->len))
      break;

    smpi_post_send_queued(head->data, head->shandle, head->len, head->grank);
    /* unlink and release the request that was just posted */
    smpi.send_fifo_head = head->next;
    free(head);
  }

  /* polling */
  return smpi_net_lookup(dev,blocking);
}


/* Initialisation of the SMP device.
   Steps:
     1. check the number of local processes and, in the directcopy build
        when all processes are local, open a GM port
     2. open the file named by GMPI_SHMEM_FILE; local id 0 sizes and zeroes
        it, the others wait until it has the expected size
     3. mmap the file and synchronise through the pid[] array it contains
        (local id 0 unlinks the file once everybody has attached)
     4. local id 0 lays out the receive queues; every process publishes its
        GM board and port ids
   Any failure is fatal: a message is printed and the process exits. */
void smpi_init(void)
{
  unsigned int i,j,size,pool, pid, wait;
  char * buf;
  struct stat file_status;
  char * shmem_file;

#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
  gm_status_t status;
#endif

  if (smpi.num_local_nodes > SMPI_MAX_NUMLOCALNODES) {
    fprintf(stderr,"ERROR: mpi node %d, too many local processes (%d processes, %d maximum). Change the SMPI_MAX_NUMLOCALNODES value in smpi.h\n", MPID_MyWorldRank, smpi.num_local_nodes, SMPI_MAX_NUMLOCALNODES );
    fflush(stderr);
    exit(1);
  }

#if defined(SMP_ENABLE_DIRECTCOPY) && defined(GM_CAN_REGISTER_MEMORY)
  /* we need a GM port to access the kernel code for Directcopy */
  /* usually, people who use MPI-GM/SMP have Myrinet, not only one SMP node ;-)) */
  if (MPID_MyWorldSize == smpi.num_local_nodes) {
    status = gm_open(&gmpi.port, gmpi.board_ids[MPID_MyWorldRank], gmpi.port_ids[MPID_MyWorldRank], "gm-mpi", GM_API_VERSION_1_0);
    if (status != GM_SUCCESS) {
      fprintf(stderr,"ERROR: mpi node %d, cannot open GM board %d gm_port %d\n", MPID_MyWorldRank, gmpi.board_ids[MPID_MyWorldRank],gmpi.port_ids[MPID_MyWorldRank]);
      fflush(stderr);
      exit(1);
    }
    gmpi_regcache_init();
  }
#endif

  /* usable length of a queue: room is kept for one maximum-size long
     packet (payload + header + the two handle ids) */
  smpi.available_queue_length = SMPI_LENGTH_QUEUE-SMPI_PKT_LONG_MAX_LOCALDATA_SIZE-sizeof(SMPI_PKT_HEAD_T)-2*sizeof(void *);

  /* get the shared memory file name (mmaped) for the running environment */
  shmem_file = (char *)getenv("GMPI_SHMEM_FILE");
#if PRINT_CONFINFO
  printf("shmem_file = '%s'\n",shmem_file);
#endif  
  if (!shmem_file) {
    fprintf(stderr, "Error: Need to obtain shared memory file name in GMPI_SHMEM_FILE\n");
    exit(1);
  }

  /* open the shared memory file */
  smpi.fd = open(shmem_file, O_RDWR | O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO);
  if (smpi.fd < 0) { 
    fprintf(stderr, "[%d] smpi_init:error in opening shared memory file: %d\n", MPID_MyWorldRank, errno);
    exit(1);
  }

  /* compute the size of this file: one queue per ordered pair of distinct
     local processes. The per-queue size must be the stride used below when
     the queues are laid out (SMPI_ALIGN(SMPI_LENGTH_QUEUE +
     SMPI_CACHE_LINE_SIZE)), otherwise the last queues extend past the end
     of the mapping. */
  size = SMPI_CACHE_LINE_SIZE+sizeof(struct shared_mem)+(smpi.num_local_nodes*(smpi.num_local_nodes-1)*(SMPI_ALIGN(SMPI_LENGTH_QUEUE+SMPI_CACHE_LINE_SIZE)));
  
  /* initialization of the shared memory file */
  if (smpi.my_local_id == 0) {
    ftruncate(smpi.fd, size);
    buf = (char *)calloc(size+1, sizeof(char));
    if (!buf) {
      fprintf(stderr, "[%d] smpi_init:error in allocating %u bytes to initialize shared memory file\n", MPID_MyWorldRank, size+1);
      exit(1);
    }
    /* fill the whole file with zeroes: the pid[] handshake below relies
       on it */
    if (write(smpi.fd, buf, size) != size) {
      fprintf(stderr, "[%d] smpi_init:error in writing shared memory file: %d\n", MPID_MyWorldRank, errno);
      exit(1);
    }
    if (lseek(smpi.fd, 0, SEEK_SET) != 0) {
      fprintf(stderr, "[%d] smpi_init:error in lseek on shared memory file: %d\n", MPID_MyWorldRank, errno);
      exit(1);
    }
    free(buf);
  }
  /* synchronization between local processes: spin until the file has
     reached its final size */
  do
    if (fstat(smpi.fd, &file_status) != 0) {
      fprintf(stderr, "[%d] smpi_init:error in fstat on shared memory file: %d\n", MPID_MyWorldRank, errno);
      exit(1);
    } 
  while (file_status.st_size != size);
  
  /* mmap of the shared memory file */
  smpi.mmap_ptr = mmap(0, size, (PROT_READ | PROT_WRITE), (MAP_SHARED), smpi.fd, 0); 
  if (smpi.mmap_ptr == (void*)-1) { 
    fprintf(stderr, "[%d] smpi_init:error in mmapping shared memory: %d\n", MPID_MyWorldRank, errno);
    exit(1);
  }
  
  /* the mapping is used as is; it only has to be cache-line aligned */
  smpi_shmem =(struct shared_mem *)smpi.mmap_ptr;
  if (((long)smpi_shmem & (SMPI_CACHE_LINE_SIZE-1)) != 0) {
    fprintf(stderr, "[%d] smpi_init:error in shifting mmapped shared memory\n", MPID_MyWorldRank);
    exit(1);
  }
  
  /* another synchronization barrier, through the pid[] array */
  if (smpi.my_local_id == 0) {
    /* wait until every other local process has written its pid */
    wait = 1;
    while (wait) {
      wait = 0;
      for (i=1; i<smpi.num_local_nodes; i++) {
        if (smpi_shmem->pid[i] == 0)
          wait = 1;
      }
    }
    /* id = 0, unlink the shared memory file, so that it is cleaned
       up when everyone exits */
    if (unlink(shmem_file) != 0)
      fprintf(stderr,"[%d] smpi_init:error in unlinking shmem file, %s: %d\n", MPID_MyWorldRank,shmem_file, errno);

    pid = getpid();
    if (pid == 0) {
      fprintf(stderr, "[%d] smpi_init:error in geting pid\n", MPID_MyWorldRank);
      exit(1);
    }
    /* writing pid[0] releases the other processes */
    smpi_shmem->pid[smpi.my_local_id] = pid;
    GM_WRITEBAR();
  } else {
    /* wait until slot 0 reads as zero, then keep writing our own pid
       until local id 0 publishes its pid */
    while (smpi_shmem->pid[0] != 0);
    while (smpi_shmem->pid[0] == 0) {
      smpi_shmem->pid[smpi.my_local_id] = getpid();
      GM_WRITEBAR();
    }
    
    for (i=0;i<smpi.num_local_nodes;i++)
      if (smpi_shmem->pid[i] <= 0) {
        fprintf(stderr, "[%d] smpi_init:error in geting pid\n", MPID_MyWorldRank);
        exit(1);
      }
  }
  
 
  /* init rqueues in shared memory */
  /* NOTE(review): the other processes are released above, before the
     queues are laid out here -- confirm that a later synchronisation
     prevents them from using the queues too early */
  if (smpi.my_local_id == 0) {
    pool = 0;
    for (i=0;i<smpi.num_local_nodes;i++)
      for (j=0;j<smpi.num_local_nodes;j++)
        if (i != j) { 
          GM_READBAR();
          smpi_shmem->rqueues_limits[i][j].first = SMPI_ALIGN(pool);
          smpi_shmem->rqueues_limits[i][j].last = SMPI_ALIGN(pool+smpi.available_queue_length);
          
          smpi_shmem->rqueues_params[i].params[j].current = SMPI_ALIGN(pool);
          smpi_shmem->rqueues_params[j].params[i].next = SMPI_ALIGN(pool);
          smpi_shmem->rqueues_params[j].params[i].msgs_total_in = 0;
          
          smpi_shmem->rqueues_flow_out[i][j].msgs_total_out = 0;
          /* stride between two consecutive queues in the pool */
          pool+=SMPI_ALIGN(SMPI_LENGTH_QUEUE+SMPI_CACHE_LINE_SIZE);
          GM_READBAR();
        }
  }
  /* publish our GM board / port ids (read by smpi_do_get on the peers) */
  smpi_shmem->board_id[smpi.my_local_id] = gmpi.board_ids[MPID_MyWorldRank];
  smpi_shmem->port_id[smpi.my_local_id] = gmpi.port_ids[MPID_MyWorldRank];
  GM_READBAR();
}
  

/* Shut the SMP device down: drain the pending send fifo, unmap the shared
   memory area, close its file descriptor and, when all the processes are
   local, close the GM port. */
void smpi_finish(void)
{
  /* queued sends must reach their peer before the mapping goes away */
  while (smpi.send_fifo_head) {
    MPID_DeviceCheck(MPID_BLOCKING);
  }
  /* unmap the shared memory file */
  /* NOTE(review): this length is computed without SMPI_ALIGN, unlike the
     queue stride used in smpi_init() -- confirm it matches the length
     passed to mmap() */
  munmap(smpi.mmap_ptr, (SMPI_CACHE_LINE_SIZE+sizeof(struct shared_mem)+(smpi.num_local_nodes*(smpi.num_local_nodes-1)*(SMPI_LENGTH_QUEUE+SMPI_CACHE_LINE_SIZE))));
  close(smpi.fd);

  /* NOTE(review): smpi_init() opens the port only in the directcopy build
     (SMP_ENABLE_DIRECTCOPY && GM_CAN_REGISTER_MEMORY); this close is not
     guarded the same way -- confirm the port is always open here */
  if (MPID_MyWorldSize == smpi.num_local_nodes)
    gm_close(gmpi.port);
}
