/******************************************************************-*-c-*-
 * Myricom GM networking software and documentation                      *
 * Copyright (c) 1996, 1997, 1998, 1999, 2000 by Myricom, Inc.           *
 * All rights reserved.  See the file `COPYING' for copyright notice.    *
 *************************************************************************/

/* author: glenn@myri.com */

/**
   GM provides reliable ordered delivery between pairs of hosts on a Myrinet
   with two nonpreemptive priority levels.

   ---

   The driving force behind the design of this MCP are the following
   observations:

   [NOTE: The following is based on the original LANai4 dispatch, but
   remains qualitatively correct, although some of the numbers are out
   of date for the newer implementation.  --Glenn]

   (1) The LANai is composed of 4 event-driven interfaces: The sdma, send,
   recv, and rdma interfaces.
   (2) The MCP is responsible for setting up transactions on each interface,
   but can do so only when certain conditions are satisfied (e.g.: The
   send interface is idle and data is available to send.).  These events
   will be called "start events".
   (3) The MCP must be able to detect the completion of transactions on each
   interface.  These events will be called "finish events".

   It turns out that only 14 bits of information are needed to specify
   whether a transaction can be started or concluded.  6 of the bits
   are in the bottom byte of ISR and are maintained by the LANai
   hardware.  The remaining 9 bits can be stored in a general purpose
   register "state" and a mask can be stored in IMR such that
   "ISR&IMR|state" packs the 14 state bits into the bottom of a
   register.  Consequently, a 16K gm_event_index table can be constructed
   such that gm_event_index[ISR&IMR|state] returns the index of the
   highest-priority pending event.  Also, since "state" is a register,
   we can set or clear any set of state bits using a single
   or-immediate or and-immediate instruction.

   We can also construct a small "gm.handler" table to convert the event
   index into a pointer to the code to handle the event.  While we
   could have had the first table perform the full conversion, this
   solution takes about 1/4 the memory and allows the handler for any
   event to be changed dynamically in just 2 instructions.  Also the
   double-table solution takes only 2 more clock cycles per dispatch
   than the single-table solution.

   The result of all this dispatch trickery is that dispatching
   contributes, in the best case, only 35 instructions of latency to the
   best-case one-way message latency while providing a flexible and
   efficient state-transition mechanism for the MCP.

  ---

  WARNING: The following information is somewhat out of date,
  particularly the info about queuing.

  Conceptually, the GM MCP is structured as depicted in the following
  diagram.  In the diagram, rounded rectangles represent the state
  machines, which execute concurrently, the canonical queue symbol
  ("]O") represents queue data structures, and sharp rectangles represent
  other data structures.  The capacity of each queue is specified, with
  "R" being the number for receive tokens and "S" being the number of send
  tokens.


				    +-------+
				    |Pending|
				    |(n)ack |
		Network		    | table |	      Network
		   |		    +-------+		 ^
		   V	 received     ^	   |		 |	+-----+
	       .-------. (n)acks      |	   |	     .-------.	|addr |
	       | Recv  |--------------+--. `-------->| Send  |<-|table|
	       `-------'	      |	 |  ,--------`-------'	+-----+
		|      ^	      |	 |  |	      |	     ^
		|      |	      |	 |  |	      |	     |
		V     /^\ Free	      |	 |  |	      V	    /^\ Send chunks
	      | 2 |   >-< recv	      |	 |  |	Free| 2 |   >-<
	       >-<   | 2 |chunks      |	 |  |	send >-<   | 2 |
  recv chunks  \_/     ^	      |	 |  |  chunks\_/     ^
		|      |	      |	 |  |	      |	     |
		V      |   generated  |	 |  |	      |	     |
	       .--------.   (n)acks   |	 |  |	      |	     |
	       |  RDMA	|-------------'	 |  |	      |	     |
	       `--------'		 |  |	      |	     |
		|      ^		 |  |	      |	     |
		|      |		 |  |	      |	     |
		|     /^\ Free recv	 |  |	      |	     |
		|     >-< token lists	 |  |	      |	     |
		|    | R |array		 |  |	      |	     |
		|      ^		 |  |	      |	     |
		|      |		 |  |	      |	     |
		|      ^                 |  |         |      |
		|			 |  |	      |	     |
		|			 V  V	      V	     |
		|			+----+	     .--------.
		| ,---------------------|sent|------>|	SDMA  |
		| |			|list|	     `--------'
		V V			|arry|		     ^
	       |R+S|			+----+		     |
		>-<					     |
recv tokens	\_/ (---Host/LANai communication queues---) /^\ Send tokens
		 |					    >-<
		 |					   |R+S|
		 |					     ^
		 |					     |
		 V					     ^

  The GM MCP is comprised of four state machines: SDMA, Send, Recv,
  and RDMA.  The responsibilities of each state machine are as
  follows:

    SDMA:

    Remove message descriptors from the send message list, and check the
    message type.  Recv tokens are stored in the appropriate "free recv
    token list".  All other tokens (send tokens) are handled by
    SDMAing the message to be sent into pieces and placing the data in
    "send chunks", usually by DMAing it, as send chunks become available.

    Send:

    Send all enqueued chunks, prepending the correct route before the
    first chunk for a message, and marking the end of the message with a
    tail flit.	After sending a message, place the token for the sent
    message in the sent list for the corresponding connection.

    Also, send any pending ack or nack messages as indicated by the
    pending (n)ack table.

    Recv:

    Break received messages into chunks and pass them to the RDMA
    engine if the message header is valid.  Messages with invalid
    headers are discarded. Intercept ack and nack messages, using the
    information in acks and nacks to free the appropriate token(s)
    from the "sent list array" into the recv token queue, marking the
    token as a send token, and using the information in nacks to
    prepend the appropriate tokens from the sent list to the send
    message list.

    RDMA:

    Check sequence numbers of received messages and check the
    availability of a receive message buffer of the correct size in the
    second level (as indicated by the free recv message array) and drop
    and nack erroneous messages as appropriate (messages that have bad
    CRCs or are undeliverable because there is no recv token of the
    appropriate size are nacked).  DMA chunks into the appropriate
    location in the receive message buffer indicated by the recv token,
    and if the message CRC checks out, pass the message to the recvd
    messages list.

  All elements in this system require only bounded pre-allocated
  storage.  This may not be obvious for the pending nack table or the
  sent list array.  The "pending (n)ack table" records only a single
  sequence number and an ack/nack indicator for each connection.  The
  send message list, sent list array, and free send message list form a
  loop in which message descriptors are conserved (as do the other
  three queues/free_list pairs.)

  Only a bounded number of connections are supported.  But since the
  state is small and only one connection is required per host on the
  network, a .5MB LANai board should be able to support O(1000)-node
  networks.  One caveat: Only network diameters <= 16 are supported.

  ---

  Acks and Nacks are worth a little explaining.  GM uses a form of the
  "go back N" protocol with NACKs (see Telecommunication Networks:
  Protocols, Modeling and Analysis by Mischa Schwartz, Addison-Wesley,
  ISBN 0-201-16423-X). "Go back N" is the preferred protocol when
  software overhead, rather than network capacity, is the network's
  limiting factor, because it wastes network bandwidth during error
  recovery to reduce software overhead relative to other protocols.

  For each connection, when each message is sent for the first time, it
  is given a successive sequence number.  When the message is received,
  the receiver checks the sequence number received against the
  expected sequence number (seqno).  If there is a discrepancy, the
  receiver nacks the expected seqno, indicating that the sender should
  retransmit the expected message and all following messages and
  implicitly acknowledging all earlier messages.  All out-of-sequence
  messages are dropped. If the seqno of a received message agrees with
  the expected value, the seqno is acked.

  Upon receiving an ack OR nack, the sender (i.e.: the recipient of
  the (n)ack) frees all messages with lower sequence numbers.  Upon
  receiving a nack, the sender also prepends to the send queue all
  messages with sequence numbers greater than or equal to the nacked
  sequence number, rewinding the send stream to the message missed by
  the receiver (the "go back N" part of the protocol).

  The ack and nack messages themselves are not delivered reliably, and
  consequently timers must be used to force retransmission of
  unacknowledged messages if the ack or nack is lost.  In many reliable
  communication systems, a timer is set for each transmitted message.
  In GM, a timer goes off periodically, at which time all
  unacknowledged messages are queued for retransmission.  Depending
  upon the length of the timeout, this system either wastes bandwidth
  retransmitting messages unnecessarily or recovers from errors very
  slowly.  However, this system works efficiently on Myrinet, where
  errors are extremely rare, bandwidth is plenty, and latency is
  small.  */

#include <stdio.h>
#include "gcc_version.h"
#include "gm_bootstrap.h"
#include "gm_cpp.h"
#include "gm_debug.h"
#include "gm_debug_lanai_tlb.h"
#include "gm_debug_send_tokens.h"
#include "gm_enable_crc32.h"
#include "gm_enable_datagrams.h"
#include "gm_enable_debug_counters.h"
#include "gm_enable_directed_send.h"
#include "gm_enable_error_counters.h"
#include "gm_enable_packet_counters.h"
#include "gm_enable_ethernet.h"
#include "gm_enable_fast_small_send.h"
#include "gm_enable_galvantech_workaround.h"
#include "gm_enable_mcp_send_length_check.h"
#include "gm_enable_paranoid_interrupts.h"
#include "gm_enable_pio_sends.h"
#include "gm_enable_security.h"
#include "gm_enable_shortcut.h"
#include "gm_enable_crc32.h"
#include "gm_error_handlers.h"
#include "gm_pte.h"
#include "gm_types.h"
#include "gmcp.h"
#include "gm_enable_trace.h"
#include "gm_trace.h"
#if GM_ENABLE_VM
#include "gm_page_hash.h"
#endif

/* Set MORE_FLASHING to 1 to have gm_flash_while_waiting() call
   gm_morse_async() even when GM_DEBUG is off. */
#define MORE_FLASHING 0
#if MORE_FLASHING
#  warning MORE_FLASHING may impact performance.
#endif

/* Pass STR to gm_morse_async() when either MORE_FLASHING or GM_DEBUG
   is enabled; otherwise STR is deliberately left unused. */
static inline void
gm_flash_while_waiting (char *str)
{
#if MORE_FLASHING || GM_DEBUG
  gm_morse_async (str);
#else /* !MORE_FLASHING && !GM_DEBUG */
  GM_PARAMETER_MAY_BE_UNUSED (str);
#endif /* MORE_FLASHING || GM_DEBUG */
}

#define LZERO 0
#define LONE 1

/****************************************************************
 * Global variables
 ****************************************************************/

/* HACK: The stack is this statically reserved array of
   GM_STACK_WORDS words.  gm_top_of_stack holds the address just past
   its last word, which is how crt0.s learns where the top of the
   stack is. */

gm_u32_t gm_stack[GM_STACK_WORDS];
void *gm_top_of_stack = &gm_stack[GM_STACK_WORDS];

#if GM_ENABLE_GALVANTECH_WORKAROUND
/* 64-bit-aligned buffer large enough for one gm_ack_packet_t plus
   GM_MAX_NETWORK_DIAMETER additional bytes, sized in whole 8-byte
   words (the "+ 1" covers the remainder of the integer division).
   NOTE(review): presumably the extra bytes hold the route that
   precedes the packet -- confirm against the code that fills it. */
gm_u64_t copied_ack_packet[((sizeof (gm_ack_packet_t)
			     + GM_MAX_NETWORK_DIAMETER) / 8) + 1];
#endif /* GM_ENABLE_GALVANTECH_WORKAROUND */

/***********************************************************************
 * min/max functions
 ***********************************************************************/

/* Return the smaller of A and B (B when they are equal). */
static inline gm_s32_t
gm_s32_min (gm_s32_t a, gm_s32_t b)
{
  if (a < b)
    return a;
  return b;
}
/* Return the larger of A and B (B when they are equal). */
static inline gm_s32_t
gm_s32_max (gm_s32_t a, gm_s32_t b)
{
  if (a > b)
    return a;
  return b;
}

/* CSPI hack: when L5 is set, define LAR, EAR, and DMA_DIR as ordinary
   zero-initialized globals.
   NOTE(review): presumably these stand in for hardware registers of
   the same names that this target does not provide -- confirm. */
#if L5
void *LAR = 0;
void *EAR = 0;
int DMA_DIR = 0;
#endif

/* Abort the MCP by raising an assertion failure that is always
   compiled in.  Declared with a real (void) prototype so that calls
   passing arguments are diagnosed. */
void
gm_abort (void)
{
  gm_always_assert (0);
}

#if GM_MIN_SUPPORTED_SRAM <= 256
/* Copy LEN bytes from _FROM to _TO, one byte at a time in ascending
   address order.  Because of that order, overlapping regions are
   handled correctly only when the destination starts at or below the
   source. */
void
gm_bcopy (void *_from, void *_to, unsigned long len)
{
  const char *src = (const char *) _from;
  char *dst = (char *) _to;

  while (len != 0)
    {
      *dst++ = *src++;
      len--;
    }
}

/* Copy LEN bytes from _FROM to _TO, one byte at a time in descending
   address order (last byte first).  Because of that order,
   overlapping regions are handled correctly when the destination
   starts above the source.

   Both cursors start one past the end of their region and are
   decremented BEFORE each access, so exactly bytes [0, LEN) are
   touched.  (Post-decrementing here would read and write index LEN
   and never copy index 0.) */
void
gm_reverse_bcopy (void *_from, void *_to, unsigned long len)
{
  register char *from, *to;
  register unsigned long i;

  from = ((char *) _from) + len;
  to = ((char *) _to) + len;

  for (i = len; i; i--)
    {
      *--to = *--from;
    }
}


#else


/* Pipelined Duff's Device bcopy implementation with read-ahead.

   Copies LEN bytes from FROM to TO in ascending address order.  Three
   bytes are always staged in A, B and C ahead of the stores, so each
   BCOPY3() stores the three staged bytes and reloads them from the
   source.

   The switch on (len & 15) jumps into the middle of the unrolled loop
   body, which holds 16 BCOPY3() steps (48 bytes per full pass).  The
   two `if' tests at `case 1' and `case 2' finish copies that end one
   or two bytes past a 3-byte step; all other lengths end exactly on
   the `while' test.

   NOTE: because the three staging loads before the switch are
   unconditional and every BCOPY3() reloads, the source is read up to
   three bytes beyond FROM + LEN (including when LEN is 0).  Those
   bytes are never stored. */

/* Store the staged bytes A, B, C and refill each from the source. */
#define BCOPY3() *t++ = a; a = *f++; *t++ = b; b = *f++; *t++ = c; c = *f++;

void
gm_bcopy (void *from, void *to, gm_size_t len)
{
  char *f, *t, *limit;
  char a, b, c;

  f = from;
  t = to;
  limit = t + len;		/* one past the last byte to be written */

  /* Prime the three-byte pipeline. */
  a = *f++;
  b = *f++;
  c = *f++;
  switch (len & 15)
    {
    case 0:
      while (t != limit)
	{
	  BCOPY3 ();
    case 13:
	  BCOPY3 ();
    case 10:
	  BCOPY3 ();
    case 7:
	  BCOPY3 ();
    case 4:
	  BCOPY3 ();
    case 1:
	  /* Exactly one byte left: flush A only. */
	  if (t + 1 == limit)
	    {
	      *t++ = a;
	      return;
	    }
	  BCOPY3 ();
    case 14:
	  BCOPY3 ();
    case 11:
	  BCOPY3 ();
    case 8:
	  BCOPY3 ();
    case 5:
	  BCOPY3 ();
    case 2:
	  /* Exactly two bytes left: flush A and B only. */
	  if (t + 2 == limit)
	    {
	      *t++ = a;
	      *t++ = b;
	      return;
	    }
	  BCOPY3 ();
    case 15:
	  BCOPY3 ();
    case 12:
	  BCOPY3 ();
    case 9:
	  BCOPY3 ();
    case 6:
	  BCOPY3 ();
    case 3:
	  BCOPY3 ();
	}
    }
}

#undef BCOPY3

/* Reverse-direction BCOPY3: store the staged bytes A, B, C at
   descending addresses and refill each from the source. */
#define BCOPY3() *--t = a; a = *--f; *--t = b; b = *--f; *--t = c; c = *--f;

/* Just like bcopy, but copy the bytes in reverse order (last byte
   first), using the same jump-into-the-loop structure.

   F and T start one past the end of their regions and LIMIT is the
   first destination byte.

   NOTE: because the three staging loads before the switch are
   unconditional and every BCOPY3() reloads, the source is read up to
   three bytes below FROM (including when LEN is 0).  Those bytes are
   never stored. */

void
gm_reverse_bcopy (void *from, void *to, unsigned long len)
{
  char *f, *t, *limit;
  char a, b, c;

  f = (char *) from + len;
  t = (char *) to + len;
  limit = (char *) to;

  /* Prime the three-byte pipeline from the end of the source. */
  a = *--f;
  b = *--f;
  c = *--f;
  switch (len & 15)
    {
    case 0:
      while (t != limit)
	{
	  BCOPY3 ();
    case 13:
	  BCOPY3 ();
    case 10:
	  BCOPY3 ();
    case 7:
	  BCOPY3 ();
    case 4:
	  BCOPY3 ();
    case 1:
	  /* Exactly one byte left: flush A only. */
	  if (t - 1 == limit)
	    {
	      *--t = a;
	      return;
	    }
	  BCOPY3 ();
    case 14:
	  BCOPY3 ();
    case 11:
	  BCOPY3 ();
    case 8:
	  BCOPY3 ();
    case 5:
	  BCOPY3 ();
    case 2:
	  /* Exactly two bytes left: flush A and B only. */
	  if (t - 2 == limit)
	    {
	      *--t = a;
	      *--t = b;
	      return;
	    }
	  BCOPY3 ();
    case 15:
	  BCOPY3 ();
    case 12:
	  BCOPY3 ();
    case 9:
	  BCOPY3 ();
    case 6:
	  BCOPY3 ();
    case 3:
	  BCOPY3 ();
	}
    }
}

#undef BCOPY3

#endif

/****************************************************************
 * Dispatch macros
 ****************************************************************/

/****************
 * dispatch debug support functions
 ****************
 These are inline functions (instead of macros) so we can better
 tell what assertion fails from the file and line number reported in
 the GM_FAILED_ASSERTION_INTERRUPT. */

/* verify that the state is not too large for the state table. */


void gm_dispatch (void);
int main (void);

/* instrumenting */
#if GM_ENABLE_TRACE
 /* loic says: we are wasting 3 times GM_TRACEBUFSIZE to ensure
    gtracebuf is aligned on a 2*GM_TRACEBUFSIZE boundary.  If that is
    too much, another strategy is to use a base pointer and an index,
    but then we need 2 global registers, and probably two more
    instructions per instrumentation point.  Another possibility is
    to specify somehow to the linker the right alignment.

    glenn says: The linker cannot align on coarser than an 8 byte
    boundary. */
gm_l_trace_t gtracebufarea[GM_LANAI_NUMTRACE * 4];
gm_l_trace_t *gtracebuf;
#endif

#include "gm_dispatch.h"

/****************************************************************
 * 64-bit timer
 ****************************************************************/

/* Return a 64-bit timestamp built from the 32-bit RTC register.

   The previous RTC sample is kept in the `low' half of a static
   64-bit staging value.  Whenever a new sample is smaller than the
   previous one, the counter is taken to have wrapped and the `high'
   half is incremented.  Wraps are detected only by comparing
   consecutive samples, so this function must be called at least once
   per clock rollover for correct operation.  Therefore, it is called
   in the low-frequency timeout code in gm_timer.h.

   NOTE(review): the union declares `high' before `low', so `whole'
   has the intended value only where the most-significant word is
   stored first (big-endian) -- confirm for any new target. */

inline gm_s64_t
rtc64 ()
{
  static union
  {
    gm_s64_t whole;
    struct
    {
      gm_s32_t high;
      gm_u32_t low;
    }
    parts;
  }
  stage;
  gm_u32_t rtc;

  rtc = RTC;
  /* A sample below the previous one means the 32-bit counter wrapped. */
  if (rtc < stage.parts.low)
    stage.parts.high++;
  stage.parts.low = rtc;

  return stage.whole;
}

/************************************************************************
 * Miscellaneous functions
 ************************************************************************/

#include "gm_rand.h"

/* Set the C bytes starting at S_PV to zero.

   Small-SRAM builds (GM_MIN_SUPPORTED_SRAM <= 256) use a plain byte
   loop.  Other builds clear regions of 3 bytes or fewer bytewise;
   larger regions have their first and last three bytes cleared
   bytewise and every 4-byte-aligned word in between cleared by a
   16-way unrolled loop entered Duff's-device style.

   Pointer alignment is computed in `unsigned long' rather than
   `unsigned' so the address is not truncated where pointers are wider
   than `unsigned'. */
void
gm_bzero (void *s_pv, gm_size_t c)
{
#if GM_MIN_SUPPORTED_SRAM <= 256
  register char *cptr;
  register gm_size_t i;

  cptr = (char *) s_pv;
  for (i = c; i; i--)
    {
      *cptr++ = 0;
    }
#else /* GM_MIN_SUPPORTED_SRAM > 256 */
  gm_s32_t *top, *bottom;
  gm_s8_t *s = (gm_s8_t *) s_pv;

  gm_assert (s_pv);

  if (c <= 3)
    {
      gm_size_t i;

      for (i = 0; i < c; i++)
	s[i] = 0;
      return;
    }

  /* Clear the top and bottom three bytes of the region.  Rounding
     BOTTOM up and TOP down below can each skip at most three bytes,
     so these stores cover everything the word loop misses. */

  s[0] = s[1] = s[2] = s[c - 1] = s[c - 2] = s[c - 3] = 0;

  /* Clear all the ints (4 bytes) within the string in a fast unrolled loop. */

  bottom = (gm_s32_t *) ((unsigned long) (s + 3) & ~3UL);
  top = (gm_s32_t *) ((unsigned long) (s + c) & ~3UL);
  if (bottom >= top)
    return;

  /* Enter the unrolled body so that the first, partial pass clears
     (word count mod 16) words; every later pass clears 16. */
  switch ((top - bottom) & 0xf)
    {
    case 0:
      while (bottom < top)
	{
	  *bottom++ = 0;
    case 0xf:
	  *bottom++ = 0;
    case 0xe:
	  *bottom++ = 0;
    case 0xd:
	  *bottom++ = 0;
    case 0xc:
	  *bottom++ = 0;
    case 0xb:
	  *bottom++ = 0;
    case 0xa:
	  *bottom++ = 0;
    case 0x9:
	  *bottom++ = 0;
    case 0x8:
	  *bottom++ = 0;
    case 0x7:
	  *bottom++ = 0;
    case 0x6:
	  *bottom++ = 0;
    case 0x5:
	  *bottom++ = 0;
    case 0x4:
	  *bottom++ = 0;
    case 0x3:
	  *bottom++ = 0;
    case 0x2:
	  *bottom++ = 0;
    case 0x1:
	  *bottom++ = 0;
	}
    }
  return;
#endif /* GM_MIN_SUPPORTED_SRAM > 256 */
}

/* Compare 32-bit sequence numbers. Return 0 if equal, negative if
   a<b, and positive if a>b, but compensating for wraparound.

   The subtraction is performed in unsigned arithmetic, where
   wraparound is well defined, and only the result is reinterpreted
   as signed.  (Subtracting as signed values overflows in exactly the
   wraparound case this macro exists for.) */

#define SEQ_CMP32(a, b) ((gm_s32_t) ((gm_u32_t) (a) - (gm_u32_t) (b)))

/* Compare 16-bit sequence numbers. Return 0 if equal, negative if
   a<b, and positive if a>b, but compensating for wraparound.

   The 16-bit difference is moved into the top half of a 32-bit word
   so that its bit 15 becomes the sign bit.  Both the subtraction and
   the shift are done unsigned, avoiding a left shift of a negative
   value. */

#define SEQ_CMP16(a, b) \
  ((gm_s32_t) (((gm_u32_t) (a) - (gm_u32_t) (b)) << 16))

/* Ethernet address comparison designed for efficient pipelining.

   Compares the six bytes at A with the six bytes at B, lowest index
   first.  Returns 0 when all six are equal; otherwise returns
   a[i] - b[i] for the first index i at which they differ.  The bytes
   for index i+2 are loaded before the difference at index i is
   tested, so each load has time to complete before it is used. */

static inline int
ether_addr_cmp (gm_s8_t * a, gm_s8_t * b)
{
  int p, q, r, s, diff;

  p = a[0];
  q = b[0];
  r = a[1];
  s = b[1];

  diff = p - q;			/* byte 0 */
  if (diff != 0)
    return diff;
  p = a[2];
  q = b[2];

  diff = r - s;			/* byte 1 */
  if (diff != 0)
    return diff;
  r = a[3];
  s = b[3];

  diff = p - q;			/* byte 2 */
  if (diff != 0)
    return diff;
  p = a[4];
  q = b[4];

  diff = r - s;			/* byte 3 */
  if (diff != 0)
    return diff;
  r = a[5];
  s = b[5];

  diff = p - q;			/* byte 4 */
  if (diff != 0)
    return diff;

  return r - s;			/* byte 5 */
}

/* Copy the six-byte ethernet address at A (source) to B
   (destination).  The bytes are moved in two groups of three: all
   three loads of a group are issued before its three stores. */
static inline void
ether_addr_copy (gm_u8_t * a, gm_u8_t * b)
{
  unsigned int x, y, z;
  int base;

  for (base = 0; base < 6; base += 3)
    {
      x = a[base];
      y = a[base + 1];
      z = a[base + 2];
      b[base] = x;
      b[base + 1] = y;
      b[base + 2] = z;
    }
}

/****************************************************************
 * connection ack/nack functions
 ****************************************************************/

/* Special _gm_reack() entry point to pipeline gm_ack()

   Schedule an ack for connection C.  C__ACK_PENDING is the caller's
   already-loaded copy of c->ack_pending:
     - when it is zero, C is pushed onto the front of the list headed
       by gm.first_connection_to_ack and c->ack_pending is set to -1;
     - when it is nonzero, the list is left untouched.
   In both cases NOTICE (ACK_PENDING) is executed and
   c->known_alive_time is set to the RTC value sampled on entry.
   NOTE(review): NOTICE (ACK_PENDING) presumably sets the dispatch
   state bit that makes the send machinery emit the ack -- confirm. */

static inline void
_gm_reack (gm_connection_t * c, unsigned int c__ack_pending)
{
  gm_s32_t rtc;

  /* Sample the clock first; the value is not needed until the end. */
  rtc = RTC;
  /* C must point into the gm_connection[] array. */
  gm_assert (&gm_connection[0] - c <= 0 &&
	     &gm_connection[GM_NUM_PRIORITIES * (gm.max_node_id + 1)] - c >
	     0);
  if (!c__ack_pending)
    {
      c->next_to_ack = gm.first_connection_to_ack;
      gm.first_connection_to_ack = c;
      c->ack_pending = -1;
    }
  NOTICE (ACK_PENDING);
  c->known_alive_time = rtc;
}

/* Schedule the connection's current ack packet for (re)sending to
   the remote end, without modifying the packet itself. */

inline void
gm_reack (gm_connection_t * c)
{
  unsigned int already_pending = c->ack_pending;

  _gm_reack (c, already_pending);
}

/* Ack a message received on a connection: turn the connection's ack
   packet into a GM_ACK_SUBTYPE packet with zero usecs_delay, advance
   its sequence number by one, and schedule it through _gm_reack(). */

static inline void
gm_ack (gm_connection_t * c)
{
  unsigned int prev_seqno, was_pending;

  /* Load both fields up front, ahead of the stores below. */
  prev_seqno = c->ack_packet.sexno.parts.seqno;
  was_pending = c->ack_pending;

  c->ack_packet.subtype = GM_ACK_SUBTYPE;
  c->ack_packet.usecs_delay = 0;
  c->ack_packet.sexno.parts.seqno = prev_seqno + 1;

  _gm_reack (c, was_pending);
}

/****************************************************************
 * ethernet address table functions.
 ****************************************************************
 These functions are used to modify entries in the
 host-memory-resident ethernet address table.  This table directly
 informs the host of GM_ID->ethernet_id mappings, and indirectly
 informs the host of the reverse mappings. */

/* In the host-memory-resident ethernet address table, set the
   ethernet address for GM node NODE_ID to the six bytes at ADDR.

   The table is reached through gm.ethernet.addr_table_piece[], which
   holds one host address per GM_PAGE_LEN bytes of table; each node
   occupies an 8-byte entry.  If the piece covering NODE_ID is zero
   the call does nothing.

   The address is staged in gm.ethernet.addr_stage and DMAed from
   there.  The staging buffer is written only AFTER the DMA engine is
   known to be free, so a still-running DMA that was started from the
   same buffer can never see it change underneath it. */

static inline void
gm_ethernet_addr_table_set (unsigned int node_id, gm_u8_t * addr)
{
  gm_dp_t entry_logical_addr;
  gm_dp_t piece;

  /* compute logical address of addr table entry, aborting if not possible. */
  piece = gm.ethernet.addr_table_piece[node_id / (GM_PAGE_LEN / 8)];

  /* Catch case where MCP supports ethernet, but host does not */
  if (!piece)
    return;

  entry_logical_addr = piece + 8 * (node_id % (GM_PAGE_LEN / 8));

  /* wait for DMA engine to be free before reusing the staging buffer */
  await_free_DMA_engine ();

  /* Copy the ethernet address to a known 8-byte aligned buffer with
     the last two bytes cleared. */
  ether_addr_copy (addr, gm.ethernet.addr_stage);
  gm.ethernet.addr_stage[6] = gm.ethernet.addr_stage[7] = 0;

  /* DMA the address to the host. */
  start_RDMA (&gm.ethernet.addr_stage, entry_logical_addr, 8);
}

/* Write the GM_MAX_HOST_NAME_LEN bytes at NAME into slot NODE_ID of
   the host-resident name table.  The table is reached through
   gm.name_table_piece[], one host address per GM_PAGE_LEN bytes of
   table; if the piece covering NODE_ID is zero nothing is written.
   NAME is handed to start_RDMA() directly, with no staging copy. */

static inline void
gm_name_table_set (unsigned node_id, gm_u8_t * name)
{
  gm_dp_t page, dest;

  page = gm.name_table_piece[node_id / (GM_PAGE_LEN / GM_MAX_HOST_NAME_LEN)];
  if (page)
    {
      /* Host address of this node's slot within the page. */
      dest = page + (GM_MAX_HOST_NAME_LEN
		     * (node_id % (GM_PAGE_LEN / GM_MAX_HOST_NAME_LEN)));

      /* The DMA engine must be idle before a new transfer starts. */
      await_free_DMA_engine ();

      start_RDMA (name, dest, GM_MAX_HOST_NAME_LEN);
    }
}

/* Set the `retired' flag on gm_connection[0] through
   gm_connection[gm.max_node_id], inclusive. */
static inline void
gm_mark_routes_as_retired (void)
{
  unsigned id = 0;

  while (id <= gm.max_node_id)
    {
      gm_connection[id].retired = 1;
      id++;
    }
}

/* Clear irrelevant entries in the host-memory-resident address &
   hostname tables: for every connection up to gm.max_node_id whose
   `retired' flag is still set, clear its route, reset the flag, and
   overwrite its ethernet-address and host-name table entries with
   zeros. */
static inline void
gm_clear_retired_routes (void)
{
  gm_u8_t zero_addr[8];
  /* Oversized so that a DMA-aligned name buffer fits inside it. */
  char name_backing[GM_MAX_HOST_NAME_LEN + GM_DMA_GRANULARITY];
  char *zero_name = GM_DMA_ROUNDUP (lp, name_backing);
  unsigned id;

  gm_bzero (zero_addr, 8);
  gm_bzero (name_backing, sizeof (name_backing));

  for (id = 0; id <= gm.max_node_id; id++)
    {
      if (!gm_connection[id].retired)
	continue;

      GM_CONNECTION_CLEAR_ROUTE (&gm_connection[id]);
      gm_connection[id].retired = 0;
      gm_ethernet_addr_table_set (id, &zero_addr[0]);
      gm_name_table_set (id, zero_name);
    }
}

/****
 * More functions
 ****/

/* Rotate the send queue given the head subport SP and its connection
   C: the subport after SP becomes C's first active send port, and the
   connection after C becomes gm.first_active_connection. */

static inline void
rotate_send_queue (gm_subport_t * sp, gm_connection_t * c)
{
  gm_subport_t *following_subport;
  gm_connection_t *following_connection;

  gm_assert (c->first_active_send_port == sp);

  /* Fetch both successors before storing either. */
  following_subport = sp->next;
  following_connection = c->next_active;
  gm_assert (following_subport);
  gm_assert (following_connection);

  c->first_active_send_port = following_subport;
  gm.first_active_connection = following_connection;
}

/****************
 * Send queue debugging
 ****************/

/* Set to 1 to enable the send-queue consistency checks and debug
   printing in the functions below. */
#define GM_DEBUG_SEND_QUEUE 0

#if GM_DEBUG_SEND_QUEUE
#warning debugging send queue
#endif

/* Counter that incr_subport_cnt() and decr_subport_cnt() adjust and
   print. */
static int gm_debug_send_queue_cnt;
/* Expected value that check_subport_cnt() compares its count to. */
static unsigned int gm_debug_subport_cnt;

static inline void
check_subport_cnt (void)
{
  gm_connection_t *c;
  gm_subport_t *sp;
  unsigned int cnt = 0;

  if (!GM_DEBUG_SEND_QUEUE)
    return;

  c = gm.first_active_connection;
  if (c)
    {
      do
	{
	  sp = c->first_active_send_port;
	  do
	    {
	      cnt++;
	    }
	  while (sp != c->first_active_send_port);

	  c = c->next_active;
	}
      while (c != gm.first_active_connection);
    }
  gm_assert (cnt == gm_debug_subport_cnt);
}

/* When GM_DEBUG_SEND_QUEUE is enabled, bump gm_debug_send_queue_cnt
   and print its new value; otherwise do nothing. */
static inline void
incr_subport_cnt (void)
{
  if (!GM_DEBUG_SEND_QUEUE)
    return;

  gm_printf (GM_STR ("0x%x in send queue after append.\n"),
	     ++gm_debug_send_queue_cnt);
}

/* When GM_DEBUG_SEND_QUEUE is enabled, decrement
   gm_debug_send_queue_cnt and print its new value; otherwise do
   nothing. */
static inline void
decr_subport_cnt (void)
{
  if (!GM_DEBUG_SEND_QUEUE)
    return;

  gm_printf (GM_STR ("0x%x in send queue after remove.\n"),
	     --gm_debug_send_queue_cnt);
}

/****************
 * Send queueing
 ****************/

/* Append send token ST to the send queue of subport SP_ID on the
   connection to node TARGET_NODE_ID.

   One of three paths is taken, chosen from the connection's
   active_subport_bitmask:

   (1) Bit SP_ID is set: the subport is found by walking the
       connection's ring of active subports, and ST is linked after
       that subport's last send token.
   (2) The bitmask is nonzero but bit SP_ID is clear: a subport is
       taken from gm.free_subports, initialized with ST as its only
       token, and inserted into the connection's ring immediately
       before first_active_send_port.
   (3) The bitmask is zero: a subport is allocated and initialized
       the same way, becomes the only member of the connection's
       ring, and the connection is linked into the ring of active
       connections (immediately before gm.first_active_connection,
       or as the only member if there was none).  This path also
       refreshes c->known_alive_time.

   Paths (2) and (3) increment gm_port[port].active_subport_cnt for
   the port derived from SP_ID.

   ST->common.next is never written here; it is asserted to be 0, so
   the caller must pass a token that is not linked to another.

   Statements tagged "pre" or "speculative prefetch" are loads issued
   ahead of the point where their value is needed. */
static inline void
append_send_token_to_send_queue (gm_send_token_t * st, unsigned sp_id,
				 unsigned target_node_id)
{
  gm_connection_t *c;
  gm_subport_t *sp;
  gm_u32_t c__active_subport_bitmask;
  gm_subport_t *gm_free_subports;
  gm_u32_t port_id;

  gm_assert (st);
  gm_assert (target_node_id <= gm.max_node_id);
  check_subport_cnt ();
  GM_INCR_DEBUG_CNT (gm.sends_in_send_queue_cnt);

  c = &gm_connection[target_node_id];
  c__active_subport_bitmask = c->active_subport_bitmask;
  /* speculative prefetch */ gm_free_subports = gm.free_subports;
  /* speculative prefetch */ port_id = GM_SUBPORT_PORT (sp_id);
  if (c__active_subport_bitmask)
    {
      if (GM_DEBUG_SEND_QUEUE)
	gm_assert (gm_debug_send_queue_cnt);

      /* The connection is active and enqueued */
      if (c->active_subport_bitmask & (1 << sp_id))
	{
	  /* The subport is active, allocated, */
	  /* initialized, and queued. */

	  gm_send_token_t *sp__last_send_token;

	  gm_assert (c->first_active_send_port);
	  sp = c->first_active_send_port;
	  ;
	  ;
	  /* NOTE(review): if SP really is 0 here, the search is
	     skipped but sp->last_send_token below is still
	     dereferenced. */
	  if (sp)		/* may be 0 for FT HACK. */
	    {
	      /* Walk the ring until the subport with id SP_ID is found. */
	      while (sp->id != sp_id)
		sp = sp->next;
	    }
	  gm_assert (sp->last_send_token);
	  sp__last_send_token = sp->last_send_token;

	  /* Attach the send token to the subport */
	  st->common.subport = sp;

	  /* Append send token to the send list */
	  gm_assert (sp->last_send_token != st);
	  gm_assert (sp->last_send_token->common.next == 0);
	  sp->last_send_token = st;
	  sp__last_send_token->common.next = st;
	  gm_assert (st->common.next == 0);
	}
      else
	{
	  /* The subport is NOT active, */
	  /* allocated, initialized, or queued. */

	  gm_subport_t *forward, *backward;
	  gm_subport_t *sp__next;
	  gm_u32_t gm_port___active_subport_cnt;
	  gm_s32_t rtc;

	  /* Allocate a subport */
	  gm_assert (gm_free_subports);
	  gm_assert (gm_free_subports == gm.free_subports);
	  sp = gm_free_subports;
	  gm_assert (!sp->disabled);
	  /* pre */ sp__next = sp->next;
	  gm_port___active_subport_cnt = gm_port[port_id].active_subport_cnt;


	  /* Pop the subport off the free list. */
	  gm.free_subports = gm_free_subports = sp__next;

	  /* BAD: could be eliminated... somehow */
	  gm_port[port_id].active_subport_cnt
	    = gm_port___active_subport_cnt + 1;

	  /* Attach the send token to the subport */
	  st->common.subport = sp;

	  /* Initialize the subport */
	  sp->id = sp_id;
	  sp->connection = c;
	  gm_assert (sp->first_send_token == 0);
	  rtc = RTC;
	  sp->first_send_token = sp->last_send_token = st;
	  /* pre */ c__active_subport_bitmask = c->active_subport_bitmask;
	  forward = c->first_active_send_port;
	  gm_assert (st->common.next == 0);
	  gm_assert (sp->disabled == 0);
	  sp->delay_until = rtc;
	  sp->progress_time = rtc;
	  backward = forward->prev;
	  c->active_subport_bitmask =
	    c__active_subport_bitmask | (1 << sp_id);

	  gm_assert (c->first_active_send_port);
	  gm_assert (c->first_active_send_port->next);
	  gm_assert (c->first_active_send_port->prev);

	  /* Splice the subport into the ring just before the head. */
	  sp->next = forward;
	  sp->prev = backward;
	  forward->prev = backward->next = sp;
	  /* Connection already in active list */

	  incr_subport_cnt ();
	}
    }
  else
    {
      /* The connection is not active or queued */

      gm_subport_t *sp__next;
      gm_u32_t gm_port___active_subport_cnt;
      gm_s32_t rtc;
      gm_connection_t *__gm_first_active_connection;

      /* Allocate a subport */
      gm_assert (gm_free_subports);
      gm_assert (gm_free_subports == gm.free_subports);
      sp = gm_free_subports;
      gm_assert (!sp->disabled);
      /* pre */ sp__next = sp->next;
      /* pre */ gm_port___active_subport_cnt
	= gm_port[port_id].active_subport_cnt;
      /* Pop the subport off the free list. */
      gm.free_subports = gm_free_subports = sp__next;

      gm_port[port_id].active_subport_cnt = gm_port___active_subport_cnt + 1;

      /* Attach the send token to the subport */
      st->common.subport = sp;

      /* Initialize the subport */
      sp->id = sp_id;
      sp->connection = c;
      gm_assert (sp->first_send_token == 0);
      rtc = RTC;
      sp->first_send_token = sp->last_send_token = st;
      gm_assert (st->common.next == 0);
      gm_assert (sp->disabled == 0);
      sp->delay_until = rtc;
      sp->progress_time = rtc;
      c->known_alive_time = rtc;

      /* Insert the subport in the active list for the connection */
      /* pre */ __gm_first_active_connection = gm.first_active_connection;
      c->active_subport_bitmask = 1 << sp_id;
      sp->next = sp->prev = sp;
      c->first_active_send_port = sp;

      /* Insert connection in active connection list if needed. */
      if (__gm_first_active_connection)
	{
	  gm_connection_t *forw, *back;

	  if (GM_DEBUG_SEND_QUEUE)
	    gm_assert (gm_debug_send_queue_cnt);

	  /* Splice the connection into the ring just before the head. */
	  forw = __gm_first_active_connection;
	  back = forw->prev_active;
	  c->next_active = forw;
	  forw->prev_active = back->next_active = c;
	  c->prev_active = back;
	}
      else
	{
	  if (GM_DEBUG_SEND_QUEUE)
	    gm_assert (!gm_debug_send_queue_cnt);

	  /* First active connection: a ring of one. */
	  gm.first_active_connection = c;
	  c->next_active = c->prev_active = c;
	}

      incr_subport_cnt ();
    }

  /* Postconditions common to all three paths. */
  gm_assert (st);
  gm_assert (st->common.next == 0);
  gm_assert (sp);
  gm_assert (c);
  gm_assert (st->common.subport == sp);
  gm_assert (sp->connection == c);
  gm_assert (sp->id == sp_id);
  gm_assert (sp->first_send_token);
  gm_assert (sp->last_send_token == st);
  gm_assert (c->active_subport_bitmask & (1 << sp_id));
  check_subport_cnt ();
}

/* Remove ST, which must be the first send token queued on its subport,
   from the send queue.  SP_ID must be the id of that subport (asserted).

   If ST was the only token on the subport, the subport itself is
   retired: it is unlinked from its connection's circular list of active
   subports, its bit is cleared in C->ACTIVE_SUBPORT_BITMASK, it is
   pushed onto GM.FREE_SUBPORTS, and the owning port's
   ACTIVE_SUBPORT_CNT is decremented.  If it was also the connection's
   only active subport, the connection is unlinked from the circular
   list of active connections, and NOTICE_NO (SDMA_PENDING) is invoked
   when that list becomes empty.

   SP->LAST_SEND_TOKEN is never written here.  Lines tagged "pre" /
   "prefetch" load values ahead of their use. */
static inline void
remove_first_send_token_from_send_queue (gm_send_token_t * st, unsigned sp_id)
{
  gm_subport_t *sp;
  gm_subport_t *sp__next;
  gm_connection_t *c;
  gm_send_token_t *sp__first_send_token;
  gm_subport_t *gm_free_subports;
  gm_u32_t gm_port___active_subport_cnt;
  gm_send_token_t *st__common_next;

  check_subport_cnt ();
  GM_DECR_DEBUG_CNT (gm.sends_in_send_queue_cnt);

  gm_assert (st);
  /* Remove send token from send list */
  sp = st->common.subport;
  /* pre */ st__common_next = st->common.next;
  /* pre */ gm_free_subports = gm.free_subports;
  /* NOTE(review): SP is dereferenced here, before the gm_assert (sp)
     below gets a chance to fire. */
  /* pre */ c = sp->connection;

  gm_assert (sp);
  gm_assert (sp->first_send_token);
  gm_assert (sp->first_send_token == st);
  gm_assert (sp->id == sp_id);
  gm_assert (sp->connection);
  gm_assert (sp->connection->active_subport_bitmask & (1 << sp_id));

  /* Advance the subport's token list past ST. */
  sp->first_send_token = sp__first_send_token = st__common_next;
  /* pre */ sp__next = sp->next;
  if (!sp__first_send_token)
    {
      gm_connection_t *c__next_active;
      gm_u32_t sp__id;

      /* Remove the subport from the send queue. */

      gm_assert (c);
      gm_assert (c == sp->connection);

      /* prefetch */ c__next_active = c->next_active;
      /* prefetch */ sp__id = sp->id;

      gm_assert (sp__next);
      /* A subport that is its own successor is alone in the circular
         list of active subports. */
      if (sp__next == sp)
	{
	  gm_connection_t *gm_first_active_connection;

	  /* Remove the last subport for the connection. */
	  /* Mark connection as inactive */
	  gm_assert (c->active_subport_bitmask == 1U << sp_id);
	  /* pre */ gm_first_active_connection = gm.first_active_connection;
	  c->active_subport_bitmask = 0;

	  /* Remove connection from send list, marking the send list
	     as empty if it is empty. */
	  if (c__next_active == c)
	    {
	      gm.first_active_connection = 0;
	      NOTICE_NO (SDMA_PENDING);
	    }
	  else
	    {
	      gm_connection_t *forward, *backward;
	      backward = c->prev_active;
	      gm_assert (gm_first_active_connection);
	      /* Keep the list head valid if C was the head. */
	      if (gm_first_active_connection == c)
		gm.first_active_connection = c__next_active;

	      forward = c__next_active;
	      gm_assert (forward);
	      gm_assert (backward);
	      forward->prev_active = backward;
	      backward->next_active = forward;
	    }
	}
      else
	{
	  /* Remove a subport for the connection, but it's not the last one. */
	  gm_subport_t *forward, *backward;
	  gm_u32_t c__active_subport_bitmask;

	  backward = sp->prev;
	  c__active_subport_bitmask = c->active_subport_bitmask;
	  gm_assert (c->first_active_send_port);
	  forward = sp__next;
	  /* Keep the connection's first-subport pointer valid. */
	  if (c->first_active_send_port == sp)
	    c->first_active_send_port = sp__next;

	  gm_assert (forward);
	  gm_assert (backward);
	  forward->prev = backward;
	  backward->next = forward;

	  /* The bit is asserted to be set, so XOR clears it. */
	  gm_assert (c->active_subport_bitmask & 1 << sp_id);
	  c->active_subport_bitmask =
	    c__active_subport_bitmask ^ (1 << sp_id);
	}
      /* Free the subport. */

      /* pre */ gm_port___active_subport_cnt
	= gm_port[GM_SUBPORT_PORT (sp__id)].active_subport_cnt;

      /* Push SP on the free list; GM_FREE_SUBPORTS is the head that was
         prefetched on entry. */
      sp->next = gm_free_subports;
      gm.free_subports = sp;

      /* BAD: could be eliminated... somehow */
      /* The load above indexes by SP__ID and this store by SP_ID; the
         two are asserted equal earlier in this function. */
      gm_port[GM_SUBPORT_PORT (sp_id)].active_subport_cnt
	= gm_port___active_subport_cnt - 1;

      decr_subport_cnt ();
    }

  check_subport_cnt ();

  LOG_DISPATCH (141, "removed first send token from send queue");
}

/* General-purpose removal of ST from its subport's list of send tokens.
   Unlike remove_first_send_token_from_send_queue(), ST need not be at
   the head of the list: a datagram send token may be queued behind
   reliable send tokens.  When ST does turn out to be the head, the work
   is delegated to remove_first_send_token_from_send_queue().  ST is
   expected to be on the list; the scan does not stop at the list end. */

static inline void
remove_send_token_from_send_queue (gm_send_token_t *st, unsigned sp_id)
{
  gm_subport_t *subport = st->common.subport;
  gm_send_token_t *successor = st->common.next;
  gm_send_token_t *head = subport->first_send_token;
  gm_send_token_t *pred;

  gm_assert (head);

  /* Head of the list: the specialized routine handles everything. */
  if (head == st)
    {
      remove_first_send_token_from_send_queue (st, sp_id);
      return;
    }

  /* Walk forward to the token whose COMMON.NEXT is ST. */
  pred = head;
  while (pred->common.next != st)
    {
      gm_assert (pred->common.next);
      pred = pred->common.next;
    }

  /* Unlink ST, and repair the tail pointer if ST was the tail. */
  pred->common.next = successor;
  if (!successor)
    subport->last_send_token = pred;

  LOG_DISPATCH (142, "removed send token from send queue (not first)");
  GM_DECR_DEBUG_CNT (gm.sends_in_send_queue_cnt);
}

/*************************************
 * Recv Token Hash Table
 *************************************/

/* Map KEY to a bin index in [0, GM_RECV_TOKEN_HASH_BINS), which must be
   a power of two.  The 32-bit key is folded onto itself with shifts of
   11, 22 and then 5 bits before being masked down. */
static inline const unsigned
rt_hash (gm_recv_token_key_t key)
{
  gm_u32_t h;

  gm_assert (GM_POWER_OF_TWO (GM_RECV_TOKEN_HASH_BINS));

  h = key.whole;
  h ^= (h >> 11) ^ (h >> 22);
  h ^= h >> 5;
  h &= GM_RECV_TOKEN_HASH_BINS - 1;
  return h;
}

/* Build the hash key for a receive token: the sender node id shifted
   left 16 bits, the sender subport id shifted left 8 bits, and the
   target subport id, all OR-ed together. */
static inline gm_recv_token_key_t
recv_token_key (unsigned int sender_node_id,
		unsigned int sender_subport_id,
		unsigned int target_subport_id)
{
  gm_recv_token_key_t key;
  unsigned int packed;

  packed = sender_node_id << 16;
  packed |= sender_subport_id << 8;
  packed |= target_subport_id;

  key.whole = packed;
  return key;
}

/* Record TOKEN in the receive token hash table under the key formed
   from the three ids.  TOKEN becomes the new head of its bin's chain. */
static inline void
recv_token_hash_insert (gm_recv_token_t * token, unsigned sender_node_id,
			unsigned sender_subport_id,
			unsigned target_subport_id)
{
  gm_recv_token_key_t key
    = recv_token_key (sender_node_id, sender_subport_id, target_subport_id);
  gm_recv_token_t **head = &gm.recv_token_bin[rt_hash (key)];

  token->key = key;
  token->next = *head;
  *head = token;

  GM_INCR_DEBUG_CNT (gm.hashed_token_cnt);
}

/* Look up, without removing, the receive token hashed under the given
   ids.  Returns the token, or 0 if there is none; a miss is reported
   with gm_printf() when GM_DEBUG is set. */
static inline gm_recv_token_t *
recv_token_hash_get (unsigned sender_node_id, unsigned sender_subport_id,
		     unsigned target_subport_id)
{
  gm_recv_token_key_t key;
  gm_recv_token_t *scan;

  key = recv_token_key (sender_node_id, sender_subport_id, target_subport_id);

  for (scan = gm.recv_token_bin[rt_hash (key)]; scan; scan = scan->next)
    {
      if (scan->key.whole == key.whole)
	return scan;
    }

  if (GM_DEBUG)
    {
      gm_printf (GM_STR
		 ("no recv token for node 0x%x subport 0x%x"
		  " target_subport 0x%x\n"),
		 sender_node_id, sender_subport_id, target_subport_id);
    }
  return 0;
}

/* Unlink and return the receive token hashed under the given ids, or
   return 0 (after a gm_printf() diagnostic when GM_DEBUG is set) if no
   such token is in the table.  The returned token's NEXT field is left
   as it was.

   This version never reads through a null pointer: an empty bin and the
   end of a chain are both detected before any token field is loaded.
   The debug count of hashed tokens is decremented on removal, mirroring
   the increment in recv_token_hash_insert(). */
static inline gm_recv_token_t *
recv_token_hash_remove (unsigned sender_node_id, unsigned sender_subport_id,
			unsigned target_subport_id)
{
  gm_recv_token_t **holder, *token;
  gm_recv_token_key_t key;

  key = recv_token_key (sender_node_id, sender_subport_id, target_subport_id);
  holder = &gm.recv_token_bin[rt_hash (key)];

  /* HOLDER always addresses the pointer that refers to TOKEN, so that
     unlinking is a single store. */
  while ((token = *holder) != 0)
    {
      if (token->key.whole == key.whole)
	{
	  *holder = token->next;
	  GM_DECR_DEBUG_CNT (gm.hashed_token_cnt);
	  return token;
	}
      holder = &token->next;
    }
  if (GM_DEBUG)
    gm_printf (GM_STR
	    ("no recv token for node 0x%x subport 0x%x target_subport 0x%x\n"),
	    sender_node_id, sender_subport_id, target_subport_id);
  return 0;
}

/* Silent variant of recv_token_hash_get(): return the receive token
   hashed under the given ids, or 0 if none, without printing anything. */
static inline gm_recv_token_t *
recv_token_hash_verify (unsigned sender_node_id, unsigned sender_subport_id,
			unsigned target_subport_id)
{
  gm_recv_token_key_t key;
  gm_recv_token_t *scan;

  key = recv_token_key (sender_node_id, sender_subport_id, target_subport_id);

  for (scan = gm.recv_token_bin[rt_hash (key)]; scan; scan = scan->next)
    {
      if (scan->key.whole == key.whole)
	break;
    }
  return scan;
}

/* Sweep every bin of the receive token hash table and, for each token
   whose target subport belongs to PORT, unlink it and push it onto
   gm_port[PORT].free_recv_tokens. */

/* Set nonzero to trace entry to and exit from the sweep. */
#define GM_DEBUG_RECV_TOKEN_HASH_REMOVE_PORT_REFERENCES 0

static inline void
recv_token_hash_remove_port_references (unsigned int port)
{
  gm_recv_token_t **link, *entry;
  int bin;

  if (GM_DEBUG_RECV_TOKEN_HASH_REMOVE_PORT_REFERENCES)
    {
      gm_printf (GM_STR ("entered " __FUNCTION__ "\n"));
      fflush (stdout);
    }

  for (bin = 0; bin < GM_RECV_TOKEN_HASH_BINS; bin++)
    {
      /* LINK addresses the pointer referring to ENTRY, so an entry can
         be spliced out without tracking its predecessor. */
      link = &gm.recv_token_bin[bin];
      while ((entry = *link) != 0)
	{
	  if (GM_SUBPORT_PORT (entry->key.parts.target_subport_id) != port)
	    {
	      /* Belongs to another port: step over it. */
	      link = &entry->next;
	      continue;
	    }

	  /* Splice out the entry and return it to the port's free list.
	     LINK is left alone so that the successor is examined next. */
	  *link = entry->next;
	  entry->next = gm_port[port].free_recv_tokens;
	  gm_port[port].free_recv_tokens = entry;
	  GM_DECR_DEBUG_CNT (gm.hashed_token_cnt);
	}
    }

  if (GM_DEBUG_RECV_TOKEN_HASH_REMOVE_PORT_REFERENCES)
    {
      gm_printf (GM_STR ("leaving " __FUNCTION__ "\n"));
      fflush (stdout);
    }
}

/******************************
 * Host <-> LANai communication
 ******************************/

/****************
 * interrupts
 ****************/

/* Spin until the previously raised host interrupt is no longer pending.

   Loops while HOST_SIG_BIT is set in ISR, bumping gm.while_waiting and
   calling gm_flash_while_waiting (LED_msg) on every iteration.  When
   GM_ENABLE_PARANOID_INTERRUPTS is set, it then also waits for
   gm.interrupt.type to read GM_NO_INTERRUPT, setting HOST_SIG_BIT again
   whenever it is found clear in the meantime.  Without that option the
   second loop is compiled out and LED_msg is only marked as possibly
   unused. */
static inline void
_await_interrupt_completion (char *LED_msg)
{
  while (get_ISR () & HOST_SIG_BIT)
    {
      gm.while_waiting++;
      gm_flash_while_waiting (LED_msg);
    }
  GM_STBAR ();
#if GM_ENABLE_PARANOID_INTERRUPTS
  /* The signal bit is clear, but the interrupt type has not been reset
     yet: keep re-raising the signal until it is. */
  while (gm.interrupt.type != GM_NO_INTERRUPT)
    {
      if ((get_ISR () & HOST_SIG_BIT) == 0)
	set_ISR (HOST_SIG_BIT);
      gm.while_waiting++;
      gm_flash_while_waiting (LED_msg);
      GM_STBAR ();
    }
#else
  GM_PARAMETER_MAY_BE_UNUSED (LED_msg);
#endif
}

/* Raise a host interrupt of kind TYPE: store TYPE in gm.interrupt.type
   and set HOST_SIG_BIT in ISR.  No check is made here that an earlier
   interrupt has completed.  On L7 builds the bit is set repeatedly
   until it is observed set or gm.interrupt.type has gone back to
   GM_NO_INTERRUPT. */
static void
_gm_interrupt (gm_u32_t type)
{
  GM_STBAR ();
  gm.interrupt.type = type;
  set_ISR (HOST_SIG_BIT);
#if L7
  /* Writing the HOST_SIG_BIT (or any other _sig_bit) in the LANai7
     can fail, so retry setting it until we succeed.  Check
     gm.interrupt.type to make sure the interrupt was not legitimately
     claimed.  This may seem like overkill at first glance, but
     considering that DMA and Packet Interface traffic can starve the
     LANai CPU, it is necessary to guarantee correct operation. */
  {
    /* NOTE(review): this local TYPE shadows the function parameter of
       the same name; inside this block it holds the value re-read from
       gm.interrupt.type, not the caller's argument. */
    gm_u32_t isr, type;		/* temp vars for pipelining */

    isr = get_ISR ();
    type = gm.interrupt.type;
    while ((isr & HOST_SIG_BIT) == 0 && type != GM_NO_INTERRUPT)
      {
	set_ISR (HOST_SIG_BIT);
	isr = get_ISR ();
	type = gm.interrupt.type;
      }
  }
#endif
}

/* Send the message ERROR to the host as a GM_PRINT_INTERRUPT, for
   reporting misuse of the interrupt entry points.  The print string
   that was in the interrupt structure beforehand is put back once the
   message interrupt has completed. */
static void
_gm_interrupt_error (char *error)
{
  char *interrupted_string;

  /* Do not disturb an interrupt that is still being handled. */
  _await_interrupt_completion (__FUNCTION__);

  interrupted_string = gm.interrupt.print.string;
  gm.interrupt.print.string = error;
  _gm_interrupt (GM_PRINT_INTERRUPT);
  _await_interrupt_completion (__FUNCTION__);

  gm.interrupt.print.string = interrupted_string;
}

static int _prepared_to_interrupt;

/*
 * Interrupt entry points: Call prepare_to_interrupt() before touching
 * the interrupt structure to ensure that the preceding interrupt has
 * been handled, then call gm_interrupt() when done writing it to
 * generate the interrupt.  Finally, you may call
 * await_interrupt_completion() to wait until that interrupt is no
 * longer pending.
 */

/* First step of raising a host interrupt: wait until no earlier
   interrupt is pending, so that the interrupt structure may be written.
   In GM_DEBUG builds a second call without an intervening
   gm_interrupt() is reported to the host as an error. */
inline void
prepare_to_interrupt (char *LED_msg)
{
  if (GM_DEBUG)
    {
      if (_prepared_to_interrupt)
	_gm_interrupt_error (GM_STR
			     ("recursive interrupt preparation detected."));
    }

  _await_interrupt_completion (LED_msg);

  if (GM_DEBUG)
    {
      _prepared_to_interrupt = 1;
    }
}

/* Second step of raising a host interrupt: fire an interrupt of kind
   TYPE once the interrupt structure has been filled in.  In GM_DEBUG
   builds, firing without a preceding prepare_to_interrupt() is reported
   to the host as an error first.  The "prepared" flag is cleared
   unconditionally. */
void
gm_interrupt (gm_u32_t type)
{
  if (GM_DEBUG)
    {
      if (!_prepared_to_interrupt)
	_gm_interrupt_error (GM_STR ("unprepared interrupt"));
    }

  _gm_interrupt (type);
  _prepared_to_interrupt = 0;
}

/* Optional final step: wait until the interrupt raised by
   gm_interrupt() is no longer pending.  In GM_DEBUG builds, calling
   this between prepare_to_interrupt() and gm_interrupt() is reported to
   the host as an error. */
inline void
await_interrupt_completion (char *LED_msg)
{
  if (GM_DEBUG)
    {
      if (_prepared_to_interrupt)
	_gm_interrupt_error (GM_STR
			     ("awaiting interrupt while prepared for"
			      " interrupt\n"));
    }

  _await_interrupt_completion (LED_msg);
}

/****************
 * putstring
 ****************/

#ifndef gm_putstring
/* this is #defined to be nothing in gmcp.h if SRAM<=512 */
/* Hand the string MY_STR to the host by raising a GM_PRINT_INTERRUPT.
   Only the pointer is stored in the interrupt structure, and this
   function does not wait for the interrupt to complete, so MY_STR must
   stay valid after the call returns. */
void
gm_putstring (char *my_str)
{
  prepare_to_interrupt ("gm_putstring ");
  gm.interrupt.print.string = my_str;
  gm_interrupt (GM_PRINT_INTERRUPT);
}
#endif

/* Clear PORT's wake request and raise a GM_WAKE_INTERRUPT carrying the
   port's id.  The request flag is cleared before the interrupt is
   prepared. */
static inline void
wake_port (gm_port_protected_lanai_side_t * port)
{
  port->wake_host = 0;

  /* Tell host driver to wake port. */

  /* "333" is the LED message used while waiting for the previous
     interrupt to finish. */
  prepare_to_interrupt ("333");
  gm.interrupt.wake.port = port->id;
  gm_interrupt (GM_WAKE_INTERRUPT);
}

/* Wake the host for PORT, but only if a wake has been requested
   (PORT->WAKE_HOST is nonzero). */
static inline void
wake_port_if_needed (gm_port_protected_lanai_side_t * port)
{
  if (!port->wake_host)
    return;

  wake_port (port);
}

/* Accessors for a port's receive queue.  PORT is a pointer expression;
   it is parenthesized at every use so that arguments such as `&p' or
   `cond ? a : b' expand correctly.  PORT is evaluated more than once,
   so it must not have side effects. */

/* DMA address of the receive queue slot currently selected by
   PORT->recv_queue_slot_num. */
#define RECV_QUEUE_SLOT_DMA_ADDR(port)					\
     ((port)->recv_queue_slot_dma_addr[(port)->recv_queue_slot_num])

/* Host address of the receive queue slot currently selected by
   PORT->recv_queue_slot_num. */
#define RECV_QUEUE_SLOT_HOST_ADDR(port)					\
     ((port)->recv_queue_slot_host_addr[(port)->recv_queue_slot_num])

/* Step PORT to the next receive queue slot, wrapping to slot 0 after
   slot GM_NUM_RECV_QUEUE_SLOTS - 1. */
#define RECV_QUEUE_ADVANCE(port) do {					\
       if (++(port)->recv_queue_slot_num >= GM_NUM_RECV_QUEUE_SLOTS)	\
	 (port)->recv_queue_slot_num = 0;				\
} while (0)

/* Place a trivial event in the receive queue in host memory.

   TYPE must have the GM_NEW_NO_RECV_EVENT bit set (asserted).  TYPE is
   written to the staging word gm.report_dma_stage, which is then DMAed
   to the last GM_RDMA_GRANULARITY bytes of PORT's current receive queue
   slot.  The function waits for that DMA to finish, wakes the host if
   PORT has asked for it, and advances PORT to its next receive queue
   slot. */

static inline void
gm_report (gm_port_protected_lanai_side_t * port,
	   enum gm_recv_event_type type)
{
  /* Verify user legitimate "NEW" type was specified. */

  gm_assert (GM_POWER_OF_TWO (GM_NEW_NO_RECV_EVENT));
  gm_assert (type & GM_NEW_NO_RECV_EVENT);

  /* Wait for DMA interface to be available */

  await_free_DMA_engine ();

  /* Report the event */
  gm.report_dma_stage.type = type;

  /* The staging area is exactly one DMA granule, and it lands at the
     very end of the host's slot. */
  gm_assert (sizeof (gm.report_dma_stage) == GM_RDMA_GRANULARITY);
  start_GRANULAR_RDMA (&gm.report_dma_stage,
		       (RECV_QUEUE_SLOT_DMA_ADDR (port)
			+ sizeof (gm_recv_queue_slot_t)
			- GM_RDMA_GRANULARITY), GM_RDMA_GRANULARITY);

  /* Wait for the DMA to complete before waking the host (and before
     the single staging area could be reused). */
  await_free_DMA_engine ();

  wake_port_if_needed (port);

  /* Advance the receive queue */

  RECV_QUEUE_ADVANCE (port);
}

/* Non-inline version of gm_report() to save space for the
   non-performance critical error reporting task.  Reports the event
   TYPE to PORT exactly as gm_report() does. */

void
gm_report_error (gm_port_protected_lanai_side_t * port,
		 enum gm_recv_event_type type)
{
  gm_report (port, type);
}

/* Cancel PORT's alarm, if one is set: unlink PORT from the singly
   linked list headed by gm.first_port_with_alarm (if it is found there)
   and clear PORT->ALARM_SET.  Does nothing when no alarm is set. */
static inline void
gm_unset_alarm (gm_port_protected_lanai_side_t * port)
{
  gm_port_protected_lanai_side_t **link;

  if (port->alarm_set)
    {
      /* Stop at the pointer that refers to PORT, or at the list end. */
      link = &gm.first_port_with_alarm;
      while (*link && *link != port)
	link = &(*link)->next_with_alarm;

      if (*link)
	*link = port->next_with_alarm;

      port->alarm_set = 0;
    }
}

/* Arm an alarm for PORT with alarm time TIME by pushing PORT onto the
   front of the list headed by gm.first_port_with_alarm.  No check is
   made for PORT already being on that list. */
static inline void
gm_lanai_set_alarm (gm_port_protected_lanai_side_t * port, gm_s32_t time)
{
  gm_port_protected_lanai_side_t *old_head;

  /* BAD: Should sort alarms by time */

  old_head = gm.first_port_with_alarm;
  port->next_with_alarm = old_head;
  gm.first_port_with_alarm = port;

  /* the "tick" time is .5 microseconds */
  port->alarm_time = time;
  port->alarm_set = 1;
}

/* DMA port P's accumulated sent token reports to the host.

   The reports occupy the tail of P->SENT, from P->SENT_SLOT to the end
   of the structure.  The entry at P->SENT_SLOT is given a zero token to
   terminate the list, the byte count is rounded with GM_RDMA_ROUNDUP,
   and that many bytes from the end of P->SENT are DMAed to the end of
   P's current receive queue slot.  The receive queue is then advanced,
   the host is woken if it asked to be, and P->SENT_SLOT is reset to 0
   to mark the report list as empty.  P->SENT_SLOT must be nonzero on
   entry (asserted). */
static inline void
update_host_sent_queue (gm_port_protected_lanai_side_t * p)
{
  unsigned int cnt;
  gm_dp_t ear;
  gm_lp_t lar;
  struct _gm_sent_token_report *p__sent_slot;

  GM_PRINT (GM_DEBUG_SEND_TOKENS,
	    ("updating sent queue for port %d\n", p->id));

  p__sent_slot = p->sent_slot;
  ;
  ;
  gm_assert ((char *) p__sent_slot + GM_NUM_SEND_QUEUE_SLOTS + 2
	     >= (char *) &p->sent + 1);

  /* null terminate the list */

  gm_assert (p->sent_slot);
  p__sent_slot->token = 0;

  /* determine event to DMA */

  /* CNT is a byte count: the distance from the terminating entry to
     the end of P->SENT, rounded by GM_RDMA_ROUNDUP.  LAR is the local
     address that lies CNT bytes before the end of P->SENT. */
  cnt =
    GM_RDMA_ROUNDUP (u32, (char *) (&p->sent + 1) - (char *) p__sent_slot);
  lar = (char *) (&p->sent + 1) - cnt;

  if (GM_DEBUG_SEND_TOKENS)
    {
      /* NOTE(review): CNT counts bytes, not tokens, so this test does
         not appear to distinguish one token from several -- confirm. */
      if (cnt > 1)
	{
	  LOG_DISPATCH (143, "passing multiple send tokens to host");
	}
      else
	{
	  LOG_DISPATCH (144, "passing 1 send tokens to host");
	}
    }

  GM_PRINT (GM_DEBUG_SEND_TOKENS, ("event looks like:"));
  if (GM_DEBUG_SEND_TOKENS)
    {
      gm_hex_dump (lar, cnt);
    }

  /* determine where to DMA it. */

  /* The data is placed so that it ends exactly at the end of the
     host's receive queue slot. */
  ear = (RECV_QUEUE_SLOT_DMA_ADDR (p) + sizeof (gm_recv_queue_slot_t) - cnt);

  /* DMA the GM_SENT_EVENT to the host. */

  await_free_DMA_engine ();
  start_GRANULAR_RDMA (lar, ear, cnt);

  RECV_QUEUE_ADVANCE (p);

  /* BAD: could eliminate this wait by using two staging areas. */

  await_free_DMA_engine ();

  /* If needed inform the host of the completed sends. */
  wake_port_if_needed (p);

  /* Rewind the sent list. */
  p->sent_slot = 0;
}

/* Flush the pending sent-message reports of every port to the host,
   generating interrupts if needed.  The timer handler calls this
   periodically.  Sent notifications are batched because they are not
   latency sensitive, because batching uses the I/O bus more
   efficiently, and because keeping the queue in host memory lets the
   host poll for sent messages cheaply.

   The list headed by gm.first_port_with_sent_packets is detached (the
   head is zeroed) before any port is processed. */

static inline void
update_host_sent_queues (void)
{
  gm_port_protected_lanai_side_t *port;

  port = gm.first_port_with_sent_packets;
  if (!port)
    return;

  /* Detach the whole list, then flush each port on it in turn. */
  gm.first_port_with_sent_packets = 0;
  while (port)
    {
      update_host_sent_queue (port);
      port = port->next_with_sent_packets;
    }
}

/* Return a pointer to a pretend send queue slot positioned so that the
   slot ends exactly where gm.volatile_zero ends; only the tail of the
   slot that overlaps gm.volatile_zero is real storage.  gm.volatile_zero
   is asserted to be 0 and GM_NO_SEND_EVENT is asserted to be 0.
   NOTE(review): presumably the slot's trailing field is its event type,
   so the fake slot always reads as "no send event" -- confirm against
   the definition of struct gm_send_queue_slot. */
static inline struct gm_send_queue_slot *
FAKE_SEND_QUEUE_SLOT (void)
{
  gm_assert (gm.volatile_zero == 0);
  gm_assert (GM_NO_SEND_EVENT == 0);
  return ((struct gm_send_queue_slot *) (&gm.volatile_zero + 1) - 1);
}

/* Return 1 if ST is on PORT's list of free send tokens, 0 otherwise.
   Used as a sanity check before a token is freed.

   While scanning, asserts that the list holds no more than
   GM_NUM_SEND_QUEUE_SLOTS tokens, which catches a corrupted (for
   example circular) free list.  The counter is advanced outside the
   assertion so that the function does not depend on gm_assert()
   evaluating its argument. */
static inline int
send_token_in_free_list (gm_send_token_t * st,
			 gm_port_protected_lanai_side_t * port)
{
  gm_send_token_t *scan;
  int cnt = 0;

  for (scan = port->first_free_send_token; scan; scan = scan->common.next)
    {
      if (scan == st)
	return 1;
      gm_assert (cnt < GM_NUM_SEND_QUEUE_SLOTS);
      cnt++;
    }
  return 0;
}

/* Pass the sent token back to the host.

   Queues a report (TOKEN, STATUS) in PORT's sent report list, which is
   filled from the last element of PORT->SENT.REPORT downwards.  TOKEN
   must be in the range 1..GM_NUM_SEND_QUEUE_SLOTS and at most 255
   (asserted).  If the list is not yet started, PORT is first put on the
   list of ports with sent packets; if the list is full, all pending
   lists are flushed to the host and the operation is retried.

   After queueing, all pending lists are flushed immediately if
   FORCE_UPDATE is set, if DMA_INT_BIT was set in ISR, or if PORT had a
   wake request pending. */

static inline void
pass_sent_token_to_port (unsigned int token,
			 gm_port_protected_lanai_side_t * port,
			 gm_boolean_t force_update, gm_status_t status)
{
  struct _gm_sent_token_report *port__sent_slot;
  unsigned int port__wake_host;
  gm_port_protected_lanai_side_t *gm_first_port_with_sent_packets;
  gm_u32_t isr;

  GM_INCR_DEBUG_CNT (gm.sent_tokens_queued_for_host_cnt);

  /* Retry point after a full report list has been flushed; the
     prefetched values below are reloaded on each pass. */
start_over:

  GM_PRINT (GM_DEBUG_SEND_TOKENS,
	    ("passing token for slot 0x%x to port\n", token - 1));
  gm_assert (token);
  gm_assert (token <= GM_NUM_SEND_QUEUE_SLOTS);
  gm_assert (token <= 255);
  /* pre */ port__sent_slot = port->sent_slot;
  /* pre */ gm_first_port_with_sent_packets = gm.first_port_with_sent_packets;
  /* pre */ port__wake_host = port->wake_host;

  /* If needed, add this port to the list of ports with sent messages
     to be passed to the host. */

  if (port__sent_slot == 0)
    {
      GM_PRINT (GM_DEBUG_SEND_TOKENS, ("starting new sent report list\n"));

      /* A zero SENT_SLOT means "no list yet": start at the last report
         element and work downwards. */
      port__sent_slot
	= &port->sent.report[GM_NUM_ELEM (port->sent.report) - 1];
      port->next_with_sent_packets = gm_first_port_with_sent_packets;
      gm.first_port_with_sent_packets = port;
    }

  /* If there is no room for the report, then generate a sent tokens
     event to reset the list and try again. */
  else if (port__sent_slot == &port->sent.report[0])
    {
      GM_PRINT (GM_DEBUG_SEND_TOKENS, ("flushing full sent report list\n"));

      update_host_sent_queues ();
      goto start_over;
    }


  /* pre */ isr = get_ISR ();
  port__sent_slot->token = token;
  port__sent_slot->status = status;
  port__sent_slot--;
  port->sent_slot = port__sent_slot;

  /* If the DMA interface is idle or a wake has been requested for
     this port, immediately pass the sent messages to the host and
     wake the port. */

  /* `|' binds tighter than `||': this reads as
     force_update || ((isr & DMA_INT_BIT) | port__wake_host). */
  if (force_update || (isr & DMA_INT_BIT) | port__wake_host)
    update_host_sent_queues ();
}

/* Append send token ST to the tail of port P's free send token list.
   ST must not already be on that list (asserted).

   If the list was empty, P->SEND_TOKEN_QUEUE_SLOT is also pointed at
   the entry of the port's send token queue that has the same index as
   ST has within P->_SEND_TOKENS. */
static inline void
free_send_token (gm_send_token_t * st, gm_port_protected_lanai_side_t * p)
{
  gm_send_token_t *last, *first;

  /* Free token, highly optimized for L7 pipelining */

  gm_assert (!send_token_in_free_list (st, p));

  first = p->first_free_send_token;
  last = p->last_free_send_token;
  p->last_free_send_token = st;
  st->common.next = 0;
  if (!first)
    {
      gm_port_unprotected_lanai_side_t *PORT;

      /* pre */ PORT = p->PORT;
      /* FIRST is not read again after this assignment. */
      first = st;
      /* Make LAST a pretend token whose COMMON.NEXT field coincides
         with P->FIRST_FREE_SEND_TOKEN, so that the unconditional store
         at the end of this function sets the list head. */
      last = (gm_send_token_t *) ((char *) &p->first_free_send_token
				  - GM_OFFSETOF (gm_send_token_t,
						 common.next));
      p->send_token_queue_slot
	= &PORT->send_token_queue[st - &p->_send_tokens[0]];
      GM_PRINT (GM_DEBUG_SEND_TOKENS,
		("looking for next send in slot 0x%x\n",
		 (int) (st - &p->_send_tokens[0])));
    }
  /* Link ST after the old tail, or (empty list) store it as the head. */
  last->common.next = st;
}

/* Retire send token ST of PORT: return it to the port's free list and
   queue a sent report with STATUS for the host, which is eventually
   delivered by a bulk DMA (immediately if FORCE_SENT_QUEUE_UPDATE is
   set).  The token number reported to the host is ST's index within
   PORT->_SEND_TOKENS plus one.  When debug counters are enabled, the
   completed reliable or datagram send count is bumped as well. */

static inline void
pass_sent_token_to_port_and_free (gm_send_token_t * st,
				  gm_port_protected_lanai_side_t * port,
				  int force_sent_queue_update,
				  gm_status_t status)
{
  unsigned int host_token;

  if (GM_ENABLE_DEBUG_COUNTERS)
    {
      if (st->common.type == GM_ST_RELIABLE)
	{
	  GM_INCR_DEBUG_CNT (gm.completed_reliable_send_cnt);
	}
      else
	{
	  if (st->common.type == GM_ST_DATAGRAM)
	    {
	      GM_INCR_DEBUG_CNT (gm.completed_datagram_send_cnt);
	    }
	}
    }

  free_send_token (st, port);

  host_token = st - &port->_send_tokens[0] + 1;
  pass_sent_token_to_port (host_token, port, force_sent_queue_update,
			   status);
}

/* Walk the chain of send records starting at SENT (which must not be
   null) and put each record's send token back to the length and pointer
   it had before that record was sent. */
static inline void
rewind_send_tokens (gm_send_record_t * sent)
{
  gm_send_record_t *record = sent;

  do
    {
      gm_send_token_t *token = record->send_token;
      gm_u32_t len_before = record->before_len;
      gm_up_t ptr_before = record->before_ptr;
      gm_u32_t len_now = token->ackable.send_len;
      gm_send_record_t *following = record->next;

      /* A token is only ever rewound further back, never forward: an
         earlier record may already have rewound it, and the position
         held in a later record would then be wrong. */
      if (len_now < len_before)
	{
	  token->ackable.send_len = len_before;
	  token->ackable.send_ptr = ptr_before;
	}

      record = following;
    }
  while (record);
}


/* Undo the unacknowledged sends of connection C: rewind every send
   token referenced by C's send records and release those records to
   gm.free_send_records.  Does nothing if C has no send records.

   Note: It is up to the caller to restore the send_sexno for the
   connection. */
static inline void
rewind_connection (gm_connection_t * c)
{
  gm_send_record_t *oldest;

  oldest = c->first_send_record;
  if (oldest)
    {
      /* For fairness, let the subport after the one that sent the
         oldest record be the first to send. */
      /* BAD: should be pipelined with preceding code */
      c->first_active_send_port = oldest->send_token->common.subport->next;

      rewind_send_tokens (oldest);

      /* Splice C's whole record chain onto the front of the free list. */
      c->last_send_record->next = gm.free_send_records;
      gm.free_send_records = c->first_send_record;
      c->first_send_record = 0;
    }
}

/* Handle a failed send on subport SP, which must have at least one send
   token queued (asserted).  STATUS is the failure status reported to
   the host and DEST_ID is only printed in the diagnostic message.

   If the owning port is open, the first send token is completed with
   STATUS and SP is marked disabled (SP->DISABLED = 2).  If the port is
   not open, every send token queued on SP is completed with STATUS,
   which also frees the subport.

   The status parameter is deliberately not called `errno': that name is
   reserved by <errno.h>, where it may be a macro. */
void
handle_send_error (gm_subport_t * sp, gm_status_t status, int dest_id)
{
  gm_send_token_t *st;
  unsigned int sp__id, port__open;
  gm_port_protected_lanai_side_t *port;

  gm_assert (sp->first_send_token);

  /* pre */ sp__id = sp->id;
  /* pre */ st = sp->first_send_token;
  port = &gm_port[GM_SUBPORT_PORT (sp__id)];
  port__open = port->open;
  gm_printf ("handle_send_error: subport=%d   errno=%d  dest_GMid=%d\n",
	     sp__id, status, dest_id);
  fflush (stdout);

  if (port__open)
    {
      gm_send_token_t *next;

      /* report the failed send to the user. */

      next = st->common.next;
      sp->disabled = 2;
      if (next)
	{
	  remove_first_send_token_from_send_queue (st, sp__id);
	}
      else
	{
	  /* HACK: defer removal of the subport so it can be marked
	     disabled. */

	  /* Empty the token list by hand.  LAST_SEND_TOKEN is aimed at
	     a pretend token whose COMMON.NEXT field coincides with
	     SP->FIRST_SEND_TOKEN. */
	  sp->first_send_token = 0;
	  sp->last_send_token
	    = (gm_send_token_t *) ((char *) &sp->first_send_token
				   - GM_OFFSETOF (gm_send_token_t,
						  common.next));
	}

      pass_sent_token_to_port_and_free (st, port, 1, status);
    }
  else
    {
      /* free all send tokens and the subport */

      gm_assert (st == sp->first_send_token);
      gm_assert (st);
      do
	{
	  remove_first_send_token_from_send_queue (st, sp__id);
	  pass_sent_token_to_port_and_free (st, port, 1, status);
	}
      while ((st = sp->first_send_token) != 0);
    }
}

static int recv_token_cnt[GM_NUM_PRIORITIES][GM_NUM_SIZES];

/* Move all recv tokens that the host has queued for PORT into the
   LANai-side ready list for the appropriate size and priority.

   Slots of the port's recv token queue are consumed while their READY
   flag is set.  Each consumed slot is cleared and the queue position is
   advanced before the slot's fields are validated.  A token with an
   out-of-range size or priority is dropped and reported with
   GM_NEW_BAD_RECV_TOKEN_EVENT.  If a slot is ready but PORT has no free
   recv token buffer left, GM_NEW_RECV_TOKEN_VIOLATION_EVENT is reported
   and the function returns without consuming the slot. */

static inline void
absorb_recv_tokens_from_host (gm_port_protected_lanai_side_t * port)
{
  gm_host_recv_token_t *hrt;
  unsigned size, priority;
  int ready;

  while (1)
    {
      gm_recv_token_t *port__free_recv_tokens;
      gm_port_unprotected_lanai_side_t *PORT;

      /* find a host recv token to absorb, if any */

      hrt = port->recv_token_queue_slot;
      /* prefetch */ port__free_recv_tokens = port->free_recv_tokens;
      /* prefetch */ PORT = port->PORT;
      /* READY is loaded first, with a GM_STBAR () between it and the
         loads of the other fields. */
      ready = hrt->ready;
      GM_STBAR ();
      /* prefetch */ size = hrt->size;
      /* prefetch */ priority = hrt->priority;
      if (ready)
	{
	  gm_up_t message;
	  unsigned int tag;


	  /* Read rest of info from recv token */
	  message = hrt->message;
	  tag = hrt->tag;

	  /* Catch user violations of recv token regulation. */

	  if (port__free_recv_tokens)
	    {
	      /* Advance recv token queue.  Do this before checking
	         token fields to prevent multiple bad token
	         reports.

	         HACK: Use a fullword write, also clobbering some other
	         fields, to minimize the use of partword stores. */

	      gm_assert (GM_OFFSETOF (gm_host_recv_token_t, tag) + 3
			 == GM_OFFSETOF (gm_host_recv_token_t, ready));
	      gm_assert (GM_OFFSETOF (gm_host_recv_token_t, priority) + 1
			 == GM_OFFSETOF (gm_host_recv_token_t, size));
	      gm_assert (GM_OFFSETOF (gm_host_recv_token_t, size) + 1
			 == GM_OFFSETOF (gm_host_recv_token_t, ready));
	      *(gm_u32_t *) & hrt->ready = 0;
	      port->recv_token_queue_slot = hrt + 1;

	      /* Drop tokens of illegitimate size. Don't check the
	         message pointer P, as the page hash table will catch
	         errors there. */

	      if (size < GM_NUM_SIZES)
		{
		  if (priority <= GM_MAX_PRIORITY)
		    {
		      gm_recv_token_t *t, *next_free;
		      gm_recv_token_t **ready_list, *first_ready;

		      if (GM_DEBUG_RECV_TOKENS)
			{
			  gm_printf (GM_STR
				     ("0x%x tokens, priority %d size 0x%x\n"),
				     ++recv_token_cnt[priority][size],
				     priority, size);
			  fflush (stdout);
			}

		      /* Allocate a receive token buffer in which to
		         store the recv token info */

		      /* T is known to be non-null here: this branch is
		         only entered when PORT__FREE_RECV_TOKENS is
		         nonzero. */
		      t = port__free_recv_tokens;
		      next_free = t->next;
		      gm_assert (t);
		      ready_list = &port->free_recv_token[priority][size];
		      first_ready = *ready_list;
		      port->free_recv_tokens = next_free;

		      /* Initialize token, and add it to the appropriate
		         ready list. */

		      t->orig_ptr = message;
		      t->recv_ptr = message;
		      t->tag = tag;
		      t->next = first_ready;
		      *ready_list = t;
		    }
		  else
		    {
#if 0
		      /* We really don't want to waste tons of memory
		         inlining this repeatedly.  This belongs in a
		         function in mcp/gm_error_handlers.c */
		      gm_printf ("bad recv token - priority pri=%d"
				 " max_pri=%d\n", priority, GM_MAX_PRIORITY);
/*
                      gm_printf("start = 0x%p  end = 0x%p\n",
				&PORT->recv_token_queue[0],
				(&PORT->recv_token_queue
				[GM_NUM_RECV_TOKEN_QUEUE_SLOTS]));
*/
		      fflush (stdout);
		      gm_hex_dump (((unsigned char *) hrt),
				   sizeof (*hrt) + 32);
		      fflush (stdout);
#endif
		      gm_report_error (port, GM_NEW_BAD_RECV_TOKEN_EVENT);
		      continue;
		    }
		}
	      else
		{
#if 0
		  /* We really don't want to waste tons of memory
		     inlining this repeatedly.  This belongs in a
		     function in mcp/gm_error_handlers.c */
		  gm_printf ("bad recv token - size   size=%d max_size=%d\n",
			     size, GM_NUM_SIZES);
		  fflush (stdout);
#endif
		  gm_report_error (port, GM_NEW_BAD_RECV_TOKEN_EVENT);
		  continue;
		}
	    }
	  else
	    {
	      /* The host queued a token although no free recv token
	         buffer is left for this port. */
	      gm_report_error (port, GM_NEW_RECV_TOKEN_VIOLATION_EVENT);
	      return;
	    }
	}
      else
	{
	  /* Slot not ready.  Unless the position has run off the end of
	     the queue there is nothing more to absorb; otherwise wrap
	     to slot 0 and carry on only if that slot is ready. */
	  if (hrt < &PORT->recv_token_queue[GM_NUM_RECV_TOKEN_QUEUE_SLOTS])
	    return;
	  hrt = port->recv_token_queue_slot = &PORT->recv_token_queue[0];
	  if (!hrt->ready)
	    return;
	}
    }
}

/* Take a receive token for a message of PRIORITY and SIZE from PORT's
   ready list.  If that list is empty, tokens queued by the host are
   absorbed first and the list is checked once more.  Returns the token,
   or 0 if none is available. */

static inline gm_recv_token_t *
alloc_lanai_recv_token (unsigned priority,
			gm_port_protected_lanai_side_t * port, unsigned size)
{
  gm_recv_token_t **ready_list = &port->free_recv_token[priority][size];
  gm_recv_token_t *token = *ready_list;

  if (!token)
    {
      /* Nothing ready: see whether the host has supplied more. */
      absorb_recv_tokens_from_host (port);
      token = *ready_list;
      if (!token)
	return 0;
    }

  /* Pop the token off the ready list. */
  *ready_list = token->next;

  if (GM_DEBUG_RECV_TOKENS)
    {
      gm_printf (GM_STR ("0x%x tokens, priority %d size 0x%x\n"),
		 --recv_token_cnt[priority][size], priority, size);
      fflush (stdout);
    }

  return token;
}

				/* Flash LED */
static inline void
heartbeat (void)
{
  /* Drive the LED.  Exactly one of the alternatives below is compiled
     in; as written, that is the "#elif 1" branch which turns the LED on
     while gm.led is nonzero and otherwise lights it only while the bits
     of RTC selected by the mask 0x1ff000 are all zero. */
#if 0
  /* Flash "GM" in morse code. */
  gm_morse_async ("GM\n");
#elif 0
  /* Slow changing */
  static char threshold[] =
    { 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0 };
  int rtc = RTC;
  set_LED (((rtc >> 10) & 0x7) < (threshold[(rtc >> 16) & 0xf]));
#elif 1
  /* pulse periodically and turn on when sending */
  if (gm.led)
    {
      set_LED (1);
    }
  else
    {
      set_LED (((RTC & 0x1ff000) == 0));
    }
#elif 1
  /* Flash like a heart. */
  /* Never compiled: the preceding "#elif 1" always wins. */
  gm_morse_async ("N ");
#endif
}

/**********************************************************************/
/* Mapper functions */

/* Reply to a mapper scout packet MP using the reply staging area MRS.
   The route carried in the scout is copied back to front, so that it
   ends at the end of MRS->ROUTE; the port and phase are copied from the
   scout; and the reply's send token is queued unless it is already in
   the send queue. */
static inline void
queue_mapper_scout_reply (struct gm_mapper_scout_reply_stage *mrs,
			  gm_mapper_packet_t * mp)
{
  gm_u8_t *src, *dst;
  gm_u32_t remaining;

  gm_assert (mp->common.subtype == GM_MAPPER_SCOUT_PACKET_SUBTYPE);

  remaining = mp->scout.route_length;
  mrs->route_len = remaining;

  /* Routes of up to 32 bytes are taken from ROUTE, longer ones from
     EXTENDED_ROUTE.  SRC starts just past the last route byte. */
  if (mrs->route_len <= 32)
    {
      src = &mp->scout.route[mp->scout.route_length];
    }
  else
    {
      src = &mp->scout.extended_route[mp->scout.route_length];
    }

  dst = &mrs->route[GM_MAX_NETWORK_DIAMETER];
  if (remaining != GM_NO_ROUTE)
    {
      for (; remaining != 0; remaining--)
	*--dst = *--src;
    }

  /* Fill in the per-reply fields; type, subtype, address, gm_id,
     map_version and option are preset in the staging area. */
  gm_assert (mrs->packet.type == GM_MAPPING_PACKET_TYPE);
  gm_assert (mrs->packet.subtype == GM_MAPPER_SCOUT_REPLY_PACKET_SUBTYPE);
  mrs->packet.port = mp->scout.port;
  mrs->packet.phase = mp->scout.phase;
  mrs->packet.mapper_open = gm.port[GM_MAPPER_PORT_ID].open;

  gm_assert (mrs->packet.option
	     == (GM_MAPPER_SCOUT_REPLY_PACKED_ROUTES_OPTION
		 | GM_MAPPER_SCOUT_REPLY_LONG_HOSTNAME_OPTION));

  /* Queue the reply for sending, once. */
  if (mrs->in_send_queue)
    return;

  /* HACK: Be sneaky and put the token in the send queue for port
     0 or target host ID 0, which is guaranteed not to have
     anything in it and therefore is guaranteed not to block. */
  gm_assert (mrs->send_token.common.next == 0);
  append_send_token_to_send_queue (&mrs->send_token, 0, 0);
  mrs->in_send_queue = 1;
  NOTICE (SDMA_PENDING);
}

/* Reply to a mapper config packet MP using the reply staging area MRS.
   The route length is the first byte of the config payload and the
   route bytes follow it; the route is copied back to front, so that it
   ends at the end of MRS->ROUTE.  The port, phase and host section are
   copied from the config packet, and the reply's send token is queued
   unless it is already in the send queue. */
static inline void
queue_mapper_config_reply (struct gm_mapper_config_reply_stage *mrs,
			   gm_mapper_packet_t * mp)
{
  gm_u8_t *src, *dst;
  gm_u32_t remaining;

  gm_assert (mp->common.subtype == GM_MAPPER_CONFIG_PACKET_SUBTYPE);

  remaining = mp->config.bytes[0];
  mrs->route_len = remaining;

  /* SRC starts just past the last route byte. */
  src = &mp->config.bytes[1 + remaining];
  dst = &mrs->route[GM_MAX_NETWORK_DIAMETER];
  if (remaining != GM_NO_ROUTE)
    {
      for (; remaining != 0; remaining--)
	*--dst = *--src;
    }

  /* Fill in the per-reply fields; type, subtype and address are preset
     in the staging area. */
  gm_assert (mrs->packet.type == GM_MAPPING_PACKET_TYPE);
  gm_assert (mrs->packet.subtype == GM_MAPPER_CONFIG_REPLY_PACKET_SUBTYPE);
  mrs->packet.port = mp->config.port;
  mrs->packet.phase = mp->config.phase;
  mrs->packet.host_section = mp->config.host_section;

  /* Queue the reply for sending, once. */
  if (mrs->in_send_queue)
    return;

  /* HACK: Be sneaky and put the token in the send queue for port
     0 or target host ID 0, which is guaranteed not to have
     anything in it and therefore is guaranteed not to block. */
  gm_assert (mrs->send_token.common.next == 0);
  append_send_token_to_send_queue (&mrs->send_token, 0, 0);
  mrs->in_send_queue = 1;
  NOTICE (SDMA_PENDING);
}

/**********************************************************************/
/* Page table stuff */

#if GM_ENABLE_VM
#include "gm_page_hash.h"

#define GM_DEBUG__DMA_ADDR 0

/* Report, in the dispatch log and on stdout, that user virtual address
   ADDRESS has no DMA mapping and that the bogus page is being used
   instead.  DIR is a string naming the transfer direction; it is
   evaluated twice.  The body is wrapped in do { } while (0) so that the
   macro followed by a semicolon forms exactly one statement, which
   keeps it safe inside an unbraced if/else. */
#define gm_inform_using_bogus(address,dir) \
do {                                       \
   gm_u64_t addr64 = (gm_u64_t) (address); \
                                           \
   LOG_DISPATCH(0,dir);                    \
   printf("%s:%d No mapping for user %s address 0x%qx - using bogus page.\n", \
		__FILE__,__LINE__,dir,addr64);         \
   fflush(stdout);                         \
} while (0)
/* #define gm_inform_using_bogus(address,dir) {LOG_DISPATCH(0,dir);} */

/* Convert a host virtual address associated with a port to a DMA
   address.

   HOST_ADDRESS is the user virtual address, PORT is the ID of the port
   it belongs to, and DEFAULT_ADDR is the page-aligned, nonzero DMA
   address to fall back on when no mapping is found.

   The page address and the port are combined into a single key
   (page address + port).  The key is hashed into the page hash cache
   (gm.page_hash.cache.entry) and the cache is scanned forward from
   that bin until either a matching entry or an unused bin (key 0) is
   found; the entry one past the end of the table is always unused and
   triggers wraparound to bin 0.  On a miss the page is looked up with
   get_uncached_dma_page_entry ().

   Returns the DMA page address plus the page offset when a mapping is
   found.  When none is found, returns DEFAULT_ADDR plus the page
   offset, except that a zero key returns DEFAULT_ADDR itself, without
   the offset. */

#if 0
#warning _dma_addr() is temporarily NOT inline -- needs to be changed back
static gm_dp_t
#else
static inline gm_dp_t
#endif
_dma_addr (gm_up_t host_address, unsigned int port, gm_dp_t default_addr)
{
#if !GM_ENABLE_LMRU_CACHE
  gm_page_hash_cached_entry_t *entry;
  gm_up_t page_port;
  unsigned int offset;
  unsigned collisions = 0;
  gm_cached_pte_t *gm_page_hash_cache_entry;
  gm_dp_t ret;			/* was used below without a declaration */
  
  gm_assert (default_addr);
  gm_assert (GM_PAGE_ALIGNED (default_addr));
  gm_assert (GM_POWER_OF_TWO (GM_MAX_PAGE_HASH_CACHE_INDEX + 1));
  gm_assert (gm.page_hash.cache.entry);
  gm_assert (gm.page_hash.cache.entry
	     [GM_MAX_PAGE_HASH_CACHE_INDEX + 1].page_port == 0);
  gm_assert (port < GM_NUM_PORTS);


  gm_page_hash_cache_entry = gm.page_hash.cache.entry;
  offset = GM_PAGE_OFFSET (host_address);
  page_port = host_address - offset + port;

  /* BAD: A better solution is to use the page number "-1" to represent
     unused bins, since this page number is more likely impossible. */

  if (!page_port)
    { 
      if (GM_DEBUG__DMA_ADDR)
	{
	  /* No cache entry has been selected yet, so there is no PTE
	     to print here; just report the fallback. */
	  gm_printf (GM_STR (__GM_FUNCTION__ " returning default_addr\n"));
	}

      gm_inform_using_bogus(host_address,
	(default_addr==gm.page_hash.bogus_sdma_ptr)?"send":"recv");
      return default_addr;
    }

/*** Scan for the cached page table entry. */

  entry = &gm_page_hash_cache_entry[GM_HASH_PAGE_PORT (page_port)
				    & GM_MAX_PAGE_HASH_CACHE_INDEX];
  while (1)
    {
      gm_up_t entry__page_port;
      gm_dp_t entry__dma_page;

      gm_assert (gm.page_hash.cache.
		 entry[GM_MAX_PAGE_HASH_CACHE_INDEX + 1].page_port == 0);

      entry__page_port = entry->page_port;
      entry__dma_page = gm_pte_get_dma_page (entry);

      if (entry__page_port == page_port)
	{
	  if (GM_DEBUG_LANAI_TLB && collisions)
	    {
	      gm_printf (GM_STR ("%d collisions before hit\n"), collisions);
	    }

	  /* Cache hit! */

	  GM_INCR_DEBUG_CNT (gm.hit_cnt);

	  /* gm_morse_sync ("hit\n"); */

	  /* Return DMA address. */

	  ret = entry__dma_page + offset;
	  if (GM_DEBUG__DMA_ADDR)
	    {
	      gm_pte_print (0, &entry->pte);
	      gm_printf (GM_STR (__GM_FUNCTION__
				 " returning 0x%x%08x (cached)"),
			 (gm_u32_t) (ret >> 32), (gm_u32_t) ret);
	    }
	  return ret;
	}
      else if (entry__page_port == 0)
	{
	  gm_dp_t dma_page;

	  /* Check for wraparound: the unused entry just past the end
	     of the table sends the scan back to the first bin. */

	  if (entry > &gm_page_hash_cache_entry[GM_MAX_PAGE_HASH_CACHE_INDEX])
	    {
	      entry = &gm_page_hash_cache_entry[0];
	      continue;
	    }

	  if (GM_DEBUG_LANAI_TLB && collisions)
	    {
	      gm_printf (GM_STR ("%d collisions before miss\n"), collisions);
	    }

	  /* Cache miss.  Fetch entry from host. */

	  GM_INCR_DEBUG_CNT (gm.miss_cnt);

	  /* gm_morse_sync ("miss\n"); */
	  dma_page = get_uncached_dma_page_entry (page_port);

	  if (GM_DEBUG_LANAI_TLB && !dma_page)
	    {
	      GM_PANIC (
			("page_port=0x%qx port=%d, host_address=0x%qx\n",
			 (gm_u64_t) page_port, port,
			 (gm_u64_t) host_address));
	    }

	  if (dma_page)
	    {
	      ret = GM_DMA_PAGE_ADDR (dma_page) + offset;
	    }
	  else
	   {
	     gm_inform_using_bogus(host_address,
		(default_addr==gm.page_hash.bogus_sdma_ptr)?"send":"recv");
	     ret = default_addr + offset;
	   }
	  if (GM_DEBUG__DMA_ADDR)
	    {
	      /* Both halves are cast so the arguments match the two
	         32-bit conversions, as in the cached case above. */
	      gm_printf
		(GM_STR (__GM_FUNCTION__ " returning 0x%x%08x (uncached)"),
		 (gm_u32_t) (ret >> 32), (gm_u32_t) ret);
	    }
	  return ret;
	}

      if (GM_DEBUG_LANAI_TLB)
	{
	  collisions++;
	}

      entry++;
    }

#else /* GM_ENABLE_LMRU_CACHE */

  gm_cached_pte_t *gm_page_hash_cache_entry;
  gm_cached_pte_t *entry;
  gm_up_t page_port;
  unsigned int offset;
  gm_dp_t ret;

  if (GM_DEBUG__DMA_ADDR)
    {
      gm_printf ("_dma_addr(0x%qx,%d,0x%qx) called.\n",
		 (gm_u64_t) host_address, port, (gm_u64_t) default_addr);
    }

  gm_page_hash_cache_entry = gm.page_hash.cache.entry;
  
  /* preconditions */

  gm_assert (gm.page_hash.cache.root.older);
  gm_assert (gm.page_hash.cache.root.younger);
  gm_assert (default_addr);
  gm_assert (GM_PAGE_ALIGNED (default_addr));
  gm_assert (GM_POWER_OF_TWO (GM_MAX_PAGE_HASH_CACHE_INDEX + 1));
  gm_assert (gm_page_hash_cache_entry
	     [GM_MAX_PAGE_HASH_CACHE_INDEX + 1].pte.page_port == 0);
  gm_assert (port < GM_NUM_PORTS);

  offset = GM_PAGE_OFFSET (host_address);
  page_port = host_address - offset + port;

  /* pre */ entry = (&gm_page_hash_cache_entry
		     [GM_HASH_PAGE_PORT (page_port)
		      & GM_MAX_PAGE_HASH_CACHE_INDEX]);
  if (!page_port)
    {
      if (GM_DEBUG__DMA_ADDR)
	{
	  gm_pte_print (GM_STR ("first pte empty"), &entry->pte);
	  gm_printf (GM_STR (__GM_FUNCTION__ " returning default_addr\n"));
	}
      gm_inform_using_bogus(host_address,
	(default_addr==gm.page_hash.bogus_sdma_ptr)?"send":"recv");
      return default_addr;
    }

  /* Scan for the cached page table entry. */

  if (GM_DEBUG__DMA_ADDR)
    {
      gm_printf (GM_STR ("starting to scan for PTE\n"));
    }

  while (1)
    {
      gm_up_t entry__page_port;

      if (GM_DEBUG__DMA_ADDR)
	{
	  gm_pte_print (GM_STR ("first pte empty"), &entry->pte);
	  gm_printf (GM_STR ("scanning for PTE\n"));
	}

      /* verify that this table is followed by an entry with a user page
         of zero. */
      gm_assert (
		 (gm.page_hash.cache.
		  entry[GM_MAX_PAGE_HASH_CACHE_INDEX + 1].pte.page_port) ==
		 0);

      entry__page_port = entry->pte.page_port;
      if (entry__page_port == page_port)
	{
	  /* Cache hit! */

	  gm_cached_pte_t *younger, *older, *root, *youngest;
	  gm_dp_t entry__dma_page;

	  GM_INCR_DEBUG_CNT (gm.hit_cnt);
	  /* gm_morse_sync ("hit\n"); */

	  /* move the entry to the head of the age queue.  This code
	     is necessarily ugly to pipeline perfectly on the L7. */

	  /* Remove entry from age queue. */

	  younger = entry->younger;
	  older = entry->older;
	  /* pre */ root = &gm.page_hash.cache.root;
	  /* pre */ entry->younger = root;
	  younger->older = older;
	  older->younger = younger;

	  /* Insert entry at head of age queue. */

	  youngest = root->older;
	  root->older = entry;
	  /* pre */ entry__dma_page = gm_pte_get_dma_page (&entry->pte);
	  entry->older = youngest;
	  youngest->younger = entry;

	  /* Return DMA address. */

	  ret = entry__dma_page + offset;
	  break;
	}
      else if (entry__page_port == 0)
	{
	  gm_dp_t dma_page; /* BUG - was unsigned */

	  /* Check for wraparound: the unused entry just past the end
	     of the table sends the scan back to the first bin. */

	  if (entry > &gm_page_hash_cache_entry[GM_MAX_PAGE_HASH_CACHE_INDEX])
	    {
	      entry = &gm_page_hash_cache_entry[0];
	      continue;
	    }

	  /* Cache miss.  Fetch entry from host. */

	  GM_INCR_DEBUG_CNT (gm.miss_cnt);
	  /* gm_morse_sync ("miss\n"); */
	  dma_page = get_uncached_dma_page_entry (page_port);

	  if (GM_DEBUG_LANAI_TLB && !dma_page)
	    {
	      GM_PANIC (
			("page_port=0x%qx port=%d host_address=0x%qx\n",
			 (gm_u64_t) page_port, port,
			 (gm_u64_t) host_address));
	    }

	  /* Determine return value. (dma_page is now the
	     corresponding DMA address, or zero if no match.) */

	  if (dma_page)
	    {
	      ret = dma_page + offset;
	      GM_PRINT (GM_DEBUG__DMA_ADDR,
			("0x%qx + 0x%x = 0x%qx\n", dma_page, offset, ret));
	    }
	  else
	    {
	      if (GM_DEBUG_LANAI_TLB)
		{
		  gm_pte_print (GM_STR ("first empty pte"), &entry->pte);
		  gm_printf (GM_STR ("no match; returning default\n"));
		}
	      ret = default_addr + offset;
	      gm_inform_using_bogus(host_address,
		(default_addr==gm.page_hash.bogus_sdma_ptr)?"send":"recv");
	    }
	  break;
	}
      entry++;
    }

  /* return match */

  if (GM_DEBUG__DMA_ADDR)
    {
      gm_pte_print (GM_STR ("matching pte"), &entry->pte);
      gm_printf (GM_STR ("scanning complete, returning 0x%qx\n"),
		 (gm_u64_t) ret);
    }
  return ret;
#endif /* GM_ENABLE_LMRU_CACHE */

}
#endif /* GM_ENABLE_VM */

#if GM_DEBUG_IRIX_DMA_ADDR

/* Code to validate DMA addresses for IRIX hosts */

/* Check that THE_ADDR looks like a well-formed IRIX DMA address for the
   platform selected by GM_DEBUG_IRIX_DMA_ADDR.  Returns normally if it
   does.  Otherwise the two halves of the address, REFER_LINE and LINE
   are stored in the words at addresses 0, 4, 8 and 12, a message is
   logged, and the function spins forever so the values can be examined
   from the host. */
inline void gm_check_irix_dma_addr (gm_dp_t the_addr, int refer_line, int line)
/* NOTES:
 *
 * We only pass in the referring line number and the local line number.  We
 * do without the filename for now, because we can't use it anyway.  It should
 * always be possible to isolate the problem with only the two line numbers.
 *
 * We believe a well-formed A64 DMA address on the Octane has its
 * high-order word = 0x81000000 or 0x88000000, and its low-order word
 * always greater than 0x10000000.
 *
 * On the O200, the high-order word is always 0xa8000000, and the
 * low-order word is always non-zero.
 */
{
  /* Overlay used to split the 64-bit address into two 32-bit words.
     NOTE(review): high_half is the first member, so this presumably
     relies on a big-endian layout -- confirm for the target. */
  union gm_64_bit_dp
  {
    gm_u64_t dp_as_64;
    struct { gm_u32_t high_half; gm_u32_t low_half; } dp_as_2_words;
  };
  union gm_64_bit_dp over_dp;
  over_dp.dp_as_64 = the_addr;

#if GM_DEBUG_IRIX_DMA_ADDR == GM_DEBUG_IRIX_DMA_OCTANE
  /* the following tests are done two ways because we may be distrustful
   * of the generated MCP code here */
  if ( ( (over_dp.dp_as_2_words.high_half>>24 == 0x81) ||
         (over_dp.dp_as_2_words.high_half>>24 == 0x88)   ) &&
       (over_dp.dp_as_2_words.low_half >= 0x10000000     )   )
    return;			/* apparently well-formed */

  if ( ( (over_dp.dp_as_2_words.high_half == (gm_u32_t)0x81000000) ||
         (over_dp.dp_as_2_words.high_half == (gm_u32_t)0x88000000)   ) &&
       (over_dp.dp_as_2_words.low_half >= 0x10000000                 )   ) 
    return;			/* apparently well-formed */

#elif GM_DEBUG_IRIX_DMA_ADDR == GM_DEBUG_IRIX_DMA_O200
  /* The low-order word must be non-zero.  It is unsigned, so a ">= 0"
     comparison would always be true and accept a zero low word. */
  if ( (over_dp.dp_as_2_words.high_half>>24 == 0xa8) &&
       (over_dp.dp_as_2_words.low_half      != 0   )   )
    return;			/* apparently well-formed */

  if ( (over_dp.dp_as_2_words.high_half == (gm_u32_t)0xa8000000) &&
       (over_dp.dp_as_2_words.low_half  != 0                   )   )
    return;			/* apparently well-formed */

#else
#error Dont know how to debug IRIX DMA address
  return;

#endif

#if 0				/* gm_printf() is deprecated */
  gm_printf("%s:%d BADLY FORMED DMA address: 0x%x.0x%x\n",
         file,line,
         over_dp.dp_as_2_words.high_half,
         over_dp.dp_as_2_words.low_half);
  gm_printf("BADLY FORMED DMA tests condition results %d %d\n",
  ((over_dp.dp_as_2_words.high_half>>24 == 0x81) ||
         (over_dp.dp_as_2_words.high_half>>24 == 0x88)   ),
       (over_dp.dp_as_2_words.low_half >= 0x10000000       )
        );
#endif

  /* Leave the evidence in the first four words of memory. */
  *(gm_u32_t *) 0  = over_dp.dp_as_2_words.high_half;
  *(gm_u32_t *) 4  = over_dp.dp_as_2_words.low_half;
  *(gm_u32_t *) 8  = refer_line;
  *(gm_u32_t *) 12 = line;
  LOG_DISPATCH (0, "BADLY FORMED DMA ADDRESS STORED IN WORDS 0 AND 1");
  for (;;)
    {
      int i = 1;	     /* loop forever so we can poke from the host */
      i = 2;
    }

}
#endif /* GM_DEBUG_IRIX_DMA_ADDR */


/* Return the DMA address to use for a send DMA from user virtual
   address HOST_ADDRESS belonging to port PORT.

   With virtual memory support the address is translated by
   _dma_addr (), falling back on gm.page_hash.bogus_sdma_ptr.  Without
   it, a print interrupt is raised and the address is formed by
   clearing the gm.no_VM.clear bits of HOST_ADDRESS and setting the
   gm.no_VM.set bits. */
static inline gm_dp_t
sdma_addr (gm_up_t host_address, unsigned int port)
{
  GM_PRINT (GM_DEBUG_LANAI_TLB, ("sdma_addr (%qx, 0x%x) called\n",
				 (gm_u64_t) host_address, port));

#if !GM_ENABLE_VM
  prepare_to_interrupt ("sdma_addr: using no_VM.set / clear\n");
  /* Same text as passed to prepare_to_interrupt () above (the old
     string misspelled the function name as "smda_addr"). */
  gm.interrupt.print.string = "sdma_addr: using no_VM.set / clear\n";
  gm_interrupt (GM_PRINT_INTERRUPT);

  return ((gm_dp_t) (host_address & ~gm.no_VM.clear) | gm.no_VM.set);
#else /* GM_ENABLE_VM */
  {
    gm_dp_t ret;

    ret = _dma_addr (host_address, port, gm.page_hash.bogus_sdma_ptr);
    GM_PRINT (GM_TRACE_LANAI_DMA, ("returning 0x%qx\n", (gm_u64_t) ret));
    return ret;
  }
#endif /* GM_ENABLE_VM */
}

/* Return the DMA address to use for a receive DMA to user virtual
   address HOST_ADDRESS belonging to port PORT.

   With virtual memory support the address is translated by
   _dma_addr (), falling back on gm.page_hash.bogus_rdma_ptr.  Without
   it, a print interrupt is raised and the address is formed by
   clearing the gm.no_VM.clear bits of HOST_ADDRESS and setting the
   gm.no_VM.set bits. */
static inline gm_dp_t
rdma_addr (gm_up_t host_address, unsigned port)
{
  gm_dp_t dma_address;

#if GM_ENABLE_VM
  dma_address = _dma_addr (host_address, port, gm.page_hash.bogus_rdma_ptr);
#else /* !GM_ENABLE_VM */
  prepare_to_interrupt ("rdma_addr: using no_VM.set / clear\n");
  gm.interrupt.print.string = "rdma_addr: using no_VM.set / clear\n";
  gm_interrupt (GM_PRINT_INTERRUPT);

  dma_address = ((gm_dp_t) (host_address & ~gm.no_VM.clear)
		 | gm.no_VM.set);
#endif /* GM_ENABLE_VM */

  return dma_address;
}

/* This function performs a DMA of length LEN from the user virtual
   memory address PTR to the lanai address LAR.  Any DMA from a
   nonDMAable user virtual memory address causes the value 0xaaaaaaaa
   to be DMAd instead.

   NOTE: (ptr % GM_DMA_GRANULARITY) must equal (lar % GM_DMA_GRANULARITY). */

/* This cover macro allows for more informative debugging messages */
/* Do we need the 1st and 3rd asserts below?  They seem wrong/unnecessary
   since a roundup value is added to 'len' before it's used for a dma.
   --nelson
*/
/* USER_SDMA (ptr, lar, len, port) checks LEN against GM_MAX_DMA_CTR and
   calls _USER_SDMA () with L_sdma__continue_sdma as the continuation
   handler and continuation enabled.  LEN is evaluated more than once;
   it is parenthesized in the assertion so that an expression argument
   is compared as a whole. */
#if GM_DEBUG_IRIX_DMA_ADDR
#define USER_SDMA(ptr,lar,len,port) do {				\
  /*gm_assert (GM_DMA_GRANULARITY <= len);*/ 				\
  gm_assert ((len) <= GM_MAX_DMA_CTR);					\
  /*gm_assert (GM_DMA_ALIGN (u32, len) > 0);*/				\
  _USER_SDMA (ptr, lar, len, port,					\
	      GM_REFERENCE_LABEL (L_sdma__continue_sdma), 1, __LINE__);	\
} while (0)
#else
/* Avoid the overhead of passing the seventh param to _USER_SDMA */
#define USER_SDMA(ptr,lar,len,port) do {				\
  /*gm_assert (GM_DMA_GRANULARITY <= len);*/ 				\
  gm_assert ((len) <= GM_MAX_DMA_CTR);					\
  /*gm_assert (GM_DMA_ALIGN (u32, len) > 0);*/				\
  _USER_SDMA (ptr, lar, len, port,					\
	      GM_REFERENCE_LABEL (L_sdma__continue_sdma), 1);           \
} while (0)
#endif

/* Start a send DMA of LEN bytes from user virtual address PTR (owned by
   port PORT_ID) to LANai address LAR.

   Three build variants exist:
   - GM_CPU_lanai: no DMA is started; PTR and LEN are appended to the
     gather list of gm.send_chunk[lzero].
   - GM_ENABLE_VM: PTR is translated with sdma_addr () and the DMA is
     limited to the bytes remaining in PTR's page.  If the transfer
     crosses a page and CONTINUATION_OK is nonzero, the remainder is
     recorded in gm.remaining_sdma_* and CONTINUE_HANDLER is installed
     as the FINISH_SDMA_EVENT handler; if CONTINUATION_OK is zero, the
     function waits for the DMA engine and starts the next piece itself.
   - otherwise: a single DMA is started for the whole length.

   Every DMA length has GM_DMA_GRANULARITY_ROUNDUP added to it.  R_LINE
   exists only when GM_DEBUG_IRIX_DMA_ADDR is set and is forwarded to
   start_SDMA (); other builds pass 0 in its place. */
static inline void
_USER_SDMA (gm_up_t ptr, gm_lp_t lar, gm_u32_t len, gm_u32_t port_id,
	    void *continue_handler, int continuation_ok
#if GM_DEBUG_IRIX_DMA_ADDR
            , int r_line
#endif
                        )
{
#if GM_CPU_lanai
  /* For LANai embedded applications, build a gather list instead of
     copying the payload. */

  gm.send_chunk[lzero].send_list_end->ptr = (gm_u32_t) ptr;
  gm.send_chunk[lzero].send_list_end->len = len;
  gm.send_chunk[lzero].send_list_end++;
#elif GM_ENABLE_VM
  unsigned r;
  gm_dp_t ear;

  if (GM_DEBUG_PAGE_HASH)
    {
      gm_printf (GM_STR ("_USER_SDMA(%qx,%x,0x%x,0x%x,...) called.\n"),
		 (gm_u64_t) ptr, (unsigned int) lar, len, port_id);
    }

  /* Source and destination must have the same offset within a DMA
     granule. */
  gm_assert (ptr % GM_DMA_GRANULARITY
	     == (unsigned long) lar % GM_DMA_GRANULARITY);
  gm_assert (gm.page_hash.bogus_sdma_ptr);

again:
  /* Translate the (current) user address for this piece. */
  ear = sdma_addr (ptr, port_id);

  /* Handle page crossing */

  r = GM_PAGE_REMAINING (ptr);
  if (len <= r)
    {
      /* The rest of the transfer fits in this page. */
#if GM_DEBUG_IRIX_DMA_ADDR
      start_SDMA (ear, lar, len + GM_DMA_GRANULARITY_ROUNDUP, r_line);
#else
      start_SDMA (ear, lar, len + GM_DMA_GRANULARITY_ROUNDUP, 0);
#endif
    }
  else
    {
      /* Transfer only up to the end of this page. */
#if GM_DEBUG_IRIX_DMA_ADDR
      start_SDMA (ear, lar, r + GM_DMA_GRANULARITY_ROUNDUP, r_line);
#else
      start_SDMA (ear, lar, r + GM_DMA_GRANULARITY_ROUNDUP, 0);
#endif
      if (continuation_ok)
	{
	  /* Record what is left and let the finish-SDMA handler
	     resume the transfer. */
	  gm.remaining_sdma_lar = (void *) ((char *) lar + r);
	  gm.remaining_sdma_ctr = len - r;
	  gm.remaining_sdma_hp = ptr + r;
	  gm.remaining_sdma_port_id = port_id;
	  gm.handler[FINISH_SDMA_EVENT] = continue_handler;
	  if (GM_DEBUG_DIRECTED_SEND)
	    gm_printf (GM_STR ("will need to continue this _USER_SDMA.\n"));
	}
      else
	{
	  /* Advance past the piece just started, wait for the DMA
	     engine, and start the next piece from here. */
	  len -= r;
	  ptr += r;
	  lar += r;
	  await_free_DMA_engine ();
	  goto again;
	}
    }
  gm_assert (len);
#else /* !GM_CPU_lanai && !GM_ENABLE_VM */
#if GM_DEBUG_IRIX_DMA_ADDR
  start_SDMA (sdma_addr (ptr, port_id), lar,
	      len + GM_DMA_GRANULARITY_ROUNDUP, r_line);
#else
  start_SDMA (sdma_addr (ptr, port_id), lar,
	      len + GM_DMA_GRANULARITY_ROUNDUP, 0);
#endif
#endif /* !GM_CPU_lanai && !GM_ENABLE_VM */
}

/* USER_SDMA_NO_CONTINUATION (ptr, lar, len, port) asserts that LEN is
   at least GM_DMA_GRANULARITY, at most GM_MAX_DMA_CTR, and nonzero
   after GM_DMA_ALIGN, then calls _USER_SDMA () with no continuation
   handler and continuation disabled.  LEN is evaluated more than once;
   it is parenthesized in the assertions so that an expression argument
   is compared as a whole. */
#if GM_DEBUG_IRIX_DMA_ADDR
#define USER_SDMA_NO_CONTINUATION(ptr, lar, len, port) do {		\
  gm_assert (GM_DMA_GRANULARITY <= (len));				\
  gm_assert ((len) <= GM_MAX_DMA_CTR);					\
  gm_assert (GM_DMA_ALIGN (u32, (len)) > 0);				\
  _USER_SDMA (ptr, lar, len, port, 0, 0, __LINE__);			\
} while (0)
#else
/* Avoid the overhead of passing the seventh param to _USER_SDMA */
#define USER_SDMA_NO_CONTINUATION(ptr, lar, len, port) do {		\
  gm_assert (GM_DMA_GRANULARITY <= (len));				\
  gm_assert ((len) <= GM_MAX_DMA_CTR);					\
  gm_assert (GM_DMA_ALIGN (u32, (len)) > 0);				\
  _USER_SDMA (ptr, lar, len, port, 0, 0);			        \
} while (0)
#endif

/* This function performs a DMA of length LEN from the LANai pointer
   LAR to the user virtual memory address PTR.  If any part of the DMA
   corresponds to nonDMAable user virtual memory addresses, then the
   data is dropped by DMAing it to a trash page in the host. */

/* This cover macro allows for more informative debugging messages */

/* USER_RDMA (lar, ptr, len, port, h1, h2, x) asserts that PTR and LAR
   are DMA aligned and that LEN is at least GM_DMA_GRANULARITY, then
   calls _USER_RDMA () with the same arguments.  The arguments are
   evaluated more than once; they are parenthesized in the assertions so
   that expression arguments are tested as a whole. */
#define USER_RDMA(lar,ptr,len,port,h1,h2,x) do {			\
  gm_assert (GM_DMA_ALIGNED ((ptr)));					\
  gm_assert (GM_DMA_ALIGNED ((lar)));					\
  gm_assert ((len) >= GM_DMA_GRANULARITY);				\
  _USER_RDMA (lar, ptr, len, port, h1, h2, x);				\
} while (0)

#define GM_DEBUG_USER_RDMA 0

/* Start a receive DMA of LEN bytes from LANai address LAR to user
   virtual address PTR, which belongs to port PORT_ID.

   Three build variants exist:
   - GM_CPU_lanai: the DMA is emulated with an unrolled word copy.
   - GM_ENABLE_VM: PTR is translated with rdma_addr ().  If the transfer
     fits in PTR's page, LEN + EXTRA bytes are transferred and
     DONE_HANDLER is installed as the FINISH_RDMA_EVENT handler.
     Otherwise only the bytes remaining in the page are transferred,
     the remainder is recorded in gm.remaining_rdma_*, and
     CONTINUE_HANDLER is installed instead.
   - otherwise: a single DMA of LEN + EXTRA bytes is started and
     DONE_HANDLER is installed. */
static inline void
_USER_RDMA (gm_lp_t lar, gm_up_t ptr, gm_u32_t len, gm_u32_t port_id,
	    void *continue_handler, void *done_handler, unsigned int extra)
{
  if (GM_DEBUG_USER_RDMA)
    {
      gm_printf (GM_STR ("_USER_RDMA (%p, %qx, %u, %u, ...) called\n"),
		 lar, (gm_u64_t) ptr, len, port_id);
    }

#if GM_DEBUG
  if (ptr >= gm.page_hash.bogus_rdma_ptr
      && ptr < gm.page_hash.bogus_rdma_ptr + GM_PAGE_LEN)
    {
      gm_puts ("RDMAing into bogus page\n");
    }
  else
    {
      gm_puts ("RDMAing\n");
    }
#endif

#if GM_CPU_lanai
  {
    /* For embedded GM, emulate receive DMAs using a copy. */

    /* NOTE(review): this copies from EAR to PTR and installs no
       FINISH_RDMA_EVENT handler, unlike the other two variants;
       confirm the intended source/destination before relying on this
       branch. */

    gm_u32_t *from, *to, *limit;
    gm_dp_t *ear;

    gm_puts ("RDMA() called.\n");

    EAR = rdma_addr (ptr, port_id);
    LAR = lar;

    from = EAR;
    to = ptr;
    limit = (void *) from + len;

    gm_assert (GM_RDMA_GRANULARITY == 8);
    gm_assert (GM_RDMA_ALIGNED (from));
    gm_assert (GM_RDMA_ALIGNED (to));
    gm_assert (GM_RDMA_ALIGNED (len));

    /* Copy 8 bytes per step, 32 bytes per loop iteration.  The switch
       jumps into the loop body so that the first, partial iteration
       copies exactly (len & 31) bytes: entering at "case 24" runs the
       last three steps, "case 16" the last two, and "case 8" the last
       one.  Every later iteration copies a full 32 bytes. */
    switch (len & 31)
      {
	gm_u32_t a, b;
      case 0:
	while (from < limit)
	  {
	    a = *from++;
	    b = *from++;
	    *to++ = a;
	    *to++ = b;
      case 24:
	    a = *from++;
	    b = *from++;
	    *to++ = a;
	    *to++ = b;
      case 16:
	    a = *from++;
	    b = *from++;
	    *to++ = a;
	    *to++ = b;
      case 8:
	    a = *from++;
	    b = *from++;
	    *to++ = a;
	    *to++ = b;
	  }
      }
  }
#elif GM_ENABLE_VM
  {
    gm_dp_t ear;
    unsigned r;

    gm_assert (len >= GM_DMA_GRANULARITY);
    gm_assert (len <= GM_MAX_DMA_CTR);
    gm_assert (gm.page_hash.bogus_rdma_ptr);

    r = GM_PAGE_REMAINING (ptr);

    /* If DMA crosses page and next page is not contiguous with first in DMA
       space... */

    ear = rdma_addr (ptr, port_id);
    if (len <= r)
      {
	/* Whole transfer fits in this page. */
	start_RDMA (lar, ear, len + extra);
	gm.handler[FINISH_RDMA_EVENT] = done_handler;
      }
    else
      {
	/* Transfer up to the end of the page and record the rest for
	   the continuation handler. */
	start_RDMA (lar, ear, r);
	gm.remaining_rdma_lar = (void *) ((char *) lar + r);
	gm.remaining_rdma_ctr = GM_DMA_ROUNDUP (u32, len - r);
	gm.remaining_rdma_hp = ptr + r;
	gm.remaining_rdma_port_id = port_id;
	gm.handler[FINISH_RDMA_EVENT] = continue_handler;
      }
    gm_assert (len >= GM_DMA_GRANULARITY);
  }
#else /* !GM_CPU_lanai && !GM_ENABLE_VM */
  {

    gm_dp_t ear;

    ear = rdma_addr (ptr, port_id);
    GM_CHECK_IRIX_DMA_ADDR (ear,__FILE__,__LINE__);
    if (GM_DEBUG_USER_RDMA)
      {
	gm_printf (GM_STR ("calling start_RDMA (%p, %qx, %u)\n"),
		lar, (gm_u64_t) ear, len);
      }
    start_RDMA (lar, ear, len + extra);
  }

  gm.handler[FINISH_RDMA_EVENT] = done_handler;
#endif /* !GM_CPU_lanai && !GM_ENABLE_VM */
}

#if GM_MIN_SUPPORTED_SRAM > 512
/* Print LEN bytes starting at _PTR as hexadecimal, 16 bytes per line.
   Each line starts with the address of its first byte, and an extra
   space separates the two groups of 8 bytes.  Output starts at the
   16-byte boundary at or below _PTR; positions before _PTR are shown
   as "..".  Requests for more than 1024 bytes are refused with a
   message and nothing is dumped. */
void
hex_dump (const void *_ptr, unsigned int len)
{
  const gm_u8_t *pos, *ptr;

  if (len > 1024)
    {
      gm_printf ("hex_dump (%p, %u) too long\n", _ptr, len);
      return;
    }

  ptr = (const gm_u8_t *) _ptr;
  for (pos = ptr - ((long) ptr & 0xf); pos < ptr + len; pos++)
    {
      if (((long) pos & 0xf) == 0)
	{
	  /* Start of a 16-byte line. */
	  gm_printf ("\n");
	  gm_printf ("%p: ", pos);
	}
      else if (((long) pos & 0xf) == 8)
	{
	  /* Gap between the two halves of a line. */
	  gm_printf (" ");
	}
      
      if (pos >= ptr)
	{
	  gm_printf ("%02x ", (int) *pos);
	}
      else
	{
	  /* Padding before the first requested byte. */
	  gm_printf (".. ");
	}
    }
  gm_printf ("\n");
}
#endif

/****************
 * print_state
 ****************/

/* Print the dispatch state word, (ISR & IMR) | GM_STATE, one name per
   bit.  A bit that is set is printed in upper case and a bit that is
   clear is printed in lower case. */
static void
print_state (void)
{
  gm_u32_t bits;

  bits = (get_ISR () & get_IMR ()) | GM_STATE;

/* Pick the upper- or lower-case spelling according to MASK. */
#define PRINT_STATE_BIT_NAME(mask, if_set, if_clear) \
  ((bits & (mask)) ? (if_set) : (if_clear))

  gm_printf ("ISR =\n"
	     "\t%s %s %s %s\n"
	     "\t%s %s %s %s\n"
	     "\t%s %s %s %s\n"
	     "\t%s %s\n",
	     PRINT_STATE_BIT_NAME (FREE_RECV_CHUNK,
				   "FREE_RECV_CHUNK", "free_recv_chunk"),
	     PRINT_STATE_BIT_NAME (RECV_INT_BIT,
				   "RECV_INT_BIT", "recv_int_bit"),
	     PRINT_STATE_BIT_NAME (BUFF_INT_BIT,
				   "BUFF_INT_BIT", "buff_int_bit"),
	     PRINT_STATE_BIT_NAME (SEND_INT_BIT,
				   "SEND_INT_BIT", "send_int_bit"),
	     PRINT_STATE_BIT_NAME (SDMA_PENDING,
				   "SDMA_PENDING", "sdma_pending"),
	     PRINT_STATE_BIT_NAME (TIME_INT_BIT,
				   "TIME_INT_BIT", "time_int_bit"),
	     PRINT_STATE_BIT_NAME (FREE_SEND_CHUNK,
				   "FREE_SEND_CHUNK", "free_send_chunk"),
	     PRINT_STATE_BIT_NAME (ACK_PENDING,
				   "ACK_PENDING", "ack_pending"),
	     PRINT_STATE_BIT_NAME (RDMA_PENDING,
				   "RDMA_PENDING", "rdma_pending"),
	     PRINT_STATE_BIT_NAME (RDMAING, "RDMAING", "rdmaing"),
	     PRINT_STATE_BIT_NAME (RECEIVING, "RECEIVING", "receiving"),
	     PRINT_STATE_BIT_NAME (SENDING, "SENDING", "sending"),
	     PRINT_STATE_BIT_NAME (SDMAING, "SDMAING", "sdmaing"),
	     PRINT_STATE_BIT_NAME (SEND_PENDING,
				   "SEND_PENDING", "send_pending"));

#undef PRINT_STATE_BIT_NAME
}

#if GM_ENABLE_DIRECTED_SEND

/* Shift the directed send header up in memory SHIFT bytes.

   HDR points at the start of the header.  The bytes moved are the
   route area (GM_MAX_NETWORK_DIAMETER bytes), the packet header, and
   the remote pointer.  The move is done with gm_reverse_bcopy ()
   because the source and destination ranges overlap whenever SHIFT is
   smaller than the header.

   Only the header is shifted, not the target address: the target
   address must not be written until after the DMA has completed,
   because it might be overwritten by (or overwrite) the DMA.

   (A disabled, hand-unrolled word-shifting variant used to live here
   under "#else".  It referenced an undeclared variable and could not
   be compiled, so it has been dropped; see the revision history if it
   is ever needed again.) */

static inline void
shift_directed_header (gm_u32_t * hdr, unsigned shift)
{
  const int header_len = (GM_MAX_NETWORK_DIAMETER
			  + sizeof (gm_packet_header_t)
			  + sizeof (gm_remote_ptr_t));
  gm_u8_t *from, *to;

  from = (gm_u8_t *) hdr;
  to = (gm_u8_t *) hdr + shift;

  gm_reverse_bcopy (from, to, header_len);
}
#endif /* GM_ENABLE_DIRECTED_SEND */

/* Unlink PORT from the circular queue of ports to poll.  Does nothing
   if PORT is not in the queue (its next_to_poll link is null).  If
   PORT is the only member, the queue is marked empty and the poll
   event handler is switched to the idle handler.  PORT's link is
   cleared on return. */
static void
remove_port_from_polling_queue (gm_port_protected_lanai_side_t * port)
{
  gm_port_protected_lanai_side_t *successor = port->next_to_poll;

  if (successor == 0)
    return;			/* not queued */

  if (successor != port)
    {
      gm_port_protected_lanai_side_t *pred;

      /* Make sure the poller is not left pointing at the port being
         removed. */
      if (gm.poll.port == port)
	gm.poll.port = successor;

      /* Walk the ring to the port whose link refers to PORT, then
         bypass PORT. */
      pred = successor;
      while (pred->next_to_poll != port)
	pred = pred->next_to_poll;
      pred->next_to_poll = successor;
    }
  else
    {
      /* PORT was the sole member. */
      gm.poll.port = 0;		/* mark polling queue as empty */
      gm.handler[POLL_EVENT] = gm.poll.idle_handler;	/* stop polling */
    }

  port->next_to_poll = 0;	/* mark as not in queue */
}

/* Open PORT, which must be closed, not in the polling queue, and have
   no active subports.  The free send token list is rebuilt to hold
   every send token, the send token queue slot and the recv queue slot
   number are rewound, the port is added to the polling queue (starting
   polling if the queue was empty), and the port is marked open. */
static void
open_port (gm_port_protected_lanai_side_t * port)
{
  const unsigned int last_idx = GM_NUM_ELEM (port->_send_tokens) - 1;
  unsigned int idx;

  GM_PRINT (GM_DEBUG_CONNECTIONS, ("opening port %u\n", port->id));

  gm_assert (!port->open);
  gm_assert (!port->next_to_poll);
  gm_assert (port->active_subport_cnt == 0);

  /* Chain all send tokens, in array order, into the free list. */

  for (idx = 0; idx < last_idx; idx++)
    port->_send_tokens[idx].common.next = &port->_send_tokens[idx + 1];
  port->_send_tokens[last_idx].common.next = 0;
  port->first_free_send_token = &port->_send_tokens[0];
  port->last_free_send_token = &port->_send_tokens[last_idx];
  port->send_token_queue_slot = &port->PORT->send_token_queue[0];

  /* Rewind the recv queue. */

  port->recv_queue_slot_num = 0;

  /* Add the port to the polling ring. */

  if (gm.poll.port)
    {
      /* Splice in right after the port currently being polled. */
      port->next_to_poll = gm.poll.port->next_to_poll;
      gm.poll.port->next_to_poll = port;
    }
  else
    {
      /* First member: the ring is just this port. */
      port->next_to_poll = port;
      gm.poll.port = port;
      gm.handler[POLL_EVENT] = gm.poll.active_handler;	/* start polling */
    }

  port->open = -1;
}

/* Initialize port number P from scratch.  Both the unprotected
   (GM_PORT[P]) and the protected (gm_port[P]) halves are zeroed, the
   token queue slots are pointed at the start of their queues, the free
   send and recv token lists are built, every recv queue slot DMA
   address is pointed at the bogus RDMA page, and the port is left
   closed and out of the polling queue. */
static void
gm_init_port (unsigned p)
{
  gm_port_protected_lanai_side_t *const port = &gm_port[p];
  unsigned int idx;

  GM_PRINT (GM_DEBUG_CONNECTIONS, ("initializing port %u\n", p));

  /* Unprotected half: everything cleared. */

  gm_assert (GM_PORT);
  gm_bzero (&GM_PORT[p], sizeof (gm_port_unprotected_lanai_side_t));

  /* Protected half: cleared, then selectively filled in.  Fields not
     mentioned below stay zero. */

  gm_bzero (port, sizeof (gm_port_protected_lanai_side_t));

  port->send_token_queue_slot = &GM_PORT[p].send_token_queue[0];
  port->recv_token_queue_slot = &GM_PORT[p].recv_token_queue[0];

  /* Free send tokens: chained in array order.  The last token's link
     is already zero from the clear above. */

  port->first_free_send_token = &port->_send_tokens[0];
  idx = 0;
  while (idx < GM_NUM_ELEM (port->_send_tokens) - 1)
    {
      port->_send_tokens[idx].common.next = &port->_send_tokens[idx + 1];
      idx++;
    }
  port->last_free_send_token = &port->_send_tokens[idx];

  /* Free recv tokens: chained the same way. */

  port->free_recv_tokens = &port->_recv_tokens[0];
  idx = 0;
  while (idx < GM_NUM_ELEM (port->_recv_tokens) - 1)
    {
      port->_recv_tokens[idx].next = &port->_recv_tokens[idx + 1];
      idx++;
    }

  /* Point every recv queue slot at the bogus page so that DMAs to the
     queue are harmless. */

  idx = 0;
  while (idx < GM_NUM_ELEM (port->recv_queue_slot_dma_addr))
    {
      port->recv_queue_slot_dma_addr[idx] = gm.page_hash.bogus_rdma_ptr;
      idx++;
    }

  port->sent.type = GM_NEW_SENT_TOKENS_EVENT;

  port->id = p;
  port->PORT = &GM_PORT[p];
  port->next_to_poll = 0;
  port->open = 0;
}

/* Reset the port such that all messages will be rejected, and all
   sends are aborted.

   PORT_ID selects the port.  If it is the mapper port, the mapper level
   in the scout reply is set to 0 even when the port is not open.
   Nothing else is done for a port that is not open.  Otherwise:

   - the unprotected port state is cleared and any alarm is unset;
   - the port is marked closed;
   - every disabled subport of the port is re-enabled and its queued
     send tokens are returned with status GM_SEND_PORT_CLOSED;
   - the send token queue is cleared and the free recv token list is
     rebuilt;
   - every recv queue slot in host memory is overwritten with zeros by
     DMA, and the slot DMA addresses are pointed at the bogus page;
   - ethernet receive/send state is reset for the ethernet port;
   - remaining global references to the port are removed.

   The active subport count is saved on entry, decremented once for
   each subport re-enabled here, and stored back on exit. */

#define GM_DEBUG_RESET_PORT 0

static void
reset_port (unsigned port_id)
{
  unsigned int i;
  gm_port_protected_lanai_side_t *port;
  unsigned int active_subport_cnt;

  GM_PRINT (GM_DEBUG_CONNECTIONS, ("resetting port %u\n", port_id));
  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ "() called\n"));
    }

  /* When the mapper port is closed, set the mapper level for this
     node to 0. */

  if (port_id == GM_MAPPER_PORT_ID)
    {
      gm.mapper_state.scout_reply.packet.level = 0;
    }

  port = &gm_port[port_id];
  if (!port->open)
    return;

  /* preserve the active subport count */

  active_subport_cnt = port->active_subport_cnt;
  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ " active_subport_cnt = %d\n"),
		 active_subport_cnt);
    }

  /************
   * reset unprotected port state
   ************/

  /* Clear any token in the user queues. */

  gm_assert (GM_PORT);
  gm_bzero (&GM_PORT[port_id], sizeof (GM_PORT[port_id]));

  /************
   * Reset the protected port state
   ************/

  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ " unsetting alarm\n"));
    }

  if (port->alarm_set)
    gm_unset_alarm (port);

  /* mark the port as closed. */

  port->privileged = 0;
  port->wake_host = 0;
  port->enable_nack_down_flag = 0;
  port->open = 0;

  /* Arrange for all sends for the port to be dequeued the next time
     they are acked, nacked, or timed out.  The driver will prevent
     the port from being opened until this is done by checking
     port->active_subport_cnt is zero before allowing the port to be
     opened.

     Find any associated subport that has been disabled, and reenable
     it, allowing the messages to be resent and accepted or dropped. */

  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ " deactivating connections\n"));
    }

  if (GM_DEBUG)
    check_subport_cnt ();

  if (gm.first_active_connection)
    {
      gm_u32_t c_cnt = 0;
      gm_connection_t *c;

      gm_assert (active_subport_cnt);

      /* count the open connections */

      c = gm.first_active_connection;
      do
	c_cnt++;
      while ((c = c->next_active) != gm.first_active_connection);

      /* scan connections.  The counts are taken up front and the
         successors are saved before each step because the lists may
         change while tokens are being removed. */

      while (c_cnt--)
	{
	  gm_connection_t *next_c = c->next_active;
	  gm_subport_t *sp = c->first_active_send_port;
	  gm_u32_t sp_cnt = 0;

	  /* Checked before the first dereference of SP below. */

	  gm_assert (sp);

	  /* count the subports for the connection */

	  do
	    {
	      sp_cnt++;
	      sp = sp->next;
	    }
	  while (sp != c->first_active_send_port);

	  /* scan the subports */

	  while (sp_cnt--)
	    {
	      gm_subport_t *next_sp;

	      next_sp = sp->next;
	      if (GM_SUBPORT_PORT (sp->id) == port_id)
		{
		  GM_PRINT (GM_DEBUG_CONNECTIONS,
			    ("enabling subport %u\n", (int) sp->id));
		  if (sp->disabled)
		    {
		      gm_send_token_t *st;

		      /* If we used a hack to keep the disabled
		         subport around, use another hack to perform
		         the deferred
		         remove_first_send_token_from_send_queue() */

		      if (sp->first_send_token == 0)
			{
			  gm_send_token_t hack;

			  gm_bzero (&hack, sizeof (hack));
			  hack.common.subport = sp;
			  sp->first_send_token = &hack;
			  remove_first_send_token_from_send_queue (&hack,
								   sp->id);
			}

		      sp->disabled = 0;
		      while ((st = sp->first_send_token) != 0)
			{
			  remove_first_send_token_from_send_queue
			    (st, sp->id);
			  pass_sent_token_to_port_and_free
			    (st, &gm.port[GM_SUBPORT_PORT (sp->id)], 0,
			     GM_SEND_PORT_CLOSED);
			}
		      active_subport_cnt--;
		    }
		}
	      sp = next_sp;
	    }
	  c = next_c;
	}
    }
  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ " done deactivating connections\n"));
    }

  /* defer resetting the send token queues until the port is opened.
     Only clear the queue. */

  for (i = 0; i < GM_NUM_ELEM (port->_send_tokens); i++)
    port->PORT->send_token_queue[i].type = 0;

  /* Remove recv tokens for receives on this port */

  /* Free recv tokens in hash table. */
  recv_token_hash_remove_port_references (port_id);
  /* Free recv tokens in recv queue */

  port->free_recv_tokens = &port->_recv_tokens[0];

  for (i = 0; i < GM_NUM_RECV_TOKENS - 1; i++)
    port->_recv_tokens[i].next = &port->_recv_tokens[i + 1];

  /* Terminate the rebuilt free list.  Unlike gm_init_port (), nothing
     has zeroed the tokens here, so the last link could otherwise still
     hold a stale pointer from earlier use. */
  port->_recv_tokens[i].next = 0;

  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR (__FUNCTION__ "3\n"));
    }

  for (i = 0; i < GM_NUM_SIZES; i++)
    {
      port->free_recv_token[GM_LOW_PRIORITY][i] = 0;
      port->free_recv_token[GM_HIGH_PRIORITY][i] = 0;
    }

  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR ("clearing recv queue\n"));
    }

  /* Clear the recv queue in host memory. */
  {
    /* Oversized by one DMA granule so that a DMA-aligned slot fits
       inside wherever the buffer happens to start. */
    gm_u8_t reserved_space[sizeof (gm_recv_queue_slot_t) +
			   GM_DMA_GRANULARITY];
    gm_recv_queue_slot_t *clear;

    clear = GM_DMA_ROUNDUP (lp, reserved_space);
    gm_bzero (clear, sizeof (*clear));
    for (port->recv_queue_slot_num = 0;
	 port->recv_queue_slot_num < GM_NUM_RECV_QUEUE_SLOTS;
	 port->recv_queue_slot_num++)
      {
	gm_dp_t ear;

	await_free_DMA_engine ();
	ear = RECV_QUEUE_SLOT_DMA_ADDR (port);
	gm_assert (GM_DMA_ALIGNED (sizeof (*clear)));
	if (ear)		/* catch uninitialized port closure case */
	  start_GRANULAR_RDMA (clear, ear, (unsigned int) sizeof (*clear));
      }

    port->recv_queue_slot_num = 0;
    port->recv_token_queue_slot = &GM_PORT[port_id].recv_token_queue[0];
  }
  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR ("cleared\n"));
    }

  /* Initialize the recv queue such that DMAs to it are harmless */

  for (i = 0; i < GM_NUM_RECV_QUEUE_SLOTS; i++)
    port->recv_queue_slot_dma_addr[i] = gm.page_hash.bogus_rdma_ptr;

  if (port_id == GM_ETHERNET_PORT_ID)
    {
      unsigned int j;
      /* this is probably not complete, there may be some other
         variables that need to be reset */
      /* update 1999/07/16: resetting token_slot may be the last one */
      for (i = 0; i < GM_NUM_ETHERNET_RECV_TOKENS; i++)
	for (j = 0; j < GM_MAX_ETHERNET_SCATTER_CNT; j++)
	  gm.ethernet.recv.token[i].segment[j].len = 0;
      gm.ethernet.recv.token_slot = &gm.ethernet.recv.token[0];
      gm.ethernet.recv.remaining_len = 0;
      gm.ethernet.send.gather_cnt = 0;
    }

  /* next_with_sent_packets updated when sends complete
     active_subport_cnt updated when sends complete */

  /* Remove the port from the list of ports with sent messages. */

  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR ("updating sent queues\n"));
    }
  update_host_sent_queues ();
  if (GM_DEBUG_RESET_PORT)
    {
      gm_printf (GM_STR ("updated sent queues\n"));
    }

  /* port->next_with_alarm updated above */
  /* port->alarm_time updated above */
  port->id = port_id;

  port->unacceptable_recv_sizes[GM_LOW_PRIORITY] = 0;
  port->unacceptable_recv_sizes[GM_HIGH_PRIORITY] = 0;

  /* port->PORT unchanged */

  /************
   * Remove remaining global references to the port
   ************/

  if (gm.registered_raw_recv_port == port)
    gm.registered_raw_recv_port = 0;
  remove_port_from_polling_queue (port);

  /* restore the active subport count */

  port->active_subport_cnt = active_subport_cnt;
}

/**********************************************************************/

/* Advance gm.ethernet.send.target to the index of the next node to
   send a packet to when simulating a broadcast, or set it to 0 when
   there are no more.  The next target is the lowest node index above
   the current one, and below gm.max_node_id, that has a route and is
   not this node. */

static inline void
gm_ethernet_compute_next_broadcast_target (void)
{
  gm_u32_t candidate;

  gm_puts ("gm_ethernet_compute_next_broadcast_target\n");

  for (candidate = gm.ethernet.send.target + 1;
       candidate < gm.max_node_id;
       candidate++)
    {
      /* Skip nodes without a route, and skip ourselves. */
      if (GM_CONNECTION_HAS_ROUTE (&gm_connection[candidate])
	  && candidate != gm.this_node_id)
	{
	  gm.ethernet.send.target = candidate;
	  return;
	}
    }

  /* Ran off the end of the node table: the broadcast is complete. */
  gm.ethernet.send.target = 0;
}

#if GM_MAX_NETWORK_DIAMETER != 24
#error Code does not support GM_MAX_NETWORK_DIAMETER != 24.
#endif

/* Copy one complete route (GM_MAX_NETWORK_DIAMETER == 24 bytes, i.e.
   six 32-bit words) from _FROM to _TO.  Both pointers must be 4-byte
   aligned.

   The loads and stores are hand-scheduled in groups sized for the
   pipeline of the target LANai; keep their order when editing. */

static inline void
copy_route (gm_u8_t * _from, gm_u8_t * _to)
{
  gm_u32_t *src, *dst;
  gm_u32_t w0, w1;

  gm_assert (GM_MAX_NETWORK_DIAMETER == 24);

  gm_assert (GM_ALIGNED (_from, 4));
  src = (gm_u32_t *) _from;

  gm_assert (GM_ALIGNED (_to, 4));
  dst = (gm_u32_t *) _to;

#if defined lanai7
  /* LANai7 pipeline has 2 delay slots: move the words three at a
     time. */

  {
    gm_u32_t w2;

    w0 = src[0];
    w1 = src[1];
    w2 = src[2];
    dst[0] = w0;
    dst[1] = w1;
    dst[2] = w2;
    w0 = src[3];
    w1 = src[4];
    w2 = src[5];
    dst[3] = w0;
    dst[4] = w1;
    dst[5] = w2;
  }
#else
  /* LANai{3,4,5,6} pipeline has 1 delay slot: move the words two at a
     time. */

  w0 = src[0];
  w1 = src[1];
  dst[0] = w0;
  dst[1] = w1;
  w0 = src[2];
  w1 = src[3];
  dst[2] = w0;
  dst[3] = w1;
  w0 = src[4];
  w1 = src[5];
  dst[4] = w0;
  dst[5] = w1;
#endif
}

#if GM_ENABLE_ETHERNET
/* Write the route of connection C in front of a send chunk at TO,
   which must be 4-byte aligned.

   If MARK is zero the route is copied verbatim to TO with
   copy_route().  If MARK is nonzero the route is instead stored two
   bytes earlier, starting at TO-2, and the 16-bit
   GM_ETHERNET_PACKET_TYPE is stored right behind it at TO+22. */
static inline void
gm_ethernet_copy_route_from_connection (gm_connection_t * c,
					gm_u8_t * to, int mark)
{
  gm_u32_t w0, w1;
  gm_u32_t *route_word;
  gm_u16_t *half;

  gm_assert (GM_ALIGNED (to, 4));

  if (!mark)
    {
      /* Plain case: word-for-word copy. */
      gm_puts ("Copying route.\n");
      LOG_DISPATCH(0,"gm_ethernet_copy_route and NOT marking");
      copy_route (c->route, to);
      return;
    }

  gm_puts ("Copying route and marking.\n");

  LOG_DISPATCH(0,"gm_ethernet_copy_route and marking");
#if GM_MAX_NETWORK_DIAMETER != 24
#error GM_MAX_NETWORK_DIAMETER != 24
#endif

  route_word = (gm_u32_t *) c->route;
  half = (gm_u16_t *) to;

  /* Read the route two words at a time and store it as halfwords,
     displaced by one halfword toward lower addresses.  The very first
     halfword lands in the pad area in front of TO (actually in the
     smlt field, which is overwritten later). */
  w0 = route_word[0];
  w1 = route_word[1];
  half[-1] = w0 >> 16;
  half[0] = w0;
  half[1] = w1 >> 16;
  half[2] = w1;
  w0 = route_word[2];
  w1 = route_word[3];
  half[3] = w0 >> 16;
  half[4] = w0;
  half[5] = w1 >> 16;
  half[6] = w1;
  w0 = route_word[4];
  w1 = route_word[5];
  half[7] = w0 >> 16;
  half[8] = w0;
  half[9] = w1 >> 16;
  half[10] = w1;

  /* record type */
  half[11] = GM_ETHERNET_PACKET_TYPE;
}

static inline void
gm_ethernet_maybe_done (void)
{
  if (gm.ethernet.send.token.ethernet.type == GM_ST_ETHERNET_SEND
      || gm.ethernet.send.target == 0)
    {
      prepare_to_interrupt ("ethernet sent ");
      gm_interrupt (GM_ETHERNET_SENT_INTERRUPT);
      gm_puts ("interrupted host with ethernet sent interrupt\n");

      {
	gm_port_protected_lanai_side_t *port;
	gm_port_unprotected_lanai_side_t *PORT;
	gm_send_token_t *next;

	port = &gm.port[GM_ETHERNET_PORT_ID];
	PORT = &GM_PORT[GM_ETHERNET_PORT_ID];

	next = port->first_free_send_token;
	;
	;
	if (next)
	  {
	    /* record the next send token slot to poll.  ??? should we do
	       this is pass_sent_token_to_port_and_free? */
	    port->send_token_queue_slot
	      = &PORT->send_token_queue[next - &port->_send_tokens[0]];
	  }
	else
	  {
	    gm_putstring ("gm_recycle_first_send_token: faking\n");

	    /* make the MCP poll a fake host send token location where it is
	       guaranteed not to find a send. */
	    gm_assert (GM_NO_SEND_EVENT == 0);
	    port->send_token_queue_slot = FAKE_SEND_QUEUE_SLOT ();
	  }
      }

      gm_assert (gm.ethernet.send.busy);
      gm.ethernet.send.busy = 0;
    }
}
#endif /* GM_ENABLE_ETHERNET */

/**********************************************************************/

/* Reset a previously initialized connection C.

   Restarts the send session/sequence numbers (sesno 0, seqno 1),
   clears the sequence number carried by the connection's prebuilt ack
   packet, and backdates the open and close timestamps.  The fields
   that must already be valid (ack packet type and sender, empty send
   record list) are verified with always-on assertions rather than
   rewritten. */

static void
gm_connection_reset (gm_connection_t * c)
{
  gm_always_assert (c);

  c->send_sexno.parts.sesno = 0;
  c->send_sexno.parts.seqno = 1;

  /* The ack packet header was filled in when the connection was
     initialized; only check it here. */
  gm_always_assert (c->ack_packet.type == GM_PACKET_TYPE);

  c->ack_packet.subtype = GM_ACK_SUBTYPE;
  c->ack_packet.sexno.whole = 0;

  gm_always_assert (c->ack_packet.sender_node_id == gm.this_node_id);

#if 0
  c->first_active_send_port = 0;
#endif

  /* A connection with queued send records must not be reset. */
  gm_always_assert (c->first_send_record == 0);

  /* Place both timestamps more than GM_CONNECTION_TIMEOUT in the past.
     rtc64 () is read separately for each, so the two values may differ
     slightly. */
  c->open_time = rtc64 () - GM_CONNECTION_TIMEOUT - 1;
  c->close_time = rtc64 () - GM_CONNECTION_TIMEOUT - 1;
}

/* Prepare a zero-filled connection C for first use.

   Only fields whose initial value is nonzero are written; the
   following are left at the 0 they already hold: ack_pending,
   next_to_ack, send_sexno.parts.sesno, route, ack_packet.sexno.whole,
   active_subport_bitmask, first_active_send_port, first_send_record
   and last_send_record.

   GM_PORT must already be set, and C must lie entirely below
   &GM_PORT[0]. */

static void
_gm_connection_init (gm_connection_t * c)
{
  /* 2^32 RTC ticks: how far into the past open_time and close_time
     are placed. */
  const gm_u64_t long_ago = (gm_u64_t) 1 << 32;

  gm_always_assert ((GM_PORT != 0));
  gm_always_assert ((unsigned int) (c + 1) <= (unsigned int) &GM_PORT[0]);

  /* Not on the active list, and no route known yet. */
  c->next_active = 0;
  c->prev_active = 0;
  GM_CONNECTION_CLEAR_ROUTE (c);

  /* Force sequence number mismatch to cause connection reset to
     be detected. */
  c->send_sexno.parts.seqno = 1;

  /* Prebuild the ack packet for this connection.  The target node ID
     is simply the connection's index in gm_connection[]. */
  c->ack_packet.type = GM_PACKET_TYPE;
  c->ack_packet.subtype = GM_ACK_SUBTYPE;
  c->ack_packet.target_node_id = c - &gm_connection[0];
  /* We don't know gm.this_node_id yet, so this just sets it to 0: */
  c->ack_packet.sender_node_id = gm.this_node_id;

  c->open_time = rtc64 () - long_ago;
  c->close_time = rtc64 () - long_ago;
}

/**********************************************************************/

/* One-time initialization of the MCP.

   In order, this function:
     - refuses to run twice (spins forever on a second call);
     - initializes the LANai hardware, GM_STATE and IMR;
     - fills the gm_event_index table and records its CRC;
     - publishes &gm to the host through SMP, waits for the host to
       supply gm.sram_length, and from that derives GM_PORT,
       gm.max_node_id and the page hash cache location;
     - waits for the host to clear SMP;
     - initializes the globals, the ports, the mapper reply packets and
       every connection;
     - posts the first receive, sends an ack to this node so that
       SEND_INT_BIT gets set, and times a series of bogus DMAs;
     - finally sets gm.initialized.

   Progress markers of the form 0x1234NNNN are stored at address 0
   along the way. */
static void
gm_initialize (void)
{
  unsigned int h, i, j;
  static int initialize_called;

  /* Make sure the LANai is not re-initialized.  This can happen if,
     for example, the LANai is reset or accidentally branches to
     memory location 0. */

  /* Macros for temporary salting of location 0 with the line number */
#define GM_ENABLE_SALT_LINENO 0
#if GM_ENABLE_SALT_LINENO
#define GM_SALT_LINENO \
  *(gm_u32_t *) 0 = (0x43210000 + __LINE__);
#else
#define GM_SALT_LINENO /* nothing */
#endif

  *(gm_u32_t *) 0 = 0x12340000;
  /* A second call hangs here forever instead of re-initializing. */
  while (initialize_called)
    ;
  initialize_called = 1;
  *(gm_u32_t *) 0 = 0x12341111;

  /****************
   * Pre-initialization
   ****************/

  /* The host cleared all of the LANai SRAM before loading the MCP, so
     it is safe to assume that all uninitialized variables are cleared. */


  /* Initialize ubiquitous globals */

#if GM_ENABLE_MCP_GLOBAL_REGS
  gmptr = &_gmp.globals;
  gm_connection = gm.connection;
  gm_port = gm.port;
  /*  GM_PORT = gm._PORT; */
#endif

  /****************
   * Hardware Initialization
   ****************/

  gm_init_lanai_hardware ();

  /****************
   * CPU Register setup
   ****************/

  /********************* CPU Register setup *********************/

  /* Clear the globals that *might* be in hard registers */

  GM_STATE = 0;
  GM_PORT = 0;			/* reinitialized below, once the SRAM
				   length is known. */


  *(gm_u32_t *) 0 = 0x12342222;

  /****************
   * State Machine initialization
   ****************/

  /* Set IMR to participate in DISPATCH( ) */

  set_IMR (IMR_INIT_VAL);

  /* Initialize State */

  NOTICE (FREE_RECV_CHUNK + FREE_SEND_CHUNK);

  NOTICE_NO (RDMA_PENDING
	     + RDMAING
	     + RECEIVING
	     + SDMAING + SEND_PENDING + ACK_PENDING + SENDING + SDMA_PENDING);

  /* Set up gm_event_index table to convert between state and event
     indices.  I.e.: the_index=gm_event_index[state].

     The tests below are ordered by priority: the first one that
     matches a state value I decides the event for that state. */

  for (i = 0; i < (BIGGEST_STATE_BIT << 1); i++)
    {
      /* Highest priority so queues will be rewound when needed.
         Timer events happen only when no acks are pending. */

      if (i & TIME_INT_BIT && i & SEND_INT_BIT)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (TIMER_EVENT, i);
	  continue;
	}

      /* RDMA completion.  DMA_INT_BIT is only part of this test when
         not building for the LANai CPU. */

      if (1
#if !GM_CPU_lanai		/* Support embedded DMA emulation */
	  && i & DMA_INT_BIT
#endif
	  && i & RDMAING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (FINISH_RDMA_EVENT, i);
	  continue;
	}

      /* receive events */

      if (i & RECV_INT_BIT && i & RECEIVING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (FINISH_RECV_PACKET_EVENT, i);
	  continue;
	}
      if (i & BUFF_INT_BIT && i & RECEIVING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (RECV_BUFFER_OVERFLOW_EVENT, i);
	  continue;
	}
      if (~i & RECEIVING && i & FREE_RECV_CHUNK)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (START_RECV_PACKET_EVENT, i);
	  continue;
	}

      /* send events */

      if (i & SEND_INT_BIT && i & SENDING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (FINISH_SEND_EVENT, i);
	  continue;
	}
      if (i & SEND_INT_BIT && ~i & SENDING && i & ACK_PENDING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (SEND_ACK_EVENT, i);
	  continue;
	}
      if (i & SEND_INT_BIT && ~i & SENDING && i & SEND_PENDING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (START_SEND_EVENT, i);
	  continue;
	}

      /* SDMA completion events */

      if (1
#if !GM_CPU_lanai		/* support embedded DMA emulation */
	  && i & DMA_INT_BIT
#endif
	  && i & SDMAING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (FINISH_SDMA_EVENT, i);
	  continue;
	}

      /* Interlocked SDMA/RDMA state machine fairness events */

      /* NOTE(review): unlike every other test in this loop, the one
         below has no "continue", so the switch that follows always
         stores an event for this state again (possibly a different
         one).  Confirm whether a "continue" is missing here. */
#ifdef POLL_PENDING
      if (i & POLL_PENDING)
	{
	  GM_SET_EVENT_INDEX_FOR_STATE (POLL_EVENT, i);
	}
#endif

      {
	/* BITS collects which of SDMA_PENDING / RDMA_PENDING could be
	   started in state I: no DMA may be in progress, and (on the
	   LANai CPU only) DMA_INT_BIT must be set.  Starting an SDMA
	   additionally needs a free send chunk. */
	int bits;

	bits = 0;

	if (1
#if GM_CPU_lanai
	    && i & DMA_INT_BIT
#endif
	    && ~i & SDMAING
	    && ~i & RDMAING && i & FREE_SEND_CHUNK && i & SDMA_PENDING)
	  bits |= SDMA_PENDING;

	if (1
#if GM_CPU_lanai
	    && i & DMA_INT_BIT
#endif
	    && ~i & SDMAING && ~i & RDMAING && i & RDMA_PENDING)
	  bits |= RDMA_PENDING;

	/* Every possible value of BITS is handled explicitly; the
	   default case is a sanity check.  With nothing startable the
	   state maps to POLL_EVENT. */
	switch (bits)
	  {
#ifndef POLL_PENDING
	  case (SDMA_PENDING + RDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (FAIR_SDMA_RDMA_EVENT, i);
	    continue;
	  case (SDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (FAIR_SDMA_EVENT, i);
	    continue;
	  case (RDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (FAIR_RDMA_EVENT, i);
	    continue;
	  case (0):
	    GM_SET_EVENT_INDEX_FOR_STATE (POLL_EVENT, i);
	    continue;
#else /* POLL_PENDING defined */
	  case (SDMA_PENDING + RDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (FAIR_SDMA_RDMA_EVENT, i);
	    continue;
	  case (SDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (START_SDMA_EVENT, i);
	    continue;
	  case (RDMA_PENDING):
	    GM_SET_EVENT_INDEX_FOR_STATE (START_RDMA_EVENT, i);
	    continue;
	  case (0):
	    GM_SET_EVENT_INDEX_FOR_STATE (POLL_EVENT, i);
	    continue;
#endif /* POLL_PENDING defined */
	  default:
	    GM_ABORT ();
	  }
      }
    }
  /* Record a CRC of the finished table. */
  gm.event_index_table_crc = gm_crc (_gmp.event_index,
				     sizeof (_gmp.event_index));

  /********************************
   * Host synchronization...
   *
   * ... is performed as follows:
   * LANai sets SMP = &gm so the host can find the globals.
   * Host sets gm.sram_length so the LANai can compute max_node_id.
   * Host sets important values in LANai SRAM, including some that
   *    require max_node_id to create, and then clears SMP.
   ********************************/

  /* Tell host where to find the lanai globals by putting a pointer
     in the SMP because it is
     (a) cleared during LANai reset
     (b) well-known to the host
     (c) guaranteed not to change until the host finds it.

     We also tag the beginning and end of the globals to allow the host
     to catch inconsistencies in the MCP size. */

  gm.magic = gm.end_magic = 0xcafebabe;
  gm.length = sizeof (gm);

  GM_STBAR ();

  SMP = &gm;

  GM_STBAR ();

  /* HACK to make sure "ZZZ" is not flashed unless host sleeps too long. */

  heartbeat ();

  /****************
   * High memory setup
   *
   * This is done early so the host can wait for max_node_id, which it
   * needs to initialize the host-side ethernet addr and hostname
   * tables.
   ****************/

  *(gm_u32_t *) 0 = 0x12343333;

  /* Wait for the host to set gm.sram_length */

  while (gm.sram_length == 0)
    {
      GM_STBAR ();
    }

  /* The per-port pages sit at the top of SRAM, just below the page
     hash piece reference table. */
  gm_assert (GM_PAGE_LEN == sizeof (gm_port_unprotected_lanai_side_t));
  gm_assert (gm.sram_length);
  GM_PORT
    = (gm_port_unprotected_lanai_side_t *) (gm.sram_length
					    - GM_PAGE_HASH_PIECE_REF_TABLE_LEN
					    - (GM_NUM_PORTS * GM_PAGE_LEN));

  /* Figure how much space to use for the connection array and how
     much for cached PTEs.  Also determine the location of the cached
     PTE table.

     The memory between gm.connection and GM_PORT is split as follows:
     (1) tentatively give half to connections, capped at
         GM_MAX_CONNECTION_CNT;
     (2) fit the cached PTE table into the rest, with an entry count
         that is a power of two plus one spare entry;
     (3) hand whatever is still left over back to connections, again
         subject to the cap. */

  {
    unsigned long available_memory;
    unsigned long cached_pte_cnt;
    unsigned long connection_cnt;
    unsigned long const GM_MAX_CONNECTION_CNT = 
#if L4 | L5 
						511+1;
#else
						3000+1;
#endif
	
    gm_assert (GM_PORT);
    available_memory = (char *) GM_PORT - (char *) gm.connection;
    connection_cnt = available_memory / 2 / sizeof (gm_connection_t);
    if (connection_cnt > GM_MAX_CONNECTION_CNT)
      {
	connection_cnt = GM_MAX_CONNECTION_CNT;
      }
    available_memory -= connection_cnt * sizeof (gm_connection_t);
    cached_pte_cnt = available_memory / sizeof (gm_cached_pte_t) - 1;
    if (!GM_POWER_OF_TWO (cached_pte_cnt))
      {
	/* round down to a power of two */
	cached_pte_cnt = 1 << (gm_log2_roundup (cached_pte_cnt) - 1);
      }
    available_memory -= (cached_pte_cnt + 1) * sizeof (gm_cached_pte_t);
    connection_cnt += available_memory / sizeof (gm_connection_t);
    if (connection_cnt > GM_MAX_CONNECTION_CNT)
      {
	connection_cnt = GM_MAX_CONNECTION_CNT;
      }

    /* Node IDs 0..max_node_id each get a connection; the cached PTE
       table starts right after the last connection. */
    gm.max_node_id = connection_cnt - 1;
    gm.page_hash.cache.entry = (void *) &gm.connection[connection_cnt];
    GM_MAX_PAGE_HASH_CACHE_INDEX = cached_pte_cnt - 1;
  }

  /****************
   * Get some important values from the host
   ****************/

  /* Wait for host initialization to complete.  The host sets the
     DMA_STS, BURST, and VERSION registers, as needed while we wait.  This
     takes several milliseconds. */

  while (SMP)
    {
      gm.while_waiting++;
      gm_flash_while_waiting ("ZZZ\n");
    }

  /* Verify the host set all it should have set. */

  gm_assert (gm.page_hash.bogus_sdma_ptr);
  gm_assert (gm.page_hash.bogus_rdma_ptr);
  gm_assert (gm.sram_length);

  *(gm_u32_t *) 0 = 0x12344444;
  
  /****************
   * global initialization
   *
   * Fields mentioned only in a comment of the form "gm.x = 0" are not
   * written: they rely on the SRAM having been cleared by the host.
   ****************/

  /* gm.magic set above during host synchronization */
  gm._PORT = GM_PORT;
  /* gm.dma_descriptor = {0}; */

#if L4 | L5
  /* nothing to do */
#elif L6 | L7 | L8 | L9
  gm.dma_descriptor.next_with_flags = (gm_lp_t) DMA_TERMINAL;
#else
#error Unrecognized CPU.
#endif

  gm.event_index_table = gm_event_index;	/* for debugging */
  /* gm.this_node_id = 0 */
  /* gm.port_to_wake = 0; */
  gm_assert (gm.page_hash.bogus_sdma_ptr);	/* set by host */
  gm_assert (gm.page_hash.bogus_rdma_ptr);	/* set by host */
#if GM_ENABLE_VM
  gm_init_page_table ();	/* gm.page_hash.cache */
#endif /* GM_ENABLE_VM */
  /* gm.first_port_with_sent_packets = 0; */

  /* Chain all subports into the free list; the last entry's next
     pointer is left as it is. */
  gm.free_subports = &gm._subport[0];
  for (j = 0; j < GM_NUM_SUBPORTS - 1; j++)
    gm._subport[j].next = &gm._subport[j + 1];

  gm.timeout_time = RTC;
  gm_assert (gm.sram_length);	/* set by host */

  /* gm._state initialized above */
  gm.free_recv_chunk_cnt = 2;
  /* gm.current_rdma_port = 0 */
  /* gm.registered_raw_recv_port = 0 */
  gm.free_send_chunk_cnt = 2;
  /* gm.first_connection_to_ack = 0; */

  /* gm.send_chunk = {0}; */
#if GM_CPU_lanai
  gm.send_chunk[0].send_list = gm.send_chunk[0].packet.as_gm.payload;
  gm.send_chunk[1].send_list = gm.send_chunk[1].packet.as_gm.payload;
  gm.send_chunk[0].send_list_end = gm.send_chunk[0].packet.as_gm.payload;
  gm.send_chunk[1].send_list_end = gm.send_chunk[1].packet.as_gm.payload;
#endif
  gm.send_chunk[0].packet.as_gm.header.type = GM_PACKET_TYPE;
  gm.send_chunk[1].packet.as_gm.header.type = GM_PACKET_TYPE;
  gm.send_chunk[0].cafebabe = 0xcafebabe;
  gm.send_chunk[1].cafebabe = 0xcafebabe;
  /* gm.recv_chunk = {0}; */
  gm.recv_chunk[0].cafebabe = 0xcafebabe;
  gm.recv_chunk[1].cafebabe = 0xcafebabe;
  /* gm.recv_token_bin = {0}; */
  /* gm.first_active_connection = 0; */

  /* Chain all send records into the free list, as for subports. */
  gm.free_send_records = &gm._send_record[0];
  for (j = 0; j < GM_NUM_SEND_RECORDS - 1; j++)
    gm._send_record[j].next = &gm._send_record[j + 1];

  /* gm._remaining_sdma_ctr = 0; */
  /* gm._remaining_rdma_ctr = 0; */
  /* gm.remaining_sdma_lar = 0; */
  /* gm.remaining_rdma_lar = 0; */
  /* gm.remaining_sdma_ear = 0; */
  /* gm.remaining_rdma_ear = 0; */

  /* gm.failed_send_dma_stage = {0}; */
  /* gm.failed_send_dma_stage.type = set dynamically */

  /* gm.report_dma_stage = {0}; */

  /* gm.nack_delay = 0 */
  gm.rand_seed = -1;
  /* gm.backlog_delay = 0 */
  /* gm.pause_rqst = 0 */
  /* gm.pause_ack = 0 */
  gm.port_to_close = -1;
  gm.port_to_open = -1;
  /* gm_failed_line = 0; */
  /* gm_failed_file = 0; */

/*
  gm.nack_cnt = 0;
  gm.nack_down_cnt = 0;
  gm.nack_reject_cnt = 0;
  gm.nack_received_cnt = 0;
  gm.nack_normal_cnt=0;
  gm.nack_send_nothing1_cnt=0;
  gm.nack_send_nothing2_cnt=0;
  gm.nack_send_open_connection_cnt=0;
  gm.nack_send_close_connection_cnt=0;
  gm.nack_receive_close_connection_cnt=0;
  gm.nack_ignore_close_connection_cnt=0;
  gm.handle_connection_reset_request_cnt=0;
  gm.nack_ignored_cnt = 0;
  gm.drop_cnt = 0;
  gm.resend_cnt = 0;
  gm.netrecv_cnt = 0;
  gm.too_small_cnt = 0;
  gm.bogus_header_cnt = 0;
  gm.out_of_sequence_cnt = 0;
*/
  /* gm._reserved_after_resent_cnt = 0 */
  /* gm.led = 0 */
  set_LED (0);

  for (i = 0; i < GM_NUM_PORTS; i++)
    gm_init_port (i);

  /* gm.finishing_rdma_for_port = 0; */
  /* gm.first_port_with_alarm = 0; */

  /* gm.record_log = 0; */
#if GM_LOG_DISPATCHES
  gm.log_slot = gm.log;
  gm.log_end = &gm.log[GM_LOG_LEN];
  gm.logtime_index = 0;
#endif
  /* gm.lzero = 0; */
  /* gm.log = {0} */
  gm.current_handler = "none";
  gm.current_handler_extra = "";
  /* gm.resume_after_halt = 0; */
  /* gm.volatile_zero = 0; */
  /* This loop nest has an empty body; it only documents that
     gm.dispatch_cnt relies on the cleared SRAM. */
  for (j = 0; j < 2; j++)
    for (i = 0; i < 128; i++)
      {
	/* gm.dispatch_cnt[i][j]=0; */
      }

  /* gm.hit_cnt = 0 */
  /* gm.miss_cnt = 0 */
  /* gm.pause_cnt = 0 */
  /* gm.hashed_token_cnt = 0 */

  /***
   * Begin mapping state initialization
   ***/

  /*** initialize the mapping scout message reply packet ***/
  /* gm.mapper_state.scout_reply.route_len = 0; */
  /* gm.mapper_state.scout_reply.in_send_queue = 0 */
  /* gm.mapper_state.scout_reply.route[*] = 0 */
  gm.mapper_state.scout_reply.packet.type = GM_MAPPING_PACKET_TYPE;
  gm.mapper_state.scout_reply.packet.subtype =
    GM_MAPPER_SCOUT_REPLY_PACKET_SUBTYPE;
  /* gm.mapper_state.scout_reply.packet.port = don't care */
  /* gm.mapper_state.scout_reply.packet.phase = don't care */
  /* gm.mapper_state.scout_reply.packet.address = initialized by host */
  gm.mapper_state.scout_reply.packet.option =
    GM_MAPPER_SCOUT_REPLY_PACKED_ROUTES_OPTION |
    GM_MAPPER_SCOUT_REPLY_LONG_HOSTNAME_OPTION;
  /*gm.mapper_state.scout_reply.packet.extendHostname = initialized by host */

  /* ??? Should the MCP initialize this? */
  gm.mapper_state.scout_reply.packet.node_type = 0;

  gm.mapper_state.scout_reply.send_token.common.type
    = GM_ST_MAPPER_SCOUT_REPLY;
  gm.mapper_state.scout_reply.send_token.common.sendable = 1;

  /*** initialize the mapping config message reply packet ***/
  /* gm.mapper_state.config_reply.route_len = 0; */
  /* gm.mapper_state.config_reply.in_send_queue = 0 */
  /* gm.mapper_state.config_reply.route[*] = 0 */
  gm.mapper_state.config_reply.packet.type = GM_MAPPING_PACKET_TYPE;
  gm.mapper_state.config_reply.packet.subtype =
    GM_MAPPER_CONFIG_REPLY_PACKET_SUBTYPE;
  /* gm.mapper_state.config_reply.packet.port = don't care */
  /* gm.mapper_state.config_reply.packet.phase = don't care */
  /* gm.mapper_state.config_reply.packet.address = initialized by host */
  /* gm.mapper_state.config_reply.packet.host_section = 0 */
  gm.mapper_state.config_reply.send_token.common.type
    = GM_ST_MAPPER_CONFIG_REPLY;
  gm.mapper_state.config_reply.send_token.common.sendable = 1;

  /***
   * End mapping state initialization
   ***/

  /* gm.ethernet = {0} */
  gm.ethernet.send.token.ethernet.sendable = 1;
  gm.ethernet.recv.token_slot = &gm.ethernet.recv.token[0];
  /* gm.trash = {0} */
  gm_assert (gm.max_node_id);	/* computed above from gm.sram_length */
  /* gm.end_magic set above during host synchronization */

  /********************* Connection initialization *********************/

  /* Initialize the connections, which start at gm.connection and lie
     below GM_PORT.  The number of connections is one more than
     gm.max_node_id, which was computed above from the host-supplied
     gm.sram_length. */

#if 0
  gm_printf ("%s:%d  gm.max_node_id = %d\n", __FILE__, __LINE__,
	     gm.max_node_id);
  gm_printf ("GM_PORT = %p, gm.connection = %p -> %p\n",
	  (char *) GM_PORT, (char *) gm.connection,
	  (char *) &gm.connection[gm.max_node_id]);
  gm_printf ("%d\n", (gm_connection_t *) GM_PORT - gm.connection);
#endif

  gm_always_assert ((char *) &gm.connection[gm.max_node_id] <=
		    (char *) GM_PORT);

  for (h = 0; h <= gm.max_node_id; h++)
    {
      _gm_connection_init (&gm_connection[h]);
      set_LED (RTC >> 17);
    }

  *(gm_u32_t *) 0 = 0x12345555;

  /********************* ISR setup *********************/

  /* Setup the first receive */

  RMP = gm.recv_chunk[0].packet.as_bytes;
  set_RML (&gm.recv_chunk[0].end);
  NOTICE (RECEIVING);

#ifdef BAD
  /* Verify ISR was reset */

  gm_assert ((get_ISR () & IMR & ~TIME_INT_BIT) == 0);
#endif

  /* Set the SEND_INT_BIT */

  /* Harmlessly send an ack to this node, causing send_int_bit to be set. */

  SMP = &gm_connection[gm.this_node_id].ack_packet;
  set_SMLT (&gm_connection[gm.this_node_id].ack_packet + 1);
  while (!(get_ISR () & SEND_INT_BIT))
    set_LED (RTC >> 17);


  *(gm_u32_t *) 0 = 0x12346666;

  /*
   * Set the DMA_INT_BIT in ISR by using the DMA engine.
   *
   * We take this opportunity to measure the performance of both
   * send and receive DMAs.
   */

  gm_assert (gm.page_hash.bogus_sdma_ptr);


#define MAX_LOOP 4

  /* Timing variant 0: MAX_LOOP passes, each moving GM_PAGE_LEN bytes
     in pieces of at most GM_MTU bytes.  The average RTC ticks per
     pass end up in gm.e2l_time[0] (SDMA) and gm.l2e_time[0] (RDMA). */
  {
    int loop;
    unsigned int start;

    gm.e2l_time[0] = gm.l2e_time[0] = 0;
    for (loop = MAX_LOOP; loop; loop--) 
      {
  
        start = RTC;
        {
          gm_s32_t remaining;
          for (remaining = GM_PAGE_LEN; remaining > 0; remaining -= GM_MTU)
            {
              GM_SALT_LINENO;
	      start_SDMA (gm.page_hash.bogus_sdma_ptr,
		          gm.send_chunk[0].packet.as_bytes,
		          (remaining<=GM_MTU ? remaining : GM_MTU), __LINE__);
              GM_SALT_LINENO;
  	      gm_puts("bogus send dma started...\n");
              GM_SALT_LINENO;
  	      await_free_DMA_engine ();
              GM_SALT_LINENO;
            }
        }
        gm.e2l_time[0] += (RTC - start);
        GM_SALT_LINENO;
      } /* for loop */

    GM_SALT_LINENO;
    gm_puts("bogus send dma ended...\n");
    GM_SALT_LINENO;
    /* The SDMA'd data is expected to begin with the word 0xaaaaaaaa. */
    gm_assert_p (*(gm_u32_t *)gm.send_chunk[0].packet.as_bytes == 0xaaaaaaaa);
    GM_SALT_LINENO;
    
    /* RDMA a page */
      
    GM_SALT_LINENO;
    gm_assert_p (gm.page_hash.bogus_rdma_ptr);
    GM_SALT_LINENO;
	
    for (loop = MAX_LOOP; loop; loop--) 
      {
        start = RTC;
        {
          gm_s32_t remaining;
          for (remaining = GM_PAGE_LEN; remaining > 0; remaining -= GM_MTU)
            {
	      start_RDMA (gm.send_chunk[0].packet.as_bytes,
		          gm.page_hash.bogus_rdma_ptr,
		          remaining<=GM_MTU ? remaining : GM_MTU);
  	      gm_puts("bogus recv dma started...\n");
  	      await_free_DMA_engine ();
            }
        }
        gm.l2e_time[0] += (RTC - start);
        gm_puts("bogus recv dma ended...\n");
      } /* for loop */

    /* Convert the totals into per-pass averages. */
    gm.e2l_time[0] = gm.e2l_time[0]/MAX_LOOP;
    gm.l2e_time[0] = gm.l2e_time[0]/MAX_LOOP;
  }


  GM_SALT_LINENO;

  {
    int loop;
    volatile unsigned int start;

    for (i = 1; i < 4; i++)
    {
     /* Timing variants, indexed by i (variant 0 was measured above):
      * i == 1: Repeat the variant 0 measurement for comparison.
      * i == 2: Swap bogus_sdma_ptr and bogus_rdma_ptr on every other
      *         one of the MAX_LOOP passes.
      * i == 3: Swap bogus_sdma_ptr and bogus_rdma_ptr on every other
      *         DMA within a pass.
      * To be added if desired: Alternate send and receive DMA.
      */  
      gm.e2l_time[i] = gm.l2e_time[i] = 0;
      gm_assert (gm.page_hash.bogus_rdma_ptr);
      for (loop = MAX_LOOP; loop; loop--)
      {
	{
	  register gm_s32_t remaining;
          register int      inner_count = 0;

	  start = RTC;
	  for (remaining = GM_PAGE_LEN; remaining > 0; remaining -= GM_MTU)
	  {
            if (((i == 2) && (loop % 2)) || ((i == 3) && (inner_count % 2)))
              {
	        start_SDMA (gm.page_hash.bogus_rdma_ptr,
			    gm.send_chunk[0].packet.as_bytes,
			    (remaining <= GM_MTU ? remaining : GM_MTU),
                             __LINE__);
              }
            else
              {
	        start_SDMA (gm.page_hash.bogus_sdma_ptr,
			    gm.send_chunk[0].packet.as_bytes,
			    (remaining <= GM_MTU ? remaining : GM_MTU),
                             __LINE__);
              }
	    await_free_DMA_engine ();
            inner_count++;
	  }
	}
        gm.e2l_time[i] += (RTC - start);

	if (GM_DEBUG
	    && *(gm_u32_t *) gm.send_chunk[0].packet.as_bytes != 0xaaaaaaaa)
	  {
	    gm_hex_dump (gm.send_chunk[0].packet.as_bytes, 64);
	    GM_ABORT ();
	  }

	/* RDMA a page */

	{
	  register gm_s32_t remaining;
          register int      inner_count = 0;

	  start = RTC;
	  for (remaining = GM_PAGE_LEN; remaining > 0; remaining -= GM_MTU)
	  {
            if (((i == 2) && (loop % 2)) || ((i == 3) && (inner_count % 2)))
              {
	        start_RDMA (gm.send_chunk[0].packet.as_bytes,
			  gm.page_hash.bogus_sdma_ptr,
			  remaining <= GM_MTU ? remaining : GM_MTU);
              }
            else
	      {
	        start_RDMA (gm.send_chunk[0].packet.as_bytes,
			  gm.page_hash.bogus_rdma_ptr,
			  remaining <= GM_MTU ? remaining : GM_MTU);
	      }
	    await_free_DMA_engine ();
            inner_count++;
	  }
	}
        gm.l2e_time[i] += (RTC - start);
      }				/* for loop */

     gm.e2l_time[i] = gm.e2l_time[i] / MAX_LOOP;
     gm.l2e_time[i] = gm.l2e_time[i] / MAX_LOOP;
     } /* for i */
#undef MAX_LOOP
  } /* end of the timing-variant block */

  set_IT (1);

  gm.current_handler = "0x12347777";
  *(gm_u32_t *) 0 = 0x12347777;
  GM_STBAR ();

  gm_puts ("gm_initialized\n");

#if GM_ENABLE_TRACE
  /* Round the trace buffer start up to a multiple of
     2 * GM_TRACEBUFSIZE and check that it still lies within
     gtracebufarea. */
  gtracebuf =
    (void *) (((unsigned) gtracebufarea + 2 * GM_TRACEBUFSIZE - 1) &
	      ~(2 * GM_TRACEBUFSIZE - 1));
  G_TRACEPTR = (unsigned) gtracebuf;
  gm.trace_index = 0;
  gm_always_assert (G_TRACEPTR >= (unsigned) gtracebufarea
		    && G_TRACEPTR <
		    (unsigned) (gtracebufarea) + sizeof (gtracebufarea));
#endif

#if 0
  gm_printf ("max_node_id = %d\n", gm.max_node_id);
#endif

  /* Re-check the layout invariant before declaring success. */
  gm_always_assert ((char *) &gm.connection[gm.max_node_id] <=
		    (char *) GM_PORT);

  gm.initialized = 1;
}

/****************************************************************
 * SDMA support functions
 ****************************************************************/

/* Commit to using send token ST, which must be the first entry of
   PORT's free send token list.

   ST is unlinked from the free list and the port's send queue polling
   slot is retargeted: at the host send queue slot that corresponds to
   the new first free token if there is one, otherwise at a fake slot
   where no send will ever be found. */

static inline void
gm_send_token_commit (gm_port_protected_lanai_side_t * port,
		      gm_send_token_t * st)
{
  gm_send_token_t *new_head;

  gm_assert (st);
  gm_assert (port->first_free_send_token == st);

  /* Pop ST off the head of the free list. */
  new_head = st->common.next;
  st->common.next = 0;
  port->first_free_send_token = new_head;

  if (!new_head)
    {
      gm_putstring ("gm_recycle_first_send_token: faking\n");

      /* Nothing left to hand out: make the MCP poll a fake host send
         token location where it is guaranteed not to find a send. */
      gm_assert (GM_NO_SEND_EVENT == 0);
      port->send_token_queue_slot = FAKE_SEND_QUEUE_SLOT ();
      GM_PRINT (GM_DEBUG_SEND_TOKENS, ("out of send tokens\n"));
      LOG_DISPATCH (145, "out of send tokens");
      return;
    }

  /* Poll the host queue slot with the same index as the new first
     free token.  ??? should we do this in
     pass_sent_token_to_port_and_free? */
  port->send_token_queue_slot
    = &port->PORT->send_token_queue[new_head - &port->_send_tokens[0]];
  GM_PRINT (GM_DEBUG_SEND_TOKENS,
	    ("looking for next send in slot 0x%x.\n",
	     (int) (new_head - &port->_send_tokens[0])));
  LOG_DISPATCH (145, "gm_send_token_commit:next send slot set\n");
}

/* Commit the first free send token ST of PORT with
   gm_send_token_commit (), then immediately give it back through
   pass_sent_token_to_port_and_free () with status GM_SUCCESS.
   NOTE(review): the meaning of the third argument (1) is not visible
   here -- confirm against pass_sent_token_to_port_and_free (). */

static inline void
gm_recycle_first_send_token (gm_port_protected_lanai_side_t * port,
			     gm_send_token_t * st)
{
  gm_send_token_commit (port, st);

  /* recycle it */

  pass_sent_token_to_port_and_free (st, port, 1, GM_SUCCESS);
}

/* Ethernet counterpart of gm_recycle_first_send_token (): commit the
   first free send token ST of PORT with gm_send_token_commit (), then
   return it with free_send_token () instead of
   pass_sent_token_to_port_and_free (). */

static inline void
gm_ethernet_recycle_first_send_token (gm_port_protected_lanai_side_t * port,
				      gm_send_token_t * st)
{
  gm_send_token_commit (port, st);

  /* recycle it */

  free_send_token (st, port);
}

/****************************************************************
 * Ethernet support functions
 ****************************************************************/

#if GM_EMULATE_BYTE_DMAS && (GM_DMA_GRANULARITY > 1)
/* Word-at-a-time funnel shift used by gm_ether_byte_dma_emul_shift ().

   Writes WORD_CNT words starting at TW.  Each output word combines two
   consecutive input words read from FW:

     out[k] = (in[k] << LS) | ((in[k + 1] >> RS) & RSM)

   so WORD_CNT + 1 words are read in total.  RSM masks the result of
   the right shift so that it behaves as a logical shift.

   This is an inline function to ensure the compiler detects logical
   shifts right, with a fast unrolled loop implementation: the switch
   jumps into the middle of the 16-way unrolled loop to handle
   WORD_CNT % 16 words first (Duff's device), after which the loop
   runs whole groups of 16.  The registers A and B alternate as
   "current" and "next" input word, which is why consecutive cases
   swap their roles. */
static inline void
_gm_ether_byte_dma_emul_shift_shift (gm_u32_t * fw,	/* from word */
				     gm_u32_t * tw,	/* to word */
				     int ls,	/* left shift amount */
				     int rs,	/* right shift amount */
				     gm_u32_t rsm,	/* right shift mask */
				     int word_cnt)	/* words to write */
{
  gm_u32_t a, b;

  /* The function name is spelled out because concatenating
     __FUNCTION__ with string literals is not accepted by current
     compilers. */
  gm_puts ("GM_EMULATE_BYTE_DMAS: "
	   "_gm_ether_byte_dma_emul_shift_shift called\n");

  /* Prime both registers with the first input word; whichever case is
     entered first uses the appropriate one as its "current" word. */
  a = b = *fw++;
  switch (word_cnt & 0xf)
    {
    case 0x0:
      while (word_cnt > 0)
	{
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0xf:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0xe:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0xd:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0xc:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0xb:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0xa:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0x9:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0x8:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0x7:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0x6:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0x5:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0x4:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0x3:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
    case 0x2:
	  b = *fw++;
	  *tw++ = (a << ls) | ((b >> rs) & rsm);
    case 0x1:
	  a = *fw++;
	  *tw++ = (b << ls) | ((a >> rs) & rsm);
	  word_cnt -= 0x10;
	}
    }
  /* This exit trace used to repeat "called". */
  gm_puts ("GM_EMULATE_BYTE_DMAS: "
	   "_gm_ether_byte_dma_emul_shift_shift returning\n");
}

/* Special realignment function that shifts the LEN bytes at FROM to
   memory location (FROM+SHIFT).  The byte pointers FROM and
   (FROM+SHIFT) must be in the same word, so SHIFT is in -3..3; a SHIFT
   of 0 is a no-op.

   NOTE: This function will corrupt any partword above or below the
   destination buffer, just like the LANai 4 DMA engine.  This is OK,
   since it is used to realign data that has just been DMAd by the
   LANai4 DMA engine, so the partwords are already corrupted.  Code
   before and after the DMA+realignment must save and restore these
   partwords.

   This function performs very efficient realignment, requiring about
   1 instruction per byte copied, by using only word reads and
   writes. */

static inline void
gm_ether_byte_dma_emul_shift (char *from, int shift, int len)
{
  gm_u32_t *tw;			/* to word */
  gm_u32_t *fw;			/* from word */
  int words_to_shift;

  /* The function name is spelled out because concatenating
     __FUNCTION__ with string literals is not accepted by current
     compilers. */
  gm_puts ("GM_EMULATE_BYTE_DMAS: gm_ether_byte_dma_emul_shift called.\n");

  gm_assert (((long) from & ~3) == ((long) (from + shift) & ~3));

  /* Both pointers start at the word that contains FROM.  When the data
     moves toward higher addresses, the first destination word also
     needs bytes from the word in front of it, so reading starts one
     word earlier. */
  fw = tw = (gm_u32_t *) ((long) from & ~3);
  if (shift > 0)
    fw--;

  words_to_shift = (len + 6) >> 2;	/* worst-case is (((3+len)+3)&~3)/4 */

  /* A shift of -N bytes and a shift of 4-N bytes use the same pair of
     bit shifts; they differ only in where reading starts (above). */
  switch (shift)
    {
    case -3:
    case 1:
      _gm_ether_byte_dma_emul_shift_shift (fw, tw, 24, 8, 0xffffff,
					   words_to_shift);
      break;
    case -2:
    case 2:
      _gm_ether_byte_dma_emul_shift_shift (fw, tw, 16, 16, 0xffff,
					   words_to_shift);
      break;
    case -1:
    case 3:
      _gm_ether_byte_dma_emul_shift_shift (fw, tw, 8, 24, 0xff,
					   words_to_shift);
      break;
    case 0:
      return;
    default:
      gm_always_assert (0);
    }

  gm_puts ("GM_EMULATE_BYTE_DMAS: gm_ether_byte_dma_emul_shift returning.\n");
}
#endif /* GM_EMULATE_BYTE_DMAS */

/****************************************************************
 * SDMA finishers.  These would be macros in gm_sdma.h, only we want
 * them to not be inlined for compact builds.
 ****************************************************************/

/* Last step of starting an SDMA for send record SR on connection C
   and subport SP: terminate SR's list link, advance C's first active
   send port to SP's successor and the global first active connection
   to C's successor (so the send lists rotate fairly), then NOTICE
   SDMAING.  Both successors are asserted to be non-null. */
static inline void
finish_finish_sdma_start (gm_send_record_t * const sr,
			  gm_connection_t * const c, gm_subport_t * const sp)
{
  /* Fetch both successors up front, ahead of any store. */
  gm_subport_t *const following_port = sp->next;
  gm_connection_t *const following_conn = c->next_active;

  /* SR is the tail of whatever list it was just linked onto. */
  sr->next = 0;

  /* Rotate the send list to be fair */
  gm_assert (following_port);
  c->first_active_send_port = following_port;
  gm_assert (following_conn);
  gm.first_active_connection = following_conn;

  NOTICE (SDMAING);
  gm_printf_p ("Sent a new packet.\n");
}

/* Finish setting up the packet held in gm.send_chunk[LZERO] and
   commit send record SR to it.

   SR      send record to use; asserted to be the head of
	   gm.free_send_records, from which it is removed here.
   C       connection the packet is sent on; supplies the route and
	   owns the list of send records SR is appended to.
   SP      subport, passed through to finish_finish_sdma_start().
   LZERO   index of the send chunk being filled.
   P__LENGTH  number of bytes following the packet header.
   ST      send token recorded in SR.

   The connection fields and RTC are read into locals before any
   store is made; keep that order when editing. */
static inline void
finish_sdma_start (gm_send_record_t * const sr, gm_connection_t * const c,
		   gm_subport_t * const sp, unsigned int lzero,
		   unsigned int p__length, gm_send_token_t * const st)
{
  /* prefetch */ gm_s32_t rtc;
  /* prefetch */ gm_u32_t c__route_len;
  /* prefetch */ gm_send_record_t *c__first_send_record;
  /* prefetch */ gm_send_record_t *c__last_send_record;

  /* prefetch */ c__route_len = c->route_len;
  /* prefetch */ c__first_send_record = c->first_send_record;
  /* prefetch */ c__last_send_record = c->last_send_record;
  /* prefetch */ rtc = RTC;

  /* End of the data to send: header plus P__LENGTH bytes.
     NOTE(review): the original comment here said "remember to round
     up length for misaligned lengths", but no rounding is performed
     in this function; presumably the caller passes an already rounded
     P__LENGTH -- confirm. */
  gm.send_chunk[lzero].smlt = (gm.send_chunk[lzero].packet.as_bytes
			       + sizeof (gm_packet_header_t) + p__length);

  /* Tell the send state machine what to send: transmission starts
     C__ROUTE_LEN bytes before the packet. */
  gm.send_chunk[lzero].smp = (gm.send_chunk[lzero].packet.as_bytes
			      - c__route_len);
#if GM_ENABLE_CRC32
  gm.send_chunk[lzero].smh
    = c__route_len ? (gm.send_chunk[lzero].packet.as_bytes - 1) : 0;
#endif

  /* Commit to using send record: pop it off the free list, then
     stamp it with the token and the time read above. */
  gm_assert (sr != sr->next);
  gm_assert (sr == gm.free_send_records);
  gm.free_send_records = sr->next;
  sr->send_token = st;
  sr->resend_time = rtc;

  /* Set the route. */

  copy_route (c->route, gm.send_chunk[lzero].route);

  /* Add send record to unacked send record list.  C__LAST_SEND_RECORD
     is only dereferenced when the list is non-empty. */
  if (c__first_send_record)
    {
      c->last_send_record = sr;
      c__last_send_record->next = sr;
    }
  else
    {
      c->first_send_record = sr;
      c->last_send_record = sr;
    }

  /* Previously each arm made this call and returned, leaving a third,
     unreachable copy of the call after the if/else. */
  finish_finish_sdma_start (sr, c, sp);
}

/****************************************************************
 * 
 ****************************************************************/

#if GM_ENABLE_GALVANTECH_WORKAROUND

#if 0
/* Compute a 32-bit one's complement checksum over the bytes from
   _START up to _LIMIT.

   NOTE: this function is currently compiled out by the enclosing
   "#if 0".

   The caller must make it OK to read 3 words past _limit, because the
   loops below prefetch three words ahead.

   NOTE(review): the original comment described the buffer as word
   aligned and said _start and _limit are assumed to be word aligned,
   yet the body has explicit cases for a misaligned _start and a
   misaligned _limit.  Confirm which contract is intended before
   re-enabling this code.

   NOTE(review): arithmetic on the "const void *" parameters
   (_limit - _start, _start + 3, ...) relies on the GNU C extension
   that treats sizeof (void) as 1. */

gm_u32_t
gm_galvantech_checksum (const void *_start, const void *_limit)
{
  gm_u32_t checksum = 0, a, b, c;
  const gm_u32_t *start, *limit;

  /* Macros to compute ones-complement checksums follow.  These may
     use the carry flag only internally, as the compiler might clobber
     it between macros. */

  /* ADDC (c, a): c += a, then a second add-with-carry instruction
     folds the carry out of that addition back into c (presumably
     %?r0 names a register that reads as zero -- confirm against the
     LANai assembler documentation). */
#define ADDC(c,a) do {							\
  asm ("add.f %1,%2,%0\n\t"						\
       "addc %0,%?r0,%0" : "=r" (c) : "r" (c), "r" (a) : "cc");		\
} while (0)

  /* ADDC_3: add the three already-loaded words a, b, c into checksum,
     carrying between the adds, and interleave each add with a
     post-incrementing load from start that refills the register just
     consumed.  A final addc folds in the last carry. */
#define ADDC_3(checksum, start, a, b, c) do {				\
  asm ("add.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc %0,%?r0,%0\n\t"						\
       : "=r"(checksum), "=r"(start), "=r"(a), "=r"(b), "=r"(c)		\
       : "0" (checksum), "1" (start), "2" (a), "3" (b), "4" (c)		\
       : "cc");								\
} while (0)

  /* ADDC_15: the same add/load pairing as ADDC_3, repeated five times
     over a, b, c so that fifteen words are consumed per invocation. */
#define ADDC_15(checksum, start, a, b, c) do {				\
  asm ("add.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc.f %0,%2,%0\n\t"						\
       "ld [%1++],%2\n\t"						\
       "addc.f %0,%3,%0\n\t"						\
       "ld [%1++],%3\n\t"						\
       "addc.f %0,%4,%0\n\t"						\
       "ld [%1++],%4\n\t"						\
       "addc %0,%?r0,%0\n\t"						\
       : "=r"(checksum), "=r"(start), "=r"(a), "=r"(b), "=r"(c)		\
       : "0" (checksum), "1" (start), "2" (a), "3" (b), "4" (c)		\
       : "cc");								\
} while (0)

  /* Optional debug dump of the whole input range. */
  if (GM_DEBUG_GALVANTECH_WORKAROUND)
    {
      gm_hex_dump (_start, _limit - _start);
    }

  /* handle misaligned start bytes: advance start to the next word
     boundary and add in only the low-order bytes of the word just
     before it (3, 2 or 1 of them, selected by the mask). */

  start = (gm_u32_t *) _start;	/* speculative */
  if ((long) _start & 3)
    {
      switch ((long) _start & 3)
	{
	case 1:
	  start = (gm_u32_t *) (_start + 3);
	  ADDC (checksum, start[-1] & 0x00ffffff);
	  break;
	case 2:
	  start = (gm_u32_t *) (_start + 2);
	  ADDC (checksum, start[-1] & 0x0000ffff);
	  break;
	case 3:
	  start = (gm_u32_t *) (_start + 1);
	  ADDC (checksum, start[-1] & 0x000000ff);
	  break;
	}
    }

  /* prefetch to allow perfect pipelining.  From here on, start runs
     three words ahead of the data actually summed. */

  a = *start++;
  b = *start++;
  c = *start++;

  /* compute limit of word copies */

  limit = (const gm_u32_t *) GM_ALIGN (lp, _limit, 4);

  /* precompensate for prefetching for start/limit comparisons */

  limit += 3;

  /* handle the bulk of the aligned core with an unrolled loop. */

  while (start < limit - 15)
    {
      ADDC_15 (checksum, start, a, b, c);
    }

  /* handle remaining bulk in smaller unrolled loop */

  while (start < limit - 3)
    {
      ADDC_3 (checksum, start, a, b, c);
    }

  /* handle remaining words: up to three prefetched words (a, b, c)
     may still be pending; add only those that lie before limit. */

  if (start++ < limit)
    {
      ADDC (checksum, a);
      if (start++ < limit)
	{
	  ADDC (checksum, b);
	  if (start < limit)
	    {
	      ADDC (checksum, c);
	    }
	}
    }

  /* handle remaining bytes: limit was advanced by 3 above, so
     limit[-3] is the word at the aligned limit; the mask keeps its 3,
     2 or 1 high-order bytes. */

  if ((long) _limit & 3)
    {
      switch ((long) _limit & 3)
	{
	case 3:
	  ADDC (checksum, limit[-3] & 0xffffff00);
	  break;
	case 2:
	  ADDC (checksum, limit[-3] & 0xffff0000);
	  break;
	case 1:
	  ADDC (checksum, limit[-3] & 0xff000000);
	  break;
	}
    }

  /* Rotate computed checksum into the correct position: a left
     rotation by 24, 16 or 8 bits, chosen by the misalignment of
     _start. */

  if ((long) _start & 3)
    {
      switch ((long) _start & 3)
	{
	case 3:
	  checksum = (checksum << 24) | ((checksum >> 8) & 0xffffff);
	  break;
	case 2:
	  checksum = (checksum << 16) | ((checksum >> 16) & 0xffff);
	  break;
	case 1:
	  checksum = (checksum << 8) | ((checksum >> 24) & 0xff);
	  break;
	}
    }

  return checksum;

}

#endif /* 0 */

/* Return the header checksum for the packet header at P: the sum,
   modulo 2^32, of the first four 32-bit words of the header.

   P is expected to be 4-byte aligned.  A misaligned P is reported
   with gm_printf() but the checksum is still computed and returned,
   as before. */
static inline gm_u32_t
gm_galvantech_header_checksum (const gm_packet_header_t * p)
{
  /* The header is only read, so keep the const qualifier on the word
     view instead of casting it away. */
  const gm_u32_t *iptr = (const gm_u32_t *) p;
  register gm_u32_t checksum;
  register gm_u32_t r0, r1, r2, r3;

  /* Test alignment through a (long) cast, as the other pointer
     alignment tests in this file do. */
  if (((long) p) & 0x3)
    {
      gm_printf ("gm_galvantech_header_checksum(%p) bad pointer\n", p);
    }

  /* Issue all four loads before the adds. */
  r0 = iptr[0];
  r1 = iptr[1];
  r2 = iptr[2];
  r3 = iptr[3];
  checksum = r0 + r1;
  checksum = checksum + r2 + r3;
  return (checksum);
}

#endif /* GM_ENABLE_GALVANTECH_WORKAROUND */

/* Fill in P->header_checksum with the value computed by
   gm_galvantech_header_checksum().  When the Galvantech workaround is
   compiled out this does nothing. */
static inline void
gm_galvantech_set_header_checksum (gm_packet_header_t * p)
{
#if !GM_ENABLE_GALVANTECH_WORKAROUND
  GM_PARAMETER_MAY_BE_UNUSED (p);
#else
  p->header_checksum = gm_galvantech_header_checksum (p);
#endif
}

/* Verify the header checksum stored in the packet header at P.

   Returns GM_SUCCESS when P->header_checksum equals the value
   recomputed by gm_galvantech_header_checksum(), and always when the
   Galvantech workaround is compiled out.  On a mismatch, prints the
   stored and computed values, their XOR, the sender and target node
   ids and the first five header words, then returns GM_FAILURE. */
static inline gm_status_t
gm_galvantech_check_header_checksum (const gm_packet_header_t * p)
{
#if GM_ENABLE_GALVANTECH_WORKAROUND
  gm_u32_t *words;
  gm_u32_t recomputed;

  recomputed = gm_galvantech_header_checksum (p);
  if (p->header_checksum == recomputed)
    return GM_SUCCESS;

  /* The stored checksum is deliberately re-read from the header for
     each use below rather than cached. */
  words = (gm_u32_t *) p;
  gm_printf ("*** Bad GM hdr cksum detected 0x%08x!=0x%08x "
	     "(0x%08x) %d->%d.\n",
	     p->header_checksum, recomputed,
	     (p->header_checksum ^ recomputed),
	     (int) p->sender_node_id, (int) p->target_node_id);
  gm_printf ("header: 0x%08x 0x%08x 0x%08x 0x%08x (0x%08x)\n",
	     words[0], words[1], words[2], words[3], words[4]);
  gm_printf
    ("*** There is probably buggy Galvantech SRAM in node %d or %d\n",
     (int) p->sender_node_id, (int) p->target_node_id);
  return GM_FAILURE;
#else
  GM_PARAMETER_MAY_BE_UNUSED (p);
  return GM_SUCCESS;
#endif
}

/* GM_GALVANTECH_SET_IP_CHECKSUM (p, checksum): store CHECKSUM in
   (p)->ip_checksum when the Galvantech workaround is enabled.
   Otherwise the macro expands to nothing, so neither argument is
   evaluated; do not pass expressions with needed side effects. */
#if GM_ENABLE_GALVANTECH_WORKAROUND
#define GM_GALVANTECH_SET_IP_CHECKSUM(p, checksum) do {			\
  (p)->ip_checksum = (checksum);					\
} while (0)
#else
#define GM_GALVANTECH_SET_IP_CHECKSUM(p, checksum)
#endif

/****************************************************************
 * MARK_LABEL
 ****************************************************************/

/* MARK_LABEL (label, label_extra) introduces the event handler whose
   name is LABEL and LABEL_EXTRA pasted together.  _MARK_LABEL is the
   same thing in label-based builds and a non-logging variant in
   function-based builds.

   Label-based dispatch (!GM_FUNCTION_BASED_DISPATCH): the macro
   expands, inside gm_dispatch(), to
     - gm_always_assert (0), which is reached only if control falls
       into the handler from the code above it instead of jumping to
       the label;
     - the C label itself, plus an asm label of the same name.
   When GM_DEBUG or GM_LOG_DISPATCHES is set it additionally records
   the handler name in gm.lzero / gm.current_handler and prints it,
   except for the L_sdma__poll_for_sdma_0, L_timer and L_idle
   handlers (presumably because they run too often to log).

   NOTE: the expansion ends in "while (0);" including the semicolon.
   Presumably the call sites in the included handler files invoke the
   macro without a trailing semicolon, so it is left as is -- confirm
   before changing.

   Function-based dispatch: the macro opens the definition of a
   function returning gm_handler_t; GM_END_HANDLER supplies the
   closing brace. */
#if !GM_FUNCTION_BASED_DISPATCH

#if GM_DEBUG || GM_LOG_DISPATCHES
#define _MARK_LABEL(a, b) MARK_LABEL (a, b)
/* The condition below used to carry an extra ")" after L_timer, which
   left the parentheses unbalanced and made every expansion of this
   variant a syntax error.  The grouping that remains is equivalent to
   the intended "not poll, not timer, not idle" test. */
#define MARK_LABEL(label, label_extra)					\
do {									\
  gm_always_assert (0);							\
GM_CAT (label, label_extra):						\
  asm ("\n" GM_STRINGIFY (label) GM_STRINGIFY (label_extra) ":");	\
  gm.lzero = GM_STRINGIFY (label_extra);				\
  if (0||(&& GM_CAT (label, label_extra) != && L_sdma__poll_for_sdma_0	\
  	  && && GM_CAT (label, label_extra) != && L_timer		\
  	  && && GM_CAT (label, label_extra) != && L_idle))		\
    {									\
      gm_printf_p (GM_STRINGIFY (label)					\
                   GM_STRINGIFY (label_extra) "\n" );			\
    }									\
  gm.current_handler 							\
    = GM_STRINGIFY (label) GM_STRINGIFY (label_extra);			\
  gm.current_handler_extra = "";					\
} while (0);
#else /* !(GM_DEBUG || GM_LOG_DISPATCHES) */
#define _MARK_LABEL(a, b) MARK_LABEL (a, b)
#define MARK_LABEL(label, label_extra)					\
do {									\
  gm_always_assert (0);							\
GM_CAT (label, label_extra):						\
  asm ("\n" GM_STRINGIFY (label) GM_STRINGIFY (label_extra) ":");	\
} while (0);
#endif /* !(GM_DEBUG || GM_LOG_DISPATCHES) */

#else  /* GM_FUNCTION_BASED_DISPATCH */

#define _MARK_LABEL(label, label_extra)					\
gm_handler_t GM_CAT (label, label_extra) () {;
#define MARK_LABEL(label, label_extra)		\
gm_handler_t GM_CAT (label, label_extra) () {	\
  LOG_DISPATCH (0, #label #label_extra "()");
#endif /* GM_FUNCTION_BASED_DISPATCH */

/*  void gm_dispatch ( ) __attribute__ ((noreturn)); */

/* GM_END_HANDLER closes a handler opened by MARK_LABEL: a "}" when
   handlers are functions, nothing when they are labels inside
   gm_dispatch().  The test is on the VALUE of
   GM_FUNCTION_BASED_DISPATCH, like every other test of that flag in
   this file, so that defining the flag to 0 selects label-based
   dispatch here too instead of emitting a stray "}". */
#if GM_FUNCTION_BASED_DISPATCH
#define GM_END_HANDLER }
#else
#define GM_END_HANDLER
#endif

/****************************************************************
 * gm_dispatch ()
 ****************************************************************/

/* Register the event handlers and start dispatching.

   The shape of this function depends on GM_FUNCTION_BASED_DISPATCH:

   - Label-based dispatch (the flag is 0 or undefined): the handler
     files included below expand, via MARK_LABEL, to labelled code
     INSIDE this function, and the function's closing brace is the
     conditional one at the very end.  Control enters the handlers
     through the DISPATCH macro.

   - Function-based dispatch: this function ends at the conditional
     "}" placed before the includes, and each included handler becomes
     a separate function at file scope. */
void
gm_dispatch (void)
{

  /* One handler per event type.  The trailing empty macro argument is
     intentional. */
  SET_HANDLER (POLL_EVENT, L_idle,);
  SET_HANDLER (START_SDMA_EVENT, L_sdma__start_sdma_0,);
  SET_HANDLER (FINISH_SDMA_EVENT, L_sdma__finish_sdma,);
  SET_HANDLER (SEND_ACK_EVENT, L_send__start_sending_ack,);
  SET_HANDLER (START_SEND_EVENT, L_send__start_sending_chunk_0,);
  SET_HANDLER (FINISH_SEND_EVENT, L_send__finish_sending_chunk,);
  SET_HANDLER (FINISH_RECV_PACKET_EVENT, L_recv__got_chunk_0,);
  SET_HANDLER (START_RECV_PACKET_EVENT, L_recv__start_receiving_chunk_0,);
  SET_HANDLER (RECV_BUFFER_OVERFLOW_EVENT, L_recv__discard_overflow_0,);
  SET_HANDLER (START_RDMA_EVENT, L_rdma__rdma_chunk_0,);
  SET_HANDLER (FINISH_RDMA_EVENT, L_rdma__token_done,);
  SET_HANDLER (TIMER_EVENT, L_timer,);

  /* Fairness handlers */

  SET_HANDLER (FAIR_SDMA_RDMA_EVENT, L_fair__sdma_rdma_0,);
#ifndef POLL_PENDING
  SET_HANDLER (FAIR_SDMA_EVENT, L_fair__sdma_0,);
  SET_HANDLER (FAIR_RDMA_EVENT, L_fair__rdma_0,);
#endif

  /* handlers for (de)activating polling */

  gm.poll.idle_handler = GM_REFERENCE_LABEL (L_idle);
  gm.poll.active_handler = GM_REFERENCE_LABEL (L_sdma__poll_for_sdma_0);

  /* Sanity check on the initial state: SEND_PENDING may only be
     noticed while fewer than 2 send chunks are free. */
  gm_assert (gm.free_send_chunk_cnt < 2 || NOTICED_NOT (SEND_PENDING));
  /* gm_print_state ( ); */
  /* gm_printf ("free_send_chunk_cnt: 0x%x\n", free_send_chunk_cnt); */

#if !GM_FUNCTION_BASED_DISPATCH
  /* Enter the first handler; the handler code follows in this same
     function. */
  DISPATCH (146, "initial dispatch");
#else  /* GM_FUNCTION_BASED_DISPATCH */
  gm_get_first_handler ();
  gm_abort ();
#endif /* GM_FUNCTION_BASED_DISPATCH */

  /* In function-based builds the "}" below closes gm_dispatch() so
     that the handlers included next are defined at file scope.  The
     INDENT_HACK arm only supplies a "{"; presumably it exists to keep
     brace-matching tools balanced and is never enabled in a real
     build -- TODO confirm. */
#if INDENT_HACK
  {
#elif GM_FUNCTION_BASED_DISPATCH
  }
#endif /* GM_FUNCTION_BASED_DISPATCH */
  

#include "gm_mcp_reboot_node.h"

  /* Each of the sdma, send, recv, and rdma state machines is comprised of
     two almost identical sets of handlers depending upon which of the two
     send or recv buffers is to be used. */
  
  /* First set of handlers, using the LZERO/LONE values in effect on
     entry (presumably LZERO 0 / LONE 1, the values restored below). */
#include "gm_sdma.h"
#include "gm_send.h"
#include "gm_recv.h"
#include "gm_rdma.h"

  /* Second set: the same state machines with LZERO and LONE
     swapped. */
#undef LZERO
#undef LONE
#define LZERO 1
#define LONE 0

#include "gm_sdma1.h"
#include "gm_send1.h"
#include "gm_recv1.h"
#include "gm_rdma1.h"

#include "gm_fair.h"
#include "gm_timer.h"

  /* Put LZERO/LONE back for any code that follows. */
#undef LZERO
#undef LONE
#define LZERO 0
#define LONE 1

  /* Closing brace of gm_dispatch() for label-based builds. */
#if !GM_FUNCTION_BASED_DISPATCH
}
#endif /* GM_FUNCTION_BASED_DISPATCH */

/* MCP entry point: initialize GM state with gm_initialize(), then
   hand control to gm_dispatch().  The "return 0" is reached only if
   gm_dispatch() returns (presumably it never does in normal
   operation -- confirm). */
int				/* return "int" to prevent compiler warning */
main (void)
{
  gm_initialize ();
  gm_dispatch ();
  return 0;
}

#if 0
/* Unused alternative initialization entry point (compiled out by the
   surrounding "#if 0"): only calls gm_initialize(). */
static void
gm_bootstrap_init (void)
{
  gm_initialize ();
}
#endif

/*
  This file uses GM standard indentation.

  Local Variables:
  c-file-style:"gnu"
  c-backslash-column:72
  tab-width:8
  End:
*/
