/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpidi_ch3_impl.h"
#include "mpidu_process_locks.h" /* for MPIDU_Yield */

/*
 * Counter that MPIDI_CH3I_Progress() samples on entry and re-reads on every pass; a change in its value is what ends a
 * progress call.  Nothing in the code visible in this file modifies it -- presumably it is bumped by the request
 * completion path elsewhere (TODO confirm).
 */
volatile unsigned int MPIDI_CH3I_progress_completions = 0;

/* Forward declaration: handle_read() is defined further down in this file but is called from MPIDI_CH3I_Progress(). */
static inline int handle_read(MPIDI_VC *vc, int nb);

#ifndef MPIDI_CH3_Progress_start
/*
 * MPIDI_CH3_Progress_start()
 *
 * Deliberately empty.  The function is only compiled when no macro named MPIDI_CH3_Progress_start has been defined.
 */
void MPIDI_CH3_Progress_start()
{
    /* MT - the single-threaded implementation has nothing to do here */
}
#endif

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress()
 *
 * Run the channel progress engine.  Each pass first calls MPIDI_CH3I_write_progress(), then polls with
 * MPIDI_CH3I_read_progress() and hands whatever arrived to handle_read().
 *
 * is_blocking - if non-zero, keep making passes until MPIDI_CH3I_progress_completions differs from the value sampled on
 *               entry; if zero, make exactly one pass.
 *
 * Returns MPI_SUCCESS, or an error code created with MPIR_Err_create_code() when one of the steps above reports a
 * failure.
 */
int MPIDI_CH3I_Progress(int is_blocking)
{
    MPIDI_VC *vc_ptr;
    int num_bytes, mpi_errno;
    int spin_count = 1;
    unsigned completions = MPIDI_CH3I_progress_completions;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS);
    MPIDI_STATE_DECL(MPID_STATE_MPIDU_YIELD);
#ifdef USE_SLEEP_YIELD
    MPIDI_STATE_DECL(MPID_STATE_MPIDU_SLEEP_YIELD);
#endif

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS);

    MPIDI_DBG_PRINTF((50, FCNAME, "entering, blocking=%s", is_blocking ? "true" : "false"));
    do
    {
	/* NOTE(review): an argument is passed here, but the definition of MPIDI_CH3I_write_progress() in this file takes
	   no parameters -- confirm against the declaration in the channel header. */
	mpi_errno = MPIDI_CH3I_write_progress(MPIDI_CH3I_Process.pg);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**ch3progress", 0);
	    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
	    return mpi_errno;
	}
	/* The write pass alone may have changed the completion counter; if so there is no need to poll for reads. */
	if (completions != MPIDI_CH3I_progress_completions)
	{
	    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
	    return MPI_SUCCESS;
	}

	mpi_errno = MPIDI_CH3I_read_progress(FALSE, &vc_ptr, &num_bytes);
	if (mpi_errno != MPI_SUCCESS)
	{
	    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**ch3progress", 0);
	    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
	    return mpi_errno;
	}
	if (vc_ptr == NULL)
	{
	    /* Nothing was read on this pass.  After enough consecutive idle passes start giving up the CPU. */
	    if (spin_count >= MPIDI_CH3I_Process.pg->nRDMAWaitSpinCount)
	    {
#ifdef USE_SLEEP_YIELD
		if (spin_count >= MPIDI_CH3I_Process.pg->nRDMAWaitYieldCount)
		{
		    MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_SLEEP_YIELD);
		    MPIDU_Sleep_yield();
		    MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_SLEEP_YIELD);
		}
		else
		{
		    MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_YIELD);
		    MPIDU_Yield();
		    MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_YIELD);
		}
#else
		MPIDI_FUNC_ENTER(MPID_STATE_MPIDU_YIELD);
		MPIDU_Yield();
		MPIDI_FUNC_EXIT(MPID_STATE_MPIDU_YIELD);
#endif
	    }
	    /* Stop counting once every threshold has been reached: the decisions above can no longer change, and an
	       unbounded increment would eventually overflow the signed counter (undefined behavior) during a long
	       blocking wait. */
#ifdef USE_SLEEP_YIELD
	    if (spin_count < MPIDI_CH3I_Process.pg->nRDMAWaitSpinCount ||
		spin_count < MPIDI_CH3I_Process.pg->nRDMAWaitYieldCount)
#else
	    if (spin_count < MPIDI_CH3I_Process.pg->nRDMAWaitSpinCount)
#endif
	    {
		spin_count++;
	    }
	}
	else
	{
	    MPIDI_DBG_PRINTF((50, FCNAME, "MPIDI_CH3I_read_progress reported %d bytes read", num_bytes));
	    /* Data arrived, so restart the idle count. */
	    spin_count = 1;
#ifdef USE_SLEEP_YIELD
	    MPIDI_Sleep_yield_count = 0;
#endif
	    mpi_errno = handle_read(vc_ptr, num_bytes);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**ch3progress", 0);
		MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
		return mpi_errno;
	    }
	}
    } 
    while (completions == MPIDI_CH3I_progress_completions && is_blocking);

    MPIDI_DBG_PRINTF((50, FCNAME, "exiting, count=%d", MPIDI_CH3I_progress_completions - completions));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS);
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress_poke
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3_Progress_poke()
 *
 * Make one non-blocking pass of the progress engine by calling MPIDI_CH3I_Progress(0).
 *
 * Returns MPI_SUCCESS when that call succeeds; otherwise the failure is wrapped in a recoverable "**poke" error code and
 * returned.
 */
int MPIDI_CH3_Progress_poke()
{
    int poke_result;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);

    poke_result = MPIDI_CH3I_Progress(0);
    if (MPI_SUCCESS != poke_result)
    {
	/* Chain the underlying failure into a new error code that names this function. */
	poke_result = MPIR_Err_create_code(poke_result, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**poke", 0);
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_POKE);
    return poke_result;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3_Progress_end
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
#ifndef MPIDI_CH3_Progress_end
/*
 * MPIDI_CH3_Progress_end()
 *
 * Deliberately empty.  The function is only compiled when no macro named MPIDI_CH3_Progress_end has been defined.
 */
void MPIDI_CH3_Progress_end()
{
    /* MT - the single-threaded implementation has nothing to do here */
}
#endif

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_init()
 *
 * Initialization hook for the progress engine.  It performs no work beyond the function enter/exit instrumentation and
 * always returns MPI_SUCCESS.
 */
int MPIDI_CH3I_Progress_init()
{
    int init_status = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_INIT);
    /* no progress-engine state to set up */
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_INIT);

    return init_status;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Progress_finalize
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_Progress_finalize()
 *
 * Shutdown hook for the progress engine.  It performs no work beyond the function enter/exit instrumentation and always
 * returns MPI_SUCCESS.
 */
int MPIDI_CH3I_Progress_finalize()
{
    int finalize_status = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_PROGRESS_FINALIZE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_PROGRESS_FINALIZE);
    /* no progress-engine state to tear down */
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_PROGRESS_FINALIZE);

    return finalize_status;
}

/*
 * MPIDI_CH3I_Request_adjust_iov()
 *
 * Adjust the iovec in the request by the supplied number of bytes.  If the iovec has been consumed, return true; otherwise
 * return false.
 *
 * req - request whose ch3.iov[] entries, from index rdma.iov_offset up to ch3.iov_count, describe the data still to be
 *       transferred
 * nb  - number of bytes that were just transferred
 *
 * Entries that are fully covered by nb are skipped by advancing rdma.iov_offset.  An entry that is only partially covered
 * has its buffer pointer advanced and its length reduced by the remaining byte count.  rdma.iov_offset is updated in the
 * request in both cases.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Request_adjust_iov
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_Request_adjust_iov(MPID_Request * req, MPIDI_msg_sz_t nb)
{
    int offset = req->rdma.iov_offset;
    const int count = req->ch3.iov_count;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_REQUEST_ADJUST_IOV);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_REQUEST_ADJUST_IOV);
    
    while (offset < count)
    {
	/* NOTE(review): nb is narrowed to unsigned int for this comparison; if MPIDI_msg_sz_t is wider than unsigned int
	   a large value would be truncated -- confirm the type. */
	if (req->ch3.iov[offset].MPID_IOV_LEN <= (unsigned int)nb)
	{
	    /* This entry was transferred in full; consume it and move on to the next one. */
	    nb -= req->ch3.iov[offset].MPID_IOV_LEN;
	    offset++;
	}
	else
	{
	    /* Only part of this entry was transferred; trim the front of it and report "not done". */
	    req->ch3.iov[offset].MPID_IOV_BUF = ((char*)req->ch3.iov[offset].MPID_IOV_BUF) + nb;
	    req->ch3.iov[offset].MPID_IOV_LEN -= nb;
	    req->rdma.iov_offset = offset;
	    MPIDI_DBG_PRINTF((60, FCNAME, "adjust_iov returning FALSE"));
	    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_REQUEST_ADJUST_IOV);
	    return FALSE;
	}
    }
    
    req->rdma.iov_offset = offset;

    MPIDI_DBG_PRINTF((60, FCNAME, "adjust_iov returning TRUE"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_REQUEST_ADJUST_IOV);
    return TRUE;
}

/*
 * post_pkt_recv()
 *
 * Arm the virtual connection to receive the next packet header.  The VC's own request (vc->rdma.req) is set up with a
 * single iovec entry covering its embedded packet buffer, tagged MPIDI_CH3I_CA_HANDLE_PKT, made the active receive
 * request, and a read of sizeof(packet) bytes is posted with MPIDI_CH3I_post_read().
 *
 * vc - virtual connection on which to post the read
 *
 * Returns MPI_SUCCESS, or a "**postpkt" error code wrapping the failure reported by MPIDI_CH3I_post_read().
 */
#undef FUNCNAME
#define FUNCNAME post_pkt_recv
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
static inline int post_pkt_recv(MPIDI_VC *vc)
{
    int mpi_errno;
    MPIDI_STATE_DECL(MPID_STATE_POST_PKT_RECV);
    MPIDI_FUNC_ENTER(MPID_STATE_POST_PKT_RECV);
    vc->rdma.req->ch3.iov[0].MPID_IOV_BUF = (void *)&vc->rdma.req->rdma.pkt;
    vc->rdma.req->ch3.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
    vc->rdma.req->ch3.iov_count = 1;
    vc->rdma.req->rdma.iov_offset = 0;
    vc->rdma.req->ch3.ca = MPIDI_CH3I_CA_HANDLE_PKT;
    vc->rdma.recv_active = vc->rdma.req;
    mpi_errno = MPIDI_CH3I_post_read( vc , &vc->rdma.req->rdma.pkt, sizeof(vc->rdma.req->rdma.pkt));
    if (mpi_errno != MPI_SUCCESS)
    {
	/* FCNAME now names this function, so the error is attributed to post_pkt_recv rather than to whichever function
	   was defined before it in the file. */
	mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**postpkt", 0);
    }
    MPIDI_FUNC_EXIT(MPID_STATE_POST_PKT_RECV);
    return mpi_errno;
}
/*#define post_pkt_recv(vc) MPIDI_CH3I_post_read( vc , &(vc)->rdma.pkt, sizeof((vc)->rdma.pkt))*/

#undef FUNCNAME
#define FUNCNAME handle_read
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * handle_read()
 *
 * Account for nb bytes that were read on vc and, if that finishes the active receive request's iovec, act on the
 * request's completion action (ch3.ca):
 *   - MPIDI_CH3I_CA_HANDLE_PKT: a packet header arrived; CH3 packet types are passed to MPIDI_CH3U_Handle_recv_pkt(), and
 *     a new packet receive is posted if that call left no receive active.
 *   - MPIDI_CH3_CA_COMPLETE: the request is completed and a new packet receive is posted.
 *   - other CH3 actions: MPIDI_CH3U_Handle_recv_req() is called, and a new packet receive is posted if the request's
 *     iov_count is zero afterwards.
 *
 * vc - virtual connection the data was read from
 * nb - number of bytes read
 *
 * Returns MPI_SUCCESS, or a "**handle_read" error code when posting the next packet receive fails.  If the VC has no
 * active receive request the function returns MPI_SUCCESS without doing anything.
 */
static inline int handle_read(MPIDI_VC *vc, int nb)
{
    int mpi_errno;
    MPID_Request * req;
    MPIDI_STATE_DECL(MPID_STATE_HANDLE_READ);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLE_READ);
    
    MPIDI_DBG_PRINTF((60, FCNAME, "entering"));

    req = vc->rdma.recv_active;
    if (req == NULL)
    {
	MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
	MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
	return MPI_SUCCESS;
    }

    if (nb > 0)
    {
	if (MPIDI_CH3I_Request_adjust_iov(req, nb))
	{
	    /* Read operation complete */
	    MPIDI_CA_t ca = req->ch3.ca;
	    
	    /* Cleared before the callbacks below so that they can install a new active receive. */
	    vc->rdma.recv_active = NULL;
	    
	    if (ca == MPIDI_CH3I_CA_HANDLE_PKT)
	    {
		MPIDI_CH3_Pkt_t * pkt = &req->rdma.pkt;
		
		if (pkt->type < MPIDI_CH3_PKT_END_CH3)
		{
		    /*printf("I should never get here anymore.\n");fflush(stdout);*/
		    MPIDI_DBG_PRINTF((65, FCNAME, "received CH3 packet %d, calllng CH3U_Handle_recv_pkt()", pkt->type));
		    MPIDI_CH3U_Handle_recv_pkt(vc, pkt);
		    MPIDI_DBG_PRINTF((65, FCNAME, "CH3U_Handle_recv_pkt() returned"));
		    if (vc->rdma.recv_active == NULL)
		    {
			MPIDI_DBG_PRINTF((65, FCNAME, "complete; posting new recv packet"));
			mpi_errno = post_pkt_recv(vc);
			if (mpi_errno != MPI_SUCCESS)
			    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**handle_read", 0);
			MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
			MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
			return mpi_errno;
		    }
		}
	    }
	    else if (ca == MPIDI_CH3_CA_COMPLETE)
	    {
		MPIDI_DBG_PRINTF((65, FCNAME, "received requested data, decrementing CC"));
		/* mark data transfer as complete and decrement CC */
		req->ch3.iov_count = 0;
		MPIDI_CH3U_Request_complete(req);
		mpi_errno = post_pkt_recv(vc);
		if (mpi_errno != MPI_SUCCESS)
		    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**handle_read", 0);
		MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
		MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
		return mpi_errno;
	    }
	    else if (ca < MPIDI_CH3_CA_END_CH3)
	    {
		/* XXX - This code assumes that if another read is not posted by the device during the callback, then the
		   device is not expecting any more data for request.  As a result, the channels posts a read for another
		   packet */
		MPIDI_DBG_PRINTF((65, FCNAME, "finished receiving iovec, calling CH3U_Handle_recv_req()"));
		MPIDI_CH3U_Handle_recv_req(vc, req);
		if (req->ch3.iov_count == 0)
		{
		    MPIDI_DBG_PRINTF((65, FCNAME, "request (assumed) complete, posting new recv packet"));
		    mpi_errno = post_pkt_recv(vc);
		    if (mpi_errno != MPI_SUCCESS)
			mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**handle_read", 0);
		    MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
		    MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
		    return mpi_errno;
		}
	    }
	    else
	    {
		/* No other completion action is handled here.  The second assertion cannot hold in this branch, so
		   reaching it aborts when assertions are enabled. */
		assert(ca != MPIDI_CH3I_CA_HANDLE_PKT);
		assert(ca < MPIDI_CH3_CA_END_CH3);
	    }
	}
	else
	{
	    /* The iovec is only partially filled; wait for more data on a later call. */
	    assert(req->rdma.iov_offset < req->ch3.iov_count);
	    MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
	    MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
	    return MPI_SUCCESS;
	}
    }
    else
    {
	/* The first argument is a pointer, so it is printed with %p (and cast to void *) instead of %x. */
	MPIDI_DBG_PRINTF((65, FCNAME, "Read args were iov=%p, count=%d",
	    (void *)(req->ch3.iov + req->rdma.iov_offset), req->ch3.iov_count - req->rdma.iov_offset));
    }
    
    MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_READ);
    return MPI_SUCCESS;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_write_progress
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * MPIDI_CH3I_write_progress()
 *
 * Push pending sends forward on every virtual connection in MPIDI_CH3I_Process.pg.  For each VC with an active send
 * request, the unsent part of the request's iovec is handed to MPIDI_CH3I_RDMA_put_datav().  When a request's iovec has
 * been written completely, its completion action (ch3.ca) decides what happens next; the VC is left as soon as a write
 * makes no progress or only partial progress.
 *
 * Returns MPI_SUCCESS, or a "**write_progress" error code wrapping a failure from MPIDI_CH3I_RDMA_put_datav().
 */
int MPIDI_CH3I_write_progress()
{
    MPIDI_CH3I_Process_group_t *pg;
    int mpi_errno;
    int nb, i;
    MPIDI_VC * vc;
    MPIDI_STATE_DECL(MPID_STATE_HANDLE_WRITTEN);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLE_WRITTEN);
    
    /*MPIDI_DBG_PRINTF((60, FCNAME, "entering"));*/
    pg = MPIDI_CH3I_Process.pg;
    for (i=0; i<pg->size; i++)
    {
	vc = &pg->vc_table[i];

	while (vc->rdma.send_active != NULL)
	{
	    MPID_Request * req = vc->rdma.send_active;

	    assert(req->rdma.iov_offset < req->ch3.iov_count);
	    /*MPIDI_DBG_PRINTF((60, FCNAME, "calling rdma_put_datav"));*/
	    mpi_errno = MPIDI_CH3I_RDMA_put_datav(vc, req->ch3.iov + req->rdma.iov_offset, req->ch3.iov_count - req->rdma.iov_offset, &nb);
	    if (mpi_errno != MPI_SUCCESS)
	    {
		mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**write_progress", 0);
		/* Balance the MPIDI_FUNC_ENTER above on the error path too, as every other exit in this file does. */
		MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_WRITTEN);
		return mpi_errno;
	    }
	    MPIDI_DBG_PRINTF((60, FCNAME, "shm_writev returned %d", nb));

	    if (nb > 0)
	    {
		if (MPIDI_CH3I_Request_adjust_iov(req, nb))
		{
		    /* Write operation complete */
		    MPIDI_CA_t ca = req->ch3.ca;

		    /* Cleared first; each branch below decides whether another request becomes active. */
		    vc->rdma.send_active = NULL;

		    if (ca == MPIDI_CH3_CA_COMPLETE)
		    {
			MPIDI_DBG_PRINTF((65, FCNAME, "sent requested data, decrementing CC"));
			MPIDI_CH3I_SendQ_dequeue(vc);
			vc->rdma.send_active = MPIDI_CH3I_SendQ_head(vc);
			/* mark data transfer as complete and decrement CC */
			req->ch3.iov_count = 0;
			MPIDI_CH3U_Request_complete(req);
		    }
		    else if (ca == MPIDI_CH3I_CA_HANDLE_PKT)
		    {
			MPIDI_CH3_Pkt_t * pkt = &req->rdma.pkt;

			if (pkt->type < MPIDI_CH3_PKT_END_CH3)
			{
			    MPIDI_DBG_PRINTF((65, FCNAME, "setting rdma.send_active"));
			    vc->rdma.send_active = MPIDI_CH3I_SendQ_head(vc);
			}
			else
			{
			    MPIDI_DBG_PRINTF((71, FCNAME, "unknown packet type %d", pkt->type));
			}
		    }
		    else if (ca < MPIDI_CH3_CA_END_CH3)
		    {
			MPIDI_DBG_PRINTF((65, FCNAME, "finished sending iovec, calling CH3U_Handle_send_req()"));
			MPIDI_CH3U_Handle_send_req(vc, req);
			if (req->ch3.iov_count == 0)
			{
			    /* NOTE: This code assumes that if another write is not posted by the device during the callback, then the
			    device has completed the current request.  As a result, the current request is dequeued and next request
			    in the queue is processed. */
			    MPIDI_DBG_PRINTF((65, FCNAME, "request (assumed) complete"));
			    MPIDI_DBG_PRINTF((65, FCNAME, "dequeuing req and posting next send"));
			    MPIDI_CH3I_SendQ_dequeue(vc);
			    vc->rdma.send_active = MPIDI_CH3I_SendQ_head(vc);
			}
		    }
		    else
		    {
			MPIDI_DBG_PRINTF((65, FCNAME, "ca = %d", ca));
			assert(ca < MPIDI_CH3I_CA_END_RDMA);
		    }
		}
		else
		{
		    /* Partial write: stop working on this VC and retry on a later call. */
		    MPIDI_DBG_PRINTF((65, FCNAME, "iovec updated by %d bytes but not complete", nb));
		    assert(req->rdma.iov_offset < req->ch3.iov_count);
		    break;
		}
	    }
	    else
	    {
		/* Nothing could be written right now; move on to the next VC. */
		MPIDI_DBG_PRINTF((65, FCNAME, "shm_post_writev returned %d bytes", nb));
		break;
	    }
	}
    }

    /*MPIDI_DBG_PRINTF((60, FCNAME, "exiting"));*/

    MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_WRITTEN);
    return MPI_SUCCESS;
}
