/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "rdma_impl.h"
#include "pmi.h"

/* Global RDMA state for the local process: the shared memory address, key
 * and id, the eager limit, and the wait spin/yield counts. */
MPIDI_CH3I_RDMA_Process_t MPIDI_CH3I_RDMA_Process;

/*
 * Write a name for the shared memory segment into str.
 *
 *   Windows SHM: a newly created UUID rendered as text.
 *   POSIX SHM:   "/mpich_shm_" followed by the process id.
 *   SysV SHM:    the process id as a decimal string.
 *
 * No length is passed in, so str must already be large enough for the
 * generated text; nothing here checks the bound.
 */
static void generate_shm_string(char *str)
{
#if defined(USE_WINDOWS_SHM)
    UUID uuid;

    UuidCreate(&uuid);
    sprintf(str, "%08lX-%04X-%04x-%02X%02X-%02X%02X%02X%02X%02X%02X",
            uuid.Data1,
            uuid.Data2,
            uuid.Data3,
            uuid.Data4[0], uuid.Data4[1],
            uuid.Data4[2], uuid.Data4[3], uuid.Data4[4],
            uuid.Data4[5], uuid.Data4[6], uuid.Data4[7]);
    MPIU_DBG_PRINTF(("GUID = %s\n", str));
#elif defined(USE_POSIX_SHM)
    sprintf(str, "/mpich_shm_%d", getpid());
#elif defined(USE_SYSV_SHM)
    sprintf(str, "%d", getpid());
#else
#error No shared memory subsystem defined
#endif
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RDMA_init_process_group
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
/*
 * Initialize PMI and build the process group object for the local process.
 *
 * has_parent is handed straight to PMI_Init().
 *
 * On success the new process group (rank, size, KVS name, a reference count
 * of one and the default RDMA wait spin/yield counts) is stored in
 * MPIDI_CH3I_Process.pg and MPI_SUCCESS is returned.
 *
 * On failure a fatal MPI error code is returned, MPIDI_CH3I_Process.pg is
 * left untouched, and any memory allocated by this function is released.
 *
 * NOTE(review): only the fields listed above are assigned here; other
 * members of the process group (e.g. vc_table) are presumably filled in by
 * the caller -- confirm.
 */
int MPIDI_CH3I_RDMA_init_process_group(int * has_parent)
{
    int mpi_errno;
    int rc;
    int pg_rank, pg_size;
    MPIDI_CH3I_Process_group_t * pg;

    /*
     * Extract process group related information from PMI
     */
    rc = PMI_Init(has_parent);
    if (rc != 0)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_init", "**pmi_init %d", rc);
        return mpi_errno;
    }
    rc = PMI_Get_rank(&pg_rank);
    if (rc != 0)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_get_rank", "**pmi_get_rank %d", rc);
        return mpi_errno;
    }
    rc = PMI_Get_size(&pg_size);
    if (rc != 0)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_get_size", "**pmi_get_size %d", rc);
        return mpi_errno;
    }

    /*MPIU_Timer_init(pg_rank, pg_size);*/

    /* Allocate process group data structure and populate */
    pg = MPIU_Malloc(sizeof(MPIDI_CH3I_Process_group_t));
    if (pg == NULL)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "process group");
        return mpi_errno;
    }
    pg->size = pg_size;
    pg->rank = pg_rank;

    /* One extra byte on top of the maximum name length reported by PMI */
    pg->kvs_name = MPIU_Malloc(PMI_KVS_Get_name_length_max() + 1);
    if (pg->kvs_name == NULL)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "kvs name");
        /* pg has not been published yet, so it must be released here */
        MPIU_Free(pg);
        return mpi_errno;
    }
    rc = PMI_KVS_Get_my_name(pg->kvs_name);
    if (rc != 0)
    {
        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_get_my_name", "**pmi_kvs_get_my_name %d", rc);
        /* release both allocations; nothing else refers to them yet */
        MPIU_Free(pg->kvs_name);
        MPIU_Free(pg);
        return mpi_errno;
    }
    pg->ref_count = 1;

    pg->nRDMAWaitSpinCount = MPIDI_CH3I_SPIN_COUNT_DEFAULT;
    pg->nRDMAWaitYieldCount = MPIDI_CH3I_YIELD_COUNT_DEFAULT;

    /* Publish the fully populated process group */
    MPIDI_CH3I_Process.pg = pg;

    return MPI_SUCCESS;
}

/*
 * init must allocate RDMA memory and initialize the queues and other
 * structures inside it.
 *
 * Steps, in order:
 *   1. Reset the fields of MPIDI_CH3I_RDMA_Process to their defaults.
 *   2. If there is more than one process, rank 0 generates a shared memory
 *      key and publishes it in the PMI key/value space under "SHMEMKEY";
 *      every other rank reads it back after a PMI barrier.
 *   3. Obtain the memory with MPIDI_CH3I_SHM_Get_mem().  The region is sized
 *      as one block per process, each block holding pg_size queues.
 *   4. Zero the head/tail indices of the local block's queues; for every
 *      other rank record the queue to write to and the queue to read from,
 *      and post a read for the first packet header.
 *   5. Scale the wait spin count when there are fewer processors than
 *      processes.
 *   6. PMI barrier, then unlink the POSIX name / issue IPC_RMID on the SysV
 *      segment.
 *
 * Reads rank, size, kvs_name and vc_table from MPIDI_CH3I_Process.pg, which
 * must therefore be set before this is called.
 *
 * Returns MPI_SUCCESS, or a fatal MPI error code on failure.
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMDA_init
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMDA_init()
{
    int error;

    int pg_rank, pg_size;
    MPIDI_VC * vc_table;

    char * key;
    char * val;
    int key_max_sz;
    int val_max_sz;

    char shmemkey[MPIDI_MAX_SHM_NAME_LENGTH];
    int i, j;
    int shm_block;

    pg_rank = MPIDI_CH3I_Process.pg->rank;
    pg_size = MPIDI_CH3I_Process.pg->size;
    vc_table = MPIDI_CH3I_Process.pg->vc_table;

    MPIDI_CH3I_RDMA_Process.nShmEagerLimit = MPIDI_SHM_EAGER_LIMIT;
#ifdef HAVE_SHARED_PROCESS_READ
    MPIDI_CH3I_RDMA_Process.nShmRndvLimit = MPIDI_SHM_RNDV_LIMIT;
#endif
    MPIDI_CH3I_RDMA_Process.addr = NULL;
#ifdef USE_POSIX_SHM
    MPIDI_CH3I_RDMA_Process.key[0] = '\0';
    MPIDI_CH3I_RDMA_Process.id = -1;
#elif defined (USE_SYSV_SHM)
    MPIDI_CH3I_RDMA_Process.key = -1;
    MPIDI_CH3I_RDMA_Process.id = -1;
#elif defined (USE_WINDOWS_SHM)
    MPIDI_CH3I_RDMA_Process.key[0] = '\0';
    MPIDI_CH3I_RDMA_Process.id = NULL;
#else
#error No shared memory subsystem defined
#endif
    MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = MPIDI_CH3I_SPIN_COUNT_DEFAULT;
    MPIDI_CH3I_RDMA_Process.nShmWaitYieldCount = MPIDI_CH3I_YIELD_COUNT_DEFAULT;

    /* initialize the shared memory */
    /* one block = one queue per process in the group */
    shm_block = sizeof(MPIDI_CH3I_SHM_Queue_t) * pg_size;

    if (pg_size > 1)
    {
        /* Allocate space for pmi keys and values */
        key_max_sz = PMI_KVS_Get_key_length_max()+1;
        key = MPIU_Malloc(key_max_sz);
        if (key == NULL)
        {
            error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "pmi key");
            return error;
        }
        val_max_sz = PMI_KVS_Get_value_length_max()+1;
        val = MPIU_Malloc(val_max_sz);
        if (val == NULL)
        {
            error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "pmi value");
            /* only key exists at this point */
            MPIU_Free(key);
            return error;
        }

        /*
         * From here until key and val are freed below, every failure jumps
         * to fn_fail_kvs so that both buffers are released.
         */
        if (pg_rank == 0)
        {
            /* rank 0 picks the key and publishes it before the barrier */
            generate_shm_string(shmemkey);
            MPIU_Strncpy(key, "SHMEMKEY", key_max_sz);
            MPIU_Strncpy(val, shmemkey, val_max_sz);
            error = PMI_KVS_Put(MPIDI_CH3I_Process.pg->kvs_name, key, val);
            if (error != 0)
            {
                error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_put", "**pmi_kvs_put %d", error);
                goto fn_fail_kvs;
            }
            error = PMI_KVS_Commit(MPIDI_CH3I_Process.pg->kvs_name);
            if (error != 0)
            {
                error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_commit", "**pmi_kvs_commit %d", error);
                goto fn_fail_kvs;
            }
            error = PMI_Barrier();
            if (error != 0)
            {
                error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
                goto fn_fail_kvs;
            }
        }
        else
        {
            /* the other ranks wait at the barrier, then fetch the key */
            MPIU_Strncpy(key, "SHMEMKEY", key_max_sz);
            error = PMI_Barrier();
            if (error != 0)
            {
                error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
                goto fn_fail_kvs;
            }
            error = PMI_KVS_Get(MPIDI_CH3I_Process.pg->kvs_name, key, val);
            if (error != 0)
            {
                error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvs_get", "**pmi_kvs_get %d", error);
                goto fn_fail_kvs;
            }
            /* Bound the copy by the destination buffer (shmemkey), not by the
             * size of the PMI value buffer, which may be larger. */
            MPIU_Strncpy(shmemkey, val, MPIDI_MAX_SHM_NAME_LENGTH);
        }

        MPIU_Free(val);
        MPIU_Free(key);

        MPIU_DBG_PRINTF(("KEY = %s\n", shmemkey));
#if defined(USE_POSIX_SHM) || defined(USE_WINDOWS_SHM)
        MPIU_Strncpy(MPIDI_CH3I_RDMA_Process.key, shmemkey, MPIDI_MAX_SHM_NAME_LENGTH);
#elif defined (USE_SYSV_SHM)
        MPIDI_CH3I_RDMA_Process.key = atoi(shmemkey);
#else
#error No shared memory subsystem defined
#endif

        /* pg_size blocks in total, one per process */
        error = MPIDI_CH3I_SHM_Get_mem( &MPIDI_CH3I_RDMA_Process, pg_size * shm_block, pg_rank, pg_size, TRUE, &MPIDI_CH3I_RDMA_Process.addr );
    }
    else
    {
        /* single process: one block, no key exchange */
        error = MPIDI_CH3I_SHM_Get_mem( &MPIDI_CH3I_RDMA_Process, shm_block, 0, 1, FALSE, &MPIDI_CH3I_RDMA_Process.addr );
    }
    if (error != MPI_SUCCESS)
    {
        error = MPIR_Err_create_code(error, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %s", "shared memory block");
        return error;
    }

    /* initialize each shared memory queue */
    for (i=0; i<pg_size; i++)
    {
        if (i == pg_rank)
        {
            /* this process's own block: reset every queue in it */
            vc_table[i].shm.shm = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * i));
            for (j=0; j<pg_size; j++)
            {
                vc_table[i].shm.shm[j].head_index = 0;
                vc_table[i].shm.shm[j].tail_index = 0;
            }
        }
        else
        {
            /*vc_table[i].shm.shm += pg_rank;*/
            vc_table[i].shm.shm = NULL;
            /* write queue: slot pg_rank inside rank i's block */
            vc_table[i].shm.write_shmq = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * i)) + pg_rank;
            /* read queue: slot i inside this process's own block */
            vc_table[i].shm.read_shmq = (MPIDI_CH3I_SHM_Queue_t*)((char*)MPIDI_CH3I_RDMA_Process.addr + (shm_block * pg_rank)) + i;
            /* post a read of the first packet header */
            /*vc_table[i].shm.shm_reading_pkt = TRUE;*/
            /*MPIDI_CH3I_post_read( &vc_table[i] , &vc_table[i].rdma.pkt, sizeof(vc_table[i].rdma.pkt));*/
            /* NOTE(review): vc_table[i].rdma.req is dereferenced here and is
             * presumably allocated before this function runs -- confirm. */
            vc_table[i].rdma.req->ch3.iov[0].MPID_IOV_BUF = (void *)&vc_table[i].rdma.req->rdma.pkt;
            vc_table[i].rdma.req->ch3.iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
            vc_table[i].rdma.req->ch3.iov_count = 1;
            vc_table[i].rdma.req->rdma.iov_offset = 0;
            vc_table[i].rdma.req->ch3.ca = MPIDI_CH3I_CA_HANDLE_PKT;
            vc_table[i].rdma.recv_active = vc_table[i].rdma.req;
            error = MPIDI_CH3I_post_read( &vc_table[i] , &vc_table[i].rdma.req->rdma.pkt, sizeof(vc_table[i].rdma.req->rdma.pkt));
            if (error != MPI_SUCCESS)
            {
                error = MPIR_Err_create_code(error, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**rdma_init", 0);
                return error;
            }
        }
    }

#ifdef HAVE_WINDOWS_H
    {
        /* if you know the number of processors, calculate the spin count relative to that number */
        SYSTEM_INFO info;
        GetSystemInfo(&info);
        if (info.dwNumberOfProcessors == 1)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = 1;
        else if (info.dwNumberOfProcessors < (DWORD) pg_size)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = ( MPIDI_CH3I_SPIN_COUNT_DEFAULT * info.dwNumberOfProcessors ) / pg_size;
    }
#else
    /* figure out how many processors are available and set the spin count accordingly */
#ifdef HAVE_SYSCONF
    {
        int num_cpus;
        num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
        if (num_cpus == 1)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = 1;
        else if (num_cpus > 0 && num_cpus < pg_size)
            MPIDI_CH3I_RDMA_Process.nShmWaitSpinCount = ( MPIDI_CH3I_SPIN_COUNT_DEFAULT * num_cpus ) / pg_size;
    }
#endif
#endif

    error = PMI_Barrier(); /* barrier to make sure queues are initialized before continuing */
    if (error != 0)
    {
        error = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", error);
        return error;
    }
    /* Issued only after the barrier above; the return values are not checked. */
#ifdef USE_POSIX_SHM
    shm_unlink(MPIDI_CH3I_RDMA_Process.key);
#elif defined (USE_SYSV_SHM)
    shmctl(MPIDI_CH3I_RDMA_Process.id, IPC_RMID, NULL);
#endif

    return MPI_SUCCESS;

fn_fail_kvs:
    /* Reached only from the key exchange above, while key and val are both
     * still allocated; error already holds the MPI error code to return. */
    MPIU_Free(val);
    MPIU_Free(key);
    return error;
}

/*
 * finalize releases the RDMA memory and any other cleanup.
 *
 * Hands MPIDI_CH3I_RDMA_Process to MPIDI_CH3I_SHM_Release_mem(); the second
 * argument is TRUE when the process group has more than one process and
 * FALSE otherwise.
 *
 * Returns MPI_SUCCESS, or a recoverable MPI error code that wraps the
 * failure reported by MPIDI_CH3I_SHM_Release_mem().
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMDA_finalize
#undef FCNAME
#define FCNAME MPIDI_QUOTE(FUNCNAME)
int MPIDI_CH3I_RMDA_finalize()
{
    int mpi_errno;

    mpi_errno = MPIDI_CH3I_SHM_Release_mem(&MPIDI_CH3I_RDMA_Process, (MPIDI_CH3I_Process.pg->size > 1) ? TRUE : FALSE);
    if (mpi_errno != MPI_SUCCESS)
    {
        /* Keep the code built here so the caller receives it; the result
         * was previously discarded. */
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**rdma_finalize", 0);
    }
    return mpi_errno;
}
