/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*  $Id: mpiexec.c,v 1.22 2003/05/05 13:29:22 gropp Exp $
 *
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

/* quick hack forking mpiexec process manager and implementation of PMI */
/*
   1. parse mpiexec args.  See p 353 of vol 1 of MPI std.
   2. initialize data structures:
        fdtable has one entry for each forked process, containing, among other things, 
          an fd for communicating with that process
        kvs table has one entry for each keyval space.  A keyval space contains
	  key=value pairs for put and get.
        first group id assigned will be 0
   3. fork-exec executables, passing rank, pmi listening port in env, other args
   4. enter select
   5. handle requests from clients
   6. collect finalize messages, exit when all in or pmi_abort
*/
  
/*
  The todo list:
    Add debug argument to mpiexec and use it to control PMIU_printf
    Implement PMI_Spawn_multiple
    Set up SIGCHLD handler for unexpected client deaths
        (This is partially done, but needs to be integrated with the
	handling of process exit.  At least you get one message now)
    Handle mpiexec -path argument correctly
*/

#include "forkerconf.h"

#include <stdio.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdlib.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <errno.h>
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#include "simple_pmiutil.h"

/* mpimem.h contains prototypes for MPIU_Strncpy etc. */
#include "mpimem.h"

#define DBG_PRINTF printf
#define DBG_FPRINTF fprintf

void mpiexec_usage( const char * );
int  GetIntValue( const char [], int );
void CreateNewSession( void );
void InitTimeout( int );
int  GetRemainingTime( void );
int  allocate_fdentry( void );
int  handle_input_fd( int );
#ifdef USE_SIGCHLD_HANDLER
void setup_sigchild( void );
#endif
void KillChildren( void );
void PrintExitStatus( void );
int GetExitStatus( void );
void forkProcesses( int, char [], char *[], char *[], char [], int, char [] );
void SignalAllProcesses( int , const char [] );
/* int  waitOnProcess( int, int, exit_state_t ); - below for exit_state_t def */

/* Routines for tracing */
int  InitHandleStopped( void );
void CheckIfTraced( void );
void CheckForStopped( const char [] );
void SetCommandOnStopped( const char [] );
void SetDefaultCommandOnStopped( void );
void KillTracedProcesses( void );

/* Functions that handle PMI requests */
int  fPMI_Init( int, char [] );
int  fPMI_Allocate_kvs( int *, char [] );
int  fPMI_Allocate_kvs_group( void );
void fPMI_Handle_barrier( int );
void fPMI_Handle_create_kvs( int );
void fPMI_Handle_destroy_kvs( int );
void fPMI_Handle_put( int );
void fPMI_Handle_get( int );
void fPMI_Handle_get_my_kvsname( int );
void fPMI_Handle_init( int );
void fPMI_Handle_get_maxes( int );
void fPMI_Handle_getbyidx( int );

/* array sizes */
#define MAXNAMELEN  256		/* max length of various names */
/* PATH_MAX is the POSIX name for MAXPATHLEN */
#ifndef PATH_MAX
#define PATH_MAX 2048		/* max length of PATH */
#endif
#define MAXPROCS     64		/* max number of processes to manage */
#define MAXFDENTRIES 75		/* max number of clients (procs), plus a few more */

/* handlers */
#define NOTSET        0         /* no handler set on fd */
#define CLIENT        1         /* handler for pipe to client */

/* client states, maybe not all used (yet).
   In this file, UNINITIALIZED is assigned by allocate_fdentry, UNKNOWN by
   forkProcesses for a freshly forked client, and FINALIZED by 
   handle_input_fd when the client sends the "finalize" command. */
typedef enum { UNINITIALIZED=-1, UNKNOWN, ALIVE, COMMUNICATING, FINALIZED, EXITING, GONE } 
    client_state_t;

/* Record the return value from each process.  HandleWaitStatus chooses
   between NORMAL and NOFINALIZE for processes that did not die on a signal;
   for processes that did, it stores the state supplied by its caller. */
typedef enum { NORMAL,     /* Normal exit (possibly with nonzero status) */
	       SIGNALLED,  /* Process died on an uncaught signal (e.g., 
			      SIGSEGV) */
	       NOFINALIZE, /* Process exited without calling finalize */
	       KILLED      /* Process was killed by mpiexec */ 
             } exit_state_t;
/* Exit record for one process, as filled in by HandleWaitStatus */
struct pidstat { 
    int          rc;          /* exit code, or -1 if the process did not 
				 exit normally */
    int          sig;         /* terminating signal number, or 0 if none */
    exit_state_t exit_state;  /* how the process ended */
};
int  waitOnProcess( int, int, exit_state_t );

/*
 * Each in-use fd gets an fdentry.  Because of MPI_Comm_spawn, there isn't
 * necessarily a 1-1 correspondence between these entries and the processes
 * (though there is a 1-1 for the initial set of processes, i.e., those in 
 * MPI_COMM_WORLD).
 *
 * Entries are handed out by allocate_fdentry, which resets the fields to
 * their "not applicable" values; forkProcesses then fills them in for each 
 * client it creates.
 */
struct fdentry {
    int  active;		/* whether this entry is filled */
    int  fd;			/* fd assigned by system when opened */
    int  pid;			/* pid of client, if applicable, else -1 */
    int  group;			/* group that client belongs to, if applicable, else -1 */
    int  rank;			/* rank of client, if applicable, else -1 */
    client_state_t state;	/* state of client, if applicable, else -1 */
    int  read;			/* whether this fd should be selected for reading */
    int  write;			/* whether this fd should be selected for writing */
    FILE *file;			/* file from fdopen, if present, for using fgets */
    int  handler;		/* which handler (NOTSET or CLIENT) processes input/output
				   when this fd is selected */
    char name[MAXNAMELEN];	/* name of fd, for debugging */
    char kvsname[MAXNAMELEN];	/* name of KVS associated with this process, if any */
};
struct fdentry fdtable[MAXFDENTRIES];
int maxfdentryInUse = -1;       /* Index of the last fdentry in use */

/* Because we may reuse fdentries, we will eventually need to keep
   the results of all process creation in a separate array.  Since this
   is just a stack of results, we don't need a special data structure 
   for it.  We save the exitstatus of each process */
struct pidstat exitstatus[MAXFDENTRIES];
/* How many processes have exited (including the number aborted) */
int num_exited = 0;
/* How many processes have exited with a nonzero code or a signal */
int num_aborted = 0;

/* This is the name of the kvs associated with 
   the initial group of processes */
char initial_kvsname[MAXNAMELEN];

/*
 * Global variables and default values
 */
int numprocs = 1; /* Default if no -n argument provided */
int debug = 0;

/* Set killOnAbort to 1 to kill all children when one dies with a nonzero
   return code or a signal; 0 to continue */
int killOnAbort = 1;
/* This flag lets us know when we're terminating the children */
int inKillChildren = 0;


#define MAX_CLIENT_ARG 50
#define MAX_CLIENT_ENV 200
/* Note that envp is common but not standard */
int main( int argc, char *argv[], char *envp[] )
{
    int  n;
    char softness[MAXNAMELEN];
    char hostname[MAXNAMELEN];
    char archname[MAXNAMELEN];
    char wdirname[MAXNAMELEN];
    char pathname[PATH_MAX];
    char filename[MAXNAMELEN];
    char execname[MAXNAMELEN];
    char *client_arg[MAX_CLIENT_ARG];
    int  i, j, rc, num_fds;
#ifdef USE_SIGCHLD_HANDLER
    int  catch_sigchild = 0;
#endif
    int  groupid;
    int  done;
    int  timeout_seconds;
    int  poll_deltat = 0;
    struct timeval tv;
    fd_set readfds, writefds;

    MPIU_Strncpy( PMIU_print_id, "mpiexec", PMIU_IDSIZE );

/**************************** argument processing **********************************/

    /* set defaults for arguments.  Most of these can also be set
       with environment variables */
    n = 1;			  /* number of processes to start */
    /* Simple test for debugging */
    if (getenv( "MPIEXEC_DEBUG" )) debug = 1;
    /* Value is now in seconds */
    timeout_seconds = GetIntValue( "MPIEXEC_TIMEOUT", 60 );
    if (debug) DBG_PRINTF( "timeout_seconds = %d\n", timeout_seconds );

    /* Defaults for handling stopped processes */
    SetDefaultCommandOnStopped();

    if ( !getcwd( wdirname, MAXNAMELEN ) ) {
	mpiexec_usage( "current working directory name too long\n" );
	/* mpiexec_usage exits */
    }

    MPIU_Strncpy( pathname, getenv( "PATH" ), PATH_MAX );
    MPIU_Strncpy( execname, "a.out", 6 );
    /* others default to empty strings */
    softness[0] = hostname[0] = archname[0] = filename[0] = '\0';
    
    /* process argments */
    if ( argc == 1 )
	mpiexec_usage( NULL );	/* note: mpiexec_usage exits */

    /* process std args before execname: -n, -soft, -host, -arch, -wdir, -path, -file */
    for ( i = 1; i < argc && argv[i][0] == '-'; i+=2 ) {
	if ( strncmp( argv[i], "-n",  strlen( argv[i] ) ) == 0 ||
	     strncmp( argv[i], "-np", strlen( argv[i] ) ) == 0 ) /* undoc'd option */
	    if ( i+1 < argc ) {
		n = atoi( argv[i+1] );
		if ( 0 < n && n <= MAXPROCS )
		    numprocs = n;
		else {
		    MPIU_Error_printf( "invalid number of processes, max is %d\n",MAXPROCS );
		    exit( -1 );
		}
	    }
	    else
		mpiexec_usage( "Missing argument to -n\n" );
	else if ( strncmp( argv[i], "-soft", 6 ) == 0 )
	    if ( i+1 < argc )
		MPIU_Strncpy( softness, argv[i+1], MAXNAMELEN );
	    else
		mpiexec_usage( "Missing argument to -soft\n" );
	else if ( strncmp( argv[i], "-host", 6 ) == 0 )
	    if ( i+1 < argc )
		MPIU_Strncpy( hostname, argv[i+1], MAXNAMELEN );
	    else
		mpiexec_usage( "Missing argument to -host\n" );		    
	else if ( strncmp( argv[i], "-arch", 6 ) == 0 )
	    if ( i+1 < argc )
		MPIU_Strncpy( archname, argv[i+1], MAXNAMELEN );
	    else
		mpiexec_usage( "Missing argument to -arch\n" );		    
	else if ( strncmp( argv[i], "-wdir", 6 ) == 0 )
	    if ( i+1 < argc )
		MPIU_Strncpy( wdirname, argv[i+1], MAXNAMELEN );
	    else
		mpiexec_usage( "Missing argument to -wdir\n" );		    
	else if ( strncmp( argv[i], "-path", 6 ) == 0 )
	    if ( i+1 < argc )
		MPIU_Strncpy( pathname, argv[i+1], PATH_MAX );
	    else
		mpiexec_usage( "Missing argument to -path\n" );		    
	else if ( strncmp( argv[i], "-file", 6 ) == 0 ) {
	    if ( i+1 < argc )
		MPIU_Strncpy( filename, argv[i+1], MAXNAMELEN );
	    else
		mpiexec_usage( "Missing argument to -file\n" );
	}
	/* Here begin the options that are not part of the MPI 
	   recommendations */
	else if ( strncmp( argv[i], "-maxtime", 8 ) == 0 ) {
	    if ( i+1 < argc ) 
		timeout_seconds = atoi( argv[i+1] );
	    else
		mpiexec_usage( "Missing argument to -maxtime\n" );
	}
	else if ( strncmp( argv[i], "-onsig", 6 ) == 0) {
	    if ( i + 1 < argc )
		SetCommandOnStopped( argv[i+1] );
	    else
		mpiexec_usage( "Missing argument to -onsig\n" );
	}
	else {
	    MPIU_Error_printf( "invalid mpiexec argument %s\n", argv[i] );
	    mpiexec_usage( NULL );
	}
    }

    if ( i < argc )
	MPIU_Strncpy( execname, argv[i], MAXNAMELEN );
    else {
	mpiexec_usage( "no program specified\n" );
    }

    /*
    DBG_FPRINTF( stdout, "arguments: n = %d, softness = %s, hostname = %s, "
	     "archname = %s, wdirname = %s, pathname = %s, filename = %s, execname = %s\n",
	     n, softness, hostname, archname, wdirname, pathname, filename, execname );
    */

    /* Start the timeout period */
    InitTimeout( timeout_seconds );

    poll_deltat = InitHandleStopped();

    /* now process application program arguments */
    for ( j = 0; i < argc && j < MAX_CLIENT_ARG; i++, j++ )
	client_arg[j] = argv[i]; /* client_arg[0] is execname */
    if (j == MAX_CLIENT_ARG) {
	
    }
    client_arg[j] = NULL;	 /* null out arg pointer after last arg */

/******************************* other initialization *****************************/

    for ( i = 0; i < MAXFDENTRIES; i++ )
	fdtable[i].active = 0;
    groupid = fPMI_Init( numprocs, initial_kvsname );

/***************************  fork client processes **********************************/

    /* Catch process exit unless specifically requested otherwise */
#ifdef USE_SIGCHLD_HANDLER
    if (catch_sigchild) 
	setup_sigchild( );
#endif

#ifdef USE_NEW_SESSION
    CreateNewSession();
#endif
    
    forkProcesses( numprocs, execname, client_arg, envp, wdirname, 
		   groupid, initial_kvsname );
    
    /* Allocated a new group (this should be deferred until we decide to
       fork more processes) */ 
    groupid = fPMI_Allocate_kvs_group();

/************************ enter select loop to process pmi requests *****************/
    
    done = 0;
    while ( !done ) {
        FD_ZERO( &readfds );
        FD_ZERO( &writefds );

	/* Start with num_fds as the maximum fd seen */
	num_fds = -1;
	for ( i = 0; i < MAXFDENTRIES; i++ )
	    if ( fdtable[i].active && fdtable[i].read ) {
		FD_SET( fdtable[i].fd, &readfds );
		if (fdtable[i].fd > num_fds) 
		    num_fds = fdtable[i].fd;
	    }
	    
	/* Now make this the number of fds */
        num_fds++;

	/* Set a timeout on the select.  This allows us to set limits
	   on how long a parallel job can run, and helps catch
	   deadlocked programs */
        tv.tv_sec  = GetRemainingTime();
	/* We may also want to check for special events outside of
	   select.  Currently, this means ptrace (requires calling
	   wait).  */
	if (poll_deltat > 0) {
	    if (poll_deltat < tv.tv_sec) tv.tv_sec = poll_deltat;
	}
        tv.tv_usec = 0;

	if (num_fds == 0) {
	    /* Special case: everyone has exited! */
	    break;
	}
	/* If we are aborting due to child failure, we may want to
	   abort this select */
        rc = select( num_fds, &readfds, &writefds, NULL, &tv );

        if ( rc == 0 ) {
	    /* We timed out of the select.  This could be
	       either total time expired or a poll timeout for ptrace */
	    if (GetRemainingTime() <= 0) {
		/* Time to abort all proceses */
		MPIU_Error_printf( "Timeout of %d minutes expired; job aborted\n",
			 timeout_seconds / 60 );
		KillChildren( );
		done = 1;
	    }
	    else {
		/* Check for any stopped processes. */
		CheckForStopped( execname );
	    }
        } 
        if ( ( rc == -1 ) && ( errno == EINTR ) ) {
	    /* FIXME: this could happen on a SIGCHLD */
            if (debug) {
		DBG_PRINTF( "select interrupted; continuing\n" );
		fflush( stdout );
	    }
            continue;
        }
        if ( rc < 0 ) {
	    perror( "select failed:" );
            MPIU_Internal_error_printf( "mpiexec main loop: select failed\n" );
	    KillChildren();
	    exit( -1 );
        }

	/* Check only the active fds */
	for ( i = 0; i < num_fds; i++ )
            if ( fdtable[i].active ) 
                if ( FD_ISSET( fdtable[i].fd, &readfds ) )
                    done |= handle_input_fd( i );
    }
    /* On exit from all processes, determine the return code and print
       any requested information about process exit status */
    PrintExitStatus();
    rc = GetExitStatus( );
    return( rc );
}

/*
 * Print an optional error message followed by the usage summary, then
 * terminate mpiexec with exit( -1 ).  This routine does not return.
 *
 * msg - text to print before the usage summary, or NULL for none.  The
 *       text is printed verbatim; it is never interpreted as a format
 *       string, so '%' characters in it are safe.
 */
void mpiexec_usage( const char *msg )
{
    if (msg)
	MPIU_Error_printf( "%s", msg );
    MPIU_Usage_printf( "\
Usage: mpiexec -n <numprocs> -soft <softness> -host <hostname> \\\n\
               -wdir <working directory> -path <search path> \\\n\
               -file <filename> execname <args>\n" );
    exit( -1 );
}

/* 
 * Try to get an integer value from the environment.  Return the default
 * if the value is not available or invalid.
 *
 * name        - name of the environment variable to read
 * default_val - value returned when the variable is unset or unusable
 *
 * When strtol is available (HAVE_STRTOL), a value is considered invalid
 * if it is empty, has trailing characters that are not part of the number,
 * or does not fit in an int.  Without strtol the result is whatever atoi
 * produces, which cannot report errors.
 */
int GetIntValue( const char name[], int default_val )
{
    const char *env_val;
    int  val = default_val;

    env_val = getenv( name );
    if (env_val) {
#ifdef HAVE_STRTOL
	char *invalid_char; /* Used to detect invalid input */
	long lval;

	errno = 0;
	lval  = strtol( env_val, &invalid_char, 0 );
	val   = (int) lval;
	/* Reject: no digits consumed (e.g., empty string), trailing junk,
	   overflow of long, or a value that does not survive the 
	   conversion to int */
	if (invalid_char == env_val || *invalid_char != '\0' ||
	    errno == ERANGE || (long) val != lval) 
	    val = default_val;
#else
	val = atoi( env_val );
#endif
    }
    return val;
}

/*
 * Minimal timeout support.  InitTimeout records the moment at which the
 * timeout expires; GetRemainingTime reports how many seconds are left
 * until then.
 */
int end_time = -1;  /* Time of timeout in seconds */
void InitTimeout( int seconds )
{
#ifdef HAVE_TIME
    /* Expiry = current wall-clock second + requested interval */
    end_time = seconds + time( NULL );
#elif defined(HAVE_GETTIMEOFDAY)
    struct timeval now;

    gettimeofday( &now, NULL );
    end_time = seconds + now.tv_sec;
#else
#   error 'No timer available'
#endif
}

/* Seconds left before the time recorded in end_time; never negative */
int GetRemainingTime( void )
{
    int time_left;
#ifdef HAVE_TIME
    time_left = end_time - time( NULL );
#elif defined(HAVE_GETTIMEOFDAY)
    struct timeval now;

    gettimeofday( &now, NULL );
    time_left = end_time - now.tv_sec;
#else
#   error 'No timer available'
#endif
    /* Once the deadline has passed, report zero rather than a deficit */
    return (time_left < 0) ? 0 : time_left;
}

/* 
 * Allocate a new fdentry, using the active field to identify 
 * candidates.
 *
 * The first inactive slot of fdtable is marked active and every field is
 * reset to its "not applicable" value, including both name buffers, so
 * that nothing from a previous user of the slot leaks through.
 *
 * Returns the index of the slot.  If the table is full, an internal error
 * is reported and mpiexec exits; the routine does not return in that case.
 */
int allocate_fdentry( void )
{
    int i;

    for ( i = 0; i < MAXFDENTRIES; i++ )
	if ( fdtable[i].active == 0 )
	    break;
    if ( i >= MAXFDENTRIES ) {
	MPIU_Internal_error_printf( "too many fd's\n" );
        exit( -1 );
    }
    fdtable[i].active	  = 1;
    fdtable[i].fd	  = -1;
    fdtable[i].pid        = -1;
    fdtable[i].group      = -1;
    fdtable[i].rank       = -1;
    fdtable[i].state      = UNINITIALIZED;
    fdtable[i].read	  = 0;
    fdtable[i].write	  = 0;
    fdtable[i].file	  = NULL;
    fdtable[i].handler	  = NOTSET;
    fdtable[i].name[0]    = '\0';
    /* A reused slot must not keep the KVS name of its previous owner */
    fdtable[i].kvsname[0] = '\0';

    /* Track the highest slot ever handed out; loops over the table use it
       as their upper bound */
    if (i > maxfdentryInUse)
	maxfdentryInUse = i;

    return( i );
}

/* 
 * Fork numprocs processes, with the given PMI groupid and kvsname.
 *
 * numprocs   - number of clients to create; client i gets rank i
 * execname   - program to exec (also stored in client_arg[0])
 * client_arg - NULL-terminated argument vector for the program
 * envp       - NULL-terminated environment to pass on to the clients
 * wdirname   - working directory for the clients
 * groupid    - PMI group recorded in each client's fdentry
 * kvsname    - KVS name recorded in each client's fdentry
 *
 * For every client a socket pair is created; the parent keeps one end in 
 * a new fdentry and the child learns the number of the other end through
 * the PMI_FD environment variable.  Any failure to create the socket pair
 * or to fork kills the clients already started and exits mpiexec.
 */
void forkProcesses( int numprocs, char execname[], char *client_arg[], 
		    char *envp[], char wdirname[], 
		    int groupid, char kvsname[] )
{
    int i, j, idx, pid, rc;
    int  client_pipe_fds[2];

    for ( i = 0; i < numprocs; i++ ) {
	/* set up pipe to client for use by pmi */
	if (socketpair( AF_UNIX, SOCK_STREAM, 0, client_pipe_fds ) < 0) {
	    /* Without the socket pair the client could never talk to us,
	       and client_pipe_fds would hold garbage */
	    perror( "Reason: " );
	    MPIU_Internal_error_printf( "mpiexec socketpair failed\n" );
	    KillChildren( );
	    exit( -1 );
	}
	idx		     = allocate_fdentry( );
	fdtable[idx].fd	     = client_pipe_fds[0];
	fdtable[idx].group   = groupid;
	fdtable[idx].rank    = i; /* pid set below, after fork */
	fdtable[idx].state   = UNKNOWN;
	fdtable[idx].read    = 1;
	fdtable[idx].write   = 0;
	fdtable[idx].file    = NULL;
	fdtable[idx].handler = CLIENT;
	MPIU_Strncpy( fdtable[idx].name, "client", MAXNAMELEN ); 
	MPIU_Strncpy( fdtable[idx].kvsname, kvsname, MAXNAMELEN );

	pid = fork( );
	if ( pid < 0 ) {
	    MPIU_Internal_error_printf( "mpiexec fork failed\n" );
	    KillChildren( );
	    exit( -1 );
	}

	if ( pid == 0 ) {	/***********************************  child, client */
	    char env_pmi_fd[MAXNAMELEN];
	    char env_pmi_rank[MAXNAMELEN];
	    char env_pmi_size[MAXNAMELEN];
	    char env_pmi_debug[MAXNAMELEN];
	    char *client_env[MAX_CLIENT_ENV];
	    const char *home;

	    /* Check to see if we should be traced */
	    CheckIfTraced( );

	    /* The child only uses its own end of the socket pair */
	    close( client_pipe_fds[0] );

	    /* Close the PMI fd's for the other processes */
	    for ( j=0; j<idx; j++) {
		if (fdtable[j].fd >= 0) close( fdtable[j].fd );
	    }

	    /* build environment for client */
	    for ( j = 0; envp[j] && j < MAX_CLIENT_ENV-5; j++ )
		client_env[j] = envp[j]; /* copy mpiexec environment */
	    if (j == MAX_CLIENT_ENV-5) {
		MPIU_Error_printf( "environment is too large (max is %d)\n",
			 MAX_CLIENT_ENV-5);
		exit(-1);
	    }
	    snprintf( env_pmi_fd, MAXNAMELEN, "PMI_FD=%d" , client_pipe_fds[1] );
	    client_env[j++] = env_pmi_fd;
	    snprintf( env_pmi_rank, MAXNAMELEN, "PMI_RANK=%d", i );
	    client_env[j++] = env_pmi_rank;
	    snprintf( env_pmi_size, MAXNAMELEN, "PMI_SIZE=%d", numprocs );
	    client_env[j++] = env_pmi_size;
	    snprintf( env_pmi_debug, MAXNAMELEN, "PMI_DEBUG=%d", debug );
	    client_env[j++] = env_pmi_debug;
	    client_env[j] = NULL;
	    for ( j = 0; client_env[j]; j++ )
		if (putenv( client_env[j] )) {
		    MPIU_Internal_error_printf( "Could not set environment %s", client_env[j] );
		    perror( "Reason: " );
		    exit( -1 );
		}

	    /* change working directory if specified, replace argv[0], and exec client */
	    rc = chdir( wdirname );
	    if (rc < 0) {
		/* We need an error message here */
		/* Fall back to the home directory, but only if HOME is
		   actually set: chdir must not be given a null pointer */
		home = getenv( "HOME" );
		if (home) chdir( home );
	    }
	    client_arg[0] = execname;
	    /* pathname argument should be used here */
	    rc = execvp( execname, client_arg );
	    /* execvp returns only on failure */
	    if ( rc < 0 ) {
		MPIU_Error_printf( "mpiexec could not exec %s\n", execname );
		perror( "Reason: " );
		exit( -1 );
	    }
	}
	else {			/********************************** parent, mpiexec */
	    fdtable[idx].pid = pid;
	    /* The parent only uses its own end of the socket pair */
	    close( client_pipe_fds[1] );
	}
    } /* end of forking loop */
}

/* 
 * Record the outcome of a wait/waitpid call in the next free slot of
 * exitstatus (the slot indexed by num_exited).  Kept separate so that every
 * place that waits on a child can share it.  The caller is responsible for
 * advancing num_exited afterwards.
 *
 * pid           - pid that was waited on (not used here)
 * client_stat   - status word produced by wait/waitpid
 * sigstate      - exit state to store if the process died on a signal
 * has_finalized - nonzero if the process had sent "finalize"; selects
 *                 NORMAL versus NOFINALIZE when no signal was involved
 */
void HandleWaitStatus( pid_t pid, int client_stat, exit_state_t sigstate,
		       int has_finalized ) 
{
    struct pidstat *slot = &exitstatus[num_exited];

    /* Exit code when the process exited on its own; -1 marks "unknown",
       since real exit codes lie in 0-255 */
    slot->rc = WIFEXITED(client_stat) ? WEXITSTATUS(client_stat) : -1;

    if (!WIFSIGNALED(client_stat)) {
	slot->sig        = 0;
	slot->exit_state = has_finalized ? NORMAL : NOFINALIZE;
	return;
    }

    /* Terminated by a signal: remember which one and count the abort */
    slot->sig        = WTERMSIG(client_stat);
    slot->exit_state = sigstate;
    num_aborted++;
}

/*
 * Wait on the process in fdentry[idx].  Do a blocking wait if 
 * requested.  If sigstate is not "NORMAL", set the exit state for 
 * the process to this value if it exits with a signal.  This is used
 * to separate processes that died because mpiexec sent them a signal
 * from processes that died because they received a signal from a 
 * different source (e.g., SIGFPE or SIGSEGV)
 *
 * idx      - index into fdtable of the process to wait for
 * blocking - nonzero to block until the process exits; zero to poll
 * sigstate - exit state recorded if the process died on a signal
 *
 * Returns -1 if the entry has no valid pid and 0 otherwise.  A return of 0
 * covers three cases: the exit status was recorded (and num_exited 
 * advanced), a nonblocking wait found the process still running, or
 * waitpid failed (an error message is printed in that case).
 */
int waitOnProcess( int idx, int blocking, exit_state_t sigstate )
{
    int client_stat, rc, has_finalized;
    pid_t pid;

    /* Careful here: we may want to use WNOHANG; wait a little, then
       do something like kill the process */
    if (debug) {
	DBG_FPRINTF( stderr, "Waiting on status of process %d\n",
		 fdtable[idx].pid );
	fflush( stderr );
    }
    pid = fdtable[idx].pid;
    if (pid <= 0) return -1;

    /* A signal arriving while we wait (e.g., SIGCHLD for another child)
       makes waitpid fail with EINTR.  That is not an error; simply wait
       again so that the exit status is not lost. */
    do {
	rc = waitpid( pid, &client_stat, blocking ? 0 : WNOHANG );
    } while (rc < 0 && errno == EINTR);

    /* Nonblocking wait and the process has not exited yet */
    if (!blocking && rc == 0) return 0;

    if (rc < 0) {
	MPIU_Internal_error_printf( "Error waiting for process!" );
	perror( "Reason: " );
	return 0;
    }
    if (debug) {
	DBG_FPRINTF( stderr, "Wait on %d completed\n", fdtable[idx].pid );
	fflush( stderr );
    }

    has_finalized = fdtable[idx].state == FINALIZED;
    HandleWaitStatus( pid, client_stat, sigstate, has_finalized );

    num_exited++;
    return 0;
}

/* 
 * Process input from the socket connecting the mpiexec process to the
 * child process.
 *
 * idx - index into fdtable of the entry whose fd is readable
 *
 * One line is read from the client and dispatched on its "cmd" value.  If
 * nothing can be read, contact with the client is considered lost: the fd
 * is closed, the entry is deactivated, and the process is waited for.
 *
 * Returns nonzero when the main loop should stop (abort requested, a
 * client exited without finalizing, or all processes have exited) and
 * zero otherwise.
 */
int handle_input_fd ( int idx )
{
    int all_done = 0;
    int  rc;
    char inbuf[PMIU_MAXLINE], cmd[MAXNAMELEN];

    if ( fdtable[idx].handler == CLIENT ) {
	/* printf( "handling client input for rank %d\n", fdtable[idx].rank ); */
	if ( ( rc = PMIU_readline( fdtable[idx].fd, inbuf, PMIU_MAXLINE ) ) > 0 ) {
	    PMIU_parse_keyvals( inbuf );
	    /* Start from an empty command so that a line without a "cmd"
	       key is reported as unknown instead of comparing whatever
	       happened to be in the buffer.
	       NOTE(review): assumes PMIU_getval may leave the buffer 
	       untouched when the key is absent - confirm */
	    cmd[0] = '\0';
	    PMIU_getval( "cmd", cmd, MAXNAMELEN );
	    if ( strncmp( cmd, "barrier_in", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_barrier( idx );
	    }
	    else if ( strncmp( cmd, "finalize", MAXNAMELEN ) == 0 ) {
		fdtable[idx].state = FINALIZED;
	    }
	    else if ( strncmp( cmd, "abort", MAXNAMELEN ) == 0 ) {
		/* No PMI abort command has yet been implemented! */
		KillChildren();
		all_done = 1;
	    }
	    else if ( strncmp( cmd, "get_my_kvsname", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_get_my_kvsname( idx );
	    }
	    else if ( strncmp( cmd, "init", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_init( idx );
	    }
	    else if ( strncmp( cmd, "get_maxes", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_get_maxes( idx );
	    }
	    else if ( strncmp( cmd, "create_kvs", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_create_kvs( idx );
	    }
	    else if ( strncmp( cmd, "destroy_kvs", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_destroy_kvs( idx );
	    }
	    else if ( strncmp( cmd, "put", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_put( idx );
	    }
	    else if ( strncmp( cmd, "get", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_get( idx );
	    }
	    else if ( strncmp( cmd, "getbyidx", MAXNAMELEN ) == 0 ) {
		fPMI_Handle_getbyidx( idx );
	    }
	    else {
		PMIU_printf( 1, "unknown cmd %s\n", cmd );
	    }
	}
	else {                        /* lost contact with client */
	    close( fdtable[idx].fd ); 
	    /* Forget the descriptor number: the system may hand it out
	       again, and forkProcesses closes every fd >= 0 it finds in
	       the table */
	    fdtable[idx].fd     = -1;
	    fdtable[idx].active = 0;
	    rc = waitOnProcess( idx, 1, NORMAL );
	    if (rc) {
		MPIU_Internal_error_printf( "Error waiting on process %d\n",
			 fdtable[idx].pid );
	    }
	    if (fdtable[idx].state != FINALIZED) {
		/* Process exited before finalize */
		KillChildren();
		all_done = 1;
	    }
	    if ( num_exited  == numprocs ) {
		/* Set the global done flag */
		all_done = 1;
	    }
	}
    }
    else {
	MPIU_Internal_error_printf( "unknown handler %d for fdtable entry %d\n",
		 fdtable[idx].handler, idx);
    }
    return all_done;
}


#ifdef USE_SIGCHLD_HANDLER
/* 
 * Signal handler.  Detect a SIGCHLD exit.  The routines are
 *
 * setup_sigchild - Call to install the signal handler
 * handle_sigchild - This is the signal handler
 *
 * If a child exits with a non-zero return code, we may want to kill 
 * the other children.  In most cases, we'll want to kill the other 
 * children if a child dies on a signal.
 * Sometimes we do *not* want to kill the children; particularly when
 * we are debugging.
 *
 */

#if defined(USE_SIGNAL) || defined(USE_SIGACTION)
#include <signal.h>
#else
#error no signal choice
#endif

/*
 * SIGCHLD handler.  Reaps one exited child without blocking.  If that
 * child failed (nonzero exit code or death by signal), its fdtable entry
 * is deactivated, its result is stored in exitstatus, and, when 
 * killOnAbort is set, the remaining children are killed.
 *
 * sig - signal number (not used)
 *
 * Always returns 0.  While KillChildren is running (inKillChildren set)
 * the signal is ignored.
 *
 * NOTE(review): the exit record is stored at the fdtable index, whereas
 * HandleWaitStatus stores records at num_exited - confirm which indexing
 * the readers of exitstatus expect.
 * NOTE(review): this handler calls stdio routines and KillChildren, which
 * are not async-signal-safe.
 */
int handle_sigchild( int sig )
{
    int prog_stat, pid, rc, sigval, i, found;

    if (inKillChildren) {
	/* Ignore this signal */
	return 0;
    }

    if (debug) {
	DBG_FPRINTF( stderr, "Waiting for any child on signal\n" );
	fflush( stderr );
    }
    pid = waitpid( (pid_t)(-1), &prog_stat, WNOHANG );
    
    if (pid > 0) {
	/* Receives a child failure or exit.  If *failure*, kill the others */
	if (debug) {
	    DBG_FPRINTF( stderr, "Found process %d\n", pid );
	    fflush( stderr );
	}
	rc = 0;
	if (WIFEXITED(prog_stat)) {
	    rc = WEXITSTATUS(prog_stat);
	}
	sigval = 0;
	if (WIFSIGNALED(prog_stat)) {
	    sigval = WTERMSIG(prog_stat);
	}
	if (sigval || rc) {
	    /* Look up this pid in the fdtable.  maxfdentryInUse is the 
	       index of the last entry in use, so the bound is inclusive. */
	    found = 0;
	    for (i=0; i<=maxfdentryInUse; i++) {
		if (fdtable[i].pid == pid) {
		    if (debug) {
			DBG_FPRINTF( stderr, "Found process %d\n", pid );
			fflush( stderr );
		    }
		    fdtable[i].active   = 0;
		    exitstatus[i].rc  = rc;
		    exitstatus[i].sig = sigval;
		    found = 1;
		    break;
		}
	    }
	    if (!found) {
		/* Did not find the matching pid */
		;
	    }
	    if (killOnAbort) 
		KillChildren();
	}
    }
    else {
	if (debug) {
	    DBG_FPRINTF( stderr, "Did not find child process!\n" );
	    fflush( stderr );
	}
    }
    return 0;
}

#ifdef USE_SIGACTION
void setup_sigchild( void )
{
    struct sigaction oldact;

    /* Get the old signal action, reset the function and 
       if possible turn off the reset-handler-to-default bit, then
       set the new handler */
    sigaction( SIGCHLD, (struct sigaction *)0, &oldact );
    oldact.sa_handler = handle_sigchild;
#ifdef SA_RESETHAND
    /* Note that if this feature is not supported, there is a race
       condition in the handling of signals, and the OS is fundementally
       flawed */
    oldact.sa_flags   = oldact.sa_flags & ~(SA_RESETHAND);
#endif
    sigaddset( &oldact.sa_mask, SIGCHLD );
    sigaction( SIGCHLD, &oldact, (struct sigaction *)0 );
}
#elif defined(USE_SIGNAL)
/* signal()-based variant: install handle_sigchild as the SIGCHLD handler */
void setup_sigchild( void )
{
    /* Set new handler; ignore old choice (the previous handler returned
       by signal is discarded) */
    (void)signal( SIGCHLD, handle_sigchild );
}
#else
/* Variant used when neither USE_SIGACTION nor USE_SIGNAL is defined: 
   no handler is installed */
void setup_sigchild( void )
{
}
#endif
#endif

/* 
 * Deliver signal sig to every process that has an active fdtable entry
 * and a valid pid.
 *
 * sig - signal number to send
 * msg - prefix passed to perror if a kill fails for a reason other than 
 *       the process no longer existing
 */
void SignalAllProcesses( int sig, const char msg[] )
{
    int slot;

    for (slot = 0; slot <= maxfdentryInUse; slot++) {
	pid_t target;

	if (!fdtable[slot].active) continue;

	target = fdtable[slot].pid;
	if (target <= 0) continue;

	if (debug) {
	    DBG_PRINTF( "sig %d to %d\n", sig, target ); fflush( stdout );
	}
	/* A process that has already gone away (ESRCH) is not worth a
	   message; anything else is reported */
	if (kill( target, sig ) != 0 && errno != ESRCH) {
	    perror( msg );
	}
    }
}

/*
 * Terminate all child processes.  Used when (a) a child dies with a 
 * non-zero error code or with a signal *and* (b) the "kill-on-failure"
 * feature is selected (on by default), as well as on abort, timeout and
 * internal errors.
 *
 * The routine runs at most once: the inKillChildren flag is set on entry
 * and any later call returns immediately.
 */
void KillChildren( void )
{
    int slot;

    /* DBG_FPRINTF( stderr, "Entering kill children\n" ); */

    if (inKillChildren) return;
    inKillChildren = 1;

    /* Try to stop the processes gently first, then brutally */
    KillTracedProcesses( );
    SignalAllProcesses( SIGINT, "Could not kill with sigint" );

    /* Give the processes a moment to exit before escalating */
    sleep( 1 );
    SignalAllProcesses( SIGQUIT, "Could not kill with sigquit" );

    /* Collect whatever exit statuses are available, without blocking */
    for (slot = 0; slot <= maxfdentryInUse; slot++) {
	if (!fdtable[slot].active || fdtable[slot].pid <= 0) continue;

	if (debug) {
	    DBG_PRINTF( "Wait on %d\n", fdtable[slot].pid ); fflush( stdout );
	}
	(void) waitOnProcess( slot, 0, KILLED );
    }
}

/* 
 * Determine the exit status to return from mpiexec.  The rule is:
 *
 * 1. If all processes exited normally (exit_state == NORMAL), then
 * return the max of the exit statuses
 * 2. If any process did not exit normally (but was not killed by mpiexec)
 * return the value 
 */
int GetExitStatus( void )
{
    int i, rc;
    
    /* If all exited normally, return the max of exitstatus */
    rc = 0;
    for (i=0; i<num_exited; i++) {
	if (exitstatus[i].exit_state == NORMAL) {
	    if (exitstatus[i].rc > rc) 
		rc = exitstatus[i].rc;
	}
	else {
	    break;
	}
    }
    if (i == num_exited) return rc;

    /* Abnormal exit.  Look for status on any process that died */
    for (i=0; i<num_exited; i++) {
	if (exitstatus[i].exit_state != NORMAL &&
	    exitstatus[i].exit_state != KILLED) {
	    rc = exitstatus[i].rc;
	    if (rc > 0) return rc;
	}
    }

    /* All processes gave 0 return codes, but some process exited 
       abnormally.  Return a non-zero code */
    return -1;
}

/*
 * Report on the status of the completed processes.
 *
 * For every recorded process that died on a signal not sent by mpiexec,
 * the exit code and the signal are printed; when debug is set, the exit
 * code of every other process is printed as well.
 *
 * We could use an arg to select either a particular process or all processes
 */
void PrintExitStatus( void )
{
    int i, rc, sig;

    /* fprintf( stderr, "%d processes aborted\n", num_aborted ); */
    for (i=0; i<num_exited; i++) {
	rc  = exitstatus[i].rc;
	sig = exitstatus[i].sig;
	if (sig && exitstatus[i].exit_state != KILLED) {
	    /* MPIU_Error_printf takes the format string as its first
	       argument; it is not given a stream */
#ifdef HAVE_STRSIGNAL
	    MPIU_Error_printf(
		     "Return code = %d, signaled with %s\n", rc, 
		     strsignal(sig) );
#else
	    MPIU_Error_printf( 
		     "Return code = %d, signaled with %d\n", rc, sig );
#endif
	}
	else if (debug) {
	    MPIU_Error_printf( "Return code = %d\n", rc );
	}
	fflush( stderr );
    }
}
/* -------------------------------------------------------------------------
 * The following routines implement the PMI interface.  They
 * manage the key-value space (kvs) and the process groups.
 *
 * ------------------------------------------------------------------------- */
#define MAXGROUPS   256		/* max number of groups */
#define MAXKEYLEN    64		/* max length of key in keyval space */
#define MAXVALLEN   128		/* max length of value in keyval space */
#define MAXPAIRS   1024		/* max number of pairs in a keyval space */
#define MAXKVSS      16		/* max number of keyval spaces */
/*
 * The following structures and arrays are used to implement the PMI 
 * interface.  The global variables are declared static so that they'll
 * have the correct scope if we remove these routines from the mpiexec.c file.
 */
/* One entry per process group */
struct groupentry {
    int  groupsize;          /* number of processes in the group (set by 
				fPMI_Init for the initial group) */
    int  num_in_barrier;     /* how many members have sent "barrier_in" 
				since the last barrier completed */
    char kvsname[MAXNAMELEN]; /* NOTE(review): presumably the name of the 
				 group's KVS; not written in the code 
				 visible here - confirm */
};
static struct groupentry grouptable[MAXGROUPS];
static int nextnewgroup = 0;

/* A single key=value pair of a keyval space.  fPMI_Allocate_kvs marks a 
   pair as empty by setting the first character of key and val to '\0'. */
struct pair {
    char key[MAXKEYLEN];
    char val[MAXVALLEN];
};

/* A keyval space: a named table of up to MAXPAIRS key=value pairs */
struct kvs {
    int  active;              /* nonzero while this table slot is in use */
    char kvsname[MAXNAMELEN]; /* name, "kvs_<slot index>", assigned by 
				 fPMI_Allocate_kvs */
    struct pair pairs[MAXPAIRS];
};
static struct kvs kvstable[MAXKVSS];

static int  kvsid;

/* 
 * The Forker implementations of the PMI routines are named
 * fPMI_xxx
 */

/*
 * Set up the PMI data structures for the initial group of processes.
 *
 * Input
 *   nprocs  - initial number of processes to create (size of the initial
 *             group)
 * Output
 *   kvsname - receives the name of the initial kvs; the caller provides a
 *             buffer of size MAXNAMELEN
 *
 * Return value: the id of the initial group
 */
int fPMI_Init( int nprocs, char kvsname[] )
{
    int slot = 0;

    /* Every keyval space starts out free */
    while ( slot < MAXKVSS ) {
	kvstable[slot].active = 0;
	slot++;
    }

    /* The group that is next to be handed out becomes the initial group */
    grouptable[nextnewgroup].groupsize = nprocs;

    /* Give that group its keyval space; the result code is not examined */
    (void) fPMI_Allocate_kvs( &kvsid, kvsname );

    return nextnewgroup;
}

/*
 * Allocate the first inactive keyval space.
 *
 * On success the space is marked active, all of its pairs are emptied, and
 * it is named "kvs_<table index>".
 *
 * Output
 * kvsid   - index of the allocated space in kvstable
 * kvsname - name of the allocated space (buffer of size MAXNAMELEN)
 * Return value: 0 on success; -1 if all MAXKVSS spaces are in use, in which
 * case neither output is written.
 */
int fPMI_Allocate_kvs( int *kvsid, char kvsname[] )
{
    int i, j;

    /* Find the first free slot */
    for ( i = 0; i < MAXKVSS; i++ ) {
        if ( !kvstable[i].active )
            break;
    }
    if ( i >= MAXKVSS ) {
        /* The first argument is the format string (no FILE* argument) */
        MPIU_Internal_error_printf( "too many kvs's\n" );
        return( -1 );
    }

    kvstable[i].active = 1;
    /* An empty key marks a pair slot as unused */
    for ( j = 0; j < MAXPAIRS; j++ ) {
        kvstable[i].pairs[j].key[0] = '\0';
        kvstable[i].pairs[j].val[0] = '\0';
    }
    snprintf( kvstable[i].kvsname, MAXNAMELEN, "kvs_%d", i );
    MPIU_Strncpy( kvsname, kvstable[i].kvsname, MAXNAMELEN );
    *kvsid = i;
    return( 0 );
}

/*
 * Hand out the next group id.
 * Return value: the current value of nextnewgroup, which is then advanced
 * by one.  No bound against MAXGROUPS is applied here.
 */
int fPMI_Allocate_kvs_group( void )
{
    int assigned = nextnewgroup;

    nextnewgroup = assigned + 1;
    return assigned;
}

/* 
 * Handle an incoming "barrier" command
 */
void fPMI_Handle_barrier( int idx )
{
    int i;
    grouptable[fdtable[idx].group].num_in_barrier++;
    if ( grouptable[fdtable[idx].group].num_in_barrier ==
	 grouptable[fdtable[idx].group].groupsize ) {
	for ( i = 0; i < MAXFDENTRIES; i++ ) {
	    if ( fdtable[i].active &&
		 fdtable[i].group == fdtable[idx].group )
		PMIU_writeline(fdtable[i].fd, "cmd=barrier_out\n" );
	}
	grouptable[fdtable[idx].group].num_in_barrier = 0;
    }
}

/* 
 * Handle an incoming "create_kvs" command from the process in fdtable[idx].
 *
 * Allocates a new keyval space and replies with
 *     cmd=newkvs kvsname=<name>
 * If no keyval space is available, the reply carries an empty kvsname.
 */
void fPMI_Handle_create_kvs( int idx )
{
    int  kvsidx;
    char kvsname[MAXNAMELEN], outbuf[PMIU_MAXLINE];

    /* fPMI_Allocate_kvs does not write kvsname when it fails; make sure the
       reply never formats an uninitialized buffer */
    kvsname[0] = '\0';
    if ( fPMI_Allocate_kvs( &kvsidx, kvsname ) != 0 ) {
        kvsname[0] = '\0';
    }
    snprintf( outbuf, PMIU_MAXLINE, "cmd=newkvs kvsname=%s\n", kvsname );
    PMIU_writeline( fdtable[idx].fd, outbuf );
}

/* 
 * Handle an incoming "destroy_kvs" command from the process in fdtable[idx].
 *
 * The "kvsname" value is obtained with PMIU_getval and looked up by name in
 * kvstable (inactive entries are included so that a repeated destroy can be
 * reported as such).  The reply is
 *     cmd=kvs_destroyed rc=<0|-1> msg=<text>
 * with rc=0 only when an active keyval space was deactivated.
 */
void fPMI_Handle_destroy_kvs( int idx )
{
    int  i, rc=0;
    char kvsname[MAXNAMELEN];
    char message[PMIU_MAXLINE], outbuf[PMIU_MAXLINE];

    PMIU_getval( "kvsname", kvsname, MAXNAMELEN );
    for ( i = 0; i < MAXKVSS; i++ ) {
        if ( strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) != 0 )
            continue;

        if ( kvstable[i].active ) {
            kvstable[i].active = 0;
            snprintf( message, PMIU_MAXLINE,
                      "KVS_%s_successfully_destroyed", kvsname );
            rc = 0;
        }
        else {
            snprintf( message, PMIU_MAXLINE, "KVS_%s_previously_destroyed",
                      kvsname );
            rc = -1;
        }
        break;
    }
    if ( i == MAXKVSS ) {
        rc = -1;
        /* No blanks in the message: the reply is a blank-separated list of
           key=value items, like the other messages above */
        snprintf( message, PMIU_MAXLINE, "KVS_%s_not_found", kvsname );
    }
    snprintf( outbuf, PMIU_MAXLINE, "cmd=kvs_destroyed rc=%d msg=%s\n",
              rc, message );
    PMIU_writeline( fdtable[idx].fd, outbuf );
}

/* 
 * Handle an incoming "put" command from the process in fdtable[idx].
 *
 * The "kvsname", "key" and "value" items are obtained with PMIU_getval.
 * The pair is stored in the first free slot of the named active keyval
 * space.  The reply is
 *     cmd=put_result rc=<0|-1> msg=<text>
 * rc is -1 when the keyval space does not exist, the key is already
 * present, or the keyval space is full.
 */
void fPMI_Handle_put( int idx )
{
    int  i, j, rc=0;
    char kvsname[MAXNAMELEN];
    char message[PMIU_MAXLINE], outbuf[PMIU_MAXLINE];
    char key[MAXKEYLEN];

    message[0] = '\0';
    PMIU_getval( "kvsname", kvsname, MAXNAMELEN );
    for ( i = 0; i < MAXKVSS; i++ ) {
        if ( !kvstable[i].active ||
             strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) != 0 ) {
            /* Not the requested keyval space; keep searching.  (Leaving
               the loop here would restrict every put to kvstable[0].) */
            continue;
        }

        PMIU_getval( "key", key, MAXKEYLEN );
        for ( j = 0; j < MAXPAIRS; j++ ) {
            if ( strncmp( kvstable[i].pairs[j].key, key, MAXKEYLEN ) == 0 ) {
                /* No duplicate keys allowed.  An empty key also ends up
                   here, because it compares equal to the first free slot. */
                rc = -1;
                snprintf( message, PMIU_MAXLINE, "duplicate_key_%s", key );
                break;
            }
            else if ( kvstable[i].pairs[j].key[0] == '\0' ) {
                /* First free slot: store the pair here */
                PMIU_getval( "key", kvstable[i].pairs[j].key,
                             MAXKEYLEN );
                PMIU_getval( "value", kvstable[i].pairs[j].val,
                             MAXVALLEN );
                rc = 0;
                MPIU_Strncpy( message, "success", PMIU_MAXLINE );
                break;
            }
        }
        if ( j == MAXPAIRS ) {
            rc = -1;
            snprintf( message, PMIU_MAXLINE, "no_room_in_kvs_%s",
                      kvsname );
        }
        break;   /* the requested keyval space has been handled */
    }
    if ( i == MAXKVSS ) {
        rc = -1;
        snprintf( message, PMIU_MAXLINE, "kvs_%s_not_found", kvsname );
    }
    snprintf( outbuf, PMIU_MAXLINE, "cmd=put_result rc=%d msg=%s\n",
              rc, message );
    PMIU_writeline( fdtable[idx].fd, outbuf );
}

/*
 * Handle an incoming "get" command from the process in fdtable[idx].
 *
 * The "kvsname" and "key" items are obtained with PMIU_getval and the key
 * is looked up in the named active keyval space.  The reply is
 *     cmd=get_result rc=<0|-1> msg=<text> value=<value>
 * On failure (unknown keyval space or unknown key) rc is -1 and the value
 * is "unknown".
 */
void fPMI_Handle_get( int idx )
{
    int  i, j, rc=0;
    char kvsname[MAXNAMELEN];
    char message[PMIU_MAXLINE], key[PMIU_MAXLINE], value[PMIU_MAXLINE];
    char outbuf[PMIU_MAXLINE];

    message[0] = '\0';
    value[0]   = '\0';
    PMIU_getval( "kvsname", kvsname, MAXNAMELEN );
    for ( i = 0; i < MAXKVSS; i++ ) {
        if ( !kvstable[i].active ||
             strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) != 0 ) {
            /* Not the requested keyval space; keep searching.  (Leaving
               the loop here would restrict every get to kvstable[0].) */
            continue;
        }

        PMIU_getval( "key", key, PMIU_MAXLINE );
        for ( j = 0; j < MAXPAIRS; j++ ) {
            if ( strncmp( kvstable[i].pairs[j].key, key, MAXKEYLEN ) == 0 ) {
                rc = 0;
                MPIU_Strncpy( message, "success", PMIU_MAXLINE );
                MPIU_Strncpy( value, kvstable[i].pairs[j].val, PMIU_MAXLINE );
                break;
            }
        }
        if ( j == MAXPAIRS ) {
            rc = -1;
            MPIU_Strncpy( value, "unknown", PMIU_MAXLINE );
            /* Report the key that was not found, not the kvs name */
            snprintf( message, PMIU_MAXLINE, "key_%s_not_found", key );
        }
        break;   /* the requested keyval space has been handled */
    }
    if ( i == MAXKVSS ) {
        rc = -1;
        MPIU_Strncpy( value, "unknown", PMIU_MAXLINE );
        snprintf( message, PMIU_MAXLINE, "kvs_%s_not_found", kvsname );
    }
    snprintf( outbuf, PMIU_MAXLINE, "cmd=get_result rc=%d msg=%s value=%s\n",
              rc, message, value );
    PMIU_writeline( fdtable[idx].fd, outbuf );
}

/*
 * Handle an incoming "get_my_kvsname" command: reply to the process in
 * fdtable[idx] with the kvsname recorded in its fdtable entry, as
 *     cmd=my_kvsname kvsname=<name>
 */
void fPMI_Handle_get_my_kvsname( int idx )
{
    char reply[PMIU_MAXLINE];

    snprintf( reply, sizeof(reply), "cmd=my_kvsname kvsname=%s\n",
              fdtable[idx].kvsname );
    PMIU_writeline( fdtable[idx].fd, reply );
}

/*
 * Handle an incoming "init" command.
 * Nothing needs to be done at present; idx is accepted only so that this
 * handler has the same signature as the other command handlers.
 */
void fPMI_Handle_init( int idx )
{
    (void) idx;    /* intentionally unused */
}

/*
 * Handle an incoming "get_maxes" command: report the compile-time limits
 * MAXNAMELEN, MAXKEYLEN and MAXVALLEN to the process in fdtable[idx] as
 *     cmd=maxes kvsname_max=<n> keylen_max=<n> vallen_max=<n>
 */
void fPMI_Handle_get_maxes( int idx )
{
    char reply[PMIU_MAXLINE];
    const int name_max = MAXNAMELEN;
    const int key_max  = MAXKEYLEN;
    const int val_max  = MAXVALLEN;

    snprintf( reply, sizeof(reply),
              "cmd=maxes kvsname_max=%d keylen_max=%d vallen_max=%d\n",
              name_max, key_max, val_max );
    PMIU_writeline( fdtable[idx].fd, reply );
}

/*
 * Handle an incoming "getbyidx" command from the process in fdtable[idx].
 *
 * The "kvsname" and "idx" items are obtained with PMIU_getval; "idx" is the
 * position of the requested pair within the named active keyval space.
 * The reply is one of
 *     cmd=getbyidx_results rc=0 nextidx=<idx+1> key=<key> val=<value>
 *     cmd=getbyidx_results rc=-1 reason=no_more_keyvals
 *     cmd=getbyidx_results rc=-1 reason=kvs_<name>_not_found
 */
void fPMI_Handle_getbyidx( int idx )
{
    int i, j;
    char kvsname[MAXNAMELEN], j_char[8], outbuf[PMIU_MAXLINE];

    PMIU_getval( "kvsname", kvsname, MAXNAMELEN );
    for ( i = 0; i < MAXKVSS; i++ ) {
        if ( !kvstable[i].active ||
             strncmp( kvstable[i].kvsname, kvsname, MAXNAMELEN ) != 0 ) {
            /* Not the requested keyval space; keep searching.  (Leaving
               the loop here would restrict every request to kvstable[0].) */
            continue;
        }

        PMIU_getval( "idx", j_char, 8 );
        j = atoi( j_char );
        /* Valid positions are 0..MAXPAIRS-1.  A client that walks a full
           keyval space asks for position MAXPAIRS last, so the bound must
           be checked before pairs[j] is touched. */
        if ( j < 0 || j >= MAXPAIRS ||
             kvstable[i].pairs[j].key[0] == '\0' ) {
            snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results rc=-1 "
                      "reason=no_more_keyvals\n" );
        }
        else {
            snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results "
                      "rc=0 nextidx=%d key=%s val=%s\n",
                      j + 1,
                      kvstable[i].pairs[j].key,
                      kvstable[i].pairs[j].val );
        }
        break;   /* the requested keyval space has been handled */
    }
    if ( i == MAXKVSS ) {
        snprintf( outbuf, PMIU_MAXLINE, "cmd=getbyidx_results rc=-1 "
                  "reason=kvs_%s_not_found\n", kvsname );
    }
    PMIU_writeline( fdtable[idx].fd, outbuf );
}

/* #undef HAVE_PTRACE */

#if defined(HAVE_PTRACE) && defined(HAVE_PTRACE_CONT) 
#include <sys/ptrace.h>
/* 
 * Ptrace to control execution for handling failures.
 * Ptrace causes the process to stop on any signal (except SIGKILL).
 * fork();
 * IGNORE = 0;
 * ptrace( PTRACE_TRACEME, IGNORE, IGNORE);
 * exec...
 *
 * The parent can use
 * ptrace( PTRACE_CONT, pid, 0 );
 * to cause the process to continue.  PTRACE_KILL to kill, 
 * PTRACE_ATTACH and DETACH for processes not started with TRACEME.
 *
 * wait returns status: 
 * WIFSTOPPED
 * WSTOPSIG
 * option value of WUNTRACED
 *
 * When using this option, it may be necessary to timeout the select
 * on PMI messages more often, perform a nonblocking wait on processes
 * and look for ones that are stopped.
 *
 * Functions to write:
 *    CheckForStopped - Checks for a stopped process and executes
 *    the requested routine (which probably executes a simple command)
 * 
 *    RunOnStopped - Runs a command on the stopped process
 *    
 */
/* Size in bytes of the command template buffer; RunOnStopped also uses it
   as the limit on the length of the expanded command */
#define MAX_COMMAND_LEN 1024
/* Command template run by RunOnStopped; "%e" and "%p" in it are replaced by
   the executable name and the process id */
char commandOnStopped[MAX_COMMAND_LEN];
/* Set to 1 by SetCommandOnStopped.  While it is 0, CheckIfTraced does not
   request tracing and CheckForStopped does not run the command. */
int onStopped = 0;
/*
 * Set the default command: if the environment variable MPIEXEC_ONSIG is
 * set, copy its value into commandOnStopped.  onStopped is not changed.
 */
void SetDefaultCommandOnStopped( void )
{
    const char *envcmd = getenv( "MPIEXEC_ONSIG" );

    if (envcmd == NULL) {
        return;
    }
    MPIU_Strncpy( commandOnStopped, envcmd, MAX_COMMAND_LEN );
}
/*
 * Return the polling interval used when watching for stopped processes.
 * Eventually allow the polling interval to be set by environment/cmdline.
 * NOTE(review): the unit is not visible here (presumably seconds) - confirm
 * against the caller.
 */
int InitHandleStopped( void )
{
    /* Make 10 for general use */
    return 1;
}

/*
 * Set the command to be used on stopped processes and enable its use
 * (onStopped is set to 1).
 *
 * Two command prefixes are shorthands that are replaced by a full command
 * template: "traceback" and "gdb".  Anything else is used as given.
 */
void SetCommandOnStopped( const char cmd[] )
{
    const char *expansion = NULL;

    MPIU_Strncpy( commandOnStopped, cmd, MAX_COMMAND_LEN );
    onStopped = 1;

    /* Check for special cases (matched on the leading characters only) */
    if (strncmp( commandOnStopped, "traceback", 9 ) == 0) {
        /* FIXME: gdb only reads command from a file! */
        expansion = "gdb -batch -n -x gettb %e %p";
    }
    else if (strncmp( commandOnStopped, "gdb", 3 ) == 0) {
        expansion = "xterm -e \"gdb %e %p\"";
    }

    if (expansion != NULL) {
        MPIU_Strncpy( commandOnStopped, expansion, MAX_COMMAND_LEN );
    }
}
/*
 * Run the command in commandOnStopped on the given pid.  The following
 * sequences in the command are handled specially:
 * %e - replace with the name of the executable (execname)
 * %p - replace with the pid of the process
 * A '%' followed by any other character is copied unchanged together with
 * that character; a '%' at the very end of the command is copied as is.
 * e.g., the command
 *    xterm -e "gdb %e %p" &
 * runs an xterm that runs gdb on the stopped process, in the background.
 *
 * If the expanded command does not fit in MAX_COMMAND_LEN characters,
 * nothing is run.  Otherwise the process is detached from ptrace and the
 * command is passed to system().
 *
 * NOTE(review): execname is inserted without any quoting and the result is
 * interpreted by the shell (see the FIXME below).
 */
void RunOnStopped( const char execname[], pid_t pid ) 
{ 
    char c; 
    char fullcommand[MAX_COMMAND_LEN+1]; 
    char *pout;
    const char *pin;

    /* Form the full command from the command string */
    pout = fullcommand;
    pin  = commandOnStopped;

    while ((c = *pin++) != 0 && (pout - fullcommand) < MAX_COMMAND_LEN) {
        if (c == '%') {
            if (*pin == 'e') {
                const char *eptr = execname;
                pin++;
                /* Replace with the executable name */
                while (*eptr && (pout - fullcommand) < MAX_COMMAND_LEN) {
                    *pout++ = *eptr++;
                }
            }
            else if (*pin == 'p') {
                char pidchars[12], *pptr = pidchars;
                pin++;
                /* Replace with the pid */
                snprintf( pidchars, 12, "%d", (int)pid );
                while (*pptr && (pout - fullcommand) < MAX_COMMAND_LEN) {
                    *pout++ = *pptr++;
                }
            }
            else {
                /* Not a recognized sequence: keep the '%' ... */
                *pout++ = c;
                /* ... and the character after it, unless that is the
                   terminating null.  Stepping over the terminator would
                   make the loop read past the end of the command. */
                if (*pin != 0) {
                    *pout++ = *pin++;
                }
            }
        }
        else {
            *pout++ = c;
        }
    }
    if (pout - fullcommand >= MAX_COMMAND_LEN) {
        /* Error - command is too long */
        return;
    }
    /* Add trailing null */
    *pout = 0;

    /* Run this command string in the background and orphaned, but with
       stdout still directed to us */
    /* FIXME: system isn't robust enough for what we want */
    MPIU_Msg_printf( "Running %s\n", fullcommand );
    /* We need to detach before we can run a command that itself wishes to
       use ptrace.  There isn't a good way to do this, but we try
       using PTRACE_DETACH.  What we do use is SIGTSTP, which 
       will often leave the process stopped so that the next command
       can find it.
    */
    ptrace( PTRACE_DETACH, pid, 0, SIGTSTP );
    system( fullcommand );
    /* We could re-attach the process here.  If we don't, we can no
       longer wait on the process.  Instead, we might reattach but turn off
       the handling of events. */
    /* ptrace( PTRACE_ATTACH, pid, 0, 0 ); */
}

/*
 * Request tracing of the calling process (PTRACE_TRACEME) when a command
 * for stopped processes has been enabled (onStopped is nonzero).  A failure
 * of ptrace is reported with perror and otherwise ignored.
 * Keeping this in its own routine leaves room for more complex criteria.
 */
void CheckIfTraced( void )
{
    int status;

    if (!onStopped) {
        return;
    }

    status = ptrace( PTRACE_TRACEME, 0, 0, 0 );
    if (status < 0) {
        perror( "Error from ptrace(PTRACE_TRACEME):" );
    }
}

/*
 * Collect, without blocking, every child whose status has changed.
 *
 * A child stopped by SIGTRAP is simply continued.  A child stopped by any
 * other signal has the "on stopped" command run on it (when enabled), using
 * execname as the executable name.  Any other status change is treated as a
 * process exit: it is passed to HandleWaitStatus and counted in num_exited.
 *
 * The routine returns as soon as waitpid reports that no child has a
 * pending status change, or fails.
 */
void CheckForStopped( const char execname[] )
{
    pid_t pid;
    int   sig;
    int   client_stat;

    /* ? WUNTRACED */
    while (1) {
        pid = waitpid( -1, &client_stat, WNOHANG );
        /* 0: nothing pending.  -1: error, e.g. no children are left; in
           that case client_stat is not valid and continuing would loop
           forever while counting exits that never happened. */
        if (pid <= 0) return;
        if (WIFSTOPPED(client_stat)) {
            sig = WSTOPSIG(client_stat);

            if (sig == SIGTRAP) {
                /* Ignore these signals */
                ptrace( PTRACE_CONT, pid, 0, 0 );
            }
            else if (onStopped) {
                /*printf( "Signal is %d %s\n", sig, strsignal(sig) );*/
                /* FIXME: Find this pid in the list of processes; get the 
                   executable name */
                RunOnStopped( execname, pid );
            }
        }
        else {
            /* Handle a process exit */
            /* FIXME: look up pid and see if the process has
               finalized */
            HandleWaitStatus( pid, client_stat, NORMAL, 0 );

            num_exited++;
        }
    }
}
/*
 * Send PTRACE_KILL to every process recorded in an active fdtable entry
 * (entries 0 through maxfdentryInUse) whose pid is positive.
 */
void KillTracedProcesses( void )
{
    int slot;

    for (slot = 0; slot <= maxfdentryInUse; slot++) {
        pid_t child;

        if (!fdtable[slot].active) {
            continue;
        }
        child = fdtable[slot].pid;
        if (child <= 0) {
            continue;
        }
        ptrace( PTRACE_KILL, child, 0, 0 );
    }
}
#else
/* Dummy routines if ptrace is not available.  They keep the callers free of
   conditional compilation: every routine does nothing, and
   InitHandleStopped returns 0. */
int InitHandleStopped( void )
{
    return 0;
}

void SetDefaultCommandOnStopped( void )
{
}

void CheckIfTraced( void )
{
}

void CheckForStopped( const char cmd[] )
{
    (void) cmd;
}

void SetCommandOnStopped( const char cmd[] )
{
    (void) cmd;
}

void KillTracedProcesses( void )
{
}
#endif

/*
 * We should set up error messages so that they are of two flavors:
 * developer and user.  E.g., the developer message might
 * include perror output and a terse message such as
 * "fork failed" while the user message might be more like
 * "Unable to create processes; check the total number of processes"
 */

/* ------------------------------------------------------------------------ */
/* On some systems (SGI IRIX 6), process exit sometimes kills all processes
 * in the process GROUP.  This code attempts to fix that by moving the
 * calling process into a new session.
 * We DON'T do it if stdin (0) is connected to a terminal, because that
 * disconnects the process from the terminal.  Nothing is done either when
 * the process already leads its session, or when the required features
 * were not selected at configure time.
 * ------------------------------------------------------------------------ */
void CreateNewSession( void )
{
#if defined(HAVE_SETSID) && defined(HAVE_ISATTY) && \
    defined(USE_NEW_SESSION) && defined(HAVE_GETSID)
    pid_t newsid;

    if (isatty(0)) {
        /* Keep the controlling terminal */
        return;
    }
    if (getsid(0) == getpid()) {
        /* Already the session leader */
        return;
    }

    /* printf( "Creating a new session\n" ); */
    newsid = setsid();
    if (newsid < 0) {
        MPIU_Internal_error_printf( "Could not create new process group\n" );
    }
#endif
}
