/*
   From: "Greg Lindahl" <glindahl@hpti.com>
   Subject: [Myricom help #4715] bug in mpich/gm
   Date: Mon, 6 Mar 2000 22:18:43 -0500
 */

/*
   I discovered the following bug the hard way: if you have too many
   outstanding messages, they get corrupted. Here is a test program. Run it:

   mpirun -np 2 ./hpti -s 30000

   and watch it puke:

   gmpi:[0] Packet type 10 (0xa) is unknown gmpriv.c:447!
   gmpi: Packet dump:
   0xffffffaaffffffaaffffffaaffffffaaffffffaaffffffaaffffffaaffffffaa

   [ in practice, the memory corruption is not always detected, so I see things
   like floating point exceptions. ]

   Now, you may say, it's unfair to send 30,000 packets before receiving any.
   Well, that's what happens when node 0 is broadcasting many packets to
   bunches of nodes. mm5 used to work on mpich-gm, a long time ago. Now it
   pukes at the broadcast, which does not use MPI_Bcast, it just loops sending
   to all processes. When you replace that with an MPI_Bcast, it works.

   This also afflicts several of the NAS Parallel Benchmarks, at certain
   numbers of processes. It goes away when you get big enough (!)

 */

/* check delay variation */
#include<errno.h>
#include<limits.h>
#include<stdint.h>
#include<stdio.h>
#include<stdlib.h>
#include<sys/time.h>
#include"mpi.h"

/* Compile-time parameters. */
#define ALIGN   8192		/* boundary, in bytes, the message buffer is rounded up to */
#define REPS    1000		/* default number of timed iterations (sendcnt) */
#define R2      20			/* not referenced in the code visible in this file */
#define PREC    1000000.	/* not referenced in the code visible in this file */

/* File-scope state. */
double *vec;				/* one elapsed-time sample per timed iteration; allocated in main */
FILE *fp;					/* not referenced in the code visible in this file */

/* Wall-clock timer, defined later in this file. */
double secs(void);

/*
 * Print the command-line synopsis on stderr and abort the whole MPI job.
 * prog is the program name to show (argv[0]).
 */
static void
usage_abort(const char *prog)
{
	fprintf(stderr,
			"usage: %s [-m message_size] [-n message_count] [-s presend_count]\n",
			prog);
	MPI_Abort(MPI_COMM_WORLD, 1);
	exit(1);					/* fallback in case MPI_Abort returns */
}

/*
 * Parse text as a non-negative decimal int.
 * Returns 0 and stores the value in *out on success; returns -1 (leaving
 * *out untouched) if text is NULL, empty, has trailing characters, is
 * negative, or does not fit in an int.
 */
static int
parse_count(const char *text, int *out)
{
	char *end;
	long v;

	if (text == NULL)
		return -1;
	errno = 0;
	v = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE || v < 0 || v > INT_MAX)
		return -1;
	*out = (int) v;
	return 0;
}

/* Blocking-send len bytes from buf, tag 0, to every rank except me. */
static void
send_to_peers(void *buf, int len, int me, int np)
{
	int peer;

	for (peer = 0; peer < np; peer++) {
		if (peer != me)
			MPI_Send(buf, len, MPI_BYTE, peer, 0, MPI_COMM_WORLD);
	}
}

/* Blocking-receive len bytes into buf, tag 0, from every rank except me. */
static void
recv_from_peers(void *buf, int len, int me, int np)
{
	MPI_Status status;
	int peer;

	for (peer = 0; peer < np; peer++) {
		if (peer != me)
			MPI_Recv(buf, len, MPI_BYTE, peer, 0, MPI_COMM_WORLD, &status);
	}
}

/*
 * Message-exchange timing test with an optional backlog of unreceived
 * messages.
 *
 * Options:
 *   -m N   message length in bytes            (default 8)
 *   -n N   number of timed exchange rounds    (default REPS)
 *   -s N   number of "presend" messages       (default 0)
 * Unrecognised options are ignored.
 *
 * Every rank runs the same sequence:
 *   1. a one-int handshake with every other rank;
 *   2. N presend messages to every other rank, with no matching receive
 *      yet, so that many messages per peer stay outstanding during the
 *      following steps;
 *   3. three warm-up exchange rounds;
 *   4. the timed exchange rounds, one sample per round stored in vec[];
 *   5. receives that drain the backlog left by step 2.
 * In each exchange a rank sends to all peers before it posts any receive.
 * Rank 0 finally prints min/max/average/variance of the samples.
 *
 * Returns 0 on normal completion; bad arguments or allocation failure
 * abort the MPI job.
 */
int
main(int argc, char *argv[])
{
	double s, t, q, min, max, avrg, var, ovrhd;
	char *raw, *vtmp;
	size_t misalign, nsamples;
	int i, me, np, count;
	int token = 0, tmp;
	int lth = 8;
	int presend = 0;
	int sendcnt = REPS;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &me);
	MPI_Comm_size(MPI_COMM_WORLD, &np);

	for (i = 1; i < argc && argv[i][0] == '-'; i++) {
		int *target = NULL;

		switch (argv[i][1]) {
		case 'm':
			target = &lth;
			break;
		case 's':
			target = &presend;
			break;
		case 'n':
			target = &sendcnt;
			break;
		case '?':
		case ':':
			usage_abort(argv[0]);
			break;
		default:
			break;				/* unknown options are silently skipped */
		}
		if (target != NULL) {
			/* the option's value is the next argument; it must exist and parse */
			i++;
			if (i >= argc || parse_count(argv[i], target) != 0)
				usage_abort(argv[0]);
		}
	}

	/* Over-allocate so an ALIGN-aligned region of lth bytes fits inside. */
	raw = malloc((size_t) lth + ALIGN + 1);
	/* Always allocate at least one sample so that -n 0 is not an error. */
	nsamples = sendcnt > 0 ? (size_t) sendcnt : 1;
	vec = (nsamples <= SIZE_MAX / sizeof *vec) ? malloc(nsamples * sizeof *vec) : NULL;
	if (raw == NULL || vec == NULL) {
		fprintf(stderr, "%d: out of memory\n", me);
		MPI_Abort(MPI_COMM_WORLD, 1);
		exit(1);
	}
	/* Round up to the next ALIGN boundary; raw is kept so it can be freed. */
	misalign = (size_t) ((uintptr_t) raw & (ALIGN - 1));
	vtmp = raw + (misalign ? ALIGN - misalign : 0);

	for (i = 0; i < lth; i++)
		vtmp[i] = 0;			/* pre-touch and avoid VM delays */

	ovrhd = 777777;				/* determine clock overhead */
	for (i = 0; i < 10; i++) {
		t = secs();
		s = secs();
		t = s - t;
		if (t < ovrhd)
			ovrhd = t;
	}
	for (i = 0; i < sendcnt; i++)
		vec[i] = 0;				/* avoid pg faults */
	avrg = max = 0;
	min = 777777;

	/* Handshake: exchange one int with every other rank. */
	send_to_peers(&token, (int) sizeof token, me, np);
	recv_from_peers(&tmp, (int) sizeof tmp, me, np);

	/* Build the backlog: these sends are only matched by the drain loop below. */
	for (i = 0; i < presend; i++) {
		for (count = 0; count < np; count++) {
			if (count != me) {
				if ((i % 10000) == 0) {
					printf("%d: presend %d\n",me,i);
					fflush(stdout);
				}
				MPI_Send(vtmp, lth, MPI_BYTE, count, 0, MPI_COMM_WORLD);
			}
		}
	}

	for (i = 0; i < 3; i++) {	/* warmup */
		send_to_peers(vtmp, lth, me, np);
		recv_from_peers(vtmp, lth, me, np);
	}

	s = secs();
	for (i = 0; i < sendcnt; i++) {
		t = secs();
		send_to_peers(vtmp, lth, me, np);
		recv_from_peers(vtmp, lth, me, np);
		q = secs();
		t = q - t - ovrhd;		/* elapsed time minus timer overhead */
		if (t < min)
			min = t;
		if (t > max)
			max = t;
		avrg += t;
		vec[i] = t;
	}
	t = secs();

	/* Drain the backlog so sends and receives balance before finalizing. */
	for (i = 0; i < presend; i++)
		recv_from_peers(vtmp, lth, me, np);

	if (me == 0) {
		var = 0.;
		if (sendcnt > 0)
			avrg = avrg / sendcnt;
		for (i = 0; i < sendcnt; i++)
			var += (vec[i] - avrg) * (vec[i] - avrg);
		if (sendcnt > 0)
			var = var / sendcnt;	/* population variance, not the raw sum of squares */

		printf("at %f overhead %f us\n", t, ovrhd * 1.e6);
		printf("reps %d min %f max %f disp %f avrg %f var %g over %f secs\n",
			   sendcnt, min, max, max - min, avrg, var, t - s);
		printf(" lth %d  %f MBs %f us 1-way avrg: %f %f\n",
		lth, 2.e-6 * lth / min, .5e6 * min, 2.e-6 * lth / avrg, .5e6 * avrg);
		printf("done on %d procs with %d byte messages\n", np, lth);
	}

	printf("%d: completed\n",me);

	free(raw);
	free(vec);
	vec = NULL;

	MPI_Finalize();
	return 0;
}
/*
 * Return the current time of day, as reported by gettimeofday(), in
 * seconds: the tv_sec field plus the tv_usec field scaled by 1e-6.
 */
double
secs(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (double) now.tv_sec + (double) now.tv_usec / 1000000.0;
}
