/* doall.c: translated from DOALL.f (Parkbench/Low_Level/rinf1) */

#include "rinf1.h"

/* Matches the number of cases in doall()'s switch (icase 0..19);
 * presumably used by the driver as its loop bound -- confirm against callers. */
int casemax = 20;

/*
 * doall - run the kernel selected by ICASE repeatedly on vectors of length N
 *         and report the time per vector operation.
 *
 * ICASE  - case number (0-based), input.  Values outside 0..19 select no
 *          kernel: totim and *TN are set to 0.0 and N is returned unchanged.
 * N      - requested vector length (matrix order for cases 9..12), input.
 * TN     - output: (elapsed time - measured dummy()/timer overhead)
 *          divided by (flop * repeat count).  Not written when the function
 *          returns early with 0 because of N.
 *
 * return - 0 when N is 0 or too long for the selected case (N > nmax,
 *          N > nmax1 for the matrix cases, or strided extent > nmax);
 *          N*N for case 12, whose vector operation has length N*N;
 *          otherwise N as passed in.
 *
 * Side effects: sets the globals ntim and totim, stores the case title in
 * label[ICASE] (also when 0 is returned), and overwrites the work arrays
 * touched by the selected kernel.
 */
int doall(int icase, int n, double *tn)
{
    int  i, j, k, jt, n2, nl;
    double  ch, s;
    double  flop;		/* divisor applied per repeat; double so 2*n*n cannot overflow int */
    double  fntim = 1.0;	/* repeat count as double; matches ntim = 1 when n == 0 */
    double  sum = 0.0;
    double  t0 = 0.0;		/* overhead of the dummy() calls, measured below */
    double  t1 = 0.0, t2 = 0.0;

    ntim = 1;	/* SELECT REPEAT */
    if ( n != 0 ){
	ntim = ntimes / n;
	/*
	 * Divide by n one factor at a time.  The size checks for these cases
	 * only happen inside the switch, so n may still be huge here; forming
	 * n*n first could overflow int or even wrap to 0 (division by zero).
	 * With truncating integer division the stepwise result is identical.
	 */
	if ( icase == 9 ) ntim = ntim / n;
	if ( 9 < icase && icase < 13 ) ntim = ntim / n / n;
	if ( ntim == 0 ) ntim = 1;
	fntim = ntim;

	/* TIME OVERHEAD OF CALLING TIMER, IN T0 */
	t1 = dwalltime();
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
	}
	t2 = dwalltime();
	t0 = t2 - t1;
    }

    switch(icase){
    case 0:	/* CONTIGUOUS DYADS */
	label[icase] = "(1) CONTIGUOUS DYADS:  A(I)=B(I)*C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 1.0;
	t1 = dwalltime();
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for shared(a,b,c)
	    for( i = 0 ; i < n ; i++ ){/* THE VECTOR OPERATION */
		a[i] = b[i] * c[i];
	    }
	}
	t2 = dwalltime();
	break;

    case 1:	/* NONCONTIGUOUS DYAD, STRIDE=8 */
	label[icase] = "(2) DYADS,  STRIDE=8:  A(I)=B(I)*C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 1.0;
	nl = 1 + 8*(n - 1);	/* extent covered by n elements at stride 8 */
	if ( nl > nmax ) n = 0;
	if ( n == 0 )
	    return 0;	/*  N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */

	t1 = dwalltime();
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < nl ; i += 8 ){
		a[i] = b[i] * c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 2:	/* CONTIGUOUS TRIAD */
	label[icase] = "(3) CONTIGUOUS TRIADS:  A(I)=B(I)*C(I)+D(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 2.0;
	t1 = dwalltime();
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){
		a[i] = b[i] * c[i] + d[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 3:	/* NONCONTIGUOUS TRIAD, STRIDE=8 */
	label[icase] = "(4) TRIADS,  STRIDE=8:  A(I)=B(I)*C(I)+D(I)";
	/* Reject over-long n before multiplying by the stride (as case 1
	 * does), so the extent computation is not fed an arbitrary n. */
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 2.0;
	nl = 1 + 8*(n - 1);
	if ( nl > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < nl ; i += 8 ){
		a[i] = b[i] * c[i] + d[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 4:	/* RANDOM SCATTER/GATHER */
	label[icase] = "(5) RANDOM SCATTER/GATHER:";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	/* One gather plus one scatter per repeat: flop = 2 makes the
	 * reported time the average of the two (DIVIDE BY 2). */
	flop = 2.0;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ )
		a[i] = c[ib[i]];	/* RANDOM GATHER */
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){
		/* NOTE(review): unsynchronized (an omp critical was left
		 * disabled here); concurrent writes collide if ib[] holds
		 * duplicate indices -- confirm how ib[] is filled. */
		a[ib[i]] = c[i];	/* RANDOM SCATTER */
	    }
	}

	t2 = dwalltime();
	break;

    case 5:	/* CONTIGUOUS 4-OPS */
	label[icase] = "(6) CONTIGUOUS 4-OP:  A(I)=B(I)*C(I)+D(I)*E(I)+F(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 4.0;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){	/* THE VECTOR OPERATION */
		a[i] = b[i] * c[i] + d[i] * e[i] + f[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 6:	/* INNER PRODUCT (OpenMP sum reduction) */
	label[icase] = "(7) INNER PRODUCT:  S=S+B(I)*C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 2.0;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
	    sum = 0.0;
#pragma omp parallel for reduction (+:sum)
	    for( i = 0 ; i < n ; i++ ){
		sum = sum + b[i] * c[i];	/* THE VECTOR OPERATION */
	    }
	}

	t2 = dwalltime();
	/* Store the sum so the reduction loop cannot be optimized away.
	 * NOTE(review): the Fortran original names A(1); this writes a[1],
	 * not a[0] -- harmless as a sink, but confirm it was intended. */
	a[1] = sum;
	break;

    case 7:	/* FIRST ORDER RECURRENCE (serial: each element needs a[i-1]) */
	label[icase] = "(8) FIRST ORDER RECURRENCE:  A(I)=B(I)*A(I-1)+D(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 2.0;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
	    a[0] = d[0];
	    /* NOTE(review): the multiplier is f[], although the title
	     * above says B(I). */
	    for( i = 1 ; i < n ; i++ ){	/* THE VECTOR OPERATION */
		a[i] = f[i] * a[i-1] + d[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 8:	/* CHARGE ASSIGNMENT (indirect accumulate) */
	label[icase] = "(9) CHARGE ASSIGNMENT:  A(J(I))=A(J(I))+S";
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 1.0;
	t1 = dwalltime();

	ch = 1.7349;
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){	/* THE VECTOR OPERATION */
		/* NOTE(review): unsynchronized read-modify-write (an omp
		 * critical was left disabled here); updates race if ia[]
		 * holds duplicate indices -- confirm how ia[] is filled. */
		c[ia[i]] = c[ia[i]] + ch;
	    }
	}

	t2 = dwalltime();
	break;

    case 9:	/* TRANSPOSITION */
	label[icase] = "(10) TRANSPOSITION:  B(I,J)=A(J,I)";
	if ( n > nmax1 ) n = 0;	/* ESCAPE IF N TOO BIG */
	if ( n == 0 )
	    return 0;

	flop = n;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for private(j)
	    for( i = 0 ; i < n ; i++ ){	/* THE VECTOR OPERATION */
		for( j = 0 ; j < n ; j++ ){
		    matb[i][j] = mata[j][i];	/*!array is F order!*/
		}
	    }
	}

	t2 = dwalltime();
	break;

    case 10:	/* EACH INNER PRODUCT ACCUMULATED IN SEQUENCE */
	label[icase] = "(11) MATRIX MULT BY INNER PRODUCT";
	if ( n > nmax1 ) n = 0;	/* ESCAPE IF N TOO BIG */
	if ( n == 0 )
	    return 0;

	/* CLEAR ARRAY BEFORE ACCUMULATING SUMS */
#pragma omp parallel for private(j)
	for( i = 0 ; i < n ; i++ ){
	    for( j = 0 ; j < n ; j++ ){
		mata[i][j] = 0.0;
	    }
	}

	flop = 2.0*n*n;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for private(j,k)
	    for( i = 0 ; i < n ; i++ ){
		for( j = 0 ; j < n ; j++ ){	/* THE VECTOR OPERATION */
		    for( k = 0 ; k < n ; k++ ){	/* ACCUMULATE INNER PRODUCT */
			mata[i][j] = mata[i][j] + matb[i][k] * matc[k][j];
		    }
		}
	    }
	}

	t2 = dwalltime();
	break;

    case 11:	/* A COLUMN OF INNER PRODUCTS ARE ACCUMULATED IN PARALLEL */
	/* 2 N**2 VECTOR OPS OF LENGTH N */
	label[icase] = "(12) MATRIX MULT BY MIDDLE PRODUCT";
	if ( n > nmax1 ) n = 0;	/* ESCAPE IF N TOO BIG */
	if ( n == 0 )
	    return 0;

	/* CLEAR ARRAY BEFORE ACCUMULATING SUMS */
#pragma omp parallel for private(j)
	for( i = 0 ; i < n ; i++ ){
	    for( j = 0 ; j < n ; j++ ){
		mata[i][j] = 0.0;
	    }
	}

	flop = 2.0*n*n;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for private(j,k)
	    for( i = 0 ; i < n ; i++ ){/* THE VECTOR OPERATION OF LENGTH N */
		for( k = 0 ; k < n ; k++ ){
		    for( j = 0 ; j < n ; j++ ){
			mata[i][j] = mata[i][j] + matb[i][k] * matc[k][j];
		    }
		}
	    }
	}

	t2 = dwalltime();
	break;

    case 12:	/* ACCUMULATE ALL N**2 INNER PRODUCTS IN PARALLEL */
	/* USES 2*N VECTOR OPS OF LENGTH N**2 */
	label[icase] = "(13) MATRIX MULT BY OUTER PRODUCT";
	if ( n > nmax1 ) n = 0;	/* ESCAPE IF N TOO BIG */
	if ( n == 0 )
	    return 0;

	/* CLEAR ARRAY BEFORE ACCUMULATING SUMS */
	n2 = n * n;
#pragma omp parallel for
	for( i = 0 ; i < n2 ; i++ ){
	    a[i] = 0.0;
	}

	flop = 2.0*n;
	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
	    for( k = 0 ; k < n ; k++ ){
		/* LOAD 1D ARRAYS IN PREPARATION FOR VECTOR OP OF LENGTH N**2 */
#pragma omp parallel for private(j)
		for( i = 0 ; i < n ; i++ ){
		    for( j = 0 ; j < n ; j++ ){
			b[n*i+j] = matb[k][j];
			c[n*i+j] = matc[i][k];
		    }
		}
		/* THE VECTOR OPERATION OF LENGTH N**2 */
#pragma omp parallel for
		for( i = 0 ; i < n2 ; i++ ){
		    a[i] = a[i] + b[i] * c[i];
		}
	    }
	}

	t2 = dwalltime();
	n = n2;		/* report the actual vector length, N**2, to the caller */
	break;

    case 13:	/* NONCONTIGUOUS DYAD, STRIDE=128 */
	label[icase] = "(14) DYADS,  STRIDE=128:  A(I)=B(I)*C(I)";
	/* Reject over-long n before multiplying by the stride (as case 1
	 * does), so the extent computation is not fed an arbitrary n. */
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 1.0;
	nl = 1 + 128*(n - 1);
	if ( nl > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < nl ; i += 128 ){
		a[i] = b[i] * c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 14:	/* NONCONTIGUOUS DYAD, STRIDE=1024 */
	label[icase] = "(15) DYADS,  STRIDE=1024:  A(I)=B(I)*C(I)";
	/* Reject over-long n before multiplying by the stride (as case 1
	 * does), so the extent computation is not fed an arbitrary n. */
	if ( n > nmax ) n = 0;
	if ( n == 0 )
	    return 0;

	flop = 1.0;
	nl = 1 + 1024*(n - 1);
	if ( nl > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < nl ; i += 1024 ){
		a[i] = b[i] * c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 15:	/* CONTIGUOUS DAXPY */
	label[icase] = "(16) CONTIGUOUS DAXPY:  A(I)=S*B(I)+C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	flop = 2.0;
	s = 1.2345;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){
		a[i] = s * b[i] + c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 16:	/* INDIRECT DAXPY */
	label[icase] = "(17) INDIRECT DAXPY:  A(J(I))=S*B(K(I))+C(L(I))";
	if ( n > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	flop = 2.0;
	s = 1.2345;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){
		/* a[] and c[] are both indexed through ic[], b[] through ib[].
		 * NOTE(review): unsynchronized (an omp critical was left
		 * disabled here); writes collide if ic[] holds duplicate
		 * indices -- confirm how ic[] is filled. */
		a[ic[i]] = s * b[ib[i]] + c[ic[i]];
	    }
	}

	t2 = dwalltime();
	break;

    case 17:	/* CONTIGUOUS DAXPY, combined "parallel for" per repeat */
	label[icase] = "(18) CONTIGUOUS DAXPY[parallel do]: A(I)=S*B(I)+C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	flop = 2.0;
	s = 1.2345;

	t1 = dwalltime();

	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp parallel for
	    for( i = 0 ; i < n ; i++ ){
		a[i] = s * b[i] + c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 18:	/* CONTIGUOUS DAXPY, schedule(static,10) */
	label[icase] = "(19) CONTIGUOUS DAXPY[static(10)]: A(I)=S*B(I)+C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	flop = 2.0;
	s = 1.2345;

	t1 = dwalltime();

	/* One parallel region spans all repeats: every thread runs the jt
	 * loop (and calls dummy(jt)); only the inner loop is work-shared. */
#pragma omp parallel private(jt)
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp for schedule(static,10)
	    for( i = 0 ; i < n ; i++ ){
		a[i] = s * b[i] + c[i];
	    }
	}

	t2 = dwalltime();
	break;

    case 19:	/* CONTIGUOUS DAXPY, schedule(dynamic,10) */
	label[icase] = "(20) CONTIGUOUS DAXPY[dynamic(10)]: A(I)=S*B(I)+C(I)";
	if ( n > nmax ) n = 0;
	if ( n == 0 )	/* N=0 IS SIGNAL VECTOR TOO LONG TO OUTSIDE */
	    return 0;

	flop = 2.0;
	s = 1.2345;

	t1 = dwalltime();

	/* One parallel region spans all repeats: every thread runs the jt
	 * loop (and calls dummy(jt)); only the inner loop is work-shared. */
#pragma omp parallel private(jt)
	for( jt = 0 ; jt < ntim ; jt++ ){
	    dummy(jt);
#pragma omp for schedule(dynamic,10)
	    for( i = 0 ; i < n ; i++ ){
		a[i] = s * b[i] + c[i];
	    }
	}

	t2 = dwalltime();
	break;

    default:	/* unknown case: nothing is timed */
	totim = 0.0;
	*tn = 0.0;
	return n;
    }

    /* Subtract the calibrated dummy()/timer overhead, then normalize by the
     * per-repeat divisor and the repeat count. */
    totim = t2 - t1 - t0;
    *tn = totim / (flop * fntim);

    return n;
}
