#include <stdio.h>
#include <string.h>
#include "shiftblocks.c"
#define	N	8	/* matrix dimension */
#define NPROC	1	/* number of processors = NPROC^2 */
#include "multiply.c"

/*
 * Left-hand input matrix.  main() prints it, then reuses its storage as
 * the source buffer for every block shift, so its contents do not
 * survive the run.
 */
static float aa[N][N] = {
    {  1,  0,  1,  2,  1,  0,  0,  1 },
    {  0,  0,  0,  1,  1,  2,  0,  1 },
    {  2,  1,  1, -1,  0,  0,  1,  1 },
    { -1,  0,  1,  2, -1,  1,  0,  1 },
    {  0,  1,  1,  0,  2, -1, -1,  0 },
    {  1,  1,  1,  1,  0,  0, -1,  0 },
    {  1,  0,  0,  2, -1,  1,  0, -1 },
    { -1,  1,  1,  0,  2,  1,  1,  0 }
};

/* Right-hand input matrix; used by main() the same way as aa. */
static float bb[N][N] = {
    {  0,  1,  1, -1,  2,  0,  0,  0 },
    {  1,  0, -1,  2,  0,  1,  1, -1 },
    { -1, -1,  0,  0,  1,  2,  1,  0 },
    {  0,  0,  1, -1,  1,  0,  1,  1 },
    {  1,  1,  1,  1,  0,  2, -1,  0 },
    { -1,  0,  0,  1,  2,  1,  0,  0 },
    {  0,  0,  1, -1,  1,  2,  1,  0 },
    {  0,  1,  1,  2, -1,  0, -1, -1 }
};

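/*
 * Alternative test inputs, currently disabled: a 4x4 pair (whose product
 * is the identity matrix) followed by a 6x6 pair.  To use one, set N to
 * match and substitute it for the active aa/bb initializers above.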
aa[N][N] = {	{ 1,-1, 0, 0},
		{-1, 2,-1, 0},
		{ 0,-1, 2,-1},
		{ 0, 0,-1, 2}  },
bb[N][N] = {	{4, 3, 2, 1},
		{3, 3, 2, 1},
		{2, 2, 2, 1},
		{1, 1, 1, 1}   };
aa[N][N] = {    { 11, 12, 13, 14, 15, 16 },
		{ 21, 22, 23, 24, 25, 26 },
		{ 31, 32, 33, 34, 35, 36 },
		{ 41, 42, 43, 44, 45, 46 },
		{ 51, 52, 53, 54, 55, 56 },
		{ 61, 62, 63, 64, 65, 66 }    },
bb[N][N] = {    { 11, 12, 13, 14, 15, 16 },
		{ 21, 22, 23, 24, 25, 26 },
		{ 31, 32, 33, 34, 35, 36 },
		{ 41, 42, 43, 44, 45, 46 },
		{ 51, 52, 53, 54, 55, 56 },
		{ 61, 62, 63, 64, 65, 66 }    };
*/

/*
 * Driver: multiplies the file-scope matrices aa and bb block-wise on an
 * NPROC x NPROC grid of (N/NPROC) x (N/NPROC) blocks and prints the two
 * inputs followed by the result.
 *
 * The sequence (initial skew of the blocks, then NPROC-1 rounds of
 * single-step shifts, each followed by a block multiply-accumulate)
 * follows the pattern of Cannon's algorithm.  This assumes, as the
 * original comments indicate, that shiftblocks() copies its first two
 * block arguments into its third and fourth, and that multi_add()
 * accumulates the product of its first two blocks into its third; both
 * are defined in the included .c files -- TODO confirm.
 *
 * Side effect: aa and bb are overwritten with the shifted data.
 *
 * Returns 0 on success, 1 if N is not a multiple of NPROC.
 */
int main(void)
{
    float  cc[N][N], dd[N][N], ee[N][N];
    /* Per-block tables of row pointers into the flat N x N matrices. */
    float  *a[NPROC][NPROC][N/NPROC], *b[NPROC][NPROC][N/NPROC],
           *c[NPROC][NPROC][N/NPROC], *d[NPROC][NPROC][N/NPROC],
           *e[NPROC][NPROC][N/NPROC];
    int    i, j, k, bsize = N/NPROC;

    /* With a truncated bsize the trailing rows/columns would be skipped. */
    if (N % NPROC != 0) {
        fprintf(stderr, "N (%d) must be a multiple of NPROC (%d)\n",
                N, NPROC);
        return 1;
    }

    /* Row k of block (i,j) starts at matrix[i*bsize + k][j*bsize]. */
    for (i = 0; i < NPROC; i++) {
        for (j = 0; j < NPROC; j++) {
            for (k = 0; k < bsize; k++) {
                a[i][j][k] = &aa[i*bsize + k][j*bsize];
                b[i][j][k] = &bb[i*bsize + k][j*bsize];
                c[i][j][k] = &cc[i*bsize + k][j*bsize];
                d[i][j][k] = &dd[i*bsize + k][j*bsize];
                e[i][j][k] = &ee[i*bsize + k][j*bsize];
            }
        }
    }
    printmatrix(&aa[0][0], N);
    printmatrix(&bb[0][0], N);

    /* Initial skew: a(shift i West) -> c; b(shift j North) -> d */
    for (i = 0; i < NPROC; i++) {
        for (j = 0; j < NPROC; j++) {
            shiftblocks(a[i][j], b[i][j],
                        c[i][(j - i + NPROC) % NPROC],
                        d[(i - j + NPROC) % NPROC][j],
                        bsize);
        }
    }

    /* c*d -> e, starting from a zeroed accumulator */
    memset(ee, 0, sizeof ee);
    for (i = 0; i < NPROC; i++) {
        for (j = 0; j < NPROC; j++) {
            multi_add(c[i][j], d[i][j], e[i][j], bsize);
        }
    }

    for (k = 1; k < NPROC; k++) {
        /* The previous round's shifted blocks become this round's source. */
        memcpy(aa, cc, sizeof aa);
        memcpy(bb, dd, sizeof bb);

        /* a(shift 1 East) -> c; b(shift 1 South) -> d */
        for (i = 0; i < NPROC; i++) {
            for (j = 0; j < NPROC; j++) {
                shiftblocks(a[i][j], b[i][j],
                            c[i][(j + 1) % NPROC],
                            d[(i + 1) % NPROC][j],
                            bsize);
            }
        }

        /* c*d -> e */
        for (i = 0; i < NPROC; i++) {
            for (j = 0; j < NPROC; j++) {
                multi_add(c[i][j], d[i][j], e[i][j], bsize);
            }
        }
    }

    printmatrix(&ee[0][0], N);
    return 0;
}
