#include "atlas_misc.h"
/*
 * Both macros below are defined before atlas_prefetch.h is included, so they
 * are visible to that header.  ATL_AltiVec additionally enables the
 * "#ifdef ATL_AltiVec" sections of the kernel in this file.
 * NOTE(review): presumably the pair makes the ATL_pfav* macros expand to real
 * AltiVec prefetch operations instead of no-op fallbacks -- confirm against
 * atlas_prefetch.h.
 */
#define ATL_NoFakePF
#define ATL_AltiVec 
#include "atlas_prefetch.h"

void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with muladd=1, TA=T, TB=N, mu=4, nu=4, ku=2, prefetching A and B
 *
 * Each pass of the M-loop updates a 4x4 tile of C held in 16 scalar
 * accumulators:  rCi_j accumulates the sum over k of (pAi[k] * pBj[k]), where
 * pA0..pA3 and pB0..pB3 each walk K consecutive elements.
 *
 * Handling of the existing contents of C is chosen at compile time:
 *    BETA0 defined  : accumulators start at ATL_rzero; C is not read
 *    otherwise      : accumulators are loaded from C
 *    BETAX defined  : the loaded values are additionally multiplied by beta
 *
 * Things to be aware of when calling / modifying this routine:
 *  - alpha is never referenced in the body.
 *  - lda and ldb are used only to set up pA1..pA3 / pB1..pB3.  All later
 *    pointer advances (incAm, incAn, incBm, incBn, stM, stN) are derived from
 *    K, so the traversal is only self-consistent when lda == K and ldb == K.
 *  - There is no cleanup code: the loops terminate on pointer equality with
 *    stM / stN and the K loop runs (K>>1)-1 times before a peeled final pair
 *    of steps, so M and N must be multiples of 4 and K must be even and >= 2.
 *  - When TREAL is not defined, C is accessed with an element stride of 2
 *    (NOTE(review): presumably one component of interleaved complex storage;
 *    SHIFT is defined outside this file -- confirm).
 *  - ATL_pfavR, ATL_pfavW and ATL_GetCtrl are not defined in this file
 *    (NOTE(review): presumably AltiVec prefetch macros from atlas_prefetch.h;
 *    their last argument here is always 0..3 -- confirm its meaning there).
 */
{
/*
 * Loop sentinels: the M-loop ends when pA0 reaches stM, the N-loop when pB0
 * reaches stN
 */
   const TYPE *stM = A + K*M;
   const TYPE *stN = B + K*N;
   #ifdef ATL_AltiVec
      int blkstride, cwrdKB, cwrdC=ATL_MulBySize(8);
   #endif
/*
 * incAm : applied after the K walk has advanced an A pointer by K-1; the net
 *         movement per M-loop pass is (K-1) + (3*K+1) = 4*K elements
 * incAn : rewinds the A pointers by K*M after a full sweep over M
 * incBm : applied after the K walk has advanced a B pointer by K-1; rewinds
 *         it to the start of the same K elements for the next tile
 * incBn : advances the B pointers by 4*K for the next N-loop pass
 */
   const int incAn = -K*M, incAm = 3*K+1;
   const int incBm = 1-K, incBn = K<<2;
/*
 * Trip count of the unrolled-by-2 K loop; the last two K steps are peeled
 * and handled after the loop
 */
   const int kstart=(K>>1)-1;
/*
 * incCm : C pointer advance per 4-row tile (4 with TREAL, 8 otherwise)
 * incCn : after a sweep over M has advanced the C pointers by M (TREAL) or
 *         2*M (otherwise), moves them on by 4 strides of ldc (8 otherwise)
 */
   #ifdef TREAL
      #define incCm 4
      const int incCn = (((ldc) << 2)) - M;
   #else
      #define incCm 8
      const int incCn = (((ldc) << 3)) - (M+M);
   #endif
/*
 * pC0..pC3 address four consecutive ldc-strided slices of C, pA0..pA3 four
 * consecutive lda-strided slices of A, pB0..pB3 four consecutive ldb-strided
 * slices of B
 */
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT),*pC3=pC2+(ldc SHIFT);
/*
 * bp is dereferenced only in the BETAX section below
 */
   TYPE *bp = (TYPE *) &beta;
   const TYPE *pA0=A, *pA1=A+lda, *pA2=pA1+lda, *pA3=pA2+lda;
   const TYPE *pB0=B, *pB1=B+ldb, *pB2=pB1+ldb, *pB3=pB2+ldb;
   register int k;
   register TYPE rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3;
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1, 
                 rC0_2, rC1_2, rC2_2, rC3_2, rC0_3, rC1_3, rC2_3, rC3_3;

   #ifdef ATL_AltiVec
/*
 *    k is blkcount, cwrdKB is block size
 *
 *    The block size is halved (and the block count doubled) until it is at
 *    most 32 sixteen-byte words; a size of exactly 32 is then stored as 0.
 *    NOTE(review): presumably 0 is how the control word encodes 32 -- confirm
 *    against the ATL_GetCtrl definition.
 */
      k = 1; /* blkcount set to 1 unless KB too large */
      cwrdKB = (ATL_MulBySize(K)+15) >> 4;  /* # of 16-byte words in KB */
      while (cwrdKB > 32) 
      {
         cwrdKB >>= 1;
         k <<= 1;
      }
      if (cwrdKB == 32) cwrdKB = 0;
      blkstride = (K * sizeof(TYPE)) / k;
/*
 *    One-off prefetch request issued on A before the loops start
 */
      ATL_pfavR(A, ATL_GetCtrl(blkstride, k*K, cwrdKB), 3);
/*
 *    From here on cwrdKB holds the packed control value passed to every
 *    ATL_pfavR call in the loops, and cwrdC the one passed to ATL_pfavW.
 *    cwrdC is converted from bytes to 16-byte words, with a minimum of 1.
 */
      cwrdKB = ATL_GetCtrl(blkstride, k, cwrdKB);
      if (cwrdC >= 16) cwrdC >>= 4;
      else cwrdC = 1;
      cwrdC = ATL_GetCtrl(0, 1, cwrdC);
   #endif
   do /* N-loop */
   {
/*
 *    Prefetch requests for the four B slices used by this N-loop pass
 */
      ATL_pfavR(pB0, cwrdKB, 0);
      ATL_pfavR(pB1, cwrdKB, 1);
      ATL_pfavR(pB2, cwrdKB, 2);
      ATL_pfavR(pB3, cwrdKB, 3);
      do /* M-loop */
      {
/*
 *       Initialise the 16 accumulators: zero, C, or beta*C (see header)
 */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 =
            rC0_1 = rC1_1 = rC2_1 = rC3_1 =
            rC0_2 = rC1_2 = rC2_2 = rC3_2 =
            rC0_3 = rC1_3 = rC2_3 = rC3_3 = ATL_rzero;
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
               rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[2]; rC3_3 = pC3[3];
            #else
               rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];
               rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[4]; rC3_1 = pC1[6];
               rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[4]; rC3_2 = pC2[6];
               rC0_3 = *pC3; rC1_3 = pC3[2]; rC2_3 = pC3[4]; rC3_3 = pC3[6];
            #endif
            #ifdef BETAX
/*
 *             rA0 is borrowed as a temporary for beta; it is reloaded from A
 *             immediately after this section
 */
               rA0 = *bp;
               rC0_0 *= rA0; rC1_0 *= rA0; rC2_0 *= rA0; rC3_0 *= rA0;
               rC0_1 *= rA0; rC1_1 *= rA0; rC2_1 *= rA0; rC3_1 *= rA0;
               rC0_2 *= rA0; rC1_2 *= rA0; rC2_2 *= rA0; rC3_2 *= rA0;
               rC0_3 *= rA0; rC1_3 *= rA0; rC2_3 *= rA0; rC3_3 *= rA0;
            #endif
         #endif
/*
 *       Load the operands for the first K step; every pointer is left one
 *       element ahead of the value it has just supplied
 */
         rA0 = *pA0++; rA1 = *pA1++; rA2 = *pA2++; rA3 = *pA3++;
         rB0 = *pB0++; rB1 = *pB1++; rB2 = *pB2++; rB3 = *pB3++;
/*
 *       Main K loop, two K steps per iteration.  Within each step the
 *       operands for the following step are loaded as soon as the last
 *       multiply-add that needs the old value has been written, so loads are
 *       interleaved with the arithmetic.  Each pointer advances by 2 per
 *       iteration.
 */
         for (k=kstart; k; k--)
         {
            rC0_0 += rA0 * rB0;
            rC1_0 += rA1 * rB0;
            rC2_0 += rA2 * rB0;
            rC3_0 += rA3 * rB0; rB0 = *pB0;
            rC0_1 += rA0 * rB1;
            rC1_1 += rA1 * rB1;
            rC2_1 += rA2 * rB1;
            rC3_1 += rA3 * rB1; rB1 = *pB1;
            rC0_2 += rA0 * rB2;
            rC1_2 += rA1 * rB2;
            rC2_2 += rA2 * rB2;
            rC3_2 += rA3 * rB2; rB2 = *pB2;
            rC0_3 += rA0 * rB3; rA0 = *pA0;
            rC1_3 += rA1 * rB3; rA1 = *pA1;
            rC2_3 += rA2 * rB3; rA2 = *pA2;
            rC3_3 += rA3 * rB3; rA3 = *pA3;

            rC0_0 += rA0 * rB0; rB3 = *pB3;
            rC1_0 += rA1 * rB0;
            rC2_0 += rA2 * rB0;
            rC3_0 += rA3 * rB0; rB0 = pB0[1];
            rC0_1 += rA0 * rB1; pB0 += 2;
            rC1_1 += rA1 * rB1;
            rC2_1 += rA2 * rB1;
            rC3_1 += rA3 * rB1; rB1 = pB1[1];
            rC0_2 += rA0 * rB2; pB1 += 2;
            rC1_2 += rA1 * rB2;
            rC2_2 += rA2 * rB2;
            rC3_2 += rA3 * rB2; rB2 = pB2[1]; pB2 += 2;
            rC0_3 += rA0 * rB3; rA0 = pA0[1]; pA0 += 2;
            rC1_3 += rA1 * rB3; rA1 = pA1[1]; pA1 += 2;
            rC2_3 += rA2 * rB3; rA2 = pA2[1]; pA2 += 2;
            rC3_3 += rA3 * rB3; rA3 = pA3[1]; rB3 = pB3[1]; pA3 += 2; pB3 += 2;
         }
/*
 *       Peeled final two K steps.  The first of them also issues ATL_pfavW
 *       on the four C slices about to be stored, loads the last operands,
 *       rewinds pB0/pB1 by incBm and moves the A pointers on by incAm.
 */
         rC0_0 += rA0 * rB0; ATL_pfavW(pC0, cwrdC, 0);
         rC1_0 += rA1 * rB0; ATL_pfavW(pC1, cwrdC, 1);
         rC2_0 += rA2 * rB0; ATL_pfavW(pC2, cwrdC, 2);
         rC3_0 += rA3 * rB0; rB0 = *pB0; ATL_pfavW(pC3, cwrdC, 3);
         rC0_1 += rA0 * rB1; pB0 += incBm;
         rC1_1 += rA1 * rB1;
         rC2_1 += rA2 * rB1;
         rC3_1 += rA3 * rB1; rB1 = *pB1;
         rC0_2 += rA0 * rB2; pB1 += incBm;
         rC1_2 += rA1 * rB2;
         rC2_2 += rA2 * rB2;
         rC3_2 += rA3 * rB2; rB2 = *pB2;
         rC0_3 += rA0 * rB3; rA0 = *pA0; pA0 += incAm;
         rC1_3 += rA1 * rB3; rA1 = *pA1; pA1 += incAm;
         rC2_3 += rA2 * rB3; rA2 = *pA2; pA2 += incAm;
         rC3_3 += rA3 * rB3; rA3 = *pA3; pA3 += incAm;

/*
 *       Last K step.  pA0..pA3 have already been advanced by incAm, so the
 *       ATL_pfavR calls here are issued on the A addresses the next M-loop
 *       pass will start from; pB2/pB3 are rewound once rB3 has been loaded.
 */
         rC0_0 += rA0 * rB0; rB3 = *pB3; ATL_pfavR(pA0, cwrdKB, 0);
         rC1_0 += rA1 * rB0; ATL_pfavR(pA1, cwrdKB, 1);
         rC2_0 += rA2 * rB0; ATL_pfavR(pA2, cwrdKB, 2);
         rC3_0 += rA3 * rB0; ATL_pfavR(pA3, cwrdKB, 3);
         rC0_1 += rA0 * rB1; pB2 += incBm;
         rC1_1 += rA1 * rB1; pB3 += incBm;
         rC2_1 += rA2 * rB1;
         rC3_1 += rA3 * rB1;
         rC0_2 += rA0 * rB2;
         rC1_2 += rA1 * rB2;
         rC2_2 += rA2 * rB2;
         rC3_2 += rA3 * rB2;
         rC0_3 += rA0 * rB3;
         rC1_3 += rA1 * rB3;
         rC2_3 += rA2 * rB3;
         rC3_3 += rA3 * rB3;
/*
 *       Store the finished 4x4 tile back to C (element stride 1 with TREAL,
 *       2 otherwise) and step the C pointers to the next tile
 */
         #ifdef TREAL
            *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0;
            *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1;
            *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2;
            *pC3 = rC0_3; pC3[1] = rC1_3; pC3[2] = rC2_3; pC3[3] = rC3_3;
         #else
            *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0;
            *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1;
            *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2;
            *pC3 = rC0_3; pC3[2] = rC1_3; pC3[4] = rC2_3; pC3[6] = rC3_3;
         #endif
         pC0 += incCm;
         pC1 += incCm;
         pC2 += incCm;
         pC3 += incCm;
      }
      while(pA0 != stM);
/*
 *    End of a sweep over M: move the C pointers on by incCn, rewind the A
 *    pointers to the start of A, and advance the B pointers by 4*K
 */
      pC0 += incCn; pC1 += incCn; pC2 += incCn; pC3 += incCn;
      pA0 += incAn; pA1 += incAn; pA2 += incAn; pA3 += incAn;
      pB0 += incBn; pB1 += incBn; pB2 += incBn; pB3 += incBn;
   }
   while(pB0 != stN);
}
/*
 * incCm is #defined inside the body of ATL_USERMM; undefine it here so the
 * macro does not remain visible to any code that follows the kernel.
 */
#ifdef incCm
   #undef incCm
#endif
