#include "atlas_misc.h"
#include "atlas_prefetch.h"

/*
 * Cleanup routine for the contiguous update  Y[i] += alpha0 * X[i],
 * for i = 0 .. N-1.
 *
 * N      : number of elements to update; N <= 0 performs no work.
 * alpha0 : scalar multiplier applied to every element of X.
 * X      : input vector, read at consecutive elements.
 * Y      : vector updated in place, written at consecutive elements.
 *
 * The elements are consumed in chunks of 8, then 4, then 2, then 1.
 * The 8-wide stage repeats while at least 8 elements remain, so the routine
 * is correct for any N and not only for remainders smaller than 16.
 */
static void axpyCU(const int N, const SCALAR alpha0, const TYPE *X, TYPE *Y)
{
   int nr = N;                    /* elements still to be processed */
   register TYPE alpha=alpha0;

   /* After this loop fewer than 8 elements remain. */
   while (nr >= 8)
   {
      *Y   += alpha * *X;
      Y[1] += alpha * X[1];
      Y[2] += alpha * X[2];
      Y[3] += alpha * X[3];
      Y[4] += alpha * X[4];
      Y[5] += alpha * X[5];
      Y[6] += alpha * X[6];
      Y[7] += alpha * X[7];
      X += 8;
      Y += 8;
      nr -= 8;
   }
   /* At most 7 elements left: each of the stages below can fire once. */
   if (nr >= 4)
   {
      *Y   += alpha * *X;
      Y[1] += alpha * X[1];
      Y[2] += alpha * X[2];
      Y[3] += alpha * X[3];
      X += 4;
      Y += 4;
      nr -= 4;
   }
   if (nr >= 2)
   {
      *Y   += alpha * *X;
      Y[1] += alpha * X[1];
      X += 2;
      Y += 2;
      nr -= 2;
   }
   if (nr >= 1)
   {
      *Y   += alpha * *X;
   }
}
/*
 * Computes  Y[i] += alpha0 * X[i]  for i = 0 .. N-1 over contiguous vectors.
 *
 * N          : number of elements to update.
 * alpha0     : scalar multiplier.
 * X          : input vector.
 * incX, incY : not read by this routine; X and Y are walked one element
 *              at a time.
 * Y          : vector updated in place.
 *
 * The largest multiple of 16 not exceeding N is handled by a 16-way unrolled
 * loop; the remaining N % 16 elements are handed to axpyCU.
 * NOTE(review): the main loop stops on pointer equality, so N is presumably
 * expected to be non-negative -- confirm against the callers.
 */
void ATL_UAXPY(const int N, const SCALAR alpha0, const TYPE *X, const int incX,
               TYPE *Y, const int incY)
{
   const int tail = N % 16;       /* elements left for the cleanup routine */
   const int body = N - tail;     /* same value as (N/16)*16 */
   const TYPE *xend;
   const TYPE scale = alpha0;

   for (xend = X + body; X != xend; X += 16, Y += 16)
   {
/*
 *    Touch the block one iteration ahead before updating the current one.
 *    ATL_pfl1R / ATL_pfl1W come from atlas_prefetch.h and are presumably
 *    read / write prefetch hints.
 */
      ATL_pfl1R(X+16);
      ATL_pfl1W(Y+16);
      ATL_pfl1R(X+20);
      ATL_pfl1W(Y+20);
      ATL_pfl1R(X+24);
      ATL_pfl1W(Y+24);
      ATL_pfl1R(X+28);
      ATL_pfl1W(Y+28);

      Y[0]  += scale * X[0];
      Y[1]  += scale * X[1];
      Y[2]  += scale * X[2];
      Y[3]  += scale * X[3];
      Y[4]  += scale * X[4];
      Y[5]  += scale * X[5];
      Y[6]  += scale * X[6];
      Y[7]  += scale * X[7];
      Y[8]  += scale * X[8];
      Y[9]  += scale * X[9];
      Y[10] += scale * X[10];
      Y[11] += scale * X[11];
      Y[12] += scale * X[12];
      Y[13] += scale * X[13];
      Y[14] += scale * X[14];
      Y[15] += scale * X[15];
   }

   /* X and Y now point just past the unrolled part. */
   if (tail != 0)
   {
      axpyCU(tail, alpha0, X, Y);
   }
}
