#include "atlas_misc.h"
#include "atlas_prefetch.h"
/*
 * Scalar kernel: adds the unconjugated complex dot product of X and Y to
 * the accumulator held in dot.
 *
 *   N    number of complex elements to process
 *   X,Y  contiguous vectors, each element stored as a (real, imaginary)
 *        pair of floats
 *   dot  dot[0] is the real part, dot[1] the imaginary part; the existing
 *        contents are added to, not overwritten
 *
 * Lengths 1 through 3 are summed from the last element back to the first,
 * updating dot in place after every element.  Every other length is summed
 * from the first element to the last in local accumulators that are stored
 * to dot once at the end.
 */
static void ATL_dot(const int N, const float *X, const float *Y, float *dot)
{
   if (N >= 1 && N <= 3)
   {
      int k;

      /* short vectors: walk backwards, accumulating straight into dot */
      for (k = N-1; k >= 0; k--)
      {
         const float *x = X + 2*k;
         const float *y = Y + 2*k;

         dot[0] += x[0]*y[0] - x[1]*y[1];
         dot[1] += x[0]*y[1] + x[1]*y[0];
      }
      return;
   }
   else
   {
      float sumRe = dot[0], sumIm = dot[1];
      int left = N;

      /* general case: one complex multiply-accumulate per pass */
      while (left)
      {
         const float xr = X[0], xi = X[1];
         const float yr = Y[0], yi = Y[1];

         sumRe += xr*yr - xi*yi;
         sumIm += xr*yi + xi*yr;
         X += 2;
         Y += 2;
         left--;
      }
      dot[0] = sumRe;
      dot[1] = sumIm;
   }
}
/*
 * AltiVec kernel: adds the unconjugated complex dot product of X and Y
 * (N complex elements, each a (real, imaginary) float pair) to dot[0]
 * (real part) and dot[1] (imaginary part).
 *
 * Requirements visible from the code:
 *   - N must be positive and even: the do/while body runs at least once and
 *     consumes 4 floats (2 complex elements) per pass, terminating only
 *     when X lands exactly on X + 2*N.
 *   - X and Y are read with vec_ld at offset 0.  NOTE(review): per the
 *     AltiVec specification vec_ld ignores the low four address bits, so
 *     both pointers are expected to be 16-byte aligned; this routine does
 *     not check that itself.
 */
static void ATL_dot_av(const int N, const float *X, const float *Y, float *dot)
{
   vector float v0, v1, v2;
   /* Accumulators start at -0.0 in every lane (presumably so that an
      all-zero sum keeps the sign produced by the products -- confirm). */
   vector float vdotr=(vector float)(-0.0f, -0.0f, -0.0f, -0.0f);
   vector float vdoti=(vector float)(-0.0f, -0.0f, -0.0f, -0.0f);
   /* Byte permutation that exchanges the two 4-byte lanes of each 8-byte
      pair, i.e. swaps real and imaginary within each complex element. */
   const vector unsigned char vp = (vector unsigned char)
      (4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11);
   const float *stX = X + N+N;   /* one past the last float of X to read */
   char ln[64];                  /* raw storage for the aligned scratch */
   float *vt;

   do
   {
      v0 = vec_ld(0, X);         /* xr0 xi0 xr1 xi1 */
      v1 = vec_ld(0, Y);         /* yr0 yi0 yr1 yi1 */
      /* lanes accumulate xr*yr, xi*yi, xr*yr, xi*yi */
      vdotr = vec_madd(v0, v1, vdotr); X += 4;
      v2 = vec_perm(v1, v1, vp); /* yi0 yr0 yi1 yr1 */
      /* lanes accumulate xr*yi, xi*yr, xr*yi, xi*yr */
      vdoti = vec_madd(v0, v2, vdoti); Y += 4;
   }
   while (X != stX);
   /* Round ln down to a 16-byte boundary and step 16 bytes forward: the
      result lies 1..16 bytes into ln, leaving room for the 32 bytes stored
      below within the 64-byte array. */
   vt = (float*) (16 + ((((size_t) ln)>>4)<<4));
   vec_st(vdotr, 0, vt);
   vec_st(vdoti, 0, vt+4);
   /* real part: (xr*yr) lanes minus (xi*yi) lanes */
   *dot += vt[0] - vt[1] + vt[2] - vt[3];
   /* imaginary part: every cross-product lane is added */
   dot[1] += vt[4] + vt[5] + vt[6] + vt[7];
}

/*
 * Unconjugated complex dot product: dot = sum over k of X[k] * Y[k] for N
 * complex elements, each stored as an adjacent (real, imaginary) pair.
 *
 *   N     number of complex elements
 *   X,Y   input vectors; they are walked contiguously
 *   incx, incy
 *         not referenced by this implementation -- presumably the kernel is
 *         only selected for unit strides (confirm against the caller)
 *   dot   output: dot[0] receives the real part, dot[1] the imaginary part;
 *         both are zeroed first and then accumulated into
 *
 * For N >= 16 the routine issues prefetch requests, peels leading elements
 * with the scalar kernel until X sits on a 16-byte boundary, hands the
 * largest multiple-of-4 chunk to the AltiVec kernel, and finishes any
 * leftover elements with the scalar kernel.  Shorter vectors go straight
 * to the scalar kernel.
 *
 * The AltiVec kernel loads X and Y with vec_ld, which ignores the low four
 * address bits.  Only X is aligned by the peel, so the vector kernel is
 * used only when Y is 16-byte aligned at that point as well; otherwise the
 * whole remainder is computed by the scalar kernel, which has no alignment
 * requirement.
 */
void ATL_UDOT(const int N, const TYPE *X, const int incx,
              const TYPE *Y, const int incy, SCALAR dot)
{
   /* presumably the vector length in 16-byte units (ATL_MulBySize is
      defined outside this file -- confirm) */
   int cwrd = ATL_MulBySize(N)>>4;
   int i, n;
   size_t ii;

   *dot = dot[1] = ATL_rzero;
   if (N >= 16)
   {
/*
 *    Build the prefetch control word and start the prefetches.  The
 *    meaning of the ATL_GetCtrl / ATL_pfavR arguments comes from
 *    atlas_prefetch.h and is not visible here.
 */
      if (cwrd >= 64)
      {
         cwrd = (cwrd+31)>>5;
         if (cwrd <= 256) cwrd = ATL_GetCtrl(512, cwrd <= 255 ? cwrd : 0, 0);
         else /* use all pipes */
         {
            cwrd >>= 1;
            cwrd = ATL_GetCtrl(1024, cwrd <= 255 ? cwrd : 0, 0);
            ATL_pfavR(X+128, cwrd, 2);
            ATL_pfavR(Y+128, cwrd, 3);
         }
      }
      else cwrd = ATL_GetCtrl(64, (cwrd+3)>>2, 4);
      ATL_pfavR(X, cwrd, 0);
      ATL_pfavR(Y, cwrd, 1);
/*
 *    Count the leading elements that must be peeled before X reaches a
 *    16-byte boundary.  If stepping by ATL_sizeof never reaches one, the
 *    count runs up to N and the scalar kernel handles the whole vector.
 */
      for (n=0, ii=(size_t)X; n<N && (ii>>4)<<4 != ii; n++, ii+=ATL_sizeof);
      if (n)
      {
         ATL_dot(n, X, Y, dot);
         X += n+n;
         Y += n+n;
      }
      n = N - n;
      i = (n>>2)<<2;   /* largest multiple of 4 elements still to do */
/*
 *    X is aligned here; Y must be as well, because the vector kernel's
 *    aligned loads would otherwise read from the wrong address.
 */
      if (i && (((size_t)Y) & 15) == 0)
      {
         ATL_dot_av(i, X, Y, dot);
         X += i+i;
         Y += i+i;
         n -= i;
      }
      /* whatever is left: the tail, or everything if Y was misaligned */
      if (n) ATL_dot(n, X, Y, dot);
   }
   else ATL_dot(N, X, Y, dot);
}
