/*
 *             Automatically Tuned Linear Algebra Software v3.2
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University of Tennessee, the ATLAS group,
 *      or the names of its contributers may not be used to endorse
 *      or promote products derived from this software without specific
 *      written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>

/* Two-level stringification: str() lets its argument be macro-expanded
 * before xstr() turns the result into a string literal. */
#define str(a_) xstr(a_)
#define xstr(a_) #a_

/* Byte offsets added to %eax to form the prefetch pointer kept in %ebx.
 * PREFN is spliced into lb (used inside the kernel groups); PREFN2 is
 * spliced into load_regs (one-time register setup). */
#define PREFN 64
#define PREFN2 32
/* String forms of the offsets, for concatenation into asm templates. */
#define PREFA str(PREFN)
#define PREFA2 str(PREFN2)
/* Qualifier placed on every asm statement in this file. */
#define VOLATILE __volatile__

/*
 * Single-instruction building blocks for the SSE kernel.  Each one is a
 * basic asm statement (no operand or clobber list), so the compiler is not
 * told which registers change; the kernel relies on register contents
 * surviving from one asm statement to the next.
 *
 * Register roles as used by these macros:
 *   %esi   store cursor (writec stores there, then advances it by 16 bytes)
 *   %eax   load cursor (loadc / loadb read through it)
 *   %ebx   prefetch cursor
 *   %xmm7  accumulator
 *   %xmm6  scratch operand loaded by loadb and scaled by mul()
 */
/* la: restart the load cursor at the store cursor (%eax = %esi). */
#define la           __asm__ VOLATILE ("movl %esi,%eax\n\t")
/* lb: place the prefetch cursor PREFN bytes past the load cursor. */
#define lb           __asm__ VOLATILE ("movl %eax,%ebx\n\taddl $" PREFA ",%ebx\n\t")
/* prefetch / prefetcha: prefetcht0 at the prefetch cursor / the load cursor. */
#define prefetch     __asm__ VOLATILE ("prefetcht0 (%ebx)\n\t")
#define prefetcha    __asm__ VOLATILE ("prefetcht0 (%eax)\n\t")
/* loadc: unaligned 16-byte load from the load cursor into the accumulator. */
#define loadc        __asm__ VOLATILE ("movups (%eax),%xmm7\n\t")
/* loadb: 16-byte load from the load cursor into %xmm6.  The unaligned form
 * is selected when STRIDE is not a multiple of 4; otherwise the aligned
 * movaps is used, which requires (%eax) to be 16-byte aligned. */
#if STRIDE % 4
#define loadb        __asm__ VOLATILE ("movups (%eax),%xmm6\n\t")
#else
#define loadb        __asm__ VOLATILE ("movaps (%eax),%xmm6\n\t")
#endif
/* writec: store the accumulator at the store cursor and advance it 16 bytes. */
#define writec       __asm__ VOLATILE ("movups %xmm7,(%esi)\n\taddl $16,%esi")
/* xor(n): zero %xmm<n>. */
#define xor(a_)      __asm__ VOLATILE ("xorps %xmm" #a_ ",%xmm" #a_ "\n\t")
/* mul(n): %xmm6 *= %xmm<n> (packed single precision). */
#define mul(a_)      __asm__ VOLATILE ("mulps  %xmm" #a_ ",%xmm6\n\t")
/* add: accumulator += %xmm6. */
#define add          __asm__ VOLATILE ("addps %xmm6,%xmm7\n\t")
/* incx(a,b): %e<b>x += %e<a>x, where a and b are register letters a/b/c/d. */
#define incx(a_,b_)  __asm__ VOLATILE ("addl %e" #a_ "x,%e" #b_ "x\n\t")

/*
 * Composite steps built from the primitives above.
 *
 * dp(a,n):    advance the load cursor by %e<a>x, load four values into
 *             %xmm6, multiply by %xmm<n>, add into the accumulator.
 * pref(a):    advance the prefetch cursor by %e<a>x and prefetch there.
 * prefa(a):   advance the load cursor by %e<a>x and prefetch there.
 * dpp(a,n,c): same as dp(a,n), with a pref(c) issued between the load and
 *             the multiply.
 */
#define dp(a_,b_)           {incx(a_,a);loadb;mul(b_);add;}
#define pref(a_)         {incx(a_,b);prefetch;}
#define prefa(a_)        {incx(a_,a);prefetcha;}
#define dpp(a_,b_,c_)       {incx(a_,a);loadb;pref(c_);mul(b_);add;}

/*
 * bla<N>: first group of an 8-wide pass.  Resets the load and prefetch
 * cursors from the store cursor (la, lb), prefetches, loads the accumulator
 * with loadc, then performs N multiply-accumulate steps against %xmm0 ..
 * %xmm<N-1>.  The first step advances the load cursor by %edx, later steps
 * by %ecx; every second step is a dpp that also issues a prefetch.
 */
#define bla1          {la;lb;prefetch;loadc;dp(d,0);}
#define bla2          {bla1;dpp(c,1,d);}
#define bla3          {bla2;dp(c,2);}
#define bla4          {bla3;dpp(c,3,c);}
#define bla5          {bla4;dp(c,4);}
#define bla6          {bla5;dpp(c,5,c);}

/*
 * blb<N>: second group of an 8-wide pass.  Performs the same N
 * multiply-accumulate steps as bla<N>, but does not reset the prefetch
 * cursor (no lb) and places its prefetches at different points in the
 * sequence.
 */
#define blb1          {la;pref(d);loadc;dp(d,0);}
#define blb2          {la;loadc;dpp(d,0,c);dp(c,1);}
#define blb3          {la;pref(c);loadc;dp(d,0);dpp(c,1,c);dp(c,2);}
#define blb4          {blb2;dpp(c,2,c);dp(c,3);}
#define blb5          {blb3;dpp(c,3,c);dp(c,4);}
#define blb6          {blb4;dpp(c,4,c);dp(c,5);}

/*
 * Kernel bodies, selected by NDP (the number of multiply-accumulate steps
 * per output vector).  Each writec stores 16 bytes, so DOT_PROD4 produces
 * one 16-byte result at the store cursor, DOT_PROD8 two and DOT_PROD16 four.
 * The macros are #undef'd first, presumably so that an earlier definition
 * (e.g. from a previous inclusion with different parameters) is replaced.
 */
#undef DOT_PROD4
#define DOT_PROD4 {Mjoin(bla,NDP);writec;}

#undef DOT_PROD8
#define DOT_PROD8 {DOT_PROD4;Mjoin(blb,NDP);writec;}

#undef DOT_PROD16
#define DOT_PROD16 {DOT_PROD8;DOT_PROD8;}

/*
 * Control-flow skeleton of the kernel.  The macros emit assembler labels
 * suffixed with EXT and are meant to be interleaved with the DOT_PROD*
 * bodies in this order:
 *
 *     LOOP;  DOT_PROD16;  LAB8;  DOT_PROD8;  LAB4;  DOT_PROD4;  LAB1(ptr);
 *
 * Because the labels are plain (non-local) symbols, the sequence can be
 * emitted only once per EXT value in an assembler output file.
 *
 * LOOP: top of the 16-wide loop.  While the store cursor (%esi) differs
 *       from the end pointer held in %edi, fall into block16; otherwise
 *       jump to block8.
 */
#define LOOP __asm__ VOLATILE ("\nloop_" str(EXT) ":\n\t"\
                               "cmpl %esi,%edi\n\t"\
                               "jne block16_" str(EXT) "\n\t"\
                               "jmp block8_" str(EXT) "\n\t"\
                               ".align 16\n"\
                               "\nblock16_" str(EXT) ":\n\t")

/*
 * LAB8: closes the 16-wide loop (jump back to loop_) and opens the 8-wide
 *       remainder.  It reloads %edi with the variable `len` taken from the
 *       enclosing scope and skips to block4 when bit 3 of len is clear.
 *       %edi is overwritten here, so it is listed as clobbered; that also
 *       keeps the compiler from addressing the `len` operand through %edi.
 */
#define LAB8 __asm__ VOLATILE ("jmp loop_" str(EXT) "\n\t"\
                               ".align 16\n"\
                               "block8_" str(EXT) ":\n\t"\
                               "movl %0,%%edi\n\t"\
                               "testl $8,%%edi\n\t"\
                               "je block4_" str(EXT) "\n\t"\
                               : : "m" (len) : "di")
/*
 * LAB4: opens the 4-wide remainder; skips to block1 when bit 2 of the
 *       length still held in %edi is clear.
 */
#define LAB4 __asm__ VOLATILE ("\nblock4_" str(EXT) ":\n\t"\
                               "testl $4,%edi\n\t"\
                               "je block1_" str(EXT) "\n\t")

/*
 * LAB1(a_): end of the asm kernel; writes the final store cursor (%esi)
 *           back into the C lvalue a_.  Listing %esi as clobbered keeps
 *           the compiler from using it to address a_.
 */
#define LAB1(a_) __asm__ VOLATILE ("\nblock1_" str(EXT) ":\n\tmovl %%esi,%0\n\t" \
                             : "=m" (a_) : : "si" )

/*
 * load_regs(a_,b_,c_,d_): one-time register setup for the asm kernel.
 * All four arguments must be lvalues in memory ("m" constraints) and are
 * read with 32-bit moves:
 *
 *   %esi = a_            store cursor
 *   %eax = %esi          load cursor
 *   %ebx = %eax + PREFN2 prefetch cursor
 *   %edi = b_            compared against %esi by LOOP
 *   %edx = c_            increment applied by the (d,...) steps
 *   %ecx = d_            increment applied by the (c,...) steps
 *
 * All six general registers written here are declared as clobbered.
 */
#define load_regs(a_,b_,c_,d_)  \
__asm__ VOLATILE ("movl %0,%%esi\n\t"\
		  "movl %%esi,%%eax\n\t"\
		  "movl %%eax,%%ebx\n\t"\
		  "addl $" PREFA2 ",%%ebx\n\t"\
		  "movl %1,%%edi\n\t"\
		  "movl %2,%%edx\n\t"\
		  "movl %3,%%ecx\n\t"\
		  : : "m" (a_),"m" (b_),"m" (c_),"m" (d_) : \
		  "ax","bx","cx","dx","si","di")

/*
 * Initial prefetch sequence issued once before the main loop.
 *
 * ipref(a): advance both the load cursor and the prefetch cursor by
 *           %e<a>x, prefetching at each new position.
 * ir<N>:    prefetch at the current load and prefetch cursors, then perform
 *           N ipref steps (the first by %edx, the rest by %ecx).
 * init_regs selects ir<NDP>.
 *
 * The sequence leaves %eax and %ebx advanced; bla1 re-derives both from
 * %esi (la, lb) before any data is loaded.
 */
#define ipref(a_)  {prefa(a_);pref(a_);}

#define ir1      {prefetcha;prefetch;ipref(d);}
#define ir2      {ir1;ipref(c);}
#define ir3      {ir2;ipref(c);}
#define ir4      {ir3;ipref(c);}
#define ir5      {ir4;ipref(c);}
#define ir6      {ir5;ipref(c);}

#define init_regs  Mjoin(ir,NDP)

/*
 * Coefficient preload: broadcasts NDP scalars into %xmm0 .. %xmm<NDP-1>
 * for use by the mul() steps of the kernel.
 *
 * init_preload(a_,b_): %ecx = a_ (address of the first scalar),
 *                      %edx = b_ (byte distance between scalars).
 *                      Both arguments must be lvalues in memory.  The asm
 *                      writes %ecx and %edx, so both are declared as
 *                      clobbered; this also stops the compiler from
 *                      addressing either operand through them.  The
 *                      historical "ax"/"bx" entries are kept so the set of
 *                      registers withheld from the compiler only grows.
 */
#define init_preload(a_,b_) __asm__ VOLATILE (\
"movl %0,%%ecx\n\tmovl %1,%%edx\n\t"::"m" (a_),"m" (b_):"ax","bx","cx","dx")

/* inca: step %ecx to the next scalar.  The definition carries its own
 * trailing semicolon, so `inca;` expands to the statement plus an empty
 * statement. */
#define inca __asm__ VOLATILE ("addl %edx,%ecx\n\t");

/* preload_reg(n): load the float at (%ecx) into the low lane of %xmm7,
 * copy it to %xmm<n> and replicate lane 0 across all four lanes of
 * %xmm<n>.  %xmm7 is overwritten as scratch. */
#define preload_reg(a_) __asm__ VOLATILE (\
"movss (%ecx),%xmm7\n\tmovups %xmm7,%xmm" #a_ "\n\tshufps $0,%xmm7,%xmm" #a_ "\n\t")

/* pr<N>: broadcast N consecutive scalars (stepping with inca between them)
 * into %xmm0 .. %xmm<N-1>. */
#define pr1 {preload_reg(0);}
#define pr2 {pr1;inca;preload_reg(1);}
#define pr3 {pr2;inca;preload_reg(2);}
#define pr4 {pr3;inca;preload_reg(3);}
#define pr5 {pr4;inca;preload_reg(4);}
#define pr6 {pr5;inca;preload_reg(5);}

/* preload_regs(a_,b_): set up %ecx/%edx from a_ and b_, then run pr<NDP>. */
#define preload_regs(a_,b_) {init_preload(a_,b_);Mjoin(pr,NDP);}

     
/*
 * Mjoin(ma,EXT): accumulate an NDP-term dot product into each of `len`
 * consecutive outputs:
 *
 *     c[j] += sum over i in [0,NDP) of a[i*ainc] * b[j + i*ldb]
 *
 * for j in [0,len).  This is the computation performed by the scalar loops
 * below; the inline-asm section in between applies the same update four
 * floats at a time.
 *
 *   a, ainc  NDP coefficients, `ainc` floats apart.
 *   b, ldb   NDP input rows, `ldb` floats apart; row i starts at b + i*ldb.
 *   c        output vector, updated in place.
 *   len      number of outputs.
 *
 * Structure:
 *   1. Scalar head loop until b reaches a 16-byte boundary (or c is
 *      exhausted), since the kernel may load b with an aligned movaps.
 *   2. Asm kernel: a 16-wide loop, then optional 8- and 4-wide tails chosen
 *      by bits 3 and 2 of the remaining length.
 *   3. Scalar tail loop for whatever the kernel left over.
 *
 * The asm moves pointers with 32-bit instructions and keeps state in
 * registers across separate asm statements, so the order of the macro
 * invocations between preload_regs and LAB1 must not be changed and no C
 * statements may be inserted between them.
 */
static __inline__ void
Mjoin(ma,EXT)(const float *a,int ainc,const float *b,int ldb,float *c,int len) {

    const float *ce,*ce16,*ba;
    const float *tb;
    const float *ta;
    int i,c2b,b2b,a2a;

    /* Byte distances consumed by the asm kernel: from c to b, between
     * consecutive rows of b, and between consecutive coefficients of a.
     * b and c advance in lock step below, so c2b stays valid throughout. */
    c2b=(b-c)*sizeof(*c);
    b2b=ldb*sizeof(*b);
    a2a=ainc*sizeof(*a);

    ce=c+len;

    /* First 16-byte boundary at or above b (assumes b is at least 4-byte
     * aligned: adding 3 floats = 12 bytes and masking rounds such an address
     * up).  The address is converted through size_t rather than unsigned
     * int so it is not truncated where pointers are wider than int. */
    ba=(const float *)(((size_t)(b+3)>>4)<<4);
    for (;c<ce && b<ba;c++,b++) {
        for (ta=a,tb=b,i=0;i<NDP;i++,ta+=ainc,tb+=ldb)
            (*c)+= *ta * *tb;
    }

    /* Remaining length, and the end pointer for the 16-wide loop (the
     * largest multiple of 16 outputs).  LAB8 re-reads `len` from memory to
     * pick the 8- and 4-wide tails. */
    len=ce-c;
    ce16 = c + ((len>>4)<<4);

    /* --- asm kernel: keep this sequence exactly as is --- */

    /* Broadcast the NDP coefficients into %xmm0.. */
    preload_regs(a,a2a);

    /* %esi/%eax = c, %edi = ce16, %edx = c2b, %ecx = b2b; then warm the
     * cache along the first column of b. */
    load_regs(c,ce16,c2b,b2b);
    init_regs;

    LOOP;

    DOT_PROD16;

    LAB8;

    DOT_PROD8;

    LAB4;

    DOT_PROD4;

    /* Write the kernel's final store cursor back into c. */
    LAB1(c);

    /* --- end of asm kernel --- */

    /* Re-derive b from the advanced c using the saved byte offset, then
     * finish the remaining (fewer than four) outputs in scalar code.  The
     * offset is applied through a char pointer because arithmetic on
     * void * is a GNU extension. */
    b=(const float *)((const char *)c+c2b);
    for (;c < ce;c++,b++) {
        for (ta=a,tb=b,i=0;i<NDP;i++,ta+=ainc,tb+=ldb)
            (*c)+= *ta * *tb;
    }

}
