/*
 *             Automatically Tuned Linear Algebra Software v3.2
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University of Tennessee, the ATLAS group,
 *      or the names of its contributers may not be used to endorse
 *      or promote products derived from this software without specific
 *      written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>

/* Two-level stringification: str(X) macro-expands X first, then xstr turns
 * the expanded tokens into a string literal. */
#define str(a_) xstr(a_)
#define xstr(a_) #a_

/* Byte distances added to a data pointer to form the prefetch pointer kept
 * in %ebx: PREFN is used by lb, PREFN2 by load_regs. */
#define PREFN 64
#define PREFN2 32
/* The same constants as string literals, for pasting into asm templates. */
#define PREFA str(PREFN)
#define PREFA2 str(PREFN2)
/* Qualifier placed on every asm statement in this file. */
#define VOLATILE __volatile__

/*
 * Single-instruction building blocks.  Each is a separate asm statement
 * with no operands, so registers are named directly and carry state from
 * one statement to the next.
 * NOTE(review): none of these statements declares the registers it
 * modifies; they rely on the compiler leaving those registers alone
 * between statements.
 */
/* la: %eax = %esi, then advance %esi by 16 bytes (four floats). */
#define la           __asm__ VOLATILE ("movl %esi,%eax\n\taddl $16,%esi\n\t")
/* lb: %ebx = %eax + PREFN, i.e. re-base the prefetch pointer. */
#define lb           __asm__ VOLATILE ("movl %eax,%ebx\n\taddl $" PREFA ",%ebx\n\t")
/* prefetch / prefetcha: prefetcht0 on the address in %ebx / %eax. */
#define prefetch     __asm__ VOLATILE ("prefetcht0 (%ebx)\n\t")
#define prefetcha    __asm__ VOLATILE ("prefetcht0 (%eax)\n\t")
/* loada: unaligned 16-byte load from (%eax) into %xmm0. */
#define loada        __asm__ VOLATILE ("movups (%eax),%xmm0\n\t")
/* loadb: 16-byte load from (%eax) into %xmm1.  The unaligned form (movups)
 * is chosen when STRIDE is not a multiple of 4, otherwise the aligned form
 * (movaps).  STRIDE is not defined in this part of the file. */
#if STRIDE % 4
#define loadb        __asm__ VOLATILE ("movups (%eax),%xmm1\n\t")
#else
#define loadb        __asm__ VOLATILE ("movaps (%eax),%xmm1\n\t")
#endif
/* xor(n): zero %xmm<n>. */
#define xor(a_)      __asm__ VOLATILE ("xorps %xmm" #a_ ",%xmm" #a_ "\n\t")
/* mul: %xmm1 *= %xmm0 (packed single precision). */
#define mul          __asm__ VOLATILE ("mulps  %xmm0,%xmm1\n\t")
/* add(n): %xmm<n> += %xmm1 (packed single precision). */
#define add(a_)      __asm__ VOLATILE ("addps %xmm1,%xmm" #a_ "\n\t")
/* incx(s,d): %e<d>x += %e<s>x; s and d are the register letters a, b, c, d. */
#define incx(a_,b_)  __asm__ VOLATILE ("addl %e" #a_ "x,%e" #b_ "x\n\t")

/*
 * Composite steps built from the primitives above.
 *   dp(s,n):    %eax += %e<s>x; load 4 floats from (%eax) into %xmm1;
 *               multiply by %xmm0; accumulate into %xmm<n>.
 *   pref(s):    %ebx += %e<s>x; prefetch (%ebx).
 *   prefa(s):   %eax += %e<s>x; prefetch (%eax).
 *   dpp(s,n,p): same as dp(s,n) with a pref(p) issued between the load
 *               and the multiply.
 */
#define dp(a_,b_)        {incx(a_,a);loadb;mul;add(b_);}
#define pref(a_)         {incx(a_,b);prefetch;}
#define prefa(a_)        {incx(a_,a);prefetcha;}
#define dpp(a_,b_,c_)    {incx(a_,a);loadb;pref(c_);mul;add(b_);}

/*
 * bla<N>: one 4-float step for N accumulators.  Takes the next four floats
 * at %esi into %xmm0 (advancing %esi by 16), re-bases the prefetch pointer
 * with lb, then multiplies %xmm0 against N successive 4-float loads and
 * accumulates them into %xmm2 .. %xmm<N+1>.  The first load address is
 * reached by adding %edx to %eax, each following one by adding %ecx.
 * Every second accumulator uses dpp, which also issues a prefetch.
 */
#define bla1          {la;lb;prefetch;loada;dp(d,2);}
#define bla2          {bla1;dpp(c,3,d);}
#define bla3          {bla2;dp(c,4);}
#define bla4          {bla3;dpp(c,5,c);}
#define bla5          {bla4;dp(c,6);}
#define bla6          {bla5;dpp(c,7,c);}

/*
 * blb<N>: the same arithmetic as bla<N> for the following 4-float step, but
 * without lb: %ebx keeps advancing from wherever the preceding step left
 * it, and the prefetches are placed at different points in the sequence.
 */
#define blb1          {la;pref(d);loada;dp(d,2);}
#define blb2          {la;loada;dpp(d,2,c);dp(c,3);}
#define blb3          {la;pref(c);loada;dp(d,2);dpp(c,3,c);dp(c,4);}
#define blb4          {blb2;dpp(c,4,c);dp(c,5);}
#define blb5          {blb3;dpp(c,5,c);dp(c,6);}
#define blb6          {blb4;dpp(c,6,c);dp(c,7);}

/*
 * Unrolled kernels selected by NDP.  Mjoin and NDP are not defined in this
 * part of the file; Mjoin(bla,NDP) is presumably token pasting that yields
 * bla<NDP>.  Each name is #undef'd first so an earlier definition is
 * discarded.
 */
/* DOT_PROD4: one 4-float step (bla<NDP>). */
#undef DOT_PROD4
#define DOT_PROD4 Mjoin(bla,NDP)

/* DOT_PROD8: two 4-float steps (bla<NDP> followed by blb<NDP>). */
#undef DOT_PROD8
#define DOT_PROD8 {DOT_PROD4;Mjoin(blb,NDP);}

/* DOT_PROD16: two DOT_PROD8 in a row, i.e. four 4-float steps. */
#undef DOT_PROD16
#define DOT_PROD16 {DOT_PROD8;DOT_PROD8;}

/*
 * Control-flow scaffolding.  Labels are emitted as raw assembler text and
 * made unique by appending EXT (defined outside this part of the file);
 * jumps cross from one asm statement into another.
 *
 * LOOP: loop head.  Falls into the code that follows (label block16_<EXT>)
 * while %esi != %edi, otherwise jumps to block8_<EXT>.
 */
#define LOOP __asm__ VOLATILE ("\nloop_" str(EXT) ":\n\t"\
                               "cmpl %esi,%edi\n\t"\
                               "jne block16_" str(EXT) "\n\t"\
                               "jmp block8_" str(EXT) "\n\t"\
                               ".align 16\n"\
                               "\nblock16_" str(EXT) ":\n\t")\

/*
 * LAB8: closes the loop body with a jump back to loop_<EXT>, then defines
 * block8_<EXT>: reloads %edi from the C variable len (referenced by name
 * from the expansion site) and skips to block4_<EXT> when bit 3 of len is
 * clear.
 */
#define LAB8 __asm__ VOLATILE ("jmp loop_" str(EXT) "\n\t"\
		             ".align 16\n"\
		             "block8_" str(EXT) ":\n\t"\
		             "movl %0,%%edi\n\t"\
		             "testl $8,%%edi\n\t"\
		             "je block4_" str(EXT) "\n\t"\
		             : : "m" (len))

/* LAB4: defines block4_<EXT>; skips to block1_<EXT> when bit 2 of %edi is
 * clear. */
#define LAB4 __asm__ VOLATILE ("\nblock4_" str(EXT) ":\n\t"\
		             "testl $4,%edi\n\t"\
		             "je block1_" str(EXT) "\n\t")

/* LAB1(v): defines block1_<EXT> and stores %esi into the C variable v. */
#define LAB1(a_) __asm__ VOLATILE ("\nblock1_" str(EXT) ":\n\tmovl %%esi,%0\n\t" \
                             : "=m" (a_) : : "si" )


/*
 * load_regs(a_,b_,c_,d_): loads the fixed registers used by the kernel
 * macros from four C variables (memory operands):
 *   %esi = a_, %eax = a_, %ebx = a_ + PREFN2,
 *   %edi = b_, %edx = c_, %ecx = d_.
 * All six registers are listed as clobbered.
 */
#define load_regs(a_,b_,c_,d_)  \
__asm__ VOLATILE ("movl %0,%%esi\n\t"\
		  "movl %%esi,%%eax\n\t"\
		  "movl %%eax,%%ebx\n\t"\
		  "addl $" PREFA2 ",%%ebx\n\t"\
		  "movl %1,%%edi\n\t"\
		  "movl %2,%%edx\n\t"\
		  "movl %3,%%ecx\n\t"\
		  : : "m" (a_),"m" (b_),"m" (c_),"m" (d_) : \
		  "ax","bx","cx","dx","si","di")

/*
 * Accumulator initialisation, interleaved with prefetches.
 *   ipref(s,n): %eax += %e<s>x and prefetch (%eax); zero %xmm<n>;
 *               %ebx += %e<s>x and prefetch (%ebx).
 *   ir<N>:      prefetches (%eax) and (%ebx) as they stand, then runs
 *               ipref once per accumulator, zeroing %xmm2 .. %xmm<N+1>.
 *               The first ipref steps by %edx, the others by %ecx.
 */
#define ipref(a_,b_)  {prefa(a_);xor(b_);pref(a_);}

#define ir1      {prefetcha;prefetch;ipref(d,2);}
#define ir2      {ir1;ipref(c,3);}
#define ir3      {ir2;ipref(c,4);}
#define ir4      {ir3;ipref(c,5);}
#define ir5      {ir4;ipref(c,6);}
#define ir6      {ir5;ipref(c,7);}

/* init_regs: the ir<N> variant for N = NDP (NDP is defined outside this
 * part of the file). */
#define init_regs  Mjoin(ir,NDP)

/*
 * Write-back of the accumulators.
 *
 * init_unload(a_,b_): %esi = a_, %edi = b_ (both read as memory operands).
 */
#define init_unload(a_,b_) \
__asm__ VOLATILE ("movl %0,%%esi\n\tmovl %1,%%edi\n\t" : : "m" (a_),"m" (b_):"si","di")

/* incs: %esi += %edi. */
#define incs           __asm__ VOLATILE ("addl %edi,%esi\n\t")
/*
 * unload_reg(n): sums the four lanes of %xmm<n> into its low lane, using
 * %xmm1 as scratch (its previous contents do not reach the low-lane
 * result), then adds the float at (%esi) and stores the scalar back to
 * (%esi).
 */
#define unload_reg(a_) __asm__ VOLATILE (\
			"movlhps %xmm" #a_ ",%xmm1\n\t"\
			"addps %xmm1,%xmm" #a_ "\n\t"\
			"shufps $48,%xmm" #a_ ",%xmm1\n\t"\
			"addps %xmm1,%xmm" #a_ "\n\t"\
			"movhlps %xmm" #a_ ",%xmm" #a_ "\n\t"\
			"addss (%esi), %xmm" #a_ "\n\t"\
			"movss %xmm" #a_ ",(%esi)\n\t")
/* unload_nreg(n): advance %esi by %edi, then unload_reg(n). */
#define unload_nreg(a_) {incs;unload_reg(a_);}


/* ur<N>: write back %xmm2 .. %xmm<N+1>, stepping %esi by %edi between
 * consecutive accumulators. */
#define ur1 {unload_reg(2);}
#define ur2 {ur1;unload_nreg(3);}
#define ur3 {ur2;unload_nreg(4);}
#define ur4 {ur3;unload_nreg(5);}
#define ur5 {ur4;unload_nreg(6);}
#define ur6 {ur5;unload_nreg(7);}
     
/* unload_regs(a_,b_): point %esi at a_ with byte step b_, then run the
 * ur<N> variant for N = NDP. */
#define unload_regs(a_,b_) {init_unload(a_,b_);Mjoin(ur,NDP);}

     
/*
 * Accumulates NDP dot products of the vector a against NDP vectors read
 * from b, each ldb floats after the previous one:
 *
 *     c[i*cinc] += sum over k in [0,len) of a[k] * b[i*ldb + k],  0 <= i < NDP
 *
 * The two scalar loops below compute exactly this form for the head and
 * tail elements; the inline-assembly section between them handles the bulk
 * four floats at a time.
 *
 *   a     first operand vector
 *   b     first of the NDP second-operand vectors
 *   ldb   distance, in floats, between consecutive vectors in b
 *   c     first result element; results are accumulated, not overwritten
 *   cinc  distance, in floats, between consecutive result elements
 *   len   number of floats in each vector
 *
 * The function name is built from EXT, and NDP selects the unrolling; both,
 * like Mjoin, are defined outside this part of the file.
 *
 * NOTE(review): 32-bit x86 / GCC only.  The asm uses 32-bit registers, a
 * pointer is cast to unsigned int, and arithmetic is done on a void pointer
 * (a GNU extension).  The kernel asm statements do not declare the general
 * or xmm registers they use, so correctness depends on the compiler not
 * touching them between load_regs and unload_regs.
 */
static __inline__ void
Mjoin(dp,EXT)(const float *a,const float *b,int ldb,float *c,int cinc,int len) {

    const float *ae,*ae16,*ba;
    const float *tb;
    float *tc;
    int i,a2b,b2b,c2c;

    /* Byte distances: from a to b, between vectors of b, between results. */
    a2b=(b-a)*sizeof(*a);
    b2b=ldb*sizeof(*b);
    c2c=cinc*sizeof(*c);

    /* One past the last element of a. */
    ae=a+len;

    /* ba: address of b+3 rounded down to a multiple of 16, i.e. the first
     * 16-byte boundary at or after b when b is 4-byte aligned.  The scalar
     * loop below runs until b reaches it (or a is exhausted).  a and b
     * advance together, so a2b stays valid afterwards. */
    ba=(const float *)(((unsigned int)(b+3)>>4)<<4);
    for (;a<ae && b<ba;a++,b++) {
      for (tc=c,tb=b,i=0;i<NDP;i++,tc+=cinc,tb+=ldb)
	(*tc)+= *a * *tb;
    }      

    /* Remaining length, and the end of its largest multiple-of-16 prefix. */
    len=ae-a;
    ae16 = a + ((len>>4)<<4);

    /* %esi = a, %edi = ae16, %edx = a2b, %ecx = b2b; zero the accumulators. */
    load_regs(a,ae16,a2b,b2b);
    init_regs;

    /* Repeat the 16-float kernel until %esi reaches ae16. */
    LOOP;

    DOT_PROD16;

    /* Reloads %edi from len; the 8-float kernel runs only if len has bit 3 set. */
    LAB8;

    DOT_PROD8;

    /* The 4-float kernel runs only if len has bit 2 set. */
    LAB4;

    DOT_PROD4;

    /* Store the advanced %esi back into a. */
    LAB1(a);

    /* Reduce each accumulator to a scalar and add it into c, c + cinc, ... */
    unload_regs(c,c2c);

    /* Recompute b from the advanced a, then finish the leftover elements
     * (fewer than four) with scalar code. */
    b=(void *)a+a2b;
    for (;a < ae;a++,b++) {
      for (tc=c,tb=b,i=0;i<NDP;i++,tc+=cinc,tb+=ldb)
	(*tc)+= *a * *tb;
    }

}
