/*
 *             Automatically Tuned Linear Algebra Software v3.3.7
 **************** THIS IS AN UNSUPPORTED DEVELOPER RELEASE *****************
 *                      (C) Copyright 1999 Camm Maguire
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University of Tennessee, the ATLAS group,
 *      or the names of its contributers may not be used to endorse
 *      or promote products derived from this software without specific
 *      written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "camm_util.h"
#include "camm_strat.h"

/* Variant tag of this kernel; it is pasted into the pipeline name N
 * (Mjoin(1x4_,VERS) etc.) before each inclusion of camm_pipe2.h. */
#define VERS 1

/*
 * NR is the amount added to the A pointer (ax) and B pointer (bx) to move on
 * by one packed row/column.  Double precision builds select KB8, all other
 * builds KB4.
 * NOTE(review): KB4/KB8 are defined outside this file -- presumably KB times
 * the element size in bytes; confirm in camm_util.h.
 */
#if !defined(DREAL) && !defined(DCPLX)
#define NR KB4
#else
#define NR KB8
#endif

/* Prefetch hook: intentionally expands to nothing in this kernel
 * (the non-temporal prefetch f(nta,a_,b_) is switched off). */
#define pf(a_,b_)



/*
 * Z1 / Z1x4 / Z1x2 / Z1x1 -- "write-back" macro sequences.
 *
 * In ATL_USERMM each Z1xN is emitted right after KB_block for the matching
 * 1xN pipeline, i.e. it finishes the accumulator registers and stores the
 * result through the C pointer (cx).
 *
 * NOTE(review): pc/ps/pa/pasr/phl/pul/puh/pu/pud/pus/f are instruction
 * wrappers from camm_util.h, which is not visible here.  Judging by their
 * names they look like SSE copy/shuffle/add/unpack/store/prefetch helpers,
 * and Z1 looks like a horizontal sum of one accumulator register -- confirm
 * against the header before relying on this.
 *
 * CS (the byte stride between consecutive stores) is defined further down in
 * this file; that is fine because macros are only expanded at their point of
 * use inside ATL_USERMM.
 */

/* Z1(a_,b_): reduce register a_ using b_ as scratch.  Double precision needs
 * one copy/shuffle/add step; single precision prepends an extra phl/pa step. */
#if defined(DREAL) || defined(DCPLX)
#define Z1(a_,b_) pc(a_,b_) ps(1,b_,b_) pasr(b_,a_)
#else
#define Z1(a_,b_) phl(a_,b_) pa(b_,a_) pc(a_,b_) ps(1,b_,b_) pasr(b_,a_)
#endif
/* Real data: the 4 (or 2) accumulators are combined with unpack/add steps and
 * written with wide stores instead of one scalar store per result. */
#if defined(DREAL) || defined (SREAL)
#ifdef DREAL
/* Double real: results leave in two stores (offset 0 and CS+CS) for 1x4,
 * one store for 1x2. */
#define Z1x4    f(t0,0,cx) pc(4,0) pul(5,4) pc(6,1) puh(5,0) pul(7,6)  \
                puh(7,1) pa(0,4) pa(1,6) pu(4,0,cx) pu(6,SS(CS,CS),cx)
#define Z1x2    f(t0,0,cx) pc(4,0) pul(5,4) puh(5,0)  \
                pa(0,4) pu(4,0,cx)
#else
/* Single real: all results of a 1x4 block leave in a single pu store,
 * the 1x2 block uses a single pud store. */
#define Z1x4    f(t0,0,cx) pc(4,0) pul(5,4) pc(6,1) puh(5,0) pul(7,6)  \
                pa(0,4) puh(7,1) pc(4,2) pa(1,6) ps(68,6,4) ps(238,6,2) pa(4,2) pu(2,0,cx)
#define Z1x2    f(t0,0,cx) pc(4,0) pul(5,4) puh(5,0)  \
                pa(0,4) phl(4,2) pa(2,4) pud(4,0,cx)
#endif
#else
/* Complex data: each accumulator is reduced with Z1 and stored on its own
 * with pus at offsets 0, CS, CS+CS and CS+CS+CS from cx
 * (NOTE(review): assumes SS(x,y) denotes the sum x+y -- confirm). */
#define Z1x4    Z1(4,0) pus(4,0,cx) Z1(5,1) pus(5,CS,cx) \
                Z1(6,0) pus(6,SS(CS,CS),cx) Z1(7,1) pus(7,SS(SS(CS,CS),CS),cx)
#define Z1x2    Z1(4,0) pus(4,0,cx) Z1(5,1) pus(5,CS,cx)
#endif
/* Single-result variant, shared by all precisions. */
#define Z1x1    Z1(4,0) pus(4,0,cx)

/*
 * W1x4 / W1x2 / W1x1 -- accumulator set-up emitted immediately before
 * KB_block for the 1x4, 1x2 and 1x1 pipelines.  Exactly one of BETA0, BETA1
 * or BETAX is expected to be defined by the build; registers 4..7 are the
 * accumulators that the Z1xN macros later reduce and store.
 *
 * NOTE(review): px/pls/pmsr are wrappers from camm_util.h (not visible
 * here).  By name they appear to be "zero register", "load scalar" and
 * "multiply scalar" -- confirm against the header.
 */

/* BETA0: accumulators start from px(reg); C is not read. */
#ifdef BETA0
#define W1x4    px(4) px(5) px(6) px(7)
#define W1x2    px(4) px(5)
#define W1x1    px(4)
#endif
/* BETA1: accumulators are loaded from C at offsets 0, CS, 2*CS, 3*CS off cx. */
#ifdef BETA1
#define W1x4    pls(0,cx,4) pls(CS,cx,5) pls(SS(CS,CS),cx,6) \
                pls(SS(SS(CS,CS),CS),cx,7)
#define W1x2    pls(0,cx,4) pls(CS,cx,5)
#define W1x1    pls(0,cx,4)
#endif
/* BETAX: same loads as BETA1, then each accumulator is combined with
 * register 3 via pmsr; ATL_USERMM fills register 3 from the beta argument
 * at entry (pls(0,di,3)). */
#ifdef BETAX
#define W1x4    pls(0,cx,4) pls(CS,cx,5) pls(SS(CS,CS),cx,6) \
                pls(SS(SS(CS,CS),CS),cx,7) \
                pmsr(3,4) pmsr(3,5) pmsr(3,6) pmsr(3,7)
#define W1x2    pls(0,cx,4) pls(CS,cx,5) pmsr(3,4) pmsr(3,5)
#define W1x1    pls(0,cx,4) pmsr(3,4)
#endif

/*
 * Per-precision layout constants.
 *
 *   CS   - byte distance between consecutive result slots of C as used by
 *          the W1xN / Z1xN macros and by the cx pointer updates.
 *   LDCM - multiplier applied to (ldc - m) when the column step of C is
 *          computed in ATL_USERMM: 1 for real data, 2 for complex data.
 *
 * The real-precision tests come first so that they win if several precision
 * macros happen to be defined at once.
 */
#if defined(DREAL)
#define CS 8
#define LDCM 1
#elif defined(SREAL)
#define CS 4
#define LDCM 1
#elif defined(DCPLX)
#define CS 16
#define LDCM 2
#else
#define CS 8
#define LDCM 2
#endif

/* Scalar element type of the matrices: float for the single precision
 * builds (SREAL, SCPLX), double for everything else. */
#if !defined(SREAL) && !defined(SCPLX)
#define MTYPE double
#else
#define MTYPE float
#endif



/*
 * ATL_USERMM -- matrix-multiply kernel written as one 32-bit x86 inline
 * assembly statement (AT&T syntax, GCC extended asm).
 *
 * Parameters as used by this body:
 *   m, n   - problem dimensions; used to compute the end pointers a+m*KB,
 *            b+n*KB and a+((m>>2)<<2)*KB, and the C column step.
 *   a, b   - input operands, passed to the asm in eax and esi.
 *   c      - output operand, passed in ecx.
 *   ldc    - leading dimension of c; (ldc-m)*LDCM*sizeof(*c) is passed in
 *            edx and added to ecx after every pass of the inner loop.
 *   beta   - only read when BETAX is defined (through bbp in edi).
 *   k, alpha, lda, ldb - not referenced in this function body; the inner
 *            dimension comes from the compile-time macro KB instead.
 *
 * MB / NB are compile-time macros: a value of 0 selects the code paths that
 * take the loop bounds at run time from m and n; a non-zero value bakes the
 * bound into the instruction stream.
 *
 * NOTE(review): a(), ra(), mm(), cmp(), je(), jne(), jl(), jmp(), lab(),
 * pls(), SS(), MM(), E4(), ASM and KB_block are defined in camm_util.h /
 * camm_strat.h / camm_pipe2.h, which are not visible here.  By name and
 * usage a() looks like "add immediate", ra() "add register", mm() "move
 * immediate" and SS()/MM() sum/product of constants -- the inline comments
 * below rely on that reading and should be confirmed against the headers.
 */
void
ATL_USERMM (int m, int n, int k, MTYPE alpha, const MTYPE *a,
	    int lda,const MTYPE *b, int ldb, MTYPE beta, MTYPE *c,
	    int ldc) {

  /* Address of beta, handed to the asm in edi when BETAX is defined. */
  const MTYPE *bbp=&beta;

  ASM (

#if KB % 4
#error KB must be divisible by four -- m n cleanup needs alignment
#endif

       /* BETAX: fetch beta through edi into register 3 (the register the
        * BETAX W1xN macros feed to pmsr) before edi is reused below. */
#ifdef BETAX
       pls(0,di,3)
#endif

       /* ebx is saved by hand and then takes over the B pointer from esi. */
       "pushl %%ebx\n\t"
       "movl  %%esi,%%ebx\n\t"

       /* Run-time bounds: load the end-of-A (%4) / end-of-B (%5) memory
        * operands.  NOTE(review): the a(4,sp) ... a(-4,sp) pair presumably
        * undoes the pushl above so that esp-relative "m" operands still
        * resolve to the right stack slot -- confirm. */
#if MB == 0 || NB == 0
       a(4,sp)

#if MB == 0
       "movl %4,%%esi\n\t"
#endif
#if NB == 0
       "movl %5,%%edi\n\t"
#endif
       a(-4,sp)
#endif

       /* ebp becomes the end marker for the B pointer (bx): either the
        * run-time value just loaded, or the constant NR*NB added to bx. */
       "pushl %%ebp\n\t"
#if NB == 0
       "movl %%edi,%%ebp\n\t"
#else
       mm(MM(NR,NB),bp)
       ra(bx,bp)
#endif

       /* edi becomes the end marker for the 1x4 loop over A (ax): operand
        * %6 at run time (two pushes outstanding, hence the 8-byte
        * adjustment), or the constant NR*E4(MB) added to ax. */
#if MB == 0
       a(8,sp)
       "movl %6,%%edi\n\t"
       a(-8,sp)
#else
       mm(MM(NR,E4(MB)),di)
       ra(ax,di)
#endif

       /* ---- outer loop over B (bx up to bp) ---- */
       lab(loopb)

#if NB == 0
       cmp(bx,bp)
       je(end)
#endif

       /* Preserve the 1x4 end marker and the start of A for the next pass. */
       "pushl %%edi\n\t"
       "pushl %%eax\n\t"

       /* ---- inner loop over A, four results per iteration ---- */
       lab(loopa)

#if MB == 0
       cmp(ax,di)
       je(2)
#endif

#if MB == 0 ||  MB >= 4
#undef N
#define N Mjoin(1x4_,VERS)
#include "camm_pipe2.h"

       /* set up accumulators, run the KB-long pipeline, write back */
       W1x4
       KB_block
       Z1x4

       /* advance A by four NR steps and C by four CS steps */
       a(SS(SS(NR,NR),SS(NR,NR)),ax)
       a(SS(SS(CS,CS),SS(CS,CS)),cx)

#endif

#if MB == 0
       jmp(loopa)
#else
       cmp(ax,di)
       jne(loopa)
#endif

       /* Run-time cleanup: move the marker on by two NR steps and compare it
        * with the end of A (si); the jl(1) skips the 1x2 block.
        * NOTE(review): presumably taken when fewer than two rows remain. */
#if MB == 0
       lab(2)
       a(SS(NR,NR),di)
       cmp(di,si)
       jl(1)
#endif

       /* ---- 1x2 cleanup block ---- */
#if MB == 0 || ( MB / 2 ) % 2
#undef N
#define N Mjoin(1x2_,VERS)
#include "camm_pipe2.h"

       W1x2
       KB_block
       Z1x2

       a(SS(NR,NR),ax)
       a(SS(CS,CS),cx)

#endif

       /* Run-time cleanup: nothing left when ax has reached the end of A. */
#if MB == 0
       lab(1)
       cmp(ax,si)
       je(stop)
#endif

       /* ---- 1x1 cleanup block ---- */
#if MB == 0 || MB % 2

#undef N
#define N Mjoin(1x1_,VERS)
#include "camm_pipe2.h"

       W1x1
       KB_block
       Z1x1

       /* ax is not advanced here: it is restored from the stack just below */
/*         a(NR,ax) */
       a(CS,cx)

#endif

#if MB == 0
       lab(stop)
#endif

       /* Restore the A start pointer and 1x4 marker, step C by the edx
        * increment, and move B on by one NR step. */
       "popl %%eax\n\t"
       "popl %%edi\n\t"
       ra(dx,cx)
       a(NR,bx)

#if NB == 0
       jmp(loopb)
       lab(end)
#else
       cmp(bx,bp)
       jne(loopb)
#endif

       /* Restore the registers saved at entry. */
       "popl %%ebp\n\t"
       "popl %%ebx\n\t"


       /* Inputs: %0 eax=a, %1 esi=b, %2 ecx=c, %3 edx=C column step,
        * %4 = end of A, %5 = end of B, %6 = end of the 4-row-multiple part
        * of A (memory operands), and with BETAX %7 edi=&beta.  Without
        * BETAX edi is only scratch and is therefore listed as clobbered. */
       ::"a" (a),"S" (b),"c" (c),"d" ((ldc-m)*LDCM*sizeof(*c)),
       "m" (a+m*KB),"m" (b+n*KB),"m" (a+((m>>2)<<2)*KB)
#ifdef BETAX
       ,"D" (bbp):"memory");
#else
       :"di","memory");
#endif

}
