
/*
 *             Automatically Tuned Linear Algebra Software v3.5.9
 *                    (C) Copyright 2001 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * Build-time requirements and exported-symbol naming for this kernel.
 *
 * The file is x86-64 gas assembly passed through the C preprocessor, so the
 * build must define ATL_GAS_x8664.  NB (the blocking factor) and ATL_USERMM
 * (the routine name) are supplied on the compile line.
 */
#ifndef ATL_GAS_x8664
   #error "This kernel requires a gas x86-64 assembler!"
#endif

/* Two-level paste so that macro arguments are expanded before ## is applied */
#ifndef Mjoin
   #define Mjoin(pre, nam) my_join(pre, nam)
   #define my_join(pre, nam) pre ## nam
#endif

/* Symbol actually emitted: the Windows targets get a leading underscore */
#if defined(ATL_OS_WinNT) || defined(ATL_OS_Win9x)
   #define ATL_AUSERMM Mjoin(_,ATL_USERMM)
#else
   #define ATL_AUSERMM ATL_USERMM
#endif

#if !defined(NB) || (NB == 0)
   #error "NB must be a compile-time constant!"
#endif

/* The M loop handles 14 rows of A per iteration */
#if (NB/14)*14 != NB
   #error "NB must be multiple of 14!"
#endif

/*
 * The K loop is fully unrolled over byte offsets 0..320 of each row, i.e.
 * exactly 84 four-byte elements, while rows are NB elements apart.  Any other
 * NB would assemble cleanly but compute wrong dot products, so reject it here.
 */
#if NB != 84
   #error "NB must be 84: the K loop is fully unrolled for 84 elements!"
#endif

#
#  Prefetch defines
#
#  Each macro wraps a single prefetch instruction, so the hint used for one
#  operand can be changed in one place:
#     pref2 -> prefetcht1   (applied to pfA in the kernel body)
#     prefB -> prefetcht0
#     prefC -> prefetchw    (applied to pC in the kernel body)
#
#define pref2(mem) prefetcht1   mem
#define prefB(mem) prefetcht0   mem
#define prefC(mem) prefetchw    mem

#
#  Byte offsets used to address A and B:
#     NBso   = bytes in one row of NB four-byte elements
#     NBkso  = bytes in k such rows (k = 2..14)
#     NBNBso = bytes in a full block of NB rows
#
#define NBso    (NB*4)
#define NBNBso  (NB*NBso)
#define NB2so   (2*NBso)
#define NB3so   (3*NBso)
#define NB4so   (4*NBso)
#define NB5so   (5*NBso)
#define NB6so   (6*NBso)
#define NB7so   (7*NBso)
#define NB8so   (8*NBso)
#define NB9so   (9*NBso)
#define NB10so  (10*NBso)
#define NB11so  (11*NBso)
#define NB12so  (12*NBso)
#define NB13so  (13*NBso)
#define NB14so  (14*NBso)

#
#  SSE2 register usage shown by these defines
#
#  rA0       : scratch; loaded with a vector of A, multiplied by the matching
#              vector of B, then added into one accumulator
#  rC0-rC13  : accumulators, one per C element of the current group of
#              fourteen; each holds four partial sums until the final reduction
#  rB0       : extra scratch used while the accumulators are reduced
#
#define rA0	%xmm0
#define rC0	%xmm1
#define rC1	%xmm2
#define rC2	%xmm3
#define rC3	%xmm4
#define rC4	%xmm5
#define rC5	%xmm6
#define rC6	%xmm7
#define rC7	%xmm8
#define rC8	%xmm9
#define rC9	%xmm10
#define rC10	%xmm11
#define rC11	%xmm12
#define rC12	%xmm13
#define rC13	%xmm14
#define rB0	%xmm15

#
#  Integer register usage shown by these defines
#
#  pC     : current position in C (loaded from the stack argument)
#  pA     : current position in A (A arrives in this register)
#  pB     : current position in B (copied from the register B arrives in)
#  incCn  : byte increment derived from ldc: (ldc - (NB-14)) scaled by 4 when
#           SREAL is defined and by 8 otherwise
#  stM    : value of pA that terminates the M loop
#  stN    : pB plus the byte size of a full block; the incoming rbp is parked
#           in r11 before this register is overwritten
#  pfA    : prefetch pointer, starts one full block past the initial pA
#
#define pC      %rsi
#define pA      %rcx
#define pB      %rdi
#define incCn   %rax
#define stM     %rdx
#define stN     %rbp
#define pfA     %r8
#       rax     used in 32/64 conversion
#
# BYTE:                  %rdi         %rsi         %rdx              xmm0
# void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha,
# BYTE:                    %rcx            %r8            %r9              8
#                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
# BYTE:                     %xmm1       16             24
#                 const TYPE beta, TYPE *C, const int ldc)
#
	.text
.global ATL_AUSERMM
ATL_AUSERMM:
#
#	Save callee-saved iregs
#
	movq	%rbp, %r11
   #ifdef BETAX
	movss	%xmm1, -24(%rsp)
   #endif
#
#       pA already comes in right reg
#	Initialize pB = B; pC = C;
#
	movq	%r9, pB
	movq	16(%rsp), pC
					prefC((pC))
					prefC(64(pC))
#
#
#       stM = pA + NBNB - NB*14;  stN = pB + NBNB;
#
	movq	pA, stM
	addq	$NBNBso-NB14so, stM
	movq	stM, pfA
	addq	$NB14so, pfA
	movq	$NBNBso, stN
	addq	pB, stN
#
#       convert ldc to 64 bits, and then set incCn = (ldc - (NB-14))*sizeof
#
	movl	24(%rsp), %eax
	cltq
   #ifdef SREAL
	movq	%rax, %r9
	subq	$NB-14, incCn
	shl	$2, incCn
   #else
	subq	$NB-14, incCn
	shl	$3, incCn
   #endif
UNLOOP:
UMLOOP:
#
#	rC[0-13] = pC[0-13] * beta
#
#ifdef BETA0
	xorps	rC0, rC0
	xorps	rC1, rC1
	xorps	rC2, rC2
	xorps	rC3, rC3
	xorps	rC4, rC4
	xorps	rC5, rC5
	xorps	rC6, rC6
	xorps	rC7, rC7
	xorps	rC8, rC8
	xorps	rC9, rC9
	xorps	rC10, rC10
	xorps	rC11, rC11
	xorps	rC12, rC12
	xorps	rC13, rC13
#else
   #ifdef SREAL
	movss	(pC), rC0
	movss	4(pC), rC1
	movss	8(pC), rC2
	movss	12(pC), rC3
	movss	16(pC), rC4
	movss	20(pC), rC5
	movss	24(pC), rC6
	movss	28(pC), rC7
	movss	32(pC), rC8
	movss	36(pC), rC9
	movss	40(pC), rC10
	movss	44(pC), rC11
	movss	48(pC), rC12
	movss	52(pC), rC13
   #else
	movss	(pC), rC0
	movss	8(pC), rC1
	movss	16(pC), rC2
	movss	24(pC), rC3
	movss	32(pC), rC4
	movss	40(pC), rC5
	movss	48(pC), rC6
	movss	56(pC), rC7
	movss	64(pC), rC8
	movss	72(pC), rC9
	movss	80(pC), rC10
	movss	88(pC), rC11
	movss	96(pC), rC12
	movss	104(pC), rC13
   #endif
   #ifdef BETAX
	mulss	-24(%rsp), rC0
	mulss	-24(%rsp), rC1
	mulss	-24(%rsp), rC2
	mulss	-24(%rsp), rC3
	mulss	-24(%rsp), rC4
	mulss	-24(%rsp), rC5
	mulss	-24(%rsp), rC6
	mulss	-24(%rsp), rC7
	mulss	-24(%rsp), rC8
	mulss	-24(%rsp), rC9
	mulss	-24(%rsp), rC10
	mulss	-24(%rsp), rC11
	mulss	-24(%rsp), rC12
	mulss	-24(%rsp), rC13
   #endif
#endif
#KLOOP:
	movaps	(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC0
	movaps	NBso(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC1
	movaps	NB2so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC2
	movaps	NB3so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC3
	movaps	NB4so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC4
	movaps	NB5so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC5
	movaps	NB6so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC6
	movaps	NB7so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC7
	movaps	NB8so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC8
	movaps	NB9so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC9
	movaps	NB10so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC10
	movaps	NB11so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC11
	movaps	NB12so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC12
	movaps	NB13so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC13

	movaps	16(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC0
	movaps	16+NBso(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC1
	movaps	16+NB2so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC2
	movaps	16+NB3so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC3
	movaps	16+NB4so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC4
	movaps	16+NB5so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC5
	movaps	16+NB6so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC6
	movaps	16+NB7so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC7
	movaps	16+NB8so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC8
	movaps	16+NB9so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC9
	movaps	16+NB10so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC10
	movaps	16+NB11so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC11
	movaps	16+NB12so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC12
	movaps	16+NB13so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC13

	movaps	32(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC0
	movaps	32+NBso(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC1
	movaps	32+NB2so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC2
	movaps	32+NB3so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC3
	movaps	32+NB4so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC4
	movaps	32+NB5so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC5
	movaps	32+NB6so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC6
	movaps	32+NB7so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC7
	movaps	32+NB8so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC8
	movaps	32+NB9so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC9
	movaps	32+NB10so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC10
	movaps	32+NB11so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC11
	movaps	32+NB12so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC12
	movaps	32+NB13so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC13

	movaps	48(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC0
	movaps	48+NBso(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC1
	movaps	48+NB2so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC2
	movaps	48+NB3so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC3
	movaps	48+NB4so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC4
	movaps	48+NB5so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC5
	movaps	48+NB6so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC6
	movaps	48+NB7so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC7
	movaps	48+NB8so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC8
	movaps	48+NB9so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC9
	movaps	48+NB10so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC10
	movaps	48+NB11so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC11
	movaps	48+NB12so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC12
	movaps	48+NB13so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC13

	movaps	64(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC0
	movaps	64+NBso(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC1
	movaps	64+NB2so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC2
	movaps	64+NB3so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC3
	movaps	64+NB4so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC4
	movaps	64+NB5so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC5
	movaps	64+NB6so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC6
	movaps	64+NB7so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC7
	movaps	64+NB8so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC8
	movaps	64+NB9so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC9
	movaps	64+NB10so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC10
	movaps	64+NB11so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC11
	movaps	64+NB12so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC12
	movaps	64+NB13so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC13

	movaps	80(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC0
	movaps	80+NBso(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC1
	movaps	80+NB2so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC2
	movaps	80+NB3so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC3
	movaps	80+NB4so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC4
	movaps	80+NB5so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC5
	movaps	80+NB6so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC6
	movaps	80+NB7so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC7
	movaps	80+NB8so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC8
	movaps	80+NB9so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC9
	movaps	80+NB10so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC10
	movaps	80+NB11so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC11
	movaps	80+NB12so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC12
	movaps	80+NB13so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC13

	movaps	96(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC0
	movaps	96+NBso(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC1
	movaps	96+NB2so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC2
	movaps	96+NB3so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC3
	movaps	96+NB4so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC4
	movaps	96+NB5so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC5
	movaps	96+NB6so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC6
	movaps	96+NB7so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC7
	movaps	96+NB8so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC8
	movaps	96+NB9so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC9
	movaps	96+NB10so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC10
	movaps	96+NB11so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC11
	movaps	96+NB12so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC12
	movaps	96+NB13so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC13

	movaps	112(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC0
	movaps	112+NBso(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC1
	movaps	112+NB2so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC2
	movaps	112+NB3so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC3
	movaps	112+NB4so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC4
	movaps	112+NB5so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC5
	movaps	112+NB6so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC6
	movaps	112+NB7so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC7
	movaps	112+NB8so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC8
	movaps	112+NB9so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC9
	movaps	112+NB10so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC10
	movaps	112+NB11so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC11
	movaps	112+NB12so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC12
	movaps	112+NB13so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC13

	movaps	128(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC0
	movaps	128+NBso(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC1
	movaps	128+NB2so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC2
	movaps	128+NB3so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC3
	movaps	128+NB4so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC4
	movaps	128+NB5so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC5
	movaps	128+NB6so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC6
	movaps	128+NB7so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC7
	movaps	128+NB8so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC8
	movaps	128+NB9so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC9
	movaps	128+NB10so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC10
	movaps	128+NB11so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC11
	movaps	128+NB12so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC12
	movaps	128+NB13so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC13

	movaps	144(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC0
	movaps	144+NBso(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC1
	movaps	144+NB2so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC2
	movaps	144+NB3so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC3
	movaps	144+NB4so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC4
	movaps	144+NB5so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC5
	movaps	144+NB6so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC6
	movaps	144+NB7so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC7
	movaps	144+NB8so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC8
	movaps	144+NB9so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC9
	movaps	144+NB10so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC10
	movaps	144+NB11so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC11
	movaps	144+NB12so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC12
	movaps	144+NB13so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC13

	movaps	160(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC0
	movaps	160+NBso(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC1
	movaps	160+NB2so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC2
	movaps	160+NB3so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC3
	movaps	160+NB4so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC4
	movaps	160+NB5so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC5
	movaps	160+NB6so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC6
	movaps	160+NB7so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC7
	movaps	160+NB8so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC8
	movaps	160+NB9so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC9
	movaps	160+NB10so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC10
	movaps	160+NB11so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC11
	movaps	160+NB12so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC12
	movaps	160+NB13so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC13

	movaps	176(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC0
	movaps	176+NBso(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC1
	movaps	176+NB2so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC2
	movaps	176+NB3so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC3
	movaps	176+NB4so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC4
	movaps	176+NB5so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC5
	movaps	176+NB6so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC6
	movaps	176+NB7so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC7
	movaps	176+NB8so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC8
	movaps	176+NB9so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC9
	movaps	176+NB10so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC10
	movaps	176+NB11so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC11
	movaps	176+NB12so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC12
	movaps	176+NB13so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC13

	movaps	192(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC0
	movaps	192+NBso(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC1
	movaps	192+NB2so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC2
	movaps	192+NB3so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC3
	movaps	192+NB4so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC4
	movaps	192+NB5so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC5
	movaps	192+NB6so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC6
	movaps	192+NB7so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC7
	movaps	192+NB8so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC8
	movaps	192+NB9so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC9
	movaps	192+NB10so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC10
	movaps	192+NB11so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC11
	movaps	192+NB12so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC12
	movaps	192+NB13so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC13

	movaps	208(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC0
	movaps	208+NBso(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC1
	movaps	208+NB2so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC2
	movaps	208+NB3so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC3
	movaps	208+NB4so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC4
	movaps	208+NB5so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC5
	movaps	208+NB6so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC6
	movaps	208+NB7so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC7
	movaps	208+NB8so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC8
	movaps	208+NB9so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC9
	movaps	208+NB10so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC10
	movaps	208+NB11so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC11
	movaps	208+NB12so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC12
	movaps	208+NB13so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC13

	movaps	224(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC0
	movaps	224+NBso(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC1
	movaps	224+NB2so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC2
	movaps	224+NB3so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC3
	movaps	224+NB4so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC4
	movaps	224+NB5so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC5
	movaps	224+NB6so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC6
	movaps	224+NB7so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC7
	movaps	224+NB8so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC8
	movaps	224+NB9so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC9
	movaps	224+NB10so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC10
	movaps	224+NB11so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC11
	movaps	224+NB12so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC12
	movaps	224+NB13so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC13

	movaps	240(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC0
	movaps	240+NBso(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC1
	movaps	240+NB2so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC2
	movaps	240+NB3so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC3
	movaps	240+NB4so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC4
	movaps	240+NB5so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC5
	movaps	240+NB6so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC6
	movaps	240+NB7so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC7
	movaps	240+NB8so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC8
	movaps	240+NB9so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC9
	movaps	240+NB10so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC10
	movaps	240+NB11so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC11
	movaps	240+NB12so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC12
	movaps	240+NB13so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC13

	movaps	256(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC0
	movaps	256+NBso(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC1
	movaps	256+NB2so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC2
	movaps	256+NB3so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC3
	movaps	256+NB4so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC4
	movaps	256+NB5so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC5
	movaps	256+NB6so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC6
	movaps	256+NB7so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC7
	movaps	256+NB8so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC8
	movaps	256+NB9so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC9
	movaps	256+NB10so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC10
	movaps	256+NB11so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC11
	movaps	256+NB12so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC12
	movaps	256+NB13so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC13

	movaps	272(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC0
	movaps	272+NBso(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC1
	movaps	272+NB2so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC2
	movaps	272+NB3so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC3
	movaps	272+NB4so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC4
	movaps	272+NB5so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC5
	movaps	272+NB6so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC6
	movaps	272+NB7so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC7
	movaps	272+NB8so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC8
	movaps	272+NB9so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC9
	movaps	272+NB10so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC10
	movaps	272+NB11so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC11
	movaps	272+NB12so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC12
	movaps	272+NB13so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC13
#ifndef SREAL
						pref2((pfA))
						pref2(64(pfA))
#endif

	movaps	288(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC0
	movaps	288+NBso(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC1
	movaps	288+NB2so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC2
	movaps	288+NB3so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC3
	movaps	288+NB4so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC4
	movaps	288+NB5so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC5
	movaps	288+NB6so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC6
	movaps	288+NB7so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC7
	movaps	288+NB8so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC8
	movaps	288+NB9so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC9
	movaps	288+NB10so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC10
	movaps	288+NB11so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC11
	movaps	288+NB12so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC12
	movaps	288+NB13so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC13

	movaps	304(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC0
	movaps	304+NBso(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC1
	movaps	304+NB2so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC2
	movaps	304+NB3so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC3
	movaps	304+NB4so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC4
	movaps	304+NB5so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC5
	movaps	304+NB6so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC6
	movaps	304+NB7so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC7
	movaps	304+NB8so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC8
	movaps	304+NB9so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC9
	movaps	304+NB10so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC10
	movaps	304+NB11so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC11
	movaps	304+NB12so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC12
	movaps	304+NB13so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC13

	movaps	320(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC0
	movaps	320+NBso(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC1
	movaps	320+NB2so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC2
	movaps	320+NB3so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC3
	movaps	320+NB4so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC4
	movaps	320+NB5so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC5
	movaps	320+NB6so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC6
	movaps	320+NB7so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC7
	movaps	320+NB8so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC8
	movaps	320+NB9so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC9
	movaps	320+NB10so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC10
	movaps	320+NB11so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC11
	movaps	320+NB12so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC12
	movaps	320+NB13so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC13
#
#	While (pB != stK);  -- disabled: the K loop above is fully unrolled
#
#	inc	KK
#	jnz	KLOOP

#
#       Sum the four partial sums held in each rC accumulator (horizontal add)
#

					# rC0 = c0a    c0b    c0c    c0d
					# rC1 = c1a    c1b    c1c    c1d
					# rC2 = c2a    c2b    c2c    c2d
					# rC3 = c3a    c3b    c3c    c3d
#
	movaps		rC2, rB0	# rB0 = c2a    c2b    c2c    c2d
						prefC((pC))
						prefC(64(pC))
	movaps		rC0, rA0	# rA0 = c0a    c0b    c0c    c0d
	unpckhps	rC3, rB0	# rB0 = c2c    c3c    c2d    c3d
	unpckhps	rC1, rA0	# rA0 = c0c    c1c    c0d    c1d
	unpcklps	rC3, rC2	# rC2 = c2a    c3a    c2b    c3b
	movlhps		rB0, rC3	# rC3 = c3a    c3b    c2c    c3c
	unpcklps	rC1, rC0	# rC0 = c0a    c1a    c0b    c1b
	movhlps		rA0, rC3	# rC3 = c0d    c1d    c2c    c3c
	movlhps		rC2, rA0	# rA0 = c0c    c1c    c2a    c3a
	movhlps		rC0, rB0	# rB0 = c0b    c1b    c2d    c3d
	addps		rA0, rC3	# rC3 = c0cd   c1cd   c2ac   c3ac
	movlhps		rC0, rC1	# rC1 = c1a    c1b    c0a    c1a
	movhlps		rC1, rC2	# rC2 = c0a    c1a    c2b    c3b
	movaps		rC4, rA0	# rA0 = c4a    c4b    c4c    c4d
	addps		rB0, rC2	# rC2 = c0ab   c1ab   c2bd   c3bd
	movaps		rC6, rB0	# rB0 = c6a    c6b    c6c    c6d
	addps		rC2, rC3	# rC3 = c0abcd c1abcd c2bdac c3bdac


					# rC4 = c4a    c4b    c4c    c4d
					# rC5 = c5a    c5b    c5c    c5d
					# rC6 = c6a    c6b    c6c    c6d
					# rC7 = c7a    c7b    c7c    c7d
					# rC8  = c08a    c08b    c08c    c08d
					# rC9  = c09a    c09b    c09c    c09d
					# rC10 = c10a    c10b    c10c    c10d
					# rC11 = c11a    c11b    c11c    c11d
					# rC12 = c12a    c12b    c12c    c12d
					# rC13 = c13a    c13b    c13c    c13d
#
	movaps		rC10, rC0	# rC0 = c10a    c10b    c10c    c10d
						prefC(128(pC))
#ifdef SREAL
						pref2((pfA))
#else
						prefC(192(pC))
#endif
	movaps		rC8 , rC1	# rC1 = c08a    c08b    c08c    c08d
	movaps          rC12, rC2 	# rC2  = c12a    c12b    c12c    c12d
	unpckhps	rC7, rB0	# rB0 = c6c    c7c    c6d    c7d
	unpckhps	rC5, rA0	# rA0 = c4c    c5c    c4d    c5d
	unpcklps	rC7, rC6	# rC6 = c6a    c7a    c6b    c7b
	unpckhps	rC11, rC0	# rC0 = c10c    c11c    c10d    c11d
	unpckhps	rC9 , rC1	# rC1 = c08c    c09c    c08d    c09d
	movlhps		rB0, rC7	# rC7 = c7a    c7b    c6c    c7c
	unpcklps	rC5, rC4	# rC4 = c4a    c5a    c4b    c5b
	movhlps		rA0, rC7	# rC7 = c4d    c5d    c6c    c7c
	movlhps		rC6, rA0	# rA0 = c4c    c5c    c6a    c7a
	unpcklps	rC11, rC10	# rC10 = c10a    c11a    c10b    c11b
	movhlps		rC4, rB0	# rB0 = c4b    c5b    c6d    c7d
	movlhps		rC0, rC11	# rC11 = c11a    c11b    c10c    c11c
	addps		rA0, rC7	# rC7 = c4cd   c5cd   c6ac   c7ac
	movlhps		rC4, rC5	# rC5 = c5a    c5b    c4a    c5a
	unpcklps	rC9 , rC8 	# rC8  = c08a    c09a    c08b    c09b
	movhlps		rC1, rC11	# rC11 = c08d    c09d    c10c    c11c
	movlhps		rC10, rC1	# rC1 = c08c    c09c    c10a    c11a
	movhlps		rC5, rC6	# rC6 = c4a    c5a    c6b    c7b
	movhlps		rC8 , rC0	# rC0 = c08b    c09b    c10d    c11d
	unpcklps	rC13, rC2	# rC2  = c12a    c13a    c12b    c13b
	addps		rC1, rC11	# rC11 = c08cd   c09cd   c10ac   c11ac
	addps		rB0, rC6	# rC6 = c4ab   c5ab   c6bd   c7bd
	movlhps		rC8 , rC9 	# rC9  = c09a    c09b    c08a    c09a
	unpckhps	rC13, rC12	# rC12 = c12c    c13c    c12d    c13d
	movhlps		rC9 , rC10	# rC10 = c08a    c09a    c10b    c11b
	addps		rC6, rC7	# rC7 = c4abcd c5abcd c6bdac c7bdac
#ifdef SREAL
						pref2(64(pfA))
#else
						prefC(256(pC))
#endif
	addps		rC0, rC10	# rC10 = c08ab   c09ab   c10bd   c11bd
	addps		rC2, rC12	# rC12 = c12ac   c13ac   c12bd   c13bd
	addps		rC10, rC11	# rC11 = c08abcd c09abcd c10bdac c11bdac

#

	movhlps		rC12, rC13	# rC13 = c12bd   c13bd   X       X
#ifndef SREAL
						prefC(192(pC))
#endif
						addq	$68, pfA
	addps		rC13, rC12	# rC12 = c12abcd c13abcd X       X
#
#	Write results back to C;  pC += 14;
#
#ifdef SREAL
	movups	rC3, (pC)
	movups	rC7, 16(pC)
	movups	rC11, 32(pC)
	movlps	rC12, 48(pC)
	addq	$56, pC
#else
	movss	rC3, (pC)
	movss	rC7, 32(pC)
	movhlps	rC3, rC0
	movhlps	rC7, rC6
	movss	rC0, 16(pC)
	movss	rC6, 48(pC)
	shufps	$0x55, rC3, rC3
	shufps	$0x55, rC7, rC7
	movss	rC3, 8(pC)
	movss	rC7, 40(pC)
	shufps	$0x55, rC0, rC0
	shufps	$0x55, rC6, rC6
	movss	rC0, 24(pC)
	movss	rC6, 56(pC)

	movss	rC11, 64(pC)
	movhlps	rC11, rC2
	movss	rC12, 96(pC)
	movss	rC2, 80(pC)
	shufps	$0x55, rC11, rC11
	shufps	$0x55, rC12, rC12
	movss	rC11, 72(pC)
	shufps	$0x55, rC2, rC2
	movss	rC12, 104(pC)
	movss	rC2, 88(pC)

	addq	$112, pC
#endif
#
#       pA += 14*NB;  (pB is left untouched: the K loop above never advances it)
#
	addq	$NB14so, pA
#	subq	$NBso, pB
#
#	while (pA != stM);
#
	cmp	pA, stM
	jne	UMLOOP
#
#	Last iteration of M-loop unrolled to prefetch next col of B
#
#UMLOOP_UR:
#
#	rC[0-13] = pC[0-13] * beta
#
#ifdef BETA0
	xorps	rC0, rC0
	xorps	rC1, rC1
	xorps	rC2, rC2
	xorps	rC3, rC3
	xorps	rC4, rC4
	xorps	rC5, rC5
	xorps	rC6, rC6
	xorps	rC7, rC7
	xorps	rC8, rC8
	xorps	rC9, rC9
	xorps	rC10, rC10
	xorps	rC11, rC11
	xorps	rC12, rC12
	xorps	rC13, rC13
#else
   #ifdef SREAL
	movss	(pC), rC0
	movss	4(pC), rC1
	movss	8(pC), rC2
	movss	12(pC), rC3
	movss	16(pC), rC4
	movss	20(pC), rC5
	movss	24(pC), rC6
	movss	28(pC), rC7
	movss	32(pC), rC8
	movss	36(pC), rC9
	movss	40(pC), rC10
	movss	44(pC), rC11
	movss	48(pC), rC12
	movss	52(pC), rC13
   #else
	movss	(pC), rC0
	movss	8(pC), rC1
	movss	16(pC), rC2
	movss	24(pC), rC3
	movss	32(pC), rC4
	movss	40(pC), rC5
	movss	48(pC), rC6
	movss	56(pC), rC7
	movss	64(pC), rC8
	movss	72(pC), rC9
	movss	80(pC), rC10
	movss	88(pC), rC11
	movss	96(pC), rC12
	movss	104(pC), rC13
   #endif
   #ifdef BETAX
	mulss	-24(%rsp), rC0
	mulss	-24(%rsp), rC1
	mulss	-24(%rsp), rC2
	mulss	-24(%rsp), rC3
	mulss	-24(%rsp), rC4
	mulss	-24(%rsp), rC5
	mulss	-24(%rsp), rC6
	mulss	-24(%rsp), rC7
	mulss	-24(%rsp), rC8
	mulss	-24(%rsp), rC9
	mulss	-24(%rsp), rC10
	mulss	-24(%rsp), rC11
	mulss	-24(%rsp), rC12
	mulss	-24(%rsp), rC13
   #endif
#endif
#KLOOP:
	movaps	(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC0
	movaps	NBso(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC1
	movaps	NB2so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC2
	movaps	NB3so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC3
	movaps	NB4so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC4
	movaps	NB5so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC5
	movaps	NB6so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC6
	movaps	NB7so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC7
	movaps	NB8so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC8
	movaps	NB9so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC9
	movaps	NB10so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC10
	movaps	NB11so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC11
	movaps	NB12so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC12
	movaps	NB13so(pA), rA0
	mulps	(pB), rA0
	addps	rA0, rC13

	movaps	16(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC0
	movaps	16+NBso(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC1
	movaps	16+NB2so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC2
	movaps	16+NB3so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC3
	movaps	16+NB4so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC4
	movaps	16+NB5so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC5
	movaps	16+NB6so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC6
	movaps	16+NB7so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC7
	movaps	16+NB8so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC8
	movaps	16+NB9so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC9
	movaps	16+NB10so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC10
	movaps	16+NB11so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC11
	movaps	16+NB12so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC12
	movaps	16+NB13so(pA), rA0
	mulps	16(pB), rA0
	addps	rA0, rC13

	movaps	32(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC0
	movaps	32+NBso(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC1
	movaps	32+NB2so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC2
	movaps	32+NB3so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC3
	movaps	32+NB4so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC4
	movaps	32+NB5so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC5
	movaps	32+NB6so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC6
	movaps	32+NB7so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC7
	movaps	32+NB8so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC8
	movaps	32+NB9so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC9
	movaps	32+NB10so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC10
	movaps	32+NB11so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC11
	movaps	32+NB12so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC12
	movaps	32+NB13so(pA), rA0
	mulps	32(pB), rA0
	addps	rA0, rC13

	movaps	48(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC0
	movaps	48+NBso(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC1
	movaps	48+NB2so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC2
	movaps	48+NB3so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC3
	movaps	48+NB4so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC4
	movaps	48+NB5so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC5
	movaps	48+NB6so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC6
	movaps	48+NB7so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC7
	movaps	48+NB8so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC8
	movaps	48+NB9so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC9
	movaps	48+NB10so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC10
	movaps	48+NB11so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC11
	movaps	48+NB12so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC12
	movaps	48+NB13so(pA), rA0
	mulps	48(pB), rA0
	addps	rA0, rC13

	movaps	64(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC0
	movaps	64+NBso(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC1
	movaps	64+NB2so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC2
	movaps	64+NB3so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC3
	movaps	64+NB4so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC4
	movaps	64+NB5so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC5
	movaps	64+NB6so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC6
	movaps	64+NB7so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC7
	movaps	64+NB8so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC8
	movaps	64+NB9so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC9
	movaps	64+NB10so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC10
	movaps	64+NB11so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC11
	movaps	64+NB12so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC12
	movaps	64+NB13so(pA), rA0
	mulps	64(pB), rA0
	addps	rA0, rC13

	movaps	80(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC0
	movaps	80+NBso(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC1
	movaps	80+NB2so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC2
	movaps	80+NB3so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC3
	movaps	80+NB4so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC4
	movaps	80+NB5so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC5
	movaps	80+NB6so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC6
	movaps	80+NB7so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC7
	movaps	80+NB8so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC8
	movaps	80+NB9so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC9
	movaps	80+NB10so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC10
	movaps	80+NB11so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC11
	movaps	80+NB12so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC12
	movaps	80+NB13so(pA), rA0
	mulps	80(pB), rA0
	addps	rA0, rC13

	movaps	96(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC0
	movaps	96+NBso(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC1
	movaps	96+NB2so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC2
	movaps	96+NB3so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC3
	movaps	96+NB4so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC4
	movaps	96+NB5so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC5
	movaps	96+NB6so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC6
	movaps	96+NB7so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC7
	movaps	96+NB8so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC8
	movaps	96+NB9so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC9
	movaps	96+NB10so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC10
	movaps	96+NB11so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC11
	movaps	96+NB12so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC12
	movaps	96+NB13so(pA), rA0
	mulps	96(pB), rA0
	addps	rA0, rC13

	movaps	112(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC0
	movaps	112+NBso(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC1
	movaps	112+NB2so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC2
	movaps	112+NB3so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC3
	movaps	112+NB4so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC4
	movaps	112+NB5so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC5
	movaps	112+NB6so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC6
	movaps	112+NB7so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC7
	movaps	112+NB8so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC8
	movaps	112+NB9so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC9
	movaps	112+NB10so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC10
	movaps	112+NB11so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC11
	movaps	112+NB12so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC12
	movaps	112+NB13so(pA), rA0
	mulps	112(pB), rA0
	addps	rA0, rC13

	movaps	128(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC0
	movaps	128+NBso(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC1
	movaps	128+NB2so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC2
	movaps	128+NB3so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC3
	movaps	128+NB4so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC4
	movaps	128+NB5so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC5
	movaps	128+NB6so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC6
	movaps	128+NB7so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC7
	movaps	128+NB8so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC8
	movaps	128+NB9so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC9
	movaps	128+NB10so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC10
	movaps	128+NB11so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC11
	movaps	128+NB12so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC12
	movaps	128+NB13so(pA), rA0
	mulps	128(pB), rA0
	addps	rA0, rC13

	movaps	144(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC0
	movaps	144+NBso(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC1
	movaps	144+NB2so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC2
	movaps	144+NB3so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC3
	movaps	144+NB4so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC4
	movaps	144+NB5so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC5
	movaps	144+NB6so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC6
	movaps	144+NB7so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC7
	movaps	144+NB8so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC8
	movaps	144+NB9so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC9
	movaps	144+NB10so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC10
	movaps	144+NB11so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC11
	movaps	144+NB12so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC12
	movaps	144+NB13so(pA), rA0
	mulps	144(pB), rA0
	addps	rA0, rC13

	movaps	160(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC0
	movaps	160+NBso(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC1
	movaps	160+NB2so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC2
	movaps	160+NB3so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC3
	movaps	160+NB4so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC4
	movaps	160+NB5so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC5
	movaps	160+NB6so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC6
	movaps	160+NB7so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC7
	movaps	160+NB8so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC8
	movaps	160+NB9so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC9
	movaps	160+NB10so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC10
	movaps	160+NB11so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC11
	movaps	160+NB12so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC12
	movaps	160+NB13so(pA), rA0
	mulps	160(pB), rA0
	addps	rA0, rC13
						prefB(NBso(pB))
						prefB(64+NBso(pB))

	movaps	176(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC0
	movaps	176+NBso(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC1
	movaps	176+NB2so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC2
	movaps	176+NB3so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC3
	movaps	176+NB4so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC4
	movaps	176+NB5so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC5
	movaps	176+NB6so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC6
	movaps	176+NB7so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC7
	movaps	176+NB8so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC8
	movaps	176+NB9so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC9
	movaps	176+NB10so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC10
	movaps	176+NB11so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC11
	movaps	176+NB12so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC12
	movaps	176+NB13so(pA), rA0
	mulps	176(pB), rA0
	addps	rA0, rC13

	movaps	192(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC0
	movaps	192+NBso(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC1
	movaps	192+NB2so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC2
	movaps	192+NB3so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC3
	movaps	192+NB4so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC4
	movaps	192+NB5so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC5
	movaps	192+NB6so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC6
	movaps	192+NB7so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC7
	movaps	192+NB8so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC8
	movaps	192+NB9so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC9
	movaps	192+NB10so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC10
	movaps	192+NB11so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC11
	movaps	192+NB12so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC12
	movaps	192+NB13so(pA), rA0
	mulps	192(pB), rA0
	addps	rA0, rC13

	movaps	208(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC0
	movaps	208+NBso(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC1
	movaps	208+NB2so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC2
	movaps	208+NB3so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC3
	movaps	208+NB4so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC4
	movaps	208+NB5so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC5
	movaps	208+NB6so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC6
	movaps	208+NB7so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC7
	movaps	208+NB8so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC8
	movaps	208+NB9so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC9
	movaps	208+NB10so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC10
	movaps	208+NB11so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC11
	movaps	208+NB12so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC12
	movaps	208+NB13so(pA), rA0
	mulps	208(pB), rA0
	addps	rA0, rC13
						prefB(128+NBso(pB))
						prefB(192+NBso(pB))

	movaps	224(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC0
	movaps	224+NBso(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC1
	movaps	224+NB2so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC2
	movaps	224+NB3so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC3
	movaps	224+NB4so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC4
	movaps	224+NB5so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC5
	movaps	224+NB6so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC6
	movaps	224+NB7so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC7
	movaps	224+NB8so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC8
	movaps	224+NB9so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC9
	movaps	224+NB10so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC10
	movaps	224+NB11so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC11
	movaps	224+NB12so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC12
	movaps	224+NB13so(pA), rA0
	mulps	224(pB), rA0
	addps	rA0, rC13

	movaps	240(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC0
	movaps	240+NBso(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC1
	movaps	240+NB2so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC2
	movaps	240+NB3so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC3
	movaps	240+NB4so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC4
	movaps	240+NB5so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC5
	movaps	240+NB6so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC6
	movaps	240+NB7so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC7
	movaps	240+NB8so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC8
	movaps	240+NB9so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC9
	movaps	240+NB10so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC10
	movaps	240+NB11so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC11
	movaps	240+NB12so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC12
	movaps	240+NB13so(pA), rA0
	mulps	240(pB), rA0
	addps	rA0, rC13

	movaps	256(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC0
	movaps	256+NBso(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC1
	movaps	256+NB2so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC2
	movaps	256+NB3so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC3
	movaps	256+NB4so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC4
	movaps	256+NB5so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC5
	movaps	256+NB6so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC6
	movaps	256+NB7so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC7
	movaps	256+NB8so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC8
	movaps	256+NB9so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC9
	movaps	256+NB10so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC10
	movaps	256+NB11so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC11
	movaps	256+NB12so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC12
	movaps	256+NB13so(pA), rA0
	mulps	256(pB), rA0
	addps	rA0, rC13

	movaps	272(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC0
	movaps	272+NBso(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC1
	movaps	272+NB2so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC2
	movaps	272+NB3so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC3
	movaps	272+NB4so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC4
	movaps	272+NB5so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC5
	movaps	272+NB6so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC6
	movaps	272+NB7so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC7
	movaps	272+NB8so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC8
	movaps	272+NB9so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC9
	movaps	272+NB10so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC10
	movaps	272+NB11so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC11
	movaps	272+NB12so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC12
	movaps	272+NB13so(pA), rA0
	mulps	272(pB), rA0
	addps	rA0, rC13

	movaps	288(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC0
	movaps	288+NBso(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC1
	movaps	288+NB2so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC2
	movaps	288+NB3so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC3
	movaps	288+NB4so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC4
	movaps	288+NB5so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC5
	movaps	288+NB6so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC6
	movaps	288+NB7so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC7
	movaps	288+NB8so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC8
	movaps	288+NB9so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC9
	movaps	288+NB10so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC10
	movaps	288+NB11so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC11
	movaps	288+NB12so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC12
	movaps	288+NB13so(pA), rA0
	mulps	288(pB), rA0
	addps	rA0, rC13
						prefC((pC))
						prefC((pC,incCn))

	movaps	304(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC0
	movaps	304+NBso(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC1
	movaps	304+NB2so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC2
	movaps	304+NB3so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC3
	movaps	304+NB4so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC4
	movaps	304+NB5so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC5
	movaps	304+NB6so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC6
	movaps	304+NB7so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC7
	movaps	304+NB8so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC8
	movaps	304+NB9so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC9
	movaps	304+NB10so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC10
	movaps	304+NB11so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC11
	movaps	304+NB12so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC12
	movaps	304+NB13so(pA), rA0
	mulps	304(pB), rA0
	addps	rA0, rC13

	movaps	320(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC0
	movaps	320+NBso(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC1
	movaps	320+NB2so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC2
	movaps	320+NB3so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC3
	movaps	320+NB4so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC4
	movaps	320+NB5so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC5
	movaps	320+NB6so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC6
	movaps	320+NB7so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC7
	movaps	320+NB8so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC8
	movaps	320+NB9so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC9
	movaps	320+NB10so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC10
	movaps	320+NB11so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC11
	movaps	320+NB12so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC12
	movaps	320+NB13so(pA), rA0
	mulps	320(pB), rA0
	addps	rA0, rC13
#
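#	K-loop is fully unrolled above (byte offsets 0 through 320 off pA and pB),
#	so there is no while (pB != stK) branch at this point.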
#
#
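#       Sum the four partial products held in each rC register, transposing as
#       we go, so that rC3, rC7, rC11 and the low half of rC12 end up holding
#       the 14 finished dot products in order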
#

					# rC0 = c0a    c0b    c0c    c0d
					# rC1 = c1a    c1b    c1c    c1d
					# rC2 = c2a    c2b    c2c    c2d
					# rC3 = c3a    c3b    c3c    c3d
#
	movaps		rC2, rB0	# rB0 = c2a    c2b    c2c    c2d
	movaps		rC0, rA0	# rA0 = c0a    c0b    c0c    c0d
	unpckhps	rC3, rB0	# rB0 = c2c    c3c    c2d    c3d
	unpckhps	rC1, rA0	# rA0 = c0c    c1c    c0d    c1d
	unpcklps	rC3, rC2	# rC2 = c2a    c3a    c2b    c3b
	movlhps		rB0, rC3	# rC3 = c3a    c3b    c2c    c3c
	unpcklps	rC1, rC0	# rC0 = c0a    c1a    c0b    c1b
	movhlps		rA0, rC3	# rC3 = c0d    c1d    c2c    c3c
	movlhps		rC2, rA0	# rA0 = c0c    c1c    c2a    c3a
	movhlps		rC0, rB0	# rB0 = c0b    c1b    c2d    c3d
	addps		rA0, rC3	# rC3 = c0cd   c1cd   c2ac   c3ac
						prefC(64(pC, incCn))
						prefB(256+NBso(pB))
	movlhps		rC0, rC1	# rC1 = c1a    c1b    c0a    c1a
	movhlps		rC1, rC2	# rC2 = c0a    c1a    c2b    c3b
	movaps		rC4, rA0	# rA0 = c4a    c4b    c4c    c4d
	addps		rB0, rC2	# rC2 = c0ab   c1ab   c2bd   c3bd
	movaps		rC6, rB0	# rB0 = c6a    c6b    c6c    c6d
	addps		rC2, rC3	# rC3 = c0abcd c1abcd c2bdac c3bdac


					# rC4 = c4a    c4b    c4c    c4d
					# rC5 = c5a    c5b    c5c    c5d
					# rC6 = c6a    c6b    c6c    c6d
					# rC7 = c7a    c7b    c7c    c7d
					# rC8  = c08a    c08b    c08c    c08d
					# rC9  = c09a    c09b    c09c    c09d
					# rC10 = c10a    c10b    c10c    c10d
					# rC11 = c11a    c11b    c11c    c11d
					# rC12 = c12a    c12b    c12c    c12d
					# rC13 = c13a    c13b    c13c    c13d
#
	movaps		rC10, rC0	# rC0 = c10a    c10b    c10c    c10d
	movaps		rC8 , rC1	# rC1 = c08a    c08b    c08c    c08d
	movaps          rC12, rC2 	# rC2  = c12a    c12b    c12c    c12d
	unpckhps	rC7, rB0	# rB0 = c6c    c7c    c6d    c7d
	unpckhps	rC5, rA0	# rA0 = c4c    c5c    c4d    c5d
	unpcklps	rC7, rC6	# rC6 = c6a    c7a    c6b    c7b
	unpckhps	rC11, rC0	# rC0 = c10c    c11c    c10d    c11d
	unpckhps	rC9 , rC1	# rC1 = c08c    c09c    c08d    c09d
	movlhps		rB0, rC7	# rC7 = c7a    c7b    c6c    c7c
	unpcklps	rC5, rC4	# rC4 = c4a    c5a    c4b    c5b
	movhlps		rA0, rC7	# rC7 = c4d    c5d    c6c    c7c
	movlhps		rC6, rA0	# rA0 = c4c    c5c    c6a    c7a
	unpcklps	rC11, rC10	# rC10 = c10a    c11a    c10b    c11b
	movhlps		rC4, rB0	# rB0 = c4b    c5b    c6d    c7d
	movlhps		rC0, rC11	# rC11 = c11a    c11b    c10c    c11c
	addps		rA0, rC7	# rC7 = c4cd   c5cd   c6ac   c7ac
	movlhps		rC4, rC5	# rC5 = c5a    c5b    c4a    c5a
	unpcklps	rC9 , rC8 	# rC8  = c08a    c09a    c08b    c09b
	movhlps		rC1, rC11	# rC11 = c08d    c09d    c10c    c11c
	movlhps		rC10, rC1	# rC1 = c08c    c09c    c10a    c11a
	movhlps		rC5, rC6	# rC6 = c4a    c5a    c6b    c7b
	movhlps		rC8 , rC0	# rC0 = c08b    c09b    c10d    c11d
	unpcklps	rC13, rC2	# rC2  = c12a    c13a    c12b    c13b
	addps		rC1, rC11	# rC11 = c08cd   c09cd   c10ac   c11ac
	addps		rB0, rC6	# rC6 = c4ab   c5ab   c6bd   c7bd
	movlhps		rC8 , rC9 	# rC9  = c09a    c09b    c08a    c09a
	unpckhps	rC13, rC12	# rC12 = c12c    c13c    c12d    c13d
	movhlps		rC9 , rC10	# rC10 = c08a    c09a    c10b    c11b
	addps		rC6, rC7	# rC7 = c4abcd c5abcd c6bdac c7bdac
	addps		rC0, rC10	# rC10 = c08ab   c09ab   c10bd   c11bd
	addps		rC2, rC12	# rC12 = c12ac   c13ac   c12bd   c13bd
	addps		rC10, rC11	# rC11 = c08abcd c09abcd c10bdac c11bdac

#

	movhlps		rC12, rC13	# rC13 = c12bd   c13bd   X       X
						prefB(320+NBso(pB))
	addps		rC13, rC12	# rC12 = c12abcd c13abcd X       X
#
#	Write results back to C
#
#ifdef SREAL
	movups	rC3, (pC)
	movups	rC7, 16(pC)
	movups	rC11, 32(pC)
	movlps	rC12, 48(pC)
#else
	movss	rC3, (pC)
	movss	rC7, 32(pC)
	movhlps	rC3, rC0
	movhlps	rC7, rC6
	movss	rC0, 16(pC)
	movss	rC6, 48(pC)
	shufps	$0x55, rC3, rC3
	shufps	$0x55, rC7, rC7
	movss	rC3, 8(pC)
	movss	rC7, 40(pC)
	shufps	$0x55, rC0, rC0
	shufps	$0x55, rC6, rC6
	movss	rC0, 24(pC)
	movss	rC6, 56(pC)

	movss	rC11, 64(pC)
	movhlps	rC11, rC2
	movss	rC12, 96(pC)
	movss	rC2, 80(pC)
	shufps	$0x55, rC11, rC11
	shufps	$0x55, rC12, rC12
	movss	rC11, 72(pC)
	shufps	$0x55, rC2, rC2
	movss	rC12, 104(pC)
	movss	rC2, 88(pC)

#endif
#
#       End unrolled final M-loop iteration
#       stM
#
#
#	pC += incCn;  pA -= NBNB - 14*NB;  pB += NB;
#	(pA was not advanced by 14*NB in the unrolled last iteration above)
#
	addq	incCn, pC
	subq	$NBNBso-NB14so, pA
	addq	$NBso, pB
#
#	while (pB != stN);
#
	cmp	pB, stN
	jne	UNLOOP

#
#	Restore callee-saved ireg: rbp is reloaded from r11 before returning
#
	movq	%r11, %rbp
	ret
