	.file	"sfill.s"
// call sfill(n,alpha,z,incz)
// single precision fill vector z with constant alpha
//
// incz >= 1
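// no alignment requirement for z beyond the natural 4-byte alignment
//    of a single precision word
// if incz = 1 and n >= 11, the code stores leading elements one at a
//    time until z is quad (16-byte) aligned, then uses quad stores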
//
// for :
//  do 10 i=1,incz*n,incz
//10  z(i) = alpha
//
// r16 *n	r17 *alpha	r18 *z	r19 *incz
//
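// r31 - loop decrement value (also scratch for the alignment test)
// r22 - leftover count n%4 (or n-1 when n < 11) for the single store loop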
//
	.align	8
	.text
_sfill::			// C entry
_sfill_::			// FORTRAN entry
//
// Store the single precision value alpha into z(1), z(1+incz), ...,
// z(1+(n-1)*incz).  Every argument is passed by reference:
//	r16 -> n	r17 -> alpha	r18 -> z	r19 -> incz
// Nothing is stored when n < 1.
//
// Registers written: r16, r18, r19, r22, r31.  f4-f7 are used as the
// store source; their incoming contents are parked in the static block
// d1 on entry and reloaded at .exit.  Because that save area is static,
// two overlapping activations of this routine would share it.
//
// Strategy:
//	n < 11			one element per pass through .loop3
//	n >= 11, incz > 1	four strided stores per pass (.loop2)
//	n >= 11, incz = 1	peel 0-3 elements until z is 16-byte
//				aligned, then one quad store per pass (.loop)
// In both n >= 11 cases the n%4 leftover elements go through .loop3.
//
	fst.q	f4,d1		// park caller's f4-f7
	ld.l	0(r16),r16	// r16 = n
	adds	-1,r16,r0	// CC=1 if n < 1
	bc	.exit		// nothing to fill
	ld.l	0(r19),r19	// r19 = incz
	fld.l	0(r17),f4	// f4 = alpha
	shl	2,r19,r19	// r19 = incz*sizeof(single), the byte stride
	adds	-11,r16,r0	// CC=1 if n<11
	bnc	.n_ge_11	// if n>=11 goto .n_ge_11
// n<11: every element is written by the single store loop
	subs	r18,r19,r18	// back z up one stride; .loop3 pre-increments
	br	.loop3
	 adds	-1,r16,r22	// (delay slot) r22 = n-1, .loop3 stores r22+1
.n_ge_11:
// check for incz > 1
	subs	4,r19,r0	// CC set if stride > 4 bytes, i.e. incz > 1
	bc.t	.incz_gt_1	// if CC=1  store z(1)
	 fst.l	f4,0(r18)	//   and goto .incz_gt_1 (slot runs only if taken)
// incz = 1 from here on: check for z to be quad aligned
	and	15,r18,r0	// CC=1 if lower 4 bits are zero
	bc	.aligned	// if lower 4 bits are 0, z is aligned
//
// z is not quad aligned.  Its offset within the 16-byte line decides
// how many leading elements are peeled off with single stores:
//	offset  8 (64 bit boundary)	2 elements
//	offset  4			3 elements
//	offset 12			1 element
//
	and	7,r18,r0	// CC=1 if lower 3 bits are zero
	bnc	.32bit_bndry
.64bit_bndry:
	fst.l	f4,0(r18)
	fst.l	f4,4(r18)
	adds	8,r18,r18	// now the array is aligned
	adds	-2,r16,r16	// two elements fewer left to do
	br	.aligned
	 nop
.32bit_bndry:
	and	12,r18,r31	// r31 = offset within the line (4 or 12)
	adds	-12,r31,r0	// CC=1 if r31<12
	bc	.32_3_bit_bndry	// offset 4: three elements required
// offset 12: a single element reaches the next 128 bit boundary
.32_1_bit_bndry:
	fst.l	f4,0(r18)
	adds	4,r18,r18	// now the array is aligned
	adds	-1,r16,r16	// one element fewer left to do
	br	.aligned
	 nop
.32_3_bit_bndry:
	fst.l	f4,0(r18)
	fst.l	f4,4(r18)
	fst.l	f4,8(r18)
	adds	12,r18,r18	// now the array is aligned
	adds	-3,r16,r16	// three elements fewer left to do
.aligned:
// r18 is 16-byte aligned and at least 8 elements remain in r16.
	and	3,r16,r22	// r22 = n%4, leftover for .loop3
	subs	r16,r22,r16	// n=n-r22, now a multiple of 4
	adds	-8,r16,r16	// preset loop counter
	adds	-4,r0,r31	// preset loop decrement
// Replicate alpha into f5-f7 by reloading it from memory so that every
// lane of the quad store carries exactly the bits held in f4.  Adding
// zero (fadd.ss f0,f4,fN) is not a faithful copy: -0.0 comes back as
// +0.0, and the operand goes through the floating-point adder, which
// can trap on or alter special values.  r17 has not been modified, and
// if alpha lives inside z the stores above can only have rewritten it
// with its own bit pattern.
	fld.l	0(r17),f5	// f5 = alpha
	fld.l	0(r17),f6	// f6 = alpha
	fld.l	0(r17),f7	// f7 = alpha
// The first bla only primes LCC: its target is the instruction that
// follows its delay slot, so execution reaches .loop either way.
	bla	r31,r16,.loop	// set LCC
	 fst.q	f4,0(r18)	// store z(1),...,z(4)
.loop:
	bla	r31,r16,.loop
	 fst.q	f4,16(r18)++	// store 4 values of z

	br	.cleanup
	 adds	12,r18,r18	// point r18 at the last element written
.incz_gt_1:
// z(1) was stored in the bc.t delay slot; r18 still points at it.
	and	3,r16,r22	// r22 = n%4, leftover for .loop3
	subs	r16,r22,r16	// n=n-r22, now a multiple of 4
	adds	-8,r16,r16	// preset loop counter (CC=1 if n<8)
	adds	-4,r0,r31	// preset loop decrement
	fst.l	f4,r19(r18)++	// store z(2)
	fst.l	f4,r19(r18)++	// store z(3)
	bla	r31,r16,.loop2	// prime LCC; lands on .loop2 either way
	 fst.l	f4,r19(r18)++	// store z(4)
.loop2:				// store 4 values of z
	fst.l	f4,r19(r18)++
	fst.l	f4,r19(r18)++
	fst.l	f4,r19(r18)++
	bla	r31,r16,.loop2
	 fst.l	f4,r19(r18)++
.cleanup:
// r18 points at the last element written, r22 = elements still to do.
	bte	0,r22,.exit	// exit if r22=0
	adds	-1,r22,r22	// r22-- so that .loop3 stores r22+1 elements
.loop3:
	fst.l	f4,r19(r18)++	// advance one stride, then store
	adds	-1,r22,r22	// r22--, CC=1 once the count is used up
	bnc	.loop3
.exit:
	fld.q	d1,f4		// give the caller back f4-f7
	bri	r1		// return
	 nop
	.data
// d1: quad-aligned static save area.  sfill writes f4-f7 here with
// fst.q on entry and reads them back with fld.q at .exit.
// (.blkf 4 presumably reserves four single precision words, matching
// the 16 bytes moved by a quad access - confirm against the assembler.)
	.align	.quad
d1:	.blkf	4

