#	asblas.s:
#
#	Vector Operations in VAX Assembler
#	Single Precision Version
#	Loops unrolled 4 times
#	Compile on UNIX or VMS using the UNIX assembler.
#		cc -c asblas.s
#
#	For documentation, see file toblas.c
#
#	Oliver McBryan
#	New York University
#
#	Conventions shared by the routines below:
#	  - Arguments are fetched from the argument list at 4(ap), 8(ap), ...
#	    Scalar "a" arguments occupy 8 bytes (D_floating) and are narrowed
#	    to F_floating with cvtdf; vector elements are 4-byte F_floating.
#	  - The .word following each entry label is the procedure entry mask
#	    naming the registers saved on entry
#	    (0xc00 = r10-r11, 0xe00 = r9-r11, 0xf00 = r8-r11).
#	  - Each unrolled loop is entered at the sobgeq at its bottom, so the
#	    body runs n/4 times and not at all when n < 4 (negative n
#	    included).  The trailing n mod 4 elements are NOT processed, which
#	    is why n must be a multiple of 4.  _svev is the one exception: it
#	    copies exactly n elements.
#


#	zero_vector(n,v)
#	v[i] = 0.
#	assume n multiple of 4

	.text
	.align	1
	.globl	_szv
_szv:
	.word	0xc00			# save r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v
	jbr	a2			# test the count before the first pass
a1:
	clrf	(r10)+
	clrf	(r10)+
	clrf	(r10)+
	clrf	(r10)+
a2:
	sobgeq	r11,a1
	ret


#	vector_equals_scalar(n,v,a)
#	v[i] = a
#	assume n multiple of 4

	.text
	.align	1
	.globl	_sves
_sves:
	.word	0xc00			# save r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v
	cvtdf	12(ap),r0		# r0 = a, narrowed to single
	jbr	b2
b1:
	movf	r0,(r10)+
	movf	r0,(r10)+
	movf	r0,(r10)+
	movf	r0,(r10)+
b2:
	sobgeq	r11,b1
	ret


#	float sum_elements_of_vector(n,v)
#	returns v[0] + ... + v[n-1]; the single precision sum is widened
#	to D_floating in r0/r1 before returning
#	assume n multiple of 4

	.text
	.align	1
	.globl	_svsum
_svsum:
	.word	0xc00			# save r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v
	clrf	r0			# r0 = running sum
	jbr	c2
c1:
	addf3	(r10)+,(r10)+,r1	# r1 = sum of the next two elements
	addf2	r1,r0
	addf3	(r10)+,(r10)+,r1
	addf2	r1,r0
c2:
	sobgeq	r11,c1
	cvtfd	r0,r0			# widen the result for the caller
	ret


#	copy_vector_to_vector(n,v1,v2)
#	v2[i] = v1[i]
#	n need not be a multiple of 4 here; n <= 0 copies nothing.
#
#	movc3 takes a 16-bit byte count, so handing it n*4 directly would
#	silently truncate the length once n reaches 16384.  The copy is
#	therefore done in pieces of at most 65535 bytes.  movc3 leaves r1 and
#	r3 pointing just past the bytes it moved, so successive pieces
#	continue where the previous one stopped.
#
#	movc3 overwrites r0-r5.  r2-r5 are listed in the entry mask so the
#	routine is also safe under a calling standard that expects r2-r11 to
#	be preserved (NOTE(review): believed to matter on VMS only; UNIX cc
#	is understood to treat r0-r5 as scratch).
#
#	Pieces are copied front to back, so v1 and v2 are assumed not to
#	overlap when the vectors are longer than one piece.

	.text
	.align	1
	.globl	_svev
_svev:
	.word	0x83c			# save r2-r5 (movc3 scratch) and r11
	ashl	$2,4(ap),r11		# r11 = bytes left to copy = n * 4
	jleq	cp3			# n <= 0: nothing to do
	movl	8(ap),r1		# r1 -> v1 (source)
	movl	12(ap),r3		# r3 -> v2 (destination)
	jbr	cp2
cp1:
	subl2	r0,r11			# account for the piece about to be moved
	movc3	r0,(r1),(r3)		# move 65535 bytes; r1,r3 advance past them
cp2:
	movzwl	$65535,r0		# r0 = largest count movc3 accepts
	cmpl	r11,r0
	jgtr	cp1			# more than one piece still to go
	movc3	r11,(r1),(r3)		# last (or only) piece
cp3:
	ret


#	add_scalar_to_vector(n,a,v)
#	v[i] = v[i] + a
#	assume n multiple of 4

	.text
	.align	1
	.globl	_svas
_svas:
	.word	0xc00			# save r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	cvtdf	8(ap),r0		# r0 = a, narrowed to single
	movl	16(ap),r10		# r10 -> v (a occupies 8 bytes)
	jbr	d2
d1:
	addf3	r0,(r10),(r10)+
	addf3	r0,(r10),(r10)+
	addf3	r0,(r10),(r10)+
	addf3	r0,(r10),(r10)+
d2:
	sobgeq	r11,d1
	ret


#	multiply_vector_by_scalar(n,v,a)
#	v[i] *= a
#	n is a multiple of 4

	.text
	.align	1
	.globl	_svms
_svms:
	.word	0xc00			# save r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v
	cvtdf	12(ap),r0		# r0 = a, narrowed to single
	jbr	e2
e1:
	mulf3	r0,(r10),(r10)+
	mulf3	r0,(r10),(r10)+
	mulf3	r0,(r10),(r10)+
	mulf3	r0,(r10),(r10)+
e2:
	sobgeq	r11,e1
	ret


#	vector_equals_scalar_plus_vector(n,v1,a,v2)
#	v1[i] = v2[i] + a
#	n is a multiple of 4

	.text
	.align	1
	.globl	_svespv
_svespv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (destination)
	cvtdf	12(ap),r0		# r0 = a, narrowed to single
	movl	20(ap),r9		# r9 -> v2 (a occupies 8 bytes)
	jbr	f2
f1:
	addf3	r0,(r9)+,(r10)+
	addf3	r0,(r9)+,(r10)+
	addf3	r0,(r9)+,(r10)+
	addf3	r0,(r9)+,(r10)+
f2:
	sobgeq	r11,f1
	ret


#	vector_equals_scalar_times_vector(n,v1,a,v2)
#	v1[i] = v2[i] * a
#	n is a multiple of 4

	.text
	.align	1
	.globl	_svesmv
_svesmv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (destination)
	cvtdf	12(ap),r0		# r0 = a, narrowed to single
	movl	20(ap),r9		# r9 -> v2 (a occupies 8 bytes)
	jbr	g2
g1:
	mulf3	r0,(r9)+,(r10)+
	mulf3	r0,(r9)+,(r10)+
	mulf3	r0,(r9)+,(r10)+
	mulf3	r0,(r9)+,(r10)+
g2:
	sobgeq	r11,g1
	ret


#	float inner_product(n,v1,v2)
#	return v1.v2; the single precision sum is widened to D_floating
#	in r0/r1 before returning
#	n multiple of 4

	.text
	.align	1
	.globl	_svdotv
_svdotv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1
	movl	12(ap),r9		# r9 -> v2
	clrf	r0			# r0 = running sum
	jbr	h2
h1:
	mulf3	(r9)+,(r10)+,r1		# r1 = v2[i] * v1[i]
	addf2	r1,r0
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
h2:
	sobgeq	r11,h1
	cvtfd	r0,r0			# widen the result for the caller
	ret


#	multiply_vector_by_vector(n,v1,v2)
#	v1[i] = v1[i]*v2[i]
#	n multiple of 4

	.text
	.align	1
	.globl	_svmv
_svmv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (updated in place)
	movl	12(ap),r9		# r9 -> v2
	jbr	i2
i1:
	mulf3	(r9)+,(r10),(r10)+
	mulf3	(r9)+,(r10),(r10)+
	mulf3	(r9)+,(r10),(r10)+
	mulf3	(r9)+,(r10),(r10)+
i2:
	sobgeq	r11,i1
	ret


#	divide_vector_by_vector(n,v1,v2)
#	v1[i] = v1[i]/v2[i]
#	n multiple of 4

	.text
	.align	1
	.globl	_svdv
_svdv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (dividend, updated in place)
	movl	12(ap),r9		# r9 -> v2 (divisor)
	jbr	j2
j1:
	divf3	(r9)+,(r10),(r10)+
	divf3	(r9)+,(r10),(r10)+
	divf3	(r9)+,(r10),(r10)+
	divf3	(r9)+,(r10),(r10)+
j2:
	sobgeq	r11,j1
	ret


#	subtract_vector_from_vector(n,v1,v2)
#	v2[i] = v2[i] - v1[i]
#	n multiple of 4

	.text
	.align	1
	.globl	_svlv
_svlv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (subtrahend)
	movl	12(ap),r9		# r9 -> v2 (updated in place)
	jbr	k2
k1:
	subf3	(r10)+,(r9),(r9)+
	subf3	(r10)+,(r9),(r9)+
	subf3	(r10)+,(r9),(r9)+
	subf3	(r10)+,(r9),(r9)+
k2:
	sobgeq	r11,k1
	ret


#	add_scalar_times_vector_to_vector(n,a,v1,v2)
#	v2[i] = v2[i] + a*v1[i]
#	n multiple of 4 (the loop handles four elements per pass)

	.text
	.align	1
	.globl	_svpsv
_svpsv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	cvtdf	8(ap),r0		# r0 = a, narrowed to single
	movl	16(ap),r10		# r10 -> v1 (a occupies 8 bytes)
	movl	20(ap),r9		# r9 -> v2 (updated in place)
	jbr	l2
l1:
	mulf3	r0,(r10)+,r1		# r1 = a * v1[i]
	addf3	r1,(r9),(r9)+
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
l2:
	sobgeq	r11,l1
	ret


#	vector_equals_vector_minus_vector(n,v1,v2,v3)
#	v1[i] = v2[i] - v3[i]
#	n multiple of 4 (the loop handles four elements per pass)

	.text
	.align	1
	.globl	_svevlv
_svevlv:
	.word	0xe00			# save r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r0		# r0 -> v1 (destination)
	movl	12(ap),r10		# r10 -> v2
	movl	16(ap),r9		# r9 -> v3
	jbr	m2
m1:
	subf3	(r9)+,(r10)+,(r0)+
	subf3	(r9)+,(r10)+,(r0)+
	subf3	(r9)+,(r10)+,(r0)+
	subf3	(r9)+,(r10)+,(r0)+
m2:
	sobgeq	r11,m1
	ret


#	add_vector_times_vector_to_vector(n,v1,v2,v3)
#	v3[i] = v3[i] + v1[i]*v2[i]
#	n multiple of 4 (the loop handles four elements per pass)

	.text
	.align	1
	.globl	_svpvv
_svpvv:
	.word	0xf00			# save r8,r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1
	movl	12(ap),r9		# r9 -> v2
	movl	16(ap),r8		# r8 -> v3 (updated in place)
	jbr	n2
n1:
	mulf3	(r10)+,(r9)+,r0		# r0 = v1[i] * v2[i]
	addf3	r0,(r8),(r8)+
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
n2:
	sobgeq	r11,n1
	ret


#	vector_equals_vector_plus_scalar_times_vector(n,v1,v2,a,v3)
#	v1[i] = v2[i] + a*v3[i]
#	n multiple of 4 (the loop handles four elements per pass)

	.text
	.align	1
	.globl	_svevpsv
_svevpsv:
	.word	0xf00			# save r8,r9,r10,r11
	ashl	$-2,4(ap),r11		# r11 = n / 4
	movl	8(ap),r10		# r10 -> v1 (destination)
	movl	12(ap),r9		# r9 -> v2
	cvtdf	16(ap),r0		# r0 = a, narrowed to single
	movl	24(ap),r8		# r8 -> v3 (a occupies 8 bytes)
	jbr	o2
o1:
	mulf3	(r8)+,r0,r1		# r1 = a * v3[i]
	addf3	r1,(r9)+,(r10)+
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
o2:
	sobgeq	r11,o1
	ret