
; ----------------------------------------------------------------------
; Simulation dimensions.
; The code that indexes the trail map puts the column in the low byte and
; the row in the high byte of the offset and wraps rows with a 0x7F mask,
; so NUM_COLS = 256 and NUM_ROWS = 128 are baked into that code as well.
; ----------------------------------------------------------------------
%define NUM_PARTICLES   128
%define NUM_COLS        256
%define NUM_ROWS        128

; Bytes per particle record:
;   +0  word  posx  (high byte = column)
;   +2  word  posy  (high byte = row)
;   +4  byte  velx  (signed)
;   +5  byte  vely  (signed)
%define PARTICLE_BYTES  6

section .bss

; One intensity byte per cell, row after row.
; `particles` has to stay immediately behind this buffer: the blur pass in
; I_main reads one byte past its end, and the disabled clear loop in the
; init code would leave DI pointing there.
trailmap:       resb    NUM_ROWS * NUM_COLS

; NUM_PARTICLES records of PARTICLE_BYTES each (layout above).
particles:      resb    NUM_PARTICLES * PARTICLE_BYTES

; Frame counter, incremented once per I_main call. The program never
; writes an initial value to it.
framecounter:   resw    1

section .text
    org 0x100

; ----------------------------------------------------------------------
; Program entry.
;
;   1. switch to video mode 13h
;   2. point INT 1Ch (timer tick) at I_main through DOS
;   3. fill the particle table with pseudo-random words
;   4. idle in a HLT loop; all further work happens inside I_main
;
; Deliberately NOT done here (left to the state of memory/registers at
; program start):
;   - framecounter is not zeroed      (mov word [framecounter], 0)
;   - trailmap is not cleared; the dropped loop was
;         mov di, trailmap
;         mov cx, 256*128
;       L_clear_fill:
;         stosb                       ; needs AL = 0
;         loop L_clear_fill
;     which would also have left DI at `particles`
;   - AH is not loaded before INT 10h, it is assumed to be 0
; ----------------------------------------------------------------------

%define VIDEO_INT           0x10
%define VIDEO_MODE_13H      0x13        ; 320x200
%define DOS_INT             0x21
%define DOS_SET_VECTOR      0x25        ; AH function number, AL = vector
%define TIMER_TICK_VECTOR   0x1c

    ; --- video mode -----------------------------------------------------
    mov     al, VIDEO_MODE_13H          ; AH assumed 0 => "set video mode"
    int     VIDEO_INT

    ; --- install the tick handler ---------------------------------------
    mov     ax, (DOS_SET_VECTOR << 8) + TIMER_TICK_VECTOR
    mov     dx, I_main                  ; DS:DX = new handler
    int     DOS_INT
    ; NOTE(review): I_main can run from this point on, while the loop
    ; below still relies on AX, CX, DI and ES -- that is only safe if the
    ; handler hands those registers back unchanged.

    ; --- seed the particle table ----------------------------------------
    ; AX is not reloaded: the generator starts from whatever value AX
    ; holds after the INT 21h call above.  A time-based seed was
    ; considered and dropped:
    ;     mov ah, 0
    ;     int 0x1a                      ; ticks since midnight in CX:DX
    ;     mov ax, dx
    mov     di, particles               ; needed only because the trailmap
                                        ; clear (which ends here) is cut
    mov     cx, NUM_PARTICLES * 3       ; three words per particle
L_seed_particles:
    add     ax, 42                      ; x = (x + 42) * 1337, 16-bit wrap
    imul    ax, 1337
    stosw
    loop    L_seed_particles

    ; --- idle; every frame is produced by the timer tick ----------------
halt:
    hlt
    jmp     halt



;-----------------------------------------------------------------------
; I_main -- INT 1Ch (timer tick) handler; one call renders one frame.
;           Nominally invoked 18.2 times per second.
;
; Per call:
;   1. blur + decay trailmap in place (horizontal only)
;   2. sense / move / deposit for the currently active particles
;   3. copy trailmap to mode-13h video memory at segment 0xA000
;   4. increment framecounter
;
; In:      nothing is assumed about DS, ES or the general registers
; Out:     every register, DS and ES are restored before IRET
; Assumes: DF = 0 (LODSB/STOSB walk upward); CS addresses trailmap,
;          particles and framecounter (.COM layout, org 0x100)
;-----------------------------------------------------------------------
I_main:
        ; The handler uses AX, BX, CX, DX, SI, DI and ES. Save them so the
        ; interrupted code (including the init loop, which can still be
        ; running when the first tick arrives) continues undisturbed.
        pusha
        push    ds
        push    es

        ; Do not trust the incoming DS: the BIOS tick handler that issues
        ; INT 1Ch may have pointed it at its own data segment.
        push    cs
        pop     ds
        push    cs
        pop     es

        mov     si, trailmap            ; si = read cursor
        mov     di, si                  ; di = write cursor (same buffer)

    ;;;; vertical "blur" -- disabled experiment, kept for reference
        ; Idea: swap vertical pairs of pixels here and there and let the
        ; horizontal pass smooth the result.
        ;
        ;   ;mov cx, 5461 ; floor(128*256/6), would need overflow handling
        ;   mov cx, 5418  ; floor(127*256/6), no overflow at the bottom
        ;   L_vert_blur:
        ;       lodsb                   ; al is top pixel
        ;       xchg al, byte [si+256]  ; swap with the pixel one row down
        ;       mov [si], al            ; write bottom value back to the top
        ;       add si, 6               ; mod 256: 2 columns right per row
        ;       loop L_vert_blur
        ;   mov si, di                  ; restore si = trailmap
    ;;;; end vertical "blur"

    ;;;; horizontal blur (also does decay)
        ; new = floor((prev + 2*curr - floor(curr/8) + next) / 4)
        ; "prev" is the pre-blur value of the previous byte. The buffer is
        ; treated as one long row, so neighbours wrap across row ends.
        xor     dx, dx                  ; dl = prev (0 for the first pixel), dh stays 0
        xor     bh, bh                  ; bh stays 0 => bx is a zero-extended byte

        mov     cx, NUM_ROWS * NUM_COLS
.horiz_blur:
        mov     ax, dx                  ; ax  = prev

        mov     bl, [si]                ; bx  = curr
        add     ax, bx
        add     ax, bx                  ; ax  = prev + 2*curr
        mov     dl, bl                  ; curr is "prev" for the next pixel

        shr     bl, 3
        sub     ax, bx                  ; extra decay: minus curr/8

        inc     si
        ; On the last iteration this reads one byte past trailmap, i.e.
        ; the first byte of `particles`. Accepted by the author: it only
        ; disturbs the bottom-right pixel.
        mov     bl, [si]                ; bx  = next

        add     ax, bx
        shr     ax, 2                   ; divide by 4

        stosb
        loop    .horiz_blur
    ;;;; end horizontal blur

    ;;;; handle particles
        ; Ramp-up: while framecounter <= 420 only (framecounter/64)+1
        ; particles are simulated; afterwards all of them.
        ; The compare is unsigned: framecounter is a free-running 16-bit
        ; counter, and a signed test would fall back into the ramp-up path
        ; from 0x8000 on and yield a count far larger than the table.
        ; (At the 0xFFFF -> 0 wrap the ramp-up simply starts over.)
        mov     cx, NUM_PARTICLES

        cmp     word [framecounter], 420
        ja      .all_particles
        mov     cx, [framecounter]
        shr     cx, 6
        inc     cx                      ; never 0: LOOP would run 64k times
.all_particles:

        mov     si, particles           ; si = current particle record
.foreach_particle:
        push    cx                      ; cx is scratch inside the body

        ;;;; sense
        ; Probe 1 ("forward"): position + 8*velocity.
        ; velx/vely are loaded into the HIGH byte and shifted right by 5,
        ; so the low 3 bits of the offset are whatever CL/DL held before:
        ; intentional noise.
        mov     ax, [si]                ; ax = posx
        mov     bx, [si+2]              ; bx = posy

        mov     ch, [si+4]              ; velx
        sar     cx, 5                   ; cx = 8*velx (+ noise), sign-extended

        mov     dh, [si+5]              ; vely
        sar     dx, 5                   ; dx = 8*vely (+ noise), sign-extended

        add     ax, cx
        add     bx, dx

        and     bh, 0x7F                ; wrap row into 0..127

        xchg    bl, ah                  ; bx = row:column = trailmap index

        mov     di, trailmap
        add     di, bx

        ; Fetch the probed byte into AL without spending another register:
        ; swap it out, push it, swap it back. [di] ends up unchanged.
        xchg    al, byte [di]
        push    ax                      ; low byte on the stack = forward sample
        xchg    byte [di], al

        xchg    ah, bl                  ; ax/bx are the probe's posx/posy again

        ; Probe 2 ("sideways"): from probe 1, step along the velocity
        ; rotated by a quarter turn, (~vely, velx), at half the length.
        xchg    cx, dx
        not     cx                      ; same CX as xor with 0xFFFF; flags
                                        ; are rewritten by the SAR below
        sar     cx, 1
        sar     dx, 1

        add     ax, cx
        add     bx, dx

        and     bh, 0x7F                ; wrap row into 0..127

        mov     bl, ah                  ; bx = row:column = trailmap index

        mov     di, trailmap
        add     di, bx

        pop     bx                      ; bl = forward sample

        ; Steer only when the sideways sample is greater than the forward
        ; one.
        ; NOTE(review): JLE is a signed test, so trail bytes >= 0x80 rank
        ; below every value 0..0x7F. Left as is because it shapes the
        ; visuals -- confirm whether an unsigned JBE was intended.
        cmp     byte [di], bl
        jle     .end_sense

        sar     cx, 7                   ; shrink the sideways vector ...
        sar     dx, 7

        add     [si+4], cl              ; ... and add it to the velocity
        add     [si+5], dl
        ;;;; end sense
.end_sense:

        ;;;; move
        ; Positions are words with the cell coordinate in the high byte;
        ; the sign-extended velocity byte is added once per frame.
        ; (Fewer shift bits would make faster dots, but they could skip
        ; rows, which would make the vertical blur necessary.)
        mov     bh, [si+4]              ; velx
        sar     bx, 8                   ; bx = sign-extended velx
        add     [si], bx

        mov     bh, [si+5]              ; vely
        sar     bx, 8                   ; bx = sign-extended vely
        add     [si+2], bx

        and     byte [si+3], 0x7F       ; wrap row (high byte of posy) into 0..127
        ;;;; end move

        ;;;; deposit
        mov     al, [si+1]              ; column = high byte of posx
        mov     ah, [si+3]              ; row    = high byte of posy
        mov     di, trailmap
        add     di, ax                  ; index = row*256 + column
        mov     byte [di], 0xFF         ; set to maximum, so nothing can overflow
        ;;;; end deposit

        pop     cx
        add     si, 6                   ; next 6-byte particle record
        loop    .foreach_particle
    ;;;; end handle particles

    ;;;; copy trail map to screen
        mov     ax, 0xA000              ; video memory segment
        mov     es, ax

        ; Centered position would be (320-256)/2 = 32 pixels from the left
        ; and (200-128)/2 = 36 rows down. The start is instead one row up
        ; and NUM_COLS to the right: CX starts with CL = 0, so the row skip
        ; below already fires after the very first pixel and carries DI
        ; into row 36.
        mov     di, (35*320)+32 + NUM_COLS
        mov     si, trailmap

        mov     cx, NUM_COLS * NUM_ROWS ; number of pixels
.copy_pixel:
        lodsb

        ; optional rounding, not enabled:
        ;   xor ah, ah
        ;   add ax, 8

        shr     al, 4                   ; 0..255 -> 0..15
        add     al, 16                  ; use palette entries 16..31
        ; (a variant that applied this offset only from frame 600 on was
        ;  dropped:  cmp word [framecounter], 600 / jl norainbow)

        stosb

        test    cl, cl                  ; CL = 0 once every 256 pixels
        jnz     .no_skip
        add     di, 320 - NUM_COLS      ; jump over the rest of the scanline
.no_skip:

        loop    .copy_pixel
    ;;;; end copy trail map

        inc     word [framecounter]

        pop     es
        pop     ds
        popa
        iret                            ; interrupt handler: return with IRET

