;"xgrey" variant of "codegrinder" by Kuemmel
;main algorithm based on a shader by kusma https://www.shadertoy.com/view/4d33RM
;and the discussion on pouet.net here => http://www.pouet.net/topic.php?which=10564
org 100h
use16
;---video setup: point ES just below the VGA segment and switch to mode 13h.
;   NOTE(review): the opcode bytes of the first two instructions are re-read as
;   data further down (sub dh,[si] and fmul dword[bx+si]) - presumably si=0100h
;   and bx=0 at .COM entry, TODO confirm - so do not change or reorder them.
push 0a000h - 70		;ES = A000h-70 paragraphs: 1120 bytes = 3 lines + 160 pixels at 320 bytes/line => centers x axis
aas						;opcode byte forms part of the aspect ratio float read via [bx+si] => sizecoding.org
pop es
mov al,13h				;video mode 13h; presumably ah is still 0 here (BIOS "set mode" function) - TODO confirm
int 10h

;---palette generation: grey ramp pushed through the VGA DAC data port
;   Every pass writes the same 6-bit value three times (R, G, B) => one grey entry.
;   NOTE(review): the DAC write index (port 03C8h) is never set here and cx is not
;   initialised - presumably index 0 after the mode set and cx=00FFh at .COM entry,
;   TODO confirm. cl counts down, so the ramp runs from bright to dark.
	mov	dx, 03C9h		; DAC data port
grey_ramp:
	mov	al, cl			; intensity follows the remaining loop count
	shr	al, 2			; scale 8 bit -> 6 bit (shrd ax,cx,18 would be shorter, but the result is undefined on Intel; works on DOSBox)
	out	dx, al			; red
	out	dx, al			; green
	out	dx, al			; blue
	loop	grey_ramp

;---main intro routine
;   Per pixel: derive (x,y) from di, distort the coordinates on the FPU with a
;   sphere-like term, build an XOR pattern from the distorted coordinates plus a
;   frame counter, scale it by sqrt(t) and store the resulting colour index at es:di.
;   The FPU exchanges data with the integer registers through the stack image that
;   pusha / push ax leave in memory, addressed as [bx-N].
;   NOTE(review): this assumes bx=0 and sp=0FFFEh at .COM entry, so that [bx-4] is the
;   saved AX, [bx-8] the saved DX and [bx-5]/[bx-9] the same words shifted down by
;   one byte - TODO confirm against the target DOS environment.
;   The FPU stack columns in the comments below list st0 first.
fld1					;1 (constant, stays on the FPU stack for the whole run)
main_loop:
  mov ax,0xcccd  		
  mul di				;rrrola's trick => y=dh, x=dl
  sub dh,[si]			;center y axis; presumably si=0100h, i.e. the offset is the first opcode byte of the program
  pusha					;dump all registers to the stack so the FPU can load/store them as memory
  fild  word[bx-8]		;saved dx as signed word, dh = y in the high byte	|1		   
  fild  word[bx-9]		;word one byte lower: high byte = saved dl = x		|y			|1	  
  fmul	dword[bx+si]	;x * float taken from [bx+si] (presumably the first 4 code bytes) => aspect ratio, see sizecoding.org
  fld   st1				;y		       		|x			|y	  |1	  
  fmul  st0,st0			;y*y				|x			|y	  |1  	
  fld   st1				;x					|y*y	   	|x	  |y   	|1	
  fmul  st0,st0			;x*x	     		|y*y	   	|x	  |y	|1
  faddp st1,st0			;x*x+y*y	   		|x		  	|y	  |1
  fmul dword[si-258+f]	;(x*x+y*y)*f		|x	  		|y	  |1		;with si=256 this is the dword at f-2: 2 code bytes + the word at f
  fsubr st0,st3			;1-(x*x+y*y)*f		|x			|y	  |1	
  fabs					;t=|1-(x*x+y*y)*f|	|x			|y	  |1
  fst   st4  			;t					|x			|y	  |1	|t		;keep a copy of t for the shading below
  fsqrt					;e=sqrt(t)			|x			|y	  |1	|t
  fsubr st0,st3			;1-e			  	|x		  	|y	  |1	|t
  fadd  st0,st3			;2-e				|x		  	|y	  |1	|t
  fmul  st1,st0			;2-e			  	|x*(2-e)	|y	  |1	|t
  fmulp st2,st0			;x*(2-e)		  	|y*(2-e)	|1	  |t	
  fistp word[bx-4]		;store new x over the saved ax word (its high byte ends up in ah)
  fistp word[bx-5]		;store new y one byte lower: its high byte overwrites saved al
						;NOTE(review): the low byte of this store lands in the saved ch, so cx (the loop
						;counter) comes back from popa modified - this only changes how many pixels are
						;drawn between two counter/keyboard checks
  popa					;al = new y (high byte), ah = new x (high byte), other registers restored
  add al,[bp+si]		;inc y_movement (byte at [bp+si] is the global movement counter)
  sub ah,[bp+si]		;inc x_movement
  xor al,ah				;XOR pattern from the distorted coordinates
  shr al,2				;0..63
  add al,127			;127..190
  xor ah,ah 			;ax = c, the unshaded value
  push ax				;put c on the stack at [bx-4] so the FPU can read it
  fild word[bx-4]		;c				|1			|t	
  fmul st0,st0			;c*c			|1			|t	
  fmul st0,st2			;c*c*t			|1			|t
  fsqrt					;sqrt(c*c*t)	|1			|t
  fistp word[bx-4]		;write the shaded value back into the pushed slot	|1	|t	
  pop ax				;ax = shaded value
  cmp ax,0xff
  jna no_sat			;unsigned compare: anything above 255 is clamped
	 mov al,0xff 
  no_sat:
  not al				;invert the colour index (author's note: omitting the NOT also looks good)
  stosb					;write the pixel to es:di and advance di
loop main_loop
inc byte[bp+si]			;update global movement counter
in al,60h				;read the keyboard controller data port
  dec al				;zero only if the value read was 1 (ESC make scancode)
jnz main_loop			;not ESC => keep drawing (fld1 is not repeated, the FPU stack stays balanced)
ret						;leave the program
f dw 0x3160				;upper word of the dword float constant for the sphere size; the lower word is taken from
						;the 2 code bytes in front of f (see fmul dword[si-258+f]), as that precision is not needed
						;there is a similar number in the code at +63, but using it shows more artifacts...so 2 more bytes are spent here ;-)

