; WAT(AF)/HMM — [W]onderful [A]ct of [T]ransformation (with [A]VX2 in [F]ullHD) / [H]olo[M]eta[M]orphosis
; Чудесный акт преображения (с AVX2 в FullHD) / Голометаморфоз

; 256-byte intro for MS-DOS / PC x86
; Presented at the Demodulation 2023 party!
; (c) 2023 Jin X (https://t.me/jin_x • jin_x@list.ru)
 
; Features:
; - Full HD TrueColor VESA mode (1920x1080x32bpp)
; - AVX/AVX2 code running in Protected Mode under DOS
; - Strong code compression (-41%: 184 -> 109 bytes)

;-----------------------------------------------------------------------------------------------------------------------
; USER SETTINGS

; Build selector: zero selects the first settings group below, non-zero selects the
; second group (which additionally sets the "Framework settings" symbols).
extended_version	=	0

if ~ extended_version

  ; Value loaded into CL as the counter of the per-pixel `loop` in avx_code!
  iteration_count	=	12
  ; time = (frame * scaleNspeed) raised to this power (see avx_code!)
  time_power_exponent	=	3		; 1, 2 or 3

  ; 1 - vrcpps + vfnmadd231ps, 0 - vdivps + vsubps (see avx_code!)
  reciprocal_and_fma	=	1		; may save a byte
  ; 1 - also clamp colour components from above (vminps with int_color_limit)
  check_high_clamp_range =	1
  ; 0 - no waiting, 1 - wait for vertical retrace start,
  ; 2 - wait for vertical retrace end, then for its start (see intro_code!)
  vsync			=	0		; 1 - wait for VR start, 2 - full

  ; Initial frame counter: loaded into DX when >= $80, otherwise into DH;
  ; no load is emitted when it is 0 (see init_code!)
  init_frame_value	=	0
  ; 1 - compare the whole BX at the end of a scan line, 0 - compare BH only
  exact_height_check	=	1
  ; reverse_int_time: 1 - frame counter is decremented (dec edx), 0 - incremented (inc dx)
  ; si_value: address loaded into SI / used as the base of `[... + si - si_value]` operands;
  ;   the term (vsync+(vsync+1)/2)*5 evaluates to 0, 10, 15 for vsync = 0, 1, 2
  if time_power_exponent = 1
    init_frame_value	=	6
    reverse_int_time	=	1
    si_value		=	$21C + (vsync+(vsync+1)/2)*5 ; +0 | +10 | +15
  else if time_power_exponent = 2
    reverse_int_time	=	0
    si_value		=	$21A + (vsync+(vsync+1)/2)*5 ; +0 | +10 | +15
  else if time_power_exponent = 3
    init_frame_value	=	5
    reverse_int_time	=	1
    si_value		=	$216 + (vsync+(vsync+1)/2)*5 ; +0 | +10 | +15
    ; NOTE(review): the two symbols below are assigned only in this branch and are not
    ; referenced in the visible part of the file - presumably framework options; confirm
    ; that the framework supplies defaults for time_power_exponent = 1 and 2.
    open_a20_line	=	0
    check_success	=	0
  end if ; time_power_exponent

else ; extended_version

  ; Same meaning as in the first group (see the macros that consume each symbol)
  iteration_count	=	12
  time_power_exponent	=	3		; 1, 2 or 3

  reciprocal_and_fma	=	1		; may save a byte
  check_high_clamp_range =	1
  vsync			=	2		; 1 - wait for VR start, 2 - full

  ; NOTE(review): init_frame_value is assigned twice in this group (0 here, $490 two lines below)
  init_frame_value	=	0
  exact_height_check	=	1
  init_frame_value	=	$490
  reverse_int_time	=	1
  si_value		=	$300

  ; Framework settings
  ; (none of these symbols is referenced in the visible part of the file;
  ;  they are presumably consumed by the framework code - TODO confirm their meaning there)
  find_video_mode	=	2
  mode_check_method	=	4

  allow_exit		=	1
  exit_to_dos		=	1
  restore_text_mode 	=	1
  display_msg		=	2

  check_v86_mode	=	1
  open_a20_line		=	1
  disable_nmi		=	1
  init_gdt_descr	=	1
  full_gdt_init		=	1

end if ; [~] extended_version

; Sanity checks for the two settings that select code variants by value
assert time_power_exponent >= 1 & time_power_exponent <= 3
assert vsync >= 0 & vsync <= 2

;-----------------------------------------------------------------------------------------------------------------------
; MAIN INTRO SOURCE for fasm 1

; unpacker! - expands the packed byte stream at `avx_packed` into the AVX routine at DI.
;
; The expansion is driven by a bitmap: CX is a bit index relative to BX (avx_bitmap_end-32)
; that starts at $FF and counts down; unpacking stops when CX becomes negative.
;   bit = 0 : copy one byte from the stream as is
;   bit = 1 : rebuild the start of a VEX-encoded instruction:
;             next bit          - 0: emit prefix $C5, 1: emit prefix $C4 followed by $E2
;             next two bits     - one per cache slot (DL, then DH): 1 = emit the cached byte,
;                                 0 = load the byte from the stream; the emitted byte is
;                                 written back to the slot
;             then one more byte is copied from the stream (falls through to .copy)
;
; On exit SI = DI = si_value in the extended build; in the other build DI = SI
; (SI then points just past the consumed stream bytes).
; `ifdo` / `ifel` and the `@$...` flags are defined outside this view; presumably
; `ifdo c, <a>, ...` emits the listed lines when c is true and `ifel c, <a>, <b>` emits
; <a> when c is true, otherwise <b> - TODO confirm against the framework.
macro	unpacker!
{
  ifdo extended_version, \
		<mov	cx,$FF>, \
		<push	cx>, \
		<mov	bp,si_value + $100>
  ifel @$bp_zero, \
		<xchg	di,ax>, \
		<mov	di,bp>			; DI = avx function
		mov	si,avx_packed
		mov	bx,avx_bitmap_end-32	; CX = $FF
	.unpacknext:
		bt	[bx],cx
		jnc	.copy			; CF = 0 -> plain copy, CF = 1 -> rebuild a VEX instruction

		dec	cx
		bt	[bx],cx
		salc				; AL = 0/$FF
		add	al,$C5			; AL = VEX prefix: $C5 (CF=0) / $C4 (CF=1)
	@@:	stosb				; store VEX byte
		xor	al,$C4 xor $E2		; $C4 -> $E2 (PF=1) -> $C4 (PF=0) | $C5 -> $E3 (PF=0)
		jp	@B			; store $E2 for prefix $C4 (PF=1)

		; The block below runs twice per instruction: once for slot DL, once for slot DH
	.dldh:	xchg	ax,dx			; get byte from DL (last prefix byte | opcode byte)
		dec	cx
		bt	[bx],cx
		jc	@F			; CF = 1 -> use it
		lodsb				; load byte from stream
	@@:	stosb				; store byte
		xchg	dx,ax			; move AL back to DL
		xchg	dl,dh			; swap DL & DH
		; Negating the same register on every pass toggles its sign, so SF alternates:
		; set after the first pass (repeat for DH), clear after the second (continue)
  ifel @$bp_zero, \
		<neg	ah>, \
		<neg	bp>			; -> SF (jump) -> ~SF (no jump)
		js	.dldh			; check flag for DH

	.copy:	movsb				; copy byte from stream
		dec	cx
		jge	.unpacknext		; continue unpacking
		; CL = $FF or a bit less
  ifdo extended_version, \
		<pop	cx>, \
		<mov	si,si_value>

		mov	di,si
  ; Assembly-time flags/values for code outside this view
  @$di_init = 1
  @$si_value = si_value
} ; unpacker!

; init_code! - one-time setup executed before the frame loop.
;
; 1. Builds a table of 8 floats at [DI]: 0, s, 2*s, ..., 7*s, where s is the float at
;    scaleNspeed (each step adds s to ST0). avx_code! reads the table through [si] as the
;    per-lane X offsets, so DI is expected to equal SI here (the extended build sets it
;    explicitly).
; 2. Loads the initial frame counter into DX or DH when init_frame_value is non-zero.
; 3. Extended build only: loads EBP from the word at [si+$10].
;
; Expects EAX = 7 on entry (set outside the visible code): the loop runs until AX becomes
; negative, i.e. 8 times.
macro	init_code!
{
  ifdo extended_version, \
		<mov	di,si>
		; EAX = 7
		fldz
	@@:
		fst	dword [di]		; store scaled (multiplied by scaleNspeed) values 0, 1, ..., 7
		scasd				; DI += 4
		fadd	dword [scaleNspeed + si - si_value]
		dec	ax
		jns	@B

  ; Values below $80 are loaded into DH only, larger ones into the whole DX
  if init_frame_value
    ifel init_frame_value >= $80, \
		<mov	dx,init_frame_value>, \
		<mov	dh,init_frame_value>
  end if ; init_frame_value
  ; NOTE(review): offset $10 presumably addresses the pitch field of a video mode info
  ; block located at SI - confirm against the framework code
  ifdo extended_version, \
		<movzx	ebp,word [si+$10]>	; bytes per scan line
} ; init_code!

; intro_code! - renders one frame and advances the frame counter.
;
; Register usage in this macro:
;   EAX = X of the first pixel in the current group of 8, from -scr_width/2 up to
;         scr_width/2 (exclusive), step 8; sign-extended from AX before each call
;   EBX = Y, from scr_height/2 - 1 down to -scr_height/2
;   EDX = frame counter, stepped once per frame
;   BP  = address of the AVX routine (non-extended build); the extended build calls the
;         fixed address si_value + $100 and uses EBP as the scan line pitch
; The called routine (avx_code!) writes 8 pixels and advances the output pointer itself.
macro	intro_code!
{
  ; Optional vertical retrace synchronisation; DX is saved because it holds the frame counter
  if vsync
		push	dx
  		mov	dx,$3DA
    if vsync = 2
	@@:	in	al,dx
		test	al,8			; waiting for vertical retrace end
		jnz	@B
    end if ; vsync = 2
	@@:	in	al,dx
		test	al,8			; waiting for vertical retrace start
		jz	@B
  		pop	dx
  end if ; vsync

		lea	ebx,[scr_height / 2 - 1] ; initial Y
	.nextline:
		mov	ax,-scr_width / 2	; initial X
	.nextpixel:
		cwde				; EAX = sign-extended AX (X is converted from EAX in avx_code!)
  ifel extended_version, \
		<call	si_value + $100>, \
		\ ; else (~ extended_version)
    <assert ~ @$bp_zero>, \
		<call	bp>

		add	ax,8			; X
		cmp	ax,scr_width / 2
		jne	.nextpixel

  ; Extended build: move EDI from the end of the drawn row to the start of the next one
  ; (add the pitch in EBP, subtract the scr_width*4 bytes just written)
  ifdo extended_version, \
		<lea	edi,[edi+ebp - scr_width*4]>

		dec	ebx			; Y
  ; Either compare the whole BX with the terminating Y, or only its high byte
  ifel exact_height_check, \
		<cmp	bx,-scr_height / 2 - 1>, \
		<cmp	bh,(-scr_height / 2 - 1 - 255) shr 8>
		jne	.nextline

  ifel reverse_int_time, \
		<dec	edx>, \			; decrease frame
		<inc	dx>			; increase frame
  @$eah_not_zero = 1				; {antibug}
} ; intro_code!

; float time = pow(float(1024 - iFrame) * 0.001, 3.0);
; vec2 uv = fragCoord/iResolution.xy - .5;
; uv *= vec2(iResolution.x/iResolution.y, 1);
; uv *= uv;
;
; float dot = uv.x + uv.y;
; float len = sqrt(dot);
; vec3 col = vec3(dot*time, uv);
;
; for (int i = 1; i < 12; ++i) {
;     col += col / (col.brg - len);
; }
;
; col /= 256.;

; avx_code! - computes and stores 8 horizontally adjacent pixels.
;
; In:   EAX = X of the first pixel, EBX = Y, EDX = frame counter
;       [si] = 8 floats (per-lane X offsets, already multiplied by scaleNspeed)
;       lfb_reg (asserted to be EDI) = output address, used with an ES override and an
;       aligned 32-byte store
; Out:  lfb_reg advanced by 32
; Uses: ymm0..ymm7, CL / the `loop` counter, flags
;
; Register map inside the iteration loop:
;   ymm0 = col.r, ymm1 = col.g, ymm2 = col.b, ymm7 = len, ymm3..ymm5 = temporaries
;
; NOTE(review): `loop` counts with the full CX/ECX while only CL is loaded, so the upper
; part of the counter is expected to be zero here. Also, with iteration_count = 12 the loop
; body runs 12 times, whereas the pseudo-code `for (i = 1; i < 12; ++i)` describes 11 -
; confirm which one is intended.
macro	avx_code!
{
		; float time = float(1536 - iFrame) * 0.001; [time_power_exponent = 1]
		; // or: float time = pow(float(iFrame) * 0.001, 2.0); [time_power_exponent = 2 & init_frame_value = 0 & ~ reverse_int_time]
		; // or: float time = pow(float(1024 - iFrame) * 0.001, 3.0); [time_power_exponent = 3 & init_frame_value = 5 & reverse_int_time]
		; vec2 uv = fragCoord/iResolution.xy - .5;
		; uv *= vec2(iResolution.x/iResolution.y, 1);
		; uv *= uv;
		vcvtsi2ss	xmm1, xmm0, eax		; X
		vcvtsi2ss	xmm4, xmm0, ebx		; Y
		vcvtsi2ss	xmm3, xmm0, edx		; frame
		vbroadcastss	ymm1, xmm1
		vbroadcastss	ymm4, xmm4
		vbroadcastss	ymm3, xmm3
		vbroadcastss	ymm2, [scaleNspeed + si - si_value]
  ifdo check_high_clamp_range, \
		<vbroadcastss	ymm6, [int_color_limit + si - si_value]>
		vfmadd213ps	ymm1, ymm2, [si]	; X' = X * scaleNspeed + table = scaled (X, X+1, ..., X+7)
  if time_power_exponent = 1
		vmulps		ymm3, ymm2, ymm3	; time = frame * scaleNspeed
  else if time_power_exponent = 2
		vmulps		ymm3, ymm3, ymm2	; time = frame * scaleNspeed
		vmulps		ymm3, ymm3, ymm3	; time = time^2
  else if time_power_exponent = 3
		vmulps		ymm3, ymm3, ymm2	; time = frame * scaleNspeed
		vmulps		ymm0, ymm3, ymm3	; ymm0 = time^2 (temporary)
		vmulps		ymm3, ymm3, ymm0	; ymm3 = time^3
  end if ; time_power_exponent
		vmulps		ymm2, ymm2, ymm4	; Y' = Y * scaleNspeed (scaled Y)
		vmulps		ymm2, ymm2, ymm2	; ymm2 = Y'^2 (uv.y = col.b)
		vmulps		ymm1, ymm1, ymm1	; ymm1 = X'^2 (uv.x = col.g)

		; float dot = uv.x + uv.y;
		; float len = sqrt(dot);
		; vec3 col = vec3(dot*time, uv);
		vaddps		ymm0, ymm1, ymm2	; dot = X'^2 + Y'^2
		vsqrtps		ymm7, ymm0		; ymm7 = len = sqrt(dot)
		vmulps		ymm0, ymm0, ymm3	; ymm0 = dot * time (col.r)

		; for (int i = 1; i < 12; ++i) {
		;     col += col / (col.brg - len);
		; }
		mov		cl, iteration_count
	@@:	vsubps		ymm4, ymm7, ymm0	; ymm4 = len - col.r
		vsubps		ymm3, ymm7, ymm2	; ymm3 = len - col.b
		vsubps		ymm5, ymm7, ymm1	; ymm3..5 = len - col.brg
  if reciprocal_and_fma
		; vrcpps yields an approximate reciprocal; vfnmadd231ps computes dst = dst - src2 * src3
		vrcpps		ymm3, ymm3
		vrcpps		ymm4, ymm4
		vrcpps		ymm5, ymm5		; ymm3..5 = 1 / (len - col.brg)
		vfnmadd231ps	ymm0, ymm0, ymm3
		vfnmadd231ps	ymm1, ymm1, ymm4
		vfnmadd231ps	ymm2, ymm2, ymm5	; col -= col / (len - col.brg)
  else ; ~ reciprocal_and_fma
		vdivps		ymm4, ymm1, ymm4
		vdivps		ymm5, ymm2, ymm5
		vdivps		ymm3, ymm0, ymm3	; ymm3..5 = col / (len - col.brg)
		vsubps		ymm0, ymm0, ymm3
		vsubps		ymm1, ymm1, ymm4
		vsubps		ymm2, ymm2, ymm5	; col -= col / (len - col.brg), i.e. col += col / (col.brg - len)
  end if ; [~] reciprocal_and_fma
		loop		@B

		; col /= 256.;
		; convert 3 floats R,G,B -> 1 int ARGB
  if check_high_clamp_range
		vminps		ymm0, ymm6, ymm0
		vminps		ymm1, ymm6, ymm1
		vminps		ymm2, ymm6, ymm2	; upper clamp: col = min(int_color_limit, col)
  end if ; check_high_clamp_range
		vxorps		ymm6, ymm6, ymm6	; ymm6 = 0.0
		vmaxps		ymm0, ymm6, ymm0
		vmaxps		ymm1, ymm6, ymm1
		vmaxps		ymm2, ymm6, ymm2	; lower clamp: col = max(0.0, col)
		vcvtps2dq	ymm0, ymm0
		vcvtps2dq	ymm1, ymm1
		vcvtps2dq	ymm2, ymm2		; col = int(col)
		vpslld		ymm0, ymm0, 16		; R -> bits 16..23
		vpslld		ymm1, ymm1, 8		; G -> bits 8..15
		vpor		ymm0, ymm0, ymm1
		vpor		ymm0, ymm0, ymm2	; combine R,G,B

  assert lfb_reg eq edi
		vmovdqa		[es:lfb_reg], ymm0	; output 8 pixels (32 bytes)
		add		lfb_reg,32
		ret
} ; avx_code!

; data! - constants read by init_code! and avx_code!.
;
; scaleNspeed is a 32-bit float that overlaps the 3 bytes emitted just before this macro:
; only its most significant byte ($BA) is defined here. A float with that top byte is
; negative with a magnitude between 2^-11 and 2^-9 (about 0.0005 .. 0.002); the low bytes
; only affect the mantissa (and the lowest exponent bit).
; `df16` is defined outside this view; presumably it emits a 16-bit (truncated) float and
; defines the given label for it - TODO confirm.
macro	data!
{
  scaleNspeed = $-3
  db		$BA					; about -0.001 (this byte is enough) :)
;  df16		-0.001,		scaleNspeed
  ifdo check_high_clamp_range, \
    <df16	254.9,		int_color_limit>	; value 255.0 will overflow (it will be 255.7 in fact)
} ; data!

;-----------------------------------------------------------------------------------------------------------------------
 
; ┌─── ───────── ───────────────────────── ────────── -··
; │ ┌─── ───────── ───────────────────── ────────── ────┐
; │ │ ┌─── ───────── ───────────────── ▄▄▄▄▄▄▄▄▄─ ────┐ │
; │ │ │ ┌─── ───────── ───────────── ▄▀      ▄▀ ────┐ │ │
; │ │ │ │ ┌─── ▄▄▄▄▄▄▄▄▄ ────────  ▄▀      ▄▀ ────┐ │ │ │
; │ │ │ │ │     ▀▄      ▀▄       ▄▀      ▄▀     : │ │ │ │
; │ │ │ │ │       ▀       ▀▄   ▄▀      ▄▀       │ │ │ │ │
; │ │ │ │ │  ████████████▀  ▀▄▀ ▄██▀ ▄▀   ▄███  │ │ │ │ │
; │ │ │ │ │         ▄██▀      ▄██▀ ▄▀   ▄██▀██  │ │ │ │ │
; │ │ │ │ │       ▄██▀  ▀▄  ▄██▀ ▄▀   ▄██▀  ██  │ │ │ │ │
; │ │ │ │ │     ▄██▀   ▄▀ ▄██▀    ▀ ▄██▀    ██  │ │ │ │ │
; │ │ │ │ │   ▄██▀   ▄▀ ▄██▀ ▄    ▄██▀      ██  │ │ │ │ │
; │ │ │ │ │  ▀▀▀   ▄▀  ▀▀▀ ▄▀ ▀▄ ▀▀▀  ▀▄    ▀▀  │ │ │ │ │
; │ │ │ │ :      ▄▀      ▄▀     ▀▄      ▀▄      │ │ │ │ │
; │ │ │ └───── ▄▀      ▄▀ ─────── ▀▀▀▀▀▀▀▀▀ ────┘ │ │ │ │
; │ │ └───── ▄▀      ▄▀ ─────────── ───────── ────┘ │ │ │
; │ └───── ─▀▀▀▀▀▀▀▀▀ ─────────────── ───────── ────┘ │ │
; └───── ────────── ─────────────────── ───────── ────┘ │
; ··-— ────────── ─────────────────────── ───────── ────┘

; ========================[ Greetings to... ]=========================
; Řrřola ♦ HellMood^DESiRE ♦ TomCat^Abaddon ♦ superogue^Marquee Design
; Digimind ♦♦♦ Kuemmel ♦♦ exoticorn^Icebird ♦♦♦ baze^3SC ♦♦♦ g0blinish
; gopher^Alcatraz ♦ iONic^Astroidea ♦ sensenstahl ♦ Dresdenboy^Citavia
; Optimus ♦♦♦ pestis^Brainlez Coders! ♦♦ frag^fsqrt ♦♦♦ Baudsurfer^RSI
; wbcbz7^sibCrew ♦♦ D-Art^Fenomen ♦♦♦ provod^jetlag ♦♦♦ ryg^Farbrausch
; Quiet ♦♦♦ G.Lomsadze ♦♦♦ bfox^insiders ♦♦♦ Adam Bazaroff^Excess Team
; Manwe^SandS ♦♦ TmK^deMarche ♦♦ Dolphin Soft ♦♦ havoc ♦♦ unlord^xylem
; ==================[ ...all sizecoders & YOU! ;) ]===================

; .------------< Welcome to sizecoding chats!!! >-----------.
; | https://discord.gg/NeCSgBTZmh • https://t.me/sizecoders |
; '---------------------------------------------------------'
