;; There isn't a single optimised routine in here; wait for the next
;; release. The instructions are written in pairs to make it easier to
;; optimise later on :)

BITS 32

GLOBAL Convert32_16RGB565
GLOBAL Convert32_32RGB888
GLOBAL Convert32_8RGB332

GLOBAL ConvertX86

	
SECTION .text align=16


;; Convert_*
;; Parameters:
;;   ESI = source 
;;   EDI = dest
;;   ECX = amount (NOT 0!!! (the ConvertX86 routine checks for that though))
;; Destroys:
;;   EAX, EBX, EDX


Convert32_32RGB888:
	;; 32-bit -> 32-bit "conversion": the pixel layout is the same on both
	;; sides, so each dword is copied across unchanged.
	;;   In:  ESI = source, EDI = dest, ECX = pixel count (must be > 0)
	;;   Out: ESI/EDI advanced past the copied pixels, ECX = 0
	;;   Scratch: EDX
.copy_pixel:
	mov	edx,[esi]		; fetch one source pixel
	mov	[edi],edx		; store it untouched

	lea	esi,[esi+4]		; step both pointers one pixel
	lea	edi,[edi+4]

	sub	ecx,1
	jnz	.copy_pixel

	jmp	globalreturn		; hand the row back to the row loop


;; From:	00000000 | rrrrrrrr | gggggggg | bbbbbbbb
;;   To:	00000000 | 00000000 | rrrrrggg | gggbbbbb
Convert32_16RGB565:
	;; Packs ECX 32-bit pixels (00000000|rrrrrrrr|gggggggg|bbbbbbbb) read
	;; from [ESI] into 16-bit rrrrrggg|gggbbbbb pixels written to [EDI].
	;;   In:  ESI = source, EDI = dest, ECX = pixel count (must be > 0)
	;;   Out: ESI/EDI advanced past the converted pixels, ECX = 0
	;;   Scratch: EAX, EBX, EDX
	;;
	;; An even count takes the paired path (one dword store per two
	;; pixels).  An odd count sends every pixel of the row through the
	;; single-pixel path.
	test	cl,1
	jz	.pair

.single:
	mov	edx,[esi]		; edx = source pixel
	add	esi,4

	mov	eax,edx
	mov	ebx,edx

	shr	edx,3			; blue  bits 3..7   -> 0..4
	shr	eax,5			; green bits 10..15 -> 5..10
	shr	ebx,8			; red   bits 19..23 -> 11..15

	and	edx,0000000000011111b
	and	eax,0000011111100000b
	and	ebx,1111100000000000b

	or	eax,edx
	or	eax,ebx			; ax = finished 565 pixel

	mov	[edi],ax
	add	edi,2

	sub	ecx,1
	jnz	.single

	jmp	globalreturn

.pair:
	;; First pixel of the pair -> low word of EAX (high word ends up 0
	;; because every component is masked to 16 bits).
	mov	ebx,[esi]
	mov	eax,ebx
	mov	edx,ebx

	shr	eax,3			; blue
	shr	edx,5			; green
	shr	ebx,8			; red

	and	eax,0000000000011111b
	and	edx,0000011111100000b
	and	ebx,1111100000000000b

	or	eax,edx
	or	eax,ebx

	;; Second pixel of the pair -> high word of EAX.  Each component is
	;; shifted straight to its final position, i.e. the usual 565 field
	;; position plus 16.
	mov	edx,[esi+4]

	mov	ebx,edx
	shl	ebx,13			; blue  bits 3..7   -> 16..20
	and	ebx,0x001F0000
	or	eax,ebx

	mov	ebx,edx
	shl	ebx,11			; green bits 10..15 -> 21..26
	and	ebx,0x07E00000
	or	eax,ebx

	shl	edx,8			; red   bits 19..23 -> 27..31
	and	edx,0xF8000000
	or	eax,edx

	mov	[edi],eax		; both pixels in a single store
	add	esi,8
	add	edi,4

	sub	ecx,2
	jnz	.pair

	jmp	globalreturn

	

Convert32_8RGB332:
	;; Packs ECX 32-bit pixels (00000000|rrrrrrrr|gggggggg|bbbbbbbb) read
	;; from [ESI] into 8-bit rrrgggbb pixels written to [EDI].
	;;   In:  ESI = source, EDI = dest, ECX = pixel count (must be > 0)
	;;   Out: ESI/EDI advanced past the converted pixels, ECX = 0
	;;   Scratch: EAX, EBX, EDX
	;;
	;; A count that is a multiple of four takes the quad path (one dword
	;; store per four pixels).  Any other count sends every pixel of the
	;; row through the single-pixel path.
	test	ecx,3
	jz	.quad

.single:
	mov	edx,[esi]		; dl = blue, dh = green
	add	esi,4

	mov	ebx,edx
	shr	ebx,16			; bl = red

	shr	dl,6			; top 2 blue bits  -> bits 0..1
	shr	dh,3			; top 3 green bits -> bits 2..4

	and	bl,11100000b		; top 3 red bits stay in bits 5..7
	and	dh,00011100b

	or	dl,bl
	or	dl,dh			; dl = rrrgggbb

	mov	[edi],dl
	add	edi,1

	sub	ecx,1
	jnz	.single

	jmp	globalreturn

.quad:
	;; Pixel 0 -> AL (MOVZX also clears the rest of EAX).
	mov	edx,[esi]
	mov	ebx,edx
	shr	ebx,16
	shr	dl,6
	shr	dh,3
	and	bl,11100000b
	and	dh,00011100b
	or	dl,bl
	or	dl,dh
	movzx	eax,dl

	;; Pixel 1 -> AH.
	mov	edx,[esi+4]
	mov	ebx,edx
	shr	ebx,16
	shr	dl,6
	shr	dh,3
	and	bl,11100000b
	and	dh,00011100b
	or	dl,bl
	or	dl,dh
	mov	ah,dl

	shl	eax,16			; park pixels 0/1 in the high word

	;; Pixel 2 -> AL.
	mov	edx,[esi+8]
	mov	ebx,edx
	shr	ebx,16
	shr	dl,6
	shr	dh,3
	and	bl,11100000b
	and	dh,00011100b
	or	dl,bl
	or	dl,dh
	mov	al,dl

	;; Pixel 3 -> AH.
	mov	edx,[esi+12]
	mov	ebx,edx
	shr	ebx,16
	shr	dl,6
	shr	dh,3
	and	bl,11100000b
	and	dh,00011100b
	or	dl,bl
	or	dl,dh
	mov	ah,dl

	ror	eax,16			; swap halves: bytes now p0,p1,p2,p3

	mov	[edi],eax		; four pixels in a single store
	add	esi,16
	add	edi,4

	sub	ecx,4
	jnz	near .quad		; NEAR form: does not depend on the
					; loop body fitting a short branch

	jmp	globalreturn



;; ConvertX86:	 
;; EAX = ConverterInfo*
;; --------------------------------------------------------------------------
;; ConverterInfo (ebp+..)
;;   0:	void *s_pixels
;;   4:	int s_width
;;   8:	int s_height
;;  12:	int s_add
;;  16:	void *d_pixels
;;  20:	int d_width
;;  24:	int d_height
;;  28:	int d_add
;;  32:	void (*converter_function)() 
ConvertX86:
	;; Row driver: runs the converter stored at offset 32 of the
	;; ConverterInfo once per destination row.
	;;
	;;   In:  EAX = ConverterInfo*
	;;   Per row the converter is entered by JMP (not CALL) with
	;;        ESI = current source position
	;;        EDI = current destination position
	;;        ECX = s_width (pixels per row)
	;;   and must finish with "jmp globalreturn".
	;;
	;; Side effects visible in this routine:
	;;   * d_height inside the ConverterInfo is used as the row counter
	;;     and is counted down in place; it is 0 when the loop finishes.
	;;   * ESI, EDI and ECX are overwritten here and are not restored;
	;;     only EBP is saved.  The converters are documented as destroying
	;;     EAX, EBX and EDX as well.
	;;     NOTE(review): the caller is presumably a wrapper that treats
	;;     all of these as clobbered - confirm.
	;;
	;; Nothing is done when s_width or d_height is zero or negative.
	;; Without the height check a zero d_height would wrap to 0xFFFFFFFF
	;; on the first DEC below and the loop would run far past the buffers.

	cmp	dword [eax+4],0		; s_width <= 0 ?
	jle	endconvert

	cmp	dword [eax+24],0	; d_height <= 0 ?
	jle	endconvert

	push	ebp
	mov	ebp,eax			; ebp = ConverterInfo* for the whole loop

	mov	esi,[ebp+0]		; s_pixels
	mov	edi,[ebp+16]		; d_pixels

y_loop:
	mov	ecx,[ebp+4]		; pixels to convert in this row

	jmp	[ebp+32]		; enter converter; it returns through
					; the globalreturn label below

globalreturn:
	add	esi,[ebp+12]		; s_add: step source to the next row
	add	edi,[ebp+28]		; d_add: step dest to the next row

	dec	dword [ebp+24]		; one row done (counts d_height down)
	jnz	y_loop

	pop	ebp

endconvert:
	ret