;
; x86 format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
; This source code is licensed under the GNU LGPL
; 
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions		
; 
; Some routines are (c) Glenn Fiedler (ptc@gaffer.org)
;

BITS 32

GLOBAL _ConvertI8_32
GLOBAL _ConvertI8_24
GLOBAL _ConvertI8_16
GLOBAL _ConvertI8_INDEX8

GLOBAL _ConvertI8_SetLookup
	
EXTERN _ConvertX86
EXTERN _x86return

SECTION .data

store_ecx dd 0			; This turned out to be faster than push/pop
x86lookup dd 0
			
SECTION .text


;; Convert_*
;; Parameters:
;;   ESI = source 
;;   EDI = dest
;;   ECX = amount (NOT 0!!! (the ConvertX86 routine checks for that though))
;; Destroys:
;;   EAX, EBX, EDX

;; _ConvertI8_32
;; Expands 8-bit indexed pixels to 32-bit pixels, one dword table entry
;; per source byte.
;; In:       ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Uses:     the table pointer stored in x86lookup
;; Destroys: EAX, EBX, EDX (ECX reaches 0, ESI/EDI are advanced)
;; Leaves by jumping to _x86return.
_ConvertI8_32:
	mov edx,[x86lookup]		; edx = base of the dword lookup table

.next_pixel:
	movzx ebx,byte [esi]		; ebx = zero-extended palette index
	inc esi

	mov eax,[edx+ebx*4]		; fetch the converted pixel
	mov [edi],eax
	add edi,4

	dec ecx
	jnz .next_pixel

	jmp _x86return
	

	
;; Convert one indexed pixel to three destination bytes.
;; Expects EBX = lookup table base and EDX = 0 in its upper 24 bits.
;; Advances ESI by 1 and EDI by 3; overwrites EAX and DL.
;; The caller is responsible for counting the pixel.
%macro I8_24_STORE_PIXEL 0
	mov dl,[esi]			; palette index
	mov eax,[ebx+edx*4]		; lookup entry
	mov [edi+0],al			; blue
	mov [edi+1],ah			; green
	shr eax,16
	mov [edi+2],al			; red
	inc esi
	add edi,3
%endmacro

;; _ConvertI8_24
;; Expands 8-bit indexed pixels to packed 24-bit pixels via the table
;; pointed to by x86lookup.
;; In:       ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Destroys: EAX, EBX, EDX (EBP is saved and restored on the long path)
;; Runs of up to 32 pixels are converted one at a time.  Longer runs
;; convert single pixels until EDI is a multiple of 4, then four pixels
;; per iteration as three dword stores, then any leftover pixels singly.
;; Leaves by jumping to _x86return.
_ConvertI8_24:
	mov ebx,[x86lookup]		; ebx = lookup table base
	xor edx,edx			; only dl is written from here on

	cmp ecx,32
	ja .align_dest			; long run: take the unrolled path

.short_loop:
	I8_24_STORE_PIXEL
	dec ecx
	jnz .short_loop
	jmp _x86return

.align_dest:				; head: single pixels until edi % 4 == 0
	test edi,11b
	jz .aligned
	I8_24_STORE_PIXEL
	dec ecx
	jmp .align_dest

.aligned:
	push ebp			; ebp takes over as table base below
	mov ebp,ebx

	push ecx			; remember the count for the tail

	shr ecx,2			; number of 4-pixel groups

.quad_loop:
	push ecx			; ecx doubles as scratch inside the group
	mov dl,[esi]			; index to "A"

	mov eax,[ebp+edx*4]		; eax = [xx][A2][A1][A0]
	shl eax,8			; eax = [A2][A1][A0][xx]

	mov dl,[esi+1]			; index to "B"

	mov al,[ebp+edx*4+0]		; eax = [A2][A1][A0][B0]
	ror eax,8			; eax = [B0][A2][A1][A0] (done)
	mov [edi],eax

	mov eax,[ebp+edx*4]		; eax = [xx][B2][B1][B0]
	shl eax,8			; eax = [B2][B1][B0][xx]

	mov dl,[esi+3]			; index to "D"

	mov ecx,[ebp+edx*4]		; ecx = [xx][D2][D1][D0]
	shl ecx,8			; ecx = [D2][D1][D0][xx]

	mov dl,[esi+2]			; index to "C"

	mov ah,[ebp+edx*4+1]		; eax = [B2][B1][C1][xx]
	mov al,[ebp+edx*4+0]		; eax = [B2][B1][C1][C0]
	ror eax,16			; eax = [C1][C0][B2][B1] (done)

	mov cl,[ebp+edx*4+2]		; ecx = [D2][D1][D0][C2] (done)

	mov [edi+4],eax
	mov [edi+8],ecx

	add esi,4
	add edi,3*4

	pop ecx				; group counter back

	dec ecx
	jnz .quad_loop

	pop ecx				; tail: count modulo 4 pixels remain
	and ecx,11b
	jz .done

.tail_loop:
	I8_24_STORE_PIXEL
	dec ecx
	jnz .tail_loop

.done:
	pop ebp
	jmp _x86return

	
	
			
;; _ConvertI8_16
;; Converts 8-bit indexed pixels to 16-bit pixels via the dword table
;; pointed to by x86lookup.
;; In:       ESI = source, EDI = dest, ECX = pixel count (must not be 0)
;; Destroys: EAX, EBX, EDX, and the scratch dword store_ecx
;; On exit ESI and EDI point just past the converted pixels.
;; Leaves by jumping to _x86return.
;; NOTE(review): the paired loop ORs two table entries into one dword,
;; which presumably relies on the upper 16 bits of every entry being
;; zero -- confirm against the code that builds the table.
_ConvertI8_16:
	xor ebx,ebx			; only bl is written in the loops below
	mov edx,[x86lookup]		; edx = lookup table base

	test edi,3			; Check if the destination is unaligned mod 4
	jz .L_ALIGNED

	; Convert a single pixel first so the dword stores below land on
	; a 4-byte boundary (when the destination was 2-byte aligned).
	mov bl,[esi]
	inc esi				; consume the source pixel (previously
					; missing: the pixel was converted twice
					; and the last pixel was never written)
	mov eax,[edx+ebx*4]

	mov [edi],ax
	add edi,2			; step past the pixel just written

	dec ecx				; done last so ZF reflects the pixels left
	jz .L3				; if we only had one pixel

.L_ALIGNED:
	mov [store_ecx],ecx		; keep the full count for the odd-pixel test

	and ecx,0fffffffeh		; ecx = pixels that can be done in pairs
	jz .L2

.L1:
	mov bl,[esi+1]			; second pixel of the pair

	mov eax,[edx+ebx*4]

	shl eax,16			; goes into the upper half of the dword
	mov bl,[esi]			; first pixel of the pair

	or eax,[edx+ebx*4]		; lower half
	add esi,2

	mov [edi],eax			; two pixels in one store
	add edi,4

	sub ecx,2
	jnz .L1

.L2:
	test dword [store_ecx],1	; odd count -> one pixel left over
	jz .L3

	movzx eax,byte [esi]		; draw the remaining pixel, no need to be
					; superfast
	mov ebx,[edx+eax*4]

	mov [edi],bx			; better than two byte moves, according to
					; intel's docs
	inc esi

	add edi,2

.L3:
	jmp _x86return

			

;; _ConvertI8_INDEX8
;; Indexed to indexed: copies ECX bytes from ESI to EDI unchanged,
;; whole dwords first and the remaining 0-3 bytes one at a time.
;; In:       ESI = source, EDI = dest, ECX = pixel count
;; Destroys: EDX (holds the original count), AL when a tail exists
;; NOTE(review): rep movsd copies upward only if the direction flag is
;; clear; nothing here sets it, so that is assumed of the caller.
;; Leaves by jumping to _x86return.
_ConvertI8_INDEX8:
	mov edx,ecx			; keep the full count for the tail

	shr ecx,2			; ecx = number of whole 4-pixel blocks
	jz .tail			; fewer than 4 pixels in total

	rep movsd

.tail:
	mov ecx,edx
	and ecx,3			; pixels left after the dword copy
	jz .done			; count was a multiple of 4

.copy_byte:
	mov al,[esi]
	inc esi

	mov [edi],al
	inc edi

	dec ecx
	jnz .copy_byte

.done:
	jmp _x86return



;; _ConvertI8_SetLookup
;; Stores its single stack argument (the dword just above the return
;; address) in x86lookup, where the _ConvertI8_* routines read their
;; table pointer from.
;; Returns with RET; EAX holds the stored value.
_ConvertI8_SetLookup:
	mov eax,[esp+4]			; first argument, no frame needed
	mov [x86lookup],eax

	ret
			
	