
#include <stdlib.h>
#include <string.h>
#include <vga.h>

#include "mylib.h"
/* TODO: decide the fate of USE_ASM; it is left undefined, so the portable C path is compiled. */
/* #define USE_ASM */

/*
 * Compute (m1 * m2) / d using a 64-bit intermediate product, so the
 * multiplication cannot overflow even when m1 * m2 exceeds 32 bits.
 *
 * The quotient is truncated toward zero. The caller must ensure that
 * d is non-zero and that the quotient fits in an int.
 *
 * Written in portable C instead of x86 inline assembly: the previous asm
 * named eax/edx both as operands and as clobbers, which current GCC rejects.
 */
static inline int muldiv64(int m1, int m2, int d)
{
    long long product = (long long)m1 * (long long)m2;

    return (int)(product / d);
}

/*
 * Fill one destination row of w2 pixels by nearest-neighbour sampling of
 * the source row src. xfactor is the source step per destination pixel in
 * 16.16 fixed point.
 */
static void my_scale_bitmap_row(unsigned char *dst, const unsigned char *src,
                                int w2, int xfactor)
{
    int sx = 0;
    int x;

    for (x = 0; x < w2; x++) {
        dst[x] = src[sx >> 16];
        sx += xfactor;
    }
}

/*
 * Create a new bitmap of w2 x h2 pixels holding a nearest-neighbour scaled
 * copy of _dp1.
 *
 * The pixel data is addressed as one byte per pixel with rows stored back
 * to back (row stride equal to the bitmap width).
 *
 * If _dp1 is not of type BITMAP_MEMORY it is first copied into a temporary
 * bitmap with my_put_bitmap(); that temporary is destroyed before returning.
 *
 * Returns the new bitmap, or NULL when _dp1 is NULL, when either the
 * requested or the source dimensions are not positive, or when
 * my_create_bitmap() returns NULL. The source bitmap is never modified.
 * NOTE(review): the caller presumably releases the result with
 * my_destroy_bitmap() -- confirm against mylib.h.
 */
BITMAP * my_scale_bitmap( BITMAP *_dp1, int w2, int h2)
{
    unsigned char *dp1;
    unsigned char *dp2;
    BITMAP *n;
    BITMAP *u = NULL;
    int xfactor;
    int yfactor;
    int w1, h1;
    int y, sy;

    if (_dp1 == NULL || w2 <= 0 || h2 <= 0)
        return NULL;
    if (_dp1->w <= 0 || _dp1->h <= 0)
        return NULL;

    /* Work from a memory copy when the source is not a memory bitmap. */
    if (_dp1->type != BITMAP_MEMORY) {
        u = my_create_bitmap(_dp1->w, _dp1->h);
        if (u == NULL)
            return NULL;
        my_put_bitmap(u, _dp1, 0, 0);
        _dp1 = u;
    }

    n = my_create_bitmap(w2, h2);
    if (n == NULL) {
        /* Do not leak the temporary copy on failure. */
        if (u != NULL)
            my_destroy_bitmap(u);
        return NULL;
    }

    w1 = _dp1->w;
    h1 = _dp1->h;

    /* Source step per destination pixel/row, in 16.16 fixed point. */
    xfactor = muldiv64(w1, 65536, w2);
    yfactor = muldiv64(h1, 65536, h2);

    dp1 = _dp1->data;
    dp2 = n->data;
    sy = 0;
    y = 0;
    while (y < h2) {
        /* Sample a fresh destination row from the current source row. */
        my_scale_bitmap_row(dp2, dp1, w2, xfactor);
        dp2 += w2;
        y++;

        /*
         * While the following destination rows still map to the same source
         * row, duplicate the row just written instead of resampling it.
         */
        while (y < h2) {
            int src_row = sy >> 16;

            sy += yfactor;
            if ((sy >> 16) != src_row)
                break;
            /* Adjacent rows never overlap, so memcpy is safe here. */
            memcpy(dp2, dp2 - w2, (size_t)w2);
            dp2 += w2;
            y++;
        }

        dp1 = _dp1->data + (sy >> 16) * w1;
    }

    if (u != NULL)
        my_destroy_bitmap(u);
    return n;
}
