/** This file ADDED FROM CVS 2004-04-24 **/
/***************************************************************
 * FBlend Library,
 *  Copyright (c) Robert J Ohannessian, 2002
 *
 * See the accompanying readme.txt and license.txt file for
 * details.
 *
 * Fast RGBA translucency routines for Allegro
 *
 * This function is intended to replace the awfully slow RGBA trans
 * blenders in Allegro.
 */

/** \file rgbatran.c
 */

#include <stdint.h>

#include "allegro.h"
#include "fblend.h"
#include "mmx.h"
#include "sse.h"

#undef FBLEND_MMX
#undef FBLEND_SSE


static void fblend_rgba_trans_32(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_24(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_16(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_15(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_24_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_16_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
static void fblend_rgba_trans_15_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);

#ifdef FBLEND_MMX
extern void fblend_rgba_trans_mmx_32(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_mmx_16(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_mmx_15(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_mmx_16_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_mmx_15_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
#endif
#ifdef FBLEND_SSE
extern void fblend_rgba_trans_sse_32(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_sse_16(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_sse_15(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_sse_16_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
extern void fblend_rgba_trans_sse_15_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h);
#endif



/* void fblend_rgba_trans(BITMAP *src, BITMAP *dst, int x, int y, int unused) */
/** \ingroup bitmap_blenders
 *  Draws the source bitmap into the destination bitmap at coordinates
 *  (x,y), using translucency with source alpha. The source bitmap must be
 *  in 32-bpp, and the destination must be in 15, 16, 24 or 32-bpp.
 *
 * <pre>
 *  dest_red = (src_red * src_alpha / 255) + (dest_red * (255 - src_alpha) / 255)
 * </pre>
 *  Repeat for green and blue, and for all pixels to be displayed.
 *
 *  The 32 bpp mask color is also taken in consideration, so those
 *  pixels aren't blended at all, just like in Allegro.
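 *  MMX and SSE versions are used when FBLEND_MMX / FBLEND_SSE are defined
 *  and the CPU reports the capability. This file currently #undefs both
 *  macros, so only the plain C routines are compiled in.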
 *
 *  \note Drawing into non-memory bitmaps is usually very slow due to system
 *        architecture.
 *
 *  \param src  The source bitmap. Must be a linear memory bitmap or
 *              sub-bitmap thereof, and must be in 32 bpp.
 *  \param dst  The destination bitmap. Must be in 15, 16, 24 or 32 bpp
 *              and linear, but not necessarily a memory bitmap.
 *  \param x    Destination coordinate on the x axis.
 *  \param y    Destination coordinate on the y axis.
 *  \param unused Parameter is only there for consistency with the other
 *               blenders.
 */
void fblend_rgba_trans(BITMAP *src, BITMAP *dst, int x, int y, int unused) {
	/* The whole source bitmap is blended; its size defines the rectangle. */
	const int full_w = src->w;
	const int full_h = src->h;

	(void)unused;

	/* dst is passed twice: once as the background operand at (x, y) and
	 * once as the bitmap that receives the result at the same position. */
	fblend_rgba_trans_3(src, 0, 0, dst, x, y, dst, x, y, full_w, full_h, 0);
}

/*
 * CLIP(x, y, dx, dy, w, h, bmp)
 *
 * Clips the rectangle whose top-left corner is (x, y) and whose size is
 * w by h against the clip box of bmp (members cl, ct, cr, cb).
 *
 *  - When the rectangle starts left of cl / above ct, x / y are moved onto
 *    the edge, w / h shrink by the same amount, and that amount is added
 *    to dx / dy so the caller can shift its other coordinates to match.
 *  - When the rectangle reaches or passes cr / cb, w / h are cut back so it
 *    ends at that edge. w and h may end up zero or negative; the caller
 *    must check for that.
 *
 * x, y, dx, dy, w and h must be modifiable lvalues. Every argument is
 * evaluated more than once, so do not pass expressions with side effects.
 * Arguments are parenthesized and the body is wrapped in do { } while (0)
 * so the macro behaves as one statement (safe in an unbraced if/else) and
 * accepts expressions such as &box for bmp.
 */
#define CLIP(x, y, dx, dy, w, h, bmp) do {   \
	if ((x) < (bmp)->cl) {                   \
		(w) += (x) - (bmp)->cl;              \
		(dx) -= (x) - (bmp)->cl;             \
		(x) = (bmp)->cl;                     \
	}                                        \
	if ((y) < (bmp)->ct) {                   \
		(h) += (y) - (bmp)->ct;              \
		(dy) -= (y) - (bmp)->ct;             \
		(y) = (bmp)->ct;                     \
	}                                        \
	if ((x) + (w) >= (bmp)->cr)              \
		(w) -= (x) + (w) - (bmp)->cr;        \
	if ((y) + (h) >= (bmp)->cb)              \
		(h) -= (y) + (h) - (bmp)->cb;        \
} while (0)

void fblend_rgba_trans_3(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h, int unused) {

	/* Signature shared by every depth-specific blender this function can pick. */
	typedef void (*blend_fn)(BITMAP *, int, int, BITMAP *, int, int, BITMAP *, int, int, int, int);

	blend_fn blend;
	int depth_a, depth_b, depth_out;
	int shift_x = 0, shift_y = 0;

	(void)unused;

	/* Clip against dst, then src1, then src2. After each step the offset
	 * reported by CLIP is applied to the two rectangles that were not
	 * clipped, so all three stay aligned. */
	CLIP(dx, dy, shift_x, shift_y, w, h, dst);
	s1x += shift_x; s1y += shift_y;
	s2x += shift_x; s2y += shift_y;
	shift_x = 0; shift_y = 0;

	CLIP(s1x, s1y, shift_x, shift_y, w, h, src1);
	s2x += shift_x; s2y += shift_y;
	dx += shift_x;  dy += shift_y;
	shift_x = 0; shift_y = 0;

	CLIP(s2x, s2y, shift_x, shift_y, w, h, src2);
	s1x += shift_x; s1y += shift_y;
	dx += shift_x;  dy += shift_y;

	/* Clipped away entirely */
	if (w < 1 || h < 1)
		return;

	depth_a = bitmap_color_depth(src1);
	depth_b = bitmap_color_depth(src2);
	depth_out = bitmap_color_depth(dst);

	/* src1 must be 32 bpp; src2 and dst must share one depth */
	if (depth_a != 32 || depth_b != depth_out)
		return;

	/* Pick the blender. The plain C routine is the default; an MMX routine
	 * replaces it when available, and an SSE routine replaces that in turn,
	 * giving SSE > MMX > C priority. For 24/16/15 bpp the _rev variants
	 * are chosen when the 32-bpp blue or red shift is smaller than the
	 * corresponding shift of the destination depth. Any other depth is
	 * rejected. */
	switch (depth_out) {

		case 32:
			blend = fblend_rgba_trans_32;
			#ifdef FBLEND_MMX
				if (cpu_capabilities & CPU_MMX)
					blend = fblend_rgba_trans_mmx_32;
			#endif
			#ifdef FBLEND_SSE
				if (cpu_capabilities & CPU_SSE)
					blend = fblend_rgba_trans_sse_32;
			#endif
			break;

		case 24:
			if (_rgb_b_shift_32 < _rgb_b_shift_24 || _rgb_r_shift_32 < _rgb_r_shift_24)
				blend = fblend_rgba_trans_24_rev;
			else
				blend = fblend_rgba_trans_24;
			break;

		case 16:
			if (_rgb_b_shift_32 < _rgb_b_shift_16 || _rgb_r_shift_32 < _rgb_r_shift_16) {
				blend = fblend_rgba_trans_16_rev;
				#ifdef FBLEND_MMX
					if (cpu_capabilities & CPU_MMX)
						blend = fblend_rgba_trans_mmx_16_rev;
				#endif
				#ifdef FBLEND_SSE
					if (cpu_capabilities & CPU_SSE)
						blend = fblend_rgba_trans_sse_16_rev;
				#endif
			}
			else {
				blend = fblend_rgba_trans_16;
				#ifdef FBLEND_MMX
					if (cpu_capabilities & CPU_MMX)
						blend = fblend_rgba_trans_mmx_16;
				#endif
				#ifdef FBLEND_SSE
					if (cpu_capabilities & CPU_SSE)
						blend = fblend_rgba_trans_sse_16;
				#endif
			}
			break;

		case 15:
			if (_rgb_b_shift_32 < _rgb_b_shift_15 || _rgb_r_shift_32 < _rgb_r_shift_15) {
				blend = fblend_rgba_trans_15_rev;
				#ifdef FBLEND_MMX
					if (cpu_capabilities & CPU_MMX)
						blend = fblend_rgba_trans_mmx_15_rev;
				#endif
				#ifdef FBLEND_SSE
					if (cpu_capabilities & CPU_SSE)
						blend = fblend_rgba_trans_sse_15_rev;
				#endif
			}
			else {
				blend = fblend_rgba_trans_15;
				#ifdef FBLEND_MMX
					if (cpu_capabilities & CPU_MMX)
						blend = fblend_rgba_trans_mmx_15;
				#endif
				#ifdef FBLEND_SSE
					if (cpu_capabilities & CPU_SSE)
						blend = fblend_rgba_trans_sse_15;
				#endif
			}
			break;

		default:
			return;
	}

	/* Run the chosen blender with dst acquired, then release it */
	acquire_bitmap(dst);
	blend(src1, s1x, s1y, src2, s2x, s2y, dst, dx, dy, w, h);
	bmp_unwrite_line(dst);
	release_bitmap(dst);
}


/*
 * Plain C blender for a 32-bpp background/destination.
 *
 * For every pixel of the w by h rectangle, the pixel read from src1 is the
 * foreground and also supplies the blend factor (its top 8 bits); the pixel
 * read from src2 is the background. Each 8-bit channel of the value written
 * to dst is
 *     background + (((foreground - background) * factor) >> 8).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * which makes the factor 0 and so reproduces the background unchanged.
 *
 * Pixels are addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here; the rectangle must already lie inside all three
 * bitmaps.
 */
static void fblend_rgba_trans_32(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1, *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (uint32_t *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(uint32_t));
		d  = (uint32_t *)(bmp_write_line(dst, dy + j) + dx * sizeof(uint32_t));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written,
			 * because src2 and dst may be the same bitmap. */
			fg = *s1;
			bg = *s2;

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor */
			fact = (fg >> 24) & 0xFF;

			/* Channels at bits 0-7 and 16-23 are blended together in one
			 * multiply; unsigned wrap-around keeps the masked bits right. */
			bg_outer = bg & 0xFF00FF;
			outer = (fg & 0xFF00FF) - bg_outer;
			outer = (((outer * fact) >> 8) + bg_outer) & 0xFF00FF;

			/* Channel at bits 8-15 */
			bg_mid = bg & 0xFF00;
			mid = (fg & 0xFF00) - bg_mid;
			mid = (((mid * fact) >> 8) + bg_mid) & 0xFF00;

			/* Write the data */
			s1++;
			s2++;
			bmp_write32((uintptr_t)d, outer | mid);
			d++;
		}
	}

	return;
}

/*
 * Plain C blender for a 32-bpp background/destination, with the two outer
 * foreground channels (bits 0-7 and bits 16-23) swapped before blending.
 *
 * Apart from that swap it works like fblend_rgba_trans_32: the src1 pixel
 * is the foreground and supplies the factor in its top 8 bits, the src2
 * pixel is the background, and each channel written to dst is
 *     background + (((foreground - background) * factor) >> 8).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * NOTE(review): the 32-bpp branch of fblend_rgba_trans_3 in this file never
 * selects this routine, so it appears to be unused.
 *
 * Pixels are addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here.
 */
static void fblend_rgba_trans_32_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1, *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (uint32_t *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(uint32_t));
		d  = (uint32_t *)(bmp_write_line(dst, dy + j) + dx * sizeof(uint32_t));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = *s2;

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor */
			fact = (fg >> 24) & 0xFF;

			/* Swap the foreground's bits 16-23 and bits 0-7, then blend
			 * both outer channels in one multiply. */
			bg_outer = bg & 0xFF00FF;
			outer = (((fg & 0xFF0000) >> 16) | ((fg & 0xFF) << 16)) - bg_outer;
			outer = (((outer * fact) >> 8) + bg_outer) & 0xFF00FF;

			/* Channel at bits 8-15 keeps its position */
			bg_mid = bg & 0xFF00;
			mid = (fg & 0xFF00) - bg_mid;
			mid = (((mid * fact) >> 8) + bg_mid) & 0xFF00;

			/* Write the data */
			s1++;
			s2++;
			bmp_write32((uintptr_t)d, outer | mid);
			d++;
		}
	}

	return;
}
 

/*
 * Plain C blender for a 24-bpp background/destination.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1 and supplies the
 * blend factor in its top 8 bits. The background pixel is assembled from
 * three consecutive bytes of src2 (first byte in bits 0-7). Each 8-bit
 * channel written to dst is
 *     background + (((foreground - background) * factor) >> 8).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here; the rectangle must already lie inside all three
 * bitmaps.
 */
static void fblend_rgba_trans_24(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned char *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned char *)(bmp_read_line(src2, s2y + j) + s2x * 3 * sizeof(char));
		d  = (unsigned char *)(bmp_write_line(dst, dy + j) + dx * 3 * sizeof(char));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = (uint32_t)s2[0] | ((uint32_t)s2[1] << 8) | ((uint32_t)s2[2] << 16);

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor */
			fact = (fg >> 24) & 0xFF;

			/* Channels at bits 0-7 and 16-23, blended in one multiply */
			bg_outer = bg & 0xFF00FF;
			outer = (fg & 0xFF00FF) - bg_outer;
			outer = (((outer * fact) >> 8) + bg_outer) & 0xFF00FF;

			/* Channel at bits 8-15 */
			bg_mid = bg & 0xFF00;
			mid = (fg & 0xFF00) - bg_mid;
			mid = (((mid * fact) >> 8) + bg_mid) & 0xFF00;

			/* Write the data */
			s1++;
			s2 += 3;
			bmp_write24((uintptr_t)d, outer | mid);
			d += 3;
		}
	}

	return;
}

/*
 * Plain C blender for a 24-bpp background/destination, with the two outer
 * foreground channels (bits 0-7 and bits 16-23) swapped before blending.
 * fblend_rgba_trans_3 selects it when the 32-bpp blue or red shift is
 * smaller than the corresponding 24-bpp shift.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1 and supplies the
 * blend factor in its top 8 bits. The background pixel is assembled from
 * three consecutive bytes of src2 (first byte in bits 0-7). Each 8-bit
 * channel written to dst is
 *     background + (((foreground - background) * factor) >> 8).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here.
 */
static void fblend_rgba_trans_24_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned char *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned char *)(bmp_read_line(src2, s2y + j) + s2x * 3 * sizeof(char));
		d  = (unsigned char *)(bmp_write_line(dst, dy + j) + dx * 3 * sizeof(char));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = (uint32_t)s2[0] | ((uint32_t)s2[1] << 8) | ((uint32_t)s2[2] << 16);

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor */
			fact = (fg >> 24) & 0xFF;

			/* Swap the foreground's bits 16-23 and bits 0-7, then blend
			 * both outer channels in one multiply. */
			bg_outer = bg & 0xFF00FF;
			outer = (((fg & 0xFF0000) >> 16) | ((fg & 0xFF) << 16)) - bg_outer;
			outer = (((outer * fact) >> 8) + bg_outer) & 0xFF00FF;

			/* Channel at bits 8-15 keeps its position */
			bg_mid = bg & 0xFF00;
			mid = (fg & 0xFF00) - bg_mid;
			mid = (((mid * fact) >> 8) + bg_mid) & 0xFF00;

			/* Write the data */
			s1++;
			s2 += 3;
			bmp_write24((uintptr_t)d, outer | mid);
			d += 3;
		}
	}

	return;
}


/*
 * Plain C blender for a 16-bpp background/destination.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1. Its top 5 bits
 * are the blend factor, and its three 8-bit channels are reduced to the
 * 5/6/5-bit fields of a 16-bit pixel (bits 11-15, 5-10 and 0-4) without
 * changing their order. The background pixel is read from src2. Each field
 * written to dst is
 *     background + (((foreground - background) * factor) >> 5).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here; the rectangle must already lie inside all three
 * bitmaps.
 */
static void fblend_rgba_trans_16(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned short *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned short *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(short));
		d  = (unsigned short *)(bmp_write_line(dst, dy + j) + dx * sizeof(short));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = *s2;

			/* A masked foreground pixel becomes 0, so its factor is 0 and
			 * the background passes through unchanged. */
			if ((fg & 0xFFFFFF) == MASK_COLOR_32)
				fg = 0;

			/* Extract the alpha factor (top 5 bits) */
			fact = (fg >> 27) & 0x1F;

			/* 5-bit fields at bits 11-15 and 0-4, blended in one multiply */
			bg_outer = bg & 0xF81F;
			outer = (((fg >> 8) & 0xF800) | ((fg >> 3) & 0x1F)) - bg_outer;
			outer = (((outer * fact) >> 5) + bg_outer) & 0xF81F;

			/* 6-bit field at bits 5-10 */
			bg_mid = bg & 0x7E0;
			mid = ((fg >> 5) & 0x7E0) - bg_mid;
			mid = (((mid * fact) >> 5) + bg_mid) & 0x7E0;

			/* Write the data */
			s1++;
			s2++;
			bmp_write16((uintptr_t)d, outer | mid);
			d++;
		}
	}
	return;
}

/*
 * Plain C blender for a 16-bpp background/destination, with the two outer
 * foreground channels exchanged during conversion: foreground bits 3-7 go
 * to bits 11-15 of the 16-bit pixel and foreground bits 19-23 go to bits
 * 0-4. fblend_rgba_trans_3 selects it when the 32-bpp blue or red shift is
 * smaller than the corresponding 16-bpp shift.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1; its top 5 bits
 * are the blend factor. The background pixel is read from src2. Each field
 * written to dst is
 *     background + (((foreground - background) * factor) >> 5).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here.
 */
static void fblend_rgba_trans_16_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned short *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned short *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(short));
		d  = (unsigned short *)(bmp_write_line(dst, dy + j) + dx * sizeof(short));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = *s2;

			/* A masked foreground pixel becomes 0, so its factor is 0 and
			 * the background passes through unchanged. */
			if ((fg & 0xFFFFFF) == MASK_COLOR_32)
				fg = 0;

			/* Extract the alpha factor (top 5 bits) */
			fact = (fg >> 27) & 0x1F;

			/* Outer 5-bit fields, taken from the foreground in swapped
			 * order and blended in one multiply. */
			bg_outer = bg & 0xF81F;
			outer = (((fg << 8) & 0xF800) | ((fg >> 19) & 0x1F)) - bg_outer;
			outer = (((outer * fact) >> 5) + bg_outer) & 0xF81F;

			/* 6-bit field at bits 5-10 */
			bg_mid = bg & 0x7E0;
			mid = ((fg >> 5) & 0x7E0) - bg_mid;
			mid = (((mid * fact) >> 5) + bg_mid) & 0x7E0;

			/* Write the data */
			s1++;
			s2++;
			bmp_write16((uintptr_t)d, outer | mid);
			d++;
		}
	}
	return;
}


/*
 * Plain C blender for a 15-bpp background/destination.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1. Its top 5 bits
 * are the blend factor, and its three 8-bit channels are reduced to the
 * three 5-bit fields of a 15-bit pixel (bits 10-14, 5-9 and 0-4) without
 * changing their order. The background pixel is read from src2. Each field
 * written to dst is
 *     background + (((foreground - background) * factor) >> 5).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here; the rectangle must already lie inside all three
 * bitmaps.
 */
static void fblend_rgba_trans_15(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned short *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned short *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(short));
		d  = (unsigned short *)(bmp_write_line(dst, dy + j) + dx * sizeof(short));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = *s2;

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor (top 5 bits) */
			fact = (fg >> 27) & 0x1F;

			/* 5-bit fields at bits 10-14 and 0-4, blended in one multiply */
			bg_outer = bg & 0x7C1F;
			outer = (((fg >> 9) & 0x7C00) | ((fg >> 3) & 0x1F)) - bg_outer;
			outer = (((outer * fact) >> 5) + bg_outer) & 0x7C1F;

			/* 5-bit field at bits 5-9 */
			bg_mid = bg & 0x3E0;
			mid = ((fg >> 6) & 0x3E0) - bg_mid;
			mid = (((mid * fact) >> 5) + bg_mid) & 0x3E0;

			/* Write the data */
			s1++;
			s2++;
			bmp_write16((uintptr_t)d, outer | mid);
			d++;
		}
	}
	return;
}


/*
 * Plain C blender for a 15-bpp background/destination, with the two outer
 * foreground channels exchanged during conversion: foreground bits 3-7 go
 * to bits 10-14 of the 15-bit pixel and foreground bits 19-23 go to bits
 * 0-4. fblend_rgba_trans_3 selects it when the 32-bpp blue or red shift is
 * smaller than the corresponding 15-bpp shift.
 *
 * The foreground pixel comes from the 32-bpp bitmap src1; its top 5 bits
 * are the blend factor. The background pixel is read from src2. Each field
 * written to dst is
 *     background + (((foreground - background) * factor) >> 5).
 * A foreground pixel whose low 24 bits equal MASK_COLOR_32 is replaced by 0,
 * leaving the background unchanged.
 *
 * src1 is addressed as uint32_t. The earlier unsigned long / sizeof(long)
 * addressing used 8-byte strides and loads where long is 64 bits wide.
 *
 * No clipping is done here.
 */
static void fblend_rgba_trans_15_rev(BITMAP *src1, int s1x, int s1y, BITMAP *src2, int s2x, int s2y, BITMAP *dst, int dx, int dy, int w, int h) {

	int i, j;

	for (j = 0; j < h; j++) {

		uint32_t *s1;
		unsigned short *s2, *d;

		/* Locate the start of this row in each bitmap */
		bmp_select(dst);
		s1 = (uint32_t *)(src1->line[s1y + j] + s1x * sizeof(uint32_t));
		s2 = (unsigned short *)(bmp_read_line(src2, s2y + j) + s2x * sizeof(short));
		d  = (unsigned short *)(bmp_write_line(dst, dy + j) + dx * sizeof(short));

		for (i = 0; i < w; i++) {
			uint32_t fg, bg;
			uint32_t keep, fact;
			uint32_t outer, mid, bg_outer, bg_mid;

			/* Read one pixel from each input before anything is written */
			fg = *s1;
			bg = *s2;

			/* Branch-free mask test: keep is all ones unless the color
			 * part of fg equals the mask color, in which case it is 0. */
			keep = (uint32_t)0 - (uint32_t)((fg & 0xFFFFFF) != MASK_COLOR_32);
			fg &= keep;

			/* Extract the alpha factor (top 5 bits) */
			fact = (fg >> 27) & 0x1F;

			/* Outer 5-bit fields, taken from the foreground in swapped
			 * order and blended in one multiply. */
			bg_outer = bg & 0x7C1F;
			outer = (((fg << 7) & 0x7C00) | ((fg >> 19) & 0x1F)) - bg_outer;
			outer = (((outer * fact) >> 5) + bg_outer) & 0x7C1F;

			/* 5-bit field at bits 5-9 */
			bg_mid = bg & 0x3E0;
			mid = ((fg >> 6) & 0x3E0) - bg_mid;
			mid = (((mid * fact) >> 5) + bg_mid) & 0x3E0;

			/* Write the data */
			s1++;
			s2++;
			bmp_write16((uintptr_t)d, outer | mid);
			d++;
		}
	}
	return;
}
