/* $Id: encode.c,v 1.2 2004/03/24 23:51:37 yurand Exp $ */ 
/*
 *         __   _____    ______   ______   ___    ___
 *        /\ \ /\  _ `\ /\  ___\ /\  _  \ /\_ \  /\_ \
 *        \ \ \\ \ \L\ \\ \ \__/ \ \ \L\ \\//\ \ \//\ \      __     __
 *      __ \ \ \\ \  __| \ \ \  __\ \  __ \ \ \ \  \ \ \   /'__`\ /'_ `\
 *     /\ \_\/ / \ \ \/   \ \ \L\ \\ \ \/\ \ \_\ \_ \_\ \_/\  __//\ \L\ \
 *     \ \____//  \ \_\    \ \____/ \ \_\ \_\/\____\/\____\ \____\ \____ \
 *      \/____/    \/_/     \/___/   \/_/\/_/\/____/\/____/\/____/\/___L\ \
 *                                                                  /\____/
 *                                                                  \_/__/
 *
 *      Version 2.2, by Angelo Mottola, 2000-2003.
 *
 *      Encoder core module.
 *
 *      See the readme.txt file for instructions on using this package in your
 *      own programs.
 */


#include "internal.h"


/* Standard quantization tables for luminance and chrominance. Scaled versions
 * of these are used by the encoder for a given quality.
 * These tables come from the IJG code, which takes them from the JPEG specs,
 * and are generic quantization tables that give good results on most images.
 * They are stored in pre-zigzag order: write_quantization_table() scales the
 * entries first and only then passes them through zigzag_reorder().
 */
static const unsigned char default_luminance_quant_table[64] = {
	16,  11,  10,  16,  24,  40,  51,  61,
	12,  12,  14,  19,  26,  58,  60,  55,
	14,  13,  16,  24,  40,  57,  69,  56,
	14,  17,  22,  29,  51,  87,  80,  62,
	18,  22,  37,  56,  68, 109, 103,  77,
	24,  35,  55,  64,  81, 104, 113,  92,
	49,  64,  78,  87, 103, 121, 120, 101,
	72,  92,  95,  98, 112, 100, 103,  99
};
/* Used for both the Cb and the Cr component (see write_header()). */
static const unsigned char default_chrominance_quant_table[64] = {
	17,  18,  24,  47,  99,  99,  99,  99,
	18,  21,  26,  66,  99,  99,  99,  99,
	24,  26,  56,  99,  99,  99,  99,  99,
	47,  66,  99,  99,  99,  99,  99,  99,
	99,  99,  99,  99,  99,  99,  99,  99,
	99,  99,  99,  99,  99,  99,  99,  99,
	99,  99,  99,  99,  99,  99,  99,  99,
	99,  99,  99,  99,  99,  99,  99,  99
};

/* Standard Huffman tables for luminance AC/DC and chrominance AC/DC.
 * These come from the IJG code, which takes them from the JPEG standard.
 *
 * For every table, num_codes_*[n] is the number of codes that are n bits
 * long (n = 1..16; element 0 is never read by write_huffman_table()) and
 * val_* lists the symbols those codes are assigned to, shortest codes first.
 */

/* DC symbols are the bit categories computed by format_number(). */
static const unsigned char num_codes_dc_luminance[17] =
	{ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
static const unsigned char val_dc_luminance[] =
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

static const unsigned char num_codes_dc_chrominance[17] =
	{ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
static const unsigned char val_dc_chrominance[] =
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

/* AC symbols pack the zero run length in the high nibble and the bit
 * category in the low nibble (see encode_block()); 0xf0 stands for a run of
 * 16 zeroes and 0x00 marks the end of the block.
 */
static const unsigned char num_codes_ac_luminance[17] =
	{ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
static const unsigned char val_ac_luminance[] = {
	0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
	0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
	0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
	0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
	0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
	0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
	0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
	0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
	0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
	0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
	0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
	0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
	0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
	0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
	0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
	0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
	0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
	0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
	0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
	0xf9, 0xfa
};

static const unsigned char num_codes_ac_chrominance[17] =
	{ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
static const unsigned char val_ac_chrominance[] = {
	0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
	0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
	0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
	0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
	0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
	0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
	0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
	0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
	0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
	0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
	0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
	0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
	0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
	0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
	0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
	0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
	0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
	0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
	0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
	0xf9, 0xfa
};
     

/* Encoder lookup tables, rebuilt by write_header() for every image. In both
 * arrays element 0 is used for luminance and element 1 for chrominance.
 */
static HUFFMAN_TABLE huffman_ac_table[2];
static HUFFMAN_TABLE huffman_dc_table[2];
/* Quantization multipliers, filled by write_quantization_table() with
 * (1 << 16) / q so that encode_block() can quantize with a multiply and a
 * shift instead of a division.
 */
static int luminance_quant_table[64];
static int chrominance_quant_table[64];
/* Color conversion routine picked by _jpeg_encode(); it converts two pixels
 * per call (see _jpeg_c_rgb2ycbcr() for the reference implementation).
 * NOTE(review): the pixel address travels as an int, which only round-trips
 * on platforms where a pointer fits in an int.
 */
static void (*rgb2ycbcr)(int address, short *y1, short *cb1, short *cr1, short *y2, short *cb2, short *cr2);



/* fdct_1d:
 *  In-place 8-point forward DCT over the samples v[0], v[stride], ...,
 *  v[7 * stride]. This is the integer fixed point butterfly taken from the
 *  IJG code; the FIX_* multipliers come from the project headers.
 *  When first_pass is non-zero the even terms are scaled up by 4 and the
 *  odd/rotated terms are descaled by 11 bits; otherwise the even terms are
 *  scaled down by 4 and the other terms are descaled by 15 bits.
 */
static void
fdct_1d(short *v, int stride, int first_pass)
{
	const int descale = first_pass ? 11 : 15;
	int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
	int tmp10, tmp11, tmp12, tmp13;
	int z1, z2, z3, z4, z5;

	/* All eight inputs are read before any output is stored. */
	tmp0 = v[0] + v[7 * stride];
	tmp7 = v[0] - v[7 * stride];
	tmp1 = v[stride] + v[6 * stride];
	tmp6 = v[stride] - v[6 * stride];
	tmp2 = v[2 * stride] + v[5 * stride];
	tmp5 = v[2 * stride] - v[5 * stride];
	tmp3 = v[3 * stride] + v[4 * stride];
	tmp4 = v[3 * stride] - v[4 * stride];

	/* Even part. */
	tmp10 = tmp0 + tmp3;
	tmp13 = tmp0 - tmp3;
	tmp11 = tmp1 + tmp2;
	tmp12 = tmp1 - tmp2;

	if (first_pass) {
		/* Multiply rather than shift: these sums can be negative and
		 * left-shifting a negative int is undefined behaviour in C.
		 */
		v[0] = (tmp10 + tmp11) * 4;
		v[4 * stride] = (tmp10 - tmp11) * 4;
	}
	else {
		v[0] = (tmp10 + tmp11) >> 2;
		v[4 * stride] = (tmp10 - tmp11) >> 2;
	}

	z1 = (tmp12 + tmp13) * FIX_0_541196100;
	v[2 * stride] = (z1 + (tmp13 * FIX_0_765366865)) >> descale;
	v[6 * stride] = (z1 + (tmp12 * -FIX_1_847759065)) >> descale;

	/* Odd part. */
	z1 = tmp4 + tmp7;
	z2 = tmp5 + tmp6;
	z3 = tmp4 + tmp6;
	z4 = tmp5 + tmp7;
	z5 = (z3 + z4) * FIX_1_175875602;

	tmp4 *= FIX_0_298631336;
	tmp5 *= FIX_2_053119869;
	tmp6 *= FIX_3_072711026;
	tmp7 *= FIX_1_501321110;
	z1 *= -FIX_0_899976223;
	z2 *= -FIX_2_562915447;
	z3 *= -FIX_1_961570560;
	z4 *= -FIX_0_390180644;

	z3 += z5;
	z4 += z5;

	v[7 * stride] = (tmp4 + z1 + z3) >> descale;
	v[5 * stride] = (tmp5 + z2 + z4) >> descale;
	v[3 * stride] = (tmp6 + z2 + z3) >> descale;
	v[stride] = (tmp7 + z1 + z4) >> descale;
}


/* apply_fdct:
 *  Applies the forward discrete cosine transform to the given input block,
 *  in the form of a vector of 64 coefficients, overwriting it with the
 *  result. The transform is separable: the eight rows are processed first,
 *  then the eight columns.
 *  This uses integer fixed point math and is based on code by the IJG.
 */
static void
apply_fdct(short *data)
{
	int i;

	/* Pass 1: rows (consecutive samples, 8 shorts apart from each other). */
	for (i = 0; i < 8; i++)
		fdct_1d(data + (i * 8), 1, 1);

	/* Pass 2: columns (samples 8 shorts apart, one short between columns). */
	for (i = 0; i < 8; i++)
		fdct_1d(data + i, 8, 0);
}


/* zigzag_reorder:
 *  Scatters the 64 coefficients of `input` into `output`, placing element i
 *  at the position given by _jpeg_zigzag_scan[i]. The two vectors must not
 *  be the same array.
 */
static void
zigzag_reorder(short *input, short *output)
{
	int src = 0;

	while (src < 64) {
		int dest = _jpeg_zigzag_scan[src];

		output[dest] = input[src];
		src++;
	}
}


/* write_quantization_table:
 *  Computes a quantization table given a quality value and writes it to the
 *  output stream.
 *
 *  `data` is one of the default tables; every entry is divided by
 *  QUALITY_FACTOR(quality) (or forced to 1 when quality is 100), rounded
 *  down and clamped to the 1..255 range. The resulting 64 bytes are written
 *  to the current chunk in zigzag order, while `quant_table` receives the
 *  matching (1 << 16) / q multipliers used by encode_block().
 */
static void
write_quantization_table(int *quant_table, const unsigned char *data, int quality)
{
	short natural[64], zigzag[64];
	double value;
	int i;

	for (i = 0; i < 64; i++) {
		if (quality == 100)
			value = 1.0;
		else {
			value = (double)data[i];
			value /= QUALITY_FACTOR(quality);
		}
		natural[i] = MID(1, (int)floor(value), 255);
	}
	zigzag_reorder(natural, zigzag);
	for (i = 0; i < 64; i++) {
		/* The stream stores the table in zigzag order... */
		_jpeg_chunk_putc(zigzag[i]);
		/* ...but encode_block() quantizes the coefficients before it
		 * zigzag-reorders them, so the multipliers must stay in the
		 * same order as the DCT output. Taking them from the reordered
		 * copy would pair each coefficient with the wrong divisor.
		 */
		quant_table[i] = (1 << 16) / (int)natural[i];
	}
}


/* write_huffman_table:
 *  Writes a Huffman table to the output stream and builds the lookup table
 *  used for fast Huffman encoding.
 *
 *  num_codes[1..16] holds the number of codes of each bit length and `value`
 *  the symbols in order of increasing code length. The 16 counts are written
 *  first, followed by the symbols. Codes are handed out sequentially within
 *  a length and shifted left by one bit when moving to the next length;
 *  table->code[symbol] ends up pointing at the entry describing that symbol.
 */
static void
write_huffman_table(HUFFMAN_TABLE *table, unsigned const char *num_codes, unsigned const char *value)
{
	HUFFMAN_ENTRY *slot;
	const unsigned char *symbol = value;
	int length, remaining;
	int next_code = 0;

	for (length = 1; length <= 16; length++)
		_jpeg_chunk_putc(num_codes[length]);

	memset(table, 0, sizeof(HUFFMAN_TABLE));
	slot = table->entry;

	for (length = 1; length <= 16; length++) {
		for (remaining = num_codes[length]; remaining > 0; remaining--) {
			slot->value = *symbol;
			slot->encoded_value = next_code;
			slot->bits_length = length;
			_jpeg_chunk_putc(*symbol);
			table->code[slot->value] = slot;
			next_code++;
			symbol++;
			slot++;
		}
		next_code <<= 1;
	}
}


/* write_header:
 *  Writes everything that precedes the entropy coded data of a baseline JPG
 *  image: the SOI marker followed by the APP0, COM, DQT, SOF0, DHT and SOS
 *  chunks, in this order. As a side effect the quantization and Huffman
 *  lookup tables used by encode_block() are rebuilt for the given quality.
 *  Greyscale images get a single (Y) component and only the luminance
 *  tables. Always returns 0.
 */
static int
write_header(int sampling, int greyscale, int quality, int width, int height)
{
	static const char jfif_id[5] = { 'J', 'F', 'I', 'F', 0 };
	const char *text = "Created by JPGalleg";
	const int num_components = greyscale ? 1 : 3;
	int luma_factors;
	int n;

	/* Horizontal/vertical sampling factors of the Y component. */
	if (greyscale)
		luma_factors = 0x11;
	else if (sampling == JPG_SAMPLING_411)
		luma_factors = 0x22;
	else if (sampling == JPG_SAMPLING_422)
		luma_factors = 0x21;
	else
		luma_factors = 0x11;

	_jpeg_putw(CHUNK_SOI);

	/* APP0: JFIF identification */
	_jpeg_new_chunk(CHUNK_APP0);
	for (n = 0; n < 5; n++)
		_jpeg_chunk_putc(jfif_id[n]);
	_jpeg_chunk_putw(0x0101);	/* JFIF version 1.1 */
	_jpeg_chunk_putc(0);		/* No units, 1:1 aspect ratio */
	_jpeg_chunk_putw(1);
	_jpeg_chunk_putw(1);
	_jpeg_chunk_putw(0);		/* No thumbnail */
	_jpeg_write_chunk();

	/* COM: creator signature, written without the terminating NUL */
	_jpeg_new_chunk(CHUNK_COM);
	for (; *text; text++)
		_jpeg_chunk_putc(*text);
	_jpeg_write_chunk();

	/* DQT: table 0 is luminance, table 1 (color only) is chrominance */
	_jpeg_new_chunk(CHUNK_DQT);
	_jpeg_chunk_putc(0);
	write_quantization_table(luminance_quant_table, default_luminance_quant_table, quality);
	if (!greyscale) {
		_jpeg_chunk_putc(1);
		write_quantization_table(chrominance_quant_table, default_chrominance_quant_table, quality);
	}
	_jpeg_write_chunk();

	/* SOF0: frame geometry and per-component setup */
	_jpeg_new_chunk(CHUNK_SOF0);
	_jpeg_chunk_putc(8);
	_jpeg_chunk_putw(height);
	_jpeg_chunk_putw(width);
	_jpeg_chunk_putc(num_components);
	_jpeg_chunk_putc(1);		/* Y component id */
	_jpeg_chunk_putc(luma_factors);	/* Luminance sampling factor */
	_jpeg_chunk_putc(0);		/* Quantization table number for Y */
	if (!greyscale) {
		/* Cb (id 2) and Cr (id 3): always 1:1 sampling, table 1 */
		for (n = 2; n <= 3; n++) {
			_jpeg_chunk_putc(n);
			_jpeg_chunk_putc(0x11);
			_jpeg_chunk_putc(1);
		}
	}
	_jpeg_write_chunk();

	/* DHT: DC/AC table 0 for luminance, DC/AC table 1 for chrominance */
	_jpeg_new_chunk(CHUNK_DHT);
	_jpeg_chunk_putc(0x00);
	write_huffman_table(&huffman_dc_table[0], num_codes_dc_luminance, val_dc_luminance);
	_jpeg_chunk_putc(0x10);
	write_huffman_table(&huffman_ac_table[0], num_codes_ac_luminance, val_ac_luminance);
	if (!greyscale) {
		_jpeg_chunk_putc(0x01);
		write_huffman_table(&huffman_dc_table[1], num_codes_dc_chrominance, val_dc_chrominance);
		_jpeg_chunk_putc(0x11);
		write_huffman_table(&huffman_ac_table[1], num_codes_ac_chrominance, val_ac_chrominance);
	}
	_jpeg_write_chunk();

	/* SOS: component to Huffman table mapping for the single scan */
	_jpeg_new_chunk(CHUNK_SOS);
	_jpeg_chunk_putc(num_components);
	_jpeg_chunk_putc(1);
	_jpeg_chunk_putc(0x00);		/* Y uses DC table 0 and AC table 0 */
	if (!greyscale) {
		/* Cb and Cr both use DC table 1 and AC table 1 */
		for (n = 2; n <= 3; n++) {
			_jpeg_chunk_putc(n);
			_jpeg_chunk_putc(0x11);
		}
	}
	_jpeg_chunk_putc(0);		/* Cover whole spectrum (0-63) since we're saving a baseline JPG */
	_jpeg_chunk_putc(63);
	_jpeg_chunk_putc(0);		/* Successive approximation not needed for baseline JPG */
	_jpeg_write_chunk();

	return 0;
}


/* format_number:
 *  Splits a coefficient into the two parts stored in the stream.
 *  *category receives the number of bits needed to hold the magnitude of
 *  `num` (0 for zero). *bits receives the value to be written using that
 *  many bits: `num` itself when it is not negative, otherwise
 *  num + (2^category - 1).
 */
static void
format_number(int num, int *category, int *bits)
{
	int magnitude = (num < 0) ? -num : num;
	int width = 0;

	while (magnitude != 0) {
		magnitude >>= 1;
		width++;
	}

	*category = width;
	*bits = (num < 0) ? (num + ((1 << width) - 1)) : num;
}


/* put_bits:
 *  Writes the lowest num_bits bits of `value` to the output stream, most
 *  significant bit first. Nothing is written when num_bits is 0 or less.
 *  Returns 0 on success, or -1 as soon as _jpeg_put_bit() reports a failure.
 */
static int
put_bits(int value, int num_bits)
{
	int shift = num_bits;

	while (--shift >= 0) {
		if (_jpeg_put_bit((value >> shift) & 0x1))
			return -1;
	}
	return 0;
}


/* huffman_encode:
 *  Looks `value` up in the given table and writes its Huffman code to the
 *  output stream. Returns the result of put_bits(); if the table holds no
 *  code for the symbol, jpgalleg_error is set to JPG_ERROR_HUFFMAN and -1
 *  is returned.
 */
static int
huffman_encode(HUFFMAN_TABLE *table, int value)
{
	HUFFMAN_ENTRY *match = table->code[value];

	if (!match) {
		jpgalleg_error = JPG_ERROR_HUFFMAN;
		return -1;
	}
	return put_bits(match->encoded_value, match->bits_length);
}


/* encode_block:
 *  Encodes an 8x8 basic block of samples of given type (luminance or
 *  chrominance) and writes it to the output stream.
 *
 *  The block is transformed and quantized in place, so its contents are
 *  destroyed. The DC coefficient is coded as the difference from *old_dc,
 *  which is then updated; the AC coefficients are coded in zigzag order as
 *  (zero run, category) symbols followed by the coefficient bits.
 *  Returns 0 on success, -1 if writing to the stream failed.
 */
static int
encode_block(short *block, int type, int *old_dc)
{
	const int table_index = (type == LUMINANCE) ? 0 : 1;
	HUFFMAN_TABLE *dc_codes = &huffman_dc_table[table_index];
	HUFFMAN_TABLE *ac_codes = &huffman_ac_table[table_index];
	int *multiplier = (type == LUMINANCE) ? luminance_quant_table : chrominance_quant_table;
	short scan[64];
	int pos, coef, magnitude;
	int run, size, extra;

	apply_fdct(block);

	/* Quantize on the magnitude so that negative values are rounded the
	 * same way as positive ones, then put the sign back.
	 */
	for (pos = 0; pos < 64; pos++) {
		coef = block[pos];
		magnitude = (coef < 0) ? -coef : coef;
		magnitude = ((magnitude * multiplier[pos]) + (multiplier[pos] >> 1)) >> 19;
		block[pos] = (coef < 0) ? -magnitude : magnitude;
	}

	zigzag_reorder(block, scan);

	/* DC: difference from the previous block of the same component */
	coef = scan[0] - *old_dc;
	*old_dc = scan[0];
	format_number(coef, &size, &extra);
	if (huffman_encode(dc_codes, size) || put_bits(extra, size))
		return -1;

	/* AC: run-length code the zeroes between non-zero coefficients */
	run = 0;
	for (pos = 1; pos < 64; pos++) {
		coef = scan[pos];
		if (coef == 0) {
			run++;
			continue;
		}
		/* Runs longer than 15 need one 0xf0 symbol per 16 zeroes */
		for (; run > 15; run -= 16) {
			if (huffman_encode(ac_codes, 0xf0))
				return -1;
		}
		format_number(coef, &size, &extra);
		if (huffman_encode(ac_codes, (run << 4) | size) || put_bits(extra, size))
			return -1;
		run = 0;
	}

	/* Trailing zeroes are replaced by a single end-of-block symbol */
	if ((run > 0) && huffman_encode(ac_codes, 0x00))
		return -1;

	return 0;
}


/* rgb32_pixel_to_ycbcr:
 *  Converts one 32-bit pixel to Y/Cb/Cr using 8-bit fixed point weights.
 *  Y is shifted down by 128; Cb and Cr are stored without an offset.
 */
static void
rgb32_pixel_to_ycbcr(unsigned int pixel, short *y, short *cb, short *cr)
{
	int r = getr32(pixel);
	int g = getg32(pixel);
	int b = getb32(pixel);

	*y = (((r * 76) + (g * 151) + (b * 29)) >> 8) - 128;
	*cb = (((r * -43) + (g * -85) + (b * 128)) >> 8);
	*cr = (((r * 128) + (g * -107) + (b * -21)) >> 8);
}


/* _jpeg_c_rgb2ycbcr:
 *  C version of the RGB -> YCbCr color conversion routine. Converts 2 pixels
 *  at a time: the 32-bit pixel found at `addr` goes to y1/cb1/cr1 and the
 *  one right after it to y2/cb2/cr2.
 *  NOTE(review): addr is an int reinterpreted as a pointer, which only works
 *  where a pointer fits in an int.
 */
static void
_jpeg_c_rgb2ycbcr(int addr, short *y1, short *cb1, short *cr1, short *y2, short *cb2, short *cr2)
{
	unsigned int *pixels = (unsigned int *)addr;

	rgb32_pixel_to_ycbcr(pixels[0], y1, cb1, cr1);
	rgb32_pixel_to_ycbcr(pixels[1], y2, cb2, cr2);
}


/* _jpeg_encode:
 *  Main encoding function. Writes `bmp` to the output stream as a baseline
 *  JPG image.
 *
 *  pal, when not NULL, is selected as the current palette while the source
 *  bitmap is converted to 32 bpp. quality is clamped to the 1..100 range.
 *  The low 4 bits of flags select the chroma subsampling (JPG_SAMPLING_411,
 *  JPG_SAMPLING_422 or JPG_SAMPLING_444) and JPG_GREYSCALE requests a
 *  luminance-only image.
 *
 *  Returns 0 on success, -1 on error. jpgalleg_error is reset to
 *  JPG_ERROR_NONE on entry.
 *  NOTE(review): the early exits for an invalid sampling mode or a failed
 *  bitmap allocation return -1 without storing an error code here.
 */
int
_jpeg_encode(BITMAP *bmp, AL_CONST RGB *pal, int quality, int flags)
{
	short y1[64], y2[64], y3[64], y4[64], cb[64], cr[64];
	short *y1_ptr, *y2_ptr, *y3_ptr, *y4_ptr, *cb_ptr, *cr_ptr;
	short cb1_sample, cr1_sample, cb2_sample, cr2_sample, cb_total, cr_total, dummy;
	int dc_y, dc_cb, dc_cr;
	int sampling, greyscale;
	int block_x, block_y, x, y, i;
	int addr, pitch;
	int error = 0;
	BITMAP *fixed_bmp;
	
	jpgalleg_error = JPG_ERROR_NONE;
	
	/* Pick the color conversion routine: an MMX one when it is compiled in,
	 * the CPU supports it and the 32 bpp red shift is 0 or 16, the C one
	 * otherwise.
	 */
#ifdef JPGALLEG_MMX
	if (cpu_capabilities & CPU_MMX) {
		if (_rgb_r_shift_32 == 0)
			rgb2ycbcr = _jpeg_mmx_rgb2ycbcr;
		else if (_rgb_r_shift_32 == 16)
			rgb2ycbcr = _jpeg_mmx_bgr2ycbcr;
		else
			rgb2ycbcr = _jpeg_c_rgb2ycbcr;
	}
	else
#endif
	rgb2ycbcr = _jpeg_c_rgb2ycbcr;
	
	quality = MID(1, quality, 100);
	sampling = flags & 0xf;
	if ((sampling != JPG_SAMPLING_411) && (sampling != JPG_SAMPLING_422) && (sampling != JPG_SAMPLING_444))
		return -1;
	greyscale = flags & JPG_GREYSCALE;
	
	/* Work on a 32 bpp copy whose size is rounded up to a multiple of 16 in
	 * both directions, so that every block loop below can read whole blocks.
	 */
	fixed_bmp = create_bitmap_ex(32, (bmp->w + 15) & ~0xf, (bmp->h + 15) & ~0xf);
	if (!fixed_bmp)
		return -1;
	if (pal)
		select_palette(pal);
	blit(bmp, fixed_bmp, 0, 0, 0, 0, bmp->w, bmp->h);
	if (pal)
		unselect_palette();
	/* Fill the padding by repeating the last column, then the last row. */
	for (i = bmp->w; i < fixed_bmp->w; i++)
		blit(fixed_bmp, fixed_bmp, bmp->w - 1, 0, i, 0, 1, bmp->h);
	for (i = bmp->h; i < fixed_bmp->h; i++)
		blit(fixed_bmp, fixed_bmp, 0, bmp->h - 1, 0, i, fixed_bmp->w, 1);
	/* Distance in bytes between two consecutive lines of the copy. */
	pitch = (int)(fixed_bmp->line[1] - fixed_bmp->line[0]);
	
	/* The header carries the original size, not the padded one. */
	if (write_header(sampling, greyscale, quality, bmp->w, bmp->h)) {
		destroy_bitmap(fixed_bmp);
		return -1;
	}
	
	/* DC predictors, one per component, shared by all the blocks. */
	dc_y = dc_cb = dc_cr = 0;
	
	/* NOTE(review): in all the loops below pixel addresses are kept in an
	 * int (addr); this only works where a pointer fits in an int.
	 */
	if (!greyscale) {
		switch (sampling) {
			case JPG_SAMPLING_411:
				/* 16x16 pixels per step: four 8x8 Y blocks (y1 top-left,
				 * y2 top-right, y3 bottom-left, y4 bottom-right) plus one
				 * Cb and one Cr block, each chroma sample being the
				 * average of a 2x2 pixel square.
				 */
				for (block_y = 0; block_y < bmp->h; block_y += 16) {
					for (block_x = 0; block_x < bmp->w; block_x += 16) {
						addr = (int)fixed_bmp->line[block_y] + (block_x * 4);
						y1_ptr = y1;
						y2_ptr = y2;
						y3_ptr = y3;
						y4_ptr = y4;
						cb_ptr = cb;
						cr_ptr = cr;
						for (y = 0; y < 8; y += 2) {
							for (x = 0; x < 8; x += 2) {
								/* Each pair of calls converts a 2x2 square
								 * (two pixels on this line, two on the next)
								 * of one Y block. +32 bytes moves 8 pixels to
								 * the right, pitch * 8 moves 8 lines down.
								 */
								rgb2ycbcr(addr, y1_ptr, &cb_total, &cr_total, y1_ptr + 1, &cb1_sample, &cr1_sample);
								cb_total += cb1_sample;
								cr_total += cr1_sample;
								rgb2ycbcr(addr + pitch, y1_ptr + 8, &cb1_sample, &cr1_sample, y1_ptr + 9, &cb2_sample, &cr2_sample);
								*cb_ptr = (cb_total + cb1_sample + cb2_sample) / 4;
								*cr_ptr = (cr_total + cr1_sample + cr2_sample) / 4;
								rgb2ycbcr(addr + 32, y2_ptr, &cb_total, &cr_total, y2_ptr + 1, &cb1_sample, &cr1_sample);
								cb_total += cb1_sample;
								cr_total += cr1_sample;
								rgb2ycbcr(addr + 32 + pitch, y2_ptr + 8, &cb1_sample, &cr1_sample, y2_ptr + 9, &cb2_sample, &cr2_sample);
								*(cb_ptr + 4) = (cb_total + cb1_sample + cb2_sample) / 4;
								*(cr_ptr + 4) = (cr_total + cr1_sample + cr2_sample) / 4;
								rgb2ycbcr(addr + (pitch * 8), y3_ptr, &cb_total, &cr_total, y3_ptr + 1, &cb1_sample, &cr1_sample);
								cb_total += cb1_sample;
								cr_total += cr1_sample;
								rgb2ycbcr(addr + (pitch * 9), y3_ptr + 8, &cb1_sample, &cr1_sample, y3_ptr + 9, &cb2_sample, &cr2_sample);
								*(cb_ptr + 32) = (cb_total + cb1_sample + cb2_sample) / 4;
								*(cr_ptr + 32) = (cr_total + cr1_sample + cr2_sample) / 4;
								rgb2ycbcr(addr + 32 + (pitch * 8), y4_ptr, &cb_total, &cr_total, y4_ptr + 1, &cb1_sample, &cr1_sample);
								cb_total += cb1_sample;
								cr_total += cr1_sample;
								rgb2ycbcr(addr + 32 + (pitch * 9), y4_ptr + 8, &cb1_sample, &cr1_sample, y4_ptr + 9, &cb2_sample, &cr2_sample);
								*(cb_ptr + 36) = (cb_total + cb1_sample + cb2_sample) / 4;
								*(cr_ptr + 36) = (cr_total + cr1_sample + cr2_sample) / 4;
								addr += 8;
								y1_ptr += 2;
								y2_ptr += 2;
								y3_ptr += 2;
								y4_ptr += 2;
								cb_ptr++;
								cr_ptr++;
							}
							/* Back to the left edge, two lines further down;
							 * the Y pointers skip the row already filled via
							 * the +8/+9 offsets, the chroma pointers skip the
							 * half row that belongs to the right-hand blocks.
							 */
							addr += (pitch - 32) + pitch;
							y1_ptr += 8;
							y2_ptr += 8;
							y3_ptr += 8;
							y4_ptr += 8;
							cb_ptr += 4;
							cr_ptr += 4;
						}
						/* Failures are accumulated and checked once per
						 * group of blocks.
						 */
						error |= encode_block(y1, LUMINANCE, &dc_y);
						error |= encode_block(y2, LUMINANCE, &dc_y);
						error |= encode_block(y3, LUMINANCE, &dc_y);
						error |= encode_block(y4, LUMINANCE, &dc_y);
						error |= encode_block(cb, CHROMINANCE, &dc_cb);
						error |= encode_block(cr, CHROMINANCE, &dc_cr);
						if (error)
							goto exit_error;
					}
				}
				break;
				
			case JPG_SAMPLING_422:
				/* 16x8 pixels per step: two 8x8 Y blocks (y1 left, y2
				 * right) plus one Cb and one Cr block, each chroma sample
				 * being the average of two horizontally adjacent pixels.
				 */
				for (block_y = 0; block_y < bmp->h; block_y += 8) {
					for (block_x = 0; block_x < bmp->w; block_x += 16) {
						addr = (int)fixed_bmp->line[block_y] + (block_x * 4);
						y1_ptr = y1;
						y2_ptr = y2;
						cb_ptr = cb;
						cr_ptr = cr;
						for (y = 0; y < 8; y++) {
							for (x = 0; x < 8; x += 2) {
								/* The first pixel's chroma lands directly in
								 * the output block and is then averaged with
								 * the second pixel's.
								 */
								rgb2ycbcr(addr, y1_ptr, cb_ptr, cr_ptr, y1_ptr + 1, &cb1_sample, &cr1_sample);
								*cb_ptr = (*cb_ptr + cb1_sample) / 2;
								*cr_ptr = (*cr_ptr + cr1_sample) / 2;
								rgb2ycbcr(addr + 32, y2_ptr, cb_ptr + 4, cr_ptr + 4, y2_ptr + 1, &cb1_sample, &cr1_sample);
								*(cb_ptr + 4) = (*(cb_ptr + 4) + cb1_sample) / 2;
								*(cr_ptr + 4) = (*(cr_ptr + 4) + cr1_sample) / 2;
								addr += 8;
								y1_ptr += 2;
								y2_ptr += 2;
								cb_ptr++;
								cr_ptr++;
							}
							/* Next line; skip the chroma half row filled
							 * through the +4 offsets.
							 */
							addr += (pitch - 32);
							cb_ptr += 4;
							cr_ptr += 4;
						}
						error |= encode_block(y1, LUMINANCE, &dc_y);
						error |= encode_block(y2, LUMINANCE, &dc_y);
						error |= encode_block(cb, CHROMINANCE, &dc_cb);
						error |= encode_block(cr, CHROMINANCE, &dc_cr);
						if (error)
							goto exit_error;
					}
				}
				break;
			
			case JPG_SAMPLING_444:
				/* 8x8 pixels per step: one Y, one Cb and one Cr block, with
				 * no chroma averaging.
				 */
				for (block_y = 0; block_y < bmp->h; block_y += 8) {
					for (block_x = 0; block_x < bmp->w; block_x += 8) {
						addr = (int)fixed_bmp->line[block_y] + (block_x * 4);
						y1_ptr = y1;
						cb_ptr = cb;
						cr_ptr = cr;
						for (y = 0; y < 8; y++) {
							for (x = 0; x < 8; x += 2) {
								rgb2ycbcr(addr, y1_ptr, cb_ptr, cr_ptr, y1_ptr + 1, cb_ptr + 1, cr_ptr + 1);
								addr += 8;
								y1_ptr += 2;
								cb_ptr += 2;
								cr_ptr += 2;
							}
							addr += (pitch - 32);
						}
						error |= encode_block(y1, LUMINANCE, &dc_y);
						error |= encode_block(cb, CHROMINANCE, &dc_cb);
						error |= encode_block(cr, CHROMINANCE, &dc_cr);
						if (error)
							goto exit_error;
					}
				}
				break;
		}
	}
	else {
		/* Greyscale: 8x8 pixels per step, only the Y block is kept; the
		 * chroma results are thrown away into `dummy`.
		 */
		for (block_y = 0; block_y < bmp->h; block_y += 8) {
			for (block_x = 0; block_x < bmp->w; block_x += 8) {
				addr = (int)fixed_bmp->line[block_y] + (block_x * 4);
				y1_ptr = y1;
				for (y = 0; y < 8; y++) {
					for (x = 0; x < 8; x += 2) {
						rgb2ycbcr(addr, y1_ptr, &dummy, &dummy, y1_ptr + 1, &dummy, &dummy);
						addr += 8;
						y1_ptr += 2;
					}
					addr += (pitch - 32);
				}
				if (encode_block(y1, LUMINANCE, &dc_y))
					goto exit_error;
			}
		}
	}

	/* Flush the pending bits and close the image. */
	_jpeg_flush_bits();
	_jpeg_putw(CHUNK_EOI);
	destroy_bitmap(fixed_bmp);
	
	return 0;
	
exit_error:
	destroy_bitmap(fixed_bmp);
	return -1;
}

