/*
 *	PAL software decoder (MMX optimized!!!)
 *	includes Y/C separation and YCrCb to RGB conversion
 *
 *	Copyright (C) 1999, Ewald Snel
 *
 *WWW		http://esnel.op.het.net/
 *e-mail	esnel@cistron.nl
 */

#include <stdio.h>
#include <math.h>
#include <time.h>

#include "esnelmdecode.h"
#include "video.h"

// Frame geometry. These are kept as preprocessor macros because several of
// them are also used as immediate operands / displacements inside the __asm
// blocks further down (e.g. "mov ecx , (decWidth /8)", "[rp][esi][-7]").
#define decWidth	360		// decoded (display/output) width
#define decHeight	280		// decoded (display/output) height
#define rawWidth	2268	// input (raw) width
#define	rawHeight	293		// input (raw) height
#define rawOffset	(5*rawWidth - 102)	// offset of first pixel (parenthesized so it composes safely)

// decode (static)
#define Pi			3.14159265358979323846
#define divPi		(0x80000000L /Pi)	// radians -> phase units (2^31 units per Pi, i.e. 2^32 per turn)
// Phase step per raw sample: 283.75 cycles spread over rawWidth samples,
// expressed with 2^32 = one full cycle.
#define FcHF0		((int) (283.75 * 0x100000000L /rawWidth))
// Base phase step per scanline: -0.25 cycle, same 2^32-per-cycle units.
#define FcVF0		((int) (  -.25 * 0x100000000L))

// Round n to fixed point with 6 fractional bits. The argument is
// parenthesized so that FIX64(a + b) scales the whole expression.
#define FIX64(n)	((int) (64*(n) + .5))
// Replicate the low 16 bits of n into all four WORD lanes of a 64-bit value.
#define MAKEQW(n)	(((__int64) ((n) & 0xffff) << 48) | ((__int64) ((n) & 0xffff) << 32) | ((__int64) ((n) & 0xffff) << 16) | (__int64) ((n) & 0xffff))
#define rp			rawWidth		// pitch

// Forward declarations of the file-local decoding stages defined below.
static void InitializeDecoder();
static void SetPixels(int y);
static void GetLumaSamples(void *src, int lladjust, int brightness, int contrast);
static void GetChromaSamples( unsigned char src[],
							  unsigned int cbPhase, unsigned int cbHF, unsigned int cbVF,
							  int saturation, int vsaturation );
static void GetColorBurstData( unsigned char *src, unsigned int *cbPhase,
							   unsigned int *cbHF, unsigned int *cbVF, int *cbOdd );


// Per-scanline work buffers shared between the stages:
//   scanLuma         - written by GetLumaSamples, read by CalcPixels
//   scanCbU/scanCrV  - written by GetChromaSamples, read by CalcPixels
// They are declared larger than decWidth (resp. decWidth/2) samples; the
// pipelined loops in CalcPixels read a few bytes past the last used sample.
static unsigned char	scanLuma[384];
static char				scanCbU[192], scanCrV[192];
// cosint[i]: 16-bit cosine of i*Pi/512 (filled by InitializeDecoder, used by
// GetColorBurstData).
static int				cosint[1024];
// Set once InitializeDecoder has filled the lookup tables.
static bool				isInitialized = false;

// cosin64[i]: four WORDs {sin, cos, cos, ~sin} of i*Pi/512 (filled by
// InitializeDecoder, used as pmaddwd operand in GetChromaSamples).
// prevScan: chroma {u, v} DWORD pairs stored by the previous call of
// GetChromaSamples and added to the current ones on the next call.
static __int64 cosin64[1024], prevScan[384];
// Colour space conversion constants, each replicated into four WORD lanes
// and scaled by 64 (see FIX64). CalcPixels references csc_CbU, csc_CrV,
// csc_CgU, csc_CgV, csc_LcY, csc_subR, csc_addG and csc_subB; the csc_clp*
// and csc_lvl* values are not referenced by the code visible in this file.
static __int64 csc_CbU	= MAKEQW( FIX64( 2.031 ) );
static __int64 csc_CrV	= MAKEQW( FIX64( 1.141 ) );
static __int64 csc_CgU	= MAKEQW( FIX64( 0.391 ) );
static __int64 csc_CgV	= MAKEQW( FIX64( 0.578 ) );
static __int64 csc_LcY	= MAKEQW( FIX64( 1.164 ) );
static __int64 csc_clpR	= MAKEQW( 49152 - 16*FIX64( 1.164 ) - 128* FIX64( 1.141 ) );
static __int64 csc_clpG	= MAKEQW( 16384 - 16*FIX64( 1.164 ) + 128*(FIX64( 0.391 ) + FIX64( 0.578 ) ) );
static __int64 csc_clpB	= MAKEQW( 49152 - 16*FIX64( 1.164 ) - 128* FIX64( 2.031 ) );
static __int64 csc_lvlR	= MAKEQW( 49152 );
static __int64 csc_lvlG	= MAKEQW( 16384 );
static __int64 csc_lvlB	= MAKEQW( 49152 );
static __int64 csc_subR	= MAKEQW(  16*FIX64( 1.164 ) + 128* FIX64( 1.141 ) );
static __int64 csc_addG	= MAKEQW( -16*FIX64( 1.164 ) + 128*(FIX64( 0.391 ) + FIX64( 0.578 ) ) );
static __int64 csc_subB	= MAKEQW(  16*FIX64( 1.164 ) + 128* FIX64( 2.031 ) );

// 16-tap filter kernels applied by GetChromaSamples to the raw samples
// (four WORDs per pmaddwd, hence the [0], [8], [16], [24] byte offsets there).
static short filt_cos[] = { 141, 199, -282, -1020, -457, 1078, 1559, 704,
							-704, -1559, -1078, 457, 1020, 282, -199, -141 };

static short filt_sin[] = { -340, 199, 962, 821, -2182, -1525, 364, 1700,
							1700, 364, -1525, -2182, 821, 962, 199, -340 };

// Clamp table: clip8[i] = clamp(i - 320, 0, 255). Filled by InitializeDecoder;
// not read by any code visible in this file.
static unsigned char clip8[1024];


EsnelMMXDecoder::EsnelMMXDecoder()
{
	// Output size of one decoded field, taken from the file-level constants.
	this->_outwidth		= decWidth;
	this->_outheight	= decHeight;

	this->_use_vbi		= true;
	this->_description	= "Improved PAL Color MMX";
}


/*
 Decode next field of PAL data
*/

// time() value (in seconds) until which decodeFrame keeps drawing the
// settings overlay; processKey sets it to "now + 5". Declared as time_t so
// the result of time() is stored without narrowing on platforms where
// time_t is wider than long.
static time_t show_settings_timeout;

// Toggled with the 'A' key; passed to GetLumaSamples as its lladjust flag.
static bool adjust_luma_level = true;


// Picture settings, one slot per signal polarity:
// [0] is used when decodeFrame is called with invert_polarity == false,
// [1] when it is true.
static struct
{
	int brightness;
	int contrast;
	int saturation;
} color_cfg[2] = { {87, 174, 245}, {108, -181, 245} };

// Index of the slot selected by the most recent decodeFrame call; this is
// the slot that processKey modifies.
static int color_cfg_i;

/*
 Keyboard handler for the picture settings.

 PG UP/DOWN, HOME/END and INSERT/DELETE step brightness, contrast and
 saturation of the currently selected color_cfg slot by 2 (a step is only
 applied while the value is still below 511 / above -511) and arm the
 5 second settings overlay. 'A' toggles adjust_luma_level without showing
 the overlay. Any other key is ignored.
*/
void EsnelMMXDecoder::processKey(WPARAM key)
{
	int	*setting	= 0;		// setting addressed by the key, if any
	bool increase	= false;	// step direction for that setting

	switch (key)
	{
		case VK_PRIOR:
			setting  = &color_cfg[color_cfg_i].brightness;
			increase = true;
			break;
		case VK_NEXT:
			setting  = &color_cfg[color_cfg_i].brightness;
			break;

		case VK_HOME:
			setting  = &color_cfg[color_cfg_i].contrast;
			increase = true;
			break;
		case VK_END:
			setting  = &color_cfg[color_cfg_i].contrast;
			break;

		case VK_INSERT:
			setting  = &color_cfg[color_cfg_i].saturation;
			increase = true;
			break;
		case VK_DELETE:
			setting  = &color_cfg[color_cfg_i].saturation;
			break;

		case 'A':
			adjust_luma_level = !adjust_luma_level;
			break;
	}

	// Keys that do not address a setting never trigger the overlay.
	if (setting == 0)
		return;

	if (increase)
	{
		if (*setting < 511)
			*setting += 2;
	}
	else
	{
		if (*setting > -511)
			*setting -= 2;
	}

	// The overlay is shown even when the value was already at its limit.
	show_settings_timeout = time(0) + 5;
}

/*
 Returns the static help text listing the keys handled by processKey.
*/
const char* EsnelMMXDecoder::getHelp(void) const
{
	static const char helpText[] =
		"  PG UP/DOWN: Adjust brightness\n"
		"  HOME/END: Adjust contrast\n"
		"  INSERT/DELETE: Adjust saturation\n"
		"  A: Adjust luma level to color burst\n";

	return helpText;
}


/*
 Decode one field of raw PAL samples and draw it.

 src              raw sample buffer
 offset           byte offset added to src before decoding starts
 invert_polarity  selects which color_cfg slot (1 if true, 0 if false) is used

 The colour burst is analysed once per call, then decHeight scanlines are
 decoded in pairs: one chroma pass covers two lines, followed by a luma pass
 and pixel output for each of the two lines.
*/
void EsnelMMXDecoder::decodeFrame(unsigned char *src, int offset, bool invert_polarity)
{
	// Remember the active slot; processKey edits the same one.
	color_cfg_i = invert_polarity ? 1 : 0;

	const int  contrast   = color_cfg[color_cfg_i].contrast;
	const int  saturation = color_cfg[color_cfg_i].saturation;
	// A negative contrast gets an extra brightness offset of 85.
	const int  brightness = color_cfg[color_cfg_i].brightness + ((contrast < 0) ? 85 : 0);
	const bool lladjust   = adjust_luma_level;

	if (!isInitialized)
		InitializeDecoder();

	unsigned int cbPhase, cbHF, cbVF;
	int cbOdd;

	unsigned char *line = src + offset;
	GetColorBurstData(line, &cbPhase, &cbHF, &cbVF, &cbOdd);
	line += getColorBurstWidth();

	// GetChromaSamples advances the phase once per 10 raw samples.
	cbHF *= 10;
	const int vsaturation = cbOdd ? -saturation : saturation;

	lockDisplay();

	for (int row = 0; row < decHeight; row += 2)
	{
		// Chroma is computed once for the line pair (line, line + rawWidth).
		GetChromaSamples(line, cbPhase, cbHF, cbVF, saturation, vsaturation);

		GetLumaSamples(line, lladjust, brightness, contrast);
		SetPixels(row);
		line += rawWidth;

		GetLumaSamples(line, lladjust, brightness, contrast);
		SetPixels(row + 1);
		line += rawWidth;

		cbPhase += 2*cbVF;
	}

	unlockDisplay();

	// Settings overlay, armed by processKey.
	if (show_settings_timeout > time(0))
	{
		char text[500];
		sprintf(text, "Brightness %d\nContrast %d\nSaturation %d",
				color_cfg[color_cfg_i].brightness,
				color_cfg[color_cfg_i].contrast,
				color_cfg[color_cfg_i].saturation);
		drawText(10, 120, text, RGB(255, 0, 0));
	}
}


/*
 Initialize decoder
*/
static void InitializeDecoder()
{
	int i;

	for (i=0; i < 1024; i++)
	{
		short c = (int) (0x7fffffff * cos( i*Pi /512.0 )) >> 16;
		short s = (int) (0x7fffffff * sin( i*Pi /512.0 )) >> 16;
		short *v = (short *) &cosin64[i];
		v[0] = s;
		v[1] = c;
		v[2] = c;
		v[3] = ~s;
		cosint[i] = c;

		clip8[i]  = (i < 320) ? 0 : ((i > 320+255) ? 255 : i-320);
	}
	
	isInitialized = true;
}



/*
 MMX optimized YUV422 picture output (PAL specific)

 conversion coefficients for YUV to RGB color space :
	R = 1.164 * (Yc - 16)  +  1.141 * (Vc - 128)
	G = 1.164 * (Yc - 16)  -  0.391 * (Uc - 128)  -  0.578 * (Vc - 128)
	B = 1.164 * (Yc - 16)  +  2.031 * (Uc - 128)
*/

/*
 Convert the scanline currently held in scanLuma / scanCbU / scanCrV to
 32-bit pixels.

 pixels : destination buffer. The loop runs decWidth/8 times and stores
          32 bytes per iteration, i.e. decWidth pixels of 4 bytes each,
          laid out in memory as {blue, green, red, 0}.

 Each iteration consumes 8 luma bytes and 4 U + 4 V bytes; every chroma
 sample is duplicated for two neighbouring pixels. Arithmetic is done on
 WORD lanes with coefficients scaled by 64 (see the csc_* constants), using
 saturating adds/subtracts followed by a right shift of 6 and a saturating
 pack to bytes.

 The loop is software-pipelined: the loads for the next iteration are issued
 at the bottom of the loop, so after the last iteration 8 luma bytes and
 4 bytes of each chroma array beyond the processed samples are read (the
 arrays are declared large enough for that).

 MSVC inline assembly, MMX; the MMX state is released with emms on exit.
*/
static void CalcPixels(void *pixels)
{
	__asm
	{
		mov			esi , offset scanLuma

   		mov			ebx , offset scanCbU
		mov			edx , offset scanCrV

		mov			edi , [pixels]		// display pointer
		sub			edx , ebx			// chroma pointer U->V offset

		movd		mm1 , [ebx]			// load 4 samples U (blue)
		pxor		mm3 , mm3			// empty unpack (0) register

		movd		mm2 , [ebx][edx]	// load 4 samples V (red)
		punpcklbw	mm1 , mm3			// unpack U samples into words

		movq		mm0 , [esi]			// load 8 samples Y (luminance)
		punpcklbw	mm2 , mm3			// unpack V samples into words

		movq		mm4 , mm1			// copy of U for CgU calculation
		movq		mm7 , mm0			// copy of luminance samples for samples 4..7

		mov			ecx , (decWidth /8)	// load 8-pixel counter

__pixel32:
		pmullw		mm1 , [csc_CbU]		// CbU =  2.031*U
		movq		mm6 , mm2			// copy of V for CgV calculation

		pmullw		mm2 , [csc_CrV]		// CrV =  1.141*V
		punpcklbw	mm0 , mm3			// unpack luminance samples 0..3

		pmullw		mm4 , [csc_CgU]		// CgU =  0.391*U (subtracted from green below)
		punpckhbw	mm7 , mm3			// unpack luminance samples 4..7

		pmullw		mm6 , [csc_CgV]		// CgV =  0.578*V (subtracted from green below)
		movq		mm3 , mm1			// copy of CbU for samples 4..7

		pmullw		mm0 , [csc_LcY]		// Y0 *= 1.164
		movq		mm5 , mm2           // copy of CrV for samples 4..7

		pmullw		mm7 , [csc_LcY]		// Y1 *= 1.164
		punpcklwd	mm1 , mm1			// unpack CbU for samples 0..3

		paddusw		mm4 , mm6			// CgUV = CgU + CgV
		punpckhwd	mm3 , mm3			// unpack CbU for samples 4..7

		movq		mm6 , mm4			// copy of CgUV for samples 4..7
		paddusw		mm1 , mm0			// B0 = Y0 + CbU0

		psubusw		mm1 , [csc_subB]	// clip blue samples 0..3 to MIN_VALUE
		punpcklwd	mm2 , mm2			// unpack CrV samples 0..3

		punpckhwd	mm5 , mm5			// unpack CrV samples 4..7
		paddusw		mm3 , mm7			// B1 = Y1 + CbU1

		psubusw		mm3 , [csc_subB]	// clip blue samples 4..7 to MIN_VALUE
		punpcklwd	mm4 , mm4			// unpack CgUV samples 0..3

		punpckhwd	mm6 , mm6			// unpack CgUV samples 4..7
		paddusw		mm2 , mm0			// R0 = Y0 + CrV0

		psubusw		mm2 , [csc_subR]	// clip red samples 0..3 to MIN_VALUE
		psrlw		mm1 , 6				// position upper 8 bits of blue samples 0..3

		paddusw		mm5 , mm7			// R1 = Y1 + CrV1
		psrlw		mm3 , 6				// position upper 8 bits of blue samples 4..7

		psubusw		mm5 , [csc_subR]	// clip red samples 4..7 to MIN_VALUE
		psrlw		mm2 , 6				// position upper 8 bits of red samples 0..3

		paddusw		mm0 , [csc_addG]	// clip green samples 0..3 to MAX_VALUE
		psrlw		mm5 , 6				// position upper 8 bits of red samples 4..7

		paddusw		mm7 , [csc_addG]	// clip green samples 4..7 to MAX_VALUE
		packuswb	mm1 , mm3			// pack blue samples 0..7 into UBYTE's

		psubusw		mm0 , mm4			// G0 = Y0 - CgUV0
		packuswb	mm2 , mm5			// pack red samples 0..7 into UBYTE's

		psubusw		mm7 , mm6			// G1 = Y1 - CgUV1
		psrlw		mm0 , 6				// position upper 8 bits of green samples 0..3

		psrlw		mm7 , 6				// position upper 8 bits of green samples 4..7
		movq		mm4 , mm1			// copy for blue samples 4..7

		packuswb	mm0 , mm7			// pack green samples 0..7 into UBYTE's
		pxor		mm7 , mm7			// empty unpack (0) register

		movq		mm3 , mm0			// copy for green samples 4..7
		punpcklbw	mm1 , mm2			// unpack {blue, red} samples 0..3

		punpckhbw	mm4 , mm2			// unpack {blue, red} samples 4..7
		movq		mm2 , mm1			// copy for {blue, red} samples 2..3

		movq		mm5 , mm4			// copy for {blue, red} samples 6..7
		punpcklbw	mm0 , mm7			// unpack green samples 0..3 into WORD's

		add			edi , 32			// increase display pointer
		punpckhbw	mm3 , mm7			// unpack green samples 4..7 into WORD's

		punpcklbw	mm1 , mm0			// unpack {blue, green, red, 0} samples 0..1
		add			esi , 8				// increase luma pointer

		punpckhbw   mm2 , mm0           // unpack {blue, green, red, 0} samples 2..3
		add			ebx , 4				// increase chroma pointer

		movq		[edi][-32] , mm1	// store pixels 0..1
		punpcklbw	mm4 , mm3			// unpack {blue, green, red, 0} samples 4..5

		movq		[edi][-24] , mm2	// store pixels 2..3
		punpckhbw	mm5 , mm3			// unpack {blue, green, red, 0} samples 6..7

		movd		mm1 , [ebx]			// load 4 samples U (blue)
		pxor		mm3 , mm3			// empty unpack (0) register

		movd		mm2 , [ebx][edx]	// load 4 samples V (red)
		punpcklbw	mm1 , mm3			// unpack U samples into words

		movq		mm0 , [esi]			// load 8 samples Y (luminance)
		punpcklbw	mm2 , mm3			// unpack V samples into words

		movq		[edi][-16] , mm4	// store pixels 4..5
		movq		mm4 , mm1			// copy of U for CgU calculation

		movq		[edi][-8] , mm5		// store pixels 6..7
		movq		mm7 , mm0			// copy of luminance samples for samples 4..7

		dec			ecx					// decrease 8-pixel counter
		jnz			__pixel32			// do {...} while (counter > 0);

		emms
	}
}



static void SetPixels(int y)
{
	RGBTRIPLE pixels[decWidth];

	static unsigned long mypixels[decWidth];
	CalcPixels(mypixels);
	for (int t = 0; t < decWidth; t++)
	{
		pixels[t].rgbtRed = (unsigned char)((mypixels[t] >> 16) & 255);
	    pixels[t].rgbtGreen = (unsigned char)((mypixels[t] >> 8) & 255);
		pixels[t].rgbtBlue = (unsigned char)(mypixels[t] & 255);
	}

	drawPixels(0, y, pixels, decWidth);
}



/*
 Decode single scanline of Raw 8bit samples in PAL format to grayscale
 luma (Y) samples at half PAL (50%) resolution.
*/
/*
 Fill scanLuma with decWidth luma bytes computed from one raw scanline.

 src        : pointer into the raw sample line. Every group of 4 output
              samples consumes 20 input bytes; output sample k of a group is
              derived from the 8 bytes starting at offset 5*k.
 lladjust   : if non-zero, the brightness level is corrected first using the
              32 raw bytes located at src-192 .. src-161: roughly
              (384 - sum of those bytes) * contrast / 2048 is added.
 brightness : level added to every sample (only the low 16 bits are used).
 contrast   : multiplier (only the low 16 bits are used).

 Per output sample:
     out = saturate_to_byte( ((sum of 8 bytes * contrast) >> 9) + level )

 The loop is software-pipelined; the loads for the following group are
 issued before the loop test, so a few bytes past the last processed group
 are read from the input.

 MSVC inline assembly, MMX; the MMX state is released with emms on exit.
*/
static void GetLumaSamples( void *src, int lladjust, int brightness, int contrast )
{
	__int64 level64;		// brightness level replicated into 4 WORDs (written by the asm)
	__int64 contrast64;		// contrast replicated into 4 WORDs (written by the asm)
	static __int64 fw384 = MAKEQW( 384 /4 );

	__asm
	{
		movd		mm6 , [contrast]	// load contrast
		pxor		mm7 , mm7			// empty unpack (0) register

		mov			esi , [src]			// load input pointer
		mov			edi , offset scanLuma// load output pointer

		movd		mm5 , [brightness]	// load brightness level
		punpcklwd	mm6 , mm6			// unpack contrast into WORD's

		movq		mm0 , [esi]			// L0: load samples 0..7
		punpckldq	mm6 , mm6			// unpack contrast into DWORD's

		mov			eax , [lladjust]	// load scan adjustment BOOL
		punpcklwd	mm5 , mm5			// unpack brightness level into WORD's

		or			eax , eax			// luma level scan adjustment ?
		jz			__adjusted			// skip luma adjustment if false


	//////////////////////////////////////////
	//	ADJUST BRIGHTNESS TO COLOR BURST	//
	//										//
	//	{mm1, mm2, mm3, mm4} are free		//
	//	{mm6} is contrast					//
	//	{mm5} is brightness					//
	//////////////////////////////////////////
		movq		mm1 , [esi][-168]	// load luma level samples 0..7
		; slot

		movq		mm2 , mm1			// copy for samples 4..7
		punpcklbw	mm1 , mm7			// unpack samples 0..3 into WORD's

		movq		mm3 , [esi][-176]	// load luma level samples 8..15
		punpckhbw	mm2 , mm7			// unpack samples 4..7 into WORD's

		paddw		mm1 , mm2			// add samples --> 0..3
		movq		mm4 , mm3			// copy for samples 12..15

		movq		mm2 , [esi][-184]	// load luma level samples 16..23
		punpcklbw	mm3 , mm7			// unpack samples 8..11 into WORD's

		paddw		mm1 , mm3			// add samples --> 0..3
		punpckhbw	mm4 , mm7			// unpack samples 12..15 into WORD's

		movq		mm3 , mm2			// copy for samples 20..23
		punpcklbw	mm2 , mm7			// unpack samples 16..19 into WORD's

		punpckhbw	mm3 , mm7			// unpack samples 20..23 into WORD's
		paddw		mm1 , mm4			// add samples --> 0..3

		movq		mm4 , [esi][-192]	// load luma level samples 24..31
		paddw		mm1 , mm2			// add samples --> 0..3

		movq		mm2 , mm4			// copy for samples 28..31
		punpcklbw	mm4 , mm7			// unpack samples 24..27 into WORD's

		paddw		mm1 , mm3			// add samples --> 0..3
		punpckhbw	mm2 , mm7			// unpack samples 28..31 into WORD's

		movq		mm3 , [fw384]		// load constant '384' (4 x WORD)
		paddw		mm1 , mm4			// add samples --> 0..3

		// 11CLK's (non-pairing instructions)
		paddw		mm1 , mm2			// add samples --> 0..3
		psubw		mm3 , mm1			//  (384 - cbluma32)
		pmaddwd		mm3 , mm6			//  (384 - cbluma32) * contrast
		psrad		mm3 , 11			// ((384 - cbluma32) * contrast) >> 11
		punpckldq	mm1 , mm3			// put low part of mm3 in high part of mm1
		paddd		mm3 , mm1			// add low part to high part
		punpckhdq	mm3 , mm3			// copy luminance adjustment as DWORD's
		packssdw	mm3 , mm3			// pack luminance adjustment into WORD's
		paddw		mm5 , mm3			// adjust brightness to color burst


	//////////////////////////////////////////
	//			FILTER LUMINANCE DATA		//
	//////////////////////////////////////////
__adjusted:
		movq		[contrast64] , mm6	//#store contrast (4 x WORD)
		punpckldq	mm5 , mm5			//#unpack brightness level into DWORD's

		movq		[level64] , mm5		//#store brightness level (4 x WORD)
		movq		mm1 , mm0			// L0: copy for samples 4..7

		mov			ecx , (decWidth /4)	// load 4-sample counter
		punpcklbw	mm0 , mm7			// L0: unpack samples 0..3

		movq		mm2 , [esi][5]		// L1: load samples 0..7
		punpckhbw	mm1 , mm7			// L0: unpack samples 4..7

		paddw		mm0 , mm1			// L0: add samples --> 0..3
		movq		mm3 , mm2			// L1: copy for samples 4..7

		punpcklbw	mm2 , mm7			// L1: unpack samples 0..3
		movq		mm1 , mm0			// L01: copy of samples L0

		movq		mm4 , [esi][10]		// L2: load samples 0..7
		punpckhbw	mm3 , mm7			// L1: unpack samples 4..7

__pixel:
		paddw		mm2 , mm3			// L1: add samples --> 0..3
		movq		mm5 , mm4			// L2: copy for samples 4..7

		movq		mm6 , [esi][15]		// L3: load samples 0..7
		punpckldq	mm0 , mm2			// L01: unpack samples 0..1 of L0, L1

		punpckhdq	mm1 , mm2			// L01: unpack samples 2..3 of L0, L1
		movq		mm3 , mm6			// L3: copy for samples 4..7

		paddw		mm0 , mm1			// L01: add samples --> 0..1
		punpcklbw	mm4 , mm7			// L2: unpack samples 0..3

		pmaddwd		mm0 , [contrast64]	// L01: mac samples --> DWORD
		punpckhbw	mm5 , mm7			// L2: unpack samples 4..7

		paddw		mm4 , mm5			// L2: add samples --> 0..3
		punpcklbw	mm6 , mm7			// L3: unpack samples 0..3

		movq		mm5 , mm4			// L23: copy of samples L2
		punpckhbw	mm3 , mm7			// L3: unpack samples 4..7

		psrad		mm0 , 9				// L01: scale samples to normal level
		paddw		mm6 , mm3			// L3: add samples --> 0..3

		punpckldq	mm4 , mm6			// L23: unpack samples 0..1 of L2, L3
		add			esi , 20			// increase input pointer

		punpckhdq	mm5 , mm6			// L23: unpack samples 2..3 of L2, L3
		movq		mm6 , mm0			// L01: free MMX register mm0

		movq		mm2 , [esi][5]		//L1: load samples 0..7
		paddw		mm4 , mm5			// L23: add samples --> 0..1

		pmaddwd		mm4 , [contrast64]	// L23: mac samples --> DWORD
		movq		mm3 , mm2			//L1: copy for samples 4..7

		movq		mm0 , [esi]			//L0: load samples 0..7
		punpcklbw	mm2 , mm7			//L1: unpack samples 0..3

		movq		mm1 , mm0			//L0: copy for samples 4..7
		punpcklbw	mm0 , mm7			//L0: unpack samples 0..3

		psrad		mm4 , 9				// L23: scale samples to normal level
		add			edi , 4				// increase output pointer

		packssdw	mm6 , mm4			// L0123: pack samples into WORD's
		; slot

		paddw		mm6 , [level64]		// L0123: add luminance level
		punpckhbw	mm1 , mm7			//L0: unpack samples 4..7

		movq		mm4 , [esi][10]		//L2: load samples 0..7
		packuswb	mm6 , mm6			// L0123: pack samples into UBYTE's

		paddw		mm0 , mm1			//L0: add samples --> 0..3
		punpckhbw	mm3 , mm7			//L1: unpack samples 4..7

		movd		[edi][-4] , mm6		// L0123: store output samples 0..3
		movq		mm1 , mm0			//L01: copy of samples L0

		dec			ecx					// decrease 4-sample counter
		jnz			__pixel				// do {...} while (counter > 0);

		emms
	}
}


/*
 Decode single scanline of Raw 8bit samples in PAL format to chroma (U,V)
 samples at half resolution.
*/
/*
 Fill scanCbU / scanCrV with decWidth/2 chroma samples computed from a pair
 of raw scanlines.

 src         : pointer into the first raw line ("C0"); the second line
               ("C1") is read at src + rawWidth. Each output sample consumes
               10 input bytes and filters the 16 bytes at [-7 .. +8] around
               the current position on both lines.
 cbPhase     : 32-bit phase accumulator for line C0; its top 10 bits
               (phase >> 22) index the cosin64 table.
 cbHF        : value added to the phase after every output sample.
 cbVF        : value added to the phase to obtain the table index for
               line C1.
 saturation  : multiplier applied to the U result.
 vsaturation : multiplier for the V result; it is bitwise complemented
               before use.

 Per output sample: both lines are filtered with filt_cos / filt_sin and
 multiplied with their cosin64 entry to obtain a {u, v} DWORD pair; the v
 value of line C0 is bitwise complemented; the two pairs are added. The sum
 is stored in prevScan, and the value that the previous call stored at the
 same position is added to it before scaling by the saturation values,
 shifting right by 7 and packing to signed bytes. XOR with 0x8080 converts
 both bytes to unsigned before U is stored to scanCbU and V to scanCrV.

 The loop is software-pipelined; the first part of the filter for the next
 sample is computed at the bottom of the loop, so a few bytes past the last
 processed sample are read from the input.

 MSVC inline assembly, MMX; the MMX state is released with emms on exit.
*/
static void GetChromaSamples( unsigned char src[],
							  unsigned int cbPhase, unsigned int cbHF, unsigned int cbVF,
							  int saturation, int vsaturation )
{
	__int64 invertv64, saturation64;	// constants built by the asm prologue
	__int64 *prev = prevScan;			// running pointer into prevScan, advanced by the asm

	__asm
	{
		movd		mm0 , [saturation]
		pcmpeqd		mm1 , mm1

		mov			esi , [src]			// load input pointer
		psllq		mm1 , 32

		punpckldq	mm0 , [vsaturation]
		; slot

		movq		[invertv64] , mm1
		pxor		mm0 , mm1

		mov			edx , [cbPhase]		// load color burst phase
		pslld		mm0 , 16

		mov			ebx , offset scanCbU// load chrominance U pointer
		mov			edi , offset scanCrV// load chrominance V pointer

		movq		[saturation64] , mm0
		; slot


	//////////////////////////////////////////
	//	DECODE CHROMINANCE COMPONENT OF PAL	//
	//										//
	//	{esi} is source, {edx} is cbPhase	//
	//	{ebx} is CbU[], {edi} is CrV[]		//
	//////////////////////////////////////////
		mov			ecx , (decWidth /2)	// load 2-pixel counter

		movq		mm0 , [esi][-7]		// C0: load samples 0..7
		pxor		mm7 , mm7			// empty unpack (0) register

		movq		mm2 , [esi][1]		// C0: load samples 8..15
		movq		mm1 , mm0			// C0: copy for samples 4..7

		movq		mm3 , mm2			// C0: copy for samples 12..15
		punpcklbw	mm0 , mm7			// C0: unpack samples 0..3 into WORD's

		movq		mm4 , mm0			// C0: copy of samples 0..3 for "sine" filter
		punpckhbw	mm1 , mm7			// C0: unpack samples 4..7 into WORD's

		pmaddwd		mm0 , [filt_cos]	// C0: "cosine" filter samples 0..3
		movq		mm5 , mm1			// C0: copy of samples 4..7 for "sine" filter

		pmaddwd		mm1 , [filt_cos][8]	// C0: "cosine" filter samples 4..7
		punpcklbw	mm2 , mm7			// C0: unpack samples 8..11

		pmaddwd		mm4 , [filt_sin]	// C0: "sine" filter samples 0..3
		movq		mm6 , mm2			// C0: copy of samples 8..11 for "sine" filter

__pixel:
		pmaddwd		mm2 , [filt_cos][16]// C0: "cosine" filter samples 8..11
		punpckhbw	mm3 , mm7			// C0: unpack samples 12..15

		pmaddwd		mm5 , [filt_sin][8]	// C0: "sine" filter samples 4..7
		paddd		mm0 , mm1			// C0: add "cosine" filtered samples 0..7

		pmaddwd		mm6 , [filt_sin][16]// C0: "sine" filter samples 8..11
		movq		mm1 , mm3			// C0: copy of samples 12..15 for "sine" filter

		pmaddwd		mm3 , [filt_cos][24]// C0: "cosine" filter samples 12..15
		paddd		mm0 , mm2			// C0: add "cosine" filtered samples 0..11

		pmaddwd		mm1 , [filt_sin][24]// C0: "sine" filter samples 12..15
		paddd		mm4 , mm5			// C0: add "sine" filtered samples 0..7

		paddd		mm4 , mm6			// C0: add "sine" filtered samples 0..11
		mov			eax , edx			// copy of color burst phase

		paddd		mm0 , mm3			// C0: add "cosine" filtered samples 0..15
		shr			eax , 22			// (cb >> 22) ... index into (co)sine table

		paddd		mm4 , mm1			// C0: add "sine" filtered samples 0..15
		punpckldq	mm3 , mm0			// C0: copy low part "cosine" into high part mm3

		punpckldq	mm1 , mm4			// C0: copy low part "sine" into high part mm1
		paddd		mm3 , mm0			// C0: "cosine" filtered data in high DWORD

		movq		mm0 , [rp][esi][-7]	// C1: load samples 0..7
		paddd		mm1 , mm4			// C0: "sine" filtered data in high DWORD

		movq		mm2 , [rp][esi][1]	// C1: load samples 8..15
		punpckhdq	mm3 , mm1			// C0: low DWORD={cosine}, high DWORD={sine}

		psrad		mm3 , 12			// C0: scale (co)sine filtered data
		movq		mm1 , mm0			// C1: copy for samples 4..7

		packssdw	mm3 , mm3			// C0: pack co(sine) filtered data into WORD's
		movq		mm6 , mm2			// C1: copy for samples 12..15

		pmaddwd		mm3 , [cosin64][8*eax]// C0: low DWORD={u}, high DWORD={v}
		punpcklbw	mm0 , mm7			// C1: unpack samples 0..3 into WORD's

		movq		mm4 , mm0			// C1: copy of samples 0..3 for "sine" filter
		punpckhbw	mm1 , mm7			// C1: unpack samples 4..7 into WORD's

		pmaddwd		mm0 , [filt_cos]	// C1: "cosine" filter samples 0..3
		movq		mm5 , mm1			// C1: copy of samples 4..7 for "sine" filter

		pmaddwd		mm1 , [filt_cos][8]	// C1: "cosine" filter samples 4..7
		punpcklbw	mm2 , mm7			// C1: unpack samples 8..11

		pmaddwd		mm4 , [filt_sin]	// C1: "sine" filter samples 0..3
		punpckhbw	mm6 , mm7			// C1: unpack samples 12..15

		pmaddwd		mm5 , [filt_sin][8]	// C1: "sine" filter samples 4..7
		movq		mm7 , mm2			// C1: copy of samples 8..11 for "sine" filter

		pmaddwd		mm2 , [filt_cos][16]// C1: "cosine" filter samples 8..11
		paddd		mm0 , mm1			// C1: add "cosine" filtered samples 0..7

		pmaddwd		mm7 , [filt_sin][16]// C1: "sine" filter samples 8..11
		movq		mm1 , mm6			// C1: copy of samples 12..15 for "sine" filter

		pmaddwd		mm6 , [filt_cos][24]// C1: "cosine" filter samples 12..15
		paddd		mm0 , mm2			// C1: add "cosine" filtered samples 0..11

		pmaddwd		mm1 , [filt_sin][24]// C1: "sine" filter samples 12..15
		paddd		mm4 , mm5			// C1: add "sine" filtered samples 0..7

		paddd		mm4 , mm7			// C1: add "sine" filtered samples 0..11
		mov			eax , [cbVF]		// load color burst vertical stepsize

		paddd		mm0 , mm6			// C1: add "cosine" filtered samples 0..15
		add			eax , edx			// C1: phase for line C1 = phase + cbVF

		paddd		mm4 , mm1			// C1: add "sine" filtered samples 0..15
		shr			eax , 22			// (cb >> 22) ... index into (co)sine table

		pxor		mm3 , [invertv64]	// C0: invert chrominace "V" (feature of PAL)
		punpckldq	mm5 , mm0			// C1: copy low part "cosine" into high part mm5

		punpckldq	mm7 , mm4			// C1: copy low part "sine" into high part mm7
		paddd		mm5 , mm0			// C1: "cosine" filtered data in high DWORD

		paddd		mm7 , mm4			// C1: "sine" filtered data in high DWORD
		inc			ebx					// increase chrominance "U" pointer

		movq		mm6 , [cosin64][8*eax]// load (co)sine table entry
		punpckhdq	mm5 , mm7			// C1: low DWORD={cosine}, high DWORD={sine}

		mov			eax , [prev]		// previous scanline chrominance pointer
		psrad		mm5 , 12			// C1: scale (co)sine filtered data

		inc			edi					// increase chrominance "V" pointer
		packssdw	mm5 , mm5			// C1: pack co(sine) filtered data into WORD's

		pmaddwd		mm5 , mm6			// C1: low DWORD={u}, high DWORD={v}
		movq		mm6 , [eax]			// load previous scanline chrominance samples

		add			eax , 8				// increase previous chrominance pointer
		add			esi , 10			// increase input pointer

		pxor		mm7 , mm7			//empty unpack (0) register
		mov			[prev] , eax		// store previous chrominance pointer

		movq		mm0 , [esi][-7]		//C0: load samples 0..7
		paddd		mm5 , mm3			// C01: low DWORD={u}, high DWORD={v}

		movq		mm2 , [esi][1]		//C0: load samples 8..15
		paddd		mm6 , mm5			// average chrominance scanlines (PAL feature)

		pmaddwd		mm6 , [saturation64]// adjust saturation levels
		movq		mm1 , mm0			//C0: copy for samples 4..7

		movq		[eax][-8] , mm5		// store current scanline chrominance samples
		movq		mm3 , mm2			//C0: copy for samples 12..15

		mov			eax , [cbHF]		// load color burst horizontal stepsize
		punpcklbw	mm0 , mm7			//C0: unpack samples 0..3 into WORD's

		psrad		mm6 , 7				// scale chroma output samples
		add			edx , eax			// increment color burst phase (H)

		movq		mm4 , mm0			//C0: copy of samples 0..3 for "sine" filter
		packssdw	mm6 , mm6			// pack chroma output samples into WORD's

		pmaddwd		mm0 , [filt_cos]	//C0: "cosine" filter samples 0..3
		packsswb	mm6 , mm6			// pack chroma output samples into BYTE's

		pmaddwd		mm4 , [filt_sin]	//C0: "sine" filter samples 0..3
		punpckhbw	mm1 , mm7			//C0: unpack samples 4..7 into WORD's

		movd		eax , mm6			// copy of chroma output samples
		movq		mm5 , mm1			//C0: copy of samples 4..7 for "sine" filter

		pmaddwd		mm1 , [filt_cos][8]	//C0: "cosine" filter samples 4..7
		punpcklbw	mm2 , mm7			//C0: unpack samples 8..11

		xor			eax , 8080H			// chroma signed to unsigned conversion
		movq		mm6 , mm2			//C0: copy of samples 8..11 for "sine" filter

		mov			[ebx][-1] , al		// store sample U (blue)
		mov			[edi][-1] , ah		// store sample V (red)

		dec			ecx					// decrease chrominance sample counter
		jnz			__pixel				// do {...} while (counter > 0);

		emms
	}
}


/// Todo :  optimize this (MMX) ...
/*
 Get color burst information
*/
static void GetColorBurstData( unsigned char *src, unsigned int *cbPhase,
							   unsigned int *cbHF, unsigned int *cbVF, int* cbOdd )
{
	static int cbPhases[decHeight];
	double f_amp;
	unsigned int c0, c1, s0, s1, p0, p1, p2, p3, f_cb;
	int i, v, x, y, f_cos, f_sin;


 	// determine phase offsets of colorburst
	for (y=0; y < decHeight; y++)
	{
		for (x=0, f_cos=0, f_sin=0, f_cb=0; x < 72; x++)
		{
			i      = *src++;
			f_cos += cosint[ f_cb               >> 22] * i;
			f_sin += cosint[(f_cb - 0x40000000) >> 22] * i;
			f_cb  += FcHF0;
		}
		src  += (rawWidth - 72);

		double div = sqrt( (double) f_cos*f_cos + (double) f_sin*f_sin );
		if (div != 0)
			f_amp = 1 / div;
		else
      		f_amp = 10000;

		c1 = (unsigned int) (acos( f_cos * f_amp ) * divPi);
		s0 = (unsigned int) (asin( f_sin * f_amp ) * divPi) + 0x80000000;
		c0 = ~c1;
		s1 = ~s0 + 0x80000000;
		p0 = abs(c0 - s0);  p1 = abs(c0 - s1);
		p2 = abs(c1 - s0);  p3 = abs(c1 - s1);

		if (p0 < p2 && p0 < p3 || p1 < p2 && p1 < p3)  c1 = c0;
		if (p0 < p1 && p0 < p3 || p2 < p1 && p2 < p3)  s1 = s0;

		cbPhases[y] = (s1 >> 1) + (c1 >> 1);
	}

	// Vc polarity detection  (even/odd frame)
	for (i=2, p0=0, p1=0; i < (decHeight - 2); i += 2)
	{
		p0 += abs( cbPhases[i] - cbPhases[i - 1] ) >> 8;
		p1 += abs( cbPhases[i + 1] - cbPhases[i] ) >> 8;
	}
	*cbOdd = (p0 < p1)?1:0;

	// sample rate correction (color burst)
	for (i=8, v=0; i < (decHeight /2); i++)
	{
		v += (cbPhases[(decHeight /2) + i] - cbPhases[i]) >> 8;
	}
	v			= ((v /(decHeight /2)) << 8) /((decHeight /2) - 8);
	*cbHF		= v /rawWidth +  FcHF0;
	*cbVF		= v           +  FcVF0;
	i			= cbPhases[0] +  200 * (*cbHF) - 0x60000000L;
	i		   -= (*cbOdd ? 0 : -0x40000000);
	*cbPhase	= i - *cbHF - (*cbHF >> 1);
}
