/*
 *	PAL software decoder (MMX optimized!!!)
 *	includes Y/C separation and YCrCb to RGB conversion
 *
 *	Copyright (C) 1999, Ewald Snel
 *
 *WWW		http://esnel.op.het.net/
 *e-mail	esnel@cistron.nl
 */

#include <stdlib.h>
#include <math.h>
#include "ccd.h"
#include "ronnyvideo.h"
#include "esnelDecode.h"
#include "esnelmDecode.h"


/*
 * Constant built from (384 / 4) through MAKEQW (macro defined outside this
 * file; presumably replicates a 16-bit value into all four word lanes of a
 * quadword -- TODO confirm).
 * NOTE(review): GetLumaSamplesMMX declares a function-local static with the
 * same name, so this file-scope object is shadowed inside that function.
 */
int64 fw384 = MAKEQW( 384 / 4 );

/*
 * Scanline work buffers shared between the decode stages:
 *   scanLuma          written by GetLumaSamplesMMX, read by CalcPixelsMMX
 *   scanCbU, scanCrV  written by GetChromaSamplesMMX, read by CalcPixelsMMX
 *   cosint            cosine table filled by InitializeDecoderMMX and read by
 *                     GetColorBurstDataMMX (indexed with a 10-bit phase)
 * NOTE(review): the asm loops are driven by decWidth, not by these array
 * sizes -- assumes decWidth <= 384; TODO confirm.
 */
unsigned char scanLuma[384];
char scanCbU[192], scanCrV[192];
int cosint[1024];

/*
 * cosin64   one quadword per phase step, filled by InitializeDecoderMMX with
 *           the words {sin, cos, cos, ~sin}; used as a pmaddwd operand in
 *           GetChromaSamplesMMX
 * prevScan  chroma values of the previously decoded scanline, read and
 *           rewritten by GetChromaSamplesMMX
 */
int64 cosin64[1024], prevScan[384];

/*
 * Color space conversion constants for CalcPixelsMMX (see the conversion
 * formulas documented above that function). FIX64 is defined outside this
 * file; presumably it converts a real coefficient to the fixed-point scale
 * used by the MMX code -- TODO confirm.
 * Referenced by the visible code: csc_CbU, csc_CrV, csc_CgU, csc_CgV,
 * csc_LcY, csc_subR, csc_addG, csc_subB.
 * NOTE(review): csc_clpR/G/B and csc_lvlR/G/B are not referenced anywhere in
 * the visible part of this file.
 */
int64 csc_CbU	= MAKEQW( FIX64( 2.031 ) );
int64 csc_CrV	= MAKEQW( FIX64( 1.141 ) );
int64 csc_CgU	= MAKEQW( FIX64( 0.391 ) );
int64 csc_CgV	= MAKEQW( FIX64( 0.578 ) );
int64 csc_LcY	= MAKEQW( FIX64( 1.164 ) );
int64 csc_clpR	= MAKEQW( ( 49152 - 16 * FIX64( 1.164 ) - 128 * FIX64( 1.141 ) ) );
int64 csc_clpG	= MAKEQW( ( 16384 - 16 * FIX64( 1.164 ) + 128 * ( FIX64( 0.391 ) + FIX64( 0.578 ) ) ) );
int64 csc_clpB	= MAKEQW( ( 49152 - 16 * FIX64( 1.164 ) - 128 * FIX64( 2.031 ) ) );
int64 csc_lvlR	= MAKEQW( 49152 );
int64 csc_lvlG	= MAKEQW( 16384 );
int64 csc_lvlB	= MAKEQW( 49152 );
int64 csc_subR	= MAKEQW( ( 16 * FIX64( 1.164 ) + 128 * FIX64( 1.141 ) ) );
int64 csc_addG	= MAKEQW( ( -16 * FIX64( 1.164 ) + 128 * ( FIX64( 0.391 ) + FIX64( 0.578 ) ) ) );
int64 csc_subB	= MAKEQW( ( 16*FIX64( 1.164 ) + 128* FIX64( 2.031 ) ) );

/* 16-tap "cosine" filter kernel applied with pmaddwd in GetChromaSamplesMMX */
short filt_cos[] = { 141, 199, -282, -1020, -457, 1078, 1559, 704, -704, -1559, -1078, 457, 1020, 282, -199, -141 };

/* 16-tap "sine" filter kernel applied with pmaddwd in GetChromaSamplesMMX */
short filt_sin[] = { -340, 199, 962, 821, -2182, -1525, 364, 1700, 1700, 364, -1525, -2182, 821, 962, 199, -340 };



/*
 * Decode next field of PAL data
 */
void decodeFrameEsnelColorMMX(unsigned char *src, int offset, int pitch)
{
    static unsigned int cbPhase, cbHF, cbVF;
    static int i, y, vsaturation, cbOdd;
    static int pitch_local;

    pitch_local = pitch;

    if (!isInitialized) {
	InitializeDecoderMMX();
    }

    src += offset;
    GetColorBurstDataMMX(src, &cbPhase, &cbHF, &cbVF, &cbOdd);

    src += getColorBurstWidth();

    cbHF *= 10;
    vsaturation = cbOdd ? -saturation : saturation;

    for (y = 0; y < decHeight; y += 2) {
	GetChromaSamplesMMX(src, cbPhase, cbHF, cbVF, saturation, vsaturation);

	for (i = 0; i < 2; i++) {
	  GetLumaSamplesMMX(src, adjust_luma_level, brightness, contrast);
	  SetPixelsMMX(y + i, pitch_local);
	  src += rawWidth;
	}

	cbPhase	+= 2 * cbVF;
    }
}


/*
 * Build the decoder lookup tables and mark the decoder as initialized.
 *
 * For each of the 1024 phase steps (one full turn, step = pi / 512):
 *   cosint[n]   receives the cosine, scaled to 16 bits
 *   cosin64[n]  receives the four 16-bit words {sin, cos, cos, ~sin}
 * ~sin equals (-sin - 1), so it cannot overflow the 16-bit range the way a
 * plain negation could.
 */
void InitializeDecoderMMX()
{
    int idx = 0;

    while (idx < 1024) {
	short cosv = (int) (0x7fffffff * cos(idx * M_PI / 512.0)) >> 16;
	short sinv = (int) (0x7fffffff * sin(idx * M_PI / 512.0)) >> 16;
	short *lanes = (short *) &cosin64[idx];	/* view the quadword as 4 words */

	lanes[0] = sinv;
	lanes[1] = cosv;
	lanes[2] = cosv;
	lanes[3] = ~sinv;

	cosint[idx] = cosv;
	idx++;
    }

    isInitialized = true;
}



/*
 * Convert the current scanline to RGB and hand it to drawPixels().
 *
 *   y      destination row, forwarded to drawPixels()
 *   pitch  forwarded to drawPixels()
 *
 * CalcPixelsMMX() produces decWidth 32-bit pixels; the shifts below read them
 * as 0x00RRGGBB. The first `offset` source pixels are skipped and the rest is
 * placed behind a 12-pixel black left margin.
 *
 * Every entry of pixels[] is written before drawPixels() reads it. The
 * entries behind the copied samples are blanked explicitly; without that,
 * the rightmost pixels of every row would be uninitialized stack contents.
 */
void SetPixelsMMX(int y, int pitch)
{
    RGBTRIPLE pixels[decWidth + 12];
    unsigned long mypixels[decWidth];
    const int margin = 12;
    const int offset = 29;
    int copied;
    int t;

    CalcPixelsMMX((unsigned char *) mypixels);

    /* black left margin */
    for (t = 0; t < margin; t++) {
	pixels[t].rgbtRed = 0;
	pixels[t].rgbtGreen = 0;
	pixels[t].rgbtBlue = 0;
    }

    /* number of converted pixels that remain after skipping `offset` */
    copied = decWidth - offset;
    if (copied < 0)
	copied = 0;

    for (t = 0; t < copied; t++) {
	pixels[t + margin].rgbtRed = (unsigned char) ((mypixels[t + offset] >> 16) & 255);
	pixels[t + margin].rgbtGreen = (unsigned char) ((mypixels[t + offset] >> 8) & 255);
	pixels[t + margin].rgbtBlue = (unsigned char) (mypixels[t + offset] & 255);
    }

    /* blank the tail that no source pixel maps to */
    for (t = margin + copied; t < decWidth + margin; t++) {
	pixels[t].rgbtRed = 0;
	pixels[t].rgbtGreen = 0;
	pixels[t].rgbtBlue = 0;
    }

/*    drawPixels(12, y, pixels + 30, decWidth - 58, pitch); */
    drawPixels(0, y, pixels, decWidth, pitch);
}



/*
 MMX optimized YUV422 picture output (PAL specific)

 Converts the scanline held in scanLuma / scanCbU / scanCrV to packed 32-bit
 pixels. Each loop iteration consumes 8 luma bytes and 4 bytes from each
 chroma buffer (every chroma sample is used for two pixels) and stores
 32 bytes, i.e. 8 pixels of {blue, green, red, 0} bytes. The loop runs
 decWidth / 8 times.

 pixels : destination buffer; (decWidth / 8) * 32 bytes are written.

 conversion coefficients for YUV to RGB color space :
	R = 1.164 * (Yc - 16)  +  1.141 * (Vc - 128)
	G = 1.164 * (Yc - 16)  -  0.391 * (Uc - 128)  -  0.578 * (Vc - 128)
	B = 1.164 * (Yc - 16)  +  2.031 * (Uc - 128)

 NOTE(review):
  - The routine is split over many asm statements that pass state to each
    other in esi/edi/ebx/ecx/edx and mm0..mm7; none of them declares a
    clobber list.
  - The loop loads the samples for the next iteration before it tests the
    counter, so after the last iteration 8 luma bytes and 4 bytes of each
    chroma buffer are read beyond the data that was converted.
  - The first three pointer loads use "m" operands that name the arrays
    themselves; confirm that the toolchain in use yields the array address in
    the register rather than the first bytes of the array.
  - __pixel32 is an ordinary assembler label; presumably the function must be
    emitted exactly once per object file -- confirm.
*/

void CalcPixelsMMX(unsigned char *pixels)
{
    __asm__ __volatile__ ("movl  %0, %%esi" : : "m" (scanLuma));

    __asm__ __volatile__ ("movl  %0, %%ebx" : : "m" (scanCbU));
    __asm__ __volatile__ ("movl  %0, %%edx" : : "m" (scanCrV));

    __asm__ __volatile__ ("movl  %0, %%edi" : : "m" (pixels));		/* display pointer */
    __asm__ __volatile__ ("subl  %ebx, %edx\n"				/* chroma pointer U->V offset */
			  
			  "movd  (%ebx), %mm1\n"			/* load 4 samples U (blue) */
			  "pxor  %mm3, %mm3\n"				/* empty unpack (0) register */

			  "movd  (%ebx, %edx), %mm2\n"			/* load 4 samples V (red) */
			  "punpcklbw  %mm3, %mm1\n"			/* unpack U samples into words */

			  "movq  (%esi), %mm0\n"			/* load 8 samples Y (luminance) */
			  "punpcklbw  %mm3, %mm2\n"			/* unpack V samples into words */

			  "movq  %mm1, %mm4\n"				/* copy of U for CgU calculation */
			  "movq  %mm0, %mm7");				/* copy of luminance samples for samples 4..7 */

    __asm__ __volatile__ ("movl  %0, %%ecx" : : "m" (decWidth / 8));	/* load 8-pixel counter */

    __asm__ __volatile__ ("__pixel32:\n"

			  "pmullw  %0, %%mm1" : : "m" (csc_CbU));	/* CbU =  2.031*U */
    __asm__ __volatile__ ("movq  %mm2, %mm6");				/* copy of V for CgV calculation */

    __asm__ __volatile__ ("pmullw  %0, %%mm2" : : "m" (csc_CrV));	/* CrV =  1.141*V */
    __asm__ __volatile__ ("punpcklbw  %mm3, %mm0");			/* unpack luminance samples 0..3 */

    __asm__ __volatile__ ("pmullw  %0, %%mm4" : : "m" (csc_CgU));	/* CgU = 0.391*U (subtracted from green further down) */
    __asm__ __volatile__ ("punpckhbw  %mm3, %mm7");			/* unpack luminance samples 4..7 */

    __asm__ __volatile__ ("pmullw  %0, %%mm6" : : "m" (csc_CgV));	/* CgV = 0.578*V (subtracted from green further down) */
    __asm__ __volatile__ ("movq  %mm1, %mm3");				/* copy of CbU for samples 4..7 */

    __asm__ __volatile__ ("pmullw  %0, %%mm0" : : "m" (csc_LcY));	/* Y0 *= 1.164 */
    __asm__ __volatile__ ("movq  %mm2, %mm5");				/* copy of CrV for samples 4..7 */

    __asm__ __volatile__ ("pmullw  %0, %%mm7" : : "m" (csc_LcY));			/* Y1 *= 1.164 */
    __asm__ __volatile__ ("punpcklwd  %mm1, %mm1\n"			/* unpack CbU for samples 0..3 */

			  "paddusw  %mm6, %mm4\n"			/* CgUV = CgU + CgV */
			  "punpckhwd  %mm3, %mm3\n"			/* unpack CbU for samples 4..7 */

			  "movq  %mm4, %mm6\n"				/* copy of CgUV for samples 4..7 */
			  "paddusw  %mm0, %mm1");			/* B0 = Y0 + CbU0 */

    __asm__ __volatile__ ("psubusw  %0, %%mm1" : : "m" (csc_subB));	/* subtract blue offset, saturating at 0 (samples 0..3) */
    __asm__ __volatile__ ("punpcklwd  %mm2, %mm2\n"			/* unpack CrV samples 0..3 */

			  "punpckhwd  %mm5, %mm5\n"			/* unpack CrV samples 4..7 */
			  "paddusw  %mm7, %mm3");			/* B1 = Y1 + CbU1 */

    __asm__ __volatile__ ("psubusw  %0, %%mm3" : : "m" (csc_subB));	/* subtract blue offset, saturating at 0 (samples 4..7) */
    __asm__ __volatile__ ("punpcklwd  %mm4, %mm4\n"			/* unpack CgUV samples 0..3 */

			  "punpckhwd  %mm6, %mm6\n"			/* unpack CgUV samples 4..7 */
			  "paddusw  %mm0, %mm2");			/* R0 = Y0 + CrV0 */

    __asm__ __volatile__ ("psubusw  %0, %%mm2" : : "m" (csc_subR));	/* subtract red offset, saturating at 0 (samples 0..3) */
    __asm__ __volatile__ ("psrlw  $6, %mm1\n"				/* position upper 8 bits of blue samples 0..3 */

			  "paddusw  %mm7, %mm5\n"			/* R1 = Y1 + CrV1 */
			  "psrlw  $6, %mm3\n");				/* position upper 8 bits of blue samples 4..7 */

    __asm__ __volatile__ ("psubusw  %0, %%mm5" : : "m" (csc_subR));	/* subtract red offset, saturating at 0 (samples 4..7) */
    __asm__ __volatile__ ("psrlw  $6, %mm2");				/* position upper 8 bits of red samples 0..3 */

    __asm__ __volatile__ ("paddusw  %0, %%mm0" : : "m" (csc_addG));	/* add green offset, saturating at the word maximum (samples 0..3) */
    __asm__ __volatile__ ("psrlw  $6, %mm5");				/* position upper 8 bits of red samples 4..7 */

    __asm__ __volatile__ ("paddusw  %0, %%mm7" : : "m" (csc_addG));	/* add green offset, saturating at the word maximum (samples 4..7) */
    __asm__ __volatile__ ("packuswb  %mm3, %mm1\n"			/* pack blue samples 0..7 into UBYTE's */

			  "psubusw  %mm4, %mm0\n"		       	/* G0 = Y0 - CgUV0 */
			  "packuswb %mm5, %mm2\n"			/* pack red samples 0..7 into UBYTE's */

			  "psubusw  %mm6, %mm7\n"			/* G1 = Y1 - CgUV1 */
			  "psrlw  $6, %mm0\n"				/* position upper 8 bits of green samples 0..3 */

			  "psrlw  $6, %mm7\n"				/* position upper 8 bits of green samples 4..7 */
			  "movq  %mm1, %mm4\n"				/* copy for blue samples 4..7 */

			  "packuswb  %mm7, %mm0\n"			/* pack green samples 0..7 into UBYTE's */
			  "pxor  %mm7, %mm7\n"				/* empty unpack (0) register */

			  "movq  %mm0, %mm3\n"				/* copy for green samples 4..7 */
			  "punpcklbw  %mm2, %mm1\n"			/* unpack {blue, red} samples 0..3 */

			  "punpckhbw  %mm2, %mm4\n"			/* unpack {blue, red} samples 4..7 */
			  "movq  %mm1, %mm2\n"				/* copy for {blue, red} samples 2..3 */

			  "movq  %mm4, %mm5\n"				/* copy for {blue, red} samples 6..7 */
			  "punpcklbw  %mm7, %mm0\n"			/* unpack green samples 0..3 into WORD's */

			  "addl  $32, %edi\n"				/* increase display pointer */
			  "punpckhbw  %mm7, %mm3\n");			/* unpack green samples 4..7 into WORD's */

    __asm__ __volatile__ ("punpcklbw  %mm0, %mm1\n"			/* unpack {blue, green, red, 0} samples 0..1 */
			  "addl  $8, %esi\n"				/* increase luma pointer */

			  "punpckhbw  %mm0, %mm2\n"			/* unpack {blue, green, red, 0} samples 2..3 */
			  "addl  $4, %ebx\n"				/* increase chroma pointer */

			  "movq  %mm1, -32(%edi)\n"			/* store pixels 0..1 */
			  "punpcklbw  %mm3, %mm4\n"			/* unpack {blue, green, red, 0} samples 4..5 */

			  "movq  %mm2, -24(%edi)\n"			/* store pixels 2..3 */
			  "punpckhbw  %mm3, %mm5\n"			/* unpack {blue, green, red, 0} samples 6..7 */

			  "movd  (%ebx), %mm1\n"			/* load 4 samples U (blue) for the next iteration */
			  "pxor  %mm3, %mm3\n"				/* empty unpack (0) register */

			  "movd  (%ebx, %edx), %mm2\n"			/* load 4 samples V (red) for the next iteration */
			  "punpcklbw  %mm3, %mm1\n"			/* unpack U samples into words */

			  "movq  (%esi), %mm0\n"			/* load 8 samples Y (luminance) for the next iteration */
			  "punpcklbw  %mm3, %mm2\n"			/* unpack V samples into words */

			  "movq  %mm4, -16(%edi)\n"			/* store pixels 4..5 */
			  "movq  %mm1, %mm4\n"				/* copy of U for CgU calculation */

			  "movq  %mm5, -8(%edi)\n"			/* store pixels 6..7 */
			  "movq  %mm0, %mm7\n"				/* copy of luminance samples for samples 4..7 */

			  "decl  %ecx\n"				/* decrease 8-pixel counter */
			  "jnz  __pixel32\n"				/* do {...} while (counter > 0); */

			  "emms");					/* leave MMX state so the FPU can be used again */
}



/*
 * Decode single scanline of Raw 8bit samples in PAL format to grayscale
 * luma (Y) samples at half PAL (50%) resolution.
 *
 *   src         raw samples of the scanline
 *   lladjust    when non-zero, the brightness level is first corrected using
 *               the 32 raw bytes at src-192 .. src-161
 *   brightness  level added to every output sample
 *   contrast    multiplier applied to every output sample
 *
 * Output goes to scanLuma. The loop runs decWidth / 4 times; each iteration
 * advances the input by 20 bytes and stores 4 output bytes. Every output
 * sample is the sum of 8 consecutive input bytes, multiplied by contrast,
 * shifted right by 9, offset by the brightness level and saturated to an
 * unsigned byte; consecutive output samples start 5 input bytes apart.
 *
 * NOTE(review):
 *  - The routine is split over many asm statements that pass state to each
 *    other in eax/ecx/esi/edi and mm0..mm7; none declares a clobber list.
 *  - The loop loads input for the next iteration before testing the counter,
 *    so it reads past the last input bytes that contribute to the output.
 *  - The output pointer is loaded through an "m" operand naming the scanLuma
 *    array itself; confirm the toolchain yields the array address there.
 *  - __adjusted and __pixel are ordinary assembler labels.
 */
void GetLumaSamplesMMX(unsigned char *src, int lladjust, int brightness, int contrast)
{
    int64 level64;		/* brightness level, spilled from mm5 for use inside the loop */
    int64 contrast64;		/* contrast, spilled from mm6 for use inside the loop */
    static int64 fw384 = MAKEQW(384 / 4);	/* shadows the file-scope fw384 */

    __asm__ __volatile__ ("movd  %0, %%mm6" : : "m" (contrast));	/* load contrast */
    __asm__ __volatile__ ("pxor  %mm7, %mm7");				/* empty unpack (0) register */

    __asm__ __volatile__ ("movl  %0, %%esi" : : "m" (src));		/* load input pointer */
    __asm__ __volatile__ ("movl  %0, %%edi" : : "m" (scanLuma));	/* load output pointer */

    __asm__ __volatile__ ("movd  %0, %%mm5" : : "m" (brightness));	/* load brightness level */
    __asm__ __volatile__ ("punpcklwd  %mm6, %mm6\n"			/* unpack contrast into WORD's */

			  "movq  (%esi), %mm0\n"			/* L0: load samples 0..7 */
			  "punpckldq  %mm6, %mm6");			/* unpack contrast into DWORD's */

    __asm__ __volatile__ ("movl  %0, %%eax" : : "m" (lladjust));	/* load scan adjustment BOOL */
    __asm__ __volatile__ ("punpcklwd  %mm5, %mm5\n"			/* unpack brightness level into WORD's */

			  "orl  %eax, %eax\n"				/* luma level scan adjustment ? */
			  "jz  __adjusted");				/* skip luma adjustment if false */


    /* ADJUST BRIGHTNESS TO COLOR BURST */
    			    
    /* {mm1, mm2, mm3, mm4} are free */
    /* {mm6} is contrast */
    /* {mm5} is brightness */
  
    __asm__ __volatile__ ("movq  -168(%esi), %mm1\n"			/* load luma level samples 0..7 */

			  "movq  %mm1, %mm2\n"				/* copy for samples 4..7 */
			  "punpcklbw  %mm7, %mm1\n"			/* unpack samples 0..3 into WORD's */

			  "movq  -176(%esi), %mm3\n"			/* load luma level samples 8..15 */
			  "punpckhbw  %mm7, %mm2\n"			/* unpack samples 4..7 into WORD's */

			  "paddw  %mm2, %mm1\n"				/* add samples --> 0..3 */
			  "movq  %mm3, %mm4\n"				/* copy for samples 12..15 */

			  "movq  -184(%esi), %mm2\n"			/* load luma level samples 16..23 */
			  "punpcklbw  %mm7, %mm3\n"			/* unpack samples 8..11 into WORD's */

			  "paddw  %mm3, %mm1\n"				/* add samples --> 0..3 */
			  "punpckhbw  %mm7, %mm4\n"			/* unpack samples 12..15 into WORD's */

			  "movq  %mm2, %mm3\n"				/* copy for samples 20..23 */
			  "punpcklbw %mm7, %mm2\n"			/* unpack samples 16..19 into WORD's */

			  "punpckhbw  %mm7, %mm3\n"			/* unpack samples 20..23 into WORD's */
			  "paddw  %mm4, %mm1\n"				/* add samples --> 0..3 */

			  "movq  -192(%esi), %mm4\n"			/* load luma level samples 24..31 */
			  "paddw  %mm2, %mm1\n"				/* add samples --> 0..3 */

			  "movq  %mm4, %mm2\n"				/* copy for samples 28..31 */
			  "punpcklbw  %mm7, %mm4\n"			/* unpack samples 24..27 into WORD's */

			  "paddw  %mm3, %mm1\n"				/* add samples --> 0..3 */
			  "punpckhbw %mm7, %mm2");			/* unpack samples 28..31 into WORD's */

    __asm__ __volatile__ ("movq  %0, %%mm3" : : "m" (fw384));		/* load constant '384' (4 x WORD) */
    __asm__ __volatile__ ("paddw  %mm4, %mm1\n"				/* add samples --> 0..3 */

			  /* 11CLK's (non-pairing instructions) */
			  "paddw  %mm2, %mm1\n"				/* add samples --> 0..3 */
			  "psubw  %mm1, %mm3\n"				/*  (384 - cbluma32) */
			  "pmaddwd  %mm6, %mm3\n"			/*  (384 - cbluma32) * contrast */
			  "psrad  $11, %mm3\n"				/* ((384 - cbluma32) * contrast) >> 11 */
			  "punpckldq  %mm3, %mm1\n"			/* put low part of mm3 in high part of mm1 */
			  "paddd  %mm1, %mm3\n"				/* add low part to high part */
			  "punpckhdq  %mm3, %mm3\n"			/* copy luminance adjustment as DWORD's */
			  "packssdw  %mm3, %mm3\n"			/* pack luminance adjustment into WORD's */
			  "paddw  %mm3, %mm5");				/* adjust brightness to color burst */


    /*   FILTER LUMINANCE DATA */
    __asm__ __volatile__ ("__adjusted:\n"
			  "movq  %%mm6, %0" : "=m" (contrast64));	/* #store contrast (4 x WORD) */
    __asm__ __volatile__ ("punpckldq  %mm5, %mm5");			/* #unpack brightness level into DWORD's */

    __asm__ __volatile__ ("movq  %%mm5, %0" : "=m" (level64));		/* #store brightness level (4 x WORD) */
    __asm__ __volatile__ ("movq  %mm0, %mm1");				/* L0: copy for samples 4..7 */

    __asm__ __volatile__ ("movl  %0, %%ecx" : : "m" (decWidth / 4));	/* load 4-sample counter */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm0\n"			/* L0: unpack samples 0..3 */

			  "movq  5(%esi), %mm2\n"			/* L1: load samples 0..7 */
			  "punpckhbw  %mm7, %mm1\n"			/* L0: unpack samples 4..7 */

			  "paddw  %mm1, %mm0\n"				/* L0: add samples --> 0..3 */
			  "movq  %mm2, %mm3\n"				/* L1: copy for samples 4..7 */

			  "punpcklbw  %mm7, %mm2\n"			/* L1: unpack samples 0..3 */
			  "movq  %mm0, %mm1\n"				/* L01: copy of samples L0 */

			  "movq  10(%esi), %mm4\n"			/* L2: load samples 0..7 */
			  "punpckhbw  %mm7, %mm3\n"			/* L1: unpack samples 4..7 */

			  "__pixel:\n"
			  "paddw  %mm3, %mm2\n"				/* L1: add samples --> 0..3 */
			  "movq  %mm4, %mm5\n"				/* L2: copy for samples 4..7 */

			  "movq  15(%esi), %mm6\n"			/* L3: load samples 0..7 */
			  "punpckldq  %mm2, %mm0\n"			/* L01: unpack samples 0..1 of L0, L1 */

			  "punpckhdq  %mm2, %mm1\n"			/* L01: unpack samples 2..3 of L0, L1 */
			  "movq  %mm6, %mm3\n"				/* L3: copy for samples 4..7 */

			  "paddw  %mm1, %mm0\n"				/* L01: add samples --> 0..1 */
			  "punpcklbw  %mm7, %mm4");			/* L2: unpack samples 0..3 */

    __asm__ __volatile__ ("pmaddwd  %0, %%mm0" : : "m" (contrast64));	/* L01: mac samples --> DWORD */
    __asm__ __volatile__ ("punpckhbw  %mm7, %mm5\n"			/* L2: unpack samples 4..7 */

			  "paddw  %mm5, %mm4\n"				/* L2: add samples --> 0..3 */
			  "punpcklbw  %mm7, %mm6\n"			/* L3: unpack samples 0..3 */

			  "movq  %mm4, %mm5\n"				/* L23: copy of samples L2 */
			  "punpckhbw  %mm7, %mm3\n"			/* L3: unpack samples 4..7 */

			  "psrad  $9, %mm0\n"				/* L01: scale samples to normal level */
			  "paddw  %mm3, %mm6\n"				/* L3: add samples --> 0..3 */

			  "punpckldq  %mm6, %mm4\n"			/* L23: unpack samples 0..1 of L2, L3 */
			  "addl  $20, %esi\n"				/* increase input pointer */

			  "punpckhdq  %mm6, %mm5\n"			/* L23: unpack samples 2..3 of L2, L3 */
			  "movq  %mm0, %mm6\n"				/* L01: free MMX register mm0 */

			  "movq  5(%esi), %mm2\n"			/* L1: load samples 0..7 (next iteration) */
			  "paddw  %mm5, %mm4");				/* L23: add samples --> 0..1 */

    __asm__ __volatile__ ("pmaddwd  %0, %%mm4" : : "m"  (contrast64));	/* L23: mac samples --> DWORD */
    __asm__ __volatile__ ("movq  %mm2, %mm3\n"				/* L1: copy for samples 4..7 */

			  "movq  (%esi), %mm0\n"			/* L0: load samples 0..7 (next iteration) */
			  "punpcklbw  %mm7, %mm2\n"			/* L1: unpack samples 0..3 */

			  "movq  %mm0, %mm1\n"				/* L0: copy for samples 4..7 */
			  "punpcklbw  %mm7, %mm0\n"			/* L0: unpack samples 0..3 */

			  "psrad  $9, %mm4\n"				/* L23: scale samples to normal level */
			  "addl  $4, %edi\n"				/* increase output pointer */

			  "packssdw  %mm4, %mm6");			/* L0123: pack samples into WORD's */

    __asm__ __volatile__ ("paddw  %0, %%mm6" : : "m" (level64));	/* L0123: add luminance level */
    __asm__ __volatile__ ("punpckhbw  %mm7, %mm1\n"			/* L0: unpack samples 4..7 */

			  "movq  10(%esi), %mm4\n"			/* L2: load samples 0..7 (next iteration) */
			  "packuswb  %mm6, %mm6\n"			/* L0123: pack samples into UBYTE's */

			  "paddw  %mm1, %mm0\n"				/* L0: add samples --> 0..3 */
			  "punpckhbw  %mm7, %mm3\n"			/* L1: unpack samples 4..7 */

			  "movd  %mm6, -4(%edi)\n"			/* L0123: store output samples 0..3 */
			  "movq  %mm0, %mm1\n"				/* L01: copy of samples L0 */

			  "decl  %ecx\n"				/* decrease 4-sample counter */
			  "jnz  __pixel\n"				/* do {...} while (counter > 0); */

			  "emms");					/* leave MMX state so the FPU can be used again */
}


/*
 * Decode single scanline of Raw 8bit samples in PAL format to chroma (U,V)
 * samples at half resolution.
 *
 *   src          raw samples of the scanline (reads start at src - 7)
 *   cbPhase      color burst phase at the start of the line; the top 10 bits
 *                index cosin64
 *   cbHF         phase increment applied once per output sample
 *   cbVF         phase offset used for the second input line ("C1")
 *   saturation   gain for the U output
 *   vsaturation  gain for the V output (bit-inverted before use)
 *
 * The loop runs decWidth / 2 times. Each iteration:
 *   - filters 16 bytes of the current line (C0) and 16 bytes of the following
 *     line (C1) with filt_cos and filt_sin,
 *   - multiplies each result with the cosin64 entry for its phase,
 *   - bit-inverts the V half of C0 and adds C0 to C1,
 *   - adds the value stored in prevScan for this position and replaces that
 *     stored value with the new C0 + C1 sum,
 *   - applies the saturation gains and stores one byte to scanCbU and one to
 *     scanCrV,
 *   - advances the input pointer by 10 bytes.
 *
 * NOTE(review):
 *  - The offsets 2261 and 2269 used for C1 are hard-coded; the disabled lines
 *    next to them show they stand for rawWidth - 7 and rawWidth + 1, which
 *    presumably means rawWidth must be 2268 -- confirm.
 *  - Operands such as "(%0 + 8)" and "%0(, %%eax, 8)" rely on the "m" operand
 *    being printed as a bare symbol name; presumably this only assembles for
 *    non-position-independent code -- confirm.
 *  - The chroma output pointers are loaded through "m" operands naming the
 *    scanCbU / scanCrV arrays themselves; confirm the toolchain yields the
 *    array addresses there.
 *  - The routine is split over many asm statements that pass state to each
 *    other in eax/ebx/ecx/edx/esi/edi and mm0..mm7; none declares a clobber
 *    list.
 *  - The loop loads input for the next iteration before testing the counter.
 *  - __pixel_2 is an ordinary assembler label.
 */
void GetChromaSamplesMMX(unsigned char *src, unsigned int cbPhase, unsigned int cbHF, unsigned int cbVF, int saturation, int vsaturation)
{
    
    int64 invertv64, saturation64;	/* V-inversion mask and packed gains, spilled for use inside the loop */
    int64 *prev = prevScan;		/* walks prevScan, one quadword per output sample */

    __asm__ __volatile__ ("movd  %0, %%mm0" : : "m" (saturation));
    __asm__ __volatile__ ("pcmpeqd  %mm1, %mm1");			/* all ones */

    __asm__ __volatile__ ("mov  %0, %%esi" : : "m" (src));		/* load input pointer */
    __asm__ __volatile__ ("psllq  $32, %mm1");				/* mask: low DWORD = 0, high DWORD = all ones */
    
    __asm__ __volatile__ ("punpckldq  %0, %%mm0" : : "m" (vsaturation));	/* low DWORD = saturation, high DWORD = vsaturation */

    __asm__ __volatile__ ("movq  %%mm1, %0" : "=m" (invertv64));
    __asm__ __volatile__ ("pxor  %mm1, %mm0");				/* bit-invert the vsaturation DWORD */

    __asm__ __volatile__ ("movl  %0, %%edx" : : "m" (cbPhase));		/* load color burst phase */
    __asm__ __volatile__ ("pslld  $16, %%mm0\n"				/* move both gains into the high WORD of their DWORD */

			  "movl  %0, %%ebx\n"				/* load chrominance U pointer */
			  "movl  %1, %%edi" : : "m" (scanCbU), "m" (scanCrV));	/* load chrominance V pointer */

    __asm__ __volatile__ ("movq  %%mm0, %0" : "=m" (saturation64));

    
    
    /* DECODE CHROMINANCE COMPONENT OF PAL */
    /* {esi} is source, {edx} is cbPhase */
    /* {ebx} is CbU[], {edi} is CbV */
    
    __asm__ __volatile__ ("movl  %0, %%ecx" : : "m" (decWidth / 2));	/* load 2-pixel counter */

    __asm__ __volatile__ ("movq  -7(%esi), %mm0\n"			/* C0: load samples 0..7 */
			  "pxor  %mm7, %mm7\n"				/* empty unpack (0) register */

			  "movq  1(%esi), %mm2\n"			/* C0: load samples 8..15 */
			  "movq  %mm0, %mm1\n"				/* C0: copy for samples 4..7 */

			  "movq  %mm2, %mm3\n"				/* C0: copy for samples 12..15 */
			  "punpcklbw  %mm7, %mm0\n"			/* C0: unpack samples 0..3 into WORD's */

			  "movq  %mm0, %mm4\n"				/* C0: copy of samples 0..3 for "sine" filter */
			  "punpckhbw  %mm7, %mm1");			/* C0: unpack samples 4..7 into WORD's */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm0" : : "m" (*filt_cos));		/* C0: "cosine" filter samples 0..3 */
    __asm__ __volatile__ ("movq  %mm1, %mm5");				/* C0: copy of samples 4..7 for "sine" filter */

    __asm__ __volatile__ ("pmaddwd  (%0 + 8), %%mm1" : : "m" (*filt_cos));	/* C0: "cosine" filter samples 4..7 */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm2");			/* C0: unpack samples 8..11 */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm4" : : "m" (*filt_sin));		/* C0: "sine" filter samples 0..3 */
    __asm__ __volatile__ ("movq  %mm2, %mm6");				/* C0: copy of samples 8..11 for "sine" filter */

    __asm__ __volatile__ ("__pixel_2:\n"
			  "pmaddwd  (%0 + 16), %%mm2" : : "m" (*filt_cos));	/* C0: "cosine" filter samples 8..11 */
    __asm__ __volatile__ ("punpckhbw  %mm7, %mm3");			/* C0: unpack samples 12..15 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 8), %%mm5" : : "m" (*filt_sin));	/* C0: "sine" filter samples 4..7 */
    __asm__ __volatile__ ("paddd  %mm1, %mm0");				/* C0: add "cosine" filtered samples 0..7 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 16), %%mm6" : : "m" (*filt_sin));	/* C0: "sine" filter samples 8..11 */
    __asm__ __volatile__ ("movq  %mm3, %mm1");				/* C0: copy of samples 12..15 for "sine" filter */

    __asm__ __volatile__ ("pmaddwd  (%0 + 24), %%mm3" : : "m" (*filt_cos));	/* C0: "cosine" filter samples 12..15 */
    __asm__ __volatile__ ("paddd  %mm2, %mm0");				/* C0: add "cosine" filtered samples 0..11 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 24), %%mm1" : : "m" (*filt_sin));	/* C0: "sine" filter samples 12..15 */
    __asm__ __volatile__ ("paddd  %mm5, %mm4\n"				/* C0: add "sine" filtered samples 0..7 */

			  "paddd  %mm6, %mm4\n"				/* C0: add "sine" filtered samples 0..11 */
			  "movl  %edx, %eax\n"				/* copy of color burst phase */

			  "paddd  %mm3, %mm0\n"				/* C0: add "cosine" filtered samples 0..15 */
			  "shrl  $22, %eax\n"				/* (cb >> 22) ... index into (co)sine table */

			  "paddd  %mm1, %mm4\n"				/* C0: add "sine" filtered samples 0..15 */
			  "punpckldq  %mm0, %mm3\n"			/* C0: copy low part "cosine" into high part mm3 */

			  "punpckldq  %mm4, %mm1\n"			/* C0: copy low part "sine" into high part mm1 */
			  "paddd  %mm0, %mm3");				/* C0: "cosine" filtered data in high DWORD */

/*    __asm__ __volatile__ ("movq  %0(%%esi), %%mm0" : : "n" (rawWidth - 7));	// C1: load samples 0..7 */
    __asm__ __volatile__ ("movq  2261(%esi), %mm0");			/* C1: load samples 0..7 (hard-coded offset, see disabled line above) */
    __asm__ __volatile__ ("paddd  %mm4, %mm1");				/* C0: "sine" filtered data in high DWORD */

/*    __asm__ __volatile__ ("movq  %0(%%esi), %%mm2" : : "n" (rawWidth + 1));	// C1: load samples 8..15 */
    __asm__ __volatile__ ("movq  2269(%esi), %mm2");			/* C1: load samples 8..15 (hard-coded offset, see disabled line above) */
    __asm__ __volatile__ ("punpckhdq  %mm1, %mm3\n"			/* C0: low DWORD={cosine}, high DWORD={sine} */

			  "psrad  $12, %mm3\n"				/* C0: scale (co)sine filtered data */
			  "movq  %mm0, %mm1\n"				/* C1: copy for samples 4..7 */

			  "packssdw  %mm3, %mm3\n"			/* C0: pack co(sine) filtered data into WORD's */
			  "movq  %mm2, %mm6");				/* C1: copy for samples 12..15 */

    __asm__ __volatile__ ("pmaddwd  %0(, %%eax, 8), %%mm3" : : "m" (*cosin64));	/* C0: low DWORD={u}, high DWORD={v} */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm0\n"			/* C1: unpack samples 0..3 into WORD's */

			  "movq  %mm0, %mm4\n"				/* C1: copy of samples 0..3 for "sine" filter */
			  "punpckhbw  %mm7, %mm1");			/* C1: unpack samples 4..7 into WORD's */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm0" : : "m" (*filt_cos));		/* C1: "cosine" filter samples 0..3 */
    __asm__ __volatile__ ("movq  %mm1, %mm5");				/* C1: copy of samples 4..7 for "sine" filter */

    __asm__ __volatile__ ("pmaddwd  (%0 + 8), %%mm1" : : "m" (*filt_cos));	/* C1: "cosine" filter samples 4..7 */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm2");			/* C1: unpack samples 8..11 */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm4" : : "m" (*filt_sin));		/* C1: "sine" filter samples 0..3 */
    __asm__ __volatile__ ("punpckhbw  %mm7, %mm6");			/* C1: unpack samples 12..15 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 8), %%mm5" : : "m" (*filt_sin));	/* C1: "sine" filter samples 4..7 */
    __asm__ __volatile__ ("movq  %mm2, %mm7");				/* C1: copy of samples 8..11 for "sine" filter (mm7 is no longer zero from here) */

    __asm__ __volatile__ ("pmaddwd  (%0 + 16), %%mm2" : : "m" (*filt_cos));	/* C1: "cosine" filter samples 8..11 */
    __asm__ __volatile__ ("paddd  %mm1, %mm0");				/* C1: add "cosine" filtered samples 0..7 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 16), %%mm7" : : "m" (*filt_sin));	/* C1: "sine" filter samples 8..11 */
    __asm__ __volatile__ ("movq  %mm6, %mm1");				/* C1: copy of samples 12..15 for "sine" filter */

    __asm__ __volatile__ ("pmaddwd  (%0 + 24), %%mm6" : : "m" (*filt_cos));	/* C1: "cosine" filter samples 12..15 */
    __asm__ __volatile__ ("paddd  %mm2, %mm0");				/* C1: add "cosine" filtered samples 0..11 */

    __asm__ __volatile__ ("pmaddwd  (%0 + 24), %%mm1" : : "m" (*filt_sin));	/* C1: "sine" filter samples 12..15 */
    __asm__ __volatile__ ("paddd  %mm5, %mm4\n"				/* C1: add "sine" filtered samples 0..7 */

			  "paddd  %mm7, %mm4");				/* C1: add "sine" filtered samples 0..11 */
    __asm__ __volatile__ ("movl  %0, %%eax" : : "m" (cbVF));		/* load color burst vertical stepsize */

    __asm__ __volatile__ ("paddd  %mm6, %mm0\n"				/* C1: add "cosine" filtered samples 0..15 */
			  "addl  %edx, %eax\n"				/* phase for C1 = color burst phase + vertical stepsize */

			  "paddd  %mm1, %mm4\n"				/* C1: add "sine" filtered samples 0..15 */
			  "shrl  $22, %eax");				/* (cb >> 22) ... index into (co)sine table */

    __asm__ __volatile__ ("pxor  %0, %%mm3" : : "m" (invertv64));	/* C0: invert chrominance "V" (feature of PAL) */
    __asm__ __volatile__ ("punpckldq  %mm0, %mm5\n"			/* C1: copy low part "cosine" into high part mm5 */

			  "punpckldq  %mm4, %mm7\n"			/* C1: copy low part "sine" into high part mm7 */
			  "paddd  %mm0, %mm5\n"				/* C1: "cosine" filtered data in high DWORD */

			  "paddd  %mm4, %mm7\n"				/* C1: "sine" filtered data in high DWORD */
			  "incl  %ebx");				/* increase chrominance "U" pointer */

    __asm__ __volatile__ ("movq  %0(, %%eax, 8), %%mm6" : : "m" (*cosin64));	/* load (co)sine table entry */
    __asm__ __volatile__ ("punpckhdq  %mm7, %mm5");			/* C1: low DWORD={cosine}, high DWORD={sine} */

    __asm__ __volatile__ ("movl %0, %%eax" : : "m" (prev));		/* previous scanline chrominance pointer */
    __asm__ __volatile__ ("psrad  $12, %mm5\n"				/* C1: scale (co)sine filtered data */

			  "incl %edi\n"					/* increase chrominance "V" pointer */
			  "packssdw %mm5, %mm5\n"			/* C1: pack co(sine) filtered data into WORD's */

			  "pmaddwd  %mm6, %mm5\n"			/* C1: low DWORD={u}, high DWORD={v} */
			  "movq  (%eax), %mm6\n"			/* load previous scanline chrominance samples */

			  "addl  $8, %eax\n"				/* increase previous chrominance pointer */
			  "addl  $10, %esi\n"				/* increase input pointer */

			  "pxor  %mm7, %mm7");				/* empty unpack (0) register */
    __asm__ __volatile__ ("movl  %%eax, %0" : "=m" (prev));		/* store previous chrominance pointer */

    __asm__ __volatile__ ("movq  -7(%esi), %mm0\n"			/* C0: load samples 0..7 (next iteration) */
			  "paddd  %mm3, %mm5\n"				/* C01: low DWORD={u}, high DWORD={v} */

			  "movq  1(%esi), %mm2\n"			/* C0: load samples 8..15 (next iteration) */
			  "paddd  %mm5, %mm6");				/* add current to previous scanline chrominance (PAL feature) */

    __asm__ __volatile__ ("pmaddwd  %0, %%mm6" : : "m" (saturation64)); /* adjust saturation levels */
    __asm__ __volatile__ ("movq  %mm0, %mm1\n"				/* C0: copy for samples 4..7 */

			  "movq  %mm5, -8(%eax)\n"			/* store current scanline chrominance samples */
			  "movq  %mm2, %mm3");				/* C0: copy for samples 12..15 */

    __asm__ __volatile__ ("movl  %0, %%eax" : : "m" (cbHF));		/* load color burst horizontal stepsize */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm0\n"			/* C0: unpack samples 0..3 into WORD's */

			  "psrad  $7, %mm6\n"				/* scale chroma output samples */
			  "addl  %eax, %edx\n"				/* increment color burst phase (H) */

			  "movq  %mm0, %mm4\n"				/* C0: copy of samples 0..3 for "sine" filter */
			  "packssdw  %mm6, %mm6");			/* pack chroma output samples into WORD's */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm0" : : "m" (*filt_cos));	/* C0: "cosine" filter samples 0..3 */
    __asm__ __volatile__ ("packsswb  %mm6, %mm6");			/* pack chroma output samples into BYTE's */

    __asm__ __volatile__ ("pmaddwd  (%0), %%mm4" : : "m" (*filt_sin));	/* C0: "sine" filter samples 0..3 */
    __asm__ __volatile__ ("punpckhbw  %mm7, %mm1\n"			/* C0: unpack samples 4..7 into WORD's */

			  "movd  %mm6, %eax\n"				/* copy of chroma output samples */
			  "movq  %mm1, %mm5");				/* C0: copy of samples 4..7 for "sine" filter */

    __asm__ __volatile__ ("pmaddwd  (%0 + 8), %%mm1" : : "m" (*filt_cos));	/* C0: "cosine" filter samples 4..7 */
    __asm__ __volatile__ ("punpcklbw  %mm7, %mm2\n"			/* C0: unpack samples 8..11 */

			  "xorl  $0x8080, %eax\n"			/* chroma signed to unsigned conversion */
			  "movq  %mm2, %mm6\n"				/* C0: copy of samples 8..11 for "sine" filter */
			 
			  "movb  %al, -1(%ebx)\n"			/* store sample U (blue) */
			  "movb  %ah, -1(%edi)\n");			/* store sample V (red) */
    
    __asm__ __volatile__ ("decl  %ecx\n"				/* decrease chrominance sample counter */
			  "jnz  __pixel_2\n"				/* do {...} while (counter > 0); */

			  "emms");					/* leave MMX state so the FPU can be used again */
}


/*  Todo :  optimize this (MMX) ... */

/*
 * Absolute distance between two 32-bit phase values, measured the short way
 * around the circle. Pure unsigned arithmetic, so it is defined for every
 * input (unlike abs() applied to an unsigned difference).
 */
static unsigned int cbPhaseDistance(unsigned int a, unsigned int b)
{
    unsigned int d = a - b;

    return (d & 0x80000000u) ? (0u - d) : d;
}

/*
 * Signed difference a - b of two wrapped phase values. The subtraction is
 * done on unsigned operands so that wrap-around is not a signed overflow.
 */
static int cbPhaseDelta(int a, int b)
{
    return (int) ((unsigned int) a - (unsigned int) b);
}

/*
 * Convert a scaled angle to a 32-bit phase with modular wrap-around.
 * Converting a negative double directly to unsigned int is undefined, so the
 * value goes through a wide signed type first.
 */
static unsigned int cbAngleToPhase(double scaled)
{
    return (unsigned int) (long long) scaled;
}

/* Limit a normalised correlation to the domain of acos() / asin() */
static double cbClampUnit(double x)
{
    if (x > 1.0)
	return 1.0;
    if (x < -1.0)
	return -1.0;
    return x;
}

/*
 * Get color burst information
 *
 *   src      raw samples; the first 72 samples of each of decHeight lines
 *            (line stride rawWidth) are analysed
 *   cbPhase  out: start phase handed to the chroma decoder
 *   cbHF     out: horizontal phase increment (FcHF0 plus a correction)
 *   cbVF     out: vertical phase increment (FcVF0 plus a correction)
 *   cbOdd    out: 1 or 0, result of the Vc polarity detection
 *
 * FcHF0, FcVF0, divPi, rawWidth and decHeight are defined outside this file.
 * The cosint table must have been filled (InitializeDecoderMMX) beforehand.
 */
void GetColorBurstDataMMX(unsigned char *src, unsigned int *cbPhase, unsigned int *cbHF, unsigned int *cbVF, int* cbOdd)
{
    static int cbPhases[decHeight];
    double f_amp, div, cosArg, sinArg;
    unsigned int c0, c1, s0, s1, p0, p1, p2, p3, f_cb, phase;
    int i, v, x, y, f_cos, f_sin;


    /* determine phase offsets of colorburst */
    for (y=0; y < decHeight; y++) {
	/* correlate the burst with a reference cosine and a quarter-turn shifted copy */
	for (x=0, f_cos=0, f_sin=0, f_cb=0; x < 72; x++) {
	    i      = *src++;
	    f_cos += cosint[ f_cb               >> 22] * i;
	    f_sin += cosint[(f_cb - 0x40000000) >> 22] * i;
	    f_cb  += FcHF0;
	}
	src  += (rawWidth - 72);

	div = sqrt( (double) f_cos*f_cos + (double) f_sin*f_sin );
	if (div != 0)
	    f_amp = 1 / div;
	else
	    f_amp = 10000;

	/* rounding may push the normalised values marginally outside [-1, 1] */
	cosArg = cbClampUnit( f_cos * f_amp );
	sinArg = cbClampUnit( f_sin * f_amp );

	/* acos and asin each leave two candidate phases; keep the pair that agrees best */
	c1 = cbAngleToPhase( acos( cosArg ) * divPi );
	s0 = cbAngleToPhase( asin( sinArg ) * divPi ) + 0x80000000u;
	c0 = ~c1;
	s1 = ~s0 + 0x80000000u;
	p0 = cbPhaseDistance(c0, s0);  p1 = cbPhaseDistance(c0, s1);
	p2 = cbPhaseDistance(c1, s0);  p3 = cbPhaseDistance(c1, s1);

	if ((p0 < p2 && p0 < p3) || (p1 < p2 && p1 < p3))  c1 = c0;
	if ((p0 < p1 && p0 < p3) || (p2 < p1 && p2 < p3))  s1 = s0;

	cbPhases[y] = (int) ((s1 >> 1) + (c1 >> 1));
    }

    /* Vc polarity detection  (even/odd frame) */
    for (i=2, p0=0, p1=0; i < (decHeight - 2); i += 2) {
	p0 += cbPhaseDistance( (unsigned int) cbPhases[i], (unsigned int) cbPhases[i - 1] ) >> 8;
	p1 += cbPhaseDistance( (unsigned int) cbPhases[i + 1], (unsigned int) cbPhases[i] ) >> 8;
    }
    *cbOdd = (p0 < p1)?1:0;

    /* sample rate correction (color burst) */
    for (i=8, v=0; i < (decHeight /2); i++) {
	v += cbPhaseDelta( cbPhases[(decHeight /2) + i], cbPhases[i] ) >> 8;
    }

    /* multiply instead of shifting left: v may be negative */
    v		= ((v /(decHeight /2)) * 256) /((decHeight /2) - 8);
    *cbHF	= v /rawWidth +  FcHF0;
    *cbVF	= v           +  FcVF0;

    /* start phase, computed modulo 2^32 in unsigned arithmetic */
    phase	= (unsigned int) cbPhases[0] +  200u * (*cbHF) - 0x60000000u;
    if (!*cbOdd)
	phase  += 0x40000000u;
    *cbPhase	=  phase - *cbHF - (*cbHF >> 1);
}
