/*
 * Copyright (C) 2000-2007 the xine project
 * 
 * This file is part of xine, a free video player.
 * 
 * xine is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * xine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
 *
 * $Id: xine_post_swscale.c,v 1.9 2008/12/13 14:24:03 phintuka Exp $
 *
 * Simple (faster) resize for avisynth
 *     Copyright (C) 2002 Tom Barry
 *
 * Very simple 2 tap linear interpolation.  
 * It is unfiltered which means it will not soften much.
 *
 * WarpedResize will do a non-linear stretch/squeeze in both the horizontal 
 * and vertical dimensions. This can be useful when you want to change the 
 * aspect ratio of a video clip and have it mostly distorted at the
 * top, bottom, and side edges. 
 *
 *
 * Ported to linux/xine by Petri Hintukainen <phintuka@users.sourceforge.net>
 *    - Added x86_64 support
 *    - Added PIC support (do not clobber ebx in x86, access only local variables from asm)
 *    - Fixed yv12 stretched warp tables generation
 */
 
#include <xine/xine_internal.h>
#include <xine/post.h>

/*
 * Debug trace macro. The active definition writes to stderr with a
 * "post_warp: " prefix; swap it for the empty definition below to
 * silence the traces. (x... is the GNU named-variadic macro form.)
 */
/*#define DBG(x...)*/
#define DBG(x...) fprintf(stderr, "post_warp: " x)

/* Optional tuning switches for the asm loops; all disabled by default. */
/*#define STREAMING_STORE_TMP*/
/*#define STREAMING_STORE*/
/*#define PREFETCH*/
/* Streaming stores and prefetch seem to be slower ...
 * Tested with P3 (128M L2) and C2D (4M L2).
 * Maybe the access pattern is simple enough for the HW prefetchers.
 */

/* When defined, selects the alternative "#ifdef VANILLA" YV12 chroma table math. */
/*#define VANILLA*/

/*
 * Map a normalized position (0..1) to a warped position (0..1).
 *
 * The position is first re-centered to x in -1..1. With w = 2 - wFact
 * (the parameter is reversed for compatibility with the initial release)
 * the warp is the odd cubic
 *
 *     z = (1 - w) * x^3 + w * x
 *
 * which is then mapped back to 0..1 around .5. Design goals:
 *
 * * Always be rising and yield results from 0 to 1
 *
 * * Have a first derivative that doesn't go to 0 or infinity, at least
 *   close to the center of the screen
 *
 * * Have a curvature (absolute value of the 2nd derivative) that is small
 *   in the center and smoothly rises towards the edges. The curvature is
 *   0 everywhere when the warp factor is 1 (w == 1 leaves only the
 *   linear term).
 */
static double WarpFactor(double position, double wFact)
{
  const double centered = 2 * (position - .5);  /* -1 .. 1 */
  const double w = 2.0 - wFact;                 /* reverse parm for compat with initial release */
  double cubic;

  if (centered < 0.0) {
    /* left/top half: evaluate on the mirrored value, then mirror back */
    cubic = -(1 - w) * centered*centered*centered - w * centered;
    return .5 - .5 * cubic;
  }

  /* right/bottom half, 0 <= x <= 1 (amounts to the same formula) */
  cubic = (1 - w) * centered*centered*centered + w * centered;
  return .5 + .5 * cubic;
}

/*
 * YV12: build the horizontal and vertical resize control tables.
 *
 * Horizontal tables (hControl for luma, hControlUV for chroma unless
 * VANILLA is defined) are written per output pixel pair. For the pair
 * starting at even output pixel i:
 *
 *   table[i*3]   / table[i*3+1]   weights of the even / odd output pixel,
 *                                 packed as (RightWeight << 16) | LeftWeight
 *   table[i*3+4] / table[i*3+5]   source pixel offset of the even / odd pixel
 *
 * Entries i*3+2 and i*3+3 are not written here. Two extra offsets are
 * stored after the last pair ("something to prefetch at end"), so hControl
 * needs at least newwidth*3+6 entries and hControlUV at least
 * (newwidth/2)*3+6 entries.
 *
 * The weights are scaled 0-256 and the left and right weights sum to 256
 * for each pixel.
 *
 * Vertical tables hold, per output line, the source line offset (in lines;
 * multiplied by the source pitch later) and the weight (0-256) to give to
 * the 2nd source line. vOffsets/vWeights get newheight entries,
 * vOffsetsUV/vWeightsUV get newheight/2 entries. With Interlaced set,
 * even output lines are taken from even source lines and odd output lines
 * from odd source lines, and the stored weight is halved.
 *
 * A warp factor of exactly 1 selects plain linear scaling; any other value
 * goes through WarpFactor().
 *
 * Preconditions: the code divides by (newwidth-1), (newheight-1) and, for
 * chroma, (newwidth/2-1) and (newheight/2-1); the caller must pass sizes
 * for which these are non-zero.
 */
static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight,
			     int Interlaced, double hWarp, double vWarp,
			     uint32_t *hControl,   uint32_t *vOffsets,   uint32_t *vWeights,
			     uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV)
{
  int i;
  int j;
  int k;
  int wY1;
  int wY2;
  DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n", 
      oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);

  /* First set up the horizontal luma table.
   * The values are generated in pairs, the same way as for YUY2.
   *
   * The linear (no warp) positions are computed with a 64-bit intermediate:
   * i * 256 * (oldwidth-1) does not fit into a 32-bit int for large frames.
   * The quotient is at most 256 * (oldwidth-1) and fits into int again.
   */

  for(i=0; i < newwidth; i+=2) {
    /* first make even pixel control */
    if (hWarp==1)  /*if no warp factor */
      j = (int) ((int64_t) i * 256 * (oldwidth-1) / (newwidth-1));
    else           /* stretch and warp somehow */
      j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
    
    k = j>>8;            /* integer source pixel */
    wY2 = j - (k << 8);  /* luma weight of right pixel */
    wY1 = 256 - wY2;     /* luma weight of left pixel  */

    if (k > oldwidth - 2) {
      hControl[i*3+4] = oldwidth - 1;  /* point to last byte */
      hControl[i*3] =   0x00000100;    /* use 100% of rightmost Y */
    } else {
      hControl[i*3+4] = k;             /* pixel offset */
      hControl[i*3] = wY2 << 16 | wY1; /* luma weights */
    }

    /* now make odd pixel control */
    if (hWarp==1)   /* if no warp factor */
      j = (int) ((int64_t) (i+1) * 256 * (oldwidth-1) / (newwidth-1));
    else        /* stretch and warp somehow */
      j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));

    k = j>>8;
    wY2 = j - (k << 8); /* luma weight of right pixel */
    wY1 = 256 - wY2;    /* luma weight of left pixel  */

    if (k > oldwidth - 2) {
      hControl[i*3+5] = oldwidth - 1; /* point to last byte  */
      hControl[i*3+1] = 0x00000100;   /* use 100% of rightmost Y */
    } else {
      hControl[i*3+5] = k;               /* pixel offset */
      hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
    }
  }

  hControl[newwidth*3+4] =  2 * (oldwidth-1); /* give it something to prefetch at end */
  hControl[newwidth*3+5] =  2 * (oldwidth-1); /*  "  */
#ifndef VANILLA
  /* Horizontal chroma table: same scheme on the half-width planes */
  for(i=0; i < newwidth/2; i+=2) {
    /* first make even pixel control */
    if (hWarp==1)  /*if no warp factor */
      j = (int) ((int64_t) i * 256 * (oldwidth/2-1) / (newwidth/2-1));
    else           /* stretch and warp somehow */
      j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));
    
    k = j>>8;
    wY2 = j - (k << 8);  /* chroma weight of right pixel */
    wY1 = 256 - wY2;     /* chroma weight of left pixel  */

    if (k > oldwidth/2 - 2) {
      hControlUV[i*3+4] = oldwidth/2 - 1;  /* point to last byte */
      hControlUV[i*3] =   0x00000100;      /* use 100% of rightmost sample */
    } else {
      hControlUV[i*3+4] = k;               /* pixel offset */
      hControlUV[i*3] = wY2 << 16 | wY1;   /* chroma weights */
    }

    /* now make odd pixel control */
    if (hWarp==1)   /* if no warp factor */
      j = (int) ((int64_t) (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1));
    else        /* stretch and warp somehow */
      j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1));

    k = j>>8;
    wY2 = j - (k << 8); /* chroma weight of right pixel */
    wY1 = 256 - wY2;    /* chroma weight of left pixel  */

    if (k > oldwidth/2 - 2) {
      hControlUV[i*3+5] = oldwidth/2 - 1;  /* point to last byte  */
      hControlUV[i*3+1] = 0x00000100;      /* use 100% of rightmost sample */
    } else {
      hControlUV[i*3+5] = k;               /* pixel offset */
      hControlUV[i*3+1] = wY2 << 16 | wY1; /* chroma weights */
    }
  }

  hControlUV[newwidth/2*3+4] =  (oldwidth/2-1); /* give it something to prefetch at end */
  hControlUV[newwidth/2*3+5] =  (oldwidth/2-1); /*  "  */
#endif

  /* Next set up vertical tables. The offsets are measured in lines and will be mult */
  /* by the source pitch later . */

  /* For YV12 we need separate Luma and chroma tables */

  /* First Luma Table */
  for(i=0; i< newheight; ++i) {
    if (vWarp==1)  /* if no warp factor */
      j = (int) ((int64_t) i * 256 * (oldheight-1) / (newheight-1));
    else           /* stretch and warp somehow */
      j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));
    
    if (Interlaced) {  /* do hard way? */
      if (i%2) {       /* is odd output line? */
        if (j < 256) {     /* before 1st odd input line */
          vOffsets[i] = 1; /* all from line 1 */
          vWeights[i] = 0; /* weight to give to 2nd line */
        } else {
          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
          vOffsets[i] = k;
          wY2 = j - (k << 8); 
          vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
        }
      } else {         /* is even output line */
        k = (j >> 9) << 1;        /* next lower even line */
        vOffsets[i] = k;
        wY2 = j - (k << 8); 
        vWeights[i] = wY2 >> 1;   /* weight to give to 2nd line */
      }
    } else {           /* simple way, do as progressive */
      k = j >> 8;
      vOffsets[i] = k;
      wY2 = j - (k << 8); 
      vWeights[i] = wY2;   /* weight to give to 2nd line */
    }
  }

  /* Vertical table for chroma */
  for(i=0; i< newheight/2; ++i) {
    if (vWarp==1)  /* if no warp factor */
#ifdef VANILLA
      j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 );
#else
      j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 );
#endif
    else           /* stretch and warp somehow */
#ifdef VANILLA
      j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) );
#else
      j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) );
#endif
#ifndef VANILLA
    /* the -64 bias above can make the first position(s) negative */
    if(j<0) j=0;
#endif
    if (Interlaced) { /* do hard way? */
      if (i%2) {                /* is odd output line? */
        if (j < 256) {            /* before 1st odd input line */
          vOffsetsUV[i] = 1;         /* all from line 1 */
          vWeightsUV[i] = 0;         /* weight to give to 2nd line */
        } else {
          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
          vOffsetsUV[i] = k;
          wY2 = j - (k << 8); 
          vWeightsUV[i] = wY2 >> 1;  /* weight to give to 2nd line */
        }
      } else {                  /* is even output line */
#ifdef VANILLA
        k = (j >> 9) << 1;         /* next lower even line */
        vOffsetsUV[i] = k;
        wY2 = j - (k << 8); 
        vWeightsUV[i] = wY2 >> 1;  /* weight to give to 2nd line */
#else
        k = (j / 512) << 1;         /* next lower even line */
        vOffsetsUV[i] = k;
        wY2 = j - (k << 8); 
        vWeightsUV[i] = wY2 >> 1;  /* weight to give to 2nd line */
#endif
      }
    } else {              /* simple way, do as progressive */
#ifdef VANILLA
      k = j >> 8;
#else
      k = j / 256; /* j >> 8;  does not work right if  -256 < j < 0 */
#endif
      vOffsetsUV[i] = k;
      wY2 = j - (k << 8);
      vWeightsUV[i] = wY2;      /* weight to give to 2nd line */
    }
  }
}

/*
 * YUY2: build the horizontal and vertical resize control tables.
 *
 * For each horizontal output pair of pixels hControl holds 2 qword masks
 * followed by 2 int offsets. For the pair starting at even output pixel i:
 *
 *   hControl[i*3]   / hControl[i*3+1]   luma weights of the even / odd pixel
 *   hControl[i*3+2] / hControl[i*3+3]   chroma weights (the odd pixel reuses
 *                                       the even pixel's weights unless it
 *                                       is clamped to the right edge)
 *   hControl[i*3+4] / hControl[i*3+5]   source pixel offset of the even / odd pixel
 *
 * Each weight entry is packed as (RightWeight << 16) | LeftWeight. The
 * weights are scaled 0-256 and the left and right weights sum to 256 for
 * each pixel. Two extra offsets are stored after the last pair, so
 * hControl needs at least newwidth*3+6 entries.
 *
 * vOffsets/vWeights get newheight entries: the source line offset (in
 * lines; multiplied by the source pitch later) and the weight (0-256) to
 * give to the 2nd source line. With Interlaced set, even output lines are
 * taken from even source lines and odd output lines from odd source lines,
 * and the stored weight is halved.
 *
 * A warp factor of exactly 1 selects plain linear scaling; any other value
 * goes through WarpFactor().
 *
 * Preconditions: the code divides by (newwidth-1) and (newheight-1); the
 * caller must pass newwidth and newheight greater than 1.
 */
static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight,
			     int Interlaced, double hWarp, double vWarp,
			     uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights )
{
  int i;
  int j;
  int k;
  int wY1;
  int wY2;
  int wUV1;
  int wUV2;
  DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n", 
      oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);
  
  /* First set up horizontal table.
   *
   * The linear (no warp) positions are computed with a 64-bit intermediate:
   * i * 256 * (oldwidth-1) does not fit into a 32-bit int for large frames.
   * The quotient is at most 256 * (oldwidth-1) and fits into int again.
   */
  for(i=0; i < newwidth; i+=2) {
    /* first make even pixel control */
    if (hWarp==1)          /* if no warp factor */
      j = (int) ((int64_t) i * 256 * (oldwidth-1) / (newwidth-1));
    else                   /* stretch and warp somehow */
      j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1));
    
    k = j>>8;              /* integer source pixel */
    wY2 = j - (k << 8);    /* luma weight of right pixel */
    wY1 = 256 - wY2;       /* luma weight of left pixel  */
    /* chroma is sampled at half the luma rate: an odd source pixel is
     * already half way towards the next chroma sample */
    wUV2 = (k%2)  ?  128 + (wY2 >> 1)  :  wY2 >> 1;
    wUV1 = 256 - wUV2;
    
    if (k > oldwidth - 2) {
      hControl[i*3+4] = oldwidth - 1;      /* point to last byte      */
      hControl[i*3]   = 0x00000100;        /* use 100% of rightmost Y */
      hControl[i*3+2] = 0x00000100;        /* use 100% of rightmost U */
    } else {
      hControl[i*3+4] = k;                 /* pixel offset   */
      hControl[i*3]   = wY2 << 16 | wY1;   /* luma weights   */
      hControl[i*3+2] = wUV2 << 16 | wUV1; /* chroma weights */
    }
    
    /* now make odd pixel control */
    if (hWarp==1)                          /* if no warp factor */
      j = (int) ((int64_t) (i+1) * 256 * (oldwidth-1) / (newwidth-1));
    else                                   /* stretch and warp somehow */
      j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1));
    
    k = j>>8;
    wY2 = j - (k << 8); /* luma weight of right pixel */
    wY1 = 256 - wY2;    /* luma weight of left pixel  */
    wUV2 = (k%2)  ?  128 + (wY2 >> 1)  :  wY2 >> 1;
    wUV1 = 256 - wUV2;
    
    if (k > oldwidth - 2) {
      hControl[i*3+5] = oldwidth - 1;    /* point to last byte      */
      hControl[i*3+1] = 0x00000100;      /* use 100% of rightmost Y */
      hControl[i*3+3] = 0x00000100;      /* use 100% of rightmost V */
    } else {
      hControl[i*3+5] = k;               /* pixel offset */
      hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */
      /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 */
      hControl[i*3+3] = hControl[i*3+2]; /* chroma weights */
    }
  }
  
  hControl[newwidth*3+4] =  2 * (oldwidth-1); /* give it something to prefetch at end */
  hControl[newwidth*3+5] =  2 * (oldwidth-1);
  
  /* Next set up vertical table. The offsets are measured in lines and will be mult */
  /* by the source pitch later */
  for(i=0; i< newheight; ++i) {
    if (vWarp==1)                   /* if no warp factor */
      j = (int) ((int64_t) i * 256 * (oldheight-1) / (newheight-1));
    else                            /* stretch and warp somehow */
      j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1));

    if (Interlaced) {           /* do hard way? */
      if (i%2) {                  /* is odd output line? */
        if (j < 256) {            /* before 1st odd input line */
          vOffsets[i] = 1;        /* all from line 1 */
          vWeights[i] = 0;        /* weight to give to 2nd line */
        } else  {
          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
          vOffsets[i] = k;
          wY2 = j - (k << 8); 
          vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
        }
      } else {                    /* is even output line */
        k = (j >> 9) << 1;        /* next lower even line */
        vOffsets[i] = k;
        wY2 = j - (k << 8); 
        vWeights[i] = wY2 >> 1;   /* weight to give to 2nd line */
      }
    } else {                    /* simple way, do as progressive */
      k = j >> 8;
      vOffsets[i] = k;
      wY2 = j - (k << 8);  
      vWeights[i] = wY2;          /* weight to give to 2nd line */
    }
  }
}

/* Register allocation */
/* index/counter registers (REGA, REGC) are loaded from 32bit vars/arrays ! */
/*
 * REGEA/REGEB always name the 32-bit registers. The other names select the
 * pointer-sized register on each architecture, except REGC, which stays
 * "ecx" on x86_64 as well because it is only used as a 32-bit counter.
 * The names are pasted into the asm text and also used in clobber lists.
 */
#define     REGEA "eax"
#define     REGEB "ebx"
#if defined(__x86_64__)
#  define   REGA  "rax"
#  define   REGB  "rbx"
#  define   REGC  "ecx"
#  define   REGD  "rdx"
#  define   REGDI "rdi"
#  define   REGSI "rsi"
#elif defined(__i386__) 
#  define   REGA  "eax"
#  define   REGB  "ebx"
#  define   REGC  "ecx"
#  define   REGD  "edx"
#  define   REGDI "edi"
#  define   REGSI "esi"
#endif

/* variables accessed from assembler code */
/*
 * Positional operand references (%N) for the inline asm. The numbers must
 * match the position of the corresponding variable in the operand list of
 * the asm statement that uses them. The last three exist only on
 * non-x86_64 builds.
 */
#define _FPround1       "%0"
#define _vWeight1       "%1" 
#define _vWeight2       "%2" 
#define _YMask          "%3"
#define _src_row_size   "%4"
#define _EndOffset      "%5"
#define _pControl       "%6"
#define _row_size       "%7"
#define _vWorkYW        "%8"
#define _dstp           "%9"
#define _vWorkUVW       "%10"
#define _FPround2       "%11"
#define _srcp1          "%12"
#define _srcp2          "%13"
#if !defined(__x86_64__)
#define _oldbx          "%14"
#define _SSEMMXenabledW "%15"
#define _SSE2enabledW   "%16"
#endif

/* Labels */
/*
 * Numeric local labels for the asm code. Jumps reference them with an
 * "f" (forward) or "b" (backward) suffix appended to the number.
 */
#define vMaybeSSEMMX      "1"
#define LessThan8         "2"
#define LessThan4         "3"
#define AllDone           "4"
#define LastOne           "5"
#define vLoopSSE2_Fetch   "6"
#define vLoopSSE2         "7"
#define vLoopSSEMMX_Fetch "8"
#define vLoopSSEMMX       "9"
#define vLoopMMX         "10"
#define MoreSpareChange  "11"
#define DoHorizontal     "12"
#define hLoopMMX         "13"
#define hLoopMMXSSE      "14"


/*
 * structure for mmx constants:
 * one 64-bit value that can also be accessed as two 32-bit halves
 */
typedef union {
  uint64_t uq[1];  /* Unsigned Quadword */
  uint32_t ud[2];  /* Unsigned Doubleword */
} ATTR_ALIGN(16) mmx_t;

/*
 * structure for sse2 constants:
 * 128 bits that can be accessed as two 64-bit or four 32-bit values.
 * ATTR_ALIGN(16) requests 16-byte alignment; the asm code loads these
 * objects with movdqa.
 */
typedef union {
  uint64_t uq[2];  /* Unsigned Quadword */
  uint32_t ud[4];  /* Unsigned Doubleword */
} ATTR_ALIGN(16) sse2_t;


/*
 * Resize (and optionally warp) packed YUY2 video, one output line at a time.
 *
 * For every output line y, starting at dst_start:
 *   1. vertical pass: the two source lines selected by vOffsets[y] are
 *      blended with the weights derived from vWeights[y]; the luma result
 *      is stored as 16-bit words in vWorkY and the chroma result as packed
 *      bytes in vWorkUV,
 *   2. horizontal pass: output pixel pairs are built from the work lines
 *      using the hControl table (24 bytes per pixel pair: luma weights,
 *      chroma weights, two source pixel offsets - the layout produced by
 *      init_tables_yuy2()).
 *
 * dst, src          destination / source picture
 * dst_pitch,
 * src_pitch         bytes per line of destination / source
 * dst_width,
 * dst_height        output size in pixels / lines
 * src_width         input width in pixels
 * src_height        number of source lines that may be referenced; a line
 *                   whose vOffsets[] entry is not below this value ends
 *                   the call
 * Interlaced        non-zero: the 2nd source line is taken two lines below
 *                   the first instead of one
 * hControl,
 * vOffsets,
 * vWeights          tables from init_tables_yuy2()
 * vWorkY, vWorkUV   scratch lines for the vertically interpolated data
 * dst_start         first output line to produce
 *
 * Returns the number of the first output line that could not be produced
 * because its source line is not available yet (slice mode), or 0 when all
 * lines up to dst_height-1 were written. On non-x86 builds the body is
 * compiled out and 0 is returned without touching dst.
 *
 * The asm statement declares a "memory" clobber because it reads the
 * source lines and writes vWorkY, vWorkUV, the destination line and oldbx
 * through pointers the compiler cannot see in the operand list.
 */
static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
			const int dst_pitch, const int src_pitch,
			const int dst_width, const int dst_height,			 
			const int src_width, const int src_height,
			const int Interlaced, const uint32_t * const hControl, 
			const uint32_t * const vOffsets, const uint32_t * const vWeights,
			uint32_t *vWorkY, uint32_t *vWorkUV,
			int dst_start)
{
#if defined(__i386__) || defined(__x86_64__)
  sse2_t YMask    = { .uq = {UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)} }; /* keeps only luma */
  sse2_t FPround1 = { .uq = {UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)} }; /* round words      */
  sse2_t FPround2 = { .uq = {UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)} }; /* round dwords     */
  sse2_t vWeight1;
  sse2_t vWeight2;

  const uint32_t *pControl = &hControl[0];
  const uint32_t *vWorkYW = vWorkY;
  const uint32_t *vWorkUVW = vWorkUV;
  const uint8_t *srcp = src;
  const uint8_t *srcp1;
  const uint8_t *srcp2;
  uint8_t *dstp = dst + dst_pitch*dst_start;

  const uint32_t src_row_size = src_width * 2;   /* YUY2: 2 bytes per pixel */
  const uint32_t row_size = dst_width * 2;
  const uint32_t EndOffset = src_row_size / 2;

#if !defined(__x86_64__)
  const int accel = xine_mm_accel();
  const uint32_t SSE2enabledW   = !!(accel & MM_ACCEL_X86_SSE2);   /* in local storage for asm */
  const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
  long int oldbx;
#endif
  int y;

  for (y = dst_start; y < dst_height; y++) {

    if(vOffsets[y] >= src_height) {
      /* slice completed */
      /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n",
	vOffsets[y], src_height, y);*/
      return y;
    }

    /* weight of the 1st line (256-w) and of the 2nd line (w), replicated into every 16-bit lane */
    vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] = 
      (256-vWeights[y]) << 16 | (256-vWeights[y]);
    vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] = 
      vWeights[y] << 16 | vWeights[y];

    /* on the last output line(s) the 2nd source line is the same as the 1st */
    srcp1 = srcp + vOffsets[y] * src_pitch;
    if (Interlaced)
      srcp2 = (y < dst_height-2)  ?  srcp1 + 2 * src_pitch  :  srcp1;
    else
      srcp2 = (y < dst_height-1)  ?  srcp1 + src_pitch      :  srcp1;

    __asm__ __volatile__ (
#if !defined(__x86_64__)
            /* store ebx (PIC) */
            "mov %%"REGB", "_oldbx"          \n\t"
#endif
	    "movl   "_src_row_size", %%"REGC"  \n\t"
	    "shrl   $3,          %%"REGC"      \n\t" /* 8 bytes a time             */
	    "mov    "_srcp1",    %%"REGSI"     \n\t" /* top of 2 src lines to get  */
	    "mov    "_srcp2",    %%"REGD"      \n\t" /* next "                     */
	    "mov    "_vWorkYW",  %%"REGDI"     \n\t" /* luma work destination line */
	    "mov    "_vWorkUVW", %%"REGB"      \n\t" /* chroma work destination line */
	    "xor    %%"REGA",    %%"REGA"      \n\t"
#if !defined(__x86_64__)
	    /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
	     * This first loop is not the performance bottleneck anyway but it is trivial to tune
	     * using SSE2 if we have proper alignment.
	     */
	    "testl  $1, "_SSE2enabledW"  \n\t"  /* is SSE2 supported?*/
	    "jz     "vMaybeSSEMMX"f      \n\t"  /* n, can't do anyway*/
#endif
	    "cmpl   $2, %%"REGC"         \n\t"  /* we have at least 16 bytes, 2 qwords? */
	    "jl     "vMaybeSSEMMX"f      \n\t"  /* n, don't bother*/
	    
	    "shrl   $1, %%"REGC"         \n\t"  /* do 16 bytes at a time instead*/
	    "decl   %%"REGC"             \n"    /* jigger loop ct */
	    
	    ".align 16                   \n\t"

	    "movdqa "_FPround1", %%xmm0  \n\t"
	    "movdqa "_vWeight1", %%xmm5  \n\t"
	    "movdqa "_vWeight2", %%xmm6  \n\t"
	    "movdqa "_YMask",    %%xmm7  \n"

	    ""vLoopSSE2_Fetch":          \n\t"
#ifdef PREFETCH
	    "  prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
	    "  prefetcht0 16(%%"REGD",  %%"REGA", 2) \n"
#endif	    
	    ""vLoopSSE2":  \n\t"
	    "  movdqu   (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
	    "  movdqu   (%%"REGD",  %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */

	    "  movdqa   %%xmm1, %%xmm3  \n\t"  /* get chroma  bytes  */
	    "  pand     %%xmm7, %%xmm1  \n\t"  /* keep only luma     */
	    "  psrlw        $8, %%xmm3  \n\t"  /* right just chroma  */
	    "  pmullw   %%xmm5, %%xmm1  \n\t"  /* mult by weighting factor */
	    "  pmullw   %%xmm5, %%xmm3  \n\t"  /* mult by weighting factor */

	    "  movdqa   %%xmm2, %%xmm4  \n\t"  /* get chroma bytes  */
	    "  pand     %%xmm7, %%xmm2  \n\t"  /* keep only luma    */
	    "  psrlw        $8, %%xmm4  \n\t"  /* right just chroma */
	    "  pmullw   %%xmm6, %%xmm2  \n\t"  /* mult by weighting factor */
	    "  pmullw   %%xmm6, %%xmm4  \n\t"  /* mult by weighting factor */
	    
	    "  paddw    %%xmm2, %%xmm1  \n\t"  /* combine lumas     */
	    "  paddusw  %%xmm0, %%xmm1  \n\t"  /* round             */
	    "  psrlw        $8, %%xmm1  \n\t"  /* right adjust luma */
#ifdef STREAMING_STORE_TMP
	    "  movntdq  %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#else
	    "  movdqu   %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#endif
	    "  paddw    %%xmm4, %%xmm3  \n\t"  /* combine chromas */
	    "  paddusw  %%xmm0, %%xmm3  \n\t"  /* round */
	    "  psrlw        $8, %%xmm3  \n\t"  /* right adjust chroma */
	    "  packuswb %%xmm3, %%xmm3  \n\t"  /* pack UV's into low dword */
	    "  movdq2q  %%xmm3, %%mm1   \n\t"  /* save in our work area    */
#ifdef STREAMING_STORE_TMP
	    "  movntq    %%mm1, (%%"REGB", %%"REGA") \n\t"  /* save in our work area */
#else
	    "  movq      %%mm1, (%%"REGB", %%"REGA") \n\t"  /* save in our work area */
#endif
	    "  lea   8(%%"REGA"), %%"REGA"  \n\t"
	    "  decl  %%"REGC"               \n\t"
	    
	    "  jg    "vLoopSSE2_Fetch"b     \n\t"  /* if not on last one loop, prefetch */
	    "  jz    "vLoopSSE2"b           \n\t"  /* or just loop, or not */

	    /* done with our SSE2 fortified loop but we may need to pick up the spare change */
#ifdef STREAMING_STORE_TMP
	    "  sfence    \n\t"
#endif
	    "  movl  "_src_row_size", %%"REGC" \n\t"  /* get count again   */
	    "  andl  $15, %%"REGC"          \n\t"  /* just need mod 16  */

	    "  movq  "_YMask",    %%mm7     \n\t"  /* useful luma mask constant - lazy dupl init */
	    "  movq  "_vWeight1", %%mm5     \n\t"
	    "  movq  "_vWeight2", %%mm6     \n\t"
	    "  movq  "_FPround1", %%mm0     \n\t"  /* useful rounding constant  */

	    "  shrl  $3, %%"REGC"     \n\t"  /* 8 bytes at a time, any?  */
	    "  jz   "MoreSpareChange"f \n"    /* n, did them all  */

	    /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
	     * This first loop is not the performance bottleneck anyway but it is trivial to tune
	     * using SSE if we have proper alignment.
	     */
	    ""vMaybeSSEMMX":    \n\t"

	    "  movq  "_YMask",    %%mm7    \n\t"  /* useful luma mask constant - lazy dupl init */
	    "  movq  "_vWeight1", %%mm5    \n\t"  
	    "  movq  "_vWeight2", %%mm6    \n\t"  
	    "  movq  "_FPround1", %%mm0    \n\t"  /* useful rounding constant  */
#if !defined(__x86_64__)
	    "  testl $1, "_SSEMMXenabledW" \n\t"  /* MMXEXTsupported? */
	    "  jz    "vLoopMMX"f           \n\t"  /* n, can't do anyway */
#endif
	    "  decl  %%"REGC"              \n"    /* jigger loop ctr */

	    ".align 16             \n"
	    ""vLoopSSEMMX_Fetch":  \n\t"
#ifdef PREFETCH
	    "  prefetcht0 8(%%"REGSI", %%"REGA", 2)  \n\t"
	    "  prefetcht0 8(%%"REGD",  %%"REGA", 2)  \n"
#endif
	    ""vLoopSSEMMX":   \n\t"
	    "  movq    (%%"REGSI", %%"REGA", 2), %%mm1  \n\t"   /* top of 2 lines to interpolate */
	    "  movq    (%%"REGD",  %%"REGA", 2), %%mm2  \n\t"   /* 2nd of 2 lines    */

	    "  movq    %%mm1, %%mm3  \n\t"   /* copy top bytes */
	    "  pand    %%mm7, %%mm1  \n\t"   /* keep only luma */
	    "  pxor    %%mm1, %%mm3  \n\t"   /* keep only chroma */
	    "  psrlw      $8, %%mm3  \n\t"   /* right just chroma */
	    "  pmullw  %%mm5, %%mm1  \n\t"   /* mult by weighting factor */
	    "  pmullw  %%mm5, %%mm3  \n\t"   /* mult by weighting factor */
			  
	    "  movq    %%mm2, %%mm4  \n\t"   /* copy 2nd bytes */
	    "  pand    %%mm7, %%mm2  \n\t"   /* keep only luma */
	    "  pxor    %%mm2, %%mm4  \n\t"   /* keep only chroma */
	    "  psrlw      $8, %%mm4  \n\t"   /* right just chroma */
	    "  pmullw  %%mm6, %%mm2  \n\t"   /* mult by weighting factor */
	    "  pmullw  %%mm6, %%mm4  \n\t"   /* mult by weighting factor */
	    
	    "  paddw   %%mm2, %%mm1  \n\t"   /* combine lumas     */
	    "  paddusw %%mm0, %%mm1  \n\t"   /* round             */
	    "  psrlw      $8, %%mm1  \n\t"   /* right adjust luma */
#ifdef STREAMING_STORE_TMP
	    "  movntq  %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
#else
	    "  movq    %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
#endif	    
	    "  paddw    %%mm4, %%mm3  \n\t"  /* combine chromas  */
	    "  paddusw  %%mm0, %%mm3  \n\t"  /* round            */
	    "  psrlw       $8, %%mm3  \n\t"  /* right adjust chroma  */
	    "  packuswb %%mm3, %%mm3  \n\t"  /* pack UV's into low dword */
	    "  movd     %%mm3, (%%"REGB", %%"REGA") \n\t"  /* save in our work area    */
	    
	    "  lea   4(%%"REGA"), %%"REGA" \n\t"
	    "  decl  %%"REGC"              \n\t"
	    "  jg    "vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
	    "  jz    "vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
#ifdef STREAMING_STORE_TMP
	    "  sfence                      \n\t"
#endif
	    "  jmp    "MoreSpareChange"f   \n"    /* all done with vertical  */
	    
	    ".align 16     \n"
	    ""vLoopMMX":   \n\t"

	    "  movq (%%"REGSI", %%"REGA", 2), %%mm1  \n\t" /* top of 2 lines to interpolate */
	    "  movq (%%"REGD",  %%"REGA", 2), %%mm2  \n\t" /* 2nd of 2 lines */

	    "  movq     %%mm1, %%mm3  \n\t"  /* copy top bytes    */
	    "  pand     %%mm7, %%mm1  \n\t"  /* keep only luma    */
	    "  pxor     %%mm1, %%mm3  \n\t"  /* keep only chroma  */
	    "  psrlw       $8, %%mm3  \n\t"  /* right just chroma */
	    "  pmullw   %%mm5, %%mm1  \n\t"  /* mult by weighting factor */
	    "  pmullw   %%mm5, %%mm3  \n\t"  /* mult by weighting factor */
	    
	    "  movq     %%mm2, %%mm4  \n\t"  /* copy 2nd bytes    */
	    "  pand     %%mm7, %%mm2  \n\t"  /* keep only luma    */
	    "  pxor     %%mm2, %%mm4  \n\t"  /* keep only chroma  */
	    "  psrlw       $8, %%mm4  \n\t"  /* right just chroma */
	    "  pmullw   %%mm6, %%mm2  \n\t"  /* mult by weighting factor */
	    "  pmullw   %%mm6, %%mm4  \n\t"  /* mult by weighting factor */
	    
	    "  paddw    %%mm2, %%mm1  \n\t"  /* combine lumas     */
	    "  paddusw  %%mm0, %%mm1  \n\t"  /* round             */
	    "  psrlw       $8, %%mm1  \n\t"  /* right adjust luma */
	    "  movq     %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
	    
	    "  paddw    %%mm4, %%mm3  \n\t"  /* combine chromas     */
	    "  paddusw  %%mm0, %%mm3  \n\t"  /* round               */
	    "  psrlw       $8, %%mm3  \n\t"  /* right adjust chroma */
	    "  packuswb %%mm3, %%mm3  \n\t"  /* pack UV's into low dword */
	    "  movd     %%mm3, (%%"REGB", %%"REGA")  \n\t"  /* save in our work area */

	    "  lea      4(%%"REGA"), %%"REGA"  \n\t"
	    "  loop     "vLoopMMX"b      \n"

	    /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
	     * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and always have
	     * an even number so there will never be more than 2 left. trbarry 7/29/2002
	     */
	    ""MoreSpareChange":    \n\t"

	    "  cmpl  "_EndOffset", %%"REGEA"  \n\t"  /* did we get them all */
	    "  jnl   "DoHorizontal"f \n\t"  /* yes, else have 2 left */
	    "  movl  $1, %%"REGC"    \n\t"  /* jigger loop ct */
	    "  sub   $2, %%"REGA"    \n\t"  /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
	    "  jmp   "vLoopMMX"b     \n"

	    /*  We've taken care of the vertical scaling, now do horizontal  */
	    ""DoHorizontal":      \n\t"

	    "  movq  "_YMask",    %%mm7     \n\t"  /* useful 0U0U..  mask constant  */
	    "  movq  "_FPround2", %%mm6     \n\t"  /* useful rounding constant, dwords  */
	    "  mov   "_pControl", %%"REGSI" \n\t"  /* @ horiz control bytes  */	
	    "  movl  "_row_size", %%"REGC"  \n\t"
	    "  shrl  $2,          %%"REGC"  \n\t"  /* bytes a time, 2 pixels  */
	    "  mov   "_vWorkYW",  %%"REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
	    "  mov   "_dstp",     %%"REGDI" \n\t"  /* the destination line  */
	    "  mov   "_vWorkUVW", %%"REGB"  \n"    /* chroma data, as UVUV UVUV...  */

	    ".align 16  \n"
	    ""hLoopMMX":    \n\t"

	    /* x86_64: must use movl (accessing table of uint32's) */
	    "  movl      16(%%"REGSI"), %%"REGEA"        \n\t"  /* get data offset in pixels, 1st pixel pair */
	    "  movd      (%%"REGD", %%"REGA", 2), %%mm0  \n\t"  /* copy luma pair */
	    "  shr       $1, %%"REGA"                    \n\t"  /* div offset by 2 */
	    "  movd      (%%"REGB", %%"REGA", 2), %%mm1  \n\t"  /* copy UV pair VUVU */
	    "  psllw     $8, %%mm1                       \n\t"  /* shift out V, keep 0000U0U0 */
	    
	    /*  we need to use both even and odd chroma from same location - trb 9/2002 */
	    "  punpckldq (%%"REGB", %%"REGA", 2), %%mm1  \r\n"  /* copy UV pair VUVU  */
	    "  psrlw     $8, %%mm1                       \r\n"  /* shift out U0, keep 0V0V 0U0U   */
	    "  movl      20(%%"REGSI"), %%"REGEA"        \r\n"  /* get data offset in pixels, 2nd pixel pair  */
	    "  punpckldq (%%"REGD", %%"REGA", 2), %%mm0  \r\n"  /* copy luma pair  */
	    
	    "  pmaddwd    (%%"REGSI"), %%mm0  \r\n"  /* mult and sum lumas by ctl weights  */
	    "  paddusw    %%mm6, %%mm0        \r\n"  /* round  */
	    "  psrlw      $8, %%mm0           \r\n"  /* right just 2 luma pixel value 000Y,000Y  */
	    
	    "  pmaddwd    8(%%"REGSI"), %%mm1 \r\n"  /* mult and sum chromas by ctl weights */
	    "  paddusw    %%mm6, %%mm1        \r\n"  /* round */
	    "  pslld      $8, %%mm1           \r\n"  /* shift into low bytes of different words */
	    "  pand       %%mm7, %%mm1        \r\n"  /* keep only 2 chroma values 0V00,0U00  */
	    "  por        %%mm1, %%mm0        \r\n"  /* combine luma and chroma, 0V0Y,0U0Y  */
	    "  packuswb   %%mm0, %%mm0        \r\n"  /* pack all into low dword, xxxxVYUY  */
	    "  movd       %%mm0, (%%"REGDI")  \n\t"  /* done with 2 pixels */

	    "  lea     24(%%"REGSI"), %%"REGSI"  \n\t"  /* bump to next control bytes */
	    "  lea      4(%%"REGDI"), %%"REGDI"  \n\t"  /* bump to next output pixel addr */
	    
	    "  loop   "hLoopMMX"b             \n\t"  /* loop for more */

	    "emms              \n\t"
	    /* done with one line */

#if !defined(__x86_64__)
	    "mov "_oldbx", %%"REGB" \n\t"
#endif
	    ::
	    "m" /*0*/(FPround1), 
	    "m" /*1*/(vWeight1), 
	    "m" /*2*/(vWeight2), 
	    "m" /*3*/(YMask),
	    "m" /*4*/(src_row_size),
	    "m" /*5*/(EndOffset),
	    "m" /*6*/(pControl),
	    "m" /*7*/(row_size),
	    "m" /*8*/(vWorkYW),
	    "m" /*9*/(dstp),
	    "m" /*10*/(vWorkUVW),
	    "m" /*11*/(FPround2),
	    "m" /*12*/(srcp1),
	    "m" /*13*/(srcp2)
#if !defined(__x86_64__)
	    ,
	    "m" /*14*/(oldbx),
	    "m" /*15*/(SSEMMXenabledW),
	    "m" /*16*/(SSE2enabledW)
	    /* ebx is saved/restored by hand above (PIC register), so it is not listed */
	    : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI, "memory"
#else
	    : REGA, REGB, REGC, REGD, REGSI, REGDI, "memory"
#endif
	    );

    dstp += dst_pitch;
  }
#endif
  return 0;
}

/*
 * Scale one 8-bit plane (used for each YV12 plane) with 2-tap linear
 * interpolation: first vertically into a one-line work buffer, then
 * horizontally into the destination line.
 *
 *   dst, dst_pitch         destination plane and its line stride in bytes
 *   src, src_pitch         source plane and its line stride in bytes
 *   dst_width, dst_height  size of the destination plane
 *   src_width, src_height  size of the source plane (src_height may be the
 *                          height of the slice that is available so far)
 *   Interlaced             when non-zero the second source line is taken
 *                          two lines below the first one instead of one
 *   hControl               horizontal control table; the asm consumes it in
 *                          24 byte records per output pixel pair: 16 bytes
 *                          of pmaddwd weights followed by two 32-bit source
 *                          offsets (at +16 and +20)
 *   vOffsets, vWeights     per output line: index of the first source line
 *                          and the weight of the second source line (the
 *                          first line gets 256 - weight)
 *   vWorkY                 work buffer receiving one vertically
 *                          interpolated source line
 *   dst_start              first output line to produce
 *
 * Returns the index of the next output line to produce when the required
 * source line is outside src_height, otherwise 0.  On non-x86 builds the
 * function does nothing and returns 0.
 */
static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
			const int dst_pitch, const int src_pitch,
			const int dst_width, const int dst_height,
			const int src_width, const int src_height,
			const int Interlaced, const uint32_t * const hControl,
			const uint32_t * vOffsets, const uint32_t * vWeights,
			uint32_t *vWorkY, int dst_start)
{
#if defined(__i386__) || defined(__x86_64__)
  sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words     */
  sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords    */
  sse2_t vWeight1;
  sse2_t vWeight2;

  const uint32_t *pControl = &hControl[0];
  const uint32_t *vWorkYW = vWorkY;
  const uint8_t *srcp = src;
  const uint8_t *srcp1;
  const uint8_t *srcp2;
  uint8_t *dstp = dst + dst_pitch*dst_start;

  const uint32_t src_row_size = src_width;
  const uint32_t row_size = dst_width;

#if !defined(__x86_64__)
  const int accel = xine_mm_accel();
  const uint32_t SSE2enabledW   = !!(accel & MM_ACCEL_X86_SSE2);   /* in local storage for asm */
  const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
  long int oldbx; /* ebx is saved here at asm entry and restored at asm exit */
#endif
  int y;

  /* Operation in sliced mode:
   *  - continue until required next source line is out of slice
   *  - return next output line
   *  - at next call, continue from next source line
   */

  for (y = dst_start; y < dst_height; y++) {
    if(vOffsets[y] >= src_height) {
      /* slice completed */
      /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start was %d\n",
	(int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/
      return y;
    }

    /* replicate the two 16-bit line weights over a whole sse2 register */
    vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
      (256-vWeights[y]) << 16 | (256-vWeights[y]);
    vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
      vWeights[y] << 16 | vWeights[y];

    srcp1 = srcp + vOffsets[y] * src_pitch;

    /* The second line falls back to srcp1 only for the last output line(s);
     * the test is on the output line index, not on vOffsets[y].
     * NOTE(review): confirm that the table generator never selects the last
     * source line as srcp1 for earlier output lines. */
    if (Interlaced)
      srcp2 = (y < dst_height-2)  ?  srcp1 + 2 * src_pitch  :  srcp1;
    else
      srcp2 = (y < dst_height-1)  ?  srcp1 + src_pitch  :  srcp1;

    __asm__  __volatile__(
#if !defined(__x86_64__)
	     /* ebx is used as scratch below but is not in the clobber list;
	      * save it so that the restore at the end reloads a valid value */
	     "mov  %%"REGB", "_oldbx"     \n\t"
#endif
	     "movl "_src_row_size", %%"REGC" \n\t"
	     "shr  $3,         %%"REGC"   \n\t"  /* 8 bytes a time */
	     "mov  "_srcp1",   %%"REGSI"  \n\t"  /* top of 2 src lines to get */
	     "mov  "_srcp2",   %%"REGD"   \n\t"  /* next "  */ 
	     "mov  "_vWorkYW", %%"REGDI"  \n\t"  /* luma work destination line */
	     "xor  %%"REGA",   %%"REGA"   \n\t"
#if !defined(__x86_64__)
	     /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
	      * This first loop is not the performance bottleneck anyway but it is trivial to tune
	      * using SSE2 if we have proper alignment.
	      */
	     "testl $1, "_SSE2enabledW"   \n\t"  /* is SSE2 supported? */
	     "jz    "vMaybeSSEMMX"f       \n\t"  /* n, can't do anyway */
#endif
	     "cmpl  $2, %%"REGC"          \n\t"  /* we have at least 16 bytes, 2 qwords? */
	     "jl    "vMaybeSSEMMX"f       \n\t"  /* n, don't bother */
	     
	     "mov   %%"REGSI", %%"REGB"   \n\t"
	     "or    %%"REGD",  %%"REGB"   \n\t"
	     "test  $15,       %%"REGB"   \n\t"  /* both src rows 16 byte aligned? */
	     "jnz   "vMaybeSSEMMX"f       \n\t"  /* n, don't use sse2 */
			 
	     "shr   $1, %%"REGC"          \n\t"  /* do 16 bytes at a time instead */
	     "dec   %%"REGC"              \n\t"  /* jigger loop ct */
			 
	     "movdqa "_FPround1", %%xmm0  \n\t"
	     "movdqa "_vWeight1", %%xmm5  \n\t"
	     "movdqa "_vWeight2", %%xmm6  \n\t"
	     "pxor        %%xmm7, %%xmm7  \n"

	     ".align 16                   \n"
	     ""vLoopSSE2_Fetch":          \n\t"
#ifdef PREFETCH
	     "  prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
	     "  prefetcht0 16(%%"REGD",  %%"REGA", 2) \n"
#endif
	     ""vLoopSSE2":  \n\t"
	     /* the source pointers were already checked to be dqword aligned */
	     "  movdqa  (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
	     "  movdqa  (%%"REGD",  %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
	     "  movdqa    %%xmm1, %%xmm2  \n\t"
	     "  movdqa    %%xmm3, %%xmm4  \n\t"

	     "  punpcklbw %%xmm7, %%xmm1  \n\t"  /* make words */
	     "  punpckhbw %%xmm7, %%xmm2  \n\t"  /*     "     */
	     "  punpcklbw %%xmm7, %%xmm3  \n\t"  /*     "     */
	     "  punpckhbw %%xmm7, %%xmm4  \n\t"  /*     "     */

	     "  pmullw    %%xmm5, %%xmm1  \n\t"  /* mult by top weighting factor */
	     "  pmullw    %%xmm5, %%xmm2  \n\t"  /*    "    */
	     "  pmullw    %%xmm6, %%xmm3  \n\t"  /* mult by bot weighting factor */
	     "  pmullw    %%xmm6, %%xmm4  \n\t"  /*    "    */

	     "  paddw     %%xmm3, %%xmm1  \n\t"  /* combine lumas low */
	     "  paddw     %%xmm4, %%xmm2  \n\t"  /* combine lumas high */

	     "  paddusw   %%xmm0, %%xmm1  \n\t"  /* round */
	     "  paddusw   %%xmm0, %%xmm2  \n\t"  /* round */
			
	     "  psrlw     $8, %%xmm1      \n\t"  /* right adjust luma */
	     "  psrlw     $8, %%xmm2      \n\t"  /* right adjust luma */

	     "  packuswb  %%xmm2, %%xmm1  \n\t"  /* pack words to our 16 byte answer */
#ifdef STREAMING_STORE_TMP
	     "  movntdq   %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
#else
	     "  movdqu    %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
#endif			 
	     "  lea   16(%%"REGA"), %%"REGA" \n\t"
	     "  decl  %%"REGC"            \n\t"

	     "  jg    "vLoopSSE2_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
	     "  jz    "vLoopSSE2"b        \n\t"  /* or just loop, or not  */

	     /* done with our SSE2 fortified loop but we may need to pick up the spare change */
#ifdef STREAMING_STORE_TMP
	     "  sfence                  \n\t"
#endif
	     "  movl  "_src_row_size", %%"REGC" \n\t"  /* get count again   */
	     "  andl  $15, %%"REGC"       \n\t"  /* just need mod 16  */
	     "  movq "_vWeight1", %%mm5   \n\t"
	     "  movq "_vWeight2", %%mm6   \n\t"
	     "  movq "_FPround1", %%mm0   \n\t"  /* useful rounding constant  */

	     "  shrl  $3, %%"REGC"        \n\t"  /* 8 bytes at a time, any?  */
	     "  jz   "MoreSpareChange"f   \n"    /* n, did them all  */

	     /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
	      * This first loop is not the performance bottleneck anyway but it is trivial to tune
	      * using SSE if we have proper alignment.
	      */
	     ""vMaybeSSEMMX":             \n\t"

	     "  movq "_vWeight1", %%mm5   \n\t"  
	     "  movq "_vWeight2", %%mm6   \n\t"  
	     "  movq "_FPround1", %%mm0   \n\t"  /* useful rounding constant  */
	     "  pxor       %%mm7, %%mm7   \n\t"
#if !defined(__x86_64__)
	     "  testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXT supported? */
	     "  jz    "vLoopMMX"f         \n\t"  /* n, can't do anyway */
#endif
	     "  decl  %%"REGC"      \n"  /* jigger loop ctr */
			 
	     ".align 16             \n"
	     ""vLoopSSEMMX_Fetch":  \n\t"
#ifdef PREFETCH
	     "  prefetcht0 8(%%"REGSI", %%"REGA")  \n\t"
	     "  prefetcht0 8(%%"REGD",  %%"REGA")  \n"
#endif
	     ""vLoopSSEMMX":   \n\t"

	     "  movq    (%%"REGSI", %%"REGA"), %%mm1  \n\t"   /* top of 2 lines to interpolate */
	     "  movq    (%%"REGD",  %%"REGA"), %%mm3  \n\t"   /* 2nd of 2 lines    */

	     "  movq      %%mm1, %%mm2  \n\t"
	     "  movq      %%mm3, %%mm4  \n\t"

	     "  punpcklbw %%mm7, %%mm1  \n\t"  /* make words */
	     "  punpckhbw %%mm7, %%mm2  \n\t"  /*     "     */
	     "  punpcklbw %%mm7, %%mm3  \n\t"  /*     "     */
	     "  punpckhbw %%mm7, %%mm4  \n\t"  /*     "     */

	     "  pmullw    %%mm5, %%mm1  \n\t"  /* mult by top weighting factor */
	     "  pmullw    %%mm5, %%mm2  \n\t"  /*    "    */
	     "  pmullw    %%mm6, %%mm3  \n\t"  /* mult by bot weighting factor */
	     "  pmullw    %%mm6, %%mm4  \n\t"  /*    "    */

	     "  paddw     %%mm3, %%mm1  \n\t"  /* combine lumas low */
	     "  paddw     %%mm4, %%mm2  \n\t"  /* combine lumas high */

	     "  paddusw   %%mm0, %%mm1  \n\t"  /* round */
	     "  paddusw   %%mm0, %%mm2  \n\t"  /* round */
			
	     "  psrlw     $8, %%mm1     \n\t"  /* right adjust luma */
	     "  psrlw     $8, %%mm2     \n\t"  /* right adjust luma */

	     "  packuswb  %%mm2, %%mm1  \n\t"  /* pack words to our 8 byte answer */
#ifdef STREAMING_STORE_TMP
	     "  movntq    %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
#else
	     "  movq      %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
#endif
	     "  lea   8(%%"REGA"), %%"REGA" \n\t"
	     "  decl  %%"REGC"              \n\t"

	     "  jg    "vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
	     "  jz    "vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
#ifdef STREAMING_STORE_TMP
	     "  sfence                      \n\t"
#endif
	     "  jmp    "MoreSpareChange"f   \n"    /* all done with vertical  */

	     ".align 16        \n"
	     ""vLoopMMX":      \n\t"

	     "  movq    (%%"REGSI", %%"REGA"), %%mm1  \n\t"  /* top of 2 lines to interpolate */
	     "  movq    (%%"REGD",  %%"REGA"), %%mm3  \n\t"  /* 2nd of 2 lines    */

	     "  movq      %%mm1, %%mm2  \n\t"
	     "  movq      %%mm3, %%mm4  \n\t"

	     "  punpcklbw %%mm7, %%mm1  \n\t"  /* make words */
	     "  punpckhbw %%mm7, %%mm2  \n\t"  /*     "     */
	     "  punpcklbw %%mm7, %%mm3  \n\t"  /*     "     */
	     "  punpckhbw %%mm7, %%mm4  \n\t"  /*     "     */
	     
	     "  pmullw    %%mm5, %%mm1  \n\t"  /* mult by top weighting factor */
	     "  pmullw    %%mm5, %%mm2  \n\t"  /*    "    */
	     "  pmullw    %%mm6, %%mm3  \n\t"  /* mult by bot weighting factor */
	     "  pmullw    %%mm6, %%mm4  \n\t"  /*    "    */

	     "  paddw     %%mm3, %%mm1  \n\t"  /* combine lumas low */
	     "  paddw     %%mm4, %%mm2  \n\t"  /* combine lumas high */

	     "  paddusw   %%mm0, %%mm1  \n\t"  /* round */
	     "  paddusw   %%mm0, %%mm2  \n\t"  /* round */
			
	     "  psrlw     $8, %%mm1     \n\t"  /* right adjust luma */
	     "  psrlw     $8, %%mm2     \n\t"  /* right adjust luma */

	     "  packuswb  %%mm2, %%mm1  \n\t"  /* pack words to our 8 byte answer */
	     "  movq      %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
			 
	     "  lea   8(%%"REGA"), %%"REGA" \n\t"
	     "  loop  "vLoopMMX"b  \n"

	     /* Add a little code here to check if we have more pixels to do and, if so, make one
	      * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and always have
	      * an even number so there will never be more than 7 left.
	      */
	     ""MoreSpareChange":     \n\t"

	     "  cmpl "_src_row_size", %%"REGEA"  \n\t"  /* did we get them all */
	     "  jnl  "DoHorizontal"f  \n\t"  /* yes, else redo the last 8 bytes of the row */
	     "  movl $1, %%"REGC"     \n\t"  /* jigger loop ct */
	     "  movl "_src_row_size", %%"REGEA"  \n\t"
	     "  sub  $8, %%"REGA"     \n\t"  /* back up to last 8 pixels */
	     "  jmp  "vLoopMMX"b      \n"

	     /*  We've taken care of the vertical scaling, now do horizontal  */
	     ""DoHorizontal":        \n\t"
	     "  pxor        %%mm7, %%mm7     \n\t"
	     "  movq  "_FPround2", %%mm6     \n\t"  /* useful rounding constant, dwords  */
	     "  mov   "_pControl", %%"REGSI" \n\t"  /* @ horiz control bytes  */	
	     "  movl  "_row_size", %%"REGC"  \n\t"
	     "  shrl  $2, %%"REGC"          \n\t"  /* 4 bytes a time, 4 pixels  */
	     "  mov   "_vWorkYW",  %%"REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
	     "  mov   "_dstp",     %%"REGDI" \n\t"  /* the destination line  */
#if !defined(__x86_64__)
	     "  testl $1, "_SSEMMXenabledW" \n\t"  /* MMXEXT supported? */
	     "  jz    "hLoopMMX"f           \n\t"  /* n, can't do anyway */
#endif
	     /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
	     "  shrl  $1, %%"REGC"  \n\t"  /* 8 bytes a time instead of 4  */
	     "  jz    "LessThan8"f  \n"

	     ".align 16          \n"
	     ""hLoopMMXSSE":    \n\t"


	     /* handle first 2 pixels */
	     /* phi: must use movl here (x86_64, reading from table of uint_32's) */
	     "  movl   16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
	     "  movl   20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */

	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm0      \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  movl      16+24(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 3rd pixel pair */
	     "  movl      20+24(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 4th pixel pair  */
	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* handle 3rd and 4th pixel pairs */
	     "  movd      (%%"REGD", %%"REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm1        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  movl      16+48(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 5th pixel pair */
	     "  movl      20+48(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 6th pixel pair  */
	     "  pmaddwd 24(%%"REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm1       \n\t"  /* round */
	     "  psrlw         $8, %%mm1       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* handle 5th and 6th pixel pairs */
	     "  movd      (%%"REGD", %%"REGA"), %%mm2  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm2  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm2        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  movl      16+72(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 7th pixel pair */
	     "  movl      20+72(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 8th pixel pair  */
	     "  pmaddwd 48(%%"REGSI"), %%mm2  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm2       \n\t"  /* round */
	     "  psrlw         $8, %%mm2       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* handle 7th and 8th pixel pairs */
	     "  movd      (%%"REGD", %%"REGA"), %%mm3  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm3  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm3        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  pmaddwd 72(%%"REGSI"), %%mm3  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm3       \n\t"  /* round */
	     "  psrlw         $8, %%mm3       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* combine, store, and loop */
	     "  packuswb %%mm1, %%mm0         \n\t"  /* pack into qword, 0Y0Y0Y0Y */
	     "  packuswb %%mm3, %%mm2         \n\t"  /* pack into qword, 0Y0Y0Y0Y */
	     "  packuswb %%mm2, %%mm0         \n\t"  /* and again into  YYYYYYYY */			
#ifdef STREAMING_STORE
	     "  movntq   %%mm0, (%%"REGDI")   \n\t"  /* done with 8 pixels */
#else
	     "  movq     %%mm0, (%%"REGDI")   \n\t"  /* done with 8 pixels */
#endif

	     "  lea  96(%%"REGSI"), %%"REGSI" \n\t"
	     "  lea   8(%%"REGDI"), %%"REGDI" \n\t"
	     "  decl  %%"REGC"                \n\t"
	     "  jg    "hLoopMMXSSE"b    \n\t"   /* loop for more  */
#ifdef STREAMING_STORE
	     "  sfence                  \n"
#endif
	     ""LessThan8":    \n\t"
	     "  movl "_row_size", %%"REGC"  \n\t"
	     "  andl          $7, %%"REGC"  \n\t"  /* we have done all but maybe this */
	     "  shrl          $2, %%"REGC"  \n\t"  /* now do only 4 bytes at a time */
	     "  jz            "LessThan4"f  \n"

	     ".align 16   \n"
	     ""hLoopMMX":    \n\t"

	     /* handle first 2 pixels */
	     "  movl   16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
	     "  movl   20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm0      \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  movl      16+24(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 3rd pixel pair */
	     "  movl      20+24(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 4th pixel pair  */
	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* handle 3rd and 4th pixel pairs */
	     /* The second pair must be merged as a word (punpcklwd): the following
	      * punpcklbw only expands the low dword, so a dword merge would drop
	      * the bytes fetched through the 4th pair's offset. */
	     "  movd      (%%"REGD", %%"REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm1        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
	     "  pmaddwd 24(%%"REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm1       \n\t"  /* round */
	     "  psrlw         $8, %%mm1       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */

	     /* combine, store, and loop */
	     "  packuswb %%mm1, %%mm0         \n\t"  /* pack into qword, 0Y0Y0Y0Y */
	     "  packuswb %%mm7, %%mm0         \n\t"  /* and again into  0000YYYY */			
	     "  movd     %%mm0, (%%"REGDI")   \n\t"  /* done with 4 pixels */
	     "  lea  48(%%"REGSI"), %%"REGSI" \n\t"
	     "  lea   4(%%"REGDI"), %%"REGDI" \n\t"

	     "  loop   "hLoopMMX"b            \n"    /* loop for more */
		 
	     /* test to see if we have a mod 4 size row, if not then more spare change */
	     ""LessThan4":    \n\t"
	     "  movl "_row_size", %%"REGC"    \n\t"
	     "  andl          $3, %%"REGC"    \n\t"  /* remainder size mod 4 */
	     "  cmpl          $2, %%"REGC"    \n\t"  
	     "  jl            "LastOne"f      \n\t"  /* less than 2 left */

	     /* handle 2 more pixels */
	     "  movl      16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
	     "  movl      20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
	     "  punpcklbw %%mm7, %%mm0        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */

	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw   %%mm6, %%mm0        \n\t"  /* round */
	     "  psrlw        $8, %%mm0        \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
	     "  packuswb  %%mm7, %%mm0        \n\t"  /* pack into qword, 00000Y0Y */
	     "  packuswb  %%mm7, %%mm0        \n\t"  /* and again into  000000YY */			
	     "  movd      %%mm0, (%%"REGDI")  \n\t"  /* store, we are guaranteed room in buffer (8 byte mult) */
	     "  subl         $2, %%"REGC"     \n\t"  
	     
	     "  lea  24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
	     "  lea   2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */

	     /* maybe one last pixel */
	     ""LastOne":   \n\t"
	     "  cmpl   $0, %%"REGC"   \r\n"  /* still more ? */
	     "  jz     "AllDone"f     \r\n"  /* n, done */
	     "  movl   16(%%"REGSI"), %%"REGEA"     \n\t"  /* get data offset in pixels, 1st pixel pair */
	     "  movd   (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
	     "  punpcklbw %%mm7, %%mm0        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */

	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
	     "  movd       %%mm0, %%"REGEA"   \n\t"
	     "  movb        %%al, (%%"REGDI") \n"    /* store last one */
			 
	     ""AllDone":  \n\t"
	     "  emms      \n\t"
#if !defined(__x86_64__)
	     /* restore the ebx value saved at entry */
	     "mov "_oldbx", %%"REGB" \n\t"
#endif
	     ::
	     "m" /*0*/(FPround1),
	     "m" /*1*/(vWeight1),
	     "m" /*2*/(vWeight2),
	     "m" /*3*/(y/*YMask[0]*/),
	     "m" /*4*/(src_row_size),
	     "m" /*5*/(y/*EndOffset*/),
	     "m" /*6*/(pControl),
	     "m" /*7*/(row_size),
	     "m" /*8*/(vWorkYW),
	     "m" /*9*/(dstp),
	     "m" /*10*/(y/*vWorkUVW*/),
	     "m" /*11*/(FPround2),
	     "m" /*12*/(srcp1),
	     "m" /*13*/(srcp2)
	     /* "memory": the asm stores to the work buffer, to the destination
	      * line and (on x86) to oldbx, none of which are output operands */
#if !defined(__x86_64__)
	     ,
	     "m" /*14*/(oldbx),
	     "m" /*15*/(SSEMMXenabledW),
	     "m" /*16*/(SSE2enabledW)
	     : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI, "memory"
#else
	     : REGA, REGB, REGC, REGD, REGSI, REGDI, "memory"
#endif
	     );

    dstp += dst_pitch;
  }
#endif
  return 0; 
}

/*
 * tools
 */

/*
 * ALIGN(b,p): round pointer p up to the next multiple of b (b must be a
 *             power of two); yields a void*.  The pointer is converted
 *             through uintptr_t so that no address bits are lost on
 *             platforms where long is narrower than a pointer.
 * MIN/MAX/FABS: plain expression macros; arguments may be evaluated twice,
 *             so do not pass expressions with side effects.
 */
#ifndef ALIGN
#  define ALIGN(b,p) ((void*)((((uintptr_t)(p)) + ((uintptr_t)(b)-1)) & (~((uintptr_t)(b)-1))))
#endif
#ifndef MIN
#  define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
#ifndef MAX
#  define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
#ifndef FABS
#  define FABS(x) ((x) < 0.0 ? -(x) : (x))
#endif

/*
 * xine plugin
 */

/*
 * Identification macros defined before including xine/post_util.h.
 * NOTE(review): presumably post_util.h consumes these to generate the
 * plugin class boilerplate (init_plugin, intercept_frame_yuy and post_draw
 * are used further down but not defined in this part of the file) - confirm.
 */
#define PLUGIN_ID     "warp"
/* NOTE(review): the trailing ';' is part of the macro expansion; whether it
 * is harmless depends on how post_util.h uses PLUGIN_DESCR - confirm before
 * removing it. */
#define PLUGIN_DESCR  "(non-)linear software scaling post plugin";
#define PLUGIN_T      warp_plugin_t
/*#define POST_THREADS*/
/*#define POST_SLICES*/
#include "xine/post_util.h"


/* plugin class initialization function */
void *warp_init_plugin(xine_t *xine, void *);

/* plugin class functions */
static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
				  xine_audio_port_t **audio_target,
				  xine_video_port_t **video_target);

/* plugin instance functions */
static void        warp_dispose(post_plugin_t *this_gen);

/* vo_frame functions */
static vo_frame_t *got_frame(vo_frame_t *frame);
static void        draw_internal(vo_frame_t *frame, vo_frame_t *new_frame);


/* parameter functions (collected into the xine_post_api_t in open_plugin) */
static xine_post_api_descr_t *warp_get_param_descr(void);
static int                    warp_set_parameters(xine_post_t *this_gen, void *param_gen);
static int                    warp_get_parameters(xine_post_t *this_gen, void *param_gen);
static char                  *warp_get_help(void);


/*
 * User-settable plugin parameters.  got_frame() treats a zero
 * output_width / output_height / output_aspect as "keep the value of the
 * input frame".
 */
typedef struct warp_parameters_s {
  int    output_width;   /* requested output width,  0 = same as input */
  int    output_height;  /* requested output height, 0 = same as input */
  double output_aspect;  /* requested output aspect ratio, 0 = same as input */
  int    no_downscaling; /* non-zero: never produce output smaller than the input */
} warp_parameters_t;

/*
 * Parameter description table exposed through warp_get_param_descr().
 * NOTE(review): the two numbers after NULL are presumably the min/max of
 * each parameter as defined by the PARAM_ITEM macro in xine/post.h - confirm.
 */
START_PARAM_DESCR(warp_parameters_t)
PARAM_ITEM(POST_PARAM_TYPE_INT,    output_width,  NULL, 640, 1920, 0,
  "output video width")
PARAM_ITEM(POST_PARAM_TYPE_INT,    output_height, NULL, 480, 1080, 0,
  "output video height")
PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL,   1,    3, 0,
  "output video aspect ratio")
PARAM_ITEM(POST_PARAM_TYPE_BOOL,   no_downscaling,NULL,   0,    1, 0,
  "disable downscaling")
END_PARAM_DESCR(warp_param_descr)


/*
 * Per-instance state of the warp post plugin.
 * `post` must stay the first member: the code casts between
 * post_plugin_t* and warp_plugin_t*.
 */
typedef struct {
  post_plugin_t  post;

  /* "parameters" input registered in open_plugin() */
  xine_post_in_t parameter_input;

  /* User config  (changes to actual config are delayed) */
  warp_parameters_t config;

  /* Current config */
  int    enable;          /* 0 = frames are passed through without processing */
  int    output_width;
  int    output_height;
  double output_aspect;
  double factor_x;        /* warp factors computed by calculate_factors() */
  double factor_y;

  /* Last seen input frame */
  int    input_width;     /* 0 = not known yet, forces re-initialization */
  int    input_height;
  int    input_format;
  int    input_interlaced;
  double input_aspect;

  /* working buffers */
  uint32_t *vWorkY;
  uint32_t *vWorkUV;

  /* scaling tables */
  uint32_t *hControl;
  uint32_t *hControlUV;
  uint32_t *vOffsets;
  uint32_t *vOffsetsUV;
  uint32_t *vWeights;
  uint32_t *vWeightsUV;

  /* memory for work areas and scaling tables
   * (all of the pointers above point into this single allocation) */
  void *pMem;

} warp_plugin_t;

/*
 *
 */

/*
 * (Re)allocate the work buffers and scaling tables for the current
 * input/output geometry and fill the tables for the current input format.
 *
 * All buffers are carved out of one malloc'ed block (this->pMem), each one
 * aligned to 128 bytes and separated from the next by 128 bytes of slack.
 *
 * If the allocation fails, every table pointer is cleared and
 * this->enable is set to 0, so that got_frame() (which tests enable right
 * after calling this function) passes frames through unprocessed instead
 * of writing through pointers derived from NULL.
 */
static void init_tables(warp_plugin_t *this)
{
#define BP(x) ((uint8_t*)(x))
  /* allocate memory for scaling tables and workspace */
  free(this->pMem);
  this->pMem = malloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 +
		      this->output_height*sizeof(uint32_t)*4 + 2*9*128);

  if (!this->pMem) {
    DBG("init_tables: out of memory, disabling processing\n");
    this->vWorkY     = NULL;
    this->vWorkUV    = NULL;
    this->hControl   = NULL;
    this->hControlUV = NULL;
    this->vOffsets   = NULL;
    this->vOffsetsUV = NULL;
    this->vWeights   = NULL;
    this->vWeightsUV = NULL;
    this->enable     = 0;
    return;
  }

  /* - aligned for P4 cache line */
  this->vWorkY   = (uint32_t*)ALIGN(128, this->pMem);
  this->vWorkUV  = (uint32_t*)ALIGN(128, BP(this->vWorkY)   + this->input_width*2 + 128);
  this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV)  + this->input_width   + 128);
  this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width  * sizeof(uint32_t) * 3 + 128);
  this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128);

  if (this->input_format == XINE_IMGFMT_YV12) {
    /* planar format: chroma planes get their own set of tables */
    this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights)   + this->output_height * sizeof(uint32_t) + 128);
    this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128);
    this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128);

    init_tables_yv12(this->output_width, this->output_height,
		     this->input_width,  this->input_height,
		     this->input_interlaced, this->factor_x, this->factor_y, 
		     this->hControl,   this->vOffsets,   this->vWeights,
		     this->hControlUV, this->vOffsetsUV, this->vWeightsUV );

  } else if (this->input_format == XINE_IMGFMT_YUY2) {

    init_tables_yuy2(this->output_width, this->output_height,
		     this->input_width,  this->input_height,
		     this->input_interlaced, this->factor_x, this->factor_y, 
		     this->hControl, this->vOffsets, this->vWeights );
  }
}

/*
 * Derive the horizontal/vertical warp factors from the difference between
 * the input and the output aspect ratio.
 *
 *  - |diff| <= 0.1            : no warp (both factors 1.0)
 *  - diff up to 0.1 + 4/9     : factors scale linearly with |diff|, reaching
 *                               0.95/1.15 (input wider than output) or
 *                               1.05/0.85 (input narrower) at |diff| = 4/9
 *  - larger |diff|            : factors are fixed at those end values and
 *                               output_aspect is moved towards the input so
 *                               that the remaining difference is 4/9
 *
 * Writes this->factor_x, this->factor_y and possibly this->output_aspect.
 */
static void calculate_factors(warp_plugin_t *this)
{
  /* try to guess amount to stretch/shrink */
  double adiff = this->input_aspect - this->output_aspect;
  this->factor_x = 1.0;
  this->factor_y = 1.0;

  if (adiff > 0.1) {

    if (adiff > 0.1 + ((16.0-12.0)/9.0)) {
      /* >16:9 -> >4:3 */
      DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff);
      this->factor_x = 0.95;
      this->factor_y = 1.15;
      this->output_aspect += (adiff - 4.0/9.0);
      DBG("  changing target ratio to %3.1lf\n", this->output_aspect);
    } else {
      /* 16:9 ... 12:9 -> 4:3 */
      DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff);
      this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0;
      this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0;
    }

  } else if (adiff < -0.1) {

    if(adiff < -0.1-((16.0-12.0)/9.0)) {
      /* <4:3 -> <16:9 */
      DBG("aspect ratio diff %1.3lf < 0 : too large !\n", adiff);
      this->factor_x = 1.05;
      this->factor_y = 0.85;
      this->output_aspect += (adiff + 4.0/9.0);
      DBG("  changing target ratio to %3.1lf\n", this->output_aspect);
    } else {
      /* 4:3...16:9 -> 16:9 */
      DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff);
      /* adiff is negative here: use its magnitude so that the factors
       * approach the 1.05 / 0.85 end values of the branch above instead of
       * repeating the direction used for positive differences */
      this->factor_x = 1.0 + 0.05 * (-adiff) * 9.0/4.0;
      this->factor_y = 1.0 - 0.15 * (-adiff) * 9.0/4.0;
    }

  } else {
    DBG("aspect ratio matches, no warp\n");
    this->factor_x = 1.0;
    this->factor_y = 1.0;
  }

  DBG("factor_x = %1.3lf factor_y = %1.3lf  output ratio = %1.3lf\n", 
      this->factor_x, this->factor_y, this->output_aspect);
}

/*
 *
 */

/*
 * Plugin class entry point: refuses to load (returns NULL) when the CPU
 * does not report MMX on non-x86_64 builds, otherwise hands over to
 * init_plugin().
 */
void *warp_init_plugin(xine_t *xine, void *data)
{
#if !defined(__x86_64__)
  /* the scalers need at least MMX; the check is compiled out on x86_64 */
  const int have_mmx = (xine_mm_accel() & MM_ACCEL_X86_MMX) != 0;

  if (!have_mmx) {
    fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n");
    return NULL;
  }
#endif

  return init_plugin(xine, data);
}

/*
 * Create a plugin instance: intercept the first video target port,
 * register the "parameters" input and start with a configuration that
 * leaves size and aspect ratio untouched.
 * Returns NULL when no video target is given or the allocation fails.
 */
static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs,
					 xine_audio_port_t **audio_target,
					 xine_video_port_t **video_target)
{
  static xine_post_api_t post_api =
    { warp_set_parameters,  warp_get_parameters, warp_get_param_descr, warp_get_help };

  warp_plugin_t     *this;
  post_plugin_t     *plugin;
  post_video_port_t *vport;
  post_in_t         *video_in;
  post_out_t        *video_out;
  xine_post_in_t    *params_in;

  /* nothing to attach to */
  if (!video_target || !video_target[0])
    return NULL;

  this = calloc(1, sizeof(warp_plugin_t));
  if (!this)
    return NULL;
  plugin = (post_plugin_t *) this;

  _x_post_init(plugin, 0, 1);

  /* hook into the video port */
  vport = _x_post_intercept_video_port(plugin, video_target[0], &video_in, &video_out);
  vport->intercept_frame = intercept_frame_yuy;
  vport->new_frame->draw = post_draw;
  video_in->xine_in.name   = "video";
  video_out->xine_out.name = "video (scaled)";
  plugin->xine_post.video_input[0] = &vport->new_port;

  plugin->dispose = warp_dispose;

  /* parameter input */
  params_in       = &this->parameter_input;
  params_in->name = "parameters";
  params_in->type = XINE_POST_DATA_PARAMETERS;
  params_in->data = &post_api;
  xine_list_push_back(plugin->input, params_in);

  /* defaults: keep aspect ratio, width and height of the input */
  this->config.output_aspect  = 0.0;
  this->config.output_width   = 0;
  this->config.output_height  = 0;
  this->config.no_downscaling = 0;

  /* input size not known yet; triggers initialization on the first frame */
  this->input_width  = 0;
  this->input_height = 0;

  return plugin;
}

/*
 * Release the plugin instance once _x_post_dispose() reports that it may
 * be freed: frees the table/work memory and the instance itself.
 */
static void warp_dispose(post_plugin_t *this_gen)
{
  warp_plugin_t *self;

  if (!_x_post_dispose(this_gen))
    return;

  self = (warp_plugin_t *) this_gen;

  DBG("dispose\n");

  free(self->pMem);
  free(self);
}

/*
 * Called for every incoming frame.  Detects changes of the input format
 * (size, pixel format, aspect ratio, interlacing), re-derives the output
 * geometry and scaling tables when needed, and requests an output frame
 * from the original port.
 * Returns NULL when no processing is required for this frame.
 */
static vo_frame_t *got_frame(vo_frame_t *frame)
{
  post_video_port_t *port = (post_video_port_t *)frame->port;
  warp_plugin_t     *this = (warp_plugin_t *)port->post;
  const int          frame_interlaced = !!(frame->flags & VO_INTERLACED_FLAG);
  double             aspect_delta = this->input_aspect - frame->ratio;
  int                changed;

  changed = this->input_width      != frame->width  ||
            this->input_height     != frame->height ||
            this->input_format     != frame->format ||
            FABS(aspect_delta) > 0.1                ||
            this->input_interlaced != frame_interlaced;

  if (changed) {

    DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n",
	this->input_width, this->input_height, frame->width, frame->height,
	this->input_interlaced, frame_interlaced,
	this->input_aspect, frame->ratio,
	this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2",
	frame->format==XINE_IMGFMT_YV12 ? "yv12":"yuy2" );

    /* drop the old tables and buffers */
    free(this->pMem);
    this->pMem = NULL;

    /* remember what this frame looks like so the next change is noticed */
    this->input_width      = frame->width;
    this->input_height     = frame->height;
    this->input_format     = frame->format;
    this->input_aspect     = frame->ratio;
    this->input_interlaced = frame_interlaced;

    /* target aspect ratio and size: zero in the config means "as input" */
    if (this->config.output_aspect)
      this->output_aspect = this->config.output_aspect;
    else
      this->output_aspect = frame->ratio;

    if (this->config.no_downscaling) {
      this->output_width  = MAX(this->config.output_width,  frame->width);
      this->output_height = MAX(this->config.output_height, frame->height);
    } else {
      this->output_width  = this->config.output_width  ? this->config.output_width
                                                        : frame->width;
      this->output_height = this->config.output_height ? this->config.output_height
                                                        : frame->height;
    }

    /* calculate warp function factors (may adjust output_aspect) */
    calculate_factors(this);

    aspect_delta = this->input_aspect - this->output_aspect;
    if (this->output_width  == frame->width  &&
        this->output_height == frame->height &&
        aspect_delta < 0.1 && aspect_delta > -0.1) {
      this->enable = 0;
      DBG("--> nothing to do, disabling processing for now\n");
      return NULL;
    }

    this->enable = 1;

    init_tables(this);
  }

  if (!this->enable)
    return NULL;

  return port->original_port->get_frame(port->original_port,
					this->output_width, this->output_height,
					this->output_aspect, frame->format,
					frame->flags | VO_BOTH_FIELDS);
}

/*
 * Scale/warp the picture held in 'frame' into 'new_frame'.
 *
 * The plugin instance is reached through frame->port->post; it supplies the
 * target size, the interlaced flag, the horizontal/vertical control tables
 * and the work buffers that are handed to the do_warp_*() routines.
 *
 *  - YUY2: the single packed plane is processed with one do_warp_yuy2() call.
 *  - YV12: plane 0 is processed with the Y tables at full size, then
 *          planes 1 and 2 are processed with the UV tables at half the
 *          width and half the height.
 *  - any other format: nothing is done.
 */
static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame)
{
  post_video_port_t *port = (post_video_port_t *)frame->port;
  warp_plugin_t *this = (warp_plugin_t *)port->post;
  int src_height = frame->height;
  int plane;

  if (frame->format == XINE_IMGFMT_YUY2) {
    do_warp_yuy2(new_frame->base[0], frame->base[0],
                 new_frame->pitches[0], frame->pitches[0],
                 this->output_width, this->output_height,
                 frame->width, src_height,
                 this->input_interlaced,
                 this->hControl, this->vOffsets, this->vWeights,
                 this->vWorkY, this->vWorkUV,
                 0);
    return;
  }

  if (frame->format != XINE_IMGFMT_YV12)
    return;

  /* plane 0: full resolution, Y tables */
  do_warp_yv12(new_frame->base[0], frame->base[0],
               new_frame->pitches[0], frame->pitches[0],
               this->output_width, this->output_height,
               frame->width, src_height,
               this->input_interlaced,
               this->hControl, this->vOffsets, this->vWeights,
               this->vWorkY,
               0);

  /* planes 1 and 2: half width and half height, UV tables */
  src_height /= 2;
  for (plane = 1; plane <= 2; plane++) {
    do_warp_yv12(new_frame->base[plane], frame->base[plane],
                 new_frame->pitches[plane], frame->pitches[plane],
                 this->output_width/2, this->output_height/2,
                 frame->width/2, src_height,
                 this->input_interlaced,
                 this->hControlUV, this->vOffsetsUV, this->vWeightsUV,
                 this->vWorkUV,
                 0);
  }
}

/*
 * parameter functions
 */

/*
 * Return a pointer to the file-level parameter description table
 * (warp_param_descr) of this plugin.
 */
static xine_post_api_descr_t *warp_get_param_descr(void)
{
  xine_post_api_descr_t *descr = &warp_param_descr;

  return descr;
}

/*
 * Install a new parameter set.
 *
 *   this_gen:  plugin instance (a warp_plugin_t)
 *   param_gen: points to a warp_parameters_t that is copied into this->config
 *
 * The remembered input width and height are reset to 0
 * (NOTE(review): presumably so that the next frame is treated as a format
 * change and the scaling tables are rebuilt -- confirm against the
 * frame-format check).
 * An output_aspect above 999 is divided by 1000 before it is stored
 * (NOTE(review): looks like it accepts an aspect given as ratio * 1000 --
 * confirm).
 *
 * Always returns 1.
 */
static int warp_set_parameters(xine_post_t *this_gen, void *param_gen)
{
  warp_plugin_t *this = (warp_plugin_t *)this_gen;
  warp_parameters_t *params = (warp_parameters_t *)param_gen;

  memcpy(&this->config, params, sizeof *params);

  this->input_height = 0;
  this->input_width  = 0;

  if (this->config.output_aspect > 999) {
    this->config.output_aspect = this->config.output_aspect / 1000.0;
  }

  DBG("warp_set_parameters: "
      "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n",
      this->config.output_width,
      this->config.output_height,
      this->config.output_aspect,
      this->config.no_downscaling);

  return 1;
}

/*
 * Copy the current plugin configuration to the caller.
 *
 *   this_gen:  plugin instance (a warp_plugin_t)
 *   param_gen: destination, written as a warp_parameters_t
 *
 * Always returns 1.
 */
static int warp_get_parameters(xine_post_t *this_gen, void *param_gen)
{
  warp_parameters_t *params = (warp_parameters_t *)param_gen;
  warp_plugin_t *this = (warp_plugin_t *)this_gen;

  DBG("warp_get_parameters\n");

  memcpy(params, &this->config, sizeof *params);
  return 1;
}

/*
 * Return the help text of the plugin, passed through the _() translation
 * macro. The text describes the plugin and its four parameters.
 */
static char *warp_get_help(void)
{
  char *help_text = _(
    "The warp plugin scales video to another resolution. "
    "It supports non-linear stretching to change video aspect ratio. "
    "\n"
    "Parameters\n"
    "  output_width:       Scale video to width\n"
    "                      (0 -> do not change video width)\n"
    "  output_height:      Scale video to height\n"
    "                      (0 -> do not change video height)\n"
    "  output_aspect:      Adjust aspect ratio using non-linear scaling\n"
    "                      (0 -> do not change video aspect ratio)\n"
    "  no_downscaling:     Do not downscale video\n"
    "\n"
  );

  return help_text;
}


/*
 * plugin info
 */

/* Post plugin info record: its single initializer is XINE_POST_TYPE_VIDEO_FILTER. */
static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER };

/*
 * Plugin table of this module, declared with default symbol visibility.
 *
 * The same init function (warp_init_plugin) and the same info record are
 * listed under two names, "warp" and "swscale". The table is terminated
 * by a PLUGIN_NONE entry.
 */
const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) =
{
  /* type, API, "name", version, special_info, init_function */
  { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "warp",    XINE_VERSION_CODE, &info, &warp_init_plugin },
  { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin },
  { PLUGIN_NONE, 0, "", 0, NULL, NULL }
};