/* * Copyright (C) 2000-2007 the xine project * * This file is part of xine, a free video player. * * xine is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * xine is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA * * $Id: xine_post_swscale.c,v 1.9 2008/12/13 14:24:03 phintuka Exp $ * * Simple (faster) resize for avisynth * Copyright (C) 2002 Tom Barry * * Very simple 2 tap linear interpolation. * It is unfiltered which means it will not soften much. * * WarpedResize will do a non-linear stretch/squeeze in both the horizontal * and vertical dimensions. This can be useful when you want to change the * aspect ratio of a video clip and have it mostly distorted at the * top, bottom, and side edges. * * * Ported to linux/xine by Petri Hintukainen * - Added x86_64 support * - Added PIC support (do not clobber ebx in x86, access only local variables from asm) * - Fixed yv12 stretched warp tables generation */ #include #include /*#define DBG(x...)*/ #define DBG(x...) fprintf(stderr, "post_warp: " x) /*#define STREAMING_STORE_TMP*/ /*#define STREAMING_STORE*/ /*#define PREFETCH*/ /* streaming store and prefetch seems to be slower ... * Tested with P3 (128M L2) and C2D (4M L2). * Maybe access pattern is enough simple for HW prefetchers. */ /*#define VANILLA*/ /* * This function accepts a position from 0 to 1 and warps it, to 0 through 1 based * upon the wFact var. 
The warp equations are designed to: * * * Always be rising but yield results from 0 to 1 * * * Have a first derivative that doesn't go to 0 or infinity, at least close * to the center of the screen * * * Have a curvature (absolute val of 2nd derivative) that is small in the * center and smoothly rises towards the edges. We would like the curvature * to be everywhere = 0 when the warp factor = 1 */ static double WarpFactor(double position, double wFact) { double x; double z; double w; x = 2 * (position - .5); if (1) /*(wFact < 1.0)*/ /* For warp factor < 1 the warp is calculated as (1-w) * x^3 + w *x, centered * * The warp is calculated as z = (1 - w) * x^3 + w * x, centered * around .5 and ranging from 0 to 1. After some tinkering this seems * to give decent values and derivatives at the right places. */ w = 2.0 - wFact; /* reverse parm for compat with initial release */ if (x < 0.0) { z = -(1 - w) * x*x*x - w * x; /* -1 < x < 0, wFact < 1 */ return .5 - .5 * z; } else { z = (1 - w) * x*x*x + w * x; /* -1 < x < 0, wFact < 1 */ return .5 + .5 * z; /* amts to same formula as above for now */ } } /* * YV12 * * For each horizontal output pair of pixels there is are 2 qword masks followed by 2 int * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively. * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels * will later be processed each pass through the horizontal resize loop. I think with my * current math the Horizontal Luma and Chroma contains the same values but since I may have screwed it * up I'll leave it this way for now. Vertical chroma is different. * * Note - try just using the luma calcs for both, seem to be the same. * * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel. 
*/ static void init_tables_yv12(int newwidth, int newheight, int oldwidth, int oldheight, int Interlaced, double hWarp, double vWarp, uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights, uint32_t *hControlUV, uint32_t *vOffsetsUV, uint32_t *vWeightsUV) { int i; int j; int k; int wY1; int wY2; DBG("init_yv12: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n", oldwidth, oldheight, newwidth, newheight, hWarp, vWarp); /* First set up horizontal table, use for both luma & chroma since * it seems to have the same equation. * We will geneerate these values in pairs, mostly because that's the way * I wrote it for YUY2 above. */ for(i=0; i < newwidth; i+=2) { /* first make even pixel control */ if (hWarp==1) /*if no warp factor */ j = i * 256 * (oldwidth-1) / (newwidth-1); else /* stretch and warp somehow */ j = (int) (256 * WarpFactor(i / (newwidth-1.0), hWarp) * (oldwidth-1)); k = j>>8; wY2 = j - (k << 8); /* luma weight of right pixel */ wY1 = 256 - wY2; /* luma weight of left pixel */ if (k > oldwidth - 2) { hControl[i*3+4] = oldwidth - 1; /* point to last byte */ hControl[i*3] = 0x00000100; /* use 100% of rightmost Y */ } else { hControl[i*3+4] = k; /* pixel offset */ hControl[i*3] = wY2 << 16 | wY1; /* luma weights */ } /* now make odd pixel control */ if (hWarp==1) /* if no warp factor */ j = (i+1) * 256 * (oldwidth-1) / (newwidth-1); else /* stretch and warp somehow */ j = (int) (256 * WarpFactor((i+1) / (newwidth-1.0), hWarp) * (oldwidth-1)); k = j>>8; wY2 = j - (k << 8); /* luma weight of right pixel */ wY1 = 256 - wY2; /* luma weight of left pixel */ if (k > oldwidth - 2) { hControl[i*3+5] = oldwidth - 1; /* point to last byte */ hControl[i*3+1] = 0x00000100; /* use 100% of rightmost Y */ } else { hControl[i*3+5] = k; /* pixel offset */ hControl[i*3+1] = wY2 << 16 | wY1; /* luma weights */ } } hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */ hControl[newwidth*3+5] = 2 * (oldwidth-1); /* " */ #ifndef VANILLA // UV for(i=0; i < 
newwidth/2; i+=2) { /* first make even pixel control */ if (hWarp==1) /*if no warp factor */ j = i * 256 * (oldwidth/2-1) / (newwidth/2-1); else /* stretch and warp somehow */ j = (int) (256 * WarpFactor(i / (newwidth/2-1.0), hWarp) * (oldwidth/2-1)); k = j>>8; wY2 = j - (k << 8); /* luma weight of right pixel */ wY1 = 256 - wY2; /* luma weight of left pixel */ if (k > oldwidth/2 - 2) { hControlUV[i*3+4] = oldwidth/2 - 1; /* point to last byte */ hControlUV[i*3] = 0x00000100; /* use 100% of rightmost Y */ } else { hControlUV[i*3+4] = k; /* pixel offset */ hControlUV[i*3] = wY2 << 16 | wY1; /* luma weights */ } /* now make odd pixel control */ if (hWarp==1) /* if no warp factor */ j = (i+1) * 256 * (oldwidth/2-1) / (newwidth/2-1); else /* stretch and warp somehow */ j = (int) (256 * WarpFactor((i+1) / (newwidth/2-1.0), hWarp) * (oldwidth/2-1)); k = j>>8; wY2 = j - (k << 8); /* luma weight of right pixel */ wY1 = 256 - wY2; /* luma weight of left pixel */ if (k > oldwidth/2 - 2) { hControlUV[i*3+5] = oldwidth/2 - 1; /* point to last byte */ hControlUV[i*3+1] = 0x00000100; /* use 100% of rightmost Y */ } else { hControlUV[i*3+5] = k; /* pixel offset */ hControlUV[i*3+1] = wY2 << 16 | wY1; /* luma weights */ } } hControlUV[newwidth/2*3+4] = (oldwidth/2-1); /* give it something to prefetch at end */ hControlUV[newwidth/2*3+5] = (oldwidth/2-1); /* " */ #endif /* Next set up vertical tables. The offsets are measured in lines and will be mult */ /* by the source pitch later . */ /* For YV12 we need separate Luma and chroma tables */ /* First Luma Table */ for(i=0; i< newheight; ++i) { if (vWarp==1) /* if no warp factor */ j = i * 256 * (oldheight-1) / (newheight-1); else /* stretch and warp somehow */ j = (int) (256 * WarpFactor(i / (newheight-1.0), vWarp) * (oldheight-1)); if (Interlaced) { /* do hard way? */ if (i%2) { /* is odd output line? 
*/ if (j < 256) { /* before 1st odd input line */ vOffsets[i] = 1; /* all from line 1 */ vWeights[i] = 0; /* weight to give to 2nd line */ } else { k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */ vOffsets[i] = k; wY2 = j - (k << 8); vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */ } } else { /* is even output line */ k = (j >> 9) << 1; /* next lower even line */ vOffsets[i] = k; wY2 = j - (k << 8); vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */ } } else { /* simple way, do as progressive */ k = j >> 8; vOffsets[i] = k; wY2 = j - (k << 8); vWeights[i] = wY2; /* weight to give to 2nd line */ } } /* Vertical table for chroma */ for(i=0; i< newheight/2; ++i) { if (vWarp==1) /* if no warp factor */ #ifdef VANILLA j = (int) ( (i+.25) * 256 * (oldheight-1) / (newheight-1.0) - 64 ); #else j = (int) ( (i+.25) * 256 * (oldheight/2-1) / (newheight/2-1.0) - 64 ); #endif else /* stretch and warp somehow */ #ifdef VANILLA j = (int) (256 * WarpFactor( (i+.25) / (newheight-1.0), vWarp) * (oldheight-1.0) ); #else j = (int) (256 * WarpFactor( (i+.25) / (newheight/2 - 1.0), vWarp) * (oldheight/2 - 1.0) ); #endif #ifndef VANILLA if(j<0) j=0; #endif if (Interlaced) { /* do hard way? */ if (i%2) { /* is odd output line? 
*/ if (j < 256) { /* before 1st odd input line */ vOffsetsUV[i] = 1; /* all from line 1 */ vWeightsUV[i] = 0; /* weight to give to 2nd line */ } else { k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */ vOffsetsUV[i] = k; wY2 = j - (k << 8); vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ } } else { /* is even output line */ #ifdef VANILLA k = (j >> 9) << 1; /* next lower even line */ vOffsetsUV[i] = k; wY2 = j - (k << 8); vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ #else k = (j / 512) << 1; /* next lower even line */ vOffsetsUV[i] = k; wY2 = j - (k << 8); vWeightsUV[i] = wY2 >> 1; /* weight to give to 2nd line */ #endif } } else { /* simple way, do as progressive */ #ifdef VANILLA k = j >> 8; #else k = j / 256; /* j >> 8; does not work right if -256 < j < 0 */ #endif vOffsetsUV[i] = k; wY2 = j - (k << 8); vWeightsUV[i] = wY2; /* weight to give to 2nd line */ } } } /* * YUY2 * * For each horizontal output pair of pixels there is are 2 qword masks followed by 2 int * offsets. The 2 masks are the weights to be used for the luma and chroma, respectively. * Each mask contains LeftWeight1, RightWeight1, LeftWeight2, RightWeight2. So a pair of pixels * will later be processed each pass through the horizontal resize loop. * * The weights are scaled 0-256 and the left and right weights will sum to 256 for each pixel. 
*/

/*
 * Source position for output sample 'pos' on an axis that is resized from
 * 'oldsize' to 'newsize' samples, multiplied by 256 (upper bits: left/top
 * source sample, low 8 bits: weight of the right/bottom neighbour).
 *
 * The unwarped product is evaluated in 64 bits: pos * 256 * (oldsize-1)
 * no longer fits a 32-bit int once the sizes reach roughly 2900 samples.
 * An axis with fewer than two output samples has no span to interpolate
 * over (the plain formula would divide by zero), so it maps to position 0.
 */
static int yuy2_src_pos(int pos, int newsize, int oldsize, double warp)
{
  if (newsize < 2)
    return 0;

  if (warp == 1) /* if no warp factor */
    return (int) ((int64_t)pos * 256 * (oldsize-1) / (newsize-1));

  /* stretch and warp somehow */
  return (int) (256 * WarpFactor(pos / (newsize-1.0), warp) * (oldsize-1));
}

/*
 * Build the horizontal and vertical interpolation tables for packed YUY2.
 *
 *   newwidth, newheight   output size
 *   oldwidth, oldheight   input size
 *   Interlaced            non-zero: vertical table keeps even output lines on
 *                         even source lines and odd on odd
 *   hWarp, vWarp          warp factors; exactly 1 selects plain linear scaling
 *   hControl              horizontal table, indices up to newwidth*3+5 are
 *                         written
 *   vOffsets, vWeights    vertical tables, newheight entries each
 *
 * Each pair of output pixels (i, i+1) owns six 32-bit slots at hControl[i*3]:
 *
 *   +0 / +1   luma weights of the even / odd pixel (right << 16 | left)
 *   +2 / +3   chroma weights of the even / odd pixel
 *   +4 / +5   source pixel offset of the even / odd pixel
 */
static void init_tables_yuy2(int newwidth, int newheight, int oldwidth, int oldheight,
                             int Interlaced, double hWarp, double vWarp,
                             uint32_t *hControl, uint32_t *vOffsets, uint32_t *vWeights )
{
  int i;
  int j;
  int k;
  int wY1;
  int wY2;
  int wUV1;
  int wUV2;

  DBG("init_yuy2: %dx%d->%dx%d hWarp %1.3lf vWarp %1.3lf\n",
      oldwidth, oldheight, newwidth, newheight, hWarp, vWarp);

  /* First set up horizontal table */
  for (i = 0; i < newwidth; i += 2) {

    /* first make even pixel control */
    j = yuy2_src_pos(i, newwidth, oldwidth, hWarp);
    k = j>>8;
    wY2 = j - (k << 8); /* luma weight of right pixel */
    wY1 = 256 - wY2;    /* luma weight of left pixel */
    wUV2 = (k%2) ? 128 + (wY2 >> 1) : wY2 >> 1;
    wUV1 = 256 - wUV2;

    if (k > oldwidth - 2) {
      hControl[i*3+4] = oldwidth - 1; /* point to last byte */
      hControl[i*3]   = 0x00000100;   /* use 100% of rightmost Y */
      hControl[i*3+2] = 0x00000100;   /* use 100% of rightmost U */
    } else {
      hControl[i*3+4] = k;                  /* pixel offset */
      hControl[i*3]   = wY2 << 16 | wY1;    /* luma weights */
      hControl[i*3+2] = wUV2 << 16 | wUV1;  /* chroma weights */
    }

    /* now make odd pixel control */
    j = yuy2_src_pos(i+1, newwidth, oldwidth, hWarp);
    k = j>>8;
    wY2 = j - (k << 8); /* luma weight of right pixel */
    wY1 = 256 - wY2;    /* luma weight of left pixel */

    if (k > oldwidth - 2) {
      hControl[i*3+5] = oldwidth - 1; /* point to last byte */
      hControl[i*3+1] = 0x00000100;   /* use 100% of rightmost Y */
      hControl[i*3+3] = 0x00000100;   /* use 100% of rightmost V */
    } else {
      hControl[i*3+5] = k;                 /* pixel offset */
      hControl[i*3+1] = wY2 << 16 | wY1;   /* luma weights */
      /* horiz chroma weights should be same as for even pixel - trbarry 09/16/2002 */
      hControl[i*3+3] = hControl[i*3+2];   /* chroma weights */
    }
  }

  hControl[newwidth*3+4] = 2 * (oldwidth-1); /* give it something to prefetch at end */
  hControl[newwidth*3+5] = 2 * (oldwidth-1);

  /* Next set up vertical table. The offsets are measured in lines and will be mult */
  /* by the source pitch later */
  for (i = 0; i < newheight; ++i) {
    j = yuy2_src_pos(i, newheight, oldheight, vWarp);

    if (Interlaced) { /* do hard way? */
      if (i%2) { /* is odd output line? */
        if (j < 256) { /* before 1st odd input line */
          vOffsets[i] = 1; /* all from line 1 */
          vWeights[i] = 0; /* weight to give to 2nd line */
        } else {
          k = (((j-256) >> 9) << 1) + 1; /* next lowest odd line */
          vOffsets[i] = k;
          wY2 = j - (k << 8);
          vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
        }
      } else { /* is even output line */
        k = (j >> 9) << 1; /* next lower even line */
        vOffsets[i] = k;
        wY2 = j - (k << 8);
        vWeights[i] = wY2 >> 1; /* weight to give to 2nd line */
      }
    } else { /* simple way, do as progressive */
      k = j >> 8;
      vOffsets[i] = k;
      wY2 = j - (k << 8);
      vWeights[i] = wY2; /* weight to give to 2nd line */
    }
  }
}

/* Register allocation */
/* index/counter registers (REGA, REGC) are loaded from 32bit vars/arrays !
*/
/* REGEA/REGEB always name the 32-bit registers; the REGx names below are the
 * pointer-width registers for the target (REGC stays "ecx" on both). */
#define REGEA "eax"
#define REGEB "ebx"
#if defined(__x86_64__)
# define REGA "rax"
# define REGB "rbx"
# define REGC "ecx"
# define REGD "rdx"
# define REGDI "rdi"
# define REGSI "rsi"
#elif defined(__i386__)
# define REGA "eax"
# define REGB "ebx"
# define REGC "ecx"
# define REGD "edx"
# define REGDI "edi"
# define REGSI "esi"
#endif

/* variables accessed from assembler code */
/* Each macro is the positional operand (%N) of the matching "m" input in the
 * asm statement; the numbering must follow the order of that input list. */
#define _FPround1 "%0"
#define _vWeight1 "%1"
#define _vWeight2 "%2"
#define _YMask "%3"
#define _src_row_size "%4"
#define _EndOffset "%5"
#define _pControl "%6"
#define _row_size "%7"
#define _vWorkYW "%8"
#define _dstp "%9"
#define _vWorkUVW "%10"
#define _FPround2 "%11"
#define _srcp1 "%12"
#define _srcp2 "%13"
#if !defined(__x86_64__)
#define _oldbx "%14"
#define _SSEMMXenabledW "%15"
#define _SSE2enabledW "%16"
#endif

/* Labels */
/* Numeric local assembler labels; jumps reference them with an f (forward)
 * or b (backward) suffix. */
#define vMaybeSSEMMX "1"
#define LessThan8 "2"
#define LessThan4 "3"
#define AllDone "4"
#define LastOne "5"
#define vLoopSSE2_Fetch "6"
#define vLoopSSE2 "7"
#define vLoopSSEMMX_Fetch "8"
#define vLoopSSEMMX "9"
#define vLoopMMX "10"
#define MoreSpareChange "11"
#define DoHorizontal "12"
#define hLoopMMX "13"
#define hLoopMMXSSE "14"

/* structure for mmx constants */
typedef union {
  uint64_t uq[1]; /* Unsigned Quadword */
  uint32_t ud[2]; /* Unsigned Doubleword */
} ATTR_ALIGN(16) mmx_t;

/* structure for sse2 constants */
typedef union {
  uint64_t uq[2]; /* Unsigned Quadword */
  uint32_t ud[4]; /* Unsigned Doubleword */
} ATTR_ALIGN(16) sse2_t;

/*
 * Resize packed YUY2 rows using the tables hControl / vOffsets / vWeights.
 *
 * For every output row y, starting at dst_start:
 *  - the two source rows selected through vOffsets[y] are blended with the
 *    weights derived from vWeights[y]; luma goes to vWorkY as 16-bit words,
 *    chroma goes to vWorkUV as packed bytes
 *  - the horizontal pass then reads both work buffers through the offsets
 *    and weights in hControl and writes the packed result to the output row
 *
 * dst, dst_pitch       output buffer and its row stride in bytes
 * src, src_pitch       input buffer and its row stride in bytes
 * dst_width/height     output size in pixels / rows
 * src_width/height     input size in pixels / rows
 * Interlaced           non-zero: the second source row is two rows below the
 *                      first instead of one
 * vWorkY, vWorkUV      scratch rows written by the vertical pass
 * dst_start            first output row to produce
 *
 * Returns y as soon as vOffsets[y] >= src_height, i.e. the first output row
 * that needs a source row which is not available; returns 0 when the loop
 * runs to dst_height.  On targets other than i386/x86_64 the body is
 * compiled out and 0 is returned.
 *
 * NOTE(review): the asm statement stores through dstp/vWorkY/vWorkUV but its
 * clobber list names only general purpose registers (no "memory", no MMX/XMM
 * registers) - confirm this is safe with the compilers in use.
 * NOTE(review): srcp2 is chosen from y versus dst_height, not from
 * vOffsets[y] versus src_height; it looks like this can address the row
 * after the last available source row - confirm against callers.
 */
static int do_warp_yuy2(uint8_t *dst, const uint8_t *src,
                        const int dst_pitch, const int src_pitch,
                        const int dst_width, const int dst_height,
                        const int src_width, const int src_height,
                        const int Interlaced,
                        const uint32_t * const hControl,
                        const uint32_t * const vOffsets,
                        const uint32_t * const vWeights,
                        uint32_t *vWorkY, uint32_t *vWorkUV,
                        int dst_start)
{
#if defined(__i386__) || defined(__x86_64__)
  sse2_t YMask = {uq:{UINT64_C(0x00ff00ff00ff00ff),UINT64_C(0x00ff00ff00ff00ff)}}; /* keeps only luma */
  sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
  sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
  sse2_t vWeight1;
  sse2_t vWeight2;

  /* local copies so the asm only references local variables */
  const uint32_t *pControl = &hControl[0];
  const uint32_t *vWorkYW = vWorkY;
  const uint32_t *vWorkUVW = vWorkUV;
  const uint8_t *srcp = src;
  const uint8_t *srcp1;
  const uint8_t *srcp2;
  uint8_t *dstp = dst + dst_pitch*dst_start;

  const uint32_t src_row_size = src_width * 2; /* 2 bytes per pixel */
  const uint32_t row_size = dst_width * 2;
  const uint32_t EndOffset = src_row_size / 2;

#if !defined(__x86_64__)
  const int accel = xine_mm_accel();
  const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */
  const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */
  long int oldbx; /* save slot for ebx around the asm block */
#endif

  int y;

  for (y = dst_start; y < dst_height; y++) {

    if(vOffsets[y] >= src_height) {
      /* slice completed */
      /*DBG("do_warp_yuy2: max input height reached: need line %d, height %d\n -> Returning next output line: %d\n", vOffsets[y], src_height, y);*/
      return y;
    }

    /* replicate the two 16-bit row weights across the whole vector */
    vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] =
      (256-vWeights[y]) << 16 | (256-vWeights[y]);
    vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] =
      vWeights[y] << 16 | vWeights[y];

    srcp1 = srcp + vOffsets[y] * src_pitch;
    if (Interlaced)
      srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1;
    else
      srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1;

    __asm__ __volatile__ (
#if !defined(__x86_64__)
      /* store ebx (PIC) */
      "mov %%"REGB", "_oldbx" \n\t"
#endif
      "movl "_src_row_size", %%"REGC" \n\t"
      "shrl $3, %%"REGC" \n\t" /* 8 bytes a time */
      "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */
      "mov "_srcp2", %%"REGD" \n\t" /* next " */
      "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */
      "mov "_vWorkUVW", %%"REGB" \n\t" /* chroma work destination line */
      "xor %%"REGA", %%"REGA" \n\t"
#if !defined(__x86_64__)
      /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
       * This first loop is not the performance bottleneck anyway but it is trivial to tune
       * using SSE2 if we have proper alignment.
       */
      "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported?*/
      "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway*/
#endif
      "cmpl $2, %%"REGC" \n\t" /* we have at least 16 bytes, 2 qwords? */
      "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother*/

      "shrl $1, %%"REGC" \n\t" /* do 16 bytes at a time instead*/
      "decl %%"REGC" \n" /* jigger loop ct */
      ".align 16 \n\t"
      "movdqa "_FPround1", %%xmm0 \n\t"
      "movdqa "_vWeight1", %%xmm5 \n\t"
      "movdqa "_vWeight2", %%xmm6 \n\t"
      "movdqa "_YMask", %%xmm7 \n"

      ""vLoopSSE2_Fetch": \n\t"
#ifdef PREFETCH
      " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
      " prefetcht0 16(%%"REGD", %%"REGA", 2) \n"
#endif
      ""vLoopSSE2": \n\t"
      " movdqu (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
      " movdqu (%%"REGD", %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */

      " movdqa %%xmm1, %%xmm3 \n\t" /* get chroma bytes */
      " pand %%xmm7, %%xmm1 \n\t" /* keep only luma */
      " psrlw $8, %%xmm3 \n\t" /* right just chroma */
      " pmullw %%xmm5, %%xmm1 \n\t" /* mult by weighting factor */
      " pmullw %%xmm5, %%xmm3 \n\t" /* mult by weighting factor */

      " movdqa %%xmm2, %%xmm4 \n\t" /* get chroma bytes */
      " pand %%xmm7, %%xmm2 \n\t" /* keep only luma */
      " psrlw $8, %%xmm4 \n\t" /* right just chroma */
      " pmullw %%xmm6, %%xmm2 \n\t" /* mult by weighting factor */
      " pmullw %%xmm6, %%xmm4 \n\t" /* mult by weighting factor */

      " paddw %%xmm2, %%xmm1 \n\t" /* combine lumas */
      " paddusw %%xmm0, %%xmm1 \n\t" /* round */
      " psrlw $8, %%xmm1 \n\t" /* right adjust luma */
#ifdef STREAMING_STORE_TMP
      " movntdq %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#else
      " movdqu %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#endif
      " paddw %%xmm4, %%xmm3 \n\t" /* combine chromas */
      " paddusw %%xmm0, %%xmm3 \n\t" /* round */
      " psrlw $8, %%xmm3 \n\t" /* right adjust chroma */
      " packuswb %%xmm3, %%xmm3 \n\t" /* pack UV's into low dword */
      " movdq2q %%xmm3, %%mm1 \n\t" /* save in our work area */
#ifdef STREAMING_STORE_TMP
      " movntq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
#else
      " movq %%mm1, (%%"REGB", %%"REGA") \n\t" /* save in our work area */
#endif
      " lea 8(%%"REGA"), %%"REGA" \n\t"
      " decl %%"REGC" \n\t"

      " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */
      " jz "vLoopSSE2"b \n\t" /* or just loop, or not */

      /* done with our SSE2 fortified loop but we may need to pick up the spare change */
#ifdef STREAMING_STORE_TMP
      " sfence \n\t"
#endif
      " movl "_src_row_size", %%"REGC" \n\t" /* get count again */
      " andl $15, %%"REGC" \n\t" /* just need mod 16 */

      " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
      " movq "_vWeight1", %%mm5 \n\t"
      " movq "_vWeight2", %%mm6 \n\t"
      " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */

      " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */
      " jz "MoreSpareChange"f \n" /* n, did them all */

      /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
       * This first loop is not the performance bottleneck anyway but it is trivial to tune
       * using SSE if we have proper alignment.
       */
      ""vMaybeSSEMMX": \n\t"

      " movq "_YMask", %%mm7 \n\t" /* useful luma mask constant - lazy dupl init */
      " movq "_vWeight1", %%mm5 \n\t"
      " movq "_vWeight2", %%mm6 \n\t"
      " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */
#if !defined(__x86_64__)
      " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */
      " jz "vLoopMMX"f \n\t" /* n, can't do anyway */
#endif
      " decl %%"REGC" \n" /* jigger loop ctr */

      ".align 16 \n"
      ""vLoopSSEMMX_Fetch": \n\t"
#ifdef PREFETCH
      " prefetcht0 8(%%"REGSI", %%"REGA", 2) \n\t"
      " prefetcht0 8(%%"REGD", %%"REGA", 2) \n"
#endif
      ""vLoopSSEMMX": \n\t"
      " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
      " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */

      " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
      " pand %%mm7, %%mm1 \n\t" /* keep only luma */
      " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
      " psrlw $8, %%mm3 \n\t" /* right just chroma */
      " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
      " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */

      " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
      " pand %%mm7, %%mm2 \n\t" /* keep only luma */
      " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
      " psrlw $8, %%mm4 \n\t" /* right just chroma */
      " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
      " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */

      " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
      " paddusw %%mm0, %%mm1 \n\t" /* round */
      " psrlw $8, %%mm1 \n\t" /* right adjust luma */
#ifdef STREAMING_STORE_TMP
      " movntq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#else
      " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
#endif
      " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
      " paddusw %%mm0, %%mm3 \n\t" /* round */
      " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
      " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
      " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */

      " lea 4(%%"REGA"), %%"REGA" \n\t"
      " decl %%"REGC" \n\t"
      " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */
      " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */
#ifdef STREAMING_STORE_TMP
      " sfence \n\t"
#endif
      " jmp "MoreSpareChange"f \n" /* all done with vertical */

      ".align 16 \n"
      ""vLoopMMX": \n\t"

      " movq (%%"REGSI", %%"REGA", 2), %%mm1 \n\t" /* top of 2 lines to interpolate */
      " movq (%%"REGD", %%"REGA", 2), %%mm2 \n\t" /* 2nd of 2 lines */

      " movq %%mm1, %%mm3 \n\t" /* copy top bytes */
      " pand %%mm7, %%mm1 \n\t" /* keep only luma */
      " pxor %%mm1, %%mm3 \n\t" /* keep only chroma */
      " psrlw $8, %%mm3 \n\t" /* right just chroma */
      " pmullw %%mm5, %%mm1 \n\t" /* mult by weighting factor */
      " pmullw %%mm5, %%mm3 \n\t" /* mult by weighting factor */

      " movq %%mm2, %%mm4 \n\t" /* copy 2nd bytes */
      " pand %%mm7, %%mm2 \n\t" /* keep only luma */
      " pxor %%mm2, %%mm4 \n\t" /* keep only chroma */
      " psrlw $8, %%mm4 \n\t" /* right just chroma */
      " pmullw %%mm6, %%mm2 \n\t" /* mult by weighting factor */
      " pmullw %%mm6, %%mm4 \n\t" /* mult by weighting factor */

      " paddw %%mm2, %%mm1 \n\t" /* combine lumas */
      " paddusw %%mm0, %%mm1 \n\t" /* round */
      " psrlw $8, %%mm1 \n\t" /* right adjust luma */
      " movq %%mm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */

      " paddw %%mm4, %%mm3 \n\t" /* combine chromas */
      " paddusw %%mm0, %%mm3 \n\t" /* round */
      " psrlw $8, %%mm3 \n\t" /* right adjust chroma */
      " packuswb %%mm3, %%mm3 \n\t" /* pack UV's into low dword */
      " movd %%mm3, (%%"REGB", %%"REGA") \n\t" /* save in our work area */

      " lea 4(%%"REGA"), %%"REGA" \n\t"
      " loop "vLoopMMX"b \n"

      /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
       * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and alway have
       * an even number so there will never be more than 2 left. trbarry 7/29/2002
       */
      ""MoreSpareChange": \n\t"
      " cmpl "_EndOffset", %%"REGEA" \n\t" /* did we get them all */
      " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */
      " movl $1, %%"REGC" \n\t" /* jigger loop ct */
      " sub $2, %%"REGA" \n\t" /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
      " jmp "vLoopMMX"b \n"

      /* We've taken care of the vertical scaling, now do horizontal */
      ""DoHorizontal": \n\t"
      " movq "_YMask", %%mm7 \n\t" /* useful 0U0U.. mask constant */
      " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */
      " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */
      " movl "_row_size", %%"REGC" \n\t"
      " shrl $2, %%"REGC" \n\t" /* bytes a time, 2 pixels */
      " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */
      " mov "_dstp", %%"REGDI" \n\t" /* the destination line */
      " mov "_vWorkUVW", %%"REGB" \n" /* chroma data, as UVUV UVUV... */
      ".align 16 \n"
      ""hLoopMMX": \n\t"
      /* x86_64: must use movl (accessing table of uint32's) */
      " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */
      " movd (%%"REGD", %%"REGA", 2), %%mm0 \n\t" /* copy luma pair */
      " shr $1, %%"REGA" \n\t" /* div offset by 2 */
      " movd (%%"REGB", %%"REGA", 2), %%mm1 \n\t" /* copy UV pair VUVU */
      " psllw $8, %%mm1 \n\t" /* shift out V, keep 0000U0U0 */

      /* we need to use both even and odd croma from same location - trb 9/2002 */
      " punpckldq (%%"REGB", %%"REGA", 2), %%mm1 \r\n" /* copy UV pair VUVU */
      " psrlw $8, %%mm1 \r\n" /* shift out U0, keep 0V0V 0U0U */
      " movl 20(%%"REGSI"), %%"REGEA" \r\n" /* get data offset in pixels, 2nd pixel pair */
      " punpckldq (%%"REGD", %%"REGA", 2), %%mm0 \r\n" /* copy luma pair */

      " pmaddwd (%%"REGSI"), %%mm0 \r\n" /* mult and sum lumas by ctl weights */
      " paddusw %%mm6, %%mm0 \r\n" /* round */
      " psrlw $8, %%mm0 \r\n" /* right just 2 luma pixel value 000Y,000Y */

      " pmaddwd 8(%%"REGSI"), %%mm1 \r\n" /* mult and sum chromas by ctl weights */
      " paddusw %%mm6, %%mm1 \r\n" /* round */
      " pslld $8, %%mm1 \r\n" /* shift into low bytes of different words */
      " pand %%mm7, %%mm1 \r\n" /* keep only 2 chroma values 0V00,0U00 */
      " por %%mm1, %%mm0 \r\n" /* combine luma and chroma, 0V0Y,0U0Y */
      " packuswb %%mm0, %%mm0 \r\n" /* pack all into low dword, xxxxVYUY */
      " movd %%mm0, (%%"REGDI") \n\t" /* done with 2 pixels */

      " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytest */
      " lea 4(%%"REGDI"), %%"REGDI" \n\t" /* bump to next output pixel addr */

      " loop "hLoopMMX"b \n\t" /* loop for more */

      "emms \n\t"
      /* done with one line */

#if !defined(__x86_64__)
      /* restore ebx saved at the top of the block */
      "mov "_oldbx", %%"REGB" \n\t"
#endif
      /* no outputs; every operand is a memory input, in the order of the
       * _xxx operand macros */
      ::
      "m" /*0*/(FPround1),
      "m" /*1*/(vWeight1),
      "m" /*2*/(vWeight2),
      "m" /*3*/(YMask),
      "m" /*4*/(src_row_size),
      "m" /*5*/(EndOffset),
      "m" /*6*/(pControl),
      "m" /*7*/(row_size),
      "m" /*8*/(vWorkYW),
      "m" /*9*/(dstp),
      "m" /*10*/(vWorkUVW),
      "m" /*11*/(FPround2),
      "m" /*12*/(srcp1),
      "m" /*13*/(srcp2)
#if !defined(__x86_64__)
      ,
      "m" /*14*/(oldbx),
      "m" /*15*/(SSEMMXenabledW),
      "m" /*16*/(SSE2enabledW)
      : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI
#else
      : REGA, REGB, REGC, REGD, REGSI, REGDI
#endif
    );

    dstp += dst_pitch;
  }
#endif
  return 0;
}

static int do_warp_yv12(uint8_t *dst, const uint8_t * const src,
                        const int dst_pitch, const int src_pitch,
                        const int dst_width, const int dst_height,
                        const int src_width, const int src_height,
                        const int Interlaced,
                        const uint32_t * const hControl,
                        const uint32_t * vOffsets,
                        const uint32_t * vWeights,
                        uint32_t *vWorkY,
                        int dst_start)
{
#if defined(__i386__) || defined(__x86_64__)
  sse2_t FPround1 = {uq:{UINT64_C(0x0080008000800080),UINT64_C(0x0080008000800080)}}; /* round words */
  sse2_t FPround2 = {uq:{UINT64_C(0x0000008000000080),UINT64_C(0x0000008000000080)}}; /* round dwords */
  sse2_t vWeight1;
  sse2_t vWeight2;

  const uint32_t *pControl = &hControl[0];
  const uint32_t *vWorkYW = vWorkY;
  const uint8_t *srcp = src;
  const uint8_t *srcp1;
  const uint8_t *srcp2;
  uint8_t *dstp = dst + dst_pitch*dst_start;

  const uint32_t src_row_size = src_width;
  const uint32_t row_size =
dst_width; #if !defined(__x86_64__) const int accel = xine_mm_accel(); const uint32_t SSE2enabledW = !!(accel & MM_ACCEL_X86_SSE2); /* in local storage for asm */ const uint32_t SSEMMXenabledW = !!(accel & MM_ACCEL_X86_MMXEXT); /* in local storage for asm */ long int oldbx; #endif int y; /* Operation in sliced mode: * - continue until required next source line is out of slice * - return next output line * - at next call, continue from next souce line */ for (y = dst_start; y < dst_height; y++) { if(vOffsets[y] >= src_height) { /* slice completed */ /*DBG("do_warp_yv12: max input height reached: need line %d, height %d\n -> Returning next output line: %d , start was %d\n", (int)vOffsets[y], (int)src_height, (int)y, (int)dst_start);*/ return y; } vWeight1.ud[0] = vWeight1.ud[1] = vWeight1.ud[2] = vWeight1.ud[3] = (256-vWeights[y]) << 16 | (256-vWeights[y]); vWeight2.ud[0] = vWeight2.ud[1] = vWeight2.ud[2] = vWeight2.ud[3] = vWeights[y] << 16 | vWeights[y]; srcp1 = srcp + vOffsets[y] * src_pitch; if (Interlaced) srcp2 = (y < dst_height-2) ? srcp1 + 2 * src_pitch : srcp1; else srcp2 = (y < dst_height-1) ? srcp1 + src_pitch : srcp1; __asm__ __volatile__( "movl "_src_row_size", %%"REGC" \n\t" "shr $3, %%"REGC" \n\t" /* 8 bytes a time */ "mov "_srcp1", %%"REGSI" \n\t" /* top of 2 src lines to get */ "mov "_srcp2", %%"REGD" \n\t" /* next " */ "mov "_vWorkYW", %%"REGDI" \n\t" /* luma work destination line */ "xor %%"REGA", %%"REGA" \n\t" #if !defined(__x86_64__) /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions. * This first loop is not the performance bottleneck anyway but it is trivial to tune * using SSE2 if we have proper alignment. */ "testl $1, "_SSE2enabledW" \n\t" /* is SSE2 supported? */ "jz "vMaybeSSEMMX"f \n\t" /* n, can't do anyway */ #endif "cmpl $2, %%"REGC" \n\t" /* we have at least 16 byts, 2 qwords? 
*/ "jl "vMaybeSSEMMX"f \n\t" /* n, don't bother */ "mov %%"REGSI", %%"REGB" \n\t" "or %%"REGD", %%"REGB" \n\t" "test $15, %%"REGB" \n\t" /* both src rows 16 byte aligned? */ "jnz "vMaybeSSEMMX"f \n\t" /* n, don't use sse2 */ "shr $1, %%"REGC" \n\t" /* do 16 bytes at a time instead */ "dec %%"REGC" \n\t" /* jigger loop ct */ "movdqa "_FPround1", %%xmm0 \n\t" "movdqa "_vWeight1", %%xmm5 \n\t" "movdqa "_vWeight2", %%xmm6 \n\t" "pxor %%xmm7, %%xmm7 \n" ".align 16 \n" ""vLoopSSE2_Fetch": \n\t" #ifdef PREFETCH " prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t" " prefetcht0 16(%%"REGD", %%"REGA", 2) \n" #endif ""vLoopSSE2": \n\t" /* we're already checked pointers to be on dqword aligned */ " movdqa (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */ " movdqa (%%"REGD", %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */ " movdqa %%xmm1, %%xmm2 \n\t" " movdqa %%xmm3, %%xmm4 \n\t" " punpcklbw %%xmm7, %%xmm1 \n\t" /* make words */ " punpckhbw %%xmm7, %%xmm2 \n\t" /* " */ " punpcklbw %%xmm7, %%xmm3 \n\t" /* " */ " punpckhbw %%xmm7, %%xmm4 \n\t" /* " */ " pmullw %%xmm5, %%xmm1 \n\t" /* mult by top weighting factor */ " pmullw %%xmm5, %%xmm2 \n\t" /* " */ " pmullw %%xmm6, %%xmm3 \n\t" /* mult by bot weighting factor */ " pmullw %%xmm6, %%xmm4 \n\t" /* " */ " paddw %%xmm3, %%xmm1 \n\t" /* combine lumas low */ " paddw %%xmm4, %%xmm2 \n\t" /* combine lumas high */ " paddusw %%xmm0, %%xmm1 \n\t" /* round */ " paddusw %%xmm0, %%xmm2 \n\t" /* round */ " psrlw $8, %%xmm1 \n\t" /* right adjust luma */ " psrlw $8, %%xmm2 \n\t" /* right adjust luma */ " packuswb %%xmm2, %%xmm1 \n\t" /* pack words to our 16 byte answer */ #ifdef STREAMING_STORE_TMP " movntdq %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ #else " movdqu %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ #endif " lea 16(%%"REGA"), %%"REGA" \n\t" " decl %%"REGC" \n\t" " jg "vLoopSSE2_Fetch"b \n\t" /* if not on last one loop, prefetch */ " jz "vLoopSSE2"b \n\t" /* or just 
loop, or not */ /* done with our SSE2 fortified loop but we may need to pick up the spare change */ #ifdef STREAMING_STORE_TMP " sfence \n\t" #endif " movl "_src_row_size", %%"REGC" \n\t" /* get count again */ " andl $15, %%"REGC" \n\t" /* just need mod 16 */ " movq "_vWeight1", %%mm5 \n\t" " movq "_vWeight2", %%mm6 \n\t" " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ " shrl $3, %%"REGC" \n\t" /* 8 bytes at a time, any? */ " jz "MoreSpareChange"f \n" /* n, did them all */ /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions. * This first loop is not the performance bottleneck anyway but it is trivial to tune * using SSE if we have proper alignment. */ ""vMaybeSSEMMX": \n\t" " movq "_vWeight1", %%mm5 \n\t" " movq "_vWeight2", %%mm6 \n\t" " movq "_FPround1", %%mm0 \n\t" /* useful rounding constant */ " pxor %%mm7, %%mm7 \n\t" #if !defined(__x86_64__) " testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */ " jz "vLoopMMX"f \n\t" /* n, can't do anyway */ #endif " decl %%"REGC" \n" /* jigger loop ctr */ ".align 16 \n" ""vLoopSSEMMX_Fetch": \n\t" #ifdef PREFETCH " prefetcht0 8(%%"REGSI", %%"REGA") \n\t" " prefetcht0 8(%%"REGD", %%"REGA") \n" #endif ""vLoopSSEMMX": \n\t" " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */ " movq %%mm1, %%mm2 \n\t" " movq %%mm3, %%mm4 \n\t" " punpcklbw %%mm7, %%mm1 \n\t" /* make words */ " punpckhbw %%mm7, %%mm2 \n\t" /* " */ " punpcklbw %%mm7, %%mm3 \n\t" /* " */ " punpckhbw %%mm7, %%mm4 \n\t" /* " */ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */ " pmullw %%mm5, %%mm2 \n\t" /* " */ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */ " pmullw %%mm6, %%mm4 \n\t" /* " */ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */ " paddusw %%mm0, %%mm1 \n\t" /* round */ " paddusw %%mm0, %%mm2 \n\t" /* round */ " psrlw $8, %%mm1 \n\t" 
/* right adjust luma */ " psrlw $8, %%mm2 \n\t" /* right adjust luma */ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */ #ifdef STREAMING_STORE_TMP " movntq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ #else " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ #endif " lea 8(%%"REGA"), %%"REGA" \n\t" " decl %%"REGC" \n\t" " jg "vLoopSSEMMX_Fetch"b \n\t" /* if not on last one loop, prefetch */ " jz "vLoopSSEMMX"b \n\t" /* or just loop, or not */ #ifdef STREAMING_STORE_TMP " sfence \n\t" #endif " jmp "MoreSpareChange"f \n" /* all done with vertical */ ".align 16 \n" ""vLoopMMX": \n\t" " movq (%%"REGSI", %%"REGA"), %%mm1 \n\t" /* top of 2 lines to interpolate */ " movq (%%"REGD", %%"REGA"), %%mm3 \n\t" /* 2nd of 2 lines */ " movq %%mm1, %%mm2 \n\t" " movq %%mm3, %%mm4 \n\t" " punpcklbw %%mm7, %%mm1 \n\t" /* make words */ " punpckhbw %%mm7, %%mm2 \n\t" /* " */ " punpcklbw %%mm7, %%mm3 \n\t" /* " */ " punpckhbw %%mm7, %%mm4 \n\t" /* " */ " pmullw %%mm5, %%mm1 \n\t" /* mult by top weighting factor */ " pmullw %%mm5, %%mm2 \n\t" /* " */ " pmullw %%mm6, %%mm3 \n\t" /* mult by bot weighting factor */ " pmullw %%mm6, %%mm4 \n\t" /* " */ " paddw %%mm3, %%mm1 \n\t" /* combine lumas low */ " paddw %%mm4, %%mm2 \n\t" /* combine lumas high */ " paddusw %%mm0, %%mm1 \n\t" /* round */ " paddusw %%mm0, %%mm2 \n\t" /* round */ " psrlw $8, %%mm1 \n\t" /* right adjust luma */ " psrlw $8, %%mm2 \n\t" /* right adjust luma */ " packuswb %%mm2, %%mm1 \n\t" /* pack words to our 8 byte answer */ " movq %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */ " lea 8(%%"REGA"), %%"REGA" \n\t" " loop "vLoopMMX"b \n" /* Add a little code here to check if we have more pixels to do and, if so, make one * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and alway have * an even number so there will never be more than 7 left. 
*/ ""MoreSpareChange": \n\t" " cmpl "_src_row_size", %%"REGEA" \n\t" /* did we get them all */ " jnl "DoHorizontal"f \n\t" /* yes, else have 2 left */ " movl $1, %%"REGC" \n\t" /* jigger loop ct */ " movl "_src_row_size", %%"REGEA" \n\t" " sub $8, %%"REGA" \n\t" /* back up to last 8 pixels */ " jmp "vLoopMMX"b \n" /* We've taken care of the vertical scaling, now do horizontal */ ""DoHorizontal": \n\t" " pxor %%mm7, %%mm7 \n\t" " movq "_FPround2", %%mm6 \n\t" /* useful rounding constant, dwords */ " mov "_pControl", %%"REGSI" \n\t" /* @ horiz control bytes */ " movl "_row_size", %%"REGC" \n\t" " shrl $2, %%"REGC" \n\t" /* 4 bytes a time, 4 pixels */ " mov "_vWorkYW", %%"REGD" \n\t" /* our luma data, as 0Y0Y 0Y0Y.. */ " mov "_dstp", %%"REGDI" \n\t" /* the destination line */ #if !defined(__x86_64__) " testl $1, "_SSEMMXenabledW" \n\t" /* MMXEXTsupported? */ " jz "hLoopMMX"f \n\t" /* n, can't do anyway */ #endif /* With SSE support we will make 8 pixels (from 8 pairs) at a time */ " shrl $1, %%"REGC" \n\t" /* 8 bytes a time instead of 4 */ " jz "LessThan8"f \n" ".align 16 \n" ""hLoopMMXSSE": \n\t" /* handle first 2 pixels */ /* phi: must use movl here (x86_64, reading from table of uint_32's) */ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm0 \n\t" /* round */ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* handle 3rd and 4th pixel 
pairs */ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " movl 16+48(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 5st pixel pair */ " movl 20+48(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 6nd pixel pair */ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm1 \n\t" /* round */ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* handle 5th and 6th pixel pairs */ " movd (%%"REGD", %%"REGA"), %%mm2 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm2 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm2 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " movl 16+72(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 7st pixel pair */ " movl 20+72(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 8nd pixel pair */ " pmaddwd 48(%%"REGSI"), %%mm2 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm2 \n\t" /* round */ " psrlw $8, %%mm2 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* handle 7th and 8th pixel pairs */ " movd (%%"REGD", %%"REGA"), %%mm3 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm3 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm3 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " pmaddwd 72(%%"REGSI"), %%mm3 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm3 \n\t" /* round */ " psrlw $8, %%mm3 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* combine, store, and loop */ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */ " packuswb %%mm3, %%mm2 \n\t" /* pack into qword, 0Y0Y0Y0Y */ " packuswb %%mm2, %%mm0 \n\t" /* and again into YYYYYYYY */ #ifdef STREAMING_STORE " movntq %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */ #else " movq %%mm0, (%%"REGDI") 
\n\t" /* done with 4 pixels */ #endif " lea 96(%%"REGSI"), %%"REGSI" \n\t" " lea 8(%%"REGDI"), %%"REGDI" \n\t" " decl %%"REGC" \n\t" " jg "hLoopMMXSSE"b \n\t" /* loop for more */ #ifdef STREAMING_STORE " sfence \n" #endif ""LessThan8": \n\t" " movl "_row_size", %%"REGC" \n\t" " andl $7, %%"REGC" \n\t" /* we have done all but maybe this */ " shrl $2, %%"REGC" \n\t" /* now do only 4 bytes at a time */ " jz "LessThan4"f \n" ".align 16 \n" ""hLoopMMX": \n\t" /* handle first 2 pixels */ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " movl 16+24(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 3st pixel pair */ " movl 20+24(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 4nd pixel pair */ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm0 \n\t" /* round */ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* handle 3rd and 4th pixel pairs */ " movd (%%"REGD", %%"REGA"), %%mm1 \n\t" /* copy luma pair 0000xxYY */ " punpckldq (%%"REGD", %%"REGB"), %%mm1 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm1 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " pmaddwd 24(%%"REGSI"), %%mm1 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm1 \n\t" /* round */ " psrlw $8, %%mm1 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ /* combine, store, and loop */ " packuswb %%mm1, %%mm0 \n\t" /* pack into qword, 0Y0Y0Y0Y */ " packuswb %%mm7, %%mm0 \n\t" /* and again into 0000YYYY */ " movd %%mm0, (%%"REGDI") \n\t" /* done with 4 pixels */ " lea 48(%%"REGSI"), %%"REGSI" \n\t" " lea 4(%%"REGDI"), %%"REGDI" \n\t" " loop "hLoopMMX"b \n" /* 
loop for more */ /* test to see if we have a mod 4 size row, if not then more spare change */ ""LessThan4": \n\t" " movl "_row_size", %%"REGC" \n\t" " andl $3, %%"REGC" \n\t" /* remainder side mod 4 */ " cmpl $2, %%"REGC" \n\t" " jl "LastOne"f \n\t" /* none, none */ /* handle 2 more pixels */ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ " movl 20(%%"REGSI"), %%"REGEB" \r\n" /* get data offset in pixels, 2nd pixel pair */ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ " punpcklwd (%%"REGD", %%"REGB"), %%mm0 \r\n" /* 2nd luma pair, now xxxxYYYY */ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm0 \n\t" /* round */ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ " packuswb %%mm7, %%mm0 \n\t" /* pack into qword, 00000Y0Y */ " packuswb %%mm7, %%mm0 \n\t" /* and again into 000000YY */ " movd %%mm0, (%%"REGDI") \n\t" /* store, we are guarrenteed room in buffer (8 byte mult) */ " subl $2, %%"REGC" \n\t" " lea 24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */ " lea 2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */ /* maybe one last pixel */ ""LastOne": \n\t" " cmpl $0, %%"REGC" \r\n" /* still more ? 
*/ " jz "AllDone"f \r\n" /* n, done */ " movl 16(%%"REGSI"), %%"REGEA" \n\t" /* get data offset in pixels, 1st pixel pair */ " movd (%%"REGD", %%"REGA"), %%mm0 \n\t" /* copy luma pair 0000xxYY */ " punpcklbw %%mm7, %%mm0 \n\t" /* make words out of bytes, 0Y0Y0Y0Y */ " pmaddwd (%%"REGSI"), %%mm0 \n\t" /* mult and sum lumas by ctl weights */ " paddusw %%mm6, %%mm0 \n\t" /* round */ " psrlw $8, %%mm0 \n\t" /* right just 4 luma pixel value 0Y0Y0Y0Y */ " movd %%mm0, %%"REGEA" \n\t" " movb %%al, (%%"REGDI") \n" /* store last one */ ""AllDone": \n\t" " emms \n\t" #if !defined(__x86_64__) "mov "_oldbx", %%"REGB" \n\t" #endif :: "m" /*0*/(FPround1), "m" /*1*/(vWeight1), "m" /*2*/(vWeight2), "m" /*3*/(y/*YMask[0]*/), "m" /*4*/(src_row_size), "m" /*5*/(y/*EndOffset*/), "m" /*6*/(pControl), "m" /*7*/(row_size), "m" /*8*/(vWorkYW), "m" /*9*/(dstp), "m" /*10*/(y/*vWorkUVW*/), "m" /*11*/(FPround2), "m" /*12*/(srcp1), "m" /*13*/(srcp2) #if !defined(__x86_64__) , "m" /*14*/(oldbx), "m" /*15*/(SSEMMXenabledW), "m" /*16*/(SSE2enabledW) : REGA, /*REGB,*/ REGC, REGD, REGSI, REGDI #else : REGA, REGB, REGC, REGD, REGSI, REGDI #endif ); dstp += dst_pitch; } #endif return 0; } /* * tools */ #ifndef ALIGN # define ALIGN(b,p) ((void*)((((unsigned long)(p)) + (b)-1) & (~((b)-1)))) #endif #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif #ifndef MAX # define MAX(a,b) ((a) > (b) ? (a) : (b)) #endif #ifndef FABS # define FABS(x) ((x) < 0.0 ? 
-(x) : (x)) #endif /* * xine plugin */ #define PLUGIN_ID "warp" #define PLUGIN_DESCR "(non-)linear software scaling post plugin"; #define PLUGIN_T warp_plugin_t /*#define POST_THREADS*/ /*#define POST_SLICES*/ #include "xine/post_util.h" /* plugin class initialization function */ void *warp_init_plugin(xine_t *xine, void *); /* plugin class functions */ static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs, xine_audio_port_t **audio_target, xine_video_port_t **video_target); /* plugin instance functions */ static void warp_dispose(post_plugin_t *this_gen); /* vo_frame functions */ static vo_frame_t *got_frame(vo_frame_t *frame); static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame); /* parameter functions */ static xine_post_api_descr_t *warp_get_param_descr(void); static int warp_set_parameters(xine_post_t *this_gen, void *param_gen); static int warp_get_parameters(xine_post_t *this_gen, void *param_gen); static char *warp_get_help(void); typedef struct warp_parameters_s { int output_width; int output_height; double output_aspect; int no_downscaling; } warp_parameters_t; START_PARAM_DESCR(warp_parameters_t) PARAM_ITEM(POST_PARAM_TYPE_INT, output_width, NULL, 640, 1920, 0, "output video width") PARAM_ITEM(POST_PARAM_TYPE_INT, output_height, NULL, 480, 1080, 0, "output video height") PARAM_ITEM(POST_PARAM_TYPE_DOUBLE, output_aspect, NULL, 1, 3, 0, "output video aspect ratio") PARAM_ITEM(POST_PARAM_TYPE_BOOL, no_downscaling,NULL, 0, 1, 0, "disable downscaling") END_PARAM_DESCR(warp_param_descr) typedef struct { post_plugin_t post; xine_post_in_t parameter_input; /* User config (changes to actual config are delayed) */ warp_parameters_t config; /* Current config */ int enable; int output_width; int output_height; double output_aspect; double factor_x; double factor_y; /* Last seen input frame */ int input_width; int input_height; int input_format; int input_interlaced; double input_aspect; /* working buffers */ uint32_t *vWorkY; uint32_t 
*vWorkUV; /* scaling tables */ uint32_t *hControl; uint32_t *hControlUV; uint32_t *vOffsets; uint32_t *vOffsetsUV; uint32_t *vWeights; uint32_t *vWeightsUV; /* memory for work areas and scaling tables */ void *pMem; } warp_plugin_t; /* * */ static void init_tables(warp_plugin_t *this) { #define BP(x) ((uint8_t*)(x)) /* allocate memory for scaling tables and workspace */ free(this->pMem); this->pMem = malloc(this->input_width*3 + this->output_width*sizeof(uint32_t)*3*2 + this->output_height*sizeof(uint32_t)*4 + 2*9*128); /* - aligned for P4 cache line */ this->vWorkY = (uint32_t*)ALIGN(128, this->pMem); this->vWorkUV = (uint32_t*)ALIGN(128, BP(this->vWorkY) + this->input_width*2 + 128); this->hControl = (uint32_t*)ALIGN(128, BP(this->vWorkUV) + this->input_width + 128); this->vOffsets = (uint32_t*)ALIGN(128, BP(this->hControl) + this->output_width * sizeof(uint32_t) * 3 + 128); this->vWeights = (uint32_t*)ALIGN(128, BP(this->vOffsets) + this->output_height * sizeof(uint32_t) + 128); if (this->input_format == XINE_IMGFMT_YV12) { this->vOffsetsUV = (uint32_t*)ALIGN(128, BP(this->vWeights) + this->output_height * sizeof(uint32_t) + 128); this->vWeightsUV = (uint32_t*)ALIGN(128, BP(this->vOffsetsUV) + this->output_height * sizeof(uint32_t) + 128); this->hControlUV = (uint32_t*)ALIGN(128, BP(this->vWeightsUV) + this->output_height * sizeof(uint32_t) + 128); init_tables_yv12(this->output_width, this->output_height, this->input_width, this->input_height, this->input_interlaced, this->factor_x, this->factor_y, this->hControl, this->vOffsets, this->vWeights, this->hControlUV, this->vOffsetsUV, this->vWeightsUV ); } else if (this->input_format == XINE_IMGFMT_YUY2) { init_tables_yuy2(this->output_width, this->output_height, this->input_width, this->input_height, this->input_interlaced, this->factor_x, this->factor_y, this->hControl, this->vOffsets, this->vWeights ); } } static void calculate_factors(warp_plugin_t *this) { /* try to guess amount to stretch/shrink */ double 
adiff = this->input_aspect - this->output_aspect; this->factor_x = 1.0; this->factor_y = 1.0; if (adiff > 0.1) { if (adiff > 0.1 + ((16.0-12.0)/9.0)) { /* >16:9 -> >4:3 */ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff); this->factor_x = 0.95; this->factor_y = 1.15; this->output_aspect += (adiff - 4.0/9.0); DBG(" changing target ratio to %3.1lf\n", this->output_aspect); } else { /* 16:9 ... 12:9 -> 4:3 */ DBG("aspect ratio diff %1.3lf > 0 : 16.9...12:9 -> 4:3\n", adiff); this->factor_x = 1.0 - 0.05 * adiff * 9.0/4.0; this->factor_y = 1.0 + 0.15 * adiff * 9.0/4.0; } } else if (adiff < -0.1) { if(adiff < -0.1-((16.0-12.0)/9.0)) { /* <4:3 -> <16:9 */ DBG("aspect ratio diff %1.3lf > 0 : too large !\n", adiff); this->factor_x = 1.05; this->factor_y = 0.85; this->output_aspect += (adiff + 4.0/9.0); DBG(" changing target ratio to %3.1lf\n", this->output_aspect); } else { /* 4:3...16:9 -> 16:9 */ DBG("aspect ratio diff %1.3lf < 0 : 4:3...16:9 -> 16:9\n", adiff); this->factor_x = 1.0 + 0.05 * adiff * 9.0/4.0; this->factor_y = 1.0 - 0.15 * adiff * 9.0/4.0; } } else { DBG("aspect ratio matches, no warp\n"); this->factor_x = 1.0; this->factor_y = 1.0; } DBG("factor_x = %1.3lf factor_y = %1.3lf output ratio = %1.3lf\n", this->factor_x, this->factor_y, this->output_aspect); } /* * */ void *warp_init_plugin(xine_t *xine, void *data) { #if !defined(__x86_64__) /* Need at least MMX */ if (!(xine_mm_accel() & MM_ACCEL_X86_MMX)) { fprintf(stderr, "warp_init_plugin: ERROR: at least MMX required\n"); return NULL; } #endif return init_plugin(xine, data); } static post_plugin_t *open_plugin(post_class_t *class_gen, int inputs, xine_audio_port_t **audio_target, xine_video_port_t **video_target) { warp_plugin_t *this = calloc(1, sizeof(warp_plugin_t)); post_plugin_t *this_gen = (post_plugin_t *) this; post_in_t *input; post_out_t *output; xine_post_in_t *input_param; post_video_port_t *port; static xine_post_api_t post_api = { warp_set_parameters, warp_get_parameters, 
warp_get_param_descr, warp_get_help }; if (!this || !video_target || !video_target[0]) { free(this); return NULL; } _x_post_init(this_gen, 0, 1); port = _x_post_intercept_video_port(this_gen, video_target[0], &input, &output); port->intercept_frame = intercept_frame_yuy; port->new_frame->draw = post_draw; input->xine_in.name = "video"; output->xine_out.name = "video (scaled)"; this_gen->xine_post.video_input[0] = &port->new_port; this_gen->dispose = warp_dispose; input_param = &this->parameter_input; input_param->name = "parameters"; input_param->type = XINE_POST_DATA_PARAMETERS; input_param->data = &post_api; xine_list_push_back(this_gen->input, input_param); this->config.output_aspect = 0.0; /* -> do not change aspect ratio */ this->config.output_width = 0; /* -> do not change width */ this->config.output_height = 0; /* -> do not change height */ this->config.no_downscaling = 0; this->input_width = 0; /* not known yet, triggers initialization later */ this->input_height = 0; return this_gen; } static void warp_dispose(post_plugin_t *this_gen) { if (_x_post_dispose(this_gen)) { warp_plugin_t *this = (warp_plugin_t *) this_gen; DBG("dispose\n"); free(this->pMem); free(this); } } static vo_frame_t *got_frame(vo_frame_t *frame) { post_video_port_t *port = (post_video_port_t *)frame->port; warp_plugin_t *this = (warp_plugin_t *)port->post; double adiff = this->input_aspect - frame->ratio; if (this->input_width != frame->width || this->input_height != frame->height || this->input_format != frame->format || FABS(adiff)>0.1 || this->input_interlaced != !!(frame->flags & VO_INTERLACED_FLAG)) { DBG("detected frame format change: %dx%d -> %dx%d, interlaced %d->%d, aspect %1.3lf->%1.3lf, %s->%s\n", this->input_width, this->input_height, frame->width, frame->height, this->input_interlaced, !!(frame->flags & VO_INTERLACED_FLAG), this->input_aspect, frame->ratio, this->input_format==XINE_IMGFMT_YV12 ? "yv12":"yuy2", frame->format==XINE_IMGFMT_YV12 ? 
"yv12":"yuy2" ); /* free tables and buffers */ free(this->pMem); this->pMem = NULL; /* remember frame properties to detect changes in video format */ this->input_width = frame->width; this->input_height = frame->height; this->input_format = frame->format; this->input_aspect = frame->ratio; this->input_interlaced = !!(frame->flags & VO_INTERLACED_FLAG); /* re-configure target size and aspect ratio */ this->output_aspect = this->config.output_aspect ?: frame->ratio; if (!this->config.no_downscaling) { this->output_width = this->config.output_width ?: frame->width; this->output_height = this->config.output_height ?: frame->height; } else { this->output_width = MAX(this->config.output_width, frame->width); this->output_height = MAX(this->config.output_height, frame->height); } /* calculate warp function factors */ calculate_factors(this); adiff = this->input_aspect - this->output_aspect; if(this->output_width == frame->width && this->output_height == frame->height && adiff < 0.1 && adiff > -0.1 ) { this->enable = 0; DBG("--> nothing to do, disabling processing for now\n"); return NULL; } this->enable = 1; init_tables(this); } if (!this->enable) return NULL; return port->original_port->get_frame(port->original_port, this->output_width, this->output_height, this->output_aspect, frame->format, frame->flags | VO_BOTH_FIELDS); } static void draw_internal(vo_frame_t *frame, vo_frame_t *new_frame) { post_video_port_t *port = (post_video_port_t *)frame->port; warp_plugin_t *this = (warp_plugin_t *)port->post; int proc_height = frame->height; if (frame->format == XINE_IMGFMT_YV12) { do_warp_yv12(new_frame->base[0], frame->base[0], new_frame->pitches[0], frame->pitches[0], this->output_width, this->output_height, frame->width, proc_height, this->input_interlaced, this->hControl, this->vOffsets, this->vWeights, this->vWorkY, 0); proc_height /= 2; do_warp_yv12(new_frame->base[1], frame->base[1], new_frame->pitches[1], frame->pitches[1], this->output_width/2, this->output_height/2, 
frame->width/2, proc_height, this->input_interlaced, this->hControlUV, this->vOffsetsUV, this->vWeightsUV, this->vWorkUV, 0); do_warp_yv12(new_frame->base[2], frame->base[2], new_frame->pitches[2], frame->pitches[2], this->output_width/2, this->output_height/2, frame->width/2, proc_height, this->input_interlaced, this->hControlUV, this->vOffsetsUV, this->vWeightsUV, this->vWorkUV, 0); } else if (frame->format == XINE_IMGFMT_YUY2) { do_warp_yuy2(new_frame->base[0], frame->base[0], new_frame->pitches[0], frame->pitches[0], this->output_width, this->output_height, frame->width, proc_height, this->input_interlaced, this->hControl, this->vOffsets, this->vWeights, this->vWorkY, this->vWorkUV, 0); } } /* * parameter functions */ static xine_post_api_descr_t *warp_get_param_descr(void) { return &warp_param_descr; } static int warp_set_parameters(xine_post_t *this_gen, void *param_gen) { warp_plugin_t *this = (warp_plugin_t *)this_gen; warp_parameters_t *params = (warp_parameters_t *)param_gen; memcpy(&this->config, params, sizeof(warp_parameters_t)); this->input_width = this->input_height = 0; if(this->config.output_aspect > 999) this->config.output_aspect /= 1000.0; DBG("warp_set_parameters: " "output_width=%d, output_height=%d, output_aspect=%4.3lf, no_downscaling=%d\n", this->config.output_width, this->config.output_height, this->config.output_aspect, this->config.no_downscaling); return 1; } static int warp_get_parameters(xine_post_t *this_gen, void *param_gen) { warp_plugin_t *this = (warp_plugin_t *)this_gen; warp_parameters_t *params = (warp_parameters_t *)param_gen; DBG("warp_get_parameters\n"); memcpy(params, &this->config, sizeof(warp_parameters_t)); return 1; } static char *warp_get_help(void) { return _( "The warp plugin scales video to another resolution. " "It supports non-linear stretching to change video aspect ratio. 
" "\n" "Parameters\n" " output_width: Scale video to width\n" " (0 -> do not change video width)\n" " output_height: Scale video to height\n" " (0 -> do not change video height)\n" " output_aspect: Adjust aspect ratio using non-linear scaling\n" " (0 -> do not change video aspect ratio)\n" " no_downscaling: Do not downscale video\n" "\n" ); } /* * plugin info */ static post_info_t info = { XINE_POST_TYPE_VIDEO_FILTER }; const plugin_info_t xine_plugin_info[] __attribute__((visibility("default"))) = { /* type, API, "name", version, special_info, init_function */ { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "warp", XINE_VERSION_CODE, &info, &warp_init_plugin }, { PLUGIN_POST, POST_PLUGIN_IFACE_VERSION, "swscale", XINE_VERSION_CODE, &info, &warp_init_plugin }, { PLUGIN_NONE, 0, "", 0, NULL, NULL } };