4
* Extended MMX prediction composition
5
* routines handling the four different interpolation cases...
7
* Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
10
* This program is free software; you can redistribute it and/or
11
* modify it under the terms of the GNU General Public License
12
* as published by the Free Software Foundation; either version 2
13
* of the License, or (at your option) any later version.
15
* This program is distributed in the hope that it will be useful,
16
* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with this program; if not, write to the Free Software
22
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
27
#include "mjpeg_types.h"
28
#if defined( ARCH_X86) || defined(ARCH_X86_64)
33
* void predcomp_<ix><iy>_mmx(char *src,char *dst,int lx, int w, int h, int addflag);
35
* ix - Interpolation in x iy - Interpolation in y
40
/* The no interpolation case... */
42
/*
 * predcomp_00_mmx: prediction composition, no-interpolation case.
 * NOTE(review): only fragments of the body are visible in this view. The
 * notes below describe the visible statements and assume the conventional
 * mmx.h macro semantics (operands are source, destination) -- TODO confirm
 * against the complete file.
 */
void predcomp_00_mmx(char *src,char *dst,int lx, int w, int h, int addflag)
46
* mm1 = one's mask for src
47
* mm0 = zero mask for src...
50
movd_g2r(0x00010001, mm1); /* low 32 bits of mm1 = two 16-bit words of value 1 */
51
punpckldq_r2r(mm1, mm1); /* replicate low dword: all four words of mm1 = 1 */
57
movq_m2r(src[0], mm4); /* first 8 bytes of row (src bytes 0..7) */
61
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> 16-bit words (zero-extended, given mm0 is zero) */
62
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words; mm5 presumably a copy of mm4 (copy not visible here) */
64
movq_m2r(dst[0], mm2); /* current destination bytes 0..7 */
66
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
67
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words; mm3 presumably a copy of mm2 (not visible here) */
74
packuswb_r2r(mm5, mm4); /* repack words of mm4 (low half) and mm5 (high half) to 8 unsigned-saturated bytes */
77
movq_r2m(mm4, dst[0]); /* store result to dst bytes 0..7 */
81
movq_m2r(src[8], mm4); /* second 8 bytes of row (src bytes 8..15) */
85
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> words */
86
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words (mm5 presumably a copy of mm4) */
88
movq_m2r(dst[8], mm2); /* current destination bytes 8..15 */
90
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
91
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words (mm3 presumably a copy of mm2) */
98
packuswb_r2r(mm5, mm4); /* repack the two word halves to 8 unsigned-saturated bytes */
100
movq_r2m(mm4, dst[8]); /* store result to dst bytes 8..15 */
103
dst += lx; /* advance dst by lx bytes to the next row */
113
/* The x-axis interpolation case... */
115
/*
 * predcomp_10_mmx: prediction composition, x-axis interpolation case.
 * Each 8-byte group is read at offsets n and n+1 of the same row and the
 * two are summed word-wise as the first step of an average.
 * NOTE(review): only fragments of the body are visible in this view; the
 * notes assume the conventional mmx.h macro semantics (operands are source,
 * destination) -- TODO confirm against the complete file.
 */
void predcomp_10_mmx(char *src,char *dst,int lx, int w, int h, int addflag)
117
movd_g2r(0x00010001, mm1); /* low 32 bits of mm1 = two 16-bit words of value 1 */
118
punpckldq_r2r(mm1, mm1); /* replicate low dword: all four words of mm1 = 1 */
123
movq_m2r(src[0], mm4); /* first 8 bytes of row (src bytes 0..7) */
125
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> 16-bit words (mm0 presumably zero; its setup is not visible here) */
126
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words; mm5 presumably a copy of mm4 (not visible here) */
127
movq_m2r(src[1], mm2); /* same row shifted by one byte (src bytes 1..8) */
129
punpcklbw_r2r(mm0, mm2); /* low 4 shifted bytes -> words */
130
punpckhbw_r2r(mm0, mm3); /* high 4 shifted bytes -> words; mm3 presumably a copy of mm2 */
132
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging mm4/mm5 with mm2/mm3 */
141
movq_m2r(dst[0], mm2); /* current destination bytes 0..7 */
143
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
144
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words (mm3 presumably a copy of mm2) */
145
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging the prediction (mm4/mm5) with dst (mm2/mm3) */
153
packuswb_r2r(mm5, mm4); /* repack words of mm4 (low half) and mm5 (high half) to 8 unsigned-saturated bytes */
154
movq_r2m(mm4, dst[0]); /* store result to dst bytes 0..7 */
158
movq_m2r(src[8], mm4); /* second 8 bytes of row (src bytes 8..15) */
160
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> words */
161
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words (mm5 presumably a copy of mm4) */
162
movq_m2r(src[9], mm2); /* same row shifted by one byte (src bytes 9..16) */
164
punpcklbw_r2r(mm0, mm2); /* low 4 shifted bytes -> words */
165
punpckhbw_r2r(mm0, mm3); /* high 4 shifted bytes -> words (mm3 presumably a copy of mm2) */
167
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging mm4/mm5 with mm2/mm3 */
176
movq_m2r(dst[8], mm2); /* current destination bytes 8..15 */
178
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
179
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words (mm3 presumably a copy of mm2) */
180
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging the prediction (mm4/mm5) with dst (mm2/mm3) */
188
packuswb_r2r(mm5, mm4); /* repack the two word halves to 8 unsigned-saturated bytes */
189
movq_r2m(mm4, dst[8]); /* store result to dst bytes 8..15 */
202
/* The y-axis interpolation case... */
204
/*
 * predcomp_01_mmx: prediction composition, y-axis interpolation case.
 * Each 8-byte group is read from the current row and from the row lx bytes
 * further on, and the two are summed word-wise as the first step of an
 * average.
 * NOTE(review): only fragments of the body are visible in this view; the
 * notes assume the conventional mmx.h macro semantics (operands are source,
 * destination) -- TODO confirm against the complete file.
 */
void predcomp_01_mmx(char *src,char *dst,int lx, int w, int h, int addflag)
206
movd_g2r(0x00010001, mm1); /* low 32 bits of mm1 = two 16-bit words of value 1 */
207
punpckldq_r2r(mm1, mm1); /* replicate low dword: all four words of mm1 = 1 */
212
movq_m2r(src[0], mm4); /* first 8 bytes of the first row (bytes 0..7) */
214
src += lx; /* Next row: step src forward by lx bytes */
215
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> 16-bit words (mm0 presumably zero; its setup is not visible here) */
216
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words; mm5 presumably a copy of mm4 (not visible here) */
218
movq_m2r(src[0], mm2); /* bytes 0..7 of the next row (src was advanced by lx above) */
220
punpcklbw_r2r(mm0, mm2); /* low 4 next-row bytes -> words */
221
punpckhbw_r2r(mm0, mm3); /* high 4 next-row bytes -> words; mm3 presumably a copy of mm2 */
223
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging mm4/mm5 with mm2/mm3 */
232
movq_m2r(dst[0], mm2); /* current destination bytes 0..7 */
234
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
235
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words (mm3 presumably a copy of mm2) */
236
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging the prediction (mm4/mm5) with dst (mm2/mm3) */
244
packuswb_r2r(mm5, mm4); /* repack words of mm4 (low half) and mm5 (high half) to 8 unsigned-saturated bytes */
245
movq_r2m(mm4, dst[0]); /* store result to dst bytes 0..7 */
249
src -= lx; /* Back to first row... */
250
movq_m2r(src[8], mm4); /* second 8 bytes of the first row (bytes 8..15) */
252
src += lx; /* Next row; no later src -= lx is visible, so src stays on the next row */
253
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> words */
254
punpckhbw_r2r(mm0, mm5); /* high 4 bytes of mm5 -> words (mm5 presumably a copy of mm4) */
256
movq_m2r(src[8], mm2); /* bytes 8..15 of the next row */
258
punpcklbw_r2r(mm0, mm2); /* low 4 next-row bytes -> words */
259
punpckhbw_r2r(mm0, mm3); /* high 4 next-row bytes -> words (mm3 presumably a copy of mm2) */
261
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging mm4/mm5 with mm2/mm3 */
270
movq_m2r(dst[8], mm2); /* current destination bytes 8..15 */
272
punpcklbw_r2r(mm0, mm2); /* low 4 dst bytes -> words */
273
punpckhbw_r2r(mm0, mm3); /* high 4 dst bytes -> words (mm3 presumably a copy of mm2) */
274
paddw_r2r(mm2, mm4); /* mm4 += mm2 word-wise: first step of averaging the prediction (mm4/mm5) with dst (mm2/mm3) */
282
packuswb_r2r(mm5, mm4); /* repack the two word halves to 8 unsigned-saturated bytes */
283
movq_r2m(mm4, dst[8]); /* store result to dst bytes 8..15 */
295
/* The x-axis and y-axis interpolation case... */
297
/*
 * predcomp_11_mmx: prediction composition, x-axis and y-axis interpolation.
 * Each 8-byte group is read at four positions: offsets n and n+1 on the
 * current row and on the row lx bytes further on.
 * NOTE(review): only fragments of the body are visible in this view (the
 * additions into mm4/mm6 and the final scaling are not shown); the notes
 * assume the conventional mmx.h macro semantics (operands are source,
 * destination) -- TODO confirm against the complete file.
 */
void predcomp_11_mmx(char *src,char *dst,int lx, int w, int h, int addflag)
305
movd_g2r(0x00020002, mm2); /* low 32 bits of mm2 = two 16-bit words of value 2 */
306
punpckldq_r2r(mm2, mm2); /* replicate low dword: all four words of mm2 = 2 (presumably the rounding term of the 4-sample average -- TODO confirm) */
307
movd_g2r(0x00010001, mm1); /* low 32 bits of mm1 = two 16-bit words of value 1 */
308
punpckldq_r2r(mm1, mm1); /* replicate low dword: all four words of mm1 = 1 */
312
movq_m2r(src[0], mm4); /* mm4 and mm6 accumulate partial sums for interp.; start with bytes 0..7 of the 1st row */
314
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> 16-bit words (mm0 presumably zero; its setup is not visible here) */
315
punpckhbw_r2r(mm0, mm6); /* high 4 bytes of mm6 -> words; mm6 presumably a copy of mm4 (not visible here) */
317
movq_m2r(src[1], mm5); /* 1st row shifted by one byte (bytes 1..8) */
319
punpcklbw_r2r(mm0, mm5); /* low 4 bytes of mm5 -> words */
321
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words; mm7 presumably a copy of mm5 (not visible here) */
324
src += lx; /* update pointer to next row */
326
movq_m2r(src[0], mm5); /* first 8 bytes of the 2nd row (src was advanced by lx above) */
328
punpcklbw_r2r(mm0, mm5); /* low 4 bytes -> words, to be accumulated into the partial interpolation sum (add not visible here) */
330
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words (mm7 presumably a copy of mm5) */
333
movq_m2r(src[1], mm5); /* 2nd row shifted by one byte (bytes 1..8) */
335
punpcklbw_r2r(mm0, mm5); /* low 4 bytes of mm5 -> words */
337
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words (mm7 presumably a copy of mm5) */
348
movq_m2r(dst[0], mm5); /* current destination bytes 0..7 */
350
punpcklbw_r2r(mm0, mm5); /* low 4 dst bytes -> words */
351
punpckhbw_r2r(mm0, mm7); /* high 4 dst bytes -> words (mm7 presumably a copy of mm5) */
352
paddw_r2r(mm5, mm4); /* mm4 += mm5 word-wise: first step of averaging the prediction (mm4/mm6) with dst (mm5/mm7) */
360
packuswb_r2r(mm6, mm4); /* repack words of mm4 (low half) and mm6 (high half) to 8 unsigned-saturated bytes */
361
movq_r2m(mm4, dst[0]); /* store result to dst bytes 0..7 */
365
src -= lx; /* Back to first row... */
367
movq_m2r(src[8], mm4); /* mm4 and mm6 accumulate partial sums for interp.; start with bytes 8..15 of the 1st row */
369
punpcklbw_r2r(mm0, mm4); /* low 4 bytes of mm4 -> words */
370
punpckhbw_r2r(mm0, mm6); /* high 4 bytes of mm6 -> words (mm6 presumably a copy of mm4) */
372
movq_m2r(src[9], mm5); /* 1st row shifted by one byte (bytes 9..16) */
374
punpcklbw_r2r(mm0, mm5); /* low 4 bytes of mm5 -> words */
376
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words (mm7 presumably a copy of mm5) */
379
src += lx; /* update pointer to next row; no later src -= lx is visible, so src stays on the next row */
381
movq_m2r(src[8], mm5); /* second 8 bytes of the 2nd row (bytes 8..15) */
383
punpcklbw_r2r(mm0, mm5); /* low 4 bytes -> words, to be accumulated into the partial interpolation sum (add not visible here) */
385
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words (mm7 presumably a copy of mm5) */
388
movq_m2r(src[9], mm5); /* 2nd row shifted by one byte (bytes 9..16) */
390
punpcklbw_r2r(mm0, mm5); /* low 4 bytes of mm5 -> words */
392
punpckhbw_r2r(mm0, mm7); /* high 4 bytes of mm7 -> words (mm7 presumably a copy of mm5) */
403
movq_m2r(dst[8], mm5); /* current destination bytes 8..15 */
405
punpcklbw_r2r(mm0, mm5); /* low 4 dst bytes -> words */
406
punpckhbw_r2r(mm0, mm7); /* high 4 dst bytes -> words (mm7 presumably a copy of mm5) */
407
paddw_r2r(mm5, mm4); /* mm4 += mm5 word-wise: first step of averaging the prediction (mm4/mm6) with dst (mm5/mm7) */
415
packuswb_r2r(mm6, mm4); /* repack the two word halves to 8 unsigned-saturated bytes */
416
movq_r2m(mm4, dst[8]); /* store result to dst bytes 8..15 */