/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
12
#include "vpx_config.h"
14
#include "vpx_ports/mem.h"
15
#include "filter_x86.h"
17
/*
 * Filter coefficient table defined outside this file: 8 rows of 6*8 shorts.
 * The MMX and SSE2 predictors in this file pick one row with xoffset or
 * yoffset and hand it to the filter routines as the vp8_filter argument.
 */
extern const short vp8_six_tap_mmx[8][6*8];
19
/*
 * Prototype of a filter routine implemented outside this file. It takes
 * unsigned char input and unsigned short output; the *_mmx six-tap
 * predictors here call it first, writing into their FData2 scratch buffer.
 * NOTE(review): the bare numbers between lines look like stray line numbers
 * from extraction, and the parameter-list parentheses and terminating
 * semicolon are not visible -- confirm against the original file.
 */
extern void vp8_filter_block1d_h6_mmx
21
unsigned char *src_ptr,
22
unsigned short *output_ptr,
23
unsigned int src_pixels_per_line,
24
unsigned int pixel_step,
25
unsigned int output_height,
26
unsigned int output_width,
27
const short *vp8_filter
29
/*
 * Prototype of a filter routine implemented outside this file. It takes
 * unsigned short input and unsigned char output; the *_mmx six-tap
 * predictors here call it last, reading from FData2 and writing dst_ptr.
 * NOTE(review): only seven parameters are visible, but every call in this
 * file passes eight arguments (dst_pitch is third). The interleaved
 * numbering skips 33, so a parameter line is presumably missing after
 * output_ptr -- confirm against the original file. Parentheses and the
 * terminating semicolon are also not visible.
 */
extern void vp8_filter_block1dc_v6_mmx
31
unsigned short *src_ptr,
32
unsigned char *output_ptr,
34
unsigned int pixels_per_line,
35
unsigned int pixel_step,
36
unsigned int output_height,
37
unsigned int output_width,
38
const short *vp8_filter
40
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in, unsigned short out). Called first by the 8x8 and 8x4 SSE2
 * predictors in this file, writing into FData2.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d8_h6_sse2
42
unsigned char *src_ptr,
43
unsigned short *output_ptr,
44
unsigned int src_pixels_per_line,
45
unsigned int pixel_step,
46
unsigned int output_height,
47
unsigned int output_width,
48
const short *vp8_filter
50
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in, unsigned short out). Called first by the 16x16 SSE2 predictor
 * in this file, writing into FData2.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d16_h6_sse2
52
unsigned char *src_ptr,
53
unsigned short *output_ptr,
54
unsigned int src_pixels_per_line,
55
unsigned int pixel_step,
56
unsigned int output_height,
57
unsigned int output_width,
58
const short *vp8_filter
60
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * short in, unsigned char out). Called last by the 8x8 and 8x4 SSE2
 * predictors in this file, reading FData2 and writing dst_ptr.
 * NOTE(review): seven parameters are visible but callers pass eight
 * arguments (dst_pitch is third); the interleaved numbering skips 64, so a
 * parameter line is presumably missing after output_ptr -- confirm.
 */
extern void vp8_filter_block1d8_v6_sse2
62
unsigned short *src_ptr,
63
unsigned char *output_ptr,
65
unsigned int pixels_per_line,
66
unsigned int pixel_step,
67
unsigned int output_height,
68
unsigned int output_width,
69
const short *vp8_filter
71
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * short in, unsigned char out). Called last by the 16x16 SSE2 predictor in
 * this file, reading FData2 and writing dst_ptr.
 * NOTE(review): seven parameters are visible but callers pass eight
 * arguments (dst_pitch is third); the interleaved numbering skips 75, so a
 * parameter line is presumably missing after output_ptr -- confirm.
 */
extern void vp8_filter_block1d16_v6_sse2
73
unsigned short *src_ptr,
74
unsigned char *output_ptr,
76
unsigned int pixels_per_line,
77
unsigned int pixel_step,
78
unsigned int output_height,
79
unsigned int output_width,
80
const short *vp8_filter
82
/*
 * Prototype of a routine implemented outside this file (unsigned char in,
 * unsigned short out, no filter argument). The 16x16 SSE2 predictor calls
 * it in its "Second-pass only" path to fill FData2 before the vertical
 * filter runs.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_unpack_block1d16_h6_sse2
84
unsigned char *src_ptr,
85
unsigned short *output_ptr,
86
unsigned int src_pixels_per_line,
87
unsigned int output_height,
88
unsigned int output_width
90
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in, unsigned char out). Used by the "First-pass only" paths of the
 * 8x8 and 8x4 SSE2 predictors, which write straight to dst_ptr.
 * NOTE(review): five parameters are visible but callers pass six arguments
 * (dst_pitch is fourth); the interleaved numbering skips 95, so a parameter
 * line is presumably missing after output_ptr -- confirm.
 */
extern void vp8_filter_block1d8_h6_only_sse2
92
unsigned char *src_ptr,
93
unsigned int src_pixels_per_line,
94
unsigned char *output_ptr,
96
unsigned int output_height,
97
const short *vp8_filter
99
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in, unsigned char out). Used by the "First-pass only" path of the
 * 16x16 SSE2 predictor, which writes straight to dst_ptr.
 * NOTE(review): five parameters are visible but the caller passes six
 * arguments (dst_pitch is fourth); the interleaved numbering skips 104, so
 * a parameter line is presumably missing after output_ptr -- confirm.
 */
extern void vp8_filter_block1d16_h6_only_sse2
101
unsigned char *src_ptr,
102
unsigned int src_pixels_per_line,
103
unsigned char *output_ptr,
105
unsigned int output_height,
106
const short *vp8_filter
108
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in, unsigned char out). Used by the "Second-pass only" paths of the
 * 8x8 and 8x4 SSE2 predictors, which read from two rows above src_ptr and
 * write straight to dst_ptr.
 * NOTE(review): five parameters are visible but callers pass six arguments
 * (dst_pitch is fourth); the interleaved numbering skips 113, so a
 * parameter line is presumably missing after output_ptr -- confirm.
 */
extern void vp8_filter_block1d8_v6_only_sse2
110
unsigned char *src_ptr,
111
unsigned int src_pixels_per_line,
112
unsigned char *output_ptr,
114
unsigned int output_height,
115
const short *vp8_filter
120
/*
 * 4x4 six-tap prediction in two steps through a 16-bit scratch buffer:
 * vp8_filter_block1d_h6_mmx fills FData2 from the source, then
 * vp8_filter_block1dc_v6_mmx reads FData2 and writes dst_ptr.
 *
 * NOTE(review): this listing looks damaged by extraction -- the bare
 * numbers between lines appear to be stray line numbers, and the
 * parentheses, braces and the declarations of xoffset, yoffset and
 * dst_pitch (all used below) are not visible. Confirm against the original.
 */
void vp8_sixtap_predict4x4_mmx
122
unsigned char *src_ptr,
123
int src_pixels_per_line,
126
unsigned char *dst_ptr,
130
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data buffer used in filtering */
131
const short *HFilter, *VFilter;
132
/* Coefficient row for the horizontal step is selected by xoffset. */
HFilter = vp8_six_tap_mmx[xoffset];
133
/* Start two source rows above src_ptr; pixel_step 1, output_height 9, output_width 8. */
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
134
/* Coefficient row for the vertical step is selected by yoffset. */
VFilter = vp8_six_tap_mmx[yoffset];
135
/* Read the scratch buffer from element 8 onward and write the result to dst_ptr. */
vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
140
/*
 * 16x16 six-tap prediction built from 4-column strips: four calls to
 * vp8_filter_block1d_h6_mmx (source/scratch offsets 0, 4, 8, 12) fill the
 * 16-bit scratch buffer FData2, then four calls to
 * vp8_filter_block1dc_v6_mmx read it back and write dst_ptr at the same
 * column offsets.
 *
 * NOTE(review): this listing looks damaged by extraction -- the bare
 * numbers between lines appear to be stray line numbers, and the
 * parentheses, braces and the declarations of xoffset, yoffset and
 * dst_pitch (all used below) are not visible. Confirm against the original.
 */
void vp8_sixtap_predict16x16_mmx
142
unsigned char *src_ptr,
143
int src_pixels_per_line,
146
unsigned char *dst_ptr,
151
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
153
const short *HFilter, *VFilter;
156
/* Coefficient row for the horizontal step is selected by xoffset. */
HFilter = vp8_six_tap_mmx[xoffset];
158
/* Each call starts two source rows above src_ptr; output_height 21, output_width 32. */
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
159
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
160
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
161
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
163
/* Coefficient row for the vertical step is selected by yoffset. */
VFilter = vp8_six_tap_mmx[yoffset];
164
/* Scratch reads start at element 32 (plus the strip offset); output goes to dst_ptr plus the strip offset. */
vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
165
vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
166
vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
167
vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
172
/*
 * 8x8 six-tap prediction built from two 4-column strips: two calls to
 * vp8_filter_block1d_h6_mmx (offsets 0 and 4) fill the 16-bit scratch
 * buffer FData2, then two calls to vp8_filter_block1dc_v6_mmx read it back
 * and write dst_ptr and dst_ptr + 4.
 *
 * NOTE(review): this listing looks damaged by extraction -- the bare
 * numbers between lines appear to be stray line numbers, and the
 * parentheses, braces and the declarations of xoffset, yoffset and
 * dst_pitch (all used below) are not visible. Confirm against the original.
 */
void vp8_sixtap_predict8x8_mmx
174
unsigned char *src_ptr,
175
int src_pixels_per_line,
178
unsigned char *dst_ptr,
183
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
185
const short *HFilter, *VFilter;
187
/* Coefficient row for the horizontal step is selected by xoffset. */
HFilter = vp8_six_tap_mmx[xoffset];
188
/* Each call starts two source rows above src_ptr; output_height 13, output_width 16. */
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
189
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
191
/* Coefficient row for the vertical step is selected by yoffset. */
VFilter = vp8_six_tap_mmx[yoffset];
192
/* Scratch reads start at element 16 (plus the strip offset). */
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
193
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
198
/*
 * 8x4 six-tap prediction built from two 4-column strips, using the same
 * call pattern as the 8x8 MMX predictor but with smaller height arguments
 * (9 for the horizontal step, 4 for the vertical step).
 *
 * NOTE(review): this listing looks damaged by extraction -- the bare
 * numbers between lines appear to be stray line numbers, and the
 * parentheses, braces and the declarations of xoffset, yoffset and
 * dst_pitch (all used below) are not visible. Confirm against the original.
 */
void vp8_sixtap_predict8x4_mmx
200
unsigned char *src_ptr,
201
int src_pixels_per_line,
204
unsigned char *dst_ptr,
209
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
211
const short *HFilter, *VFilter;
213
/* Coefficient row for the horizontal step is selected by xoffset. */
HFilter = vp8_six_tap_mmx[xoffset];
214
/* Each call starts two source rows above src_ptr; output_height 9, output_width 16. */
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
215
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
217
/* Coefficient row for the vertical step is selected by yoffset. */
VFilter = vp8_six_tap_mmx[yoffset];
218
/* Scratch reads start at element 16 (plus the strip offset). */
vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
219
vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
225
/*
 * 16x16 bilinear prediction assembled from four calls to
 * vp8_bilinear_predict8x8_mmx, one per 8x8 quadrant (top-left, top-right,
 * bottom-left, bottom-right). The same xoffset, yoffset and pitches are
 * forwarded to every call; only the source and destination start pointers
 * change.
 *
 * NOTE(review): vp8_bilinear_predict8x8_mmx is not declared in the visible
 * part of this file, and the listing looks damaged by extraction -- bare
 * numbers between lines, no parentheses or braces, and no visible
 * declarations for xoffset, yoffset and dst_pitch. Confirm against the
 * original file.
 */
void vp8_bilinear_predict16x16_mmx
227
unsigned char *src_ptr,
228
int src_pixels_per_line,
231
unsigned char *dst_ptr,
235
/* Top-left quadrant. */
vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
236
/* Top-right quadrant: 8 columns to the right. */
vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
237
/* Bottom-left quadrant: 8 rows down in both source and destination. */
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
238
/* Bottom-right quadrant: 8 rows down and 8 columns to the right. */
vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
244
/*
 * 16x16 six-tap prediction, SSE2 variant. Three call sequences are visible:
 *   1. horizontal filter into the 16-bit scratch buffer FData2, then
 *      vertical filter from FData2 into dst_ptr;
 *   2. "First-pass only": horizontal filter written straight to dst_ptr;
 *   3. "Second-pass only": unpack the source into FData2, then vertical
 *      filter from FData2 into dst_ptr.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible in this listing (presumably tests on xoffset / yoffset -- confirm
 * against the original). The listing also shows bare numbers between lines
 * that look like stray line numbers, and lacks the parentheses, braces and
 * declarations of xoffset, yoffset and dst_pitch.
 */
void vp8_sixtap_predict16x16_sse2
246
unsigned char *src_ptr,
247
int src_pixels_per_line,
250
unsigned char *dst_ptr,
255
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
257
const short *HFilter, *VFilter;
263
/* Sequence 1: both steps. Horizontal step starts two source rows above src_ptr. */
HFilter = vp8_six_tap_mmx[xoffset];
264
vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
265
VFilter = vp8_six_tap_mmx[yoffset];
266
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
270
/* First-pass only */
271
HFilter = vp8_six_tap_mmx[xoffset];
272
vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
277
/* Second-pass only */
278
VFilter = vp8_six_tap_mmx[yoffset];
279
/* No horizontal filter here: the unpack routine fills FData2 from the source instead. */
vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
280
vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
285
/*
 * 8x8 six-tap prediction, SSE2 variant. Three call sequences are visible:
 *   1. horizontal filter into the 16-bit scratch buffer FData2, then
 *      vertical filter from FData2 into dst_ptr;
 *   2. "First-pass only": horizontal filter written straight to dst_ptr;
 *   3. "Second-pass only": vertical filter applied directly to the source
 *      (starting two rows above src_ptr) and written to dst_ptr.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible in this listing (presumably tests on xoffset / yoffset -- confirm
 * against the original). The listing also shows bare numbers between lines
 * that look like stray line numbers, and lacks the parentheses, braces and
 * declarations of xoffset, yoffset and dst_pitch.
 */
void vp8_sixtap_predict8x8_sse2
287
unsigned char *src_ptr,
288
int src_pixels_per_line,
291
unsigned char *dst_ptr,
295
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
296
const short *HFilter, *VFilter;
302
/* Sequence 1: both steps. Horizontal step starts two source rows above src_ptr. */
HFilter = vp8_six_tap_mmx[xoffset];
303
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
304
VFilter = vp8_six_tap_mmx[yoffset];
305
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
309
/* First-pass only */
310
HFilter = vp8_six_tap_mmx[xoffset];
311
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
316
/* Second-pass only */
317
VFilter = vp8_six_tap_mmx[yoffset];
318
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
323
/*
 * 8x4 six-tap prediction, SSE2 variant. Same three call sequences as the
 * 8x8 SSE2 predictor, with smaller height arguments (9 rows into the
 * scratch buffer, 4 rows of output):
 *   1. horizontal filter into FData2, then vertical filter into dst_ptr;
 *   2. "First-pass only": horizontal filter written straight to dst_ptr;
 *   3. "Second-pass only": vertical filter applied directly to the source.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible in this listing (presumably tests on xoffset / yoffset -- confirm
 * against the original). The listing also shows bare numbers between lines
 * that look like stray line numbers, and lacks the parentheses, braces and
 * declarations of xoffset, yoffset and dst_pitch.
 */
void vp8_sixtap_predict8x4_sse2
325
unsigned char *src_ptr,
326
int src_pixels_per_line,
329
unsigned char *dst_ptr,
333
DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
334
const short *HFilter, *VFilter;
340
/* Sequence 1: both steps. Horizontal step starts two source rows above src_ptr. */
HFilter = vp8_six_tap_mmx[xoffset];
341
vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
342
VFilter = vp8_six_tap_mmx[yoffset];
343
vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
347
/* First-pass only */
348
HFilter = vp8_six_tap_mmx[xoffset];
349
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
354
/* Second-pass only */
355
VFilter = vp8_six_tap_mmx[yoffset];
356
vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
364
/*
 * Prototype of a filter routine implemented outside this file. Unlike the
 * MMX/SSE2 routines above, input and output are both unsigned char and the
 * last parameter is an integer filter index; callers here pass xoffset.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d8_h6_ssse3
366
unsigned char *src_ptr,
367
unsigned int src_pixels_per_line,
368
unsigned char *output_ptr,
369
unsigned int output_pitch,
370
unsigned int output_height,
371
unsigned int vp8_filter_index
374
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in and out, integer filter index last). Called by the 16x16 SSSE3
 * predictor in this file with xoffset as the index.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d16_h6_ssse3
376
unsigned char *src_ptr,
377
unsigned int src_pixels_per_line,
378
unsigned char *output_ptr,
379
unsigned int output_pitch,
380
unsigned int output_height,
381
unsigned int vp8_filter_index
384
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in and out, separate source and output pitches, integer filter
 * index last). Called by the 16x16 SSSE3 predictor in this file; the
 * visible "Second-pass only" call passes yoffset as the index.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d16_v6_ssse3
386
unsigned char *src_ptr,
387
unsigned int src_pitch,
388
unsigned char *output_ptr,
389
unsigned int out_pitch,
390
unsigned int output_height,
391
unsigned int vp8_filter_index
394
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in and out, separate source and output pitches, integer filter
 * index last). Called by the 8x8 and 8x4 SSSE3 predictors in this file;
 * the visible "Second-pass only" calls pass yoffset as the index.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d8_v6_ssse3
396
unsigned char *src_ptr,
397
unsigned int src_pitch,
398
unsigned char *output_ptr,
399
unsigned int out_pitch,
400
unsigned int output_height,
401
unsigned int vp8_filter_index
404
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in and out, integer filter index last). Called by the 4x4 SSSE3
 * predictor in this file with xoffset as the index.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d4_h6_ssse3
406
unsigned char *src_ptr,
407
unsigned int src_pixels_per_line,
408
unsigned char *output_ptr,
409
unsigned int output_pitch,
410
unsigned int output_height,
411
unsigned int vp8_filter_index
414
/*
 * Prototype of a filter routine implemented outside this file (unsigned
 * char in and out, separate source and output pitches, integer filter
 * index last). Called by the 4x4 SSSE3 predictor in this file; the last
 * visible call passes yoffset as the index.
 * NOTE(review): parentheses and terminating semicolon are not visible in
 * this listing; the bare numbers look like extraction residue.
 */
extern void vp8_filter_block1d4_v6_ssse3
416
unsigned char *src_ptr,
417
unsigned int src_pitch,
418
unsigned char *output_ptr,
419
unsigned int out_pitch,
420
unsigned int output_height,
421
unsigned int vp8_filter_index
424
/*
 * 16x16 six-tap prediction, SSSE3 variant. The scratch buffer FData2 holds
 * unsigned char here (not unsigned short as in the MMX/SSE2 variants), and
 * xoffset / yoffset are passed to the filter routines directly as filter
 * indices. Four call sequences are visible:
 *   1. horizontal filter into FData2, then vertical filter from FData2
 *      (source pitch 16) into dst_ptr;
 *   2. "First-pass only": horizontal filter straight to dst_ptr;
 *   3. "Second-pass only": vertical filter applied to the source;
 *   4. a plain copy via vp8_copy_mem16x16, which the existing comment ties
 *      to the xoffset==0 && yoffset==0 case.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible, and several calls below are cut off mid-argument-list (the
 * interleaved numbering skips the lines that would complete them).
 * vp8_copy_mem16x16 is not declared in the visible part of this file. The
 * parentheses, braces and declarations of xoffset, yoffset and dst_pitch
 * are also absent. Confirm everything against the original file.
 */
void vp8_sixtap_predict16x16_ssse3
426
unsigned char *src_ptr,
427
int src_pixels_per_line,
430
unsigned char *dst_ptr,
435
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
441
/* Sequence 1: horizontal step starts two source rows above src_ptr (remaining arguments not visible). */
vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
442
src_pixels_per_line, FData2,
444
/* Vertical step reads FData2 with a source pitch of 16 (remaining arguments not visible). */
vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
449
/* First-pass only */
450
vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
451
dst_ptr, dst_pitch, 16, xoffset);
458
/* Second-pass only */
459
vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
461
dst_ptr, dst_pitch, 16, yoffset);
465
/* ssse3 second-pass only function couldn't handle (xoffset==0 &&
466
* yoffset==0) case correctly. Add copy function here to guarantee
467
* six-tap function handles all possible offsets. */
468
vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
473
/*
 * 8x8 six-tap prediction, SSSE3 variant. The scratch buffer FData2 holds
 * unsigned char, and xoffset / yoffset are passed to the filter routines
 * directly as filter indices. Four call sequences are visible:
 *   1. horizontal filter into FData2, then vertical filter from FData2
 *      (source pitch 8) into dst_ptr;
 *   2. horizontal filter straight to dst_ptr;
 *   3. "Second-pass only": vertical filter applied to the source;
 *   4. a plain copy via vp8_copy_mem8x8, which the existing comment ties
 *      to the xoffset==0 && yoffset==0 case.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible, and several calls below are cut off mid-argument-list (the
 * interleaved numbering skips the lines that would complete them).
 * vp8_copy_mem8x8 is not declared in the visible part of this file. The
 * parentheses, braces and declarations of xoffset, yoffset and dst_pitch
 * are also absent. Confirm everything against the original file.
 */
void vp8_sixtap_predict8x8_ssse3
475
unsigned char *src_ptr,
476
int src_pixels_per_line,
479
unsigned char *dst_ptr,
483
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
489
/* Sequence 1: horizontal step starts two source rows above src_ptr (remaining arguments not visible). */
vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
490
src_pixels_per_line, FData2,
492
/* Vertical step reads FData2 with a source pitch of 8 (remaining arguments not visible). */
vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
497
/* Horizontal filter only, written straight to dst_ptr. */
vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
498
dst_ptr, dst_pitch, 8, xoffset);
505
/* Second-pass only */
506
vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
508
dst_ptr, dst_pitch, 8, yoffset);
512
/* ssse3 second-pass only function couldn't handle (xoffset==0 &&
513
* yoffset==0) case correctly. Add copy function here to guarantee
514
* six-tap function handles all possible offsets. */
515
vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
521
/*
 * 8x4 six-tap prediction, SSSE3 variant. Same call pattern as the 8x8
 * SSSE3 predictor with an output height of 4 in the visible single-step
 * calls. Four call sequences are visible:
 *   1. horizontal filter into FData2, then vertical filter from FData2
 *      (source pitch 8) into dst_ptr;
 *   2. "First-pass only": horizontal filter straight to dst_ptr;
 *   3. "Second-pass only": vertical filter applied to the source;
 *   4. a plain copy via vp8_copy_mem8x4, which the existing comment ties
 *      to the xoffset==0 && yoffset==0 case.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible, and several calls below are cut off mid-argument-list (the
 * interleaved numbering skips the lines that would complete them).
 * vp8_copy_mem8x4 is not declared in the visible part of this file. The
 * parentheses, braces and declarations of xoffset, yoffset and dst_pitch
 * are also absent. Confirm everything against the original file.
 */
void vp8_sixtap_predict8x4_ssse3
523
unsigned char *src_ptr,
524
int src_pixels_per_line,
527
unsigned char *dst_ptr,
531
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
537
/* Sequence 1: horizontal step starts two source rows above src_ptr (remaining arguments not visible). */
vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
538
src_pixels_per_line, FData2,
540
/* Vertical step reads FData2 with a source pitch of 8 (remaining arguments not visible). */
vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
545
/* First-pass only */
546
vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
547
dst_ptr, dst_pitch, 4, xoffset);
554
/* Second-pass only */
555
vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
557
dst_ptr, dst_pitch, 4, yoffset);
561
/* ssse3 second-pass only function couldn't handle (xoffset==0 &&
562
* yoffset==0) case correctly. Add copy function here to guarantee
563
* six-tap function handles all possible offsets. */
564
vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
569
/*
 * 4x4 six-tap prediction, SSSE3 variant. The scratch buffer FData2 holds
 * 4*9 unsigned chars, and xoffset / yoffset are passed to the filter
 * routines directly as filter indices. Four sequences are visible:
 *   1. horizontal filter into FData2 (pitch 4, height 9), then vertical
 *      filter from FData2 (source pitch 4) into dst_ptr;
 *   2. horizontal filter straight to dst_ptr;
 *   3. vertical filter applied to the source, starting two rows above
 *      src_ptr;
 *   4. a hand-written 4-row, 4-byte-per-row copy loop, which the existing
 *      comment ties to the xoffset==0 && yoffset==0 case.
 *
 * NOTE(review): the conditions that choose between these sequences are not
 * visible, several calls are cut off mid-argument-list, and the loop
 * counter r has no visible declaration. The parentheses, braces and
 * declarations of xoffset, yoffset and dst_pitch are also absent, and the
 * function may continue past the end of this listing. Confirm everything
 * against the original file.
 */
void vp8_sixtap_predict4x4_ssse3
571
unsigned char *src_ptr,
572
int src_pixels_per_line,
575
unsigned char *dst_ptr,
579
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
585
/* Sequence 1: horizontal step starts two source rows above src_ptr (one argument line not visible). */
vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
587
FData2, 4, 9, xoffset);
588
/* Vertical step reads FData2 with a source pitch of 4 (remaining arguments not visible). */
vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
593
/* Horizontal filter only, written straight to dst_ptr. */
vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
594
dst_ptr, dst_pitch, 4, xoffset);
601
/* Vertical filter only (one argument line not visible). */
vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
603
dst_ptr, dst_pitch, 4, yoffset);
607
/* ssse3 second-pass only function couldn't handle (xoffset==0 &&
608
* yoffset==0) case correctly. Add copy function here to guarantee
609
* six-tap function handles all possible offsets. */
612
/* Copy four rows of four bytes, advancing each pointer by its own pitch. */
for (r = 0; r < 4; r++)
614
dst_ptr[0] = src_ptr[0];
615
dst_ptr[1] = src_ptr[1];
616
dst_ptr[2] = src_ptr[2];
617
dst_ptr[3] = src_ptr[3];
618
dst_ptr += dst_pitch;
619
src_ptr += src_pixels_per_line;