;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
rnd_rv40_2d_tbl: times 4 dw  0
rnd_rv40_1d_tbl: times 4 dw  0
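; RV40 rounding-bias tables: addressed as rnd_rv40_*_tbl + rnd_bias*8,
; i.e. one 4-word row per bias value derived from (mx,my) further below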

%macro mv0_pixels_mc8 0
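    ; fast path for mx == my == 0: no interpolation is needed, so each
    ; pair of rows is simply copied (put) or averaged into dst (avg);
    ; CHROMAMC_AVG expands to NOTHING for put and DIRECT_AVG for avg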
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]

%macro chroma_mc8_mmx_func 2-3
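; the rv40 rounding constants live in the tables above; x86-64 keeps the
; table address in a spare register (r8, loaded by the lea below), while
; x86-32 must address the tables directly, hence the two sets of defines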
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   int stride, int h, int mx, int my)
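;
; What both variants compute, as a C-style sketch (cf. the scalar
; reference in libavcodec's h264chroma template; names are illustrative):
;   A = (8-mx) * (8-my);   B = mx * (8-my);
;   C = (8-mx) * my;       D = mx * my;      // A+B+C+D == 64
;   for (i = 0; i < 8; i++)
;       dst[i] = (A*src[i]        + B*src[i+1] +
;                 C*src[i+stride] + D*src[i+stride+1] + 32) >> 6;
;   // "put" stores the result; "avg" averages it with the old dst pixels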
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
.at_least_one_non_zero:
    and          r7, 6         ; &~1 for mx/my=[0,7]
    and          r0, 6         ; &~1 for mx/my=[0,7]
    mov          r6, r2        ; dxy = x ? 1 : stride
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    lea          r8, [rnd_rv40_1d_tbl]
    movq         m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpckldq    m5, m5        ; mm5 = B = x
    psubw        m4, m5        ; mm4 = A = 8-x
    movq         m0, [r1   ]   ; mm0 = src[0..7]
    movq         m2, [r1+r6]   ; mm2 = src[1..8]
    pmullw       m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw       m2, m5        ; [mm2,mm3] = B * src[1..8]
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
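    ; 1-D case summary: with dxy = 1 (horizontal) or stride (vertical),
    ; and x being whichever of mx/my is non-zero, each row computes
    ;   dst[i] = ((8-x)*src[i] + x*src[i+dxy] + rnd) >> 3
    ; where rnd is 4 for h264 (pw_4), plus the table bias for rv40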
.both_non_zero: ; general case, bilinear
    lea          r8, [rnd_rv40_2d_tbl]
    mov          r6, rsp       ; backup stack pointer
    and         rsp, ~(mmsize-1) ; align stack
    sub         rsp, 16        ; AA and DD
    punpckldq    m4, m4        ; mm4 = x words
    punpckldq    m6, m6        ; mm6 = y words
    pmullw       m4, m6        ; mm4 = x * y
    movq    [rsp+8], m4        ; DD = x * y
    psubw        m5, m4        ; mm5 = B = 8x - xy
    psubw        m6, m4        ; mm6 = C = 8y - xy
    psubw        m4, m7        ; mm4 = A = xy - (8x+8y) + 64
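    ; equivalently A = (8-x)*(8-y), since (8-x)*(8-y) expands to
    ; 64 - 8x - 8y + xy; A+B+C+D = 64, so the weights sum to 1 after >>6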
    movq         m0, [r1   ]   ; mm0 = src[0..7]
    movq         m1, [r1+1]    ; mm1 = src[1..8]
    paddw        m2, m1        ; mm2 = A * src[0..3] + B * src[1..4]
    paddw        m3, m0        ; mm3 = A * src[4..7] + B * src[5..8]
    paddw        m3, m1        ; [mm2,mm3] += C * src[0..7]
    paddw        m3, m4        ; [mm2,mm3] += D * src[1..8]
    paddw        m2, [rnd_2d_%2+rnd_bias*8]
    paddw        m3, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2        ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
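    ; rnd here is 32 for h264 (pw_32) and 28 for vc1's _nornd variant;
    ; rv40 adds its per-(mx,my) bias via the rnd_bias*8 table offset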
    mov         rsp, r6        ; restore stack pointer

%macro chroma_mc4_mmx_func 2
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    lea          r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%define rnd_2d_rv40 rnd_rv40_2d_tbl
    and          r5, 6         ; &~1 for mx/my=[0,7]
    paddw        m6, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG4 m1, m6, [r0]
    paddw        m0, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG4 m1, m0, [r0]

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    imul        r5d, r4d       ; x*y<<16 | y*(8-x)
    sub         r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)
    punpckldq    m5, m5        ; mm5 = {A,B,A,B}
    punpckldq    m6, m6        ; mm6 = {C,D,C,D}
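    ; the four weights are packed two-per-dword, e.g. mx=3, my=2 gives
    ; A=30, B=18, C=10, D=6 (summing to 64); the 0x94 shuffles below build
    ; {src[0],src[1],src[1],src[2]}, so a single pmaddwd against {A,B,A,B}
    ; produces both output pixels of the 2-wide row at once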
    pshufw       m2, m2, 0x94  ; mm2 = src[0,1,1,2]
    pmaddwd      m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw       m0, m0, 0x94  ; mm0 = src[0,1,1,2]
    paddw        m1, [rnd_2d_%2]
    paddw        m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    CHROMAMC_AVG4 m1, m3, [r0]

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
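; h264 rounds by 4 before >>3 (1-D) and by 32 before >>6 (2-D); the vc1
; "_nornd" flavours use 3 and 28 to implement VC-1's no-rounding mode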
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
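; the avg functions are instantiated a second time below; in FFmpeg's
; full file this second block sits under a different INIT_MMX flavour
; (3dnow, using pavgusb in place of mmxext's pavgb)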
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
.at_least_one_non_zero:
    ; general case, bilinear
    add          r4, 8         ; x*255+8 = x<<8 | (8-x)
    imul         r6, r4        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul        r4d, r5d       ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
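    ; pmaddubsw packing trick: each 16-bit lane of the coefficient vector
    ; holds x in its high byte and (8-x) in its low byte, so one pmaddubsw
    ; over interleaved {src[i], src[i+1]} bytes yields
    ; (8-x)*src[i] + x*src[i+1] per lane; the two imuls above fold the
    ; vertical weights (8-y) and y into those coefficient bytes as well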
    movdqa       m5, [rnd_2d_%2]
    sub          r4, r5        ; 255*x+8 = x<<8 | (8-x)
    movdqa       m6, [rnd_1d_%2]
    sub          r5, r4        ; 255*y+8 = y<<8 | (8-y)
    movdqa       m6, [rnd_1d_%2]

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    imul        r6d, r4d       ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul        r4d, r5d       ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
    punpcklbw    m1, [r1+r2*1+1]
    punpcklbw    m3, [r1+r2*2+1]
    CHROMAMC_AVG m1, [r0   ]
    CHROMAMC_AVG m3, [r0+r2]

%define CHROMAMC_AVG NOTHING
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
chroma_mc4_ssse3_func avg, h264