;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

rnd_rv40_2d_tbl: times 4 dw 0
rnd_rv40_1d_tbl: times 4 dw 0
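
; mv0_pixels_mc8 handles the mx == 0 && my == 0 fast path: with no
; fractional MV component there is nothing to interpolate, so each row is
; copied straight to dst ("put") or averaged into dst via CHROMAMC_AVG
; ("avg"). Roughly equivalent C for the put case (an illustrative sketch,
; not the actual C reference):
;     for (i = 0; i < h; i++)
;         memcpy(dst + i * stride, src + i * stride, 8);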
%macro mv0_pixels_mc8 0
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
%macro chroma_mc8_mmx_func 3
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                               int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
.at_least_one_non_zero
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
    and r10, 6           ; &~1 for mx/my=[0,7]
    and r0, 6            ; &~1 for mx/my=[0,7]
    mov r6, r2           ; dxy = x ? 1 : stride
; mx == 0 XOR my == 0 - 1-dimensional filter only
    lea r11, [rnd_rv40_1d_tbl]
    movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpckldq m5, m5     ; mm5 = B = x
    psubw m4, m5         ; mm4 = A = 8-x
    movq m0, [r1   ]     ; mm0 = src[0..7]
    movq m2, [r1+r6]     ; mm2 = src[1..8]
    pmullw m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw m2, m5        ; [mm2,mm3] = B * src[1..8]
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0  ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
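
; The 1D path in C, roughly (illustrative sketch; x is whichever of mx/my
; is nonzero, dxy is 1 for horizontal or stride for vertical filtering,
; and rnd is the rounding constant, 4 for H.264):
;     A = 8 - x;  B = x;
;     for (i = 0; i < 8; i++)
;         dst[i] = (A * src[i] + B * src[i + dxy] + rnd) >> 3;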
.both_non_zero ; general case, bilinear
    lea r11, [rnd_rv40_2d_tbl]
    mov r6, rsp          ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
    sub rsp, 16          ; AA and DD
    punpckldq m4, m4     ; mm4 = x words
    punpckldq m6, m6     ; mm6 = y words
    pmullw m4, m6        ; mm4 = x * y
    movq [rsp+8], m4     ; DD = x * y
    psubw m5, m4         ; mm5 = B = 8x - xy
    psubw m6, m4         ; mm6 = C = 8y - xy
    psubw m4, m7         ; mm4 = A = xy - (8x+8y) + 64
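
    ; Weight algebra, for reference: A = xy - 8x - 8y + 64 = (8-x)*(8-y),
    ; B = 8x - xy = x*(8-y), C = 8y - xy = (8-x)*y, D = x*y. The four
    ; weights sum to 64, hence the final >> 6 normalization.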
    movq m0, [r1  ]      ; mm0 = src[0..7]
    movq m1, [r1+1]      ; mm1 = src[1..8]
    paddw m2, m1         ; mm2 = A * src[0..3] + B * src[1..4]
    paddw m3, m0         ; mm3 = A * src[4..7] + B * src[5..8]
    paddw m3, m1         ; [mm2,mm3] += C * src[0..7]
    paddw m3, m4         ; [mm2,mm3] += D * src[1..8]
    paddw m2, [rnd_2d_%2+rnd_bias*8]
    paddw m3, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2  ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
    mov rsp, r6          ; restore stack pointer
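
; The 2D (bilinear) path in C, roughly (illustrative sketch; rnd is 32 for
; H.264, 28 for VC-1, and table-driven for RV40):
;     for (i = 0; i < 8; i++)
;         dst[i] = (A * src[i]          + B * src[i + 1] +
;                   C * src[i + stride] + D * src[i + stride + 1] + rnd) >> 6;
; The mc4 and mc2 variants below compute the same filter for 4- and
; 2-pixel-wide blocks.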
%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
    lea r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
%define rnd_2d_rv40 rnd_rv40_2d_tbl
    and r5, 6            ; &~1 for mx/my=[0,7]
    paddw m6, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG4 m1, m6, [r0]
    paddw m0, [rnd_2d_%2+rnd_bias*8]
    CHROMAMC_AVG4 m1, m0, [r0]
%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
    imul r5d, r4d        ; x*y<<16 | y*(8-x)
    sub r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
    punpckldq m5, m5     ; mm5 = {A,B,A,B}
    punpckldq m6, m6     ; mm6 = {C,D,C,D}
    pshufw m2, m2, 0x94  ; mm2 = src[0,1,1,2]
    pmaddwd m1, m5       ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw m0, m0, 0x94  ; mm0 = src[0,1,1,2]
    paddw m1, [rnd_2d_%2]
    paddw m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    CHROMAMC_AVG4 m1, m3, [r0]
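
; The mc2 variant produces only two output pixels per row, so it can use
; pmaddwd: with the source shuffled to words src[0,1,1,2] and the weights
; packed as {A,B,A,B}, one pmaddwd yields A*src[0] + B*src[1] and
; A*src[1] + B*src[2], the horizontal term for both pixels at once.
; {C,D,C,D} handles the second source row the same way.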
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
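
; Rounding constants: the 1D filter weights sum to 8 and are normalized by
; >> 3, so H.264 rounds with 8/2 = 4; the 2D weights sum to 64 and are
; normalized by >> 6, so it rounds with 64/2 = 32. VC-1's "no rounding"
; mode biases the same sums downward, using 3 and 28 instead.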
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow
%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
.at_least_one_non_zero
    ; general case, bilinear
    add r4, 8            ; x*255+8 = x<<8 | (8-x)
    imul r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d        ; y *(x*255+8) = y *x<<8 | y *(8-x)
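
    ; r6 now packs the byte pair (A, B) = ((8-y)*(8-x), (8-y)*x) in every
    ; word and r4 packs (C, D) = (y*(8-x), y*x). Once src[i] is interleaved
    ; with src[i+1], pmaddubsw computes A*src[i] + B*src[i+1] per word for
    ; one row and C*src[i] + D*src[i+1] for the row below, covering the
    ; whole bilinear filter with two multiply-adds per output row.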
    movdqa m5, [rnd_2d_%2]
    sub r4, r5           ; 255*x+8 = x<<8 | (8-x)
    movdqa m6, [rnd_1d_%2]
    sub r5, r4           ; 255*y+8 = y<<8 | (8-y)
    movdqa m6, [rnd_1d_%2]
%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
    imul r6d, r4d        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d        ; y *(x*255+8) = y *x<<8 | y *(8-x)
    punpcklbw m0, [r1   +1]
    punpcklbw m1, [r1   +1]
    punpcklbw m3, [r1+r2+1]
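
    ; punpcklbw interleaves each pixel with its right neighbour
    ; (src[0],src[1],src[1],src[2],...), matching the packed byte-pair
    ; weights above so that pmaddubsw evaluates both horizontal taps in a
    ; single instruction.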
    CHROMAMC_AVG m1, [r0   ]
    CHROMAMC_AVG m3, [r0+r2]
%define CHROMAMC_AVG NOTHING
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
chroma_mc4_ssse3_func put, h264, ssse3

%define CHROMAMC_AVG DIRECT_AVG
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
chroma_mc4_ssse3_func avg, h264, ssse3