1
;*****************************************************************************
2
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3
;*****************************************************************************
4
;* Copyright (C) 2011 x264 project
6
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8
;* This file is part of Libav.
10
;* Libav is free software; you can redistribute it and/or
11
;* modify it under the terms of the GNU Lesser General Public
12
;* License as published by the Free Software Foundation; either
13
;* version 2.1 of the License, or (at your option) any later version.
15
;* Libav is distributed in the hope that it will be useful,
16
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
;* Lesser General Public License for more details.
20
;* You should have received a copy of the GNU Lesser General Public
21
;* License along with Libav; if not, write to the Free Software
22
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
;******************************************************************************
26
%include "x86util.asm"
34
; Constant table for the 10-bit qpel routines. Each entry is a label followed
; by a repeated word (dw) or dword (dd) value.
; pw_pixel_max: eight words of (1 << 10)-1 = 1023. It is passed as the last
; operand of CLIPW and loaded into a register with mova elsewhere in this file.
pw_pixel_max: times 8 dw ((1 << 10)-1)
36
; pad10: eight words of 10*1023 = 10230.
pad10: times 8 dw 10*1023
37
; pad20: eight words of 20*1023 = 20460.
pad20: times 8 dw 20*1023
38
; pad30: eight words of 30*1023 = 30690.
pad30: times 8 dw 30*1023
39
; depad: four dwords of 32*20*1023 + 512 = 655232.
depad: times 4 dd 32*20*1023 + 512
40
; depad2: eight words of 20*1023 + 16*1022 + 16 = 36828.
depad2: times 8 dw 20*1023 + 16*1022 + 16
41
; unpad: eight words of 16*1022/32 = 511.
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
43
; tap1/tap2/tap3: four repetitions each of the word pairs (1,-5), (20,20) and
; (-5,1). Read in order they spell the coefficients 1,-5,20,20,-5,1.
; NOTE(review): presumably pairwise multiplier operands for the 6-tap filter;
; the consuming instructions are not visible here -- confirm.
tap1: times 4 dw 1, -5
44
tap2: times 4 dw 20, 20
45
tap3: times 4 dw -5, 1
46
; pd_0f: four dwords of 0xffff (low 16 bits set in each dword).
pd_0f: times 4 dd 0xffff
69
psubw %1, %2 ; (a-b)/4-b
70
paddw %1, %3 ; (a-b)/4-b+c
71
psraw %1, 2 ; ((a-b)/4-b+c)/4
72
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
94
FILT_H %1, %7, %8, [pw_16]
96
CLIPW %1, [pb_0], [pw_pixel_max]
106
%define OP_MOV AVG_MOV
116
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
119
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
124
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
126
call stub_%2_h264_qpel%4_%3_10_%1
131
call stub_%2_h264_qpel%4_%3_10_%1
136
call stub_%2_h264_qpel%4_%3_10_%1
139
lea r0, [r0+r2*%4+%4*2]
140
lea r1, [r1+r2*%4+%4*2]
141
call stub_%2_h264_qpel%4_%3_10_%1
146
call stub_%2_h264_qpel%4_%3_10_%1
149
call stub_%2_h264_qpel%4_%3_10_%1
152
call stub_%2_h264_qpel%4_%3_10_%1
153
lea r0, [r10+r2*%4+%4*2]
154
lea r1, [r11+r2*%4+%4*2]
155
%ifndef UNIX64 ; fall through to function
156
call stub_%2_h264_qpel%4_%3_10_%1
162
;cpu, put/avg, mc, 4/8, ...
165
MCAxA %1, %2, %3, %4, i, %5,%6,%7
167
cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
168
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
169
call stub_%2_h264_qpel%4_%3_10_%1
173
stub_%2_h264_qpel%4_%3_10_%1:
176
;-----------------------------------------------------------------------------
177
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
178
;-----------------------------------------------------------------------------
192
cglobal_mc mmxext, %1, mc00, 4, 3,4,0
198
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
206
cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
216
OP_MOV [r0+r2+16], m1
227
%define OP_MOV AVG_MOV
230
;-----------------------------------------------------------------------------
231
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
232
;-----------------------------------------------------------------------------
235
%define PALIGNR PALIGNR_MMX
239
%1 sse2_cache64 , put, 8
240
%define PALIGNR PALIGNR_SSSE3
241
%1 ssse3_cache64, put, 8
244
%define OP_MOV AVG_MOV
245
%define PALIGNR PALIGNR_MMX
249
%1 sse2_cache64 , avg, 8
250
%define PALIGNR PALIGNR_SSSE3
251
%1 ssse3_cache64, avg, 8
256
cglobal_mc %1, %2, mc20, %3, 3,4,9
258
mova m1, [pw_pixel_max]
273
%else ; movu is slow on these processors
281
PALIGNR m3, m0, m2, 2, m5
282
PALIGNR m7, m0, m2, 8, m5
284
PALIGNR m4, m0, m2, 4, m5
285
PALIGNR m7, m0, m2, 6, m5
291
PALIGNR m3, m6, m2, 2, m5
293
PALIGNR m4, m6, m2, 4, m5
294
PALIGNR m7, m6, m2, 6, m5
300
FILT_H m2, m3, m4, p16
314
;-----------------------------------------------------------------------------
315
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
316
;-----------------------------------------------------------------------------
318
cglobal_mc %1, %2, mc30, %3, 3,5,9
320
jmp stub_%2_h264_qpel%3_mc10_10_%1.body
325
;-----------------------------------------------------------------------------
326
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
327
;-----------------------------------------------------------------------------
329
cglobal_mc %1, %2, mc10, %3, 3,5,9
333
mova m1, [pw_pixel_max]
348
%else ; movu is slow on these processors
356
PALIGNR m3, m0, m2, 2, m5
357
PALIGNR m7, m0, m2, 8, m5
359
PALIGNR m4, m0, m2, 4, m5
360
PALIGNR m7, m0, m2, 6, m5
366
PALIGNR m3, m6, m2, 2, m5
368
PALIGNR m4, m6, m2, 4, m5
369
PALIGNR m7, m6, m2, 6, m5
375
FILT_H m2, m3, m4, p16
392
;-----------------------------------------------------------------------------
393
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
394
;-----------------------------------------------------------------------------
399
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
409
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
418
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
424
cglobal_mc %1, %2, mc02, %3, 3,4,8
431
call v_filt%3_ %+ i %+ _10_%1.no_addr4
441
;-----------------------------------------------------------------------------
442
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
443
;-----------------------------------------------------------------------------
445
cglobal_mc %1, %2, mc01, %3, 3,5,8
455
call v_filt%3_ %+ i %+ _10_%1
467
;-----------------------------------------------------------------------------
468
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
469
;-----------------------------------------------------------------------------
471
cglobal_mc %1, %2, mc03, %3, 3,5,8
473
jmp stub_%2_h264_qpel%3_mc01_10_%1.body
478
;-----------------------------------------------------------------------------
479
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
480
;-----------------------------------------------------------------------------
481
%macro H_FILT_AVG 3-4
483
;FILT_H with fewer registers and averaged with the FILT_V result
484
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
485
;unfortunately I need three registers, so m5 will have to be re-read from memory
492
psraw m5, 2 ; (a-b)/4
493
psubw m5, m6 ; (a-b)/4-b
496
paddw m5, m6 ; (a-b)/4-b+c
497
psraw m5, 2 ; ((a-b)/4-b+c)/4
498
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
500
CLIPW m5, [pb_0], [pw_pixel_max]
513
H_FILT_AVG mmxext, 4, i
517
H_FILT_AVG mmxext, 4, i, 0
524
H_FILT_AVG sse2, 8, i, 0
526
H_FILT_AVG sse2, 8, i
533
; this REALLY needs x86_64
534
cglobal_mc %1, %2, mc11, %3, 3,6,8
546
call v_filt%3_ %+ i %+ _10_%1
547
call h_filt%3_ %+ i %+ _10_%1
560
;-----------------------------------------------------------------------------
561
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
562
;-----------------------------------------------------------------------------
564
cglobal_mc %1, %2, mc31, %3, 3,6,8
567
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
572
;-----------------------------------------------------------------------------
573
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
574
;-----------------------------------------------------------------------------
576
cglobal_mc %1, %2, mc13, %3, 3,7,12
578
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
583
;-----------------------------------------------------------------------------
584
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
585
;-----------------------------------------------------------------------------
587
cglobal_mc %1, %2, mc33, %3, 3,6,8
590
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
595
;-----------------------------------------------------------------------------
596
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
597
;-----------------------------------------------------------------------------
602
psubw %1, %2 ; a-5*b+4*c
604
paddw %1, %3 ; a-5*b+20*c
626
neg r2 ; This actually saves instructions
627
lea r1, [r1+r2*2-mmsize+PAD]
628
lea r4, [rsp+PAD+gprsize]
643
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
645
movu [r4+i*mmsize*3], m0
650
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
652
movu [r4+i*mmsize*3], m0
654
lea r1, [r1+r2*8+mmsize]
682
movu m1, [r1+mmsize-4]
683
movu m2, [r1+mmsize-2]
684
mova m3, [r1+mmsize+0]
685
movu m4, [r1+mmsize+2]
686
movu m5, [r1+mmsize+4]
687
movu m6, [r1+mmsize+6]
734
cglobal_mc %1, %2, mc22, %3, 3,7,12
735
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
736
mov r6, rsp ; backup stack pointer
737
and rsp, ~(mmsize-1) ; align stack
743
mova m7, [pw_pixel_max]
760
mov rsp, r6 ; restore stack pointer
766
;-----------------------------------------------------------------------------
767
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
768
;-----------------------------------------------------------------------------
770
cglobal_mc %1, %2, mc12, %3, 3,7,12
771
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
772
mov r6, rsp ; backup stack pointer
773
and rsp, ~(mmsize-1) ; align stack
782
mova m7, [pw_pixel_max]
793
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
805
mov rsp, r6 ; restore stack pointer
811
;-----------------------------------------------------------------------------
812
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
813
;-----------------------------------------------------------------------------
815
cglobal_mc %1, %2, mc32, %3, 3,7,12
816
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) -- NOTE(review): the expression multiplies by 3, not 4; confirm which is intended
817
mov r6, rsp ; backup stack pointer
818
and rsp, ~(mmsize-1) ; align stack
823
mov r4d, 2 ; sizeof(pixel)
824
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
829
;-----------------------------------------------------------------------------
830
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
831
;-----------------------------------------------------------------------------
863
cglobal_mc %1, %2, mc21, %3, 3,7,12
866
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) -- NOTE(review): the expression multiplies by 3, not 4; confirm which is intended
867
mov r6, rsp ; backup stack pointer
868
and rsp, ~(mmsize-1) ; align stack
876
mov r4d, PAD-mmsize ; H buffer
877
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
882
;-----------------------------------------------------------------------------
883
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
884
;-----------------------------------------------------------------------------
886
cglobal_mc %1, %2, mc23, %3, 3,7,12
888
jmp stub_%2_h264_qpel%3_mc21_10_%1.body