100
99
%define OP_MOV mova
106
105
%define OP_MOV AVG_MOV
116
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
119
MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
124
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
126
call stub_%2_h264_qpel%4_%3_10_%1
131
call stub_%2_h264_qpel%4_%3_10_%1
136
call stub_%2_h264_qpel%4_%3_10_%1
139
lea r0, [r0+r2*%4+%4*2]
140
lea r1, [r1+r2*%4+%4*2]
141
call stub_%2_h264_qpel%4_%3_10_%1
114
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
115
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
120
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
125
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
128
lea r0, [r0+r2*%3+%3*2]
129
lea r1, [r1+r2*%3+%3*2]
130
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
143
132
%else ; ARCH_X86_64
146
call stub_%2_h264_qpel%4_%3_10_%1
149
call stub_%2_h264_qpel%4_%3_10_%1
152
call stub_%2_h264_qpel%4_%3_10_%1
153
lea r0, [r10+r2*%4+%4*2]
154
lea r1, [r11+r2*%4+%4*2]
155
%ifndef UNIX64 ; fall through to function
156
call stub_%2_h264_qpel%4_%3_10_%1
133
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
137
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
139
lea r1, [r %+ p1+%3*2]
140
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
142
lea r1, [r %+ p1+r2*%3]
143
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
144
lea r0, [r%6+r2*%3+%3*2]
145
lea r1, [r %+ p1+r2*%3+%3*2]
146
%if UNIX64 == 0 ; fall through to function
147
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
162
153
;cpu, put/avg, mc, 4/8, ...
165
MCAxA %1, %2, %3, %4, i, %5,%6,%7
156
%if ARCH_X86_32 || cpuflag(sse2)
157
MCAxA_OP %1, %2, %3, i, %4,%5,%6
167
cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
168
%ifndef UNIX64 ; no prologue or epilogue for UNIX64
169
call stub_%2_h264_qpel%4_%3_10_%1
160
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
161
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
162
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
173
stub_%2_h264_qpel%4_%3_10_%1:
166
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
176
169
;-----------------------------------------------------------------------------
192
cglobal_mc mmxext, %1, mc00, 4, 3,4,0
185
cglobal_mc %1, mc00, 4, 3,4,0
198
cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
191
cglobal %1_h264_qpel8_mc00_10, 3,4
201
194
lea r0, [r0+r2*4]
232
225
;-----------------------------------------------------------------------------
233
226
%macro MC_CACHE 1
234
227
%define OP_MOV mova
235
%define PALIGNR PALIGNR_MMX
239
%1 sse2_cache64 , put, 8
240
%define PALIGNR PALIGNR_SSSE3
241
%1 ssse3_cache64, put, 8
230
INIT_XMM sse2, cache64
232
INIT_XMM ssse3, cache64
244
237
%define OP_MOV AVG_MOV
245
%define PALIGNR PALIGNR_MMX
249
%1 sse2_cache64 , avg, 8
250
%define PALIGNR PALIGNR_SSSE3
251
%1 ssse3_cache64, avg, 8
240
INIT_XMM sse2, cache64
242
INIT_XMM ssse3, cache64
256
cglobal_mc %1, %2, mc20, %3, 3,4,9
249
cglobal_mc %1, mc20, %2, 3,4,9
258
251
mova m1, [pw_pixel_max]
259
252
%if num_mmregs > 8
314
307
;-----------------------------------------------------------------------------
315
308
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
316
309
;-----------------------------------------------------------------------------
318
cglobal_mc %1, %2, mc30, %3, 3,5,9
311
cglobal_mc %1, mc30, %2, 3,5,9
320
jmp stub_%2_h264_qpel%3_mc10_10_%1.body
313
jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
325
318
;-----------------------------------------------------------------------------
326
319
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
327
320
;-----------------------------------------------------------------------------
329
cglobal_mc %1, %2, mc10, %3, 3,5,9
322
cglobal_mc %1, mc10, %2, 3,5,9
333
326
mova m1, [pw_pixel_max]
334
327
%if num_mmregs > 8
392
385
;-----------------------------------------------------------------------------
393
386
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
394
387
;-----------------------------------------------------------------------------
399
392
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
406
399
RESET_MM_PERMUTATION
409
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
402
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
415
408
RESET_MM_PERMUTATION
418
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
411
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
424
cglobal_mc %1, %2, mc02, %3, 3,4,8
417
cglobal_mc %1, mc02, %2, 3,4,8
430
423
%assign i (j % 6)
431
call v_filt%3_ %+ i %+ _10_%1.no_addr4
424
call v_filt%2_ %+ i %+ _10.no_addr4
441
434
;-----------------------------------------------------------------------------
442
435
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
443
436
;-----------------------------------------------------------------------------
445
cglobal_mc %1, %2, mc01, %3, 3,5,8
438
cglobal_mc %1, mc01, %2, 3,5,8
454
447
%assign i (j % 6)
455
call v_filt%3_ %+ i %+ _10_%1
448
call v_filt%2_ %+ i %+ _10
467
460
;-----------------------------------------------------------------------------
468
461
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
469
462
;-----------------------------------------------------------------------------
471
cglobal_mc %1, %2, mc03, %3, 3,5,8
464
cglobal_mc %1, mc03, %2, 3,5,8
473
jmp stub_%2_h264_qpel%3_mc01_10_%1.body
466
jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
478
471
;-----------------------------------------------------------------------------
479
472
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
480
473
;-----------------------------------------------------------------------------
481
%macro H_FILT_AVG 3-4
474
%macro H_FILT_AVG 2-3
483
476
;FILT_H with fewer registers and averaged with the FILT_V result
484
477
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
485
478
;unfortunately I need three registers, so m5 will have to be re-read from memory
510
503
RESET_MM_PERMUTATION
513
H_FILT_AVG mmxext, 4, i
517
H_FILT_AVG mmxext, 4, i, 0
520
513
RESET_MM_PERMUTATION
524
H_FILT_AVG sse2, 8, i, 0
526
H_FILT_AVG sse2, 8, i
533
526
; this REALLY needs x86_64
534
cglobal_mc %1, %2, mc11, %3, 3,6,8
527
cglobal_mc %1, mc11, %2, 3,6,8
560
553
;-----------------------------------------------------------------------------
561
554
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
562
555
;-----------------------------------------------------------------------------
564
cglobal_mc %1, %2, mc31, %3, 3,6,8
557
cglobal_mc %1, mc31, %2, 3,6,8
567
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
560
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
572
565
;-----------------------------------------------------------------------------
573
566
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
574
567
;-----------------------------------------------------------------------------
576
cglobal_mc %1, %2, mc13, %3, 3,7,12
569
cglobal_mc %1, mc13, %2, 3,7,12
578
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
571
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
583
576
;-----------------------------------------------------------------------------
584
577
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
585
578
;-----------------------------------------------------------------------------
587
cglobal_mc %1, %2, mc33, %3, 3,6,8
580
cglobal_mc %1, mc33, %2, 3,6,8
590
jmp stub_%2_h264_qpel%3_mc11_10_%1.body
583
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
734
cglobal_mc %1, %2, mc22, %3, 3,7,12
727
cglobal_mc %1, mc22, %2, 3,7,12
735
728
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
736
729
mov r6, rsp ; backup stack pointer
737
730
and rsp, ~(mmsize-1) ; align stack
743
736
mova m7, [pw_pixel_max]
744
737
%if num_mmregs > 8
766
759
;-----------------------------------------------------------------------------
767
760
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
768
761
;-----------------------------------------------------------------------------
770
cglobal_mc %1, %2, mc12, %3, 3,7,12
763
cglobal_mc %1, mc12, %2, 3,7,12
771
764
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
772
765
mov r6, rsp ; backup stack pointer
773
766
and rsp, ~(mmsize-1) ; align stack
782
775
mova m7, [pw_pixel_max]
783
776
%if num_mmregs > 8
811
804
;-----------------------------------------------------------------------------
812
805
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
813
806
;-----------------------------------------------------------------------------
815
cglobal_mc %1, %2, mc32, %3, 3,7,12
808
cglobal_mc %1, mc32, %2, 3,7,12
816
809
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
817
810
mov r6, rsp ; backup stack pointer
818
811
and rsp, ~(mmsize-1) ; align stack
823
816
mov r4d, 2 ; sizeof(pixel)
824
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
817
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
863
cglobal_mc %1, %2, mc21, %3, 3,7,12
856
cglobal_mc %1, mc21, %2, 3,7,12
866
859
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
867
860
mov r6, rsp ; backup stack pointer
868
861
and rsp, ~(mmsize-1) ; align stack
876
869
mov r4d, PAD-mmsize ; H buffer
877
jmp stub_%2_h264_qpel%3_mc12_10_%1.body
870
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
882
875
;-----------------------------------------------------------------------------
883
876
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
884
877
;-----------------------------------------------------------------------------
886
cglobal_mc %1, %2, mc23, %3, 3,7,12
879
cglobal_mc %1, mc23, %2, 3,7,12
888
jmp stub_%2_h264_qpel%3_mc21_10_%1.body
881
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body