;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
; 16-byte splat constants used by the deblocking filters below.
; NOTE(review): the original file places these in a read-only data section
; (SECTION_RODATA, 16-byte aligned) — the section directive is missing from
; this extract; confirm it is restored before these labels.
pb_00: times 16 db 0x00                 ; all-zero vector
pb_01: times 16 db 0x01                 ; used to isolate the rounding LSB, e.g. (p0^q0)&1
pb_03: times 16 db 0x03                 ; rounding bias in DEBLOCK_P0_Q0's pavgb chain
pb_a1: times 16 db 0xa1                 ; 161 = bias subtracted via psubusb in DEBLOCK_P0_Q0
; PASS8ROWS(base, base3, stride, stride3)
; Expands to 8 memory operands addressing [base], ..., [base+7*stride],
; given base3 = base+3*stride and stride3 = 3*stride (so the last four rows
; are reached from base3 without exceeding x86 scale factors).
; Fix: the backslash line-continuations were severed by stray line-number
; lines injected into the source; the %define must be one logical line.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
38
; in: 8 rows of 4 bytes in %1..%8
39
; out: 4 rows of 8 bytes in m0..m3
40
%macro TRANSPOSE4x8_LOAD 8
69
; in: 4 rows of 8 bytes in m0..m3
70
; out: 8 rows of 4 bytes in %1..%8
71
%macro TRANSPOSE8x4_STORE 8
111
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
112
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
113
%macro TRANSPOSE6x8_MEM 9
121
SBUTTERFLY bw, m0, m1, m7
122
SBUTTERFLY bw, m2, m3, m1
123
SBUTTERFLY bw, m4, m5, m3
125
SBUTTERFLY bw, m6, %8, m5
126
SBUTTERFLY wd, m0, m2, m1
127
SBUTTERFLY wd, m4, m6, m2
130
SBUTTERFLY wd, m7, [%9+0x10], m6
131
SBUTTERFLY wd, m3, m5, m4
132
SBUTTERFLY dq, m7, m3, m0
133
SBUTTERFLY dq, m1, m2, m5
142
; in: 8 rows of 8 in %1..%8
143
; out: 8 rows of 8 in %9..%16
144
%macro TRANSPOSE8x8_MEM 16
152
SBUTTERFLY bw, m0, m1, m7
153
SBUTTERFLY bw, m2, m3, m1
154
SBUTTERFLY bw, m4, m5, m3
155
SBUTTERFLY bw, m6, %8, m5
157
SBUTTERFLY wd, m0, m2, m3
158
SBUTTERFLY wd, m4, m6, m2
159
SBUTTERFLY wd, m7, m1, m6
162
SBUTTERFLY wd, m2, m5, m1
163
SBUTTERFLY dq, m0, m4, m5
164
SBUTTERFLY dq, m7, m2, m4
169
SBUTTERFLY dq, m3, %11, m0
170
SBUTTERFLY dq, m6, m1, m5
177
; out: %4 = |%1-%2|>%3
188
; out: %4 = |%1-%2|>%3
209
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
210
; out: m5=beta-1, m7=mask, %3=alpha-1
217
packuswb m4, m4 ; 16x alpha-1
218
packuswb m5, m5 ; 16x beta-1
222
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
223
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
225
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
231
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
234
%macro DEBLOCK_P0_Q0 0
237
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
240
pavgb m3, m0 ; (p1 - q1 + 256)>>1
241
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
243
pavgb m4, m2 ; (q0 - p0 + 256)>>1
245
paddusb m3, m4 ; d+128+33
246
mova m6, [pb_a1 GLOBAL]
248
psubusb m3, [pb_a1 GLOBAL]
258
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
259
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
260
; clobbers: q2, tmp, tc0
264
pavgb %2, %6 ; avg(p2,avg(p0,q0))
266
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
267
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
277
;-----------------------------------------------------------------------------
278
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279
;-----------------------------------------------------------------------------
281
cglobal x264_deblock_v_luma_sse2, 5,5,10
287
add r4, r0 ; pix-3*stride
289
mova m0, [r4+r1] ; p1
290
mova m1, [r4+2*r1] ; p0
292
mova m3, [r0+r1] ; q1
296
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
303
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
308
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
310
movdqa m4, [r0+2*r1] ; q2
311
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
316
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
323
;-----------------------------------------------------------------------------
324
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325
;-----------------------------------------------------------------------------
327
cglobal x264_deblock_h_luma_sse2, 5,7
334
%define pix_tmp rsp+0x30
340
; transpose 6x16 -> tmp space
341
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
344
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
347
; alpha, beta, tc0 are still in r2d, r3d, r4
348
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
349
lea r0, [pix_tmp+0x30]
354
call x264_deblock_v_luma_sse2
356
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
359
movq m0, [pix_tmp+0x18]
360
movq m1, [pix_tmp+0x28]
361
movq m2, [pix_tmp+0x38]
362
movq m3, [pix_tmp+0x48]
363
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
369
movq m0, [pix_tmp+0x10]
370
movq m1, [pix_tmp+0x20]
371
movq m2, [pix_tmp+0x30]
372
movq m3, [pix_tmp+0x40]
373
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
384
%macro DEBLOCK_LUMA 3
385
;-----------------------------------------------------------------------------
386
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
387
;-----------------------------------------------------------------------------
388
cglobal x264_deblock_%2_luma_%1, 5,5
393
add r4, r0 ; pix-3*stride
394
%assign pad 2*%3+12-(stack_offset&15)
397
mova m0, [r4+r1] ; p1
398
mova m1, [r4+2*r1] ; p0
400
mova m3, [r0+r1] ; q1
406
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
407
mova [esp+%3], m4 ; tc
411
mova [esp], m4 ; mask
414
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
416
pand m4, [esp+%3] ; tc
420
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
422
mova m4, [r0+2*r1] ; q2
423
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
424
mova m5, [esp] ; mask
426
mova m5, [esp+%3] ; tc
430
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
438
;-----------------------------------------------------------------------------
439
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
440
;-----------------------------------------------------------------------------
442
cglobal x264_deblock_h_luma_%1, 0,5
448
%assign pad 0x78-(stack_offset&15)
450
%define pix_tmp esp+12
452
; transpose 6x16 -> tmp space
453
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
456
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
459
lea r0, [pix_tmp+0x30]
465
call x264_deblock_%2_luma_%1
467
add dword [esp ], 8 ; pix_tmp+0x38
468
add dword [esp+16], 2 ; tc0+2
469
call x264_deblock_%2_luma_%1
473
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
478
movq m0, [pix_tmp+0x10]
479
movq m1, [pix_tmp+0x20]
480
movq m2, [pix_tmp+0x30]
481
movq m3, [pix_tmp+0x40]
482
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
486
movq m0, [pix_tmp+0x18]
487
movq m1, [pix_tmp+0x28]
488
movq m2, [pix_tmp+0x38]
489
movq m3, [pix_tmp+0x48]
490
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
494
%endmacro ; DEBLOCK_LUMA
; Instantiate the luma deblock macro for SSE2, full-width (v) variant,
; with a 16-byte stack-spill unit.
; NOTE(review): in the original file this instantiation sits under
; arch/ISA conditionals (and INIT_* state) that are missing from this
; extract — confirm the surrounding %ifdef guards before assembling.
DEBLOCK_LUMA sse2, v, 16
503
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
508
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
521
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
528
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
532
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
537
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
545
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
553
mova %1, t1 ; store p0
559
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
561
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
566
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
574
mova %2, t0 ; store p1
575
mova %3, t1 ; store p2
578
%macro LUMA_INTRA_SWAP_PQ 0
584
%define mask1p mask1q
587
%macro DEBLOCK_LUMA_INTRA 2
603
%define mask1q [rsp-24]
607
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
612
%define mask0 spill(2)
613
%define mask1p spill(3)
614
%define mask1q spill(4)
615
%define mpb_00 [pb_00 GLOBAL]
616
%define mpb_01 [pb_01 GLOBAL]
619
;-----------------------------------------------------------------------------
620
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
621
;-----------------------------------------------------------------------------
622
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
627
lea r5, [r1*3] ; 3*stride
633
add r4, r0 ; pix-4*stride
640
mova mpb_01, [pb_01 GLOBAL]
641
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
642
SWAP 7, 12 ; m12=mask0
644
pavgb t5, mpb_01 ; alpha/4+1
647
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
648
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
649
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
656
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
659
pavgb m4, [pb_00 GLOBAL]
660
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
661
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
663
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
666
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
670
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
672
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
681
;-----------------------------------------------------------------------------
682
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
683
;-----------------------------------------------------------------------------
684
cglobal x264_deblock_h_luma_intra_%1, 4,7
692
; transpose 8x16 -> tmp space
693
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
696
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
698
lea r0, [pix_tmp+0x40]
700
call x264_deblock_v_luma_intra_%1
702
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
704
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
709
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
713
cglobal x264_deblock_h_luma_intra_%1, 2,4
717
%assign pad 0x8c-(stack_offset&15)
721
; transpose 8x16 -> tmp space
722
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
725
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
727
lea r0, [pix_tmp+0x40]
732
call x264_deblock_%2_luma_intra_%1
734
add dword [rsp], 8 ; pix_tmp+8
735
call x264_deblock_%2_luma_intra_%1
744
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
745
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
748
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
752
%endmacro ; DEBLOCK_LUMA_INTRA
; Instantiate the intra luma deblock macro: SSE2 full-width (v) and
; MMXEXT half-width (v8) variants.
; NOTE(review): the jump in original line numbering (755 -> 758) shows
; that lines between these two instantiations (likely %ifdef ARCH guards
; and INIT_XMM/INIT_MMX) are missing from this extract — the two variants
; are normally mutually exclusive per build; restore the guards before use.
DEBLOCK_LUMA_INTRA sse2, v
DEBLOCK_LUMA_INTRA mmxext, v8