;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
26
; Byte-broadcast constants used by the deblocking filters (16 bytes each,
; so they can be used as full XMM operands).
; Defect fixed: stray decimal line-number lines were interleaved between the
; definitions (extraction garbling); a bare digit line is not valid NASM and
; would break assembly, so they are removed.
pb_00: times 16 db 0x00                 ; all-zero mask
pb_01: times 16 db 0x01                 ; +1 per byte (LSB/rounding bit masks)
pb_03: times 16 db 0x03                 ; +3 per byte (rounding in pavgb chains)
pb_a1: times 16 db 0xa1                 ; 0xa1 = 161; bias used by DEBLOCK_P0_Q0
33
; expands to [base],...,[base+7*stride]
; (base3 must be base+3*stride and stride3 must be 3*stride; the caller
; precomputes them so every operand stays a legal x86 addressing mode.)
; Defect fixed: stray decimal line-number lines (extraction garbling) fell
; inside the backslash line-continuations, corrupting the multi-line %define;
; they are removed so the macro expands as intended.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
38
; in: 8 rows of 4 bytes in %1..%8
39
; out: 4 rows of 8 bytes in m0..m3
40
%macro TRANSPOSE4x8_LOAD 8
69
; in: 4 rows of 8 bytes in m0..m3
70
; out: 8 rows of 4 bytes in %1..%8
71
%macro TRANSPOSE8x4_STORE 8
111
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
112
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
113
%macro TRANSPOSE6x8_MEM 9
121
SBUTTERFLY bw, m0, m1, m7
122
SBUTTERFLY bw, m2, m3, m1
123
SBUTTERFLY bw, m4, m5, m3
125
SBUTTERFLY bw, m6, %8, m5
126
SBUTTERFLY wd, m0, m2, m1
127
SBUTTERFLY wd, m4, m6, m2
130
SBUTTERFLY wd, m7, [%9+0x10], m6
131
SBUTTERFLY wd, m3, m5, m4
132
SBUTTERFLY dq, m7, m3, m0
133
SBUTTERFLY dq, m1, m2, m5
142
; in: 8 rows of 8 in %1..%8
143
; out: 8 rows of 8 in %9..%16
144
%macro TRANSPOSE8x8_MEM 16
152
SBUTTERFLY bw, m0, m1, m7
153
SBUTTERFLY bw, m2, m3, m1
154
SBUTTERFLY bw, m4, m5, m3
155
SBUTTERFLY bw, m6, %8, m5
157
SBUTTERFLY wd, m0, m2, m3
158
SBUTTERFLY wd, m4, m6, m2
159
SBUTTERFLY wd, m7, m1, m6
162
SBUTTERFLY wd, m2, m5, m1
163
SBUTTERFLY dq, m0, m4, m5
164
SBUTTERFLY dq, m7, m2, m4
169
SBUTTERFLY dq, m3, %11, m0
170
SBUTTERFLY dq, m6, m1, m5
177
; out: %4 = |%1-%2|>%3
188
; out: %4 = |%1-%2|>%3
209
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
210
; out: m5=beta-1, m7=mask, %3=alpha-1
217
packuswb m4, m4 ; 16x alpha-1
218
packuswb m5, m5 ; 16x beta-1
222
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
223
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
225
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
231
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
234
%macro DEBLOCK_P0_Q0 0
237
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
240
pavgb m3, m0 ; (p1 - q1 + 256)>>1
241
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
243
pavgb m4, m2 ; (q0 - p0 + 256)>>1
245
paddusb m3, m4 ; d+128+33
246
mova m6, [pb_a1 GLOBAL]
248
psubusb m3, [pb_a1 GLOBAL]
258
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
259
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
260
; clobbers: q2, tmp, tc0
264
pavgb %2, %6 ; avg(p2,avg(p0,q0))
266
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
267
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
277
;-----------------------------------------------------------------------------
278
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279
;-----------------------------------------------------------------------------
281
cglobal x264_deblock_v_luma_sse2
287
add r4, r0 ; pix-3*stride
289
mova m0, [r4+r1] ; p1
290
mova m1, [r4+2*r1] ; p0
292
mova m3, [r0+r1] ; q1
296
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
303
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
308
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
310
movdqa m4, [r0+2*r1] ; q2
311
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
316
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
323
;-----------------------------------------------------------------------------
324
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325
;-----------------------------------------------------------------------------
327
cglobal x264_deblock_h_luma_sse2
335
; transpose 6x16 -> tmp space
336
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
339
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
342
; alpha, beta, tc0 are still in r2d, r3d, r4
343
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
344
lea r0, [pix_tmp+0x30]
346
call x264_deblock_v_luma_sse2
348
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
351
movq m0, [pix_tmp+0x18]
352
movq m1, [pix_tmp+0x28]
353
movq m2, [pix_tmp+0x38]
354
movq m3, [pix_tmp+0x48]
355
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
361
movq m0, [pix_tmp+0x10]
362
movq m1, [pix_tmp+0x20]
363
movq m2, [pix_tmp+0x30]
364
movq m3, [pix_tmp+0x40]
365
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
372
%macro DEBLOCK_LUMA 3
373
;-----------------------------------------------------------------------------
374
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
375
;-----------------------------------------------------------------------------
376
cglobal x264_deblock_%2_luma_%1, 5,5
381
add r4, r0 ; pix-3*stride
382
%assign pad 2*%3+12-(stack_offset&15)
385
mova m0, [r4+r1] ; p1
386
mova m1, [r4+2*r1] ; p0
388
mova m3, [r0+r1] ; q1
394
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
395
mova [esp+%3], m4 ; tc
399
mova [esp], m4 ; mask
402
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
404
pand m4, [esp+%3] ; tc
408
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
410
mova m4, [r0+2*r1] ; q2
411
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
412
mova m5, [esp] ; mask
414
mova m5, [esp+%3] ; tc
418
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
426
;-----------------------------------------------------------------------------
427
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
428
;-----------------------------------------------------------------------------
430
cglobal x264_deblock_h_luma_%1, 0,5
436
%assign pad 0x78-(stack_offset&15)
438
%define pix_tmp esp+12
440
; transpose 6x16 -> tmp space
441
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
444
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
447
lea r0, [pix_tmp+0x30]
453
call x264_deblock_%2_luma_%1
455
add dword [esp ], 8 ; pix_tmp+0x38
456
add dword [esp+16], 2 ; tc0+2
457
call x264_deblock_%2_luma_%1
461
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
466
movq m0, [pix_tmp+0x10]
467
movq m1, [pix_tmp+0x20]
468
movq m2, [pix_tmp+0x30]
469
movq m3, [pix_tmp+0x40]
470
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
474
movq m0, [pix_tmp+0x18]
475
movq m1, [pix_tmp+0x28]
476
movq m2, [pix_tmp+0x38]
477
movq m3, [pix_tmp+0x48]
478
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
482
%endmacro ; DEBLOCK_LUMA
485
DEBLOCK_LUMA sse2, v, 16
491
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
496
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
509
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
516
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
520
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
525
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
533
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
541
mova %1, t1 ; store p0
547
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
549
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
554
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
562
mova %2, t0 ; store p1
563
mova %3, t1 ; store p2
566
%macro LUMA_INTRA_SWAP_PQ 0
572
%define mask1p mask1q
575
%macro DEBLOCK_LUMA_INTRA 2
591
%define mask1q [rsp-24]
595
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
600
%define mask0 spill(2)
601
%define mask1p spill(3)
602
%define mask1q spill(4)
603
%define mpb_00 [pb_00 GLOBAL]
604
%define mpb_01 [pb_01 GLOBAL]
607
;-----------------------------------------------------------------------------
608
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
609
;-----------------------------------------------------------------------------
610
cglobal x264_deblock_%2_luma_intra_%1, 4,6
615
lea r5, [r1*3] ; 3*stride
621
add r4, r0 ; pix-4*stride
628
mova mpb_01, [pb_01 GLOBAL]
629
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
630
SWAP 7, 12 ; m12=mask0
632
pavgb t5, mpb_01 ; alpha/4+1
635
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
636
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
637
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
644
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
647
pavgb m4, [pb_00 GLOBAL]
648
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
649
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
651
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
654
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
658
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
660
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
669
;-----------------------------------------------------------------------------
670
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
671
;-----------------------------------------------------------------------------
672
cglobal x264_deblock_h_luma_intra_%1
680
; transpose 8x16 -> tmp space
681
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
684
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
686
lea r0, [pix_tmp+0x40]
688
call x264_deblock_v_luma_intra_%1
690
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
692
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
697
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
701
cglobal x264_deblock_h_luma_intra_%1, 2,4
705
%assign pad 0x8c-(stack_offset&15)
709
; transpose 8x16 -> tmp space
710
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
713
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
715
lea r0, [pix_tmp+0x40]
720
call x264_deblock_%2_luma_intra_%1
722
add dword [rsp], 8 ; pix_tmp+8
723
call x264_deblock_%2_luma_intra_%1
732
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
733
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
736
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
740
%endmacro ; DEBLOCK_LUMA_INTRA
743
DEBLOCK_LUMA_INTRA sse2, v
746
DEBLOCK_LUMA_INTRA mmxext, v8
;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
26
; Byte-broadcast constants used by the deblocking filters (16 bytes each,
; so they can be used as full XMM operands).
; Defect fixed: stray decimal line-number lines were interleaved between the
; definitions (extraction garbling); a bare digit line is not valid NASM and
; would break assembly, so they are removed.
; NOTE(review): this whole section duplicates the first copy earlier in the
; chunk; duplicate labels will clash at assembly time — confirm against the
; upstream file which copy is authoritative.
pb_00: times 16 db 0x00                 ; all-zero mask
pb_01: times 16 db 0x01                 ; +1 per byte (LSB/rounding bit masks)
pb_03: times 16 db 0x03                 ; +3 per byte (rounding in pavgb chains)
pb_a1: times 16 db 0xa1                 ; 0xa1 = 161; bias used by DEBLOCK_P0_Q0
33
; expands to [base],...,[base+7*stride]
; (base3 must be base+3*stride and stride3 must be 3*stride; the caller
; precomputes them so every operand stays a legal x86 addressing mode.)
; Defect fixed: stray decimal line-number lines (extraction garbling) fell
; inside the backslash line-continuations, corrupting the multi-line %define;
; they are removed so the macro expands as intended.
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
38
; in: 8 rows of 4 bytes in %1..%8
39
; out: 4 rows of 8 bytes in m0..m3
40
%macro TRANSPOSE4x8_LOAD 8
69
; in: 4 rows of 8 bytes in m0..m3
70
; out: 8 rows of 4 bytes in %1..%8
71
%macro TRANSPOSE8x4_STORE 8
111
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
112
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
113
%macro TRANSPOSE6x8_MEM 9
121
SBUTTERFLY bw, m0, m1, m7
122
SBUTTERFLY bw, m2, m3, m1
123
SBUTTERFLY bw, m4, m5, m3
125
SBUTTERFLY bw, m6, %8, m5
126
SBUTTERFLY wd, m0, m2, m1
127
SBUTTERFLY wd, m4, m6, m2
130
SBUTTERFLY wd, m7, [%9+0x10], m6
131
SBUTTERFLY wd, m3, m5, m4
132
SBUTTERFLY dq, m7, m3, m0
133
SBUTTERFLY dq, m1, m2, m5
142
; in: 8 rows of 8 in %1..%8
143
; out: 8 rows of 8 in %9..%16
144
%macro TRANSPOSE8x8_MEM 16
152
SBUTTERFLY bw, m0, m1, m7
153
SBUTTERFLY bw, m2, m3, m1
154
SBUTTERFLY bw, m4, m5, m3
155
SBUTTERFLY bw, m6, %8, m5
157
SBUTTERFLY wd, m0, m2, m3
158
SBUTTERFLY wd, m4, m6, m2
159
SBUTTERFLY wd, m7, m1, m6
162
SBUTTERFLY wd, m2, m5, m1
163
SBUTTERFLY dq, m0, m4, m5
164
SBUTTERFLY dq, m7, m2, m4
169
SBUTTERFLY dq, m3, %11, m0
170
SBUTTERFLY dq, m6, m1, m5
177
; out: %4 = |%1-%2|>%3
188
; out: %4 = |%1-%2|>%3
209
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
210
; out: m5=beta-1, m7=mask, %3=alpha-1
217
packuswb m4, m4 ; 16x alpha-1
218
packuswb m5, m5 ; 16x beta-1
222
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
223
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
225
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
231
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
234
%macro DEBLOCK_P0_Q0 0
237
pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
240
pavgb m3, m0 ; (p1 - q1 + 256)>>1
241
pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
243
pavgb m4, m2 ; (q0 - p0 + 256)>>1
245
paddusb m3, m4 ; d+128+33
246
mova m6, [pb_a1 GLOBAL]
248
psubusb m3, [pb_a1 GLOBAL]
258
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
259
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
260
; clobbers: q2, tmp, tc0
264
pavgb %2, %6 ; avg(p2,avg(p0,q0))
266
pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
267
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
277
;-----------------------------------------------------------------------------
278
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279
;-----------------------------------------------------------------------------
281
cglobal x264_deblock_v_luma_sse2
287
add r4, r0 ; pix-3*stride
289
mova m0, [r4+r1] ; p1
290
mova m1, [r4+2*r1] ; p0
292
mova m3, [r0+r1] ; q1
296
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
303
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
308
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
310
movdqa m4, [r0+2*r1] ; q2
311
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
316
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
323
;-----------------------------------------------------------------------------
324
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325
;-----------------------------------------------------------------------------
327
cglobal x264_deblock_h_luma_sse2
335
; transpose 6x16 -> tmp space
336
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
339
TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
342
; alpha, beta, tc0 are still in r2d, r3d, r4
343
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
344
lea r0, [pix_tmp+0x30]
346
call x264_deblock_v_luma_sse2
348
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
351
movq m0, [pix_tmp+0x18]
352
movq m1, [pix_tmp+0x28]
353
movq m2, [pix_tmp+0x38]
354
movq m3, [pix_tmp+0x48]
355
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
361
movq m0, [pix_tmp+0x10]
362
movq m1, [pix_tmp+0x20]
363
movq m2, [pix_tmp+0x30]
364
movq m3, [pix_tmp+0x40]
365
TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
372
%macro DEBLOCK_LUMA 3
373
;-----------------------------------------------------------------------------
374
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
375
;-----------------------------------------------------------------------------
376
cglobal x264_deblock_%2_luma_%1, 5,5
381
add r4, r0 ; pix-3*stride
382
%assign pad 2*%3+12-(stack_offset&15)
385
mova m0, [r4+r1] ; p1
386
mova m1, [r4+2*r1] ; p0
388
mova m3, [r0+r1] ; q1
394
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
395
mova [esp+%3], m4 ; tc
399
mova [esp], m4 ; mask
402
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
404
pand m4, [esp+%3] ; tc
408
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
410
mova m4, [r0+2*r1] ; q2
411
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
412
mova m5, [esp] ; mask
414
mova m5, [esp+%3] ; tc
418
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
426
;-----------------------------------------------------------------------------
427
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
428
;-----------------------------------------------------------------------------
430
cglobal x264_deblock_h_luma_%1, 0,5
436
%assign pad 0x78-(stack_offset&15)
438
%define pix_tmp esp+12
440
; transpose 6x16 -> tmp space
441
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
444
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
447
lea r0, [pix_tmp+0x30]
453
call x264_deblock_%2_luma_%1
455
add dword [esp ], 8 ; pix_tmp+0x38
456
add dword [esp+16], 2 ; tc0+2
457
call x264_deblock_%2_luma_%1
461
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
466
movq m0, [pix_tmp+0x10]
467
movq m1, [pix_tmp+0x20]
468
movq m2, [pix_tmp+0x30]
469
movq m3, [pix_tmp+0x40]
470
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
474
movq m0, [pix_tmp+0x18]
475
movq m1, [pix_tmp+0x28]
476
movq m2, [pix_tmp+0x38]
477
movq m3, [pix_tmp+0x48]
478
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
482
%endmacro ; DEBLOCK_LUMA
485
DEBLOCK_LUMA sse2, v, 16
491
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
496
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
509
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
516
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
520
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
525
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
533
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
541
mova %1, t1 ; store p0
547
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
549
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
554
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
562
mova %2, t0 ; store p1
563
mova %3, t1 ; store p2
566
%macro LUMA_INTRA_SWAP_PQ 0
572
%define mask1p mask1q
575
%macro DEBLOCK_LUMA_INTRA 2
591
%define mask1q [rsp-24]
595
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
600
%define mask0 spill(2)
601
%define mask1p spill(3)
602
%define mask1q spill(4)
603
%define mpb_00 [pb_00 GLOBAL]
604
%define mpb_01 [pb_01 GLOBAL]
607
;-----------------------------------------------------------------------------
608
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
609
;-----------------------------------------------------------------------------
610
cglobal x264_deblock_%2_luma_intra_%1, 4,6
615
lea r5, [r1*3] ; 3*stride
621
add r4, r0 ; pix-4*stride
628
mova mpb_01, [pb_01 GLOBAL]
629
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
630
SWAP 7, 12 ; m12=mask0
632
pavgb t5, mpb_01 ; alpha/4+1
635
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
636
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
637
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
644
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
647
pavgb m4, [pb_00 GLOBAL]
648
pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
649
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
651
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
654
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
658
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
660
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
669
;-----------------------------------------------------------------------------
670
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
671
;-----------------------------------------------------------------------------
672
cglobal x264_deblock_h_luma_intra_%1
680
; transpose 8x16 -> tmp space
681
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
684
TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
686
lea r0, [pix_tmp+0x40]
688
call x264_deblock_v_luma_intra_%1
690
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
692
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
697
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
701
cglobal x264_deblock_h_luma_intra_%1, 2,4
705
%assign pad 0x8c-(stack_offset&15)
709
; transpose 8x16 -> tmp space
710
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
713
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
715
lea r0, [pix_tmp+0x40]
720
call x264_deblock_%2_luma_intra_%1
722
add dword [rsp], 8 ; pix_tmp+8
723
call x264_deblock_%2_luma_intra_%1
732
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
733
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
736
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
740
%endmacro ; DEBLOCK_LUMA_INTRA
743
DEBLOCK_LUMA_INTRA sse2, v
746
DEBLOCK_LUMA_INTRA mmxext, v8