344
347
; transpose 6x16 -> tmp space
345
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
348
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
348
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
351
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
350
353
; vertical filter
351
354
; alpha, beta, tc0 are still in r2d, r3d, r4
352
; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
355
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
353
356
lea r0, [pix_tmp+0x30]
356
359
mov [rsp+0x20], r4
358
call deblock_v_luma_8_%1
361
call deblock_v_luma_8
360
363
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
695
697
LUMA_INTRA_SWAP_PQ
696
698
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
705
704
;-----------------------------------------------------------------------------
706
705
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
707
706
;-----------------------------------------------------------------------------
708
cglobal deblock_h_luma_intra_8_%1, 4,7
707
cglobal deblock_h_luma_intra_8, 4,9
714
713
%define pix_tmp rsp
716
715
; transpose 8x16 -> tmp space
717
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
720
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
716
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
719
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
722
721
lea r0, [pix_tmp+0x40]
724
call deblock_v_luma_intra_8_%1
723
call deblock_v_luma_intra_8
726
725
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
728
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
733
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
727
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
732
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
737
cglobal deblock_h_luma_intra_8_%1, 2,4
736
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
741
%assign pad 0x8c-(stack_offset&15)
743
740
%define pix_tmp rsp
745
742
; transpose 8x16 -> tmp space
911
;-----------------------------------------------------------------------------
912
; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
913
; int8_t ref[2][40], int16_t mv[2][40][2],
914
; int bidir, int edges, int step,
915
; int mask_mv0, int mask_mv1, int field);
923
;-----------------------------------------------------------------------------
924
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
925
; dir, d_idx, mask_dir, bidir
933
xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
938
test b_idxd, dword mask_mvd
939
jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
941
movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
942
punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
943
pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
944
pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
945
pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
946
psubb m0, m2 ; { ref0[b] != ref0[bn],
947
; ref0[b] != ref1[bn] }
948
psubb m1, m3 ; { ref1[b] != ref1[bn],
949
; ref1[b] != ref0[bn] }
952
mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
953
mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
956
psubw m1, [mvq+b_idxq*4+12*4]
957
psubw m2, [mvq+b_idxq*4+12*4+mmsize]
958
psubw m3, [mvq+b_idxq*4+52*4]
959
psubw m4, [mvq+b_idxq*4+52*4+mmsize]
964
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
969
mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
970
mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
973
psubw m1, [mvq+b_idxq*4+12*4]
974
psubw m2, [mvq+b_idxq*4+12*4+mmsize]
975
psubw m3, [mvq+b_idxq*4+52*4]
976
psubw m4, [mvq+b_idxq*4+52*4+mmsize]
981
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
990
movd m0, [refq+b_idxq+12]
991
psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
993
mova m1, [mvq+b_idxq*4+12*4]
994
mova m2, [mvq+b_idxq*4+12*4+mmsize]
995
psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
996
psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
999
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
1002
%endif ; bidir == 1/0
1005
movd m1, [nnzq+b_idxq+12]
1006
por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
1014
movq [bsq+b_idxq+32*dir], m1
1016
add b_idxd, dword stepd
1017
cmp b_idxd, dword edgesd
1022
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
1023
step, mask_mv0, mask_mv1, field
1024
%define b_idxq bidirq
1025
%define b_idxd bidird
1038
%define mask_mv0d mask_mv0m
1039
%define mask_mv1d mask_mv1m
1041
shl dword mask_mv1d, 3
1042
shl dword mask_mv0d, 3
1046
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
1047
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0
1049
mova m0, [bsq+mmsize*0]
1050
mova m1, [bsq+mmsize*1]
1051
mova m2, [bsq+mmsize*2]
1052
mova m3, [bsq+mmsize*3]
1053
TRANSPOSE4x4W 0, 1, 2, 3, 4
1054
mova [bsq+mmsize*0], m0
1055
mova [bsq+mmsize*1], m1
1056
mova [bsq+mmsize*2], m2
1057
mova [bsq+mmsize*3], m3
1061
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
1062
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1
1064
mova m0, [bsq+mmsize*0]
1065
mova m1, [bsq+mmsize*1]
1066
mova m2, [bsq+mmsize*2]
1067
mova m3, [bsq+mmsize*3]
1068
TRANSPOSE4x4W 0, 1, 2, 3, 4
1069
mova [bsq+mmsize*0], m0
1070
mova [bsq+mmsize*1], m1
1071
mova [bsq+mmsize*2], m2
1072
mova [bsq+mmsize*3], m3