332
338
mova [r0+r1*0], m0
333
339
mova [r0+r1*1], m1
334
340
lea r0, [r0+r1*2]
345
;-----------------------------------------------------------------------------
346
; void pred8x8_dc(pixel *src, int stride)
347
;-----------------------------------------------------------------------------
349
; sort of a hack, but it works
359
; DC prediction entry point for the 8x8 block, "_10" symbol variant.
; %1 is the instruction-set suffix appended to the symbol name; %2 is the
; word-shuffle instruction to use (pshufw or pshuflw, per the two
; instantiations at the end of this block).
; NOTE(review): partial listing - the enclosing %macro/%endmacro, the register
; setup, the adds that combine the loads and the stores are not shown here.
cglobal pred8x8_dc_10_%1, 2,6
369
; imm 00001110b: destination words 0,1 take source words 2,3 (upper half of
; the 64-bit register moved down); words 2,3 take source word 0.
pshufw m2, m0, 00001110b
370
pshufw m3, m1, 00001110b
380
; Eight 16-bit loads, each from the two bytes immediately preceding a row
; address (offset -2). Four rows are addressed relative to r0/r4, then four
; more relative to r4. r3d is overwritten by consecutive loads, so each value
; is presumably accumulated by instructions omitted from this listing.
movzx r2d, word [r0+r1*1-2]
381
movzx r3d, word [r0+r1*2-2]
383
movzx r3d, word [r0+r5*1-2]
385
movzx r3d, word [r4-2]
389
movzx r2d, word [r4+r1*1-2]
390
movzx r3d, word [r4+r1*2-2]
392
movzx r3d, word [r4+r5*1-2]
394
movzx r3d, word [r4+r1*4-2]
399
punpckldq m0, m2 ; s0, s1, s2, s3
400
; The shuffle immediates below select, per destination word, source words
; {2,1,3,3} and {0,1,3,1} respectively - matching the s-labels in the comments.
%2 m3, m0, 11110110b ; s2, s1, s3, s3
401
%2 m0, m0, 01110100b ; s0, s1, s3, s1
404
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
407
; imm 11111010b: destination dwords = source dwords {2, 2, 3, 3}
pshufd m3, m0, 11111010b
428
; Instantiations: the mmxext build shuffles with pshufw (MMX registers), the
; sse2 build with pshuflw (low four words of an XMM register).
PRED8x8_DC mmxext, pshufw
430
PRED8x8_DC sse2 , pshuflw
432
;-----------------------------------------------------------------------------
433
; void pred8x8_top_dc(pixel *src, int stride)
434
;-----------------------------------------------------------------------------
436
; Only the entry declaration of this function is visible in this listing.
; cglobal operands (x86inc convention): 2 arguments - src and stride per the
; header comment - and 4 general-purpose registers.
cglobal pred8x8_top_dc_10_sse2, 2,4
459
;-----------------------------------------------------------------------------
460
; void pred8x8_plane(pixel *src, int stride)
461
;-----------------------------------------------------------------------------
463
; Plane prediction entry point for the 8x8 block, SSE2 build.
; NOTE(review): partial listing - register setup, the arithmetic between the
; loads and the store loop are not shown.
; The src[] annotations below are self-consistent with r1 = stride,
; r0 = src-stride, r2 = 3*stride and r3 = src+3*stride; the instructions that
; establish those values are outside this listing - confirm.
cglobal pred8x8_plane_10_sse2, 2,7,7
468
; pmaddwd: multiply signed words of m2 by the constant table and add adjacent
; products into dwords.
pmaddwd m2, [pw_m32101234]
476
psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
477
; Left-column samples are fetched as row pairs mirrored about row 3:
; (4,2), (5,1), (6,0), (7,-1). Each load reads the 16-bit value two bytes
; before the row address. Presumably each pair is differenced by omitted
; instructions to form the vertical gradient - confirm.
movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
478
movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
480
movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
481
movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
484
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
485
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
489
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
490
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
499
; presumably the upper clamp bound for output samples - confirm against its use
mova m3, [pw_pixel_max]
504
pmullw m2, [pw_m32101234] ; b
505
pmullw m5, m4, [pw_m3] ; c
522
;-----------------------------------------------------------------------------
523
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
524
;-----------------------------------------------------------------------------
525
; Emits pred8x8l_128_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the stores and %endmacro are not shown.
%macro PRED8x8L_128_DC 1
525
; cglobal operands (x86inc convention): 4 arguments, 4 general-purpose registers
cglobal pred8x8l_128_dc_10_%1, 4,4
526
; 512 = 1<<9, i.e. BIT_DEPTH = 10, matching the "_10" in the symbol name
mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
527
542
; the only instantiation visible in this listing
PRED8x8L_128_DC mmxext
546
;-----------------------------------------------------------------------------
547
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
548
;-----------------------------------------------------------------------------
549
; Emits pred8x8l_top_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the loads that fill m0..m2, the code that
; derives r1/r2, the summation/stores and %endmacro are not shown.
%macro PRED8x8L_TOP_DC 1
549
cglobal pred8x8l_top_dc_10_%1, 4,4,6
550
558
; Patch the two end lanes before filtering: word 0 of m1 is replaced from
; [r0+r1] and word 7 of m2 from [r0+r2+14] (+14 is the byte offset of the
; eighth 16-bit sample). r1/r2 are presumably offsets derived from
; has_topleft/has_topright - confirm.
pinsrw m1, [r0+r1], 0
559
pinsrw m2, [r0+r2+14], 7
562
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m0, m2, m1, m0
585
;-----------------------------------------------------------------------------
586
; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
587
;-----------------------------------------------------------------------------
588
;TODO: see if scalar is faster
590
; DC prediction entry point for the 8x8 luma block; %1 is the instruction-set
; suffix appended to the symbol name.
; NOTE(review): partial listing - the enclosing %macro, the setup of r4/r5 and
; r1/r2, the merges between the loads and the stores are not shown.
cglobal pred8x8l_dc_10_%1, 4,6,6
594
; Each memory operand below is the 16 bytes ending at a row address (offset
; -16), so with 16-byte registers its top word is the 16-bit value immediately
; before that row. punpckhwd interleaves the upper halves, leaving the two
; rows' values in the top two words of the destination.
mova m0, [r0+r3*2-16]
595
punpckhwd m0, [r0+r3*1-16]
596
mova m1, [r4+r3*0-16]
597
punpckhwd m1, [r0+r5*1-16]
599
mova m2, [r4+r3*2-16]
600
punpckhwd m2, [r4+r3*1-16]
601
mova m3, [r4+r3*4-16]
602
punpckhwd m3, [r4+r5*1-16]
611
; replace word 0 of m1 and word 7 of m2 from memory; r1/r2 are presumably
; offsets derived from has_topleft/has_topright - confirm
pinsrw m1, [r0+r1], 0
612
pinsrw m2, [r0+r2+14], 7
617
; imm 11100101b: low words become {w1, w1, w2, w3} (word 1 copied into word 0)
pshuflw m4, m4, 11100101b
618
pinsrw m5, [r0+r1-2], 7
619
; PRED4x4_LOWPASS is defined outside this listing; it is applied twice here,
; with m3 and m0 as the respective destinations
PRED4x4_LOWPASS m3, m4, m5, m3
620
PRED4x4_LOWPASS m0, m2, m1, m0
644
;-----------------------------------------------------------------------------
645
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
646
;-----------------------------------------------------------------------------
647
; Emits pred8x8l_vertical_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the loads that fill m0..m2, the code that
; derives r1/r2, the row stores and %endmacro are not shown.
%macro PRED8x8L_VERTICAL 1
648
cglobal pred8x8l_vertical_10_%1, 4,4,6
656
; replace word 0 of m1 from [r0+r1] and word 7 of m2 from [r0+r2+14];
; r1/r2 are presumably offsets derived from has_topleft/has_topright - confirm
pinsrw m1, [r0+r1], 0
657
pinsrw m2, [r0+r2+14], 7
660
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m0, m2, m1, m0
673
; instantiations visible in this listing
PRED8x8L_VERTICAL sse2
676
PRED8x8L_VERTICAL avx
679
;-----------------------------------------------------------------------------
680
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
681
;-----------------------------------------------------------------------------
682
; Emits pred8x8l_horizontal_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the first load into m0, the setup of r1/r2,
; the merges between the loads, the row stores and %endmacro are not shown.
%macro PRED8x8L_HORIZONTAL 1
683
cglobal pred8x8l_horizontal_10_%1, 4,4,5
689
; Each memory operand is the 16 bytes ending at a row address (offset -16), so
; with 16-byte registers its top word is the 16-bit value immediately before
; that row. punpckhwd interleaves the upper halves, pairing two rows' values.
punpckhwd m0, [r0+r1-16]
690
mova m1, [r0+r3*2-16]
691
punpckhwd m1, [r0+r3*1-16]
695
mova m2, [r2+r3*0-16]
696
; NOTE(review): same address expression as the first punpckhwd above; r1 is
; presumably reassigned by an omitted instruction in between - confirm.
punpckhwd m2, [r0+r1-16]
697
mova m3, [r2+r3*2-16]
698
punpckhwd m3, [r2+r3*1-16]
701
; PALIGNR is a macro selected by the %define lines below; the last operand is
; presumably a scratch register for the non-SSSE3 emulation - confirm
PALIGNR m4, m3, [r2+r1-16], 14, m0
703
; imm 11100101b: low words become {w1, w1, w2, w3} (word 1 copied into word 0)
pshuflw m0, m0, 11100101b
704
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m4, m3, m0, m4
727
; PALIGNR is rebound before each instantiation so the macro body expands with
; the matching implementation. No rebinding is visible before the avx
; instantiation, so as listed it expands with PALIGNR_SSSE3.
%define PALIGNR PALIGNR_MMX
728
PRED8x8L_HORIZONTAL sse2
729
%define PALIGNR PALIGNR_SSSE3
730
PRED8x8L_HORIZONTAL ssse3
733
PRED8x8L_HORIZONTAL avx
736
;-----------------------------------------------------------------------------
737
; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
738
;-----------------------------------------------------------------------------
739
; Emits pred8x8l_down_left_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the initial loads, the shr that sets the
; flags tested below, the .fix_tr path, the row stores and %endmacro are not
; shown.
%macro PRED8x8L_DOWN_LEFT 1
740
cglobal pred8x8l_down_left_10_%1, 4,4,7
748
; replace word 0 of m1 from [r0+r1] and word 7 of m2 from [r0+r2+14];
; r1/r2 are presumably offsets derived from has_topleft/has_topright - confirm
pinsrw m1, [r0+r1], 0
749
pinsrw m2, [r0+r2+14], 7
750
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m6, m2, m1, m3
751
; The branch consumes flags left by an earlier shr on r2d, so nothing between
; that shr and this jz may modify EFLAGS (the pinsrw instructions above do not;
; presumably the LOWPASS macro expands to SIMD instructions only - confirm).
jz .fix_tr ; flags from shr r2d
754
PALIGNR m2, m1, m3, 14, m3
755
; imm 10100100b: high words become {w4, w5, w6, w6} (word 6 copied into word 7)
pshufhw m5, m5, 10100100b
756
PRED4x4_LOWPASS m1, m2, m5, m1
761
PALIGNR m2, m1, m6, 2, m0
762
PALIGNR m3, m1, m6, 14, m0
763
PALIGNR m5, m1, 2, m0
765
PRED4x4_LOWPASS m6, m4, m2, m6
766
PRED4x4_LOWPASS m1, m3, m5, m1
768
; Seven identical 14-byte PALIGNR steps on m1 with m6 follow; presumably each
; one produces the next output row, with the stores omitted from this listing.
; The first six use m2 as the scratch operand.
PALIGNR m1, m6, 14, m2
771
PALIGNR m1, m6, 14, m2
774
PALIGNR m1, m6, 14, m2
777
PALIGNR m1, m6, 14, m2
780
PALIGNR m1, m6, 14, m2
783
PALIGNR m1, m6, 14, m2
786
; final step passes m6 itself as the scratch operand
PALIGNR m1, m6, 14, m6
796
; PALIGNR is rebound before each instantiation so the macro body expands with
; the matching implementation. No rebinding is visible before the avx
; instantiation, so as listed it expands with PALIGNR_SSSE3.
%define PALIGNR PALIGNR_MMX
797
PRED8x8L_DOWN_LEFT sse2
798
%define PALIGNR PALIGNR_SSSE3
799
PRED8x8L_DOWN_LEFT ssse3
802
PRED8x8L_DOWN_LEFT avx
805
;-----------------------------------------------------------------------------
806
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
807
;-----------------------------------------------------------------------------
808
; Emits pred8x8l_down_right_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the setup of r1/r2/r4, the merges between
; the loads, the top-row loads, the row stores and %endmacro are not shown.
%macro PRED8x8L_DOWN_RIGHT 1
809
; standard forbids this when has_topleft is false
811
cglobal pred8x8l_down_right_10_%1, 4,5,8
815
; Each memory operand is the 16 bytes ending at a row address (offset -16), so
; with 16-byte registers its top word is the 16-bit value immediately before
; that row. punpckhwd interleaves the upper halves, pairing two rows' values.
mova m0, [r0+r3*1-16]
816
punpckhwd m0, [r0+r3*0-16]
817
mova m1, [r0+r1*1-16]
818
punpckhwd m1, [r0+r3*2-16]
820
mova m2, [r4+r3*1-16]
821
punpckhwd m2, [r4+r3*0-16]
822
mova m3, [r4+r1*1-16]
823
punpckhwd m3, [r4+r3*2-16]
826
mova m0, [r4+r3*4-16]
828
; PALIGNR is a macro selected by the %define lines below
PALIGNR m4, m3, m0, 14, m0
829
PALIGNR m1, m3, 2, m2
831
; imm 11100101b: low words become {w1, w1, w2, w3} (word 1 copied into word 0)
pshuflw m0, m0, 11100101b
832
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m6, m1, m4, m3
833
PRED4x4_LOWPASS m4, m3, m0, m4
839
; replace word 7 of m2 from [r0+r2+14]; r2 is presumably an offset derived
; from has_topright - confirm
pinsrw m2, [r0+r2+14], 7
840
PRED4x4_LOWPASS m3, m2, m1, m3
841
PALIGNR m2, m3, m6, 2, m0
842
PALIGNR m5, m3, m6, 14, m0
844
PRED4x4_LOWPASS m6, m4, m2, m6
845
PRED4x4_LOWPASS m3, m5, m7, m3
847
; Seven identical 14-byte PALIGNR steps on m3 with m6 follow; presumably each
; one produces the next output row, with the stores omitted from this listing.
; The first six use m2 as the scratch operand.
PALIGNR m3, m6, 14, m2
850
PALIGNR m3, m6, 14, m2
853
PALIGNR m3, m6, 14, m2
856
PALIGNR m3, m6, 14, m2
859
PALIGNR m3, m6, 14, m2
862
PALIGNR m3, m6, 14, m2
865
; final step passes m6 itself as the scratch operand
PALIGNR m3, m6, 14, m6
871
; PALIGNR is rebound before each instantiation so the macro body expands with
; the matching implementation. No rebinding is visible before the avx
; instantiation, so as listed it expands with PALIGNR_SSSE3.
%define PALIGNR PALIGNR_MMX
872
PRED8x8L_DOWN_RIGHT sse2
873
%define PALIGNR PALIGNR_SSSE3
874
PRED8x8L_DOWN_RIGHT ssse3
877
PRED8x8L_DOWN_RIGHT avx
880
;-----------------------------------------------------------------------------
881
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
882
;-----------------------------------------------------------------------------
883
; Emits pred8x8l_vertical_right_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the setup of r1/r2/r4, the merges between
; the loads, the top-row loads, the row stores and %endmacro are not shown.
%macro PRED8x8L_VERTICAL_RIGHT 1
884
; as with pred8x8l_down_right: the standard forbids this mode when
; has_topleft is false
885
cglobal pred8x8l_vertical_right_10_%1, 4,5,7
889
; Each memory operand is the 16 bytes ending at a row address (offset -16), so
; with 16-byte registers its top word is the 16-bit value immediately before
; that row. punpckhwd interleaves the upper halves, pairing two rows' values.
mova m0, [r0+r3*1-16]
890
punpckhwd m0, [r0+r3*0-16]
891
mova m1, [r0+r1*1-16]
892
punpckhwd m1, [r0+r3*2-16]
894
mova m2, [r4+r3*1-16]
895
punpckhwd m2, [r4+r3*0-16]
896
mova m3, [r4+r1*1-16]
897
punpckhwd m3, [r4+r3*2-16]
900
mova m0, [r4+r3*4-16]
902
; PALIGNR is a macro selected by the %define lines below
PALIGNR m4, m3, m0, 14, m0
903
PALIGNR m1, m3, 2, m2
904
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m3, m1, m4, m3
910
; replace word 7 of m5 from [r0+r2+14]; r2 is presumably an offset derived
; from has_topright - confirm
pinsrw m5, [r0+r2+14], 7
911
PRED4x4_LOWPASS m2, m5, m1, m2
912
PALIGNR m6, m2, m3, 12, m1
913
PALIGNR m5, m2, m3, 14, m0
914
PRED4x4_LOWPASS m0, m6, m2, m5
920
PRED4x4_LOWPASS m1, m3, m6, m1
921
; Six 14-byte PALIGNR steps follow, alternating between m2 and m0 as the
; destination, both taking m1 as the second operand; presumably one output
; row per step, with the stores omitted from this listing.
PALIGNR m2, m1, 14, m4
924
PALIGNR m0, m1, 14, m3
927
PALIGNR m2, m1, 14, m4
930
PALIGNR m0, m1, 14, m3
933
PALIGNR m2, m1, 14, m4
936
; final step passes m1 itself as the scratch operand
PALIGNR m0, m1, 14, m1
942
; PALIGNR is rebound before each instantiation so the macro body expands with
; the matching implementation. No rebinding is visible before the avx
; instantiation, so as listed it expands with PALIGNR_SSSE3.
%define PALIGNR PALIGNR_MMX
943
PRED8x8L_VERTICAL_RIGHT sse2
944
%define PALIGNR PALIGNR_SSSE3
945
PRED8x8L_VERTICAL_RIGHT ssse3
948
PRED8x8L_VERTICAL_RIGHT avx
951
;-----------------------------------------------------------------------------
952
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
953
;-----------------------------------------------------------------------------
954
; Emits pred8x8l_horizontal_up_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the setup of r1/r2, the merges between the
; loads, the code that produces m5, the row stores and %endmacro are not shown.
%macro PRED8x8L_HORIZONTAL_UP 1
955
cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
956
; Each memory operand is the 16 bytes ending at a row address (offset -16), so
; with 16-byte registers its top word is the 16-bit value immediately before
; that row. punpckhwd interleaves the upper halves, pairing two rows' values.
mova m0, [r0+r3*0-16]
957
punpckhwd m0, [r0+r3*1-16]
962
mova m4, [r0+r1*1-16]
965
mova m1, [r0+r3*2-16]
966
punpckhwd m1, [r0+r1*1-16]
968
mova m2, [r2+r3*0-16]
969
punpckhwd m2, [r2+r3*1-16]
970
mova m3, [r2+r3*2-16]
971
punpckhwd m3, [r2+r1*1-16]
974
; PALIGNR is a macro selected by the %define lines below
PALIGNR m1, m0, m4, 14, m4
976
; imm 10100100b: high words become {w4, w5, w6, w6} (word 6 copied into word 7)
pshufhw m2, m2, 10100100b
977
; PRED4x4_LOWPASS is defined outside this listing
PRED4x4_LOWPASS m0, m1, m2, m0
980
pshufhw m1, m1, 10100100b
981
; imm 01010100b: high words become {w4, w5, w5, w5}
pshufhw m2, m2, 01010100b
983
PRED4x4_LOWPASS m1, m2, m0, m1
988
; dword selections from m5: {d1,d2,d3,d3}, {d2,d3,d3,d3}, {d3,d3,d3,d3}
pshufd m0, m5, 11111001b
989
pshufd m1, m5, 11111110b
990
pshufd m2, m5, 11111111b
994
; three PALIGNR steps on m5 with m4 at byte shifts 4, 8 and 12; the last one
; overwrites m5 and passes m4 as the scratch operand
PALIGNR m2, m5, m4, 4, m0
995
PALIGNR m3, m5, m4, 8, m1
996
PALIGNR m5, m5, m4, 12, m4
1004
; PALIGNR is rebound before each instantiation so the macro body expands with
; the matching implementation. No rebinding is visible before the avx
; instantiation, so as listed it expands with PALIGNR_SSSE3.
%define PALIGNR PALIGNR_MMX
1005
PRED8x8L_HORIZONTAL_UP sse2
1006
%define PALIGNR PALIGNR_SSSE3
1007
PRED8x8L_HORIZONTAL_UP ssse3
1010
PRED8x8L_HORIZONTAL_UP avx
1014
;-----------------------------------------------------------------------------
1015
; void pred16x16_vertical(pixel *src, int stride)
1016
;-----------------------------------------------------------------------------
1019
mova [%1+mmsize], %3
1026
; Emits pred16x16_vertical_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the other row loads, the loop control and
; %endmacro are not shown.
%macro PRED16x16_VERTICAL 1
1027
cglobal pred16x16_vertical_10_%1, 2,3
1031
; second register-sized chunk (byte offset mmsize) of the row addressed by r0
mova m1, [r0+mmsize]
1037
; MOV16 is defined outside this block; it takes a destination address and four
; registers. The same m0..m3 are passed for the rows at r0+r1*1 and r0+r1*2,
; so presumably both rows receive identical contents.
MOV16 r0+r1*1, m0, m1, m2, m3
1038
MOV16 r0+r1*2, m0, m1, m2, m3
1046
; instantiations visible in this listing
PRED16x16_VERTICAL mmxext
1048
PRED16x16_VERTICAL sse2
1050
;-----------------------------------------------------------------------------
1051
; void pred16x16_horizontal(pixel *src, int stride)
1052
;-----------------------------------------------------------------------------
1053
; Emits pred16x16_horizontal_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the instructions between the loads and the
; stores, the loop control and %endmacro are not shown.
%macro PRED16x16_HORIZONTAL 1
1053
cglobal pred16x16_horizontal_10_%1, 2,3
1057
; 4-byte loads ending at the start of rows r0 and r0+r1, i.e. the four bytes
; immediately preceding each row
movd m0, [r0+r1*0-4]
1058
movd m1, [r0+r1*1-4]
1061
; MOV16 is defined outside this block; here every register argument is the
; same, so presumably the whole row is filled with that register's contents
; (m0 for the first row, m1 for the second).
MOV16 r0+r1*0, m0, m0, m0, m0
1062
MOV16 r0+r1*1, m1, m1, m1, m1
1070
; instantiations visible in this listing
PRED16x16_HORIZONTAL mmxext
1072
PRED16x16_HORIZONTAL sse2
1074
;-----------------------------------------------------------------------------
1075
; void pred16x16_dc(pixel *src, int stride)
1076
;-----------------------------------------------------------------------------
1077
; Emits pred16x16_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the pointer adjustments, the accumulation,
; the rounding/broadcast, the loop control and %endmacro are not shown.
%macro PRED16x16_DC 1
1077
cglobal pred16x16_dc_10_%1, 2,6
1082
; add the words at byte offset mmsize from r0 into m0
paddw m0, [r0+mmsize]
1090
; 16-bit scalar loads from [r0] and [r0+r1]. The same two addresses are read
; again below, so r0 is presumably advanced by omitted instructions in between
; and presumably points at the left-neighbour column at this stage - confirm.
movzx r3d, word [r0]
1091
movzx r4d, word [r0+r1]
1094
movzx r2d, word [r0]
1096
movzx r2d, word [r0+r1]
1107
; MOV16 is defined outside this block; the rows at r5 and r5+r1 are written
; with m0 in every register slot. r5 is presumably a saved block pointer.
MOV16 r5+r1*0, m0, m0, m0, m0
1108
MOV16 r5+r1*1, m0, m0, m0, m0
1120
;-----------------------------------------------------------------------------
1121
; void pred16x16_top_dc(pixel *src, int stride)
1122
;-----------------------------------------------------------------------------
1123
; Emits pred16x16_top_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the remaining summation, the
; rounding/broadcast, the loop control and %endmacro are not shown.
%macro PRED16x16_TOP_DC 1
1123
cglobal pred16x16_top_dc_10_%1, 2,3
1127
; add the words at byte offset mmsize from r0 into m0
paddw m0, [r0+mmsize]
1139
; MOV16 is defined outside this block; the rows at r0+r1*1 and r0+r1*2 are
; written with m0 in every register slot.
MOV16 r0+r1*1, m0, m0, m0, m0
1140
MOV16 r0+r1*2, m0, m0, m0, m0
1148
; instantiations visible in this listing
PRED16x16_TOP_DC mmxext
1150
PRED16x16_TOP_DC sse2
1152
;-----------------------------------------------------------------------------
1153
; void pred16x16_left_dc(pixel *src, int stride)
1154
;-----------------------------------------------------------------------------
1155
; Emits pred16x16_left_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the pointer adjustments, the accumulation,
; the rounding/broadcast, the loop control and %endmacro are not shown.
%macro PRED16x16_LEFT_DC 1
1155
cglobal pred16x16_left_dc_10_%1, 2,6
1160
; 16-bit scalar loads from [r0] and [r0+r1]. The same two addresses are read
; again below, so r0 is presumably advanced by omitted instructions in between
; and presumably points at the left-neighbour column at this stage - confirm.
movzx r3d, word [r0]
1161
movzx r4d, word [r0+r1]
1164
movzx r2d, word [r0]
1166
movzx r2d, word [r0+r1]
1176
; MOV16 is defined outside this block; the rows at r5 and r5+r1 are written
; with m0 in every register slot. r5 is presumably a saved block pointer.
MOV16 r5+r1*0, m0, m0, m0, m0
1177
MOV16 r5+r1*1, m0, m0, m0, m0
1185
; instantiations visible in this listing
PRED16x16_LEFT_DC mmxext
1187
PRED16x16_LEFT_DC sse2
1189
;-----------------------------------------------------------------------------
1190
; void pred16x16_128_dc(pixel *src, int stride)
1191
;-----------------------------------------------------------------------------
1192
; Emits pred16x16_128_dc_10_<%1>, where %1 is the instruction-set suffix.
; NOTE(review): partial listing - the load that sets m0, the loop control and
; %endmacro are not shown.
%macro PRED16x16_128_DC 1
1192
cglobal pred16x16_128_dc_10_%1, 2,3
1197
; MOV16 is defined outside this block; the rows at r0 and r0+r1 are written
; with m0 in every register slot.
MOV16 r0+r1*0, m0, m0, m0, m0
1198
MOV16 r0+r1*1, m0, m0, m0, m0
1206
; instantiations visible in this listing
PRED16x16_128_DC mmxext
1208
PRED16x16_128_DC sse2