1900
1898
/*
 * Prototypes for the chroma "mc8" put/avg routines: H.264 (_rnd) and RV40
 * variants, suffixed _mmx, _mmx2 and _3dnow. Only declarations appear here;
 * no bodies are visible in this part of the file. Every routine takes the
 * same parameter list (dst, src, stride, h, x, y) and returns nothing.
 * NOTE(review): "mc8" presumably means an 8-pixel-wide block and x/y the
 * sub-pixel chroma offsets -- confirm against the implementation.
 */
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1901
1899
int stride, int h, int x, int y);
1902
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1903
int stride, int h, int x, int y);
1904
1900
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1905
1901
int stride, int h, int x, int y);
1906
void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1907
int stride, int h, int x, int y);
1908
1902
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1909
1903
int stride, int h, int x, int y);
1910
void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1911
int stride, int h, int x, int y);
1913
1905
/*
 * Prototypes for the chroma "mc4" put/avg routines: H.264 and RV40 variants,
 * suffixed _mmx, _mmx2 and _3dnow. Declarations only; the parameter list
 * (dst, src, stride, h, x, y) is the same for all six.
 * NOTE(review): "mc4" presumably means a 4-pixel-wide block -- confirm
 * against the implementation.
 */
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1914
1906
int stride, int h, int x, int y);
1915
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1916
int stride, int h, int x, int y);
1917
1907
void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1918
1908
int stride, int h, int x, int y);
1919
void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1920
int stride, int h, int x, int y);
1921
1909
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1922
1910
int stride, int h, int x, int y);
1923
void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1924
int stride, int h, int x, int y);
1926
1912
/*
 * Prototype for the H.264 chroma "mc2" put routine (_mmx2 suffix).
 * Further down in this file it is stored in c->put_h264_chroma_pixels_tab[2].
 * NOTE(review): "mc2" presumably means a 2-pixel-wide block -- confirm.
 */
void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1927
1913
int stride, int h, int x, int y);
2430
2416
/*
 * Prototype of the SSE float scalar-product routine: takes two read-only
 * float arrays and an int "order", and returns a float.
 * NOTE(review): "order" is presumably the number of elements to process --
 * confirm against the implementation.
 */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2418
/*
 * Prototypes for the int32 vector-clip routines (_mmx, _sse2, _int_sse2 and
 * _sse4 variants). All take a writable dst, a read-only src, int32 min/max
 * bounds and an unsigned element count len. The _sse4 variant is assigned to
 * c->vector_clip_int32 further down in this file.
 * NOTE(review): presumably each src value is clamped to [min, max] and stored
 * to dst -- confirm against the implementation.
 */
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
2419
int32_t max, unsigned int len);
2420
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
2421
int32_t max, unsigned int len);
2422
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
2423
int32_t max, unsigned int len);
2424
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min,
2425
int32_t max, unsigned int len);
2427
/*
 * External declarations of the SSE and AVX butterflies_float_interleave
 * routines. Both take a writable float dst, two read-only float inputs
 * (src0, src1) and an int len. The AVX one is assigned to
 * c->butterflies_float_interleave further down in this file.
 */
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2428
const float *src1, int len);
2429
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2430
const float *src1, int len);
2432
2432
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2434
2434
int mm_flags = av_get_cpu_flags();
2435
const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2435
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2436
2436
const int bit_depth = avctx->bits_per_raw_sample;
2438
2438
if (avctx->dsp_mask) {
2616
2615
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2619
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU): fill all 16 entries of
 * c-><PFX>_pixels_tab[IDX] with functions whose names are built by token
 * pasting: <PFX><SIZE>_mc<x><y>_<CPU>.
 * Entry (x + 4*y) receives the _mc<x><y>_ variant, i.e. [0]=mc00, [1]=mc10,
 * [4]=mc01, ... [15]=mc33.
 * The expansion refers to a variable named "c" at the point of use and has
 * no trailing semicolon after the last assignment, so each invocation
 * supplies its own ';'.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2620
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2621
c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2622
c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2623
c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2624
c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2625
c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2626
c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2627
c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2628
c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2629
c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2630
c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2631
c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2632
c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2633
c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2634
c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2635
c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2637
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2638
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2639
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2640
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2641
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2642
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2644
if (!high_bit_depth) {
2645
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2646
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2647
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2648
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2649
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2650
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2653
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2654
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2655
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2656
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2659
c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2660
c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2662
if (!high_bit_depth) {
2618
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX): fill all 16 entries of
 * c-><PFX>_pixels_tab[IDX] with functions whose names are built by token
 * pasting: <PREFIX><PFX><SIZE>_mc<x><y>_<CPU>.
 * Entry (x + 4*y) receives the _mc<x><y>_ variant, i.e. [0]=mc00, [1]=mc10,
 * [4]=mc01, ... [15]=mc33.
 * PREFIX is pasted in front of the function name only, not the table name;
 * invocations in this file pass either an empty argument or ff_.
 * The expansion refers to a variable named "c" at the point of use and has
 * no trailing semicolon after the last assignment, so each invocation
 * supplies its own ';'.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2619
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2620
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2621
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2622
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2623
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2624
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2625
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2626
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2627
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2628
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2629
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2630
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2631
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2632
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2633
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2634
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
2636
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2637
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2638
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2639
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2640
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2641
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2643
if (!high_bit_depth) {
2644
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2645
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2646
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2647
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2648
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2649
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2651
else if (bit_depth == 10) {
2654
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2655
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2656
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2657
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2659
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2660
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2664
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2665
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2666
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2667
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2670
if (!high_bit_depth && CONFIG_H264CHROMA) {
2663
2671
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2664
2672
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2665
2673
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2666
2674
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2668
if (bit_depth == 10) {
2676
if (bit_depth == 10 && CONFIG_H264CHROMA) {
2669
2677
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
2670
2678
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
2671
2679
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
2714
2722
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2717
SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2718
SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2719
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2720
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2721
SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2722
SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2725
SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2726
SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2727
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2728
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2729
SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2730
SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2724
2732
if (!high_bit_depth) {
2725
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2726
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2727
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2728
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2729
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2730
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2733
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2734
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2735
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2736
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2737
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2738
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2733
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2734
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2735
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2736
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2741
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2742
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2743
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2744
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2739
if (!high_bit_depth) {
2747
if (!high_bit_depth && CONFIG_H264CHROMA) {
2740
2748
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2741
2749
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2744
c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2745
c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2764
2770
if(mm_flags & AV_CPU_FLAG_SSE2){
2765
2771
if (!high_bit_depth) {
2766
H264_QPEL_FUNCS(0, 1, sse2);
2767
H264_QPEL_FUNCS(0, 2, sse2);
2768
H264_QPEL_FUNCS(0, 3, sse2);
2769
H264_QPEL_FUNCS(1, 1, sse2);
2770
H264_QPEL_FUNCS(1, 2, sse2);
2771
H264_QPEL_FUNCS(1, 3, sse2);
2772
H264_QPEL_FUNCS(2, 1, sse2);
2773
H264_QPEL_FUNCS(2, 2, sse2);
2774
H264_QPEL_FUNCS(2, 3, sse2);
2775
H264_QPEL_FUNCS(3, 1, sse2);
2776
H264_QPEL_FUNCS(3, 2, sse2);
2777
H264_QPEL_FUNCS(3, 3, sse2);
2772
H264_QPEL_FUNCS(0, 1, sse2);
2773
H264_QPEL_FUNCS(0, 2, sse2);
2774
H264_QPEL_FUNCS(0, 3, sse2);
2775
H264_QPEL_FUNCS(1, 1, sse2);
2776
H264_QPEL_FUNCS(1, 2, sse2);
2777
H264_QPEL_FUNCS(1, 3, sse2);
2778
H264_QPEL_FUNCS(2, 1, sse2);
2779
H264_QPEL_FUNCS(2, 2, sse2);
2780
H264_QPEL_FUNCS(2, 3, sse2);
2781
H264_QPEL_FUNCS(3, 1, sse2);
2782
H264_QPEL_FUNCS(3, 2, sse2);
2783
H264_QPEL_FUNCS(3, 3, sse2);
2786
/*
 * H264_QPEL_FUNCS_10(x, y, CPU): override a single qpel position in four
 * tables at once. Slot [x+y*4] of put/avg_h264_qpel_pixels_tab[0] and [1] is
 * set to ff_{put,avg}_h264_qpel{16,8}_mc<x><y>_10_<CPU>, where table index 0
 * gets the qpel16 function and index 1 the qpel8 function.
 * x and y are used both as token-pasted digits in the function name and as
 * operands of the index expression, so they must be plain integer literals.
 * Every assignment, including the last, ends in ';', so invocations need no
 * trailing semicolon.
 */
#define H264_QPEL_FUNCS_10(x, y, CPU)\
2787
c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
2788
c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
2789
c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
2790
c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
2780
2791
if (bit_depth == 10) {
2781
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
2782
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
2792
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2793
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2794
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2795
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2796
H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
2797
H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
2798
H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
2800
if (CONFIG_H264CHROMA) {
2801
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2802
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2918
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
2920
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2883
2924
#if HAVE_AVX && HAVE_YASM
2884
2925
if (mm_flags & AV_CPU_FLAG_AVX) {
2885
2926
if (bit_depth == 10) {
2886
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
2887
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
2927
//AVX implies !cache64.
2928
//TODO: Port cache(32|64) detection from x264.
2929
H264_QPEL_FUNCS_10(1, 0, sse2)
2930
H264_QPEL_FUNCS_10(2, 0, sse2)
2931
H264_QPEL_FUNCS_10(3, 0, sse2)
2933
if (CONFIG_H264CHROMA) {
2934
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2935
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2938
c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2893
2943
if (CONFIG_ENCODERS)
2894
2944
dsputilenc_init_mmx(c, avctx);
2897
// for speed testing
2898
get_pixels = just_return;
2899
put_pixels_clamped = just_return;
2900
add_pixels_clamped = just_return;
2902
pix_abs16x16 = just_return;
2903
pix_abs16x16_x2 = just_return;
2904
pix_abs16x16_y2 = just_return;
2905
pix_abs16x16_xy2 = just_return;
2907
put_pixels_tab[0] = just_return;
2908
put_pixels_tab[1] = just_return;
2909
put_pixels_tab[2] = just_return;
2910
put_pixels_tab[3] = just_return;
2912
put_no_rnd_pixels_tab[0] = just_return;
2913
put_no_rnd_pixels_tab[1] = just_return;
2914
put_no_rnd_pixels_tab[2] = just_return;
2915
put_no_rnd_pixels_tab[3] = just_return;
2917
avg_pixels_tab[0] = just_return;
2918
avg_pixels_tab[1] = just_return;
2919
avg_pixels_tab[2] = just_return;
2920
avg_pixels_tab[3] = just_return;
2922
avg_no_rnd_pixels_tab[0] = just_return;
2923
avg_no_rnd_pixels_tab[1] = just_return;
2924
avg_no_rnd_pixels_tab[2] = just_return;
2925
avg_no_rnd_pixels_tab[3] = just_return;
2927
//av_fdct = just_return;
2928
//ff_idct = just_return;