1
;*****************************************************************************
2
;* x86-optimized AC-3 DSP utils
3
;* Copyright (c) 2011 Justin Ruggles
5
;* This file is part of Libav.
7
;* Libav is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
12
;* Libav is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
;* Lesser General Public License for more details.
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with Libav; if not, write to the Free Software
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
23
%include "x86util.asm"
27
; 16777216.0f (2^24) - used in ff_float_to_fixed24()
; Stored as four consecutive dwords, each holding the IEEE-754
; single-precision bit pattern 0x4B800000.
28
pf_1_24: times 4 dd 0x4B800000
32
;-----------------------------------------------------------------------------
33
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
34
;-----------------------------------------------------------------------------
36
; Defines ac3_exponent_min_<suffix>; the single macro parameter (%1) is the
; suffix appended to the function name.
%macro AC3_EXPONENT_MIN 1
37
; exp, reuse_blks and expn name the arguments in the order of the C prototype
; above; offset names a fourth register that is not a prototype parameter.
; NOTE(review): 3,4,2 presumably means 3 args, 4 GPRs, 2 vector registers
; (x86inc cglobal convention) - confirm against x86inc.asm.
cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
42
; offset starts out as a copy of reuse_blks
mov offsetq, reuse_blksq
43
; m0 = data read from exp + offset
mova m0, [expq+offsetq]
47
; PMINUB is %defined before each instantiation of this macro; presumably an
; unsigned byte minimum of m0 and the memory operand with m1 as scratch -
; confirm against the PMINUB_* helper macros.
PMINUB m0, [expq+offsetq], m1
58
; PMINUB expands to the PMINUB_MMX helper from this point.
%define PMINUB PMINUB_MMX
63
; Switch PMINUB to the PMINUB_MMXEXT helper and make LOOP_ALIGN expand to a
; 16-byte alignment directive, then emit ac3_exponent_min_mmxext.
%define PMINUB PMINUB_MMXEXT
64
%define LOOP_ALIGN ALIGN 16
65
AC3_EXPONENT_MIN mmxext
74
;-----------------------------------------------------------------------------
75
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
77
; This function uses 2 different methods to calculate a valid result.
78
; 1) logical 'or' of abs of each element
79
; This is used for ssse3 because of the pabsw instruction.
80
; It is also used for mmx because of the lack of min/max instructions.
81
; 2) calculate min/max for the array, then or(abs(min),abs(max))
82
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
83
;-----------------------------------------------------------------------------
85
; Defines ac3_max_msb_abs_int16_<%1>.
;   %1 - instruction-set suffix for the function name
;   %2 - method selector; the instantiations pass either or_abs or min_max
%macro AC3_MAX_MSB_ABS_INT16 2
86
; src and len name the two arguments in the order of the C prototype above.
; NOTE(review): 2,2,5 presumably means 2 args, 2 GPRs, 5 vector registers
; (x86inc cglobal convention) - confirm against x86inc.asm.
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
92
; m1 = data located mmsize bytes past src
mova m1, [srcq+mmsize]
100
; same load of the data at src + mmsize
mova m1, [srcq+mmsize]
103
; using memory args is faster for ssse3
105
; m1 = absolute value of each 16-bit word, read directly from src + mmsize
pabsw m1, [srcq+mmsize]
131
; mmx: or_abs method, with ABS2 mapped to ABS2_MMX and PSHUFLW mapped to pshufw
%define ABS2 ABS2_MMX
132
%define PSHUFLW pshufw
133
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
134
; mmxext: min_max method, with ABS2 remapped to ABS2_MMX2
%define ABS2 ABS2_MMX2
135
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
137
; sse2: min_max method, with PSHUFLW remapped to pshuflw
%define PSHUFLW pshuflw
138
AC3_MAX_MSB_ABS_INT16 sse2, min_max
139
; ssse3: or_abs method, with ABS2 remapped to ABS2_SSSE3
%define ABS2 ABS2_SSSE3
140
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
142
;-----------------------------------------------------------------------------
143
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
144
;-----------------------------------------------------------------------------
146
; Shared body for ff_ac3_lshift_int16() and ff_ac3_rshift_int32(); defines
; ac3_<%1>shift_int<%2>_<%4>.
;   %1 - direction letter used in the function name (l or r)
;   %2 - element width in bits (16 or 32)
;   %3 - shift instruction
;   %4 - instruction-set suffix for the function name
%macro AC3_SHIFT 4 ; l/r, 16/32, shift instruction, instruction set
147
; src, len and shift name the arguments in the order of the C prototypes of
; the functions generated from this macro.
cglobal ac3_%1shift_int%2_%4, 3,3,5, src, len, shift
151
; load the data at src + mmsize, src + 2*mmsize and src + 3*mmsize
mova m2, [srcq+mmsize ]
152
mova m3, [srcq+mmsize*2]
153
mova m4, [srcq+mmsize*3]
159
; write m2-m4 back to the addresses they were loaded from, so the buffer at
; src is modified in place
mova [srcq+mmsize ], m2
160
mova [srcq+mmsize*2], m3
161
mova [srcq+mmsize*3], m4
163
; len is reduced by mmsize*32/%2, i.e. the number of %2-bit elements that fit
; in 4*mmsize bytes
sub lend, mmsize*32/%2
169
;-----------------------------------------------------------------------------
170
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
171
;-----------------------------------------------------------------------------
174
; emit ac3_lshift_int16_mmx and ac3_lshift_int16_sse2, both using psllw
; (packed logical left shift of 16-bit words)
AC3_SHIFT l, 16, psllw, mmx
176
AC3_SHIFT l, 16, psllw, sse2
178
;-----------------------------------------------------------------------------
179
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
180
;-----------------------------------------------------------------------------
183
; emit ac3_rshift_int32_mmx and ac3_rshift_int32_sse2, both using psrad
; (packed arithmetic right shift of 32-bit doublewords)
AC3_SHIFT r, 32, psrad, mmx
185
AC3_SHIFT r, 32, psrad, sse2
187
;-----------------------------------------------------------------------------
188
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
189
;-----------------------------------------------------------------------------
191
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
192
; than round-to-nearest.
194
; 3DNow! entry point; dst, src and len name the arguments in the order of the
; C prototype above.
; NOTE(review): 3,3,0 presumably means 3 args, 3 GPRs and no XMM registers
; (x86inc cglobal convention) - confirm against x86inc.asm.
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
220
; SSE entry point; dst, src and len name the arguments in the order of the
; C prototype for ff_float_to_fixed24().
; NOTE(review): 3,3,3 presumably means 3 args, 3 GPRs and 3 XMM registers
; (x86inc cglobal convention) - confirm against x86inc.asm.
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
244
; SSE2 entry point; dst, src and len name the arguments in the order of the
; C prototype for ff_float_to_fixed24().
; NOTE(review): 3,3,9 presumably means 3 args, 3 GPRs and 9 XMM registers
; (x86inc cglobal convention); the highest register referenced below is m8.
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
248
; Aligned loads (movaps) from src at byte offsets 16, 32 and 48; movaps
; requires its memory operand to be 16-byte aligned.
movaps m2, [srcq+16 ]
249
movaps m3, [srcq+32 ]
250
movaps m4, [srcq+48 ]
252
; Further aligned loads at byte offsets 64 through 112 into m5-m8.
; NOTE(review): registers above xmm7 exist only in 64-bit mode - confirm that
; the m5-m8 accesses are guarded for 32-bit builds.
movaps m5, [srcq+64 ]
253
movaps m6, [srcq+80 ]
254
movaps m7, [srcq+96 ]
255
movaps m8, [srcq+112]
278
; Aligned stores (movdqa) of m2-m4 to dst at the byte offsets used for the
; loads from src; dst must be 16-byte aligned as well.
movdqa [dstq+16 ], m2
279
movdqa [dstq+32 ], m3
280
movdqa [dstq+48 ], m4
282
; Aligned stores of m5-m8 to dst at byte offsets 64 through 112.
movdqa [dstq+64 ], m5
283
movdqa [dstq+80 ], m6
284
movdqa [dstq+96 ], m7
285
movdqa [dstq+112], m8