/* Copyright (C) 2002 Jean-Marc Valin */
/**
   @brief Various analysis/synthesis filters (SSE version)
*/
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <xmmintrin.h>

/**
 * Order-10 pole-zero filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i] - den[j]*y[i]   for j = 0..8
 *    mem[9] =            num[9]*x[i] - den[9]*y[i]
 *
 * @param x    Input samples; N floats are read
 * @param _num Numerator coefficients; 10 floats are read
 * @param _den Denominator coefficients; 10 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 10
 * @param _mem Filter state; 10 floats, read on entry and rewritten on exit
 */
void filter_mem16_10(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], den[3], mem[3];
   int i;

   (void)ord;

   /* Taps 0..7 fill two full registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   /* Taps 8 and 9 sit in the two low lanes; the two high lanes are zero */
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i = 0; i < N; i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      /* Broadcast the output sample to all four lanes */
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      /* 0xfd: lane 0 takes lane 1, the other lanes take lane 3 (zero padding) */
      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   /* 0x55 broadcasts lane 1 so that state 9 can be stored with store_ss */
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

/**
 * Order-8 pole-zero filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i] - den[j]*y[i]   for j = 0..6
 *    mem[7] =            num[7]*x[i] - den[7]*y[i]
 *
 * @param x    Input samples; N floats are read
 * @param _num Numerator coefficients; 8 floats are read
 * @param _den Denominator coefficients; 8 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 8
 * @param _mem Filter state; 8 floats, read on entry and rewritten on exit
 */
void filter_mem16_8(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], den[2], mem[2];
   int i;

   (void)ord;

   /* The eight taps fill exactly two registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i = 0; i < N; i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      /* Broadcast the output sample to all four lanes */
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));
      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      /* Nothing follows the last register: clear its low lane (x - x)
         so the rotation shifts a zero into the top lane */
      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_FILTER_MEM16
/**
 * Pole-zero filter entry point: forwards to the SSE kernel that matches
 * the requested order.
 *
 * NOTE(review): the order tests (10 and 8) are inferred from the names of
 * the two kernels being called; any other order leaves y and _mem
 * untouched. Confirm against the callers.
 *
 * @param x     Input samples; N floats
 * @param _num  Numerator coefficients; ord floats
 * @param _den  Denominator coefficients; ord floats
 * @param y     Output samples; N floats
 * @param N     Number of samples to process
 * @param ord   Filter order, selects the kernel
 * @param _mem  Filter state; ord floats, updated in place
 * @param stack Not used by this implementation
 */
void filter_mem16(const float *x, const float *_num, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   (void)stack;

   if (ord == 10)
   {
      filter_mem16_10(x, _num, _den, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      filter_mem16_8(x, _num, _den, y, N, ord, _mem);
   }
}

/**
 * Order-10 all-pole (recursive) filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] - den[j]*y[i]   for j = 0..8
 *    mem[9] =          - den[9]*y[i]
 *
 * @param x    Input samples; N floats are read
 * @param _den Denominator coefficients; 10 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 10
 * @param _mem Filter state; 10 floats, read on entry and rewritten on exit
 */
void iir_mem16_10(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[3], mem[3];
   int i;

   (void)ord;

   /* Taps 0..7 fill two full registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }
   /* Taps 8 and 9 sit in the two low lanes; the two high lanes are zero */
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   den[2] = _mm_setr_ps(_den[8], _den[9], 0, 0);

   for (i = 0; i < N; i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      /* Broadcast the output sample to all four lanes */
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));

      /* 0xfd: lane 0 takes lane 1, the other lanes take lane 3 (zero padding) */
      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_sub_ps(mem[2], _mm_mul_ps(yy, den[2]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   /* 0x55 broadcasts lane 1 so that state 9 can be stored with store_ss */
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

/**
 * Order-8 all-pole (recursive) filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] - den[j]*y[i]   for j = 0..6
 *    mem[7] =          - den[7]*y[i]
 *
 * @param x    Input samples; N floats are read
 * @param _den Denominator coefficients; 8 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 8
 * @param _mem Filter state; 8 floats, read on entry and rewritten on exit
 */
void iir_mem16_8(const float *x, const float *_den, float *y, int N, int ord, float *_mem)
{
   __m128 den[2], mem[2];
   int i;

   (void)ord;

   /* The eight taps fill exactly two registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      den[i] = _mm_loadu_ps(_den+4*i);
   }

   for (i = 0; i < N; i++)
   {
      __m128 xx;
      __m128 yy;

      /* Compute next filter result */
      xx = _mm_load_ps1(x+i);
      yy = _mm_add_ss(xx, mem[0]);
      _mm_store_ss(y+i, yy);
      /* Broadcast the output sample to all four lanes */
      yy = _mm_shuffle_ps(yy, yy, 0);

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_sub_ps(mem[0], _mm_mul_ps(yy, den[0]));

      /* Nothing follows the last register: clear its low lane (x - x)
         so the rotation shifts a zero into the top lane */
      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_sub_ps(mem[1], _mm_mul_ps(yy, den[1]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_IIR_MEM16
/**
 * All-pole filter entry point: forwards to the SSE kernel that matches
 * the requested order.
 *
 * NOTE(review): the order tests (10 and 8) are inferred from the names of
 * the two kernels being called; any other order leaves y and _mem
 * untouched. Confirm against the callers.
 *
 * @param x     Input samples; N floats
 * @param _den  Denominator coefficients; ord floats
 * @param y     Output samples; N floats
 * @param N     Number of samples to process
 * @param ord   Filter order, selects the kernel
 * @param _mem  Filter state; ord floats, updated in place
 * @param stack Not used by this implementation
 */
void iir_mem16(const float *x, const float *_den, float *y, int N, int ord, float *_mem, char *stack)
{
   (void)stack;

   if (ord == 10)
   {
      iir_mem16_10(x, _den, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      iir_mem16_8(x, _den, y, N, ord, _mem);
   }
}

/**
 * Order-10 all-zero (non-recursive) filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i]   for j = 0..8
 *    mem[9] =            num[9]*x[i]
 *
 * @param x    Input samples; N floats are read
 * @param _num Numerator coefficients; 10 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 10
 * @param _mem Filter state; 10 floats, read on entry and rewritten on exit
 */
void fir_mem16_10(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[3], mem[3];
   int i;

   (void)ord;

   /* Taps 0..7 fill two full registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }
   /* Taps 8 and 9 sit in the two low lanes; the two high lanes are zero */
   mem[2] = _mm_setr_ps(_mem[8], _mem[9], 0, 0);
   num[2] = _mm_setr_ps(_num[8], _num[9], 0, 0);

   for (i = 0; i < N; i++)
   {
      __m128 xx;

      /* Compute next filter result; the output is not fed back into the
         state, so it is stored directly without being broadcast */
      xx = _mm_load_ps1(x+i);
      _mm_store_ss(y+i, _mm_add_ss(xx, mem[0]));

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      mem[1] = _mm_move_ss(mem[1], mem[2]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));

      /* 0xfd: lane 0 takes lane 1, the other lanes take lane 3 (zero padding) */
      mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0xfd);

      mem[2] = _mm_add_ps(mem[2], _mm_mul_ps(xx, num[2]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
   _mm_store_ss(_mem+8, mem[2]);
   /* 0x55 broadcasts lane 1 so that state 9 can be stored with store_ss */
   mem[2] = _mm_shuffle_ps(mem[2], mem[2], 0x55);
   _mm_store_ss(_mem+9, mem[2]);
}

/**
 * Order-8 all-zero (non-recursive) filter, SSE version.
 *
 * For every input sample the routine evaluates
 *    y[i]   = x[i] + mem[0]
 *    mem[j] = mem[j+1] + num[j]*x[i]   for j = 0..6
 *    mem[7] =            num[7]*x[i]
 *
 * @param x    Input samples; N floats are read
 * @param _num Numerator coefficients; 8 floats are read
 * @param y    Output samples; N floats are written
 * @param N    Number of samples to process
 * @param ord  Filter order; ignored, this routine is hard-wired to 8
 * @param _mem Filter state; 8 floats, read on entry and rewritten on exit
 */
void fir_mem16_8(const float *x, const float *_num, float *y, int N, int ord, float *_mem)
{
   __m128 num[2], mem[2];
   int i;

   (void)ord;

   /* The eight taps fill exactly two registers (unaligned loads) */
   for (i = 0; i < 2; i++)
   {
      mem[i] = _mm_loadu_ps(_mem+4*i);
      num[i] = _mm_loadu_ps(_num+4*i);
   }

   for (i = 0; i < N; i++)
   {
      __m128 xx;

      /* Compute next filter result; the output is not fed back into the
         state, so it is stored directly without being broadcast */
      xx = _mm_load_ps1(x+i);
      _mm_store_ss(y+i, _mm_add_ss(xx, mem[0]));

      /* Update memory: pull the next register's low lane in, then
         rotate down one lane (0x39) so that lane k takes lane k+1 */
      mem[0] = _mm_move_ss(mem[0], mem[1]);
      mem[0] = _mm_shuffle_ps(mem[0], mem[0], 0x39);

      mem[0] = _mm_add_ps(mem[0], _mm_mul_ps(xx, num[0]));

      /* Nothing follows the last register: clear its low lane (x - x)
         so the rotation shifts a zero into the top lane */
      mem[1] = _mm_sub_ss(mem[1], mem[1]);
      mem[1] = _mm_shuffle_ps(mem[1], mem[1], 0x39);

      mem[1] = _mm_add_ps(mem[1], _mm_mul_ps(xx, num[1]));
   }

   /* Put memory back in its place */
   _mm_storeu_ps(_mem, mem[0]);
   _mm_storeu_ps(_mem+4, mem[1]);
}

#define OVERRIDE_FIR_MEM16
/**
 * All-zero filter entry point: forwards to the SSE kernel that matches
 * the requested order.
 *
 * NOTE(review): the order tests (10 and 8) are inferred from the names of
 * the two kernels being called; any other order leaves y and _mem
 * untouched. Confirm against the callers.
 *
 * @param x     Input samples; N floats
 * @param _num  Numerator coefficients; ord floats
 * @param y     Output samples; N floats
 * @param N     Number of samples to process
 * @param ord   Filter order, selects the kernel
 * @param _mem  Filter state; ord floats, updated in place
 * @param stack Not used by this implementation
 */
void fir_mem16(const float *x, const float *_num, float *y, int N, int ord, float *_mem, char *stack)
{
   (void)stack;

   if (ord == 10)
   {
      fir_mem16_10(x, _num, y, N, ord, _mem);
   }
   else if (ord == 8)
   {
      fir_mem16_8(x, _num, y, N, ord, _mem);
   }
}