~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to dft/simd/common/t1bv_16.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 
3
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 
4
 *
 
5
 * This program is free software; you can redistribute it and/or modify
 
6
 * it under the terms of the GNU General Public License as published by
 
7
 * the Free Software Foundation; either version 2 of the License, or
 
8
 * (at your option) any later version.
 
9
 *
 
10
 * This program is distributed in the hope that it will be useful,
 
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
13
 * GNU General Public License for more details.
 
14
 *
 
15
 * You should have received a copy of the GNU General Public License
 
16
 * along with this program; if not, write to the Free Software
 
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
18
 *
 
19
 */
 
20
 
 
21
/* This file was automatically generated --- DO NOT EDIT */
 
22
/* Generated on Wed Jul 27 06:15:35 EDT 2011 */
 
23
 
 
24
#include "codelet-dft.h"
 
25
 
 
26
#ifdef HAVE_FMA
 
27
 
 
28
/* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1bv_16 -include t1b.h -sign 1 */
 
29
 
 
30
/*
 
31
 * This function contains 87 FP additions, 64 FP multiplications,
 
32
 * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
 
33
 * 61 stack variables, 3 constants, and 32 memory accesses
 
34
 */
 
35
#include "t1b.h"
 
36
 
 
37
/*
 * t1bv_16 (HAVE_FMA variant): size-16 twiddle codelet body emitted by genfft
 * (see the "Generated by" line above: -n 16, -sign 1).
 *
 * For each m in [mb, me), stepping by VL, it loads the 16 vectors
 * x[WS(rs, 0)] .. x[WS(rs, 15)], multiplies inputs 1..15 by twiddles read
 * from W through BYTW, combines them with VADD/VSUB/VFMA/VFNMS and the
 * VFMAI/VFNMSI forms, and stores 16 results back to the same 16 locations.
 *
 * ri      - not named anywhere in this body; only ii is used.
 * ii      - base pointer for every load and store (copied into x).
 * W       - twiddle table; offset by mb * ((TWVL / VL) * 30) on entry and
 *           advanced by TWVL * 30 per iteration.
 * rs      - stride handed to WS() to address the 16 points.
 * mb, me  - half-open range of the loop counter m.
 * ms      - x advances by VL * ms per iteration; also passed to LD/ST.
 */
static void t1bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
 
38
{
 
39
     /* Numerically: cos(pi/8), sqrt(2) - 1 (= tan(pi/8)), and sqrt(1/2). */
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
40
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
 
41
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
42
     {
 
43
          INT m;
 
44
          R *x;
 
45
          /* All loads and stores below go through x, which starts at ii. */
          x = ii;
 
46
          /* One pass per VL values of m; x and W are advanced in the loop header. */
          for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(rs)) {
 
47
               V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15;
 
48
               V Tv;
 
49
               {
 
50
                    V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu;
 
51
                    V Tp;
 
52
                    {
 
53
                         V T1, T2, T5, T7;
 
54
                         /* Even-indexed inputs 0, 8, 4, 12; their twiddle multiplies come further down. */
                         T1 = LD(&(x[0]), ms, &(x[0]));
 
55
                         T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
 
56
                         T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
 
57
                         T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
 
58
                         {
 
59
                              V Tz, TG, TB, TE;
 
60
                              Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
 
61
                              TG = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
 
62
                              TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
 
63
                              TE = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
 
64
                              {
 
65
                                   V Ti, TX, TY, Td, Tg, Tm, Tt, To;
 
66
                                   {
 
67
                                        V T3, T6, T8, TA, TH, TC, TF, Tb;
 
68
                                        /* Input k (k = 1..15) is multiplied via BYTW by W[TWVL * 2 * (k - 1)]; input 0 is used as loaded. */
                                        Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
 
69
                                        T3 = BYTW(&(W[TWVL * 14]), T2);
 
70
                                        T6 = BYTW(&(W[TWVL * 6]), T5);
 
71
                                        T8 = BYTW(&(W[TWVL * 22]), T7);
 
72
                                        TA = BYTW(&(W[TWVL * 2]), Tz);
 
73
                                        TH = BYTW(&(W[TWVL * 10]), TG);
 
74
                                        TC = BYTW(&(W[TWVL * 18]), TB);
 
75
                                        TF = BYTW(&(W[TWVL * 26]), TE);
 
76
                                        Tc = BYTW(&(W[0]), Tb);
 
77
                                        TW = VSUB(T1, T3);
 
78
                                        T4 = VADD(T1, T3);
 
79
                                        T19 = VSUB(T6, T8);
 
80
                                        T9 = VADD(T6, T8);
 
81
                                        Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
 
82
                                        TD = VADD(TA, TC);
 
83
                                        TX = VSUB(TA, TC);
 
84
                                        TI = VADD(TF, TH);
 
85
                                        TY = VSUB(TF, TH);
 
86
                                   }
 
87
                                   /* Remaining odd-indexed inputs, interleaved with arithmetic on values already loaded. */
                                   Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
 
88
                                   Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
 
89
                                   Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
 
90
                                   Tj = BYTW(&(W[TWVL * 24]), Ti);
 
91
                                   Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
 
92
                                   To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
 
93
                                   TZ = VADD(TX, TY);
 
94
                                   T1a = VSUB(TX, TY);
 
95
                                   Te = BYTW(&(W[TWVL * 16]), Td);
 
96
                                   Th = BYTW(&(W[TWVL * 8]), Tg);
 
97
                                   Tn = BYTW(&(W[TWVL * 28]), Tm);
 
98
                                   Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
 
99
                                   Tu = BYTW(&(W[TWVL * 20]), Tt);
 
100
                                   Tp = BYTW(&(W[TWVL * 12]), To);
 
101
                              }
 
102
                         }
 
103
                    }
 
104
                    {
 
105
                         V Tf, T11, Tk, T12, Ts;
 
106
                         TO = VADD(T4, T9);
 
107
                         Ta = VSUB(T4, T9);
 
108
                         TJ = VSUB(TD, TI);
 
109
                         TP = VADD(TD, TI);
 
110
                         Tf = VADD(Tc, Te);
 
111
                         T11 = VSUB(Tc, Te);
 
112
                         Tk = VADD(Th, Tj);
 
113
                         T12 = VSUB(Th, Tj);
 
114
                         Ts = BYTW(&(W[TWVL * 4]), Tr);
 
115
                         T14 = VSUB(Tn, Tp);
 
116
                         Tq = VADD(Tn, Tp);
 
117
                         T1i = VFNMS(LDK(KP707106781), TZ, TW);
 
118
                         T10 = VFMA(LDK(KP707106781), TZ, TW);
 
119
                         T1b = VFMA(LDK(KP707106781), T1a, T19);
 
120
                         T1l = VFNMS(LDK(KP707106781), T1a, T19);
 
121
                         T13 = VFNMS(LDK(KP414213562), T12, T11);
 
122
                         T1c = VFMA(LDK(KP414213562), T11, T12);
 
123
                         TR = VADD(Tf, Tk);
 
124
                         Tl = VSUB(Tf, Tk);
 
125
                         T15 = VSUB(Tu, Ts);
 
126
                         Tv = VADD(Ts, Tu);
 
127
                    }
 
128
               }
 
129
               {
 
130
                    V T1d, T16, TS, Tw, TU, TQ;
 
131
                    T1d = VFMA(LDK(KP414213562), T14, T15);
 
132
                    T16 = VFNMS(LDK(KP414213562), T15, T14);
 
133
                    TS = VADD(Tq, Tv);
 
134
                    Tw = VSUB(Tq, Tv);
 
135
                    TU = VADD(TO, TP);
 
136
                    TQ = VSUB(TO, TP);
 
137
                    {
 
138
                         V T1e, T1j, T17, T1m;
 
139
                         T1e = VSUB(T1c, T1d);
 
140
                         T1j = VADD(T1c, T1d);
 
141
                         T17 = VADD(T13, T16);
 
142
                         T1m = VSUB(T13, T16);
 
143
                         {
 
144
                              V TV, TT, TK, Tx;
 
145
                              TV = VADD(TR, TS);
 
146
                              TT = VSUB(TR, TS);
 
147
                              TK = VSUB(Tl, Tw);
 
148
                              Tx = VADD(Tl, Tw);
 
149
                              {
 
150
                                   V T1h, T1f, T1o, T1k;
 
151
                                   T1h = VFMA(LDK(KP923879532), T1e, T1b);
 
152
                                   T1f = VFNMS(LDK(KP923879532), T1e, T1b);
 
153
                                   T1o = VFMA(LDK(KP923879532), T1j, T1i);
 
154
                                   T1k = VFNMS(LDK(KP923879532), T1j, T1i);
 
155
                                   {
 
156
                                        V T1g, T18, T1p, T1n;
 
157
                                        T1g = VFMA(LDK(KP923879532), T17, T10);
 
158
                                        T18 = VFNMS(LDK(KP923879532), T17, T10);
 
159
                                        T1p = VFNMS(LDK(KP923879532), T1m, T1l);
 
160
                                        T1n = VFMA(LDK(KP923879532), T1m, T1l);
 
161
                                        /* Store the 16 results back over the inputs (outputs 8, 0, 4, 12 first). */
                                        ST(&(x[WS(rs, 8)]), VSUB(TU, TV), ms, &(x[0]));
 
162
                                        ST(&(x[0]), VADD(TU, TV), ms, &(x[0]));
 
163
                                        ST(&(x[WS(rs, 4)]), VFMAI(TT, TQ), ms, &(x[0]));
 
164
                                        ST(&(x[WS(rs, 12)]), VFNMSI(TT, TQ), ms, &(x[0]));
 
165
                                        {
 
166
                                             V TN, TL, TM, Ty;
 
167
                                             TN = VFMA(LDK(KP707106781), TK, TJ);
 
168
                                             TL = VFNMS(LDK(KP707106781), TK, TJ);
 
169
                                             TM = VFMA(LDK(KP707106781), Tx, Ta);
 
170
                                             Ty = VFNMS(LDK(KP707106781), Tx, Ta);
 
171
                                             /* Odd-indexed outputs, then the remaining even ones (2, 14, 10, 6). */
                                             ST(&(x[WS(rs, 15)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
 
172
                                             ST(&(x[WS(rs, 1)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
 
173
                                             ST(&(x[WS(rs, 9)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
 
174
                                             ST(&(x[WS(rs, 7)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
 
175
                                             ST(&(x[WS(rs, 3)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
 
176
                                             ST(&(x[WS(rs, 13)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
 
177
                                             ST(&(x[WS(rs, 11)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
 
178
                                             ST(&(x[WS(rs, 5)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
 
179
                                             ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
 
180
                                             ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
 
181
                                             ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
 
182
                                             ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
 
183
                                        }
 
184
                                   }
 
185
                              }
 
186
                         }
 
187
                    }
 
188
               }
 
189
          }
 
190
     }
 
191
     VLEAVE();
 
192
}
 
193
 
 
194
/*
 * Twiddle instruction table referenced by desc: fifteen VTW(0, k) entries
 * for k = 1..15, closed by a {TW_NEXT, VL, 0} record. The count matches the
 * fifteen BYTW multiplications in t1bv_16.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably k
 * is the twiddle exponent and TW_NEXT ends one group of VL -- confirm.
 */
static const tw_instr twinstr[] = {
 
195
     VTW(0, 1),
 
196
     VTW(0, 2),
 
197
     VTW(0, 3),
 
198
     VTW(0, 4),
 
199
     VTW(0, 5),
 
200
     VTW(0, 6),
 
201
     VTW(0, 7),
 
202
     VTW(0, 8),
 
203
     VTW(0, 9),
 
204
     VTW(0, 10),
 
205
     VTW(0, 11),
 
206
     VTW(0, 12),
 
207
     VTW(0, 13),
 
208
     VTW(0, 14),
 
209
     VTW(0, 15),
 
210
     {TW_NEXT, VL, 0}
 
211
};
 
212
 
 
213
/* Codelet descriptor: size 16, name string "t1bv_16", the twiddle table above, &GENUS, and the operation counts {53, 30, 34, 0}, which match the additions / multiplications / fused multiply-add figures quoted in the generator comment for this variant. The three trailing zeros are fields whose meaning is not visible here. */
static const ct_desc desc = { 16, XSIMD_STRING("t1bv_16"), twinstr, &GENUS, {53, 30, 34, 0}, 0, 0, 0 };
 
214
 
 
215
/* Registration entry point: hands t1bv_16 and its descriptor to the planner p through X(kdft_dit_register). */
void XSIMD(codelet_t1bv_16) (planner *p) {
 
216
     X(kdft_dit_register) (p, t1bv_16, &desc);
 
217
}
 
218
#else                           /* HAVE_FMA */
 
219
 
 
220
/* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t1bv_16 -include t1b.h -sign 1 */
 
221
 
 
222
/*
 
223
 * This function contains 87 FP additions, 42 FP multiplications,
 
224
 * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
 
225
 * 36 stack variables, 3 constants, and 32 memory accesses
 
226
 */
 
227
#include "t1b.h"
 
228
 
 
229
/*
 * t1bv_16 (variant built when HAVE_FMA is not defined): size-16 twiddle
 * codelet body emitted by genfft (see the "Generated by" line above:
 * -n 16, -sign 1).
 *
 * For each m in [mb, me), stepping by VL, it loads the 16 vectors
 * x[WS(rs, 0)] .. x[WS(rs, 15)], multiplies inputs 1..15 by twiddles read
 * from W through BYTW (input k uses W[TWVL * 2 * (k - 1)]), combines them
 * with VADD/VSUB/VMUL/VFMA/VFNMS/VBYI, and stores 16 results back to the
 * same 16 locations.
 *
 * ri      - not named anywhere in this body; only ii is used.
 * ii      - base pointer for every load and store (copied into x).
 * W       - twiddle table; offset by mb * ((TWVL / VL) * 30) on entry and
 *           advanced by TWVL * 30 per iteration.
 * rs      - stride handed to WS() to address the 16 points.
 * mb, me  - half-open range of the loop counter m.
 * ms      - x advances by VL * ms per iteration; also passed to LD/ST.
 */
static void t1bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
 
230
{
 
231
     /* Numerically: sin(pi/8), cos(pi/8), and sqrt(1/2). */
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
 
232
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
233
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
234
     {
 
235
          INT m;
 
236
          R *x;
 
237
          /* All loads and stores below go through x, which starts at ii. */
          x = ii;
 
238
          /* One pass per VL values of m; x and W are advanced in the loop header. */
          for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(rs)) {
 
239
               V TJ, T1b, TD, T1c, T17, T18, Ty, TK, T10, T11, T12, Tb, TM, T13, T14;
 
240
               V T15, Tm, TN, TG, TI, TH;
 
241
               /* Inputs 0 and 8: difference in TJ, sum in T1b. */
               TG = LD(&(x[0]), ms, &(x[0]));
 
242
               TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
 
243
               TI = BYTW(&(W[TWVL * 14]), TH);
 
244
               TJ = VSUB(TG, TI);
 
245
               T1b = VADD(TG, TI);
 
246
               /* Inputs 4 and 12: difference in TD, sum in T1c. */
               {
 
247
                    V TA, TC, Tz, TB;
 
248
                    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
 
249
                    TA = BYTW(&(W[TWVL * 6]), Tz);
 
250
                    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
 
251
                    TC = BYTW(&(W[TWVL * 22]), TB);
 
252
                    TD = VSUB(TA, TC);
 
253
                    T1c = VADD(TA, TC);
 
254
               }
 
255
               /* Inputs 2, 6, 10, 14: sums in T17/T18, scaled difference terms in Ty/TK. */
               {
 
256
                    V Tp, Tw, Tr, Tu, Ts, Tx;
 
257
                    {
 
258
                         V To, Tv, Tq, Tt;
 
259
                         To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
 
260
                         Tp = BYTW(&(W[TWVL * 2]), To);
 
261
                         Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
 
262
                         Tw = BYTW(&(W[TWVL * 10]), Tv);
 
263
                         Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
 
264
                         Tr = BYTW(&(W[TWVL * 18]), Tq);
 
265
                         Tt = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
 
266
                         Tu = BYTW(&(W[TWVL * 26]), Tt);
 
267
                    }
 
268
                    T17 = VADD(Tp, Tr);
 
269
                    T18 = VADD(Tu, Tw);
 
270
                    Ts = VSUB(Tp, Tr);
 
271
                    Tx = VSUB(Tu, Tw);
 
272
                    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
 
273
                    TK = VMUL(LDK(KP707106781), VADD(Ts, Tx));
 
274
               }
 
275
               /* Inputs 1, 13, 9, 5: sums in T10/T11 (and T12), rotated differences in Tb/TM. */
               {
 
276
                    V T2, T9, T4, T7, T5, Ta;
 
277
                    {
 
278
                         V T1, T8, T3, T6;
 
279
                         T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
 
280
                         T2 = BYTW(&(W[0]), T1);
 
281
                         T8 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
 
282
                         T9 = BYTW(&(W[TWVL * 24]), T8);
 
283
                         T3 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
 
284
                         T4 = BYTW(&(W[TWVL * 16]), T3);
 
285
                         T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
 
286
                         T7 = BYTW(&(W[TWVL * 8]), T6);
 
287
                    }
 
288
                    T10 = VADD(T2, T4);
 
289
                    T11 = VADD(T7, T9);
 
290
                    T12 = VSUB(T10, T11);
 
291
                    T5 = VSUB(T2, T4);
 
292
                    Ta = VSUB(T7, T9);
 
293
                    Tb = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), T5));
 
294
                    TM = VFMA(LDK(KP382683432), T5, VMUL(LDK(KP923879532), Ta));
 
295
               }
 
296
               /* Inputs 15, 11, 7, 3: sums in T13/T14 (and T15), rotated differences in Tm/TN. */
               {
 
297
                    V Td, Tk, Tf, Ti, Tg, Tl;
 
298
                    {
 
299
                         V Tc, Tj, Te, Th;
 
300
                         Tc = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
 
301
                         Td = BYTW(&(W[TWVL * 28]), Tc);
 
302
                         Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
 
303
                         Tk = BYTW(&(W[TWVL * 20]), Tj);
 
304
                         Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
 
305
                         Tf = BYTW(&(W[TWVL * 12]), Te);
 
306
                         Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
 
307
                         Ti = BYTW(&(W[TWVL * 4]), Th);
 
308
                    }
 
309
                    T13 = VADD(Td, Tf);
 
310
                    T14 = VADD(Ti, Tk);
 
311
                    T15 = VSUB(T13, T14);
 
312
                    Tg = VSUB(Td, Tf);
 
313
                    Tl = VSUB(Ti, Tk);
 
314
                    Tm = VFMA(LDK(KP923879532), Tg, VMUL(LDK(KP382683432), Tl));
 
315
                    TN = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
 
316
               }
 
317
               /* Outputs 6, 14, 10, 2. */
               {
 
318
                    V T1a, T1g, T1f, T1h;
 
319
                    {
 
320
                         V T16, T19, T1d, T1e;
 
321
                         T16 = VMUL(LDK(KP707106781), VSUB(T12, T15));
 
322
                         T19 = VSUB(T17, T18);
 
323
                         T1a = VBYI(VSUB(T16, T19));
 
324
                         T1g = VBYI(VADD(T19, T16));
 
325
                         T1d = VSUB(T1b, T1c);
 
326
                         T1e = VMUL(LDK(KP707106781), VADD(T12, T15));
 
327
                         T1f = VSUB(T1d, T1e);
 
328
                         T1h = VADD(T1d, T1e);
 
329
                    }
 
330
                    ST(&(x[WS(rs, 6)]), VADD(T1a, T1f), ms, &(x[0]));
 
331
                    ST(&(x[WS(rs, 14)]), VSUB(T1h, T1g), ms, &(x[0]));
 
332
                    ST(&(x[WS(rs, 10)]), VSUB(T1f, T1a), ms, &(x[0]));
 
333
                    ST(&(x[WS(rs, 2)]), VADD(T1g, T1h), ms, &(x[0]));
 
334
               }
 
335
               /* Outputs 12, 0, 4, 8. */
               {
 
336
                    V T1k, T1o, T1n, T1p;
 
337
                    {
 
338
                         V T1i, T1j, T1l, T1m;
 
339
                         T1i = VADD(T1b, T1c);
 
340
                         T1j = VADD(T17, T18);
 
341
                         T1k = VSUB(T1i, T1j);
 
342
                         T1o = VADD(T1i, T1j);
 
343
                         T1l = VADD(T10, T11);
 
344
                         T1m = VADD(T13, T14);
 
345
                         T1n = VBYI(VSUB(T1l, T1m));
 
346
                         T1p = VADD(T1l, T1m);
 
347
                    }
 
348
                    ST(&(x[WS(rs, 12)]), VSUB(T1k, T1n), ms, &(x[0]));
 
349
                    ST(&(x[0]), VADD(T1o, T1p), ms, &(x[0]));
 
350
                    ST(&(x[WS(rs, 4)]), VADD(T1k, T1n), ms, &(x[0]));
 
351
                    ST(&(x[WS(rs, 8)]), VSUB(T1o, T1p), ms, &(x[0]));
 
352
               }
 
353
               /* Outputs 5, 13, 11, 3. */
               {
 
354
                    V TF, TQ, TP, TR;
 
355
                    {
 
356
                         V Tn, TE, TL, TO;
 
357
                         Tn = VSUB(Tb, Tm);
 
358
                         TE = VSUB(Ty, TD);
 
359
                         TF = VBYI(VSUB(Tn, TE));
 
360
                         TQ = VBYI(VADD(TE, Tn));
 
361
                         TL = VSUB(TJ, TK);
 
362
                         TO = VSUB(TM, TN);
 
363
                         TP = VSUB(TL, TO);
 
364
                         TR = VADD(TL, TO);
 
365
                    }
 
366
                    ST(&(x[WS(rs, 5)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
 
367
                    ST(&(x[WS(rs, 13)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
 
368
                    ST(&(x[WS(rs, 11)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
 
369
                    ST(&(x[WS(rs, 3)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
 
370
               }
 
371
               /* Outputs 15, 7, 1, 9. */
               {
 
372
                    V TU, TY, TX, TZ;
 
373
                    {
 
374
                         V TS, TT, TV, TW;
 
375
                         TS = VADD(TJ, TK);
 
376
                         TT = VADD(Tb, Tm);
 
377
                         TU = VADD(TS, TT);
 
378
                         TY = VSUB(TS, TT);
 
379
                         TV = VADD(TD, Ty);
 
380
                         TW = VADD(TM, TN);
 
381
                         TX = VBYI(VADD(TV, TW));
 
382
                         TZ = VBYI(VSUB(TW, TV));
 
383
                    }
 
384
                    ST(&(x[WS(rs, 15)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
 
385
                    ST(&(x[WS(rs, 7)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
 
386
                    ST(&(x[WS(rs, 1)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
 
387
                    ST(&(x[WS(rs, 9)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
 
388
               }
 
389
          }
 
390
     }
 
391
     VLEAVE();
 
392
}
 
393
 
 
394
/*
 * Twiddle instruction table referenced by desc: fifteen VTW(0, k) entries
 * for k = 1..15, closed by a {TW_NEXT, VL, 0} record. The count matches the
 * fifteen BYTW multiplications in t1bv_16.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably k
 * is the twiddle exponent and TW_NEXT ends one group of VL -- confirm.
 */
static const tw_instr twinstr[] = {
 
395
     VTW(0, 1),
 
396
     VTW(0, 2),
 
397
     VTW(0, 3),
 
398
     VTW(0, 4),
 
399
     VTW(0, 5),
 
400
     VTW(0, 6),
 
401
     VTW(0, 7),
 
402
     VTW(0, 8),
 
403
     VTW(0, 9),
 
404
     VTW(0, 10),
 
405
     VTW(0, 11),
 
406
     VTW(0, 12),
 
407
     VTW(0, 13),
 
408
     VTW(0, 14),
 
409
     VTW(0, 15),
 
410
     {TW_NEXT, VL, 0}
 
411
};
 
412
 
 
413
/* Codelet descriptor: size 16, name string "t1bv_16", the twiddle table above, &GENUS, and the operation counts {83, 38, 4, 0}, which match the additions / multiplications / fused multiply-add figures quoted in the generator comment for this variant. The three trailing zeros are fields whose meaning is not visible here. */
static const ct_desc desc = { 16, XSIMD_STRING("t1bv_16"), twinstr, &GENUS, {83, 38, 4, 0}, 0, 0, 0 };
 
414
 
 
415
/* Registration entry point: hands t1bv_16 and its descriptor to the planner p through X(kdft_dit_register). */
void XSIMD(codelet_t1bv_16) (planner *p) {
 
416
     X(kdft_dit_register) (p, t1bv_16, &desc);
 
417
}
 
418
#endif                          /* HAVE_FMA */