~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to dft/simd/codelets/t2bv_20.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
 * Copyright (c) 2003, 2007-8 Matteo Frigo
3
 
 * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
4
 
 *
5
 
 * This program is free software; you can redistribute it and/or modify
6
 
 * it under the terms of the GNU General Public License as published by
7
 
 * the Free Software Foundation; either version 2 of the License, or
8
 
 * (at your option) any later version.
9
 
 *
10
 
 * This program is distributed in the hope that it will be useful,
11
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 
 * GNU General Public License for more details.
14
 
 *
15
 
 * You should have received a copy of the GNU General Public License
16
 
 * along with this program; if not, write to the Free Software
17
 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 
 *
19
 
 */
20
 
 
21
 
/* This file was automatically generated --- DO NOT EDIT */
22
 
/* Generated on Sun Jul 12 06:42:53 EDT 2009 */
23
 
 
24
 
#include "codelet-dft.h"
25
 
 
26
 
#ifdef HAVE_FMA
27
 
 
28
 
/* Generated by: ../../../genfft/gen_twiddle_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include t2b.h -sign 1 */
29
 
 
30
 
/*
31
 
 * This function contains 123 FP additions, 88 FP multiplications,
32
 
 * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
33
 
 * 68 stack variables, 4 constants, and 40 memory accesses
34
 
 */
35
 
#include "t2b.h"
36
 
 
37
 
/*
 * t2bv_20 (HAVE_FMA variant): generated size-20 twiddle kernel
 * (genfft options above: -n 20, -sign 1, -fma).
 *
 * Only `ii` is used as the data pointer (x = ii); `ri` is never referenced.
 * Each loop iteration:
 *   - loads the 20 inputs x[WS(rs, k)], k = 0..19;
 *   - multiplies every input except k = 0 by a twiddle through BYTW, reading
 *     the twiddle for input k at &W[TWVL * 2 * (k - 1)];
 *   - combines the values with the four DVK constants declared below;
 *   - stores 20 results back to the same x[WS(rs, k)] slots (in place).
 *
 * Parameters as used by the code:
 *   ri      unused in this body
 *   ii      base pointer of the data that is read and overwritten
 *   W       twiddle table; advanced by mb * ((TWVL / VL) * 38) on entry and
 *           by TWVL * 38 per iteration
 *   rs      stride handed to WS() to address the 20 points
 *   mb, me  loop bounds: m runs from mb while m < me in steps of VL
 *   ms      passed to LD/ST; x advances by VL * ms per iteration
 */
static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38
 
{
39
 
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
40
 
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
41
 
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
42
 
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
43
 
     INT m;
44
 
     R *x;
45
 
     x = ii;
46
 
     /* One pass per VL-wide step of m; x and W are advanced in the loop header. */
     for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(rs)) {
47
 
          V T4, TX, T1m, T1K, T1y, Tk, Tf, T14, TQ, TZ, T1O, T1w, T1L, T1p, T1M;
48
 
          V T1s, TF, TY, T1x, Tp;
49
 
          /* First half: loads, twiddle multiplications (BYTW) and first sums/differences. */
          {
50
 
               V T1, TV, T2, TT;
51
 
               T1 = LD(&(x[0]), ms, &(x[0]));
52
 
               TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
53
 
               T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
54
 
               TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
55
 
               {
56
 
                    V T9, T1n, TK, T1v, TP, Te, T1q, T1u, TB, TD, Tm, T1o, Tz, Tn, T1r;
57
 
                    V TE, To;
58
 
                    {
59
 
                         V TM, TO, Ta, Tc;
60
 
                         {
61
 
                              V T5, T7, TG, TI, T1k, T1l;
62
 
                              T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
63
 
                              T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
64
 
                              TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
65
 
                              TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
66
 
                              {
67
 
                                   V TW, T3, TU, T6, T8, TH, TJ, TL, TN;
68
 
                                   TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
69
 
                                   TW = BYTW(&(W[TWVL * 28]), TV);
70
 
                                   T3 = BYTW(&(W[TWVL * 18]), T2);
71
 
                                   TU = BYTW(&(W[TWVL * 8]), TT);
72
 
                                   T6 = BYTW(&(W[TWVL * 6]), T5);
73
 
                                   T8 = BYTW(&(W[TWVL * 26]), T7);
74
 
                                   TH = BYTW(&(W[TWVL * 24]), TG);
75
 
                                   TJ = BYTW(&(W[TWVL * 4]), TI);
76
 
                                   TM = BYTW(&(W[TWVL * 32]), TL);
77
 
                                   TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
78
 
                                   T4 = VSUB(T1, T3);
79
 
                                   T1k = VADD(T1, T3);
80
 
                                   TX = VSUB(TU, TW);
81
 
                                   T1l = VADD(TU, TW);
82
 
                                   T9 = VSUB(T6, T8);
83
 
                                   T1n = VADD(T6, T8);
84
 
                                   TK = VSUB(TH, TJ);
85
 
                                   T1v = VADD(TH, TJ);
86
 
                                   TO = BYTW(&(W[TWVL * 12]), TN);
87
 
                              }
88
 
                              Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
89
 
                              T1m = VSUB(T1k, T1l);
90
 
                              T1K = VADD(T1k, T1l);
91
 
                              Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
92
 
                         }
93
 
                         {
94
 
                              V Tb, Tx, Td, Th, Tj, Tw, Tg, Ti, Tv;
95
 
                              Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
96
 
                              Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
97
 
                              Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
98
 
                              TP = VSUB(TM, TO);
99
 
                              T1y = VADD(TM, TO);
100
 
                              Tb = BYTW(&(W[TWVL * 30]), Ta);
101
 
                              Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
102
 
                              Td = BYTW(&(W[TWVL * 10]), Tc);
103
 
                              Th = BYTW(&(W[TWVL * 14]), Tg);
104
 
                              Tj = BYTW(&(W[TWVL * 34]), Ti);
105
 
                              Tw = BYTW(&(W[TWVL * 16]), Tv);
106
 
                              {
107
 
                                   V TA, TC, Ty, Tl;
108
 
                                   TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
109
 
                                   TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
110
 
                                   Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
111
 
                                   Ty = BYTW(&(W[TWVL * 36]), Tx);
112
 
                                   Te = VSUB(Tb, Td);
113
 
                                   T1q = VADD(Tb, Td);
114
 
                                   Tk = VSUB(Th, Tj);
115
 
                                   T1u = VADD(Th, Tj);
116
 
                                   TB = BYTW(&(W[0]), TA);
117
 
                                   TD = BYTW(&(W[TWVL * 20]), TC);
118
 
                                   Tm = BYTW(&(W[TWVL * 22]), Tl);
119
 
                                   T1o = VADD(Tw, Ty);
120
 
                                   Tz = VSUB(Tw, Ty);
121
 
                                   Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
122
 
                              }
123
 
                         }
124
 
                    }
125
 
                    Tf = VADD(T9, Te);
126
 
                    T14 = VSUB(T9, Te);
127
 
                    TQ = VSUB(TK, TP);
128
 
                    TZ = VADD(TK, TP);
129
 
                    T1r = VADD(TB, TD);
130
 
                    TE = VSUB(TB, TD);
131
 
                    T1O = VADD(T1u, T1v);
132
 
                    T1w = VSUB(T1u, T1v);
133
 
                    To = BYTW(&(W[TWVL * 2]), Tn);
134
 
                    T1L = VADD(T1n, T1o);
135
 
                    T1p = VSUB(T1n, T1o);
136
 
                    T1M = VADD(T1q, T1r);
137
 
                    T1s = VSUB(T1q, T1r);
138
 
                    TF = VSUB(Tz, TE);
139
 
                    TY = VADD(Tz, TE);
140
 
                    T1x = VADD(Tm, To);
141
 
                    Tp = VSUB(Tm, To);
142
 
               }
143
 
          }
144
 
          /* Second half: combine the partial results with the KP* constants and store all 20 outputs. */
          {
145
 
               V T1V, T1N, T12, T1b, TR, T1G, T1t, T1z, T1P, Tq, T15, T11, T1j, T10;
146
 
               T1V = VSUB(T1L, T1M);
147
 
               T1N = VADD(T1L, T1M);
148
 
               T12 = VSUB(TY, TZ);
149
 
               T10 = VADD(TY, TZ);
150
 
               T1b = VFNMS(LDK(KP618033988), TF, TQ);
151
 
               TR = VFMA(LDK(KP618033988), TQ, TF);
152
 
               T1G = VSUB(T1p, T1s);
153
 
               T1t = VADD(T1p, T1s);
154
 
               T1z = VSUB(T1x, T1y);
155
 
               T1P = VADD(T1x, T1y);
156
 
               Tq = VADD(Tk, Tp);
157
 
               T15 = VSUB(Tk, Tp);
158
 
               T11 = VFNMS(LDK(KP250000000), T10, TX);
159
 
               T1j = VADD(TX, T10);
160
 
               {
161
 
                    V T1J, T1H, T1D, T1Z, T1X, T1T, T1f, T1h, T19, T17, T1C, T1S, T1a, Tu, T1F;
162
 
                    V T1A;
163
 
                    T1F = VSUB(T1w, T1z);
164
 
                    T1A = VADD(T1w, T1z);
165
 
                    {
166
 
                         V T1W, T1Q, Tt, Tr;
167
 
                         T1W = VSUB(T1O, T1P);
168
 
                         T1Q = VADD(T1O, T1P);
169
 
                         Tt = VSUB(Tf, Tq);
170
 
                         Tr = VADD(Tf, Tq);
171
 
                         {
172
 
                              V T1e, T16, T1d, T13;
173
 
                              T1e = VFNMS(LDK(KP618033988), T14, T15);
174
 
                              T16 = VFMA(LDK(KP618033988), T15, T14);
175
 
                              T1d = VFNMS(LDK(KP559016994), T12, T11);
176
 
                              T13 = VFMA(LDK(KP559016994), T12, T11);
177
 
                              T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
178
 
                              T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
179
 
                              {
180
 
                                   V T1B, T1R, Ts, T1i;
181
 
                                   T1B = VADD(T1t, T1A);
182
 
                                   T1D = VSUB(T1t, T1A);
183
 
                                   T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
184
 
                                   T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
185
 
                                   T1R = VADD(T1N, T1Q);
186
 
                                   T1T = VSUB(T1N, T1Q);
187
 
                                   Ts = VFNMS(LDK(KP250000000), Tr, T4);
188
 
                                   T1i = VADD(T4, Tr);
189
 
                                   T1f = VFNMS(LDK(KP951056516), T1e, T1d);
190
 
                                   T1h = VFMA(LDK(KP951056516), T1e, T1d);
191
 
                                   T19 = VFNMS(LDK(KP951056516), T16, T13);
192
 
                                   T17 = VFMA(LDK(KP951056516), T16, T13);
193
 
                                   ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
194
 
                                   T1C = VFNMS(LDK(KP250000000), T1B, T1m);
195
 
                                   ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
196
 
                                   T1S = VFNMS(LDK(KP250000000), T1R, T1K);
197
 
                                   T1a = VFNMS(LDK(KP559016994), Tt, Ts);
198
 
                                   Tu = VFMA(LDK(KP559016994), Tt, Ts);
199
 
                                   ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
200
 
                                   ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
201
 
                              }
202
 
                         }
203
 
                    }
204
 
                    {
205
 
                         V T1E, T1I, T1U, T1Y;
206
 
                         T1E = VFNMS(LDK(KP559016994), T1D, T1C);
207
 
                         T1I = VFMA(LDK(KP559016994), T1D, T1C);
208
 
                         T1U = VFMA(LDK(KP559016994), T1T, T1S);
209
 
                         T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
210
 
                         {
211
 
                              V T1c, T1g, T18, TS;
212
 
                              T1c = VFMA(LDK(KP951056516), T1b, T1a);
213
 
                              T1g = VFNMS(LDK(KP951056516), T1b, T1a);
214
 
                              T18 = VFMA(LDK(KP951056516), TR, Tu);
215
 
                              TS = VFNMS(LDK(KP951056516), TR, Tu);
216
 
                              ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
217
 
                              ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
218
 
                              ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
219
 
                              ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
220
 
                              ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
221
 
                              ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
222
 
                              ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
223
 
                              ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
224
 
                              ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
225
 
                              ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
226
 
                              ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
227
 
                              ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
228
 
                              ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
229
 
                              ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
230
 
                              ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
231
 
                              ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
232
 
                         }
233
 
                    }
234
 
               }
235
 
          }
236
 
     }
237
 
}
238
 
 
239
 
/*
 * Twiddle instruction table referenced by `desc`: one VTW(0, k) entry for
 * each k = 1..19, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably entry k describes the twiddle applied to input k -- confirm.
 */
static const tw_instr twinstr[] = {
240
 
     VTW(0, 1),
241
 
     VTW(0, 2),
242
 
     VTW(0, 3),
243
 
     VTW(0, 4),
244
 
     VTW(0, 5),
245
 
     VTW(0, 6),
246
 
     VTW(0, 7),
247
 
     VTW(0, 8),
248
 
     VTW(0, 9),
249
 
     VTW(0, 10),
250
 
     VTW(0, 11),
251
 
     VTW(0, 12),
252
 
     VTW(0, 13),
253
 
     VTW(0, 14),
254
 
     VTW(0, 15),
255
 
     VTW(0, 16),
256
 
     VTW(0, 17),
257
 
     VTW(0, 18),
258
 
     VTW(0, 19),
259
 
     {TW_NEXT, VL, 0}
260
 
};
261
 
 
262
 
/*
 * Descriptor handed to the planner together with the kernel. The 20 matches
 * the generator's -n 20, and {77, 42, 46, 0} repeats the addition /
 * multiplication / fused multiply-add counts stated in the comment that
 * precedes this variant's kernel.
 * NOTE(review): the meaning of the trailing zero fields is set by ct_desc,
 * which is declared outside this file -- confirm before changing them.
 */
static const ct_desc desc = { 20, "t2bv_20", twinstr, &GENUS, {77, 42, 46, 0}, 0, 0, 0 };
263
 
 
264
 
/*
 * Registration entry point: hands the t2bv_20 kernel and its descriptor
 * `desc` to planner `p` through X(kdft_dit_register).
 */
void X(codelet_t2bv_20) (planner *p) {
265
 
     X(kdft_dit_register) (p, t2bv_20, &desc);
266
 
}
267
 
#else                           /* HAVE_FMA */
268
 
 
269
 
/* Generated by: ../../../genfft/gen_twiddle_c -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include t2b.h -sign 1 */
270
 
 
271
 
/*
272
 
 * This function contains 123 FP additions, 62 FP multiplications,
273
 
 * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
274
 
 * 54 stack variables, 4 constants, and 40 memory accesses
275
 
 */
276
 
#include "t2b.h"
277
 
 
278
 
/*
 * t2bv_20 (variant compiled when HAVE_FMA is not defined): generated size-20
 * twiddle kernel (genfft options above: -n 20, -sign 1, without -fma).
 *
 * Only `ii` is used as the data pointer (x = ii); `ri` is never referenced.
 * Each loop iteration:
 *   - loads the 20 inputs x[WS(rs, k)], k = 0..19;
 *   - multiplies every input except k = 0 by a twiddle through BYTW, reading
 *     the twiddle for input k at &W[TWVL * 2 * (k - 1)];
 *   - combines the values with the four DVK constants declared below;
 *   - stores 20 results back to the same x[WS(rs, k)] slots (in place).
 *
 * Parameters as used by the code:
 *   ri      unused in this body
 *   ii      base pointer of the data that is read and overwritten
 *   W       twiddle table; advanced by mb * ((TWVL / VL) * 38) on entry and
 *           by TWVL * 38 per iteration
 *   rs      stride handed to WS() to address the 20 points
 *   mb, me  loop bounds: m runs from mb while m < me in steps of VL
 *   ms      passed to LD/ST; x advances by VL * ms per iteration
 */
static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
279
 
{
280
 
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
281
 
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
282
 
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
283
 
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
284
 
     INT m;
285
 
     R *x;
286
 
     x = ii;
287
 
     /* One pass per VL-wide step of m; x and W are advanced in the loop header. */
     for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(rs)) {
288
 
          V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
289
 
          V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
290
 
          /* Inputs 0, 10, 5 and 15: load, twiddle (all but x[0]) and form sums/differences. */
          {
291
 
               V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
292
 
               T1 = LD(&(x[0]), ms, &(x[0]));
293
 
               TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
294
 
               TZ = BYTW(&(W[TWVL * 28]), TY);
295
 
               T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
296
 
               T3 = BYTW(&(W[TWVL * 18]), T2);
297
 
               TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
298
 
               TX = BYTW(&(W[TWVL * 8]), TW);
299
 
               T4 = VSUB(T1, T3);
300
 
               T10 = VSUB(TX, TZ);
301
 
               T1z = VADD(T1, T3);
302
 
               T1A = VADD(TX, TZ);
303
 
               T1B = VSUB(T1z, T1A);
304
 
               T1R = VADD(T1z, T1A);
305
 
          }
306
 
          /* Remaining 16 inputs: each inner block loads two inputs, twiddles both, and keeps their sum and difference. */
          {
307
 
               V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
308
 
               V T1u;
309
 
               {
310
 
                    V T6, T8, T5, T7;
311
 
                    T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
312
 
                    T6 = BYTW(&(W[TWVL * 6]), T5);
313
 
                    T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
314
 
                    T8 = BYTW(&(W[TWVL * 26]), T7);
315
 
                    T9 = VSUB(T6, T8);
316
 
                    T1k = VADD(T6, T8);
317
 
               }
318
 
               {
319
 
                    V TH, TJ, TG, TI;
320
 
                    TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
321
 
                    TH = BYTW(&(W[TWVL * 24]), TG);
322
 
                    TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
323
 
                    TJ = BYTW(&(W[TWVL * 4]), TI);
324
 
                    TK = VSUB(TH, TJ);
325
 
                    T1s = VADD(TH, TJ);
326
 
               }
327
 
               {
328
 
                    V TM, TO, TL, TN;
329
 
                    TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
330
 
                    TM = BYTW(&(W[TWVL * 32]), TL);
331
 
                    TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
332
 
                    TO = BYTW(&(W[TWVL * 12]), TN);
333
 
                    TP = VSUB(TM, TO);
334
 
                    T1v = VADD(TM, TO);
335
 
               }
336
 
               {
337
 
                    V Tb, Td, Ta, Tc;
338
 
                    Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
339
 
                    Tb = BYTW(&(W[TWVL * 30]), Ta);
340
 
                    Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
341
 
                    Td = BYTW(&(W[TWVL * 10]), Tc);
342
 
                    Te = VSUB(Tb, Td);
343
 
                    T1n = VADD(Tb, Td);
344
 
               }
345
 
               {
346
 
                    V Th, Tj, Tg, Ti;
347
 
                    Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
348
 
                    Th = BYTW(&(W[TWVL * 14]), Tg);
349
 
                    Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
350
 
                    Tj = BYTW(&(W[TWVL * 34]), Ti);
351
 
                    Tk = VSUB(Th, Tj);
352
 
                    T1r = VADD(Th, Tj);
353
 
               }
354
 
               {
355
 
                    V Tw, Ty, Tv, Tx;
356
 
                    Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
357
 
                    Tw = BYTW(&(W[TWVL * 16]), Tv);
358
 
                    Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
359
 
                    Ty = BYTW(&(W[TWVL * 36]), Tx);
360
 
                    Tz = VSUB(Tw, Ty);
361
 
                    T1l = VADD(Tw, Ty);
362
 
               }
363
 
               {
364
 
                    V TB, TD, TA, TC;
365
 
                    TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
366
 
                    TB = BYTW(&(W[0]), TA);
367
 
                    TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
368
 
                    TD = BYTW(&(W[TWVL * 20]), TC);
369
 
                    TE = VSUB(TB, TD);
370
 
                    T1o = VADD(TB, TD);
371
 
               }
372
 
               {
373
 
                    V Tm, To, Tl, Tn;
374
 
                    Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
375
 
                    Tm = BYTW(&(W[TWVL * 22]), Tl);
376
 
                    Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
377
 
                    To = BYTW(&(W[TWVL * 2]), Tn);
378
 
                    Tp = VSUB(Tm, To);
379
 
                    T1u = VADD(Tm, To);
380
 
               }
381
 
               TF = VSUB(Tz, TE);
382
 
               T14 = VSUB(T9, Te);
383
 
               T15 = VSUB(Tk, Tp);
384
 
               TQ = VSUB(TK, TP);
385
 
               Tf = VADD(T9, Te);
386
 
               Tq = VADD(Tk, Tp);
387
 
               Tr = VADD(Tf, Tq);
388
 
               T1N = VADD(T1r, T1s);
389
 
               T1O = VADD(T1u, T1v);
390
 
               T1P = VADD(T1N, T1O);
391
 
               T1t = VSUB(T1r, T1s);
392
 
               T1w = VSUB(T1u, T1v);
393
 
               T1D = VADD(T1t, T1w);
394
 
               TT = VADD(Tz, TE);
395
 
               TU = VADD(TK, TP);
396
 
               T11 = VADD(TT, TU);
397
 
               T1K = VADD(T1k, T1l);
398
 
               T1L = VADD(T1n, T1o);
399
 
               T1M = VADD(T1K, T1L);
400
 
               T1m = VSUB(T1k, T1l);
401
 
               T1p = VSUB(T1n, T1o);
402
 
               T1C = VADD(T1m, T1p);
403
 
          }
404
 
          /* Outputs 5 and 15. */
          T1i = VADD(T4, Tr);
405
 
          T1j = VBYI(VADD(T10, T11));
406
 
          ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
407
 
          ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
408
 
          /* Outputs 0, 4, 8, 12 and 16. */
          {
409
 
               V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
410
 
               T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
411
 
               T1S = VADD(T1M, T1P);
412
 
               T1T = VFNMS(LDK(KP250000000), T1S, T1R);
413
 
               T1V = VSUB(T1K, T1L);
414
 
               T1W = VSUB(T1N, T1O);
415
 
               T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
416
 
               T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
417
 
               ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
418
 
               T1Y = VSUB(T1T, T1Q);
419
 
               ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
420
 
               ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
421
 
               T1U = VADD(T1Q, T1T);
422
 
               ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
423
 
               ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
424
 
          }
425
 
          /* Outputs 2, 6, 10, 14 and 18. */
          {
426
 
               V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
427
 
               T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
428
 
               T1E = VADD(T1C, T1D);
429
 
               T1F = VFNMS(LDK(KP250000000), T1E, T1B);
430
 
               T1q = VSUB(T1m, T1p);
431
 
               T1x = VSUB(T1t, T1w);
432
 
               T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
433
 
               T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
434
 
               ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
435
 
               T1J = VADD(T1G, T1F);
436
 
               ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
437
 
               ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
438
 
               T1H = VSUB(T1F, T1G);
439
 
               ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
440
 
               ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
441
 
          }
442
 
          /* Outputs 1, 3, 7, 9, 11, 13, 17 and 19. */
          {
443
 
               V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
444
 
               TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
445
 
               T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
446
 
               T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
447
 
               T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
448
 
               {
449
 
                    V TV, T12, Ts, Tt;
450
 
                    TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
451
 
                    T12 = VFNMS(LDK(KP250000000), T11, T10);
452
 
                    T13 = VSUB(TV, T12);
453
 
                    T1e = VADD(TV, T12);
454
 
                    Ts = VFNMS(LDK(KP250000000), Tr, T4);
455
 
                    Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
456
 
                    Tu = VSUB(Ts, Tt);
457
 
                    T1a = VADD(Tt, Ts);
458
 
               }
459
 
               {
460
 
                    V TS, T17, T1g, T1h;
461
 
                    TS = VSUB(Tu, TR);
462
 
                    T17 = VBYI(VSUB(T13, T16));
463
 
                    ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
464
 
                    ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
465
 
                    T1g = VADD(T1a, T1b);
466
 
                    T1h = VBYI(VSUB(T1e, T1d));
467
 
                    ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
468
 
                    ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
469
 
               }
470
 
               {
471
 
                    V T18, T19, T1c, T1f;
472
 
                    T18 = VADD(Tu, TR);
473
 
                    T19 = VBYI(VADD(T16, T13));
474
 
                    ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
475
 
                    ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
476
 
                    T1c = VSUB(T1a, T1b);
477
 
                    T1f = VBYI(VADD(T1d, T1e));
478
 
                    ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
479
 
                    ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
480
 
               }
481
 
          }
482
 
     }
483
 
}
484
 
 
485
 
/*
 * Twiddle instruction table referenced by `desc`: one VTW(0, k) entry for
 * each k = 1..19, closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably entry k describes the twiddle applied to input k -- confirm.
 */
static const tw_instr twinstr[] = {
486
 
     VTW(0, 1),
487
 
     VTW(0, 2),
488
 
     VTW(0, 3),
489
 
     VTW(0, 4),
490
 
     VTW(0, 5),
491
 
     VTW(0, 6),
492
 
     VTW(0, 7),
493
 
     VTW(0, 8),
494
 
     VTW(0, 9),
495
 
     VTW(0, 10),
496
 
     VTW(0, 11),
497
 
     VTW(0, 12),
498
 
     VTW(0, 13),
499
 
     VTW(0, 14),
500
 
     VTW(0, 15),
501
 
     VTW(0, 16),
502
 
     VTW(0, 17),
503
 
     VTW(0, 18),
504
 
     VTW(0, 19),
505
 
     {TW_NEXT, VL, 0}
506
 
};
507
 
 
508
 
/*
 * Descriptor handed to the planner together with the kernel. The 20 matches
 * the generator's -n 20, and {111, 50, 12, 0} repeats the addition /
 * multiplication / fused multiply-add counts stated in the comment that
 * precedes this variant's kernel.
 * NOTE(review): the meaning of the trailing zero fields is set by ct_desc,
 * which is declared outside this file -- confirm before changing them.
 */
static const ct_desc desc = { 20, "t2bv_20", twinstr, &GENUS, {111, 50, 12, 0}, 0, 0, 0 };
509
 
 
510
 
/*
 * Registration entry point: hands the t2bv_20 kernel and its descriptor
 * `desc` to planner `p` through X(kdft_dit_register).
 */
void X(codelet_t2bv_20) (planner *p) {
511
 
     X(kdft_dit_register) (p, t2bv_20, &desc);
512
 
}
513
 
#endif                          /* HAVE_FMA */