~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to rdft/simd/common/hc2cbdftv_20.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 
3
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 
4
 *
 
5
 * This program is free software; you can redistribute it and/or modify
 
6
 * it under the terms of the GNU General Public License as published by
 
7
 * the Free Software Foundation; either version 2 of the License, or
 
8
 * (at your option) any later version.
 
9
 *
 
10
 * This program is distributed in the hope that it will be useful,
 
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
13
 * GNU General Public License for more details.
 
14
 *
 
15
 * You should have received a copy of the GNU General Public License
 
16
 * along with this program; if not, write to the Free Software
 
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
18
 *
 
19
 */
 
20
 
 
21
/* This file was automatically generated --- DO NOT EDIT */
 
22
/* Generated on Wed Jul 27 06:20:05 EDT 2011 */
 
23
 
 
24
#include "codelet-rdft.h"
 
25
 
 
26
#ifdef HAVE_FMA
 
27
 
 
28
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
 
29
 
 
30
/*
 
31
 * This function contains 143 FP additions, 108 FP multiplications,
 
32
 * (or, 77 additions, 42 multiplications, 66 fused multiply/add),
 
33
 * 134 stack variables, 4 constants, and 40 memory accesses
 
34
 */
 
35
#include "hc2cbv.h"
 
36
 
 
37
/*
 * hc2cbdftv_20 -- HAVE_FMA variant.
 *
 * Machine-generated SIMD codelet (see the "Generated by" comment above:
 * genfft was run with -n 20 -dif -sign 1 -fma).  The statements below are
 * the generator's output, reproduced verbatim and in the original order;
 * only the diff-viewer line-number residue that had been interleaved with
 * the code was removed so that the block is valid C again.
 *
 * Parameters:
 *   Rp, Rm  arrays that are both read and overwritten.  Each iteration
 *           loads the ten vectors Rp[WS(rs, 0..9)] and Rm[WS(rs, 0..9)]
 *           and stores ten vectors back to each of them.
 *   Ip, Im  advanced in the loop header in step with Rp / Rm but never
 *           dereferenced in this body.
 *   W       twiddle table.  It is first offset by
 *           (mb - 1) * ((TWVL / VL) * 38), then advanced by TWVL * 38 per
 *           iteration; 19 vectors are read from it per iteration, at
 *           W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 36].
 *   rs      stride used through WS(rs, k) to address the k-th element.
 *   mb, me  the loop runs for m = mb, mb + VL, ... while m < me.
 *   ms      per-iteration stride: Rp/Ip move forward by VL * ms, Rm/Im
 *           move backward by VL * ms.
 *
 * NOTE(review): V, LD, ST, LDW, LDK, DVK, VFMACONJ, VFNMSCONJ, VFMSCONJ,
 * VFMAI, VFNMSI, VZMUL, VZMULI, WS, VLEAVE and MAKE_VOLATILE_STRIDE are
 * defined outside this file (hc2cbv.h / SIMD support headers); their exact
 * semantics are not visible here.  Judging from the non-FMA variant in the
 * #else branch (which spells the same step as VCONJ followed by VADD/VSUB),
 * VFMACONJ(b, a) presumably computes a + conj(b) -- confirm against the
 * SIMD headers.
 */
static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* Numerically: sqrt(5)/4, sin(2*pi/5), 1/4 and (sqrt(5)-1)/2. */
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(rs)) {
               /* Values that must outlive the inner scopes below. */
               V T1M, T1T, T4, TF, T12, Te, T16, Ts, Tb, TN, TA, TG, TU, T1Y, T11;
               V T1e, T29, T21, T15, Th, T13, Tp;
               {
                    /* Stage 1: load the 10 Rp and 10 Rm vectors and combine
                     * each Rp/Rm pair, then form first-level sums and
                     * differences. */
                    V TS, TT, Tf, T10, T20, T1Z, TX, Tg, Tn, To, T2, T3, TD, TE, T8;
                    V TV, T7, TZ, Tz, T9, Tu, Tv, T5, T6, Tx, Ty, Tc, Td, Tq, Tr;
                    V TY, Ta, TW, Tw;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                    TD = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                    TE = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    Tx = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                    Ty = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                    T8 = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    TS = VFMACONJ(T3, T2);
                    T4 = VFNMSCONJ(T3, T2);
                    TT = VFMACONJ(TE, TD);
                    TF = VFNMSCONJ(TE, TD);
                    TV = VFMACONJ(T6, T5);
                    T7 = VFNMSCONJ(T6, T5);
                    TZ = VFMACONJ(Ty, Tx);
                    Tz = VFNMSCONJ(Ty, Tx);
                    T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    Tu = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                    Tv = LD(&(Rm[0]), -ms, &(Rm[0]));
                    Tc = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Tq = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                    Tr = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                    Tf = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    TY = VFMACONJ(T9, T8);
                    Ta = VFMSCONJ(T9, T8);
                    TW = VFMACONJ(Tv, Tu);
                    Tw = VFNMSCONJ(Tv, Tu);
                    T12 = VFMACONJ(Td, Tc);
                    Te = VFNMSCONJ(Td, Tc);
                    T16 = VFMACONJ(Tr, Tq);
                    Ts = VFMSCONJ(Tr, Tq);
                    T10 = VSUB(TY, TZ);
                    T20 = VADD(TY, TZ);
                    Tb = VADD(T7, Ta);
                    TN = VSUB(T7, Ta);
                    T1Z = VADD(TV, TW);
                    TX = VSUB(TV, TW);
                    TA = VSUB(Tw, Tz);
                    TG = VADD(Tw, Tz);
                    Tg = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                    To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                    TU = VSUB(TS, TT);
                    T1Y = VADD(TS, TT);
                    T11 = VADD(TX, T10);
                    T1e = VSUB(TX, T10);
                    T29 = VSUB(T1Z, T20);
                    T21 = VADD(T1Z, T20);
                    T15 = VFMACONJ(Tg, Tf);
                    Th = VFMSCONJ(Tg, Tf);
                    T13 = VFMACONJ(To, Tn);
                    Tp = VFMSCONJ(To, Tn);
               }
               {
                    V T1S, T2B, T1W, T1I, T2q, T2w, T2i, T2c, T1C, T1K, T1s, T1g, T1, T2t, T1v;
                    V T1Q, T2A, T1q, T2m, TC, T1w, TP, T1x, T2f, T2r, T2g, T1E, T1D, T2y, T2x;
                    V T1i, T1h, T2D, T2C, T2s, T1t, T1u, T1y, T2u, TQ, T2d, T2e, T1U, T1L, T2j;
                    V T2k;
                    {
                         /* Stage 2: remaining butterflies, interleaved with
                          * the twiddle loads (LDW) and the twiddle multiplies
                          * (VZMUL / VZMULI). */
                         V T1R, T1F, T1V, T1o, TO, Tl, T1d, T2a, T1l, TB, TK, T1G, Tk, T1b, T19;
                         V T27, T25, T1H, TJ, T17, T23, TM, Ti, T14, T22, Tt, TH, Tj, T18, T24;
                         V TI, T2b, T2p, T1X, T2v, T2h, T2n, T1B, T1f, T28, T2o, T1a, TR, T1J, T1r;
                         V T1z, T26, Tm, TL, T1O, T1m, T1j, T2z, T1N, T1p, T1P, T2l, T1c, T1A, T1n;
                         V T1k;
                         T1R = LDW(&(W[TWVL * 18]));
                         T17 = VSUB(T15, T16);
                         T23 = VADD(T15, T16);
                         TM = VSUB(Te, Th);
                         Ti = VADD(Te, Th);
                         T14 = VSUB(T12, T13);
                         T22 = VADD(T12, T13);
                         Tt = VSUB(Tp, Ts);
                         TH = VADD(Tp, Ts);
                         T1F = LDW(&(W[TWVL * 28]));
                         T1V = LDW(&(W[TWVL * 8]));
                         T1o = VFMA(LDK(KP618033988), TM, TN);
                         TO = VFNMS(LDK(KP618033988), TN, TM);
                         Tj = VADD(Tb, Ti);
                         Tl = VSUB(Tb, Ti);
                         T18 = VADD(T14, T17);
                         T1d = VSUB(T14, T17);
                         T24 = VADD(T22, T23);
                         T2a = VSUB(T22, T23);
                         T1l = VFMA(LDK(KP618033988), Tt, TA);
                         TB = VFNMS(LDK(KP618033988), TA, Tt);
                         TI = VADD(TG, TH);
                         TK = VSUB(TG, TH);
                         T1G = VADD(T4, Tj);
                         Tk = VFNMS(LDK(KP250000000), Tj, T4);
                         T1b = VSUB(T11, T18);
                         T19 = VADD(T11, T18);
                         T27 = VSUB(T21, T24);
                         T25 = VADD(T21, T24);
                         T1H = VADD(TF, TI);
                         TJ = VFNMS(LDK(KP250000000), TI, TF);
                         T2b = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2a, T29));
                         T2p = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T29, T2a));
                         T1X = LDW(&(W[TWVL * 6]));
                         T1S = VZMUL(T1R, VADD(TU, T19));
                         T2v = LDW(&(W[TWVL * 22]));
                         T2B = VADD(T1Y, T25);
                         T26 = VFNMS(LDK(KP250000000), T25, T1Y);
                         T1W = VZMULI(T1V, VFMAI(T1H, T1G));
                         T1I = VZMULI(T1F, VFNMSI(T1H, T1G));
                         T2h = LDW(&(W[TWVL * 30]));
                         T2n = LDW(&(W[TWVL * 14]));
                         T1B = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1d, T1e));
                         T1f = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1e, T1d));
                         T28 = VFMA(LDK(KP559016994), T27, T26);
                         T2o = VFNMS(LDK(KP559016994), T27, T26);
                         T1a = VFNMS(LDK(KP250000000), T19, TU);
                         TR = LDW(&(W[TWVL * 2]));
                         T1J = LDW(&(W[TWVL * 26]));
                         T1r = LDW(&(W[TWVL * 34]));
                         T1z = LDW(&(W[TWVL * 10]));
                         T1k = VFMA(LDK(KP559016994), Tl, Tk);
                         Tm = VFNMS(LDK(KP559016994), Tl, Tk);
                         T2q = VZMUL(T2n, VFMAI(T2p, T2o));
                         T2w = VZMUL(T2v, VFNMSI(T2p, T2o));
                         T2i = VZMUL(T2h, VFMAI(T2b, T28));
                         T2c = VZMUL(T1X, VFNMSI(T2b, T28));
                         T1c = VFNMS(LDK(KP559016994), T1b, T1a);
                         T1A = VFMA(LDK(KP559016994), T1b, T1a);
                         TL = VFNMS(LDK(KP559016994), TK, TJ);
                         T1n = VFMA(LDK(KP559016994), TK, TJ);
                         T1O = VFMA(LDK(KP951056516), T1l, T1k);
                         T1m = VFNMS(LDK(KP951056516), T1l, T1k);
                         T1j = LDW(&(W[TWVL * 36]));
                         T2z = LDW(&(W[0]));
                         T1N = LDW(&(W[TWVL * 20]));
                         T1C = VZMUL(T1z, VFMAI(T1B, T1A));
                         T1K = VZMUL(T1J, VFNMSI(T1B, T1A));
                         T1s = VZMUL(T1r, VFMAI(T1f, T1c));
                         T1g = VZMUL(TR, VFNMSI(T1f, T1c));
                         T1p = VFMA(LDK(KP951056516), T1o, T1n);
                         T1P = VFNMS(LDK(KP951056516), T1o, T1n);
                         T2l = LDW(&(W[TWVL * 16]));
                         T1 = LDW(&(W[TWVL * 4]));
                         T2t = LDW(&(W[TWVL * 24]));
                         T1v = LDW(&(W[TWVL * 12]));
                         T1Q = VZMULI(T1N, VFNMSI(T1P, T1O));
                         T2A = VZMULI(T2z, VFMAI(T1p, T1m));
                         T1q = VZMULI(T1j, VFNMSI(T1p, T1m));
                         T2m = VZMULI(T2l, VFMAI(T1P, T1O));
                         TC = VFMA(LDK(KP951056516), TB, Tm);
                         T1w = VFNMS(LDK(KP951056516), TB, Tm);
                         TP = VFNMS(LDK(KP951056516), TO, TL);
                         T1x = VFMA(LDK(KP951056516), TO, TL);
                         T2f = LDW(&(W[TWVL * 32]));
                    }
                    /* Stage 3: pair up the twiddled values.  Each pair (a, b)
                     * yields a + b, stored to Rp, and VCONJ(b - a), stored
                     * to Rm. */
                    T2D = VCONJ(VSUB(T2B, T2A));
                    T2C = VADD(T2A, T2B);
                    T2s = VCONJ(VSUB(T2q, T2m));
                    T2r = VADD(T2m, T2q);
                    T1t = VADD(T1q, T1s);
                    T1u = VCONJ(VSUB(T1s, T1q));
                    T1y = VZMULI(T1v, VFNMSI(T1x, T1w));
                    T2u = VZMULI(T2t, VFMAI(T1x, T1w));
                    TQ = VZMULI(T1, VFNMSI(TP, TC));
                    T2g = VZMULI(T2f, VFMAI(TP, TC));
                    ST(&(Rm[0]), T2D, -ms, &(Rm[0]));
                    ST(&(Rp[0]), T2C, ms, &(Rp[0]));
                    ST(&(Rm[WS(rs, 4)]), T2s, -ms, &(Rm[0]));
                    ST(&(Rm[WS(rs, 9)]), T1u, -ms, &(Rm[WS(rs, 1)]));
                    T1E = VCONJ(VSUB(T1C, T1y));
                    T1D = VADD(T1y, T1C);
                    T2y = VCONJ(VSUB(T2w, T2u));
                    T2x = VADD(T2u, T2w);
                    T1i = VCONJ(VSUB(T1g, TQ));
                    T1h = VADD(TQ, T1g);
                    ST(&(Rp[WS(rs, 9)]), T1t, ms, &(Rp[WS(rs, 1)]));
                    T1L = VADD(T1I, T1K);
                    T1M = VCONJ(VSUB(T1K, T1I));
                    ST(&(Rp[WS(rs, 3)]), T1D, ms, &(Rp[WS(rs, 1)]));
                    ST(&(Rm[WS(rs, 6)]), T2y, -ms, &(Rm[0]));
                    ST(&(Rp[WS(rs, 6)]), T2x, ms, &(Rp[0]));
                    ST(&(Rm[WS(rs, 1)]), T1i, -ms, &(Rm[WS(rs, 1)]));
                    ST(&(Rp[WS(rs, 1)]), T1h, ms, &(Rp[WS(rs, 1)]));
                    T2d = VADD(T1W, T2c);
                    T2e = VCONJ(VSUB(T2c, T1W));
                    ST(&(Rm[WS(rs, 3)]), T1E, -ms, &(Rm[WS(rs, 1)]));
                    ST(&(Rp[WS(rs, 7)]), T1L, ms, &(Rp[WS(rs, 1)]));
                    T1U = VCONJ(VSUB(T1S, T1Q));
                    T1T = VADD(T1Q, T1S);
                    T2j = VADD(T2g, T2i);
                    T2k = VCONJ(VSUB(T2i, T2g));
                    ST(&(Rp[WS(rs, 2)]), T2d, ms, &(Rp[0]));
                    ST(&(Rp[WS(rs, 4)]), T2r, ms, &(Rp[0]));
                    ST(&(Rm[WS(rs, 5)]), T1U, -ms, &(Rm[WS(rs, 1)]));
                    ST(&(Rm[WS(rs, 2)]), T2e, -ms, &(Rm[0]));
                    ST(&(Rp[WS(rs, 8)]), T2j, ms, &(Rp[0]));
                    ST(&(Rm[WS(rs, 8)]), T2k, -ms, &(Rm[0]));
               }
               /* T1T and T1M were computed inside the scope above but are
                * declared at loop scope, so the generator emits their stores
                * last. */
               ST(&(Rp[WS(rs, 5)]), T1T, ms, &(Rp[WS(rs, 1)]));
               ST(&(Rm[WS(rs, 7)]), T1M, -ms, &(Rm[WS(rs, 1)]));
          }
     }
     VLEAVE();
}
 
253
 
 
254
static const tw_instr twinstr[] = {
 
255
     VTW(1, 1),
 
256
     VTW(1, 2),
 
257
     VTW(1, 3),
 
258
     VTW(1, 4),
 
259
     VTW(1, 5),
 
260
     VTW(1, 6),
 
261
     VTW(1, 7),
 
262
     VTW(1, 8),
 
263
     VTW(1, 9),
 
264
     VTW(1, 10),
 
265
     VTW(1, 11),
 
266
     VTW(1, 12),
 
267
     VTW(1, 13),
 
268
     VTW(1, 14),
 
269
     VTW(1, 15),
 
270
     VTW(1, 16),
 
271
     VTW(1, 17),
 
272
     VTW(1, 18),
 
273
     VTW(1, 19),
 
274
     {TW_NEXT, VL, 0}
 
275
};
 
276
 
 
277
/*
 * Codelet descriptor passed to X(khc2c_register) (HAVE_FMA variant).
 * The leading 20 equals the "-n 20" given to the generator, and the
 * trailing {77, 42, 66, 0} repeats the operation counts quoted in the
 * header comment of this variant (77 additions, 42 multiplications,
 * 66 fused multiply/add).
 */
static const hc2c_desc desc = {
     20,
     XSIMD_STRING("hc2cbdftv_20"),
     twinstr,
     &GENUS,
     {77, 42, 66, 0}
};
 
278
 
 
279
/*
 * Registration entry point (HAVE_FMA variant): hands the hc2cbdftv_20
 * kernel and its descriptor to the planner `p` through X(khc2c_register),
 * tagged HC2C_VIA_DFT.
 *
 * The diff-viewer line-number residue that had been interleaved with the
 * body was removed so the function is valid C again.
 */
void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
}
 
282
#else                           /* HAVE_FMA */
 
283
 
 
284
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
 
285
 
 
286
/*
 
287
 * This function contains 143 FP additions, 62 FP multiplications,
 
288
 * (or, 131 additions, 50 multiplications, 12 fused multiply/add),
 
289
 * 114 stack variables, 4 constants, and 40 memory accesses
 
290
 */
 
291
#include "hc2cbv.h"
 
292
 
 
293
/*
 * hc2cbdftv_20 -- non-FMA variant (compiled when HAVE_FMA is not defined).
 *
 * Machine-generated SIMD codelet (see the "Generated by" comment above:
 * genfft was run with -n 20 -dif -sign 1, without -fma).  The statements
 * below are the generator's output, reproduced verbatim and in the
 * original order; only the diff-viewer line-number residue that had been
 * interleaved with the code was removed so that the block is valid C again.
 *
 * Parameters:
 *   Rp, Rm  arrays that are both read and overwritten.  Each iteration
 *           loads the ten vectors Rp[WS(rs, 0..9)] and Rm[WS(rs, 0..9)]
 *           and stores ten vectors back to each of them.
 *   Ip, Im  advanced in the loop header in step with Rp / Rm but never
 *           dereferenced in this body.
 *   W       twiddle table.  It is first offset by
 *           (mb - 1) * ((TWVL / VL) * 38), then advanced by TWVL * 38 per
 *           iteration; 19 vectors are read from it per iteration, at
 *           W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 36].
 *   rs      stride used through WS(rs, k) to address the k-th element.
 *   mb, me  the loop runs for m = mb, mb + VL, ... while m < me.
 *   ms      per-iteration stride: Rp/Ip move forward by VL * ms, Rm/Im
 *           move backward by VL * ms.
 *
 * NOTE(review): V, LD, ST, LDW, LDK, DVK, VCONJ, VBYI, VFMA, VFNMS, VZMUL,
 * VZMULI, WS, VLEAVE and MAKE_VOLATILE_STRIDE are defined outside this file
 * (hc2cbv.h / SIMD support headers); their exact semantics are not visible
 * here.
 */
static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* Numerically: 1/4, sqrt(5)/4, sin(2*pi/5) and sin(pi/5). */
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(rs)) {
               /* Values that must outlive the first inner scope. */
               V TK, T1v, TY, T1x, T1j, T2f, TS, TT, TO, TU, T5, To, Tp, Tq, T2a;
               V T2d, T2g, T2k, T2j, T1k, T1l, T18, T1m, T1f;
               {
                    /* Stage 1: load the 10 Rp and 10 Rm vectors, conjugate
                     * every Rm value (VCONJ), and form the first levels of
                     * sums and differences. */
                    V T2, TP, T4, TR, TI, T1d, T9, T12, Td, T15, TE, T1a, Tv, T13, Tm;
                    V T1c, Tz, T16, Ti, T19, T3, TQ, TH, TG, TF, T6, T8, T7, Tc, Tb;
                    V Ta, TD, TC, TB, Ts, Tu, Tt, Tl, Tk, Tj, Tw, Ty, Tx, Tf, Th;
                    V Tg, TA, TJ, TW, TX, T1h, T1i, TM, TN, Te, Tn, T28, T29, T2b, T2c;
                    V T14, T17, T1b, T1e;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    TP = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                    T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                    T4 = VCONJ(T3);
                    TQ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                    TR = VCONJ(TQ);
                    TH = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                    TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                    TG = VCONJ(TF);
                    TI = VSUB(TG, TH);
                    T1d = VADD(TG, TH);
                    T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    T8 = VCONJ(T7);
                    T9 = VSUB(T6, T8);
                    T12 = VADD(T6, T8);
                    Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    Tb = VCONJ(Ta);
                    Td = VSUB(Tb, Tc);
                    T15 = VADD(Tb, Tc);
                    TD = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                    TB = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                    TC = VCONJ(TB);
                    TE = VSUB(TC, TD);
                    T1a = VADD(TC, TD);
                    Ts = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                    Tt = LD(&(Rm[0]), -ms, &(Rm[0]));
                    Tu = VCONJ(Tt);
                    Tv = VSUB(Ts, Tu);
                    T13 = VADD(Ts, Tu);
                    Tl = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    Tj = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    Tk = VCONJ(Tj);
                    Tm = VSUB(Tk, Tl);
                    T1c = VADD(Tk, Tl);
                    Tw = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                    Tx = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                    Ty = VCONJ(Tx);
                    Tz = VSUB(Tw, Ty);
                    T16 = VADD(Tw, Ty);
                    Tf = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                    Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Th = VCONJ(Tg);
                    Ti = VSUB(Tf, Th);
                    T19 = VADD(Tf, Th);
                    TA = VSUB(Tv, Tz);
                    TJ = VSUB(TE, TI);
                    TK = VFNMS(LDK(KP951056516), TJ, VMUL(LDK(KP587785252), TA));
                    T1v = VFMA(LDK(KP951056516), TA, VMUL(LDK(KP587785252), TJ));
                    TW = VSUB(T9, Td);
                    TX = VSUB(Ti, Tm);
                    TY = VFNMS(LDK(KP951056516), TX, VMUL(LDK(KP587785252), TW));
                    T1x = VFMA(LDK(KP951056516), TW, VMUL(LDK(KP587785252), TX));
                    T1h = VADD(T2, T4);
                    T1i = VADD(TP, TR);
                    T1j = VSUB(T1h, T1i);
                    T2f = VADD(T1h, T1i);
                    TS = VSUB(TP, TR);
                    TM = VADD(Tv, Tz);
                    TN = VADD(TE, TI);
                    TT = VADD(TM, TN);
                    TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
                    TU = VFNMS(LDK(KP250000000), TT, TS);
                    T5 = VSUB(T2, T4);
                    Te = VADD(T9, Td);
                    Tn = VADD(Ti, Tm);
                    To = VADD(Te, Tn);
                    Tp = VFNMS(LDK(KP250000000), To, T5);
                    Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
                    T28 = VADD(T12, T13);
                    T29 = VADD(T15, T16);
                    T2a = VADD(T28, T29);
                    T2b = VADD(T19, T1a);
                    T2c = VADD(T1c, T1d);
                    T2d = VADD(T2b, T2c);
                    T2g = VADD(T2a, T2d);
                    T2k = VSUB(T2b, T2c);
                    T2j = VSUB(T28, T29);
                    T14 = VSUB(T12, T13);
                    T17 = VSUB(T15, T16);
                    T1k = VADD(T14, T17);
                    T1b = VSUB(T19, T1a);
                    T1e = VSUB(T1c, T1d);
                    T1l = VADD(T1b, T1e);
                    T18 = VSUB(T14, T17);
                    T1m = VADD(T1k, T1l);
                    T1f = VSUB(T1b, T1e);
               }
               {
                    V T2L, T22, T1S, T26, T2m, T2G, T2s, T2A, T1q, T1U, T1C, T1M, T10, T2E, T1I;
                    V T2q, T1A, T2K, T20, T2w, T21, T1Q, T1R, T1P, T25, T1r, T1s, T2C, T2N, T1N;
                    V T2H, T2I, T2M, T1E, T1D, T1O, T1V, T2n, T2B, T24, T2o, T2t, T2u, T23, T1W;
                    /* Stage 2: finish the butterflies; each result is
                     * multiplied by a twiddle vector loaded with LDW just
                     * before use (VZMUL / VZMULI). */
                    T2L = VADD(T2f, T2g);
                    T21 = LDW(&(W[TWVL * 18]));
                    T22 = VZMUL(T21, VADD(T1j, T1m));
                    T1Q = VADD(T5, To);
                    T1R = VBYI(VADD(TS, TT));
                    T1P = LDW(&(W[TWVL * 28]));
                    T1S = VZMULI(T1P, VSUB(T1Q, T1R));
                    T25 = LDW(&(W[TWVL * 8]));
                    T26 = VZMULI(T25, VADD(T1Q, T1R));
                    {
                         V T2l, T2z, T2i, T2y, T2e, T2h, T27, T2F, T2r, T2x, T1g, T1K, T1p, T1L, T1n;
                         V T1o, T11, T1T, T1B, T1J, TL, T1G, TZ, T1H, Tr, TV, T1, T2D, T1F, T2p;
                         V T1w, T1Y, T1z, T1Z, T1u, T1y, T1t, T2J, T1X, T2v;
                         T2l = VBYI(VFMA(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2k)));
                         T2z = VBYI(VFNMS(LDK(KP951056516), T2k, VMUL(LDK(KP587785252), T2j)));
                         T2e = VMUL(LDK(KP559016994), VSUB(T2a, T2d));
                         T2h = VFNMS(LDK(KP250000000), T2g, T2f);
                         T2i = VADD(T2e, T2h);
                         T2y = VSUB(T2h, T2e);
                         T27 = LDW(&(W[TWVL * 6]));
                         T2m = VZMUL(T27, VSUB(T2i, T2l));
                         T2F = LDW(&(W[TWVL * 22]));
                         T2G = VZMUL(T2F, VADD(T2z, T2y));
                         T2r = LDW(&(W[TWVL * 30]));
                         T2s = VZMUL(T2r, VADD(T2l, T2i));
                         T2x = LDW(&(W[TWVL * 14]));
                         T2A = VZMUL(T2x, VSUB(T2y, T2z));
                         T1g = VBYI(VFNMS(LDK(KP951056516), T1f, VMUL(LDK(KP587785252), T18)));
                         T1K = VBYI(VFMA(LDK(KP951056516), T18, VMUL(LDK(KP587785252), T1f)));
                         T1n = VFNMS(LDK(KP250000000), T1m, T1j);
                         T1o = VMUL(LDK(KP559016994), VSUB(T1k, T1l));
                         T1p = VSUB(T1n, T1o);
                         T1L = VADD(T1o, T1n);
                         T11 = LDW(&(W[TWVL * 2]));
                         T1q = VZMUL(T11, VADD(T1g, T1p));
                         T1T = LDW(&(W[TWVL * 26]));
                         T1U = VZMUL(T1T, VSUB(T1L, T1K));
                         T1B = LDW(&(W[TWVL * 34]));
                         T1C = VZMUL(T1B, VSUB(T1p, T1g));
                         T1J = LDW(&(W[TWVL * 10]));
                         T1M = VZMUL(T1J, VADD(T1K, T1L));
                         Tr = VSUB(Tp, Tq);
                         TL = VSUB(Tr, TK);
                         T1G = VADD(Tr, TK);
                         TV = VSUB(TO, TU);
                         TZ = VBYI(VSUB(TV, TY));
                         T1H = VBYI(VADD(TY, TV));
                         T1 = LDW(&(W[TWVL * 4]));
                         T10 = VZMULI(T1, VADD(TL, TZ));
                         T2D = LDW(&(W[TWVL * 24]));
                         T2E = VZMULI(T2D, VSUB(T1G, T1H));
                         T1F = LDW(&(W[TWVL * 12]));
                         T1I = VZMULI(T1F, VADD(T1G, T1H));
                         T2p = LDW(&(W[TWVL * 32]));
                         T2q = VZMULI(T2p, VSUB(TL, TZ));
                         T1u = VADD(Tq, Tp);
                         T1w = VSUB(T1u, T1v);
                         T1Y = VADD(T1u, T1v);
                         T1y = VADD(TO, TU);
                         T1z = VBYI(VADD(T1x, T1y));
                         T1Z = VBYI(VSUB(T1y, T1x));
                         T1t = LDW(&(W[TWVL * 36]));
                         T1A = VZMULI(T1t, VSUB(T1w, T1z));
                         T2J = LDW(&(W[0]));
                         T2K = VZMULI(T2J, VADD(T1w, T1z));
                         T1X = LDW(&(W[TWVL * 20]));
                         T20 = VZMULI(T1X, VSUB(T1Y, T1Z));
                         T2v = LDW(&(W[TWVL * 16]));
                         T2w = VZMULI(T2v, VADD(T1Y, T1Z));
                    }
                    /* Stage 3: pair up the twiddled values.  Each pair (a, b)
                     * yields a + b, stored to Rp, and VCONJ(b - a), stored
                     * to Rm; every value is stored right after it is
                     * computed. */
                    T1r = VADD(T10, T1q);
                    ST(&(Rp[WS(rs, 1)]), T1r, ms, &(Rp[WS(rs, 1)]));
                    T1s = VCONJ(VSUB(T1q, T10));
                    ST(&(Rm[WS(rs, 1)]), T1s, -ms, &(Rm[WS(rs, 1)]));
                    T2C = VCONJ(VSUB(T2A, T2w));
                    ST(&(Rm[WS(rs, 4)]), T2C, -ms, &(Rm[0]));
                    T2N = VCONJ(VSUB(T2L, T2K));
                    ST(&(Rm[0]), T2N, -ms, &(Rm[0]));
                    T1N = VADD(T1I, T1M);
                    ST(&(Rp[WS(rs, 3)]), T1N, ms, &(Rp[WS(rs, 1)]));
                    T2H = VADD(T2E, T2G);
                    ST(&(Rp[WS(rs, 6)]), T2H, ms, &(Rp[0]));
                    T2I = VCONJ(VSUB(T2G, T2E));
                    ST(&(Rm[WS(rs, 6)]), T2I, -ms, &(Rm[0]));
                    T2M = VADD(T2K, T2L);
                    ST(&(Rp[0]), T2M, ms, &(Rp[0]));
                    T1E = VCONJ(VSUB(T1C, T1A));
                    ST(&(Rm[WS(rs, 9)]), T1E, -ms, &(Rm[WS(rs, 1)]));
                    T1D = VADD(T1A, T1C);
                    ST(&(Rp[WS(rs, 9)]), T1D, ms, &(Rp[WS(rs, 1)]));
                    T1O = VCONJ(VSUB(T1M, T1I));
                    ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
                    T1V = VADD(T1S, T1U);
                    ST(&(Rp[WS(rs, 7)]), T1V, ms, &(Rp[WS(rs, 1)]));
                    T2n = VADD(T26, T2m);
                    ST(&(Rp[WS(rs, 2)]), T2n, ms, &(Rp[0]));
                    T2B = VADD(T2w, T2A);
                    ST(&(Rp[WS(rs, 4)]), T2B, ms, &(Rp[0]));
                    T24 = VCONJ(VSUB(T22, T20));
                    ST(&(Rm[WS(rs, 5)]), T24, -ms, &(Rm[WS(rs, 1)]));
                    T2o = VCONJ(VSUB(T2m, T26));
                    ST(&(Rm[WS(rs, 2)]), T2o, -ms, &(Rm[0]));
                    T2t = VADD(T2q, T2s);
                    ST(&(Rp[WS(rs, 8)]), T2t, ms, &(Rp[0]));
                    T2u = VCONJ(VSUB(T2s, T2q));
                    ST(&(Rm[WS(rs, 8)]), T2u, -ms, &(Rm[0]));
                    T23 = VADD(T20, T22);
                    ST(&(Rp[WS(rs, 5)]), T23, ms, &(Rp[WS(rs, 1)]));
                    T1W = VCONJ(VSUB(T1U, T1S));
                    ST(&(Rm[WS(rs, 7)]), T1W, -ms, &(Rm[WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}
 
518
 
 
519
static const tw_instr twinstr[] = {
 
520
     VTW(1, 1),
 
521
     VTW(1, 2),
 
522
     VTW(1, 3),
 
523
     VTW(1, 4),
 
524
     VTW(1, 5),
 
525
     VTW(1, 6),
 
526
     VTW(1, 7),
 
527
     VTW(1, 8),
 
528
     VTW(1, 9),
 
529
     VTW(1, 10),
 
530
     VTW(1, 11),
 
531
     VTW(1, 12),
 
532
     VTW(1, 13),
 
533
     VTW(1, 14),
 
534
     VTW(1, 15),
 
535
     VTW(1, 16),
 
536
     VTW(1, 17),
 
537
     VTW(1, 18),
 
538
     VTW(1, 19),
 
539
     {TW_NEXT, VL, 0}
 
540
};
 
541
 
 
542
/*
 * Codelet descriptor passed to X(khc2c_register) (non-FMA variant).
 * The leading 20 equals the "-n 20" given to the generator, and the
 * trailing {131, 50, 12, 0} repeats the operation counts quoted in the
 * header comment of this variant (131 additions, 50 multiplications,
 * 12 fused multiply/add).
 */
static const hc2c_desc desc = {
     20,
     XSIMD_STRING("hc2cbdftv_20"),
     twinstr,
     &GENUS,
     {131, 50, 12, 0}
};
 
543
 
 
544
/*
 * Registration entry point (non-FMA variant): hands the hc2cbdftv_20
 * kernel and its descriptor to the planner `p` through X(khc2c_register),
 * tagged HC2C_VIA_DFT.
 *
 * The diff-viewer line-number residue that had been interleaved with the
 * body was removed so the function is valid C again.
 */
void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
}
 
547
#endif                          /* HAVE_FMA */