2
* Copyright (c) 2003, 2007-8 Matteo Frigo
3
* Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* This file was automatically generated --- DO NOT EDIT */
22
/* Generated on Sun Jul 12 06:47:45 EDT 2009 */
24
#include "codelet-rdft.h"
28
/* Generated by: ../../../genfft/gen_hc2cdft_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
31
* This function contains 249 FP additions, 224 FP multiplications,
32
* (or, 119 additions, 94 multiplications, 130 fused multiply/add),
33
* 167 stack variables, 8 constants, and 64 memory accesses
37
/*
 * hc2cfdftv_32 (FMA-scheduled variant; its generator command line includes
 * "-fma" and "-n 32").
 *
 * Rp, Rm : arrays that are read with LD and overwritten with ST at the
 *          slots [0] and [WS(rs, 1..15)].  Rp advances by +VL*ms and Rm
 *          by -VL*ms on every loop iteration.
 * Ip, Im : only stepped in the loop header; they are never dereferenced
 *          in the lines visible in this excerpt.
 * W      : twiddle table read with LDW at multiples of TWVL; advanced by
 *          TWVL*62 on every loop iteration.
 * rs     : stride handed to WS(rs, k) to form the slot offsets.
 * mb, me : the loop runs m from mb while m < me, in steps of VL.
 * ms     : per-iteration stride (scaled by VL) and the stride argument
 *          passed to LD/ST (negated for Rm).
 *
 * NOTE(review): the declaration of m and the function's braces are not
 * present in this excerpt.
 */
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
39
/*
 * Vector constants declared through DVK and read back with LDK.
 * Numerically: KP831469612 = cos(3*pi/16), KP980785280 = cos(pi/16),
 * KP668178637 = tan(3*pi/16), KP198912367 = tan(pi/16),
 * KP500000000 = 1/2, KP923879532 = cos(pi/8),
 * KP414213562 = tan(pi/8) = sqrt(2) - 1, KP707106781 = sqrt(2)/2.
 */
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
41
DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
42
DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
44
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
45
DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
46
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
48
/*
 * Main loop.  W is first offset by (mb - 1) * ((TWVL / VL) * 62) so that it
 * matches the starting index mb; each iteration then handles VL values of m
 * and steps all five pointers.  MAKE_VOLATILE_STRIDE(rs) is evaluated as
 * part of the increment expression (macro defined outside this file).
 */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) {
49
/*
 * Vector temporaries (type V) for the loop body.  The T-names are assigned
 * by the code generator and carry no meaning of their own.
 */
V T2m, T2b, T2c, T2d, T2v, T2r, T20, T2i, T2n, T2e, T2o, T2u, T2j, T2f, T2t;
50
V T2s, T2x, T2w, T2l, T2k, T2h, T2g;
52
V T41, T3B, T40, T3a, T2J, T27, T2y, Ts, T2C, T1X, T2B, T1Q, T3F, T3w, T4l;
53
V T49, T1b, T1s, T3c, TB, T1f, T3g, T44, T1l, T3k, T3o, T4b, T28, T14, T1d;
56
V T1V, T1E, T3A, Th, T3v, T47, T1J, T3q, T8, T38, T25, T39, T3z, Tq, T1O;
57
V T3r, T3, T7, T3u, T24, T22, T3t, T1I, Tn, T1G, To, Tm, T1K, Tl, T1N;
58
V Tp, T1L, TU, T3f, T3m, T13, T3e, T3n, T1i, TH, TI, T1k, TG, TF, T1c;
61
V T1x, T1y, T1U, T1B, T1S, T1C, T1A, T23, T21, T1z, T1, T2, T1T, T5, T6;
62
V T1R, T4, T1w, Ta, Tb, T1H, Te, Tf, Td, Tc, T1F, T9, T1D, Tj, Tk;
64
/*
 * Even-numbered slots (0, 2, 4, ..., 14).  For each slot k one vector is
 * loaded from Rp (stride argument ms) and one from Rm (stride argument -ms).
 * The pair (a from Rp, b from Rm) is combined twice:
 *   VZMULJ (w_lo, VFMACONJ (b, a))  with w_lo = W[TWVL * (4k - 2)]
 *   VZMULIJ(w_hi, VFNMSCONJ(b, a))  with w_hi = W[TWVL * (4k)]
 * Slot 0 is the exception: VFMACONJ(T2, T1) is used without a twiddle.
 * The loads and products of different slots are interleaved by the
 * generator's scheduler.
 *
 * NOTE(review): VFMACONJ, VFNMSCONJ, VZMULJ and VZMULIJ are macros defined
 * outside this file; presumably a + conj(b), a - conj(b) and multiplications
 * by a conjugated twiddle (with an extra factor of i for VZMULIJ) -- confirm
 * against the SIMD header.  T1T is used below but its assignment is not
 * present in this excerpt.
 */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
65
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
67
T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
68
T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
69
T1R = LDW(&(W[TWVL * 32]));
70
T4 = LDW(&(W[TWVL * 30]));
71
T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
72
T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
73
T3 = VFMACONJ(T2, T1);
74
T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
75
T1w = LDW(&(W[TWVL * 48]));
76
T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
77
T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
78
T7 = VZMULJ(T4, VFMACONJ(T6, T5));
79
T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
80
T1A = LDW(&(W[TWVL * 16]));
81
T23 = LDW(&(W[TWVL * 46]));
82
T21 = LDW(&(W[TWVL * 14]));
83
T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
84
Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
87
Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
88
T9 = LDW(&(W[TWVL * 6]));
89
T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
90
T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
91
T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
92
T1H = LDW(&(W[TWVL * 8]));
93
Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
94
Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
95
Td = LDW(&(W[TWVL * 38]));
96
Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
99
T1F = LDW(&(W[TWVL * 40]));
100
Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
101
T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
102
Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
103
Ti = LDW(&(W[TWVL * 54]));
104
Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
105
T1M = LDW(&(W[TWVL * 56]));
106
Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
107
T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
108
To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
109
Tm = LDW(&(W[TWVL * 22]));
110
T1K = LDW(&(W[TWVL * 24]));
111
Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
114
T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
116
/*
 * First add/subtract layer over the twiddled even-slot values.
 * NOTE(review): T3t and T3u are read here but assigned on lines that are
 * not present in this excerpt.
 */
T3v = VSUB(T3t, T3u);
117
T47 = VADD(T3u, T3t);
118
T1J = VSUB(T1G, T1I);
119
T3q = VADD(T1I, T1G);
120
Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
121
T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
124
T25 = VSUB(T22, T24);
125
T39 = VADD(T22, T24);
128
T1O = VSUB(T1L, T1N);
129
T3r = VADD(T1N, T1L);
131
/*
 * Temporaries for the odd-numbered slots (1, 3, 5, ..., 15) and for the
 * values that finish the even-slot half.
 */
V T10, T11, TZ, T1o, TY, T1r, TN, TO, TM, T19, TR, TS, TQ, T17, T26;
132
V Tr, T1W, T1P, T3s, T48, TW, TX, TP, T1a, TV, T1q, TT, T18, Ty, Tz;
133
V Tx, Tw, T1j, Tu, T12, T1p, Tv, Tt, T1h, TD, TA, TE, TC, T1e;
134
/* Slot 3 is fetched early; the LD base-pointer argument is the slot-1 address for odd slots. */
TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
135
TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
136
/*
 * Remaining combinations of the even-slot values: plain sums/differences,
 * then fused multiply-adds scaled by KP707106781 (sqrt(2)/2) and
 * KP414213562 (tan(pi/8)).
 * NOTE(review): T3A, T3z, T38, T26, Tr, T8, T1V and T1E are read here but
 * assigned on lines that are not present in this excerpt.
 */
T41 = VADD(T3A, T3z);
137
T3B = VSUB(T3z, T3A);
140
T1W = VADD(T1J, T1O);
141
T1P = VSUB(T1J, T1O);
142
T3s = VSUB(T3q, T3r);
143
T48 = VADD(T3q, T3r);
144
T40 = VADD(T38, T39);
145
T3a = VSUB(T38, T39);
146
T2J = VFNMS(LDK(KP707106781), T26, T25);
147
T27 = VFMA(LDK(KP707106781), T26, T25);
148
T2y = VFMA(LDK(KP707106781), Tr, T8);
149
Ts = VFNMS(LDK(KP707106781), Tr, T8);
150
T2C = VFMA(LDK(KP707106781), T1W, T1V);
151
T1X = VFNMS(LDK(KP707106781), T1W, T1V);
152
T2B = VFMA(LDK(KP707106781), T1P, T1E);
153
T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
154
T3F = VFMA(LDK(KP414213562), T3s, T3v);
155
T3w = VFNMS(LDK(KP414213562), T3v, T3s);
156
T4l = VSUB(T48, T47);
157
T49 = VADD(T47, T48);
158
/*
 * Odd-numbered slots: same load-and-twiddle pattern as the even slots.
 * For slot k the VZMULJ product uses W[TWVL * (4k - 2)] and the VZMULIJ
 * product uses W[TWVL * (4k)]; the Rm loads again pass -ms.
 */
TM = LDW(&(W[TWVL * 10]));
159
T19 = LDW(&(W[TWVL * 12]));
160
TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
161
TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
162
TQ = LDW(&(W[TWVL * 42]));
163
T17 = LDW(&(W[TWVL * 44]));
164
TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
165
TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
166
TP = VZMULJ(TM, VFMACONJ(TO, TN));
167
T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
168
TV = LDW(&(W[TWVL * 58]));
169
T1q = LDW(&(W[TWVL * 60]));
170
TT = VZMULJ(TQ, VFMACONJ(TS, TR));
171
T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
172
T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
173
T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
174
TZ = LDW(&(W[TWVL * 26]));
175
T1o = LDW(&(W[TWVL * 28]));
176
TY = VZMULJ(TV, VFMACONJ(TX, TW));
177
T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
180
T1b = VSUB(T18, T1a);
181
T3m = VADD(T1a, T18);
182
Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
183
T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
184
T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
185
Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
186
Tt = LDW(&(W[TWVL * 18]));
187
T1h = LDW(&(W[TWVL * 20]));
188
Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
189
Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
190
Tx = LDW(&(W[TWVL * 50]));
193
T1s = VSUB(T1p, T1r);
194
T3n = VADD(T1r, T1p);
195
Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
196
T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
197
T1j = LDW(&(W[TWVL * 52]));
198
TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
199
TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
200
TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
201
TC = LDW(&(W[TWVL * 2]));
202
T1e = LDW(&(W[TWVL * 4]));
203
TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
204
TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
205
T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
206
TG = LDW(&(W[TWVL * 34]));
209
TF = VZMULJ(TC, VFMACONJ(TE, TD));
210
T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
211
T1c = LDW(&(W[TWVL * 36]));
213
/*
 * First add/subtract layer over the twiddled odd-slot values.
 * NOTE(review): T3e, T3f, TU and T13 are read here but assigned on lines
 * that are not present in this excerpt.
 */
T3g = VSUB(T3e, T3f);
214
T44 = VADD(T3e, T3f);
215
T1l = VSUB(T1i, T1k);
216
T3k = VADD(T1i, T1k);
217
TJ = VZMULJ(TG, VFMACONJ(TI, TH));
218
T3o = VSUB(T3m, T3n);
219
T4b = VADD(T3n, T3m);
220
T28 = VFMA(LDK(KP414213562), TU, T13);
221
T14 = VFNMS(LDK(KP414213562), T13, TU);
222
T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
227
/*
 * Temporaries for the stage that merges the even-slot and odd-slot halves
 * and produces the first twelve outputs.
 */
V T4k, T4p, T2z, T2a, T2K, T15, T2E, T1n, T2F, T1u, T4c, T3R, T3D, T3i, T3O;
228
V T46, T4g, T3G, T3P, T3S, T3x, T4q, T4n, T42, T1g, T3j, T3E, T3p, T4m, T3d;
229
V T43, T29, TL, T1m, T1t, T3l, T4a, T3C, T3h, T45, T3Q, T3W, T4d, T4h, T3H;
230
V T3L, T3y, T3K, T4r, T4v, T4o, T4u, T4j, T4i, T4e, T4f, T3N, T3M, T3I, T3J;
231
V T4x, T4w, T4s, T4t;
232
/*
 * Sums and differences across the two halves.
 * NOTE(review): T3b, T3c, TK and TB are read here but assigned on lines
 * that are not present in this excerpt.
 */
T42 = VADD(T40, T41);
233
T4k = VSUB(T40, T41);
234
T1g = VSUB(T1d, T1f);
235
T3j = VADD(T1f, T1d);
236
T3d = VSUB(T3b, T3c);
237
T43 = VADD(T3b, T3c);
238
T29 = VFNMS(LDK(KP414213562), TB, TK);
239
TL = VFMA(LDK(KP414213562), TK, TB);
240
T1m = VSUB(T1g, T1l);
241
T1t = VADD(T1g, T1l);
242
T3l = VSUB(T3j, T3k);
243
T4a = VADD(T3j, T3k);
244
T3C = VSUB(T3g, T3d);
245
T3h = VADD(T3d, T3g);
246
T45 = VADD(T43, T44);
247
T4p = VSUB(T44, T43);
248
T2z = VADD(T29, T28);
249
T2a = VSUB(T28, T29);
252
/* Fused multiply-add layers scaled by sqrt(2)/2, tan(pi/8) and cos(pi/8). */
T2E = VFMA(LDK(KP707106781), T1m, T1b);
253
T1n = VFNMS(LDK(KP707106781), T1m, T1b);
254
T2F = VFMA(LDK(KP707106781), T1t, T1s);
255
T1u = VFNMS(LDK(KP707106781), T1t, T1s);
256
T3E = VFNMS(LDK(KP414213562), T3l, T3o);
257
T3p = VFMA(LDK(KP414213562), T3o, T3l);
258
T4m = VSUB(T4a, T4b);
259
T4c = VADD(T4a, T4b);
260
T3R = VFMA(LDK(KP707106781), T3C, T3B);
261
T3D = VFNMS(LDK(KP707106781), T3C, T3B);
262
T3i = VFNMS(LDK(KP707106781), T3h, T3a);
263
T3O = VFMA(LDK(KP707106781), T3h, T3a);
264
T46 = VSUB(T42, T45);
265
T4g = VADD(T42, T45);
266
T3G = VSUB(T3E, T3F);
267
T3P = VADD(T3F, T3E);
268
T3S = VADD(T3w, T3p);
269
T3x = VSUB(T3p, T3w);
270
T4q = VSUB(T4m, T4l);
271
T4n = VADD(T4l, T4m);
272
T4d = VSUB(T49, T4c);
273
T4h = VADD(T49, T4c);
274
T3H = VFNMS(LDK(KP923879532), T3G, T3D);
275
T3L = VFMA(LDK(KP923879532), T3G, T3D);
276
T3y = VFMA(LDK(KP923879532), T3x, T3i);
277
T3K = VFNMS(LDK(KP923879532), T3x, T3i);
278
T4r = VFMA(LDK(KP707106781), T4q, T4p);
279
T4v = VFNMS(LDK(KP707106781), T4q, T4p);
280
T4o = VFMA(LDK(KP707106781), T4n, T4k);
281
T4u = VFNMS(LDK(KP707106781), T4n, T4k);
282
T3Q = VFMA(LDK(KP923879532), T3P, T3O);
283
T3W = VFNMS(LDK(KP923879532), T3P, T3O);
284
/*
 * Output values.  Every result is scaled by KP500000000 (1/2).  Each
 * operand pair produces two values: one stored to an Rp slot as is and one
 * wrapped in VCONJ and stored to an Rm slot.  Apart from the T4g/T4h pair
 * (plain VSUB/VADD), the two values are the VFMAI and VFNMSI combinations
 * of the pair (macros defined outside this file; presumably a +/- i*b).
 */
T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
285
T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
286
T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
287
T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
288
T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
289
T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
290
T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
291
T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
292
T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
293
T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
294
T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
295
T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
296
/*
 * Stores overwrite the input arrays.  Slot pairing: Rp[0] with Rm[15], and
 * Rp[k] with Rm[k-1] for k = 4, 6, 8, 10, 12.  Rm stores pass -ms as the
 * stride argument, mirroring the loads.
 */
ST(&(Rp[0]), T4i, ms, &(Rp[0]));
297
ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
298
ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
299
ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
300
ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
301
ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
302
ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
303
ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
304
ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
305
ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
306
ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
307
ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
309
/*
 * Temporaries for the final stages, which produce the remaining twenty
 * outputs (Rp slots 1, 2, 3, 5, 7, 9, 11, 13, 14, 15 and their Rm partners).
 */
V T2A, T2W, T2L, T2Z, T2D, T2N, T2M, T2G, T3T, T3X, T16, T2p, T1v, T35, T31;
310
V T2I, T2S, T34, T2Y, T2P, T2T, T1Y, T2H, T30, T3Z, T3Y, T3U, T3V, T2O, T2X;
311
V T32, T33, T36, T37, T2U, T2V, T2Q, T2R, T1Z, T2q;
312
/*
 * Fused multiply-add layers scaled by cos(pi/8), tan(pi/16) and
 * tan(3*pi/16).
 * NOTE(review): T2K and T15 are read here but assigned on lines that are
 * not present in this excerpt.
 */
T2A = VFNMS(LDK(KP923879532), T2z, T2y);
313
T2W = VFMA(LDK(KP923879532), T2z, T2y);
314
T2L = VFNMS(LDK(KP923879532), T2K, T2J);
315
T2Z = VFMA(LDK(KP923879532), T2K, T2J);
316
T2D = VFMA(LDK(KP198912367), T2C, T2B);
317
T2N = VFNMS(LDK(KP198912367), T2B, T2C);
318
T2M = VFMA(LDK(KP198912367), T2E, T2F);
319
T2G = VFNMS(LDK(KP198912367), T2F, T2E);
320
T3T = VFMA(LDK(KP923879532), T3S, T3R);
321
T3X = VFNMS(LDK(KP923879532), T3S, T3R);
322
T16 = VFNMS(LDK(KP923879532), T15, Ts);
323
T2m = VFMA(LDK(KP923879532), T15, Ts);
324
T2H = VSUB(T2D, T2G);
325
T30 = VADD(T2D, T2G);
326
T2b = VFNMS(LDK(KP923879532), T2a, T27);
327
T2p = VFMA(LDK(KP923879532), T2a, T27);
328
T1v = VFMA(LDK(KP668178637), T1u, T1n);
329
T2c = VFNMS(LDK(KP668178637), T1n, T1u);
330
/* Outputs for Rp slots 14 and 2 and their partners Rm slots 13 and 1. */
T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
331
T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
332
T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
333
T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
334
T2O = VSUB(T2M, T2N);
335
T2X = VADD(T2N, T2M);
336
/* Layer scaled by KP980785280 (cos(pi/16)). */
T35 = VFNMS(LDK(KP980785280), T30, T2Z);
337
T31 = VFMA(LDK(KP980785280), T30, T2Z);
338
T2I = VFMA(LDK(KP980785280), T2H, T2A);
339
T2S = VFNMS(LDK(KP980785280), T2H, T2A);
340
ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
341
ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
342
ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
343
ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
344
T34 = VFNMS(LDK(KP980785280), T2X, T2W);
345
T2Y = VFMA(LDK(KP980785280), T2X, T2W);
346
T2P = VFMA(LDK(KP980785280), T2O, T2L);
347
T2T = VFNMS(LDK(KP980785280), T2O, T2L);
348
T2d = VFMA(LDK(KP668178637), T1Q, T1X);
349
T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
350
/*
 * Outputs for Rp slots 1, 15, 9, 7 and their partners Rm slots 0, 14, 8, 6.
 * As before: scaled by 1/2, VFMAI/VFNMSI combinations, VCONJ on the Rm side.
 */
T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
351
T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
352
T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
353
T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
354
T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
355
T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
356
T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
357
T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
358
T1Z = VSUB(T1v, T1Y);
359
T2q = VADD(T1Y, T1v);
360
ST(&(Rm[0]), T33, -ms, &(Rm[0]));
361
ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
362
ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
363
ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
364
ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
365
ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
366
ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
367
ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
368
/* Layer scaled by KP831469612 (cos(3*pi/16)). */
T2v = VFNMS(LDK(KP831469612), T2q, T2p);
369
T2r = VFMA(LDK(KP831469612), T2q, T2p);
370
T20 = VFMA(LDK(KP831469612), T1Z, T16);
371
T2i = VFNMS(LDK(KP831469612), T1Z, T16);
375
T2n = VADD(T2d, T2c);
376
T2e = VSUB(T2c, T2d);
377
T2o = VFMA(LDK(KP831469612), T2n, T2m);
378
T2u = VFNMS(LDK(KP831469612), T2n, T2m);
379
T2j = VFMA(LDK(KP831469612), T2e, T2b);
380
T2f = VFNMS(LDK(KP831469612), T2e, T2b);
381
/* Outputs for Rp slots 3, 13, 11, 5 and their partners Rm slots 2, 12, 10, 4. */
T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
382
T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
383
T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
384
T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
385
T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
386
T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
387
T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
388
T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
389
ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
390
ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
391
ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
392
ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
393
ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
394
ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
395
ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
396
ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
400
/*
 * Twiddle-instruction table handed to the codelet descriptor that follows.
 * NOTE(review): the initializer entries and closing brace are not present
 * in this excerpt.
 */
static const tw_instr twinstr[] = {
435
/*
 * Descriptor for the FMA variant: 32 (presumably the transform size; cf. the
 * generator's "-n 32"), the codelet name, the twinstr table, &GENUS, and an
 * operation-count tuple.  {119, 94, 130, 0} matches the additions /
 * multiplications / fused multiply-add counts quoted in this variant's
 * generator comment; the meaning of the fourth entry is not shown here.
 */
static const hc2c_desc desc = { 32, "hc2cfdftv_32", twinstr, &GENUS, {119, 94, 130, 0} };
437
/*
 * Registration entry point: passes the hc2cfdftv_32 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 * X() is a name-wrapping macro defined outside this file.
 */
void X(codelet_hc2cfdftv_32) (planner *p) {
438
X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
442
/* Generated by: ../../../genfft/gen_hc2cdft_c -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
445
* This function contains 249 FP additions, 133 FP multiplications,
446
* (or, 233 additions, 117 multiplications, 16 fused multiply/add),
447
* 130 stack variables, 9 constants, and 64 memory accesses
451
/*
 * hc2cfdftv_32 (non-FMA variant; its generator command line has no "-fma").
 * Same signature, loop structure and slot layout as the FMA variant, but the
 * arithmetic is written with separate VADD / VSUB / VMUL steps and only a
 * few VFMA / VFNMS.
 * NOTE(review): presumably compiled when HAVE_FMA is not defined -- only the
 * closing "#endif" of that conditional is present in this excerpt.
 *
 * Rp, Rm : arrays read with LD and overwritten with ST at the slots [0] and
 *          [WS(rs, 1..15)]; Rp advances by +VL*ms, Rm by -VL*ms per iteration.
 * Ip, Im : only stepped in the loop header; never dereferenced in the lines
 *          visible in this excerpt.
 * W      : twiddle table read with LDW; advanced by TWVL*62 per iteration.
 * rs     : stride handed to WS(rs, k).
 * mb, me : the loop runs m from mb while m < me, in steps of VL.
 * ms     : per-iteration stride and the stride argument passed to LD/ST.
 */
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
453
/*
 * Vector constants.  Numerically: KP555570233 = sin(3*pi/16),
 * KP831469612 = cos(3*pi/16), KP195090322 = sin(pi/16),
 * KP980785280 = cos(pi/16), KP382683432 = sin(pi/8),
 * KP923879532 = cos(pi/8), KP707106781 = sqrt(2)/2,
 * KP353553390 = sqrt(2)/4, KP500000000 = 1/2.
 */
DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
454
DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
455
DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
456
DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
457
DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
458
DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
459
DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
460
DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
461
DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
463
/*
 * Main loop: W is first offset by (mb - 1) * ((TWVL / VL) * 62); each
 * iteration then handles VL values of m and steps all five pointers.
 */
for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) {
464
/*
 * Vector temporaries (type V).  The first two groups hold values that stay
 * live into the output stages; the remaining groups are used for the
 * even-slot inputs.  Names are generator-assigned.
 */
V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
465
V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
468
V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
469
V Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
472
V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
473
V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
474
V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
475
/*
 * Even-numbered slots (0, 2, ..., 14), handled one slot at a time: load from
 * Rp (stride argument ms) and from Rm (stride argument -ms), then form
 *   VZMULJ (W[TWVL * (4k - 2)], VADD(a, c))
 *   VZMULIJ(W[TWVL * (4k)],     VSUB(c, a))
 * where a is the Rp value.
 * NOTE(review): the second operand c (T3, T1F, T1K, T8, Te, Tp, Tu, Tj) and
 * the twiddle T1l are assigned on lines that are not present in this
 * excerpt; c is presumably the conjugate of the Rm load -- confirm against
 * the complete file.
 */
T1 = LD(&(Rp[0]), ms, &(Rp[0]));
476
T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
480
T1m = VZMULIJ(T1l, VSUB(T3, T1));
481
T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
482
T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
484
T1D = LDW(&(W[TWVL * 16]));
485
T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
486
T2i = LDW(&(W[TWVL * 14]));
487
T2j = VZMULJ(T2i, VADD(T1G, T1F));
488
T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
489
T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
491
T1I = LDW(&(W[TWVL * 48]));
492
T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
493
T2k = LDW(&(W[TWVL * 46]));
494
T2l = VZMULJ(T2k, VADD(T1L, T1K));
495
T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
496
T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
498
T5 = LDW(&(W[TWVL * 30]));
499
T9 = VZMULJ(T5, VADD(T6, T8));
500
T1n = LDW(&(W[TWVL * 32]));
501
T1o = VZMULIJ(T1n, VSUB(T8, T6));
502
Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
503
Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
505
Tb = LDW(&(W[TWVL * 6]));
506
Tf = VZMULJ(Tb, VADD(Tc, Te));
507
T1q = LDW(&(W[TWVL * 8]));
508
T1r = VZMULIJ(T1q, VSUB(Te, Tc));
509
Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
510
To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
512
Tm = LDW(&(W[TWVL * 54]));
513
Tq = VZMULJ(Tm, VADD(Tn, Tp));
514
T1v = LDW(&(W[TWVL * 56]));
515
T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
516
Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
517
Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
519
Tr = LDW(&(W[TWVL * 22]));
520
Tv = VZMULJ(Tr, VADD(Ts, Tu));
521
T1x = LDW(&(W[TWVL * 24]));
522
T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
523
Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
524
Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
526
Tg = LDW(&(W[TWVL * 38]));
527
Tk = VZMULJ(Tg, VADD(Th, Tj));
528
T1s = LDW(&(W[TWVL * 40]));
529
T1t = VZMULIJ(T1s, VSUB(Tj, Th));
531
/*
 * Butterflies over the twiddled even-slot values.  The scale factors 1/2,
 * sqrt(2)/4 and sqrt(2)/2 are applied here with explicit VMUL.
 * NOTE(review): T4, Tl, Tw, T3P, T3Q and T3o are read below but assigned on
 * lines that are not present in this excerpt.
 */
Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
532
T2m = VSUB(T2j, T2l);
535
Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
536
T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
539
T3R = VSUB(T3P, T3Q);
540
T4h = VADD(T3Q, T3P);
542
T3p = VADD(T2j, T2l);
543
T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
544
T4g = VADD(T3o, T3p);
545
T3z = VADD(T1m, T1o);
546
T3A = VADD(T1H, T1M);
547
T3B = VSUB(T3z, T3A);
548
T4n = VADD(T3z, T3A);
549
T3C = VADD(T1w, T1y);
550
T3D = VADD(T1r, T1t);
551
T3E = VSUB(T3C, T3D);
552
T4o = VADD(T3D, T3C);
553
T1p = VSUB(T1m, T1o);
554
T1N = VSUB(T1H, T1M);
555
T1u = VSUB(T1r, T1t);
556
T1z = VSUB(T1w, T1y);
557
T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
558
T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
559
T1B = VADD(T1p, T1A);
560
T2S = VADD(T1N, T1C);
561
T1O = VSUB(T1C, T1N);
562
T2R = VSUB(T1p, T1A);
565
/* Temporaries for the odd-numbered slots (1, 3, ..., 15). */
V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
566
V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
567
V T20, T27, T1U, T1Z;
569
V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
570
V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
571
V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
572
/*
 * Odd-numbered slots, one slot at a time, with the same pattern as the even
 * slots: VZMULJ with W[TWVL * (4k - 2)] on VADD(a, c) and VZMULIJ with
 * W[TWVL * (4k)] on VSUB(c, a).  The LD base-pointer argument is the slot-1
 * address for these loads.
 * NOTE(review): the second operands (TC, T1a, T1f, TH, TN, TZ, T14, TS) are
 * assigned on lines that are not present in this excerpt.
 */
TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
573
TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
575
Tz = LDW(&(W[TWVL * 2]));
576
TD = VZMULJ(Tz, VADD(TA, TC));
577
T1Q = LDW(&(W[TWVL * 4]));
578
T1R = VZMULIJ(T1Q, VSUB(TC, TA));
579
T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
580
T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
582
T17 = LDW(&(W[TWVL * 10]));
583
T1b = VZMULJ(T17, VADD(T18, T1a));
584
T28 = LDW(&(W[TWVL * 12]));
585
T29 = VZMULIJ(T28, VSUB(T1a, T18));
586
T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
587
T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
589
T1c = LDW(&(W[TWVL * 42]));
590
T1g = VZMULJ(T1c, VADD(T1d, T1f));
591
T2a = LDW(&(W[TWVL * 44]));
592
T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
593
TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
594
TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
596
TE = LDW(&(W[TWVL * 34]));
597
TI = VZMULJ(TE, VADD(TF, TH));
598
T1S = LDW(&(W[TWVL * 36]));
599
T1T = VZMULIJ(T1S, VSUB(TH, TF));
600
TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
601
TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
603
TK = LDW(&(W[TWVL * 18]));
604
TO = VZMULJ(TK, VADD(TL, TN));
605
T1X = LDW(&(W[TWVL * 20]));
606
T1Y = VZMULIJ(T1X, VSUB(TN, TL));
607
TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
608
TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
610
TW = LDW(&(W[TWVL * 58]));
611
T10 = VZMULJ(TW, VADD(TX, TZ));
612
T21 = LDW(&(W[TWVL * 60]));
613
T22 = VZMULIJ(T21, VSUB(TZ, TX));
614
T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
615
T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
617
T11 = LDW(&(W[TWVL * 26]));
618
T15 = VZMULJ(T11, VADD(T12, T14));
619
T23 = LDW(&(W[TWVL * 28]));
620
T24 = VZMULIJ(T23, VSUB(T14, T12));
621
TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
622
TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
624
TP = LDW(&(W[TWVL * 50]));
625
TT = VZMULJ(TP, VADD(TQ, TS));
626
T1V = LDW(&(W[TWVL * 52]));
627
T1W = VZMULIJ(T1V, VSUB(TS, TQ));
631
/*
 * Butterflies over the twiddled odd-slot values, including rotations built
 * from the cos(pi/8) / sin(pi/8) constant pair.
 * NOTE(review): TJ, TU, T3r and T3s are read below but assigned on lines
 * that are not present in this excerpt.
 */
TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
632
T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
633
T16 = VSUB(T10, T15);
634
T1h = VSUB(T1b, T1g);
635
T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
636
T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
637
T3J = VADD(T1Y, T1W);
638
T3K = VADD(T1R, T1T);
639
T3L = VSUB(T3J, T3K);
640
T4q = VADD(T3K, T3J);
641
T3G = VADD(T22, T24);
642
T3H = VADD(T29, T2b);
643
T3I = VSUB(T3G, T3H);
644
T4r = VADD(T3G, T3H);
645
T3u = VADD(T10, T15);
646
T3v = VADD(T1b, T1g);
647
T3w = VSUB(T3u, T3v);
648
T4k = VADD(T3u, T3v);
651
T3t = VSUB(T3r, T3s);
652
T4j = VADD(T3r, T3s);
653
T25 = VSUB(T22, T24);
654
T2c = VSUB(T29, T2b);
655
T1U = VSUB(T1R, T1T);
656
T1Z = VSUB(T1W, T1Y);
657
T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
658
T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
659
T26 = VADD(T20, T25);
660
T2V = VADD(T27, T2c);
661
T2d = VSUB(T27, T2c);
662
T2U = VSUB(T25, T20);
665
/*
 * Temporaries for the stage that produces the outputs stored to the
 * even-numbered Rp slots and their Rm partners.
 */
V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
666
V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
667
V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
668
V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
669
/*
 * Outputs for Rp slots 0 and 8 and Rm slots 15 and 7.  Each pair (p, q)
 * yields VADD(p, q) for an Rp slot and VCONJ(VSUB(p, q)) for an Rm slot,
 * scaled by 1/2.  VBYI is a macro defined outside this file (presumably
 * multiplication by i -- confirm against the SIMD header).
 */
T4i = VADD(T4g, T4h);
670
T4l = VADD(T4j, T4k);
671
T4m = VADD(T4i, T4l);
672
T4w = VSUB(T4i, T4l);
673
T4p = VADD(T4n, T4o);
674
T4s = VADD(T4q, T4r);
675
T4t = VADD(T4p, T4s);
676
T4x = VBYI(VSUB(T4s, T4p));
677
T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
678
ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
679
T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
680
ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
681
T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
682
ST(&(Rp[0]), T4v, ms, &(Rp[0]));
683
T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
684
ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
685
/*
 * Outputs for Rp slots 4 and 12 and Rm slots 3 and 11.  The 1/2 scaling is
 * folded into the operands here (KP500000000 and KP353553390 = sqrt(2)/4).
 */
T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
686
T4F = VSUB(T4k, T4j);
687
T4B = VSUB(T4n, T4o);
688
T4C = VSUB(T4r, T4q);
689
T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
690
T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
691
T4E = VADD(T4A, T4D);
692
T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
693
T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
694
T4K = VSUB(T4A, T4D);
695
T4I = VCONJ(VSUB(T4E, T4H));
696
ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
697
T4N = VADD(T4K, T4L);
698
ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
699
T4J = VADD(T4E, T4H);
700
ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
701
T4M = VCONJ(VSUB(T4K, T4L));
702
ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
703
/*
 * Outputs for Rp slots 2, 6, 10, 14 and Rm slots 1, 5, 9, 13, using
 * rotations built from the cos(pi/8) / sin(pi/8) constant pair.
 */
T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
704
T3y = VADD(T3q, T3x);
705
T47 = VSUB(T3q, T3x);
706
T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
707
T3T = VADD(T3R, T3S);
708
T45 = VSUB(T3S, T3R);
709
T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
710
T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
711
T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
712
T44 = VSUB(T3M, T3F);
713
T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
714
T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
715
T3W = VADD(T3U, T3V);
716
T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
717
T3O = VADD(T3y, T3N);
718
T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
719
T4d = VADD(T47, T48);
720
T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
721
T40 = VSUB(T3y, T3N);
722
T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
723
T49 = VSUB(T47, T48);
724
T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
725
T3Y = VCONJ(VSUB(T3O, T3X));
726
ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
727
T4e = VADD(T4c, T4d);
728
ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
729
T4f = VCONJ(VSUB(T4d, T4c));
730
ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
731
T3Z = VADD(T3O, T3X);
732
ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
733
T42 = VCONJ(VSUB(T40, T41));
734
ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
735
T4a = VADD(T46, T49);
736
ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
737
T4b = VCONJ(VSUB(T49, T46));
738
ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
739
T43 = VADD(T40, T41);
740
ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
742
/*
 * Temporaries for the outputs stored to Rp slots 1, 7, 9, 15 and Rm slots
 * 0, 6, 8, 14.
 */
V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
743
V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
746
/*
 * Rotations built from the cos(pi/16) / sin(pi/16) constant pair
 * (KP980785280 / KP195090322), followed by add/subtract combinations.
 * NOTE(review): T1k and T2F are read below but assigned on lines that are
 * not present in this excerpt.
 */
T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
749
T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
750
T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
751
T2u = VADD(T2s, T2t);
752
T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
753
T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
754
T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
755
T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
756
T2C = VSUB(T2e, T1P);
757
T2n = VSUB(T2h, T2m);
758
T2q = VSUB(T2o, T2p);
759
T2r = VADD(T2n, T2q);
760
T2D = VSUB(T2q, T2n);
761
T2g = VADD(T1k, T2f);
762
T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
763
T2L = VADD(T2F, T2G);
764
T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
765
T2y = VSUB(T1k, T2f);
766
T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
767
T2H = VSUB(T2F, T2G);
768
T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
769
/*
 * Each (real-side, VBYI-side) pair yields a sum stored to an Rp slot and a
 * VCONJ-wrapped difference stored to an Rm slot.
 */
T2w = VCONJ(VSUB(T2g, T2v));
770
ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
771
T2M = VADD(T2K, T2L);
772
ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
773
T2N = VCONJ(VSUB(T2L, T2K));
774
ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
775
T2x = VADD(T2g, T2v);
776
ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
777
T2A = VCONJ(VSUB(T2y, T2z));
778
ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
779
T2I = VADD(T2E, T2H);
780
ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
781
T2J = VCONJ(VSUB(T2H, T2E));
782
ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
783
T2B = VADD(T2y, T2z);
784
ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
787
/*
 * Temporaries for the outputs stored to Rp slots 3, 5, 11, 13 and Rm slots
 * 2, 4, 10, 12.
 */
V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
788
V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
791
/*
 * Rotations built from the cos(3*pi/16) / sin(3*pi/16) constant pair
 * (KP831469612 / KP555570233), followed by add/subtract combinations.
 * NOTE(review): T2O and T30 are read below but assigned on lines that are
 * not present in this excerpt.
 */
T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
792
T2Q = VADD(T2O, T2P);
793
T3f = VSUB(T2O, T2P);
794
T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
795
T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
796
T34 = VADD(T32, T33);
797
T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
798
T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
799
T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
800
T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
801
T3c = VSUB(T2W, T2T);
802
T2Z = VADD(T2m, T2h);
804
T31 = VADD(T2Z, T30);
805
T3d = VSUB(T30, T2Z);
806
T2Y = VADD(T2Q, T2X);
807
T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
808
T3l = VADD(T3f, T3g);
809
T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
810
T38 = VSUB(T2Q, T2X);
811
T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
812
T3h = VSUB(T3f, T3g);
813
T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
814
/*
 * Each (real-side, VBYI-side) pair yields a sum stored to an Rp slot and a
 * VCONJ-wrapped difference stored to an Rm slot.
 */
T36 = VCONJ(VSUB(T2Y, T35));
815
ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
816
T3m = VADD(T3k, T3l);
817
ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
818
T3n = VCONJ(VSUB(T3l, T3k));
819
ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
820
T37 = VADD(T2Y, T35);
821
ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
822
T3a = VCONJ(VSUB(T38, T39));
823
ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
824
T3i = VADD(T3e, T3h);
825
ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
826
T3j = VCONJ(VSUB(T3h, T3e));
827
ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
828
T3b = VADD(T38, T39);
829
ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
835
/*
 * Twiddle-instruction table for the non-FMA variant, handed to the codelet
 * descriptor that follows.
 * NOTE(review): the initializer entries and closing brace are not present
 * in this excerpt.
 */
static const tw_instr twinstr[] = {
870
/*
 * Descriptor for the non-FMA variant.  Identical to the FMA variant's
 * descriptor except for the operation-count tuple: {233, 117, 16, 0} matches
 * the additions / multiplications / fused multiply-add counts quoted in this
 * variant's generator comment; the meaning of the fourth entry is not shown
 * here.
 */
static const hc2c_desc desc = { 32, "hc2cfdftv_32", twinstr, &GENUS, {233, 117, 16, 0} };
872
/*
 * Registration entry point for the non-FMA variant: passes the hc2cfdftv_32
 * kernel and its descriptor to X(khc2c_register) for planner p, tagged
 * HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdftv_32) (planner *p) {
873
X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
875
#endif /* HAVE_FMA */