~ubuntu-branches/ubuntu/maverick/blender/maverick

« back to all changes in this revision

Viewing changes to extern/fftw/dft/simd/codelets/t2bv_64.c

  • Committer: Bazaar Package Importer
  • Author(s): Khashayar Naderehvandi, Alessio Treglia
  • Date: 2009-01-22 16:53:59 UTC
  • mfrom: (14.1.1 experimental)
  • Revision ID: james.westby@ubuntu.com-20090122165359-v0996tn7fbit64ni
Tags: 2.48a+dfsg-1ubuntu1
[ Khashayar Naderehvandi ]
* Merge from Debian experimental (LP: #320045); remaining Ubuntu changes:
  - Add patch correcting header file locations.
  - Add libvorbis-dev and libgsm1-dev to Build-Depends.
  - Use avcodec_decode_audio2() in source/blender/src/hddaudio.c

[ Alessio Treglia ]
* Add missing previous changelog entries.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2003, 2006 Matteo Frigo
 
3
 * Copyright (c) 2003, 2006 Massachusetts Institute of Technology
 
4
 *
 
5
 * This program is free software; you can redistribute it and/or modify
 
6
 * it under the terms of the GNU General Public License as published by
 
7
 * the Free Software Foundation; either version 2 of the License, or
 
8
 * (at your option) any later version.
 
9
 *
 
10
 * This program is distributed in the hope that it will be useful,
 
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
13
 * GNU General Public License for more details.
 
14
 *
 
15
 * You should have received a copy of the GNU General Public License
 
16
 * along with this program; if not, write to the Free Software
 
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
18
 *
 
19
 */
 
20
 
 
21
/* This file was automatically generated --- DO NOT EDIT */
 
22
/* Generated on Sat Jul  1 22:26:27 EDT 2006 */
 
23
 
 
24
#include "codelet-dft.h"
 
25
 
 
26
#ifdef HAVE_FMA
 
27
 
 
28
/* Generated by: ../../../genfft/gen_twiddle_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include t2b.h -sign 1 */
 
29
 
 
30
/*
 
31
 * This function contains 519 FP additions, 384 FP multiplications,
 
32
 * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
 
33
 * 187 stack variables, and 128 memory accesses
 
34
 */
 
35
/*
 
36
 * Generator IDs:
 
37
 * $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
 
38
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 
39
 * $Id: gen_twiddle_c.ml,v 1.14 2006-02-12 23:34:12 athena Exp $
 
40
 */
 
41
 
 
42
#include "t2b.h"
 
43
 
 
44
static const R *t2bv_64(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
 
45
{
 
46
     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
 
47
     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
 
48
     DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
 
49
     DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
 
50
     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
 
51
     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
 
52
     DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
 
53
     DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
 
54
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
 
55
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
 
56
     DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
 
57
     DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
 
58
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
59
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
 
60
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
61
     INT i;
 
62
     R *x;
 
63
     x = ii;
 
64
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(ios)) {
 
65
          V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
 
66
          V T6E;
 
67
          {
 
68
               V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
 
69
               V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3b;
 
70
               V T4m, T4C, T7e, T5l, T7d, T5o, T3a, TV, T4B, T4j, T3X, T3Y, T6o, T7b, T5f;
 
71
               V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
 
72
               V T1S, T2q, T2u, T2w;
 
73
               {
 
74
                    V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
 
75
                    {
 
76
                         V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
 
77
                         {
 
78
                              V T1, T2, T7, T5, T32, T34, T2X, T2Z;
 
79
                              T1 = LD(&(x[0]), dist, &(x[0]));
 
80
                              T2 = LD(&(x[WS(ios, 32)]), dist, &(x[0]));
 
81
                              T7 = LD(&(x[WS(ios, 48)]), dist, &(x[0]));
 
82
                              T5 = LD(&(x[WS(ios, 16)]), dist, &(x[0]));
 
83
                              T32 = LD(&(x[WS(ios, 56)]), dist, &(x[0]));
 
84
                              T34 = LD(&(x[WS(ios, 24)]), dist, &(x[0]));
 
85
                              T2X = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
 
86
                              T2Z = LD(&(x[WS(ios, 40)]), dist, &(x[0]));
 
87
                              {
 
88
                                   V T1m, T54, T6j, T36, T56, T31, T55, T1n, T1q, T1s, T4, T9;
 
89
                                   {
 
90
                                        V T3, T8, T6, T33, T35, T2Y, T30, T1l;
 
91
                                        T1l = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
 
92
                                        T3 = BYTW(&(W[TWVL * 62]), T2);
 
93
                                        T8 = BYTW(&(W[TWVL * 94]), T7);
 
94
                                        T6 = BYTW(&(W[TWVL * 30]), T5);
 
95
                                        T33 = BYTW(&(W[TWVL * 110]), T32);
 
96
                                        T35 = BYTW(&(W[TWVL * 46]), T34);
 
97
                                        T2Y = BYTW(&(W[TWVL * 14]), T2X);
 
98
                                        T30 = BYTW(&(W[TWVL * 78]), T2Z);
 
99
                                        T1m = BYTW(&(W[0]), T1l);
 
100
                                        T54 = VSUB(T1, T3);
 
101
                                        T4 = VADD(T1, T3);
 
102
                                        T6j = VSUB(T6, T8);
 
103
                                        T9 = VADD(T6, T8);
 
104
                                        T36 = VADD(T33, T35);
 
105
                                        T56 = VSUB(T33, T35);
 
106
                                        T31 = VADD(T2Y, T30);
 
107
                                        T55 = VSUB(T2Y, T30);
 
108
                                        T1n = LD(&(x[WS(ios, 33)]), dist, &(x[WS(ios, 1)]));
 
109
                                   }
 
110
                                   T1q = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)]));
 
111
                                   T1s = LD(&(x[WS(ios, 49)]), dist, &(x[WS(ios, 1)]));
 
112
                                   Ta = VSUB(T4, T9);
 
113
                                   T3U = VADD(T4, T9);
 
114
                                   {
 
115
                                        V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
 
116
                                        T1U = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
 
117
                                        T3V = VADD(T31, T36);
 
118
                                        T37 = VSUB(T31, T36);
 
119
                                        T57 = VADD(T55, T56);
 
120
                                        T6k = VSUB(T55, T56);
 
121
                                        T1o = BYTW(&(W[TWVL * 64]), T1n);
 
122
                                        T1r = BYTW(&(W[TWVL * 32]), T1q);
 
123
                                        T1t = BYTW(&(W[TWVL * 96]), T1s);
 
124
                                        T1V = BYTW(&(W[TWVL * 16]), T1U);
 
125
                                        T1W = LD(&(x[WS(ios, 41)]), dist, &(x[WS(ios, 1)]));
 
126
                                        T1Z = LD(&(x[WS(ios, 57)]), dist, &(x[WS(ios, 1)]));
 
127
                                        T7a = VFNMS(LDK(KP707106781), T57, T54);
 
128
                                        T58 = VFMA(LDK(KP707106781), T57, T54);
 
129
                                        T7B = VFNMS(LDK(KP707106781), T6k, T6j);
 
130
                                        T6l = VFMA(LDK(KP707106781), T6k, T6j);
 
131
                                        T1p = VADD(T1m, T1o);
 
132
                                        T5B = VSUB(T1m, T1o);
 
133
                                        T5O = VSUB(T1r, T1t);
 
134
                                        T1u = VADD(T1r, T1t);
 
135
                                        T1X = BYTW(&(W[TWVL * 80]), T1W);
 
136
                                        T20 = BYTW(&(W[TWVL * 112]), T1Z);
 
137
                                        T21 = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)]));
 
138
                                   }
 
139
                              }
 
140
                         }
 
141
                         {
 
142
                              V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
 
143
                              {
 
144
                                   V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
 
145
                                   T28 = LD(&(x[WS(ios, 63)]), dist, &(x[WS(ios, 1)]));
 
146
                                   T2a = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)]));
 
147
                                   T1v = VSUB(T1p, T1u);
 
148
                                   T41 = VADD(T1p, T1u);
 
149
                                   T1Y = VADD(T1V, T1X);
 
150
                                   T5C = VSUB(T1V, T1X);
 
151
                                   T22 = BYTW(&(W[TWVL * 48]), T21);
 
152
                                   T2d = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)]));
 
153
                                   T29 = BYTW(&(W[TWVL * 124]), T28);
 
154
                                   T2b = BYTW(&(W[TWVL * 60]), T2a);
 
155
                                   T2f = LD(&(x[WS(ios, 47)]), dist, &(x[WS(ios, 1)]));
 
156
                                   T2H = LD(&(x[WS(ios, 55)]), dist, &(x[WS(ios, 1)]));
 
157
                                   T2J = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)]));
 
158
                                   {
 
159
                                        V T23, T5D, T2e, T2g, T2I, T2K, T2M;
 
160
                                        T2M = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
 
161
                                        T23 = VADD(T20, T22);
 
162
                                        T5D = VSUB(T20, T22);
 
163
                                        T2e = BYTW(&(W[TWVL * 28]), T2d);
 
164
                                        T2c = VADD(T29, T2b);
 
165
                                        T5W = VSUB(T29, T2b);
 
166
                                        T2g = BYTW(&(W[TWVL * 92]), T2f);
 
167
                                        T2I = BYTW(&(W[TWVL * 108]), T2H);
 
168
                                        T2K = BYTW(&(W[TWVL * 44]), T2J);
 
169
                                        T2N = BYTW(&(W[TWVL * 12]), T2M);
 
170
                                        {
 
171
                                             V T5E, T5P, T42, T2O;
 
172
                                             T5E = VADD(T5C, T5D);
 
173
                                             T5P = VSUB(T5C, T5D);
 
174
                                             T24 = VSUB(T1Y, T23);
 
175
                                             T42 = VADD(T1Y, T23);
 
176
                                             T69 = VSUB(T2g, T2e);
 
177
                                             T2h = VADD(T2e, T2g);
 
178
                                             T2O = LD(&(x[WS(ios, 39)]), dist, &(x[WS(ios, 1)]));
 
179
                                             T2L = VADD(T2I, T2K);
 
180
                                             T5Y = VSUB(T2I, T2K);
 
181
                                             T5Q = VFMA(LDK(KP707106781), T5P, T5O);
 
182
                                             T7o = VFNMS(LDK(KP707106781), T5P, T5O);
 
183
                                             T5F = VFMA(LDK(KP707106781), T5E, T5B);
 
184
                                             T7l = VFNMS(LDK(KP707106781), T5E, T5B);
 
185
                                             T43 = VADD(T41, T42);
 
186
                                             T4F = VSUB(T41, T42);
 
187
                                             T2P = BYTW(&(W[TWVL * 76]), T2O);
 
188
                                        }
 
189
                                   }
 
190
                              }
 
191
                              T2i = VSUB(T2c, T2h);
 
192
                              T48 = VADD(T2c, T2h);
 
193
                              {
 
194
                                   V TW, TY, T11, T2Q, T5X, T13;
 
195
                                   TW = LD(&(x[WS(ios, 62)]), dist, &(x[0]));
 
196
                                   TY = LD(&(x[WS(ios, 30)]), dist, &(x[0]));
 
197
                                   T11 = LD(&(x[WS(ios, 14)]), dist, &(x[0]));
 
198
                                   T2Q = VADD(T2N, T2P);
 
199
                                   T5X = VSUB(T2N, T2P);
 
200
                                   T13 = LD(&(x[WS(ios, 46)]), dist, &(x[0]));
 
201
                                   {
 
202
                                        V T12, T5Z, T6a, T49, T14, T18, T1a;
 
203
                                        {
 
204
                                             V T17, T19, TX, TZ;
 
205
                                             T17 = LD(&(x[WS(ios, 54)]), dist, &(x[0]));
 
206
                                             T19 = LD(&(x[WS(ios, 22)]), dist, &(x[0]));
 
207
                                             TX = BYTW(&(W[TWVL * 122]), TW);
 
208
                                             TZ = BYTW(&(W[TWVL * 58]), TY);
 
209
                                             T12 = BYTW(&(W[TWVL * 26]), T11);
 
210
                                             T5Z = VADD(T5X, T5Y);
 
211
                                             T6a = VSUB(T5Y, T5X);
 
212
                                             T2R = VSUB(T2L, T2Q);
 
213
                                             T49 = VADD(T2Q, T2L);
 
214
                                             T14 = BYTW(&(W[TWVL * 90]), T13);
 
215
                                             T18 = BYTW(&(W[TWVL * 106]), T17);
 
216
                                             T5q = VSUB(TX, TZ);
 
217
                                             T10 = VADD(TX, TZ);
 
218
                                             T1a = BYTW(&(W[TWVL * 42]), T19);
 
219
                                        }
 
220
                                        T6b = VFMA(LDK(KP707106781), T6a, T69);
 
221
                                        T7v = VFNMS(LDK(KP707106781), T6a, T69);
 
222
                                        T60 = VFMA(LDK(KP707106781), T5Z, T5W);
 
223
                                        T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
 
224
                                        T4a = VADD(T48, T49);
 
225
                                        T4I = VSUB(T48, T49);
 
226
                                        T5v = VSUB(T14, T12);
 
227
                                        T15 = VADD(T12, T14);
 
228
                                        T1b = VADD(T18, T1a);
 
229
                                        T5s = VSUB(T18, T1a);
 
230
                                   }
 
231
                                   T1c = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
 
232
                                   T1e = LD(&(x[WS(ios, 38)]), dist, &(x[0]));
 
233
                              }
 
234
                         }
 
235
                    }
 
236
                    {
 
237
                         V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
 
238
                         {
 
239
                              V T5h, TQ, T5m, T5i, TO, TS, TJ, T4h, TD, TI;
 
240
                              {
 
241
                                   V T4k, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
 
242
                                   Tz = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
 
243
                                   T4k = VADD(T10, T15);
 
244
                                   T16 = VSUB(T10, T15);
 
245
                                   TB = LD(&(x[WS(ios, 34)]), dist, &(x[0]));
 
246
                                   T1d = BYTW(&(W[TWVL * 10]), T1c);
 
247
                                   T1f = BYTW(&(W[TWVL * 74]), T1e);
 
248
                                   TE = LD(&(x[WS(ios, 18)]), dist, &(x[0]));
 
249
                                   TG = LD(&(x[WS(ios, 50)]), dist, &(x[0]));
 
250
                                   TA = BYTW(&(W[TWVL * 2]), Tz);
 
251
                                   TK = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
 
252
                                   TM = LD(&(x[WS(ios, 42)]), dist, &(x[0]));
 
253
                                   TC = BYTW(&(W[TWVL * 66]), TB);
 
254
                                   {
 
255
                                        V T1g, T5r, TF, TH, TL, TN, TP;
 
256
                                        TP = LD(&(x[WS(ios, 58)]), dist, &(x[0]));
 
257
                                        T1g = VADD(T1d, T1f);
 
258
                                        T5r = VSUB(T1d, T1f);
 
259
                                        TF = BYTW(&(W[TWVL * 34]), TE);
 
260
                                        TH = BYTW(&(W[TWVL * 98]), TG);
 
261
                                        TL = BYTW(&(W[TWVL * 18]), TK);
 
262
                                        TN = BYTW(&(W[TWVL * 82]), TM);
 
263
                                        T5h = VSUB(TA, TC);
 
264
                                        TD = VADD(TA, TC);
 
265
                                        TQ = BYTW(&(W[TWVL * 114]), TP);
 
266
                                        {
 
267
                                             V T5w, T5t, T4l, T1h, TR;
 
268
                                             T5w = VSUB(T5s, T5r);
 
269
                                             T5t = VADD(T5r, T5s);
 
270
                                             T4l = VADD(T1g, T1b);
 
271
                                             T1h = VSUB(T1b, T1g);
 
272
                                             T5m = VSUB(TF, TH);
 
273
                                             TI = VADD(TF, TH);
 
274
                                             T5i = VSUB(TL, TN);
 
275
                                             TO = VADD(TL, TN);
 
276
                                             TR = LD(&(x[WS(ios, 26)]), dist, &(x[0]));
 
277
                                             T5u = VFMA(LDK(KP707106781), T5t, T5q);
 
278
                                             T7h = VFNMS(LDK(KP707106781), T5t, T5q);
 
279
                                             T5x = VFMA(LDK(KP707106781), T5w, T5v);
 
280
                                             T7g = VFNMS(LDK(KP707106781), T5w, T5v);
 
281
                                             T1i = VFNMS(LDK(KP414213562), T1h, T16);
 
282
                                             T3b = VFMA(LDK(KP414213562), T16, T1h);
 
283
                                             T4m = VADD(T4k, T4l);
 
284
                                             T4C = VSUB(T4k, T4l);
 
285
                                             TS = BYTW(&(W[TWVL * 50]), TR);
 
286
                                        }
 
287
                                   }
 
288
                              }
 
289
                              TJ = VSUB(TD, TI);
 
290
                              T4h = VADD(TD, TI);
 
291
                              {
 
292
                                   V Tb, Td, Tr, T5j, TT, Tt, Tg;
 
293
                                   Tb = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
 
294
                                   Td = LD(&(x[WS(ios, 36)]), dist, &(x[0]));
 
295
                                   Tr = LD(&(x[WS(ios, 12)]), dist, &(x[0]));
 
296
                                   T5j = VSUB(TQ, TS);
 
297
                                   TT = VADD(TQ, TS);
 
298
                                   Tt = LD(&(x[WS(ios, 44)]), dist, &(x[0]));
 
299
                                   Tg = LD(&(x[WS(ios, 20)]), dist, &(x[0]));
 
300
                                   {
 
301
                                        V Ti, Tc, Te, Ts;
 
302
                                        Ti = LD(&(x[WS(ios, 52)]), dist, &(x[0]));
 
303
                                        Tc = BYTW(&(W[TWVL * 6]), Tb);
 
304
                                        Te = BYTW(&(W[TWVL * 70]), Td);
 
305
                                        Ts = BYTW(&(W[TWVL * 22]), Tr);
 
306
                                        {
 
307
                                             V T5k, T5n, TU, T4i, Tu;
 
308
                                             T5k = VADD(T5i, T5j);
 
309
                                             T5n = VSUB(T5i, T5j);
 
310
                                             TU = VSUB(TO, TT);
 
311
                                             T4i = VADD(TO, TT);
 
312
                                             Tu = BYTW(&(W[TWVL * 86]), Tt);
 
313
                                             Th = BYTW(&(W[TWVL * 38]), Tg);
 
314
                                             T59 = VSUB(Tc, Te);
 
315
                                             Tf = VADD(Tc, Te);
 
316
                                             T7e = VFNMS(LDK(KP707106781), T5k, T5h);
 
317
                                             T5l = VFMA(LDK(KP707106781), T5k, T5h);
 
318
                                             T7d = VFNMS(LDK(KP707106781), T5n, T5m);
 
319
                                             T5o = VFMA(LDK(KP707106781), T5n, T5m);
 
320
                                             T3a = VFMA(LDK(KP414213562), TJ, TU);
 
321
                                             TV = VFNMS(LDK(KP414213562), TU, TJ);
 
322
                                             T4B = VSUB(T4h, T4i);
 
323
                                             T4j = VADD(T4h, T4i);
 
324
                                             Tv = VADD(Ts, Tu);
 
325
                                             T5d = VSUB(Tu, Ts);
 
326
                                             Tj = BYTW(&(W[TWVL * 102]), Ti);
 
327
                                        }
 
328
                                   }
 
329
                                   Tm = LD(&(x[WS(ios, 60)]), dist, &(x[0]));
 
330
                                   To = LD(&(x[WS(ios, 28)]), dist, &(x[0]));
 
331
                              }
 
332
                         }
 
333
                         {
 
334
                              V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
 
335
                              {
 
336
                                   V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
 
337
                                   T1w = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
 
338
                                   T1y = LD(&(x[WS(ios, 37)]), dist, &(x[WS(ios, 1)]));
 
339
                                   T1M = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)]));
 
340
                                   T1O = LD(&(x[WS(ios, 45)]), dist, &(x[WS(ios, 1)]));
 
341
                                   T1B = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)]));
 
342
                                   {
 
343
                                        V Tk, T5a, Tn, Tp;
 
344
                                        Tk = VADD(Th, Tj);
 
345
                                        T5a = VSUB(Th, Tj);
 
346
                                        Tn = BYTW(&(W[TWVL * 118]), Tm);
 
347
                                        Tp = BYTW(&(W[TWVL * 54]), To);
 
348
                                        {
 
349
                                             V T1x, T1z, T1N, T1P;
 
350
                                             T1x = BYTW(&(W[TWVL * 8]), T1w);
 
351
                                             T1z = BYTW(&(W[TWVL * 72]), T1y);
 
352
                                             T1N = BYTW(&(W[TWVL * 24]), T1M);
 
353
                                             T1P = BYTW(&(W[TWVL * 88]), T1O);
 
354
                                             T5b = VFNMS(LDK(KP414213562), T5a, T59);
 
355
                                             T6m = VFMA(LDK(KP414213562), T59, T5a);
 
356
                                             T3X = VADD(Tf, Tk);
 
357
                                             Tl = VSUB(Tf, Tk);
 
358
                                             Tq = VADD(Tn, Tp);
 
359
                                             T5c = VSUB(Tn, Tp);
 
360
                                             T1A = VADD(T1x, T1z);
 
361
                                             T5G = VSUB(T1x, T1z);
 
362
                                             T1Q = VADD(T1N, T1P);
 
363
                                             T5K = VSUB(T1N, T1P);
 
364
                                             T1C = BYTW(&(W[TWVL * 40]), T1B);
 
365
                                        }
 
366
                                   }
 
367
                                   T1D = LD(&(x[WS(ios, 53)]), dist, &(x[WS(ios, 1)]));
 
368
                                   T5e = VFNMS(LDK(KP414213562), T5d, T5c);
 
369
                                   T6n = VFMA(LDK(KP414213562), T5c, T5d);
 
370
                                   T3Y = VADD(Tq, Tv);
 
371
                                   Tw = VSUB(Tq, Tv);
 
372
                                   T1H = LD(&(x[WS(ios, 61)]), dist, &(x[WS(ios, 1)]));
 
373
                                   T1J = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)]));
 
374
                              }
 
375
                              {
 
376
                                   V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
 
377
                                   T2j = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
 
378
                                   T1E = BYTW(&(W[TWVL * 104]), T1D);
 
379
                                   T6o = VSUB(T6m, T6n);
 
380
                                   T7b = VADD(T6m, T6n);
 
381
                                   T5f = VADD(T5b, T5e);
 
382
                                   T7C = VSUB(T5b, T5e);
 
383
                                   Tx = VADD(Tl, Tw);
 
384
                                   T38 = VSUB(Tl, Tw);
 
385
                                   T1I = BYTW(&(W[TWVL * 120]), T1H);
 
386
                                   T1K = BYTW(&(W[TWVL * 56]), T1J);
 
387
                                   T1F = VADD(T1C, T1E);
 
388
                                   T5H = VSUB(T1C, T1E);
 
389
                                   T2k = BYTW(&(W[TWVL * 4]), T2j);
 
390
                                   T2l = LD(&(x[WS(ios, 35)]), dist, &(x[WS(ios, 1)]));
 
391
                                   T2z = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
 
392
                                   T2B = LD(&(x[WS(ios, 43)]), dist, &(x[WS(ios, 1)]));
 
393
                                   {
 
394
                                        V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
 
395
                                        T2o = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)]));
 
396
                                        T5J = VSUB(T1I, T1K);
 
397
                                        T1L = VADD(T1I, T1K);
 
398
                                        T5I = VFNMS(LDK(KP414213562), T5H, T5G);
 
399
                                        T5R = VFMA(LDK(KP414213562), T5G, T5H);
 
400
                                        T44 = VADD(T1A, T1F);
 
401
                                        T1G = VSUB(T1A, T1F);
 
402
                                        T2m = BYTW(&(W[TWVL * 68]), T2l);
 
403
                                        T2A = BYTW(&(W[TWVL * 20]), T2z);
 
404
                                        T2C = BYTW(&(W[TWVL * 84]), T2B);
 
405
                                        T5S = VFNMS(LDK(KP414213562), T5J, T5K);
 
406
                                        T5L = VFMA(LDK(KP414213562), T5K, T5J);
 
407
                                        T1R = VSUB(T1L, T1Q);
 
408
                                        T45 = VADD(T1L, T1Q);
 
409
                                        T2p = BYTW(&(W[TWVL * 36]), T2o);
 
410
                                        T61 = VSUB(T2k, T2m);
 
411
                                        T2n = VADD(T2k, T2m);
 
412
                                        T65 = VSUB(T2C, T2A);
 
413
                                        T2D = VADD(T2A, T2C);
 
414
                                        T7p = VSUB(T5I, T5L);
 
415
                                        T5M = VADD(T5I, T5L);
 
416
                                        T7m = VSUB(T5R, T5S);
 
417
                                        T5T = VADD(T5R, T5S);
 
418
                                        T4G = VSUB(T44, T45);
 
419
                                        T46 = VADD(T44, T45);
 
420
                                        T25 = VSUB(T1G, T1R);
 
421
                                        T1S = VADD(T1G, T1R);
 
422
                                        T2q = LD(&(x[WS(ios, 51)]), dist, &(x[WS(ios, 1)]));
 
423
                                   }
 
424
                                   T2u = LD(&(x[WS(ios, 59)]), dist, &(x[WS(ios, 1)]));
 
425
                                   T2w = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)]));
 
426
                              }
 
427
                         }
 
428
                    }
 
429
               }
 
430
               {
 
431
                    V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
 
432
                    V T26, T2G, T3y, T3z, T2T;
 
433
                    {
 
434
                         V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
 
435
                         T4A = VSUB(T3U, T3V);
 
436
                         T3W = VADD(T3U, T3V);
 
437
                         T3Z = VADD(T3X, T3Y);
 
438
                         T4N = VSUB(T3X, T3Y);
 
439
                         T47 = VSUB(T43, T46);
 
440
                         T4v = VADD(T43, T46);
 
441
                         T2r = BYTW(&(W[TWVL * 100]), T2q);
 
442
                         T2v = BYTW(&(W[TWVL * 116]), T2u);
 
443
                         T2x = BYTW(&(W[TWVL * 52]), T2w);
 
444
                         T4s = VADD(T3W, T3Z);
 
445
                         T40 = VSUB(T3W, T3Z);
 
446
                         {
 
447
                              V T4O, T4n, T4Q, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
 
448
                              {
 
449
                                   V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
 
450
                                   {
 
451
                                        V T4D, T62, T2s, T64, T2y, T4t;
 
452
                                        T4O = VSUB(T4B, T4C);
 
453
                                        T4D = VADD(T4B, T4C);
 
454
                                        T62 = VSUB(T2r, T2p);
 
455
                                        T2s = VADD(T2p, T2r);
 
456
                                        T64 = VSUB(T2v, T2x);
 
457
                                        T2y = VADD(T2v, T2x);
 
458
                                        T4t = VADD(T4j, T4m);
 
459
                                        T4n = VSUB(T4j, T4m);
 
460
                                        T4Q = VFMA(LDK(KP414213562), T4F, T4G);
 
461
                                        T4H = VFNMS(LDK(KP414213562), T4G, T4F);
 
462
                                        T4E = VFMA(LDK(KP707106781), T4D, T4A);
 
463
                                        T4W = VFNMS(LDK(KP707106781), T4D, T4A);
 
464
                                        T6c = VFNMS(LDK(KP414213562), T61, T62);
 
465
                                        T63 = VFMA(LDK(KP414213562), T62, T61);
 
466
                                        T2t = VSUB(T2n, T2s);
 
467
                                        T4b = VADD(T2n, T2s);
 
468
                                        T6d = VFMA(LDK(KP414213562), T64, T65);
 
469
                                        T66 = VFNMS(LDK(KP414213562), T65, T64);
 
470
                                        T2E = VSUB(T2y, T2D);
 
471
                                        T4c = VADD(T2y, T2D);
 
472
                                        T4u = VSUB(T4s, T4t);
 
473
                                        T4y = VADD(T4s, T4t);
 
474
                                   }
 
475
                                   T67 = VADD(T63, T66);
 
476
                                   T7w = VSUB(T66, T63);
 
477
                                   T6e = VADD(T6c, T6d);
 
478
                                   T7t = VSUB(T6d, T6c);
 
479
                                   T4d = VADD(T4b, T4c);
 
480
                                   T4J = VSUB(T4c, T4b);
 
481
                                   T2F = VADD(T2t, T2E);
 
482
                                   T2S = VSUB(T2E, T2t);
 
483
                              }
 
484
                              {
 
485
                                   V Ty, T1j, T4R, T4K;
 
486
                                   Ty = VFMA(LDK(KP707106781), Tx, Ta);
 
487
                                   T3s = VFNMS(LDK(KP707106781), Tx, Ta);
 
488
                                   T3E = VSUB(TV, T1i);
 
489
                                   T1j = VADD(TV, T1i);
 
490
                                   T39 = VFMA(LDK(KP707106781), T38, T37);
 
491
                                   T3D = VFNMS(LDK(KP707106781), T38, T37);
 
492
                                   T4R = VFMA(LDK(KP414213562), T4I, T4J);
 
493
                                   T4K = VFNMS(LDK(KP414213562), T4J, T4I);
 
494
                                   {
 
495
                                        V T4w, T4e, T4P, T4Z;
 
496
                                        T4w = VADD(T4a, T4d);
 
497
                                        T4e = VSUB(T4a, T4d);
 
498
                                        T4P = VFMA(LDK(KP707106781), T4O, T4N);
 
499
                                        T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
 
500
                                        T1k = VFMA(LDK(KP923879532), T1j, Ty);
 
501
                                        T3k = VFNMS(LDK(KP923879532), T1j, Ty);
 
502
                                        {
 
503
                                             V T4L, T50, T4S, T4X;
 
504
                                             T4L = VADD(T4H, T4K);
 
505
                                             T50 = VSUB(T4H, T4K);
 
506
                                             T4S = VSUB(T4Q, T4R);
 
507
                                             T4X = VADD(T4Q, T4R);
 
508
                                             {
 
509
                                                  V T4f, T4o, T4x, T4z;
 
510
                                                  T4f = VADD(T47, T4e);
 
511
                                                  T4o = VSUB(T47, T4e);
 
512
                                                  T4x = VSUB(T4v, T4w);
 
513
                                                  T4z = VADD(T4v, T4w);
 
514
                                                  {
 
515
                                                       V T53, T51, T4M, T4U;
 
516
                                                       T53 = VFNMS(LDK(KP923879532), T50, T4Z);
 
517
                                                       T51 = VFMA(LDK(KP923879532), T50, T4Z);
 
518
                                                       T4M = VFNMS(LDK(KP923879532), T4L, T4E);
 
519
                                                       T4U = VFMA(LDK(KP923879532), T4L, T4E);
 
520
                                                       {
 
521
                                                            V T52, T4Y, T4T, T4V;
 
522
                                                            T52 = VFMA(LDK(KP923879532), T4X, T4W);
 
523
                                                            T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
 
524
                                                            T4T = VFNMS(LDK(KP923879532), T4S, T4P);
 
525
                                                            T4V = VFMA(LDK(KP923879532), T4S, T4P);
 
526
                                                            {
 
527
                                                                 V T4p, T4r, T4g, T4q;
 
528
                                                                 T4p = VFNMS(LDK(KP707106781), T4o, T4n);
 
529
                                                                 T4r = VFMA(LDK(KP707106781), T4o, T4n);
 
530
                                                                 T4g = VFNMS(LDK(KP707106781), T4f, T40);
 
531
                                                                 T4q = VFMA(LDK(KP707106781), T4f, T40);
 
532
                                                                 ST(&(x[0]), VADD(T4y, T4z), dist, &(x[0]));
 
533
                                                                 ST(&(x[WS(ios, 32)]), VSUB(T4y, T4z), dist, &(x[0]));
 
534
                                                                 ST(&(x[WS(ios, 16)]), VFMAI(T4x, T4u), dist, &(x[0]));
 
535
                                                                 ST(&(x[WS(ios, 48)]), VFNMSI(T4x, T4u), dist, &(x[0]));
 
536
                                                                 ST(&(x[WS(ios, 44)]), VFNMSI(T51, T4Y), dist, &(x[0]));
 
537
                                                                 ST(&(x[WS(ios, 20)]), VFMAI(T51, T4Y), dist, &(x[0]));
 
538
                                                                 ST(&(x[WS(ios, 52)]), VFMAI(T53, T52), dist, &(x[0]));
 
539
                                                                 ST(&(x[WS(ios, 12)]), VFNMSI(T53, T52), dist, &(x[0]));
 
540
                                                                 ST(&(x[WS(ios, 4)]), VFMAI(T4V, T4U), dist, &(x[0]));
 
541
                                                                 ST(&(x[WS(ios, 60)]), VFNMSI(T4V, T4U), dist, &(x[0]));
 
542
                                                                 ST(&(x[WS(ios, 36)]), VFMAI(T4T, T4M), dist, &(x[0]));
 
543
                                                                 ST(&(x[WS(ios, 28)]), VFNMSI(T4T, T4M), dist, &(x[0]));
 
544
                                                                 ST(&(x[WS(ios, 56)]), VFNMSI(T4r, T4q), dist, &(x[0]));
 
545
                                                                 ST(&(x[WS(ios, 8)]), VFMAI(T4r, T4q), dist, &(x[0]));
 
546
                                                                 ST(&(x[WS(ios, 40)]), VFMAI(T4p, T4g), dist, &(x[0]));
 
547
                                                                 ST(&(x[WS(ios, 24)]), VFNMSI(T4p, T4g), dist, &(x[0]));
 
548
                                                                 T3t = VADD(T3a, T3b);
 
549
                                                                 T3c = VSUB(T3a, T3b);
 
550
                                                            }
 
551
                                                       }
 
552
                                                  }
 
553
                                             }
 
554
                                        }
 
555
                                   }
 
556
                                   T1T = VFMA(LDK(KP707106781), T1S, T1v);
 
557
                                   T3v = VFNMS(LDK(KP707106781), T1S, T1v);
 
558
                                   T3w = VFNMS(LDK(KP707106781), T25, T24);
 
559
                                   T26 = VFMA(LDK(KP707106781), T25, T24);
 
560
                                   T2G = VFMA(LDK(KP707106781), T2F, T2i);
 
561
                                   T3y = VFNMS(LDK(KP707106781), T2F, T2i);
 
562
                                   T3z = VFNMS(LDK(KP707106781), T2S, T2R);
 
563
                                   T2T = VFMA(LDK(KP707106781), T2S, T2R);
 
564
                              }
 
565
                         }
 
566
                    }
 
567
                    {
 
568
                         V T3u, T3M, T3F, T3P, T3x, T3G, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
 
569
                         {
 
570
                              V T3d, T3n, T27, T3e, T2U, T3f;
 
571
                              T3d = VFMA(LDK(KP923879532), T3c, T39);
 
572
                              T3n = VFNMS(LDK(KP923879532), T3c, T39);
 
573
                              T27 = VFNMS(LDK(KP198912367), T26, T1T);
 
574
                              T3e = VFMA(LDK(KP198912367), T1T, T26);
 
575
                              T2U = VFNMS(LDK(KP198912367), T2T, T2G);
 
576
                              T3f = VFMA(LDK(KP198912367), T2G, T2T);
 
577
                              T3u = VFMA(LDK(KP923879532), T3t, T3s);
 
578
                              T3M = VFNMS(LDK(KP923879532), T3t, T3s);
 
579
                              {
 
580
                                   V T3g, T3l, T2V, T3o;
 
581
                                   T3g = VSUB(T3e, T3f);
 
582
                                   T3l = VADD(T3e, T3f);
 
583
                                   T2V = VADD(T27, T2U);
 
584
                                   T3o = VSUB(T27, T2U);
 
585
                                   T3F = VFNMS(LDK(KP923879532), T3E, T3D);
 
586
                                   T3P = VFMA(LDK(KP923879532), T3E, T3D);
 
587
                                   T3x = VFMA(LDK(KP668178637), T3w, T3v);
 
588
                                   T3G = VFNMS(LDK(KP668178637), T3v, T3w);
 
589
                                   T3q = VFMA(LDK(KP980785280), T3l, T3k);
 
590
                                   T3m = VFNMS(LDK(KP980785280), T3l, T3k);
 
591
                                   T3h = VFNMS(LDK(KP980785280), T3g, T3d);
 
592
                                   T3j = VFMA(LDK(KP980785280), T3g, T3d);
 
593
                                   T3r = VFNMS(LDK(KP980785280), T3o, T3n);
 
594
                                   T3p = VFMA(LDK(KP980785280), T3o, T3n);
 
595
                                   T2W = VFNMS(LDK(KP980785280), T2V, T1k);
 
596
                                   T3i = VFMA(LDK(KP980785280), T2V, T1k);
 
597
                              }
 
598
                         }
 
599
                         {
 
600
                              V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
 
601
                              V T7x;
 
602
                              {
 
603
                                   V T7c, T7W, T7D, T87, T7f, T7E, T3A, T3H, T7F, T7i;
 
604
                                   T7c = VFNMS(LDK(KP923879532), T7b, T7a);
 
605
                                   T7W = VFMA(LDK(KP923879532), T7b, T7a);
 
606
                                   T7D = VFMA(LDK(KP923879532), T7C, T7B);
 
607
                                   T87 = VFNMS(LDK(KP923879532), T7C, T7B);
 
608
                                   T7f = VFNMS(LDK(KP668178637), T7e, T7d);
 
609
                                   T7E = VFMA(LDK(KP668178637), T7d, T7e);
 
610
                                   ST(&(x[WS(ios, 46)]), VFNMSI(T3p, T3m), dist, &(x[0]));
 
611
                                   ST(&(x[WS(ios, 18)]), VFMAI(T3p, T3m), dist, &(x[0]));
 
612
                                   ST(&(x[WS(ios, 50)]), VFMAI(T3r, T3q), dist, &(x[0]));
 
613
                                   ST(&(x[WS(ios, 14)]), VFNMSI(T3r, T3q), dist, &(x[0]));
 
614
                                   ST(&(x[WS(ios, 2)]), VFMAI(T3j, T3i), dist, &(x[0]));
 
615
                                   ST(&(x[WS(ios, 62)]), VFNMSI(T3j, T3i), dist, &(x[0]));
 
616
                                   ST(&(x[WS(ios, 34)]), VFMAI(T3h, T2W), dist, &(x[0]));
 
617
                                   ST(&(x[WS(ios, 30)]), VFNMSI(T3h, T2W), dist, &(x[0]));
 
618
                                   T3A = VFMA(LDK(KP668178637), T3z, T3y);
 
619
                                   T3H = VFNMS(LDK(KP668178637), T3y, T3z);
 
620
                                   T7F = VFMA(LDK(KP668178637), T7g, T7h);
 
621
                                   T7i = VFNMS(LDK(KP668178637), T7h, T7g);
 
622
                                   T7n = VFNMS(LDK(KP923879532), T7m, T7l);
 
623
                                   T7Z = VFMA(LDK(KP923879532), T7m, T7l);
 
624
                                   {
 
625
                                        V T3I, T3N, T3B, T3Q;
 
626
                                        T3I = VSUB(T3G, T3H);
 
627
                                        T3N = VADD(T3G, T3H);
 
628
                                        T3B = VADD(T3x, T3A);
 
629
                                        T3Q = VSUB(T3x, T3A);
 
630
                                        {
 
631
                                             V T7j, T88, T7G, T7X;
 
632
                                             T7j = VADD(T7f, T7i);
 
633
                                             T88 = VSUB(T7f, T7i);
 
634
                                             T7G = VSUB(T7E, T7F);
 
635
                                             T7X = VADD(T7E, T7F);
 
636
                                             {
 
637
                                                  V T3S, T3O, T3J, T3L;
 
638
                                                  T3S = VFNMS(LDK(KP831469612), T3N, T3M);
 
639
                                                  T3O = VFMA(LDK(KP831469612), T3N, T3M);
 
640
                                                  T3J = VFNMS(LDK(KP831469612), T3I, T3F);
 
641
                                                  T3L = VFMA(LDK(KP831469612), T3I, T3F);
 
642
                                                  {
 
643
                                                       V T3T, T3R, T3C, T3K;
 
644
                                                       T3T = VFMA(LDK(KP831469612), T3Q, T3P);
 
645
                                                       T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
 
646
                                                       T3C = VFNMS(LDK(KP831469612), T3B, T3u);
 
647
                                                       T3K = VFMA(LDK(KP831469612), T3B, T3u);
 
648
                                                       T8j = VFNMS(LDK(KP831469612), T88, T87);
 
649
                                                       T89 = VFMA(LDK(KP831469612), T88, T87);
 
650
                                                       T7k = VFNMS(LDK(KP831469612), T7j, T7c);
 
651
                                                       T7O = VFMA(LDK(KP831469612), T7j, T7c);
 
652
                                                       T8g = VFNMS(LDK(KP831469612), T7X, T7W);
 
653
                                                       T7Y = VFMA(LDK(KP831469612), T7X, T7W);
 
654
                                                       T7H = VFMA(LDK(KP831469612), T7G, T7D);
 
655
                                                       T7R = VFNMS(LDK(KP831469612), T7G, T7D);
 
656
                                                       ST(&(x[WS(ios, 42)]), VFMAI(T3R, T3O), dist, &(x[0]));
 
657
                                                       ST(&(x[WS(ios, 22)]), VFNMSI(T3R, T3O), dist, &(x[0]));
 
658
                                                       ST(&(x[WS(ios, 54)]), VFNMSI(T3T, T3S), dist, &(x[0]));
 
659
                                                       ST(&(x[WS(ios, 10)]), VFMAI(T3T, T3S), dist, &(x[0]));
 
660
                                                       ST(&(x[WS(ios, 58)]), VFMAI(T3L, T3K), dist, &(x[0]));
 
661
                                                       ST(&(x[WS(ios, 6)]), VFNMSI(T3L, T3K), dist, &(x[0]));
 
662
                                                       ST(&(x[WS(ios, 26)]), VFMAI(T3J, T3C), dist, &(x[0]));
 
663
                                                       ST(&(x[WS(ios, 38)]), VFNMSI(T3J, T3C), dist, &(x[0]));
 
664
                                                       T80 = VFNMS(LDK(KP923879532), T7p, T7o);
 
665
                                                       T7q = VFMA(LDK(KP923879532), T7p, T7o);
 
666
                                                  }
 
667
                                             }
 
668
                                        }
 
669
                                   }
 
670
                                   T7u = VFNMS(LDK(KP923879532), T7t, T7s);
 
671
                                   T82 = VFMA(LDK(KP923879532), T7t, T7s);
 
672
                                   T83 = VFNMS(LDK(KP923879532), T7w, T7v);
 
673
                                   T7x = VFMA(LDK(KP923879532), T7w, T7v);
 
674
                              }
 
675
                              {
 
676
                                   V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
 
677
                                   T5g = VFMA(LDK(KP923879532), T5f, T58);
 
678
                                   T6I = VFNMS(LDK(KP923879532), T5f, T58);
 
679
                                   {
 
680
                                        V T7r, T7I, T7y, T7J;
 
681
                                        T7r = VFNMS(LDK(KP534511135), T7q, T7n);
 
682
                                        T7I = VFMA(LDK(KP534511135), T7n, T7q);
 
683
                                        T7y = VFNMS(LDK(KP534511135), T7x, T7u);
 
684
                                        T7J = VFMA(LDK(KP534511135), T7u, T7x);
 
685
                                        {
 
686
                                             V T81, T8a, T84, T8b;
 
687
                                             T81 = VFMA(LDK(KP303346683), T80, T7Z);
 
688
                                             T8a = VFNMS(LDK(KP303346683), T7Z, T80);
 
689
                                             T84 = VFMA(LDK(KP303346683), T83, T82);
 
690
                                             T8b = VFNMS(LDK(KP303346683), T82, T83);
 
691
                                             T6p = VFMA(LDK(KP923879532), T6o, T6l);
 
692
                                             T6T = VFNMS(LDK(KP923879532), T6o, T6l);
 
693
                                             T5p = VFNMS(LDK(KP198912367), T5o, T5l);
 
694
                                             T6q = VFMA(LDK(KP198912367), T5l, T5o);
 
695
                                             {
 
696
                                                  V T7K, T7P, T7z, T7S;
 
697
                                                  T7K = VSUB(T7I, T7J);
 
698
                                                  T7P = VADD(T7I, T7J);
 
699
                                                  T7z = VADD(T7r, T7y);
 
700
                                                  T7S = VSUB(T7r, T7y);
 
701
                                                  {
 
702
                                                       V T8c, T8h, T85, T8k;
 
703
                                                       T8c = VSUB(T8a, T8b);
 
704
                                                       T8h = VADD(T8a, T8b);
 
705
                                                       T85 = VADD(T81, T84);
 
706
                                                       T8k = VSUB(T81, T84);
 
707
                                                       {
 
708
                                                            V T7Q, T7U, T7L, T7N;
 
709
                                                            T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
 
710
                                                            T7U = VFMA(LDK(KP881921264), T7P, T7O);
 
711
                                                            T7L = VFNMS(LDK(KP881921264), T7K, T7H);
 
712
                                                            T7N = VFMA(LDK(KP881921264), T7K, T7H);
 
713
                                                            {
 
714
                                                                 V T7T, T7V, T7A, T7M;
 
715
                                                                 T7T = VFMA(LDK(KP881921264), T7S, T7R);
 
716
                                                                 T7V = VFNMS(LDK(KP881921264), T7S, T7R);
 
717
                                                                 T7A = VFNMS(LDK(KP881921264), T7z, T7k);
 
718
                                                                 T7M = VFMA(LDK(KP881921264), T7z, T7k);
 
719
                                                                 {
 
720
                                                                      V T8i, T8m, T8d, T8f;
 
721
                                                                      T8i = VFMA(LDK(KP956940335), T8h, T8g);
 
722
                                                                      T8m = VFNMS(LDK(KP956940335), T8h, T8g);
 
723
                                                                      T8d = VFNMS(LDK(KP956940335), T8c, T89);
 
724
                                                                      T8f = VFMA(LDK(KP956940335), T8c, T89);
 
725
                                                                      {
 
726
                                                                           V T8l, T8n, T86, T8e;
 
727
                                                                           T8l = VFNMS(LDK(KP956940335), T8k, T8j);
 
728
                                                                           T8n = VFMA(LDK(KP956940335), T8k, T8j);
 
729
                                                                           T86 = VFNMS(LDK(KP956940335), T85, T7Y);
 
730
                                                                           T8e = VFMA(LDK(KP956940335), T85, T7Y);
 
731
                                                                           ST(&(x[WS(ios, 53)]), VFMAI(T7V, T7U), dist, &(x[WS(ios, 1)]));
 
732
                                                                           ST(&(x[WS(ios, 11)]), VFNMSI(T7V, T7U), dist, &(x[WS(ios, 1)]));
 
733
                                                                           ST(&(x[WS(ios, 43)]), VFNMSI(T7T, T7Q), dist, &(x[WS(ios, 1)]));
 
734
                                                                           ST(&(x[WS(ios, 21)]), VFMAI(T7T, T7Q), dist, &(x[WS(ios, 1)]));
 
735
                                                                           ST(&(x[WS(ios, 5)]), VFMAI(T7N, T7M), dist, &(x[WS(ios, 1)]));
 
736
                                                                           ST(&(x[WS(ios, 59)]), VFNMSI(T7N, T7M), dist, &(x[WS(ios, 1)]));
 
737
                                                                           ST(&(x[WS(ios, 37)]), VFMAI(T7L, T7A), dist, &(x[WS(ios, 1)]));
 
738
                                                                           ST(&(x[WS(ios, 27)]), VFNMSI(T7L, T7A), dist, &(x[WS(ios, 1)]));
 
739
                                                                           ST(&(x[WS(ios, 51)]), VFNMSI(T8n, T8m), dist, &(x[WS(ios, 1)]));
 
740
                                                                           ST(&(x[WS(ios, 13)]), VFMAI(T8n, T8m), dist, &(x[WS(ios, 1)]));
 
741
                                                                           ST(&(x[WS(ios, 45)]), VFMAI(T8l, T8i), dist, &(x[WS(ios, 1)]));
 
742
                                                                           ST(&(x[WS(ios, 19)]), VFNMSI(T8l, T8i), dist, &(x[WS(ios, 1)]));
 
743
                                                                           ST(&(x[WS(ios, 61)]), VFMAI(T8f, T8e), dist, &(x[WS(ios, 1)]));
 
744
                                                                           ST(&(x[WS(ios, 3)]), VFNMSI(T8f, T8e), dist, &(x[WS(ios, 1)]));
 
745
                                                                           ST(&(x[WS(ios, 29)]), VFMAI(T8d, T86), dist, &(x[WS(ios, 1)]));
 
746
                                                                           ST(&(x[WS(ios, 35)]), VFNMSI(T8d, T86), dist, &(x[WS(ios, 1)]));
 
747
                                                                           T6r = VFMA(LDK(KP198912367), T5u, T5x);
 
748
                                                                           T5y = VFNMS(LDK(KP198912367), T5x, T5u);
 
749
                                                                      }
 
750
                                                                 }
 
751
                                                            }
 
752
                                                       }
 
753
                                                  }
 
754
                                             }
 
755
                                        }
 
756
                                   }
 
757
                                   {
 
758
                                        V T5N, T5U, T68, T5z, T6U, T6f;
 
759
                                        T5N = VFMA(LDK(KP923879532), T5M, T5F);
 
760
                                        T6L = VFNMS(LDK(KP923879532), T5M, T5F);
 
761
                                        T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
 
762
                                        T5U = VFMA(LDK(KP923879532), T5T, T5Q);
 
763
                                        T68 = VFMA(LDK(KP923879532), T67, T60);
 
764
                                        T6O = VFNMS(LDK(KP923879532), T67, T60);
 
765
                                        T5z = VADD(T5p, T5y);
 
766
                                        T6U = VSUB(T5p, T5y);
 
767
                                        T6P = VFNMS(LDK(KP923879532), T6e, T6b);
 
768
                                        T6f = VFMA(LDK(KP923879532), T6e, T6b);
 
769
                                        {
 
770
                                             V T5V, T6u, T6g, T6v, T6s, T6J;
 
771
                                             T6s = VSUB(T6q, T6r);
 
772
                                             T6J = VADD(T6q, T6r);
 
773
                                             T5V = VFNMS(LDK(KP098491403), T5U, T5N);
 
774
                                             T6u = VFMA(LDK(KP098491403), T5N, T5U);
 
775
                                             T75 = VFMA(LDK(KP980785280), T6U, T6T);
 
776
                                             T6V = VFNMS(LDK(KP980785280), T6U, T6T);
 
777
                                             T5A = VFMA(LDK(KP980785280), T5z, T5g);
 
778
                                             T6A = VFNMS(LDK(KP980785280), T5z, T5g);
 
779
                                             T6g = VFNMS(LDK(KP098491403), T6f, T68);
 
780
                                             T6v = VFMA(LDK(KP098491403), T68, T6f);
 
781
                                             T72 = VFNMS(LDK(KP980785280), T6J, T6I);
 
782
                                             T6K = VFMA(LDK(KP980785280), T6J, T6I);
 
783
                                             T6t = VFMA(LDK(KP980785280), T6s, T6p);
 
784
                                             T6D = VFNMS(LDK(KP980785280), T6s, T6p);
 
785
                                             T6w = VSUB(T6u, T6v);
 
786
                                             T6B = VADD(T6u, T6v);
 
787
                                             T6h = VADD(T5V, T6g);
 
788
                                             T6E = VSUB(T5V, T6g);
 
789
                                        }
 
790
                                   }
 
791
                              }
 
792
                         }
 
793
                    }
 
794
               }
 
795
          }
 
796
          {
 
797
               V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
 
798
               T6W = VFNMS(LDK(KP820678790), T6L, T6M);
 
799
               T6N = VFMA(LDK(KP820678790), T6M, T6L);
 
800
               T6G = VFMA(LDK(KP995184726), T6B, T6A);
 
801
               T6C = VFNMS(LDK(KP995184726), T6B, T6A);
 
802
               T6z = VFMA(LDK(KP995184726), T6w, T6t);
 
803
               T6x = VFNMS(LDK(KP995184726), T6w, T6t);
 
804
               T6H = VFNMS(LDK(KP995184726), T6E, T6D);
 
805
               T6F = VFMA(LDK(KP995184726), T6E, T6D);
 
806
               T6y = VFMA(LDK(KP995184726), T6h, T5A);
 
807
               T6i = VFNMS(LDK(KP995184726), T6h, T5A);
 
808
               T6X = VFNMS(LDK(KP820678790), T6O, T6P);
 
809
               T6Q = VFMA(LDK(KP820678790), T6P, T6O);
 
810
               {
 
811
                    V T73, T6Y, T76, T6R;
 
812
                    ST(&(x[WS(ios, 49)]), VFMAI(T6H, T6G), dist, &(x[WS(ios, 1)]));
 
813
                    ST(&(x[WS(ios, 15)]), VFNMSI(T6H, T6G), dist, &(x[WS(ios, 1)]));
 
814
                    ST(&(x[WS(ios, 47)]), VFNMSI(T6F, T6C), dist, &(x[WS(ios, 1)]));
 
815
                    ST(&(x[WS(ios, 17)]), VFMAI(T6F, T6C), dist, &(x[WS(ios, 1)]));
 
816
                    ST(&(x[WS(ios, 1)]), VFMAI(T6z, T6y), dist, &(x[WS(ios, 1)]));
 
817
                    ST(&(x[WS(ios, 63)]), VFNMSI(T6z, T6y), dist, &(x[WS(ios, 1)]));
 
818
                    ST(&(x[WS(ios, 33)]), VFMAI(T6x, T6i), dist, &(x[WS(ios, 1)]));
 
819
                    ST(&(x[WS(ios, 31)]), VFNMSI(T6x, T6i), dist, &(x[WS(ios, 1)]));
 
820
                    T73 = VADD(T6W, T6X);
 
821
                    T6Y = VSUB(T6W, T6X);
 
822
                    T76 = VSUB(T6N, T6Q);
 
823
                    T6R = VADD(T6N, T6Q);
 
824
                    {
 
825
                         V T78, T74, T71, T6Z, T79, T77, T70, T6S;
 
826
                         T78 = VFNMS(LDK(KP773010453), T73, T72);
 
827
                         T74 = VFMA(LDK(KP773010453), T73, T72);
 
828
                         T71 = VFMA(LDK(KP773010453), T6Y, T6V);
 
829
                         T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
 
830
                         T79 = VFMA(LDK(KP773010453), T76, T75);
 
831
                         T77 = VFNMS(LDK(KP773010453), T76, T75);
 
832
                         T70 = VFMA(LDK(KP773010453), T6R, T6K);
 
833
                         T6S = VFNMS(LDK(KP773010453), T6R, T6K);
 
834
                         ST(&(x[WS(ios, 55)]), VFNMSI(T79, T78), dist, &(x[WS(ios, 1)]));
 
835
                         ST(&(x[WS(ios, 9)]), VFMAI(T79, T78), dist, &(x[WS(ios, 1)]));
 
836
                         ST(&(x[WS(ios, 41)]), VFMAI(T77, T74), dist, &(x[WS(ios, 1)]));
 
837
                         ST(&(x[WS(ios, 23)]), VFNMSI(T77, T74), dist, &(x[WS(ios, 1)]));
 
838
                         ST(&(x[WS(ios, 57)]), VFMAI(T71, T70), dist, &(x[WS(ios, 1)]));
 
839
                         ST(&(x[WS(ios, 7)]), VFNMSI(T71, T70), dist, &(x[WS(ios, 1)]));
 
840
                         ST(&(x[WS(ios, 25)]), VFMAI(T6Z, T6S), dist, &(x[WS(ios, 1)]));
 
841
                         ST(&(x[WS(ios, 39)]), VFNMSI(T6Z, T6S), dist, &(x[WS(ios, 1)]));
 
842
                    }
 
843
               }
 
844
          }
 
845
     }
 
846
     return W;
 
847
}
 
848
 
 
849
static const tw_instr twinstr[] = {
 
850
     VTW(1),
 
851
     VTW(2),
 
852
     VTW(3),
 
853
     VTW(4),
 
854
     VTW(5),
 
855
     VTW(6),
 
856
     VTW(7),
 
857
     VTW(8),
 
858
     VTW(9),
 
859
     VTW(10),
 
860
     VTW(11),
 
861
     VTW(12),
 
862
     VTW(13),
 
863
     VTW(14),
 
864
     VTW(15),
 
865
     VTW(16),
 
866
     VTW(17),
 
867
     VTW(18),
 
868
     VTW(19),
 
869
     VTW(20),
 
870
     VTW(21),
 
871
     VTW(22),
 
872
     VTW(23),
 
873
     VTW(24),
 
874
     VTW(25),
 
875
     VTW(26),
 
876
     VTW(27),
 
877
     VTW(28),
 
878
     VTW(29),
 
879
     VTW(30),
 
880
     VTW(31),
 
881
     VTW(32),
 
882
     VTW(33),
 
883
     VTW(34),
 
884
     VTW(35),
 
885
     VTW(36),
 
886
     VTW(37),
 
887
     VTW(38),
 
888
     VTW(39),
 
889
     VTW(40),
 
890
     VTW(41),
 
891
     VTW(42),
 
892
     VTW(43),
 
893
     VTW(44),
 
894
     VTW(45),
 
895
     VTW(46),
 
896
     VTW(47),
 
897
     VTW(48),
 
898
     VTW(49),
 
899
     VTW(50),
 
900
     VTW(51),
 
901
     VTW(52),
 
902
     VTW(53),
 
903
     VTW(54),
 
904
     VTW(55),
 
905
     VTW(56),
 
906
     VTW(57),
 
907
     VTW(58),
 
908
     VTW(59),
 
909
     VTW(60),
 
910
     VTW(61),
 
911
     VTW(62),
 
912
     VTW(63),
 
913
     {TW_NEXT, VL, 0}
 
914
};
 
915
 
 
916
/*
 * Descriptor passed, together with t2bv_64, to X(kdft_dit_register).
 * The initializer is positional; ct_desc is declared outside this file, so
 * the per-field notes below are assumptions to be confirmed against it.
 */
static const ct_desc desc = {
     64,                        /* presumably the radix (matches -n 64) */
     "t2bv_64",                 /* codelet name string */
     twinstr,                   /* tw_instr table defined in this file */
     &GENUS,                    /* GENUS comes from the included header */
     {261, 126, 258, 0},        /* presumably op counts: add, mul, fma, other */
     0,                         /* remaining three fields: meaning not */
     0,                         /* visible here -- TODO confirm against */
     0                          /* the ct_desc declaration */
};
 
917
 
 
918
/*
 * Registration entry point for this codelet: passes the t2bv_64 routine and
 * its descriptor to X(kdft_dit_register) for the given planner.
 *
 * p: planner forwarded unchanged to the registration call.
 */
void X(codelet_t2bv_64)(planner *p)
{
     X(kdft_dit_register)(p, t2bv_64, &desc);
}
 
921
#else                           /* HAVE_FMA */
 
922
 
 
923
/* Generated by: ../../../genfft/gen_twiddle_c -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include t2b.h -sign 1 */
 
924
 
 
925
/*
 
926
 * This function contains 519 FP additions, 250 FP multiplications,
 
927
 * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
 
928
 * 107 stack variables, and 128 memory accesses
 
929
 */
 
930
/*
 
931
 * Generator Id's : 
 
932
 * $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
 
933
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 
934
 * $Id: gen_twiddle_c.ml,v 1.14 2006-02-12 23:34:12 athena Exp $
 
935
 */
 
936
 
 
937
#include "t2b.h"
 
938
 
 
939
static const R *t2bv_64(R *ri, R *ii, const R *W, stride ios, INT m, INT dist)
 
940
{
 
941
     DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
 
942
     DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
 
943
     DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
 
944
     DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
 
945
     DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
 
946
     DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
 
947
     DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
 
948
     DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
 
949
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
 
950
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
 
951
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
 
952
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
 
953
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
 
954
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
955
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
956
     INT i;
 
957
     R *x;
 
958
     x = ii;
 
959
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(ios)) {
 
960
          V Tg, T4B, T6v, T7G, T3r, T4w, T5q, T7F, T5Y, T62, T28, T4d, T2g, T4a, T7g;
 
961
          V T7Y, T6f, T6j, T2Z, T4k, T37, T4h, T7n, T81, T7w, T7x, T7y, T5M, T6q, T1k;
 
962
          V T4s, T1r, T4t, T7t, T7u, T7v, T5F, T6p, TV, T4p, T12, T4q, T7A, T7B, TD;
 
963
          V T4x, T3k, T4C, T5x, T6s, T1R, T4b, T7j, T7Z, T2j, T4e, T5V, T63, T2I, T4i;
 
964
          V T7q, T82, T3a, T4l, T6c, T6k;
 
965
          {
 
966
               V T1, T3, T3p, T3n, Tb, Td, Te, T6, T8, T9, T2, T3o, T3m;
 
967
               T1 = LD(&(x[0]), dist, &(x[0]));
 
968
               T2 = LD(&(x[WS(ios, 32)]), dist, &(x[0]));
 
969
               T3 = BYTW(&(W[TWVL * 62]), T2);
 
970
               T3o = LD(&(x[WS(ios, 48)]), dist, &(x[0]));
 
971
               T3p = BYTW(&(W[TWVL * 94]), T3o);
 
972
               T3m = LD(&(x[WS(ios, 16)]), dist, &(x[0]));
 
973
               T3n = BYTW(&(W[TWVL * 30]), T3m);
 
974
               {
 
975
                    V Ta, Tc, T5, T7;
 
976
                    Ta = LD(&(x[WS(ios, 56)]), dist, &(x[0]));
 
977
                    Tb = BYTW(&(W[TWVL * 110]), Ta);
 
978
                    Tc = LD(&(x[WS(ios, 24)]), dist, &(x[0]));
 
979
                    Td = BYTW(&(W[TWVL * 46]), Tc);
 
980
                    Te = VSUB(Tb, Td);
 
981
                    T5 = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
 
982
                    T6 = BYTW(&(W[TWVL * 14]), T5);
 
983
                    T7 = LD(&(x[WS(ios, 40)]), dist, &(x[0]));
 
984
                    T8 = BYTW(&(W[TWVL * 78]), T7);
 
985
                    T9 = VSUB(T6, T8);
 
986
               }
 
987
               {
 
988
                    V T4, Tf, T6t, T6u;
 
989
                    T4 = VSUB(T1, T3);
 
990
                    Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
 
991
                    Tg = VSUB(T4, Tf);
 
992
                    T4B = VADD(T4, Tf);
 
993
                    T6t = VADD(T6, T8);
 
994
                    T6u = VADD(Tb, Td);
 
995
                    T6v = VSUB(T6t, T6u);
 
996
                    T7G = VADD(T6t, T6u);
 
997
               }
 
998
               {
 
999
                    V T3l, T3q, T5o, T5p;
 
1000
                    T3l = VMUL(LDK(KP707106781), VSUB(T9, Te));
 
1001
                    T3q = VSUB(T3n, T3p);
 
1002
                    T3r = VSUB(T3l, T3q);
 
1003
                    T4w = VADD(T3q, T3l);
 
1004
                    T5o = VADD(T1, T3);
 
1005
                    T5p = VADD(T3n, T3p);
 
1006
                    T5q = VSUB(T5o, T5p);
 
1007
                    T7F = VADD(T5o, T5p);
 
1008
               }
 
1009
          }
 
1010
          {
 
1011
               V T24, T26, T61, T2b, T2d, T60, T1W, T5W, T21, T5X, T22, T27;
 
1012
               {
 
1013
                    V T23, T25, T2a, T2c;
 
1014
                    T23 = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)]));
 
1015
                    T24 = BYTW(&(W[TWVL * 32]), T23);
 
1016
                    T25 = LD(&(x[WS(ios, 49)]), dist, &(x[WS(ios, 1)]));
 
1017
                    T26 = BYTW(&(W[TWVL * 96]), T25);
 
1018
                    T61 = VADD(T24, T26);
 
1019
                    T2a = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
 
1020
                    T2b = BYTW(&(W[0]), T2a);
 
1021
                    T2c = LD(&(x[WS(ios, 33)]), dist, &(x[WS(ios, 1)]));
 
1022
                    T2d = BYTW(&(W[TWVL * 64]), T2c);
 
1023
                    T60 = VADD(T2b, T2d);
 
1024
               }
 
1025
               {
 
1026
                    V T1T, T1V, T1S, T1U;
 
1027
                    T1S = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
 
1028
                    T1T = BYTW(&(W[TWVL * 16]), T1S);
 
1029
                    T1U = LD(&(x[WS(ios, 41)]), dist, &(x[WS(ios, 1)]));
 
1030
                    T1V = BYTW(&(W[TWVL * 80]), T1U);
 
1031
                    T1W = VSUB(T1T, T1V);
 
1032
                    T5W = VADD(T1T, T1V);
 
1033
               }
 
1034
               {
 
1035
                    V T1Y, T20, T1X, T1Z;
 
1036
                    T1X = LD(&(x[WS(ios, 57)]), dist, &(x[WS(ios, 1)]));
 
1037
                    T1Y = BYTW(&(W[TWVL * 112]), T1X);
 
1038
                    T1Z = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)]));
 
1039
                    T20 = BYTW(&(W[TWVL * 48]), T1Z);
 
1040
                    T21 = VSUB(T1Y, T20);
 
1041
                    T5X = VADD(T1Y, T20);
 
1042
               }
 
1043
               T5Y = VSUB(T5W, T5X);
 
1044
               T62 = VSUB(T60, T61);
 
1045
               T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
 
1046
               T27 = VSUB(T24, T26);
 
1047
               T28 = VSUB(T22, T27);
 
1048
               T4d = VADD(T27, T22);
 
1049
               {
 
1050
                    V T2e, T2f, T7e, T7f;
 
1051
                    T2e = VSUB(T2b, T2d);
 
1052
                    T2f = VMUL(LDK(KP707106781), VADD(T1W, T21));
 
1053
                    T2g = VSUB(T2e, T2f);
 
1054
                    T4a = VADD(T2e, T2f);
 
1055
                    T7e = VADD(T60, T61);
 
1056
                    T7f = VADD(T5W, T5X);
 
1057
                    T7g = VSUB(T7e, T7f);
 
1058
                    T7Y = VADD(T7e, T7f);
 
1059
               }
 
1060
          }
 
1061
          {
 
1062
               V T2V, T2X, T6i, T32, T34, T6h, T2N, T6d, T2S, T6e, T2T, T2Y;
 
1063
               {
 
1064
                    V T2U, T2W, T31, T33;
 
1065
                    T2U = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)]));
 
1066
                    T2V = BYTW(&(W[TWVL * 28]), T2U);
 
1067
                    T2W = LD(&(x[WS(ios, 47)]), dist, &(x[WS(ios, 1)]));
 
1068
                    T2X = BYTW(&(W[TWVL * 92]), T2W);
 
1069
                    T6i = VADD(T2V, T2X);
 
1070
                    T31 = LD(&(x[WS(ios, 63)]), dist, &(x[WS(ios, 1)]));
 
1071
                    T32 = BYTW(&(W[TWVL * 124]), T31);
 
1072
                    T33 = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)]));
 
1073
                    T34 = BYTW(&(W[TWVL * 60]), T33);
 
1074
                    T6h = VADD(T32, T34);
 
1075
               }
 
1076
               {
 
1077
                    V T2K, T2M, T2J, T2L;
 
1078
                    T2J = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
 
1079
                    T2K = BYTW(&(W[TWVL * 12]), T2J);
 
1080
                    T2L = LD(&(x[WS(ios, 39)]), dist, &(x[WS(ios, 1)]));
 
1081
                    T2M = BYTW(&(W[TWVL * 76]), T2L);
 
1082
                    T2N = VSUB(T2K, T2M);
 
1083
                    T6d = VADD(T2K, T2M);
 
1084
               }
 
1085
               {
 
1086
                    V T2P, T2R, T2O, T2Q;
 
1087
                    T2O = LD(&(x[WS(ios, 55)]), dist, &(x[WS(ios, 1)]));
 
1088
                    T2P = BYTW(&(W[TWVL * 108]), T2O);
 
1089
                    T2Q = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)]));
 
1090
                    T2R = BYTW(&(W[TWVL * 44]), T2Q);
 
1091
                    T2S = VSUB(T2P, T2R);
 
1092
                    T6e = VADD(T2P, T2R);
 
1093
               }
 
1094
               T6f = VSUB(T6d, T6e);
 
1095
               T6j = VSUB(T6h, T6i);
 
1096
               T2T = VMUL(LDK(KP707106781), VSUB(T2N, T2S));
 
1097
               T2Y = VSUB(T2V, T2X);
 
1098
               T2Z = VSUB(T2T, T2Y);
 
1099
               T4k = VADD(T2Y, T2T);
 
1100
               {
 
1101
                    V T35, T36, T7l, T7m;
 
1102
                    T35 = VSUB(T32, T34);
 
1103
                    T36 = VMUL(LDK(KP707106781), VADD(T2N, T2S));
 
1104
                    T37 = VSUB(T35, T36);
 
1105
                    T4h = VADD(T35, T36);
 
1106
                    T7l = VADD(T6h, T6i);
 
1107
                    T7m = VADD(T6d, T6e);
 
1108
                    T7n = VSUB(T7l, T7m);
 
1109
                    T81 = VADD(T7l, T7m);
 
1110
               }
 
1111
          }
 
1112
          {
 
1113
               V T1g, T1i, T5K, T1m, T1o, T5J, T18, T5G, T1d, T5H, T5I, T5L;
 
1114
               {
 
1115
                    V T1f, T1h, T1l, T1n;
 
1116
                    T1f = LD(&(x[WS(ios, 14)]), dist, &(x[0]));
 
1117
                    T1g = BYTW(&(W[TWVL * 26]), T1f);
 
1118
                    T1h = LD(&(x[WS(ios, 46)]), dist, &(x[0]));
 
1119
                    T1i = BYTW(&(W[TWVL * 90]), T1h);
 
1120
                    T5K = VADD(T1g, T1i);
 
1121
                    T1l = LD(&(x[WS(ios, 62)]), dist, &(x[0]));
 
1122
                    T1m = BYTW(&(W[TWVL * 122]), T1l);
 
1123
                    T1n = LD(&(x[WS(ios, 30)]), dist, &(x[0]));
 
1124
                    T1o = BYTW(&(W[TWVL * 58]), T1n);
 
1125
                    T5J = VADD(T1m, T1o);
 
1126
               }
 
1127
               {
 
1128
                    V T15, T17, T14, T16;
 
1129
                    T14 = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
 
1130
                    T15 = BYTW(&(W[TWVL * 10]), T14);
 
1131
                    T16 = LD(&(x[WS(ios, 38)]), dist, &(x[0]));
 
1132
                    T17 = BYTW(&(W[TWVL * 74]), T16);
 
1133
                    T18 = VSUB(T15, T17);
 
1134
                    T5G = VADD(T15, T17);
 
1135
               }
 
1136
               {
 
1137
                    V T1a, T1c, T19, T1b;
 
1138
                    T19 = LD(&(x[WS(ios, 54)]), dist, &(x[0]));
 
1139
                    T1a = BYTW(&(W[TWVL * 106]), T19);
 
1140
                    T1b = LD(&(x[WS(ios, 22)]), dist, &(x[0]));
 
1141
                    T1c = BYTW(&(W[TWVL * 42]), T1b);
 
1142
                    T1d = VSUB(T1a, T1c);
 
1143
                    T5H = VADD(T1a, T1c);
 
1144
               }
 
1145
               T7w = VADD(T5J, T5K);
 
1146
               T7x = VADD(T5G, T5H);
 
1147
               T7y = VSUB(T7w, T7x);
 
1148
               T5I = VSUB(T5G, T5H);
 
1149
               T5L = VSUB(T5J, T5K);
 
1150
               T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
 
1151
               T6q = VFMA(LDK(KP923879532), T5L, VMUL(LDK(KP382683432), T5I));
 
1152
               {
 
1153
                    V T1e, T1j, T1p, T1q;
 
1154
                    T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
 
1155
                    T1j = VSUB(T1g, T1i);
 
1156
                    T1k = VSUB(T1e, T1j);
 
1157
                    T4s = VADD(T1j, T1e);
 
1158
                    T1p = VSUB(T1m, T1o);
 
1159
                    T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
 
1160
                    T1r = VSUB(T1p, T1q);
 
1161
                    T4t = VADD(T1p, T1q);
 
1162
               }
 
1163
          }
 
1164
          {
 
1165
               V TR, TT, T5A, TX, TZ, T5z, TJ, T5C, TO, T5D, T5B, T5E;
 
1166
               {
 
1167
                    V TQ, TS, TW, TY;
 
1168
                    TQ = LD(&(x[WS(ios, 18)]), dist, &(x[0]));
 
1169
                    TR = BYTW(&(W[TWVL * 34]), TQ);
 
1170
                    TS = LD(&(x[WS(ios, 50)]), dist, &(x[0]));
 
1171
                    TT = BYTW(&(W[TWVL * 98]), TS);
 
1172
                    T5A = VADD(TR, TT);
 
1173
                    TW = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
 
1174
                    TX = BYTW(&(W[TWVL * 2]), TW);
 
1175
                    TY = LD(&(x[WS(ios, 34)]), dist, &(x[0]));
 
1176
                    TZ = BYTW(&(W[TWVL * 66]), TY);
 
1177
                    T5z = VADD(TX, TZ);
 
1178
               }
 
1179
               {
 
1180
                    V TG, TI, TF, TH;
 
1181
                    TF = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
 
1182
                    TG = BYTW(&(W[TWVL * 18]), TF);
 
1183
                    TH = LD(&(x[WS(ios, 42)]), dist, &(x[0]));
 
1184
                    TI = BYTW(&(W[TWVL * 82]), TH);
 
1185
                    TJ = VSUB(TG, TI);
 
1186
                    T5C = VADD(TG, TI);
 
1187
               }
 
1188
               {
 
1189
                    V TL, TN, TK, TM;
 
1190
                    TK = LD(&(x[WS(ios, 58)]), dist, &(x[0]));
 
1191
                    TL = BYTW(&(W[TWVL * 114]), TK);
 
1192
                    TM = LD(&(x[WS(ios, 26)]), dist, &(x[0]));
 
1193
                    TN = BYTW(&(W[TWVL * 50]), TM);
 
1194
                    TO = VSUB(TL, TN);
 
1195
                    T5D = VADD(TL, TN);
 
1196
               }
 
1197
               T7t = VADD(T5z, T5A);
 
1198
               T7u = VADD(T5C, T5D);
 
1199
               T7v = VSUB(T7t, T7u);
 
1200
               T5B = VSUB(T5z, T5A);
 
1201
               T5E = VSUB(T5C, T5D);
 
1202
               T5F = VFMA(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
 
1203
               T6p = VFNMS(LDK(KP382683432), T5E, VMUL(LDK(KP923879532), T5B));
 
1204
               {
 
1205
                    V TP, TU, T10, T11;
 
1206
                    TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
 
1207
                    TU = VSUB(TR, TT);
 
1208
                    TV = VSUB(TP, TU);
 
1209
                    T4p = VADD(TU, TP);
 
1210
                    T10 = VSUB(TX, TZ);
 
1211
                    T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
 
1212
                    T12 = VSUB(T10, T11);
 
1213
                    T4q = VADD(T10, T11);
 
1214
               }
 
1215
          }
 
1216
          {
 
1217
               V Tl, T5r, TB, T5u, Tq, T5s, Tw, T5v, Tr, TC;
 
1218
               {
 
1219
                    V Ti, Tk, Th, Tj;
 
1220
                    Th = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
 
1221
                    Ti = BYTW(&(W[TWVL * 6]), Th);
 
1222
                    Tj = LD(&(x[WS(ios, 36)]), dist, &(x[0]));
 
1223
                    Tk = BYTW(&(W[TWVL * 70]), Tj);
 
1224
                    Tl = VSUB(Ti, Tk);
 
1225
                    T5r = VADD(Ti, Tk);
 
1226
               }
 
1227
               {
 
1228
                    V Ty, TA, Tx, Tz;
 
1229
                    Tx = LD(&(x[WS(ios, 60)]), dist, &(x[0]));
 
1230
                    Ty = BYTW(&(W[TWVL * 118]), Tx);
 
1231
                    Tz = LD(&(x[WS(ios, 28)]), dist, &(x[0]));
 
1232
                    TA = BYTW(&(W[TWVL * 54]), Tz);
 
1233
                    TB = VSUB(Ty, TA);
 
1234
                    T5u = VADD(Ty, TA);
 
1235
               }
 
1236
               {
 
1237
                    V Tn, Tp, Tm, To;
 
1238
                    Tm = LD(&(x[WS(ios, 20)]), dist, &(x[0]));
 
1239
                    Tn = BYTW(&(W[TWVL * 38]), Tm);
 
1240
                    To = LD(&(x[WS(ios, 52)]), dist, &(x[0]));
 
1241
                    Tp = BYTW(&(W[TWVL * 102]), To);
 
1242
                    Tq = VSUB(Tn, Tp);
 
1243
                    T5s = VADD(Tn, Tp);
 
1244
               }
 
1245
               {
 
1246
                    V Tt, Tv, Ts, Tu;
 
1247
                    Ts = LD(&(x[WS(ios, 12)]), dist, &(x[0]));
 
1248
                    Tt = BYTW(&(W[TWVL * 22]), Ts);
 
1249
                    Tu = LD(&(x[WS(ios, 44)]), dist, &(x[0]));
 
1250
                    Tv = BYTW(&(W[TWVL * 86]), Tu);
 
1251
                    Tw = VSUB(Tt, Tv);
 
1252
                    T5v = VADD(Tt, Tv);
 
1253
               }
 
1254
               T7A = VADD(T5r, T5s);
 
1255
               T7B = VADD(T5u, T5v);
 
1256
               Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
 
1257
               TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
 
1258
               TD = VSUB(Tr, TC);
 
1259
               T4x = VADD(Tr, TC);
 
1260
               {
 
1261
                    V T3i, T3j, T5t, T5w;
 
1262
                    T3i = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
 
1263
                    T3j = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
 
1264
                    T3k = VSUB(T3i, T3j);
 
1265
                    T4C = VADD(T3i, T3j);
 
1266
                    T5t = VSUB(T5r, T5s);
 
1267
                    T5w = VSUB(T5u, T5v);
 
1268
                    T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
 
1269
                    T6s = VMUL(LDK(KP707106781), VSUB(T5t, T5w));
 
1270
               }
 
1271
          }
 
1272
          {
 
1273
               V T1z, T5P, T1P, T5T, T1E, T5Q, T1K, T5S;
 
1274
               {
 
1275
                    V T1w, T1y, T1v, T1x;
 
1276
                    T1v = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
 
1277
                    T1w = BYTW(&(W[TWVL * 8]), T1v);
 
1278
                    T1x = LD(&(x[WS(ios, 37)]), dist, &(x[WS(ios, 1)]));
 
1279
                    T1y = BYTW(&(W[TWVL * 72]), T1x);
 
1280
                    T1z = VSUB(T1w, T1y);
 
1281
                    T5P = VADD(T1w, T1y);
 
1282
               }
 
1283
               {
 
1284
                    V T1M, T1O, T1L, T1N;
 
1285
                    T1L = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)]));
 
1286
                    T1M = BYTW(&(W[TWVL * 24]), T1L);
 
1287
                    T1N = LD(&(x[WS(ios, 45)]), dist, &(x[WS(ios, 1)]));
 
1288
                    T1O = BYTW(&(W[TWVL * 88]), T1N);
 
1289
                    T1P = VSUB(T1M, T1O);
 
1290
                    T5T = VADD(T1M, T1O);
 
1291
               }
 
1292
               {
 
1293
                    V T1B, T1D, T1A, T1C;
 
1294
                    T1A = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)]));
 
1295
                    T1B = BYTW(&(W[TWVL * 40]), T1A);
 
1296
                    T1C = LD(&(x[WS(ios, 53)]), dist, &(x[WS(ios, 1)]));
 
1297
                    T1D = BYTW(&(W[TWVL * 104]), T1C);
 
1298
                    T1E = VSUB(T1B, T1D);
 
1299
                    T5Q = VADD(T1B, T1D);
 
1300
               }
 
1301
               {
 
1302
                    V T1H, T1J, T1G, T1I;
 
1303
                    T1G = LD(&(x[WS(ios, 61)]), dist, &(x[WS(ios, 1)]));
 
1304
                    T1H = BYTW(&(W[TWVL * 120]), T1G);
 
1305
                    T1I = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)]));
 
1306
                    T1J = BYTW(&(W[TWVL * 56]), T1I);
 
1307
                    T1K = VSUB(T1H, T1J);
 
1308
                    T5S = VADD(T1H, T1J);
 
1309
               }
 
1310
               {
 
1311
                    V T1F, T1Q, T7h, T7i;
 
1312
                    T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1z));
 
1313
                    T1Q = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1P));
 
1314
                    T1R = VSUB(T1F, T1Q);
 
1315
                    T4b = VADD(T1F, T1Q);
 
1316
                    T7h = VADD(T5P, T5Q);
 
1317
                    T7i = VADD(T5S, T5T);
 
1318
                    T7j = VSUB(T7h, T7i);
 
1319
                    T7Z = VADD(T7h, T7i);
 
1320
               }
 
1321
               {
 
1322
                    V T2h, T2i, T5R, T5U;
 
1323
                    T2h = VFMA(LDK(KP382683432), T1z, VMUL(LDK(KP923879532), T1E));
 
1324
                    T2i = VFNMS(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
 
1325
                    T2j = VSUB(T2h, T2i);
 
1326
                    T4e = VADD(T2h, T2i);
 
1327
                    T5R = VSUB(T5P, T5Q);
 
1328
                    T5U = VSUB(T5S, T5T);
 
1329
                    T5V = VMUL(LDK(KP707106781), VSUB(T5R, T5U));
 
1330
                    T63 = VMUL(LDK(KP707106781), VADD(T5R, T5U));
 
1331
               }
 
1332
          }
 
1333
          {
 
1334
               V T2q, T66, T2G, T6a, T2v, T67, T2B, T69;
 
1335
               {
 
1336
                    V T2n, T2p, T2m, T2o;
 
1337
                    T2m = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
 
1338
                    T2n = BYTW(&(W[TWVL * 4]), T2m);
 
1339
                    T2o = LD(&(x[WS(ios, 35)]), dist, &(x[WS(ios, 1)]));
 
1340
                    T2p = BYTW(&(W[TWVL * 68]), T2o);
 
1341
                    T2q = VSUB(T2n, T2p);
 
1342
                    T66 = VADD(T2n, T2p);
 
1343
               }
 
1344
               {
 
1345
                    V T2D, T2F, T2C, T2E;
 
1346
                    T2C = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
 
1347
                    T2D = BYTW(&(W[TWVL * 20]), T2C);
 
1348
                    T2E = LD(&(x[WS(ios, 43)]), dist, &(x[WS(ios, 1)]));
 
1349
                    T2F = BYTW(&(W[TWVL * 84]), T2E);
 
1350
                    T2G = VSUB(T2D, T2F);
 
1351
                    T6a = VADD(T2D, T2F);
 
1352
               }
 
1353
               {
 
1354
                    V T2s, T2u, T2r, T2t;
 
1355
                    T2r = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)]));
 
1356
                    T2s = BYTW(&(W[TWVL * 36]), T2r);
 
1357
                    T2t = LD(&(x[WS(ios, 51)]), dist, &(x[WS(ios, 1)]));
 
1358
                    T2u = BYTW(&(W[TWVL * 100]), T2t);
 
1359
                    T2v = VSUB(T2s, T2u);
 
1360
                    T67 = VADD(T2s, T2u);
 
1361
               }
 
1362
               {
 
1363
                    V T2y, T2A, T2x, T2z;
 
1364
                    T2x = LD(&(x[WS(ios, 59)]), dist, &(x[WS(ios, 1)]));
 
1365
                    T2y = BYTW(&(W[TWVL * 116]), T2x);
 
1366
                    T2z = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)]));
 
1367
                    T2A = BYTW(&(W[TWVL * 52]), T2z);
 
1368
                    T2B = VSUB(T2y, T2A);
 
1369
                    T69 = VADD(T2y, T2A);
 
1370
               }
 
1371
               {
 
1372
                    V T2w, T2H, T7o, T7p;
 
1373
                    T2w = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2q));
 
1374
                    T2H = VFMA(LDK(KP923879532), T2B, VMUL(LDK(KP382683432), T2G));
 
1375
                    T2I = VSUB(T2w, T2H);
 
1376
                    T4i = VADD(T2w, T2H);
 
1377
                    T7o = VADD(T66, T67);
 
1378
                    T7p = VADD(T69, T6a);
 
1379
                    T7q = VSUB(T7o, T7p);
 
1380
                    T82 = VADD(T7o, T7p);
 
1381
               }
 
1382
               {
 
1383
                    V T38, T39, T68, T6b;
 
1384
                    T38 = VFMA(LDK(KP382683432), T2q, VMUL(LDK(KP923879532), T2v));
 
1385
                    T39 = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2G));
 
1386
                    T3a = VSUB(T38, T39);
 
1387
                    T4l = VADD(T38, T39);
 
1388
                    T68 = VSUB(T66, T67);
 
1389
                    T6b = VSUB(T69, T6a);
 
1390
                    T6c = VMUL(LDK(KP707106781), VSUB(T68, T6b));
 
1391
                    T6k = VMUL(LDK(KP707106781), VADD(T68, T6b));
 
1392
               }
 
1393
          }
 
1394
          {
 
1395
               V T7s, T7R, T7M, T7U, T7D, T7T, T7J, T7Q;
 
1396
               {
 
1397
                    V T7k, T7r, T7K, T7L;
 
1398
                    T7k = VFNMS(LDK(KP382683432), T7j, VMUL(LDK(KP923879532), T7g));
 
1399
                    T7r = VFMA(LDK(KP923879532), T7n, VMUL(LDK(KP382683432), T7q));
 
1400
                    T7s = VSUB(T7k, T7r);
 
1401
                    T7R = VADD(T7k, T7r);
 
1402
                    T7K = VFMA(LDK(KP382683432), T7g, VMUL(LDK(KP923879532), T7j));
 
1403
                    T7L = VFNMS(LDK(KP382683432), T7n, VMUL(LDK(KP923879532), T7q));
 
1404
                    T7M = VSUB(T7K, T7L);
 
1405
                    T7U = VADD(T7K, T7L);
 
1406
               }
 
1407
               {
 
1408
                    V T7z, T7C, T7H, T7I;
 
1409
                    T7z = VMUL(LDK(KP707106781), VSUB(T7v, T7y));
 
1410
                    T7C = VSUB(T7A, T7B);
 
1411
                    T7D = VSUB(T7z, T7C);
 
1412
                    T7T = VADD(T7C, T7z);
 
1413
                    T7H = VSUB(T7F, T7G);
 
1414
                    T7I = VMUL(LDK(KP707106781), VADD(T7v, T7y));
 
1415
                    T7J = VSUB(T7H, T7I);
 
1416
                    T7Q = VADD(T7H, T7I);
 
1417
               }
 
1418
               {
 
1419
                    V T7E, T7N, T7W, T7X;
 
1420
                    T7E = VBYI(VSUB(T7s, T7D));
 
1421
                    T7N = VSUB(T7J, T7M);
 
1422
                    ST(&(x[WS(ios, 20)]), VADD(T7E, T7N), dist, &(x[0]));
 
1423
                    ST(&(x[WS(ios, 44)]), VSUB(T7N, T7E), dist, &(x[0]));
 
1424
                    T7W = VSUB(T7Q, T7R);
 
1425
                    T7X = VBYI(VSUB(T7U, T7T));
 
1426
                    ST(&(x[WS(ios, 36)]), VSUB(T7W, T7X), dist, &(x[0]));
 
1427
                    ST(&(x[WS(ios, 28)]), VADD(T7W, T7X), dist, &(x[0]));
 
1428
               }
 
1429
               {
 
1430
                    V T7O, T7P, T7S, T7V;
 
1431
                    T7O = VBYI(VADD(T7D, T7s));
 
1432
                    T7P = VADD(T7J, T7M);
 
1433
                    ST(&(x[WS(ios, 12)]), VADD(T7O, T7P), dist, &(x[0]));
 
1434
                    ST(&(x[WS(ios, 52)]), VSUB(T7P, T7O), dist, &(x[0]));
 
1435
                    T7S = VADD(T7Q, T7R);
 
1436
                    T7V = VBYI(VADD(T7T, T7U));
 
1437
                    ST(&(x[WS(ios, 60)]), VSUB(T7S, T7V), dist, &(x[0]));
 
1438
                    ST(&(x[WS(ios, 4)]), VADD(T7S, T7V), dist, &(x[0]));
 
1439
               }
 
1440
          }
 
1441
          {
 
1442
               V T84, T8c, T8l, T8n, T87, T8h, T8b, T8g, T8i, T8m;
 
1443
               {
 
1444
                    V T80, T83, T8j, T8k;
 
1445
                    T80 = VSUB(T7Y, T7Z);
 
1446
                    T83 = VSUB(T81, T82);
 
1447
                    T84 = VMUL(LDK(KP707106781), VSUB(T80, T83));
 
1448
                    T8c = VMUL(LDK(KP707106781), VADD(T80, T83));
 
1449
                    T8j = VADD(T7Y, T7Z);
 
1450
                    T8k = VADD(T81, T82);
 
1451
                    T8l = VBYI(VSUB(T8j, T8k));
 
1452
                    T8n = VADD(T8j, T8k);
 
1453
               }
 
1454
               {
 
1455
                    V T85, T86, T89, T8a;
 
1456
                    T85 = VADD(T7t, T7u);
 
1457
                    T86 = VADD(T7w, T7x);
 
1458
                    T87 = VSUB(T85, T86);
 
1459
                    T8h = VADD(T85, T86);
 
1460
                    T89 = VADD(T7F, T7G);
 
1461
                    T8a = VADD(T7A, T7B);
 
1462
                    T8b = VSUB(T89, T8a);
 
1463
                    T8g = VADD(T89, T8a);
 
1464
               }
 
1465
               T8i = VSUB(T8g, T8h);
 
1466
               ST(&(x[WS(ios, 48)]), VSUB(T8i, T8l), dist, &(x[0]));
 
1467
               ST(&(x[WS(ios, 16)]), VADD(T8i, T8l), dist, &(x[0]));
 
1468
               T8m = VADD(T8g, T8h);
 
1469
               ST(&(x[WS(ios, 32)]), VSUB(T8m, T8n), dist, &(x[0]));
 
1470
               ST(&(x[0]), VADD(T8m, T8n), dist, &(x[0]));
 
1471
               {
 
1472
                    V T88, T8d, T8e, T8f;
 
1473
                    T88 = VBYI(VSUB(T84, T87));
 
1474
                    T8d = VSUB(T8b, T8c);
 
1475
                    ST(&(x[WS(ios, 24)]), VADD(T88, T8d), dist, &(x[0]));
 
1476
                    ST(&(x[WS(ios, 40)]), VSUB(T8d, T88), dist, &(x[0]));
 
1477
                    T8e = VBYI(VADD(T87, T84));
 
1478
                    T8f = VADD(T8b, T8c);
 
1479
                    ST(&(x[WS(ios, 8)]), VADD(T8e, T8f), dist, &(x[0]));
 
1480
                    ST(&(x[WS(ios, 56)]), VSUB(T8f, T8e), dist, &(x[0]));
 
1481
               }
 
1482
          }
 
1483
          {
 
1484
               V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
 
1485
               {
 
1486
                    V T5y, T5N, T6r, T6w;
 
1487
                    T5y = VSUB(T5q, T5x);
 
1488
                    T5N = VSUB(T5F, T5M);
 
1489
                    T5O = VSUB(T5y, T5N);
 
1490
                    T6H = VADD(T5y, T5N);
 
1491
                    T6r = VSUB(T6p, T6q);
 
1492
                    T6w = VSUB(T6s, T6v);
 
1493
                    T6x = VSUB(T6r, T6w);
 
1494
                    T6F = VADD(T6w, T6r);
 
1495
                    {
 
1496
                         V T65, T6y, T6m, T6z;
 
1497
                         {
 
1498
                              V T5Z, T64, T6g, T6l;
 
1499
                              T5Z = VSUB(T5V, T5Y);
 
1500
                              T64 = VSUB(T62, T63);
 
1501
                              T65 = VFMA(LDK(KP831469612), T5Z, VMUL(LDK(KP555570233), T64));
 
1502
                              T6y = VFNMS(LDK(KP555570233), T5Z, VMUL(LDK(KP831469612), T64));
 
1503
                              T6g = VSUB(T6c, T6f);
 
1504
                              T6l = VSUB(T6j, T6k);
 
1505
                              T6m = VFNMS(LDK(KP555570233), T6l, VMUL(LDK(KP831469612), T6g));
 
1506
                              T6z = VFMA(LDK(KP555570233), T6g, VMUL(LDK(KP831469612), T6l));
 
1507
                         }
 
1508
                         T6n = VSUB(T65, T6m);
 
1509
                         T6I = VADD(T6y, T6z);
 
1510
                         T6A = VSUB(T6y, T6z);
 
1511
                         T6E = VADD(T65, T6m);
 
1512
                    }
 
1513
               }
 
1514
               {
 
1515
                    V T6o, T6B, T6K, T6L;
 
1516
                    T6o = VADD(T5O, T6n);
 
1517
                    T6B = VBYI(VADD(T6x, T6A));
 
1518
                    ST(&(x[WS(ios, 54)]), VSUB(T6o, T6B), dist, &(x[0]));
 
1519
                    ST(&(x[WS(ios, 10)]), VADD(T6o, T6B), dist, &(x[0]));
 
1520
                    T6K = VBYI(VADD(T6F, T6E));
 
1521
                    T6L = VADD(T6H, T6I);
 
1522
                    ST(&(x[WS(ios, 6)]), VADD(T6K, T6L), dist, &(x[0]));
 
1523
                    ST(&(x[WS(ios, 58)]), VSUB(T6L, T6K), dist, &(x[0]));
 
1524
               }
 
1525
               {
 
1526
                    V T6C, T6D, T6G, T6J;
 
1527
                    T6C = VSUB(T5O, T6n);
 
1528
                    T6D = VBYI(VSUB(T6A, T6x));
 
1529
                    ST(&(x[WS(ios, 42)]), VSUB(T6C, T6D), dist, &(x[0]));
 
1530
                    ST(&(x[WS(ios, 22)]), VADD(T6C, T6D), dist, &(x[0]));
 
1531
                    T6G = VBYI(VSUB(T6E, T6F));
 
1532
                    T6J = VSUB(T6H, T6I);
 
1533
                    ST(&(x[WS(ios, 26)]), VADD(T6G, T6J), dist, &(x[0]));
 
1534
                    ST(&(x[WS(ios, 38)]), VSUB(T6J, T6G), dist, &(x[0]));
 
1535
               }
 
1536
          }
 
1537
          {
 
1538
               V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
 
1539
               {
 
1540
                    V T6M, T6N, T6X, T6Y;
 
1541
                    T6M = VADD(T5q, T5x);
 
1542
                    T6N = VADD(T6p, T6q);
 
1543
                    T6O = VSUB(T6M, T6N);
 
1544
                    T79 = VADD(T6M, T6N);
 
1545
                    T6X = VADD(T5F, T5M);
 
1546
                    T6Y = VADD(T6v, T6s);
 
1547
                    T6Z = VSUB(T6X, T6Y);
 
1548
                    T77 = VADD(T6Y, T6X);
 
1549
                    {
 
1550
                         V T6R, T70, T6U, T71;
 
1551
                         {
 
1552
                              V T6P, T6Q, T6S, T6T;
 
1553
                              T6P = VADD(T5Y, T5V);
 
1554
                              T6Q = VADD(T62, T63);
 
1555
                              T6R = VFMA(LDK(KP980785280), T6P, VMUL(LDK(KP195090322), T6Q));
 
1556
                              T70 = VFNMS(LDK(KP195090322), T6P, VMUL(LDK(KP980785280), T6Q));
 
1557
                              T6S = VADD(T6f, T6c);
 
1558
                              T6T = VADD(T6j, T6k);
 
1559
                              T6U = VFNMS(LDK(KP195090322), T6T, VMUL(LDK(KP980785280), T6S));
 
1560
                              T71 = VFMA(LDK(KP195090322), T6S, VMUL(LDK(KP980785280), T6T));
 
1561
                         }
 
1562
                         T6V = VSUB(T6R, T6U);
 
1563
                         T7a = VADD(T70, T71);
 
1564
                         T72 = VSUB(T70, T71);
 
1565
                         T76 = VADD(T6R, T6U);
 
1566
                    }
 
1567
               }
 
1568
               {
 
1569
                    V T6W, T73, T7c, T7d;
 
1570
                    T6W = VADD(T6O, T6V);
 
1571
                    T73 = VBYI(VADD(T6Z, T72));
 
1572
                    ST(&(x[WS(ios, 50)]), VSUB(T6W, T73), dist, &(x[0]));
 
1573
                    ST(&(x[WS(ios, 14)]), VADD(T6W, T73), dist, &(x[0]));
 
1574
                    T7c = VBYI(VADD(T77, T76));
 
1575
                    T7d = VADD(T79, T7a);
 
1576
                    ST(&(x[WS(ios, 2)]), VADD(T7c, T7d), dist, &(x[0]));
 
1577
                    ST(&(x[WS(ios, 62)]), VSUB(T7d, T7c), dist, &(x[0]));
 
1578
               }
 
1579
               {
 
1580
                    V T74, T75, T78, T7b;
 
1581
                    T74 = VSUB(T6O, T6V);
 
1582
                    T75 = VBYI(VSUB(T72, T6Z));
 
1583
                    ST(&(x[WS(ios, 46)]), VSUB(T74, T75), dist, &(x[0]));
 
1584
                    ST(&(x[WS(ios, 18)]), VADD(T74, T75), dist, &(x[0]));
 
1585
                    T78 = VBYI(VSUB(T76, T77));
 
1586
                    T7b = VSUB(T79, T7a);
 
1587
                    ST(&(x[WS(ios, 30)]), VADD(T78, T7b), dist, &(x[0]));
 
1588
                    ST(&(x[WS(ios, 34)]), VSUB(T7b, T78), dist, &(x[0]));
 
1589
               }
 
1590
          }
 
1591
          {
 
1592
               V T4z, T5g, T4R, T59, T4H, T5j, T4O, T55, T4o, T4S, T4K, T4P, T52, T5k, T5c;
 
1593
               V T5h;
 
1594
               {
 
1595
                    V T4y, T57, T4v, T58, T4r, T4u;
 
1596
                    T4y = VADD(T4w, T4x);
 
1597
                    T57 = VSUB(T4B, T4C);
 
1598
                    T4r = VFMA(LDK(KP980785280), T4p, VMUL(LDK(KP195090322), T4q));
 
1599
                    T4u = VFNMS(LDK(KP195090322), T4t, VMUL(LDK(KP980785280), T4s));
 
1600
                    T4v = VADD(T4r, T4u);
 
1601
                    T58 = VSUB(T4r, T4u);
 
1602
                    T4z = VSUB(T4v, T4y);
 
1603
                    T5g = VADD(T57, T58);
 
1604
                    T4R = VADD(T4y, T4v);
 
1605
                    T59 = VSUB(T57, T58);
 
1606
               }
 
1607
               {
 
1608
                    V T4D, T54, T4G, T53, T4E, T4F;
 
1609
                    T4D = VADD(T4B, T4C);
 
1610
                    T54 = VSUB(T4x, T4w);
 
1611
                    T4E = VFNMS(LDK(KP195090322), T4p, VMUL(LDK(KP980785280), T4q));
 
1612
                    T4F = VFMA(LDK(KP195090322), T4s, VMUL(LDK(KP980785280), T4t));
 
1613
                    T4G = VADD(T4E, T4F);
 
1614
                    T53 = VSUB(T4E, T4F);
 
1615
                    T4H = VSUB(T4D, T4G);
 
1616
                    T5j = VADD(T54, T53);
 
1617
                    T4O = VADD(T4D, T4G);
 
1618
                    T55 = VSUB(T53, T54);
 
1619
               }
 
1620
               {
 
1621
                    V T4g, T4I, T4n, T4J;
 
1622
                    {
 
1623
                         V T4c, T4f, T4j, T4m;
 
1624
                         T4c = VADD(T4a, T4b);
 
1625
                         T4f = VADD(T4d, T4e);
 
1626
                         T4g = VFNMS(LDK(KP098017140), T4f, VMUL(LDK(KP995184726), T4c));
 
1627
                         T4I = VFMA(LDK(KP098017140), T4c, VMUL(LDK(KP995184726), T4f));
 
1628
                         T4j = VADD(T4h, T4i);
 
1629
                         T4m = VADD(T4k, T4l);
 
1630
                         T4n = VFMA(LDK(KP995184726), T4j, VMUL(LDK(KP098017140), T4m));
 
1631
                         T4J = VFNMS(LDK(KP098017140), T4j, VMUL(LDK(KP995184726), T4m));
 
1632
                    }
 
1633
                    T4o = VSUB(T4g, T4n);
 
1634
                    T4S = VADD(T4I, T4J);
 
1635
                    T4K = VSUB(T4I, T4J);
 
1636
                    T4P = VADD(T4g, T4n);
 
1637
               }
 
1638
               {
 
1639
                    V T4Y, T5a, T51, T5b;
 
1640
                    {
 
1641
                         V T4W, T4X, T4Z, T50;
 
1642
                         T4W = VSUB(T4a, T4b);
 
1643
                         T4X = VSUB(T4e, T4d);
 
1644
                         T4Y = VFNMS(LDK(KP634393284), T4X, VMUL(LDK(KP773010453), T4W));
 
1645
                         T5a = VFMA(LDK(KP634393284), T4W, VMUL(LDK(KP773010453), T4X));
 
1646
                         T4Z = VSUB(T4h, T4i);
 
1647
                         T50 = VSUB(T4l, T4k);
 
1648
                         T51 = VFMA(LDK(KP773010453), T4Z, VMUL(LDK(KP634393284), T50));
 
1649
                         T5b = VFNMS(LDK(KP634393284), T4Z, VMUL(LDK(KP773010453), T50));
 
1650
                    }
 
1651
                    T52 = VSUB(T4Y, T51);
 
1652
                    T5k = VADD(T5a, T5b);
 
1653
                    T5c = VSUB(T5a, T5b);
 
1654
                    T5h = VADD(T4Y, T51);
 
1655
               }
 
1656
               {
 
1657
                    V T4A, T4L, T5i, T5l;
 
1658
                    T4A = VBYI(VSUB(T4o, T4z));
 
1659
                    T4L = VSUB(T4H, T4K);
 
1660
                    ST(&(x[WS(ios, 17)]), VADD(T4A, T4L), dist, &(x[WS(ios, 1)]));
 
1661
                    ST(&(x[WS(ios, 47)]), VSUB(T4L, T4A), dist, &(x[WS(ios, 1)]));
 
1662
                    T5i = VADD(T5g, T5h);
 
1663
                    T5l = VBYI(VADD(T5j, T5k));
 
1664
                    ST(&(x[WS(ios, 57)]), VSUB(T5i, T5l), dist, &(x[WS(ios, 1)]));
 
1665
                    ST(&(x[WS(ios, 7)]), VADD(T5i, T5l), dist, &(x[WS(ios, 1)]));
 
1666
               }
 
1667
               {
 
1668
                    V T5m, T5n, T4M, T4N;
 
1669
                    T5m = VSUB(T5g, T5h);
 
1670
                    T5n = VBYI(VSUB(T5k, T5j));
 
1671
                    ST(&(x[WS(ios, 39)]), VSUB(T5m, T5n), dist, &(x[WS(ios, 1)]));
 
1672
                    ST(&(x[WS(ios, 25)]), VADD(T5m, T5n), dist, &(x[WS(ios, 1)]));
 
1673
                    T4M = VBYI(VADD(T4z, T4o));
 
1674
                    T4N = VADD(T4H, T4K);
 
1675
                    ST(&(x[WS(ios, 15)]), VADD(T4M, T4N), dist, &(x[WS(ios, 1)]));
 
1676
                    ST(&(x[WS(ios, 49)]), VSUB(T4N, T4M), dist, &(x[WS(ios, 1)]));
 
1677
               }
 
1678
               {
 
1679
                    V T4Q, T4T, T56, T5d;
 
1680
                    T4Q = VADD(T4O, T4P);
 
1681
                    T4T = VBYI(VADD(T4R, T4S));
 
1682
                    ST(&(x[WS(ios, 63)]), VSUB(T4Q, T4T), dist, &(x[WS(ios, 1)]));
 
1683
                    ST(&(x[WS(ios, 1)]), VADD(T4Q, T4T), dist, &(x[WS(ios, 1)]));
 
1684
                    T56 = VBYI(VSUB(T52, T55));
 
1685
                    T5d = VSUB(T59, T5c);
 
1686
                    ST(&(x[WS(ios, 23)]), VADD(T56, T5d), dist, &(x[WS(ios, 1)]));
 
1687
                    ST(&(x[WS(ios, 41)]), VSUB(T5d, T56), dist, &(x[WS(ios, 1)]));
 
1688
               }
 
1689
               {
 
1690
                    V T5e, T5f, T4U, T4V;
 
1691
                    T5e = VBYI(VADD(T55, T52));
 
1692
                    T5f = VADD(T59, T5c);
 
1693
                    ST(&(x[WS(ios, 9)]), VADD(T5e, T5f), dist, &(x[WS(ios, 1)]));
 
1694
                    ST(&(x[WS(ios, 55)]), VSUB(T5f, T5e), dist, &(x[WS(ios, 1)]));
 
1695
                    T4U = VSUB(T4O, T4P);
 
1696
                    T4V = VBYI(VSUB(T4S, T4R));
 
1697
                    ST(&(x[WS(ios, 33)]), VSUB(T4U, T4V), dist, &(x[WS(ios, 1)]));
 
1698
                    ST(&(x[WS(ios, 31)]), VADD(T4U, T4V), dist, &(x[WS(ios, 1)]));
 
1699
               }
 
1700
          }
 
1701
          {
 
1702
               V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
 
1703
               V T42;
 
1704
               {
 
1705
                    V TE, T3U, T1t, T3T, T13, T1s;
 
1706
                    TE = VSUB(Tg, TD);
 
1707
                    T3U = VADD(T3r, T3k);
 
1708
                    T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
 
1709
                    T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
 
1710
                    T1t = VSUB(T13, T1s);
 
1711
                    T3T = VADD(T13, T1s);
 
1712
                    T1u = VSUB(TE, T1t);
 
1713
                    T43 = VADD(T3U, T3T);
 
1714
                    T3D = VADD(TE, T1t);
 
1715
                    T3V = VSUB(T3T, T3U);
 
1716
               }
 
1717
               {
 
1718
                    V T3s, T3I, T3h, T3J, T3f, T3g;
 
1719
                    T3s = VSUB(T3k, T3r);
 
1720
                    T3I = VADD(Tg, TD);
 
1721
                    T3f = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
 
1722
                    T3g = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
 
1723
                    T3h = VSUB(T3f, T3g);
 
1724
                    T3J = VADD(T3f, T3g);
 
1725
                    T3t = VSUB(T3h, T3s);
 
1726
                    T45 = VADD(T3I, T3J);
 
1727
                    T3B = VADD(T3s, T3h);
 
1728
                    T3K = VSUB(T3I, T3J);
 
1729
               }
 
1730
               {
 
1731
                    V T2l, T3u, T3c, T3v;
 
1732
                    {
 
1733
                         V T29, T2k, T30, T3b;
 
1734
                         T29 = VSUB(T1R, T28);
 
1735
                         T2k = VSUB(T2g, T2j);
 
1736
                         T2l = VFMA(LDK(KP881921264), T29, VMUL(LDK(KP471396736), T2k));
 
1737
                         T3u = VFNMS(LDK(KP471396736), T29, VMUL(LDK(KP881921264), T2k));
 
1738
                         T30 = VSUB(T2I, T2Z);
 
1739
                         T3b = VSUB(T37, T3a);
 
1740
                         T3c = VFNMS(LDK(KP471396736), T3b, VMUL(LDK(KP881921264), T30));
 
1741
                         T3v = VFMA(LDK(KP471396736), T30, VMUL(LDK(KP881921264), T3b));
 
1742
                    }
 
1743
                    T3d = VSUB(T2l, T3c);
 
1744
                    T3E = VADD(T3u, T3v);
 
1745
                    T3w = VSUB(T3u, T3v);
 
1746
                    T3A = VADD(T2l, T3c);
 
1747
               }
 
1748
               {
 
1749
                    V T3N, T3W, T3Q, T3X;
 
1750
                    {
 
1751
                         V T3L, T3M, T3O, T3P;
 
1752
                         T3L = VADD(T28, T1R);
 
1753
                         T3M = VADD(T2g, T2j);
 
1754
                         T3N = VFMA(LDK(KP956940335), T3L, VMUL(LDK(KP290284677), T3M));
 
1755
                         T3W = VFNMS(LDK(KP290284677), T3L, VMUL(LDK(KP956940335), T3M));
 
1756
                         T3O = VADD(T2Z, T2I);
 
1757
                         T3P = VADD(T37, T3a);
 
1758
                         T3Q = VFNMS(LDK(KP290284677), T3P, VMUL(LDK(KP956940335), T3O));
 
1759
                         T3X = VFMA(LDK(KP290284677), T3O, VMUL(LDK(KP956940335), T3P));
 
1760
                    }
 
1761
                    T3R = VSUB(T3N, T3Q);
 
1762
                    T46 = VADD(T3W, T3X);
 
1763
                    T3Y = VSUB(T3W, T3X);
 
1764
                    T42 = VADD(T3N, T3Q);
 
1765
               }
 
1766
               {
 
1767
                    V T3e, T3x, T44, T47;
 
1768
                    T3e = VADD(T1u, T3d);
 
1769
                    T3x = VBYI(VADD(T3t, T3w));
 
1770
                    ST(&(x[WS(ios, 53)]), VSUB(T3e, T3x), dist, &(x[WS(ios, 1)]));
 
1771
                    ST(&(x[WS(ios, 11)]), VADD(T3e, T3x), dist, &(x[WS(ios, 1)]));
 
1772
                    T44 = VBYI(VSUB(T42, T43));
 
1773
                    T47 = VSUB(T45, T46);
 
1774
                    ST(&(x[WS(ios, 29)]), VADD(T44, T47), dist, &(x[WS(ios, 1)]));
 
1775
                    ST(&(x[WS(ios, 35)]), VSUB(T47, T44), dist, &(x[WS(ios, 1)]));
 
1776
               }
 
1777
               {
 
1778
                    V T48, T49, T3y, T3z;
 
1779
                    T48 = VBYI(VADD(T43, T42));
 
1780
                    T49 = VADD(T45, T46);
 
1781
                    ST(&(x[WS(ios, 3)]), VADD(T48, T49), dist, &(x[WS(ios, 1)]));
 
1782
                    ST(&(x[WS(ios, 61)]), VSUB(T49, T48), dist, &(x[WS(ios, 1)]));
 
1783
                    T3y = VSUB(T1u, T3d);
 
1784
                    T3z = VBYI(VSUB(T3w, T3t));
 
1785
                    ST(&(x[WS(ios, 43)]), VSUB(T3y, T3z), dist, &(x[WS(ios, 1)]));
 
1786
                    ST(&(x[WS(ios, 21)]), VADD(T3y, T3z), dist, &(x[WS(ios, 1)]));
 
1787
               }
 
1788
               {
 
1789
                    V T3C, T3F, T3S, T3Z;
 
1790
                    T3C = VBYI(VSUB(T3A, T3B));
 
1791
                    T3F = VSUB(T3D, T3E);
 
1792
                    ST(&(x[WS(ios, 27)]), VADD(T3C, T3F), dist, &(x[WS(ios, 1)]));
 
1793
                    ST(&(x[WS(ios, 37)]), VSUB(T3F, T3C), dist, &(x[WS(ios, 1)]));
 
1794
                    T3S = VADD(T3K, T3R);
 
1795
                    T3Z = VBYI(VADD(T3V, T3Y));
 
1796
                    ST(&(x[WS(ios, 51)]), VSUB(T3S, T3Z), dist, &(x[WS(ios, 1)]));
 
1797
                    ST(&(x[WS(ios, 13)]), VADD(T3S, T3Z), dist, &(x[WS(ios, 1)]));
 
1798
               }
 
1799
               {
 
1800
                    V T40, T41, T3G, T3H;
 
1801
                    T40 = VSUB(T3K, T3R);
 
1802
                    T41 = VBYI(VSUB(T3Y, T3V));
 
1803
                    ST(&(x[WS(ios, 45)]), VSUB(T40, T41), dist, &(x[WS(ios, 1)]));
 
1804
                    ST(&(x[WS(ios, 19)]), VADD(T40, T41), dist, &(x[WS(ios, 1)]));
 
1805
                    T3G = VBYI(VADD(T3B, T3A));
 
1806
                    T3H = VADD(T3D, T3E);
 
1807
                    ST(&(x[WS(ios, 5)]), VADD(T3G, T3H), dist, &(x[WS(ios, 1)]));
 
1808
                    ST(&(x[WS(ios, 59)]), VSUB(T3H, T3G), dist, &(x[WS(ios, 1)]));
 
1809
               }
 
1810
          }
 
1811
     }
 
1812
     return W;
 
1813
}
 
1814
 
 
1815
static const tw_instr twinstr[] = {
 
1816
     VTW(1),
 
1817
     VTW(2),
 
1818
     VTW(3),
 
1819
     VTW(4),
 
1820
     VTW(5),
 
1821
     VTW(6),
 
1822
     VTW(7),
 
1823
     VTW(8),
 
1824
     VTW(9),
 
1825
     VTW(10),
 
1826
     VTW(11),
 
1827
     VTW(12),
 
1828
     VTW(13),
 
1829
     VTW(14),
 
1830
     VTW(15),
 
1831
     VTW(16),
 
1832
     VTW(17),
 
1833
     VTW(18),
 
1834
     VTW(19),
 
1835
     VTW(20),
 
1836
     VTW(21),
 
1837
     VTW(22),
 
1838
     VTW(23),
 
1839
     VTW(24),
 
1840
     VTW(25),
 
1841
     VTW(26),
 
1842
     VTW(27),
 
1843
     VTW(28),
 
1844
     VTW(29),
 
1845
     VTW(30),
 
1846
     VTW(31),
 
1847
     VTW(32),
 
1848
     VTW(33),
 
1849
     VTW(34),
 
1850
     VTW(35),
 
1851
     VTW(36),
 
1852
     VTW(37),
 
1853
     VTW(38),
 
1854
     VTW(39),
 
1855
     VTW(40),
 
1856
     VTW(41),
 
1857
     VTW(42),
 
1858
     VTW(43),
 
1859
     VTW(44),
 
1860
     VTW(45),
 
1861
     VTW(46),
 
1862
     VTW(47),
 
1863
     VTW(48),
 
1864
     VTW(49),
 
1865
     VTW(50),
 
1866
     VTW(51),
 
1867
     VTW(52),
 
1868
     VTW(53),
 
1869
     VTW(54),
 
1870
     VTW(55),
 
1871
     VTW(56),
 
1872
     VTW(57),
 
1873
     VTW(58),
 
1874
     VTW(59),
 
1875
     VTW(60),
 
1876
     VTW(61),
 
1877
     VTW(62),
 
1878
     VTW(63),
 
1879
     {TW_NEXT, VL, 0}
 
1880
};
 
1881
 
 
1882
/*
 * Descriptor handed to the planner when the t2bv_64 codelet is registered.
 * Fields are given positionally; the ct_desc layout is declared outside
 * this file, so the per-field notes below are assumptions to be confirmed
 * against that declaration.
 */
static const ct_desc desc = {
     64,                    /* presumably the codelet radix/size (-n 64) */
     "t2bv_64",             /* codelet name string */
     twinstr,               /* twiddle instruction table defined in this file */
     &GENUS,                /* GENUS comes from an included header */
     {467, 198, 52, 0},     /* NOTE(review): looks like operation counts -- confirm */
     0,                     /* trailing fields are all zero; meaning not */
     0,                     /* visible from this file -- TODO confirm    */
     0
};
 
1883
 
 
1884
/*
 * Registration entry point for the t2bv_64 codelet.
 *
 * Passes the planner `p`, the codelet function t2bv_64 and its descriptor
 * `desc` to kdft_dit_register.  Both the function name and the callee are
 * wrapped in the project's X() macro (defined outside this file;
 * presumably a symbol-prefixing macro -- confirm).
 *
 * Stray line-number tokens left by a source viewer inside the body made
 * the function invalid C; they have been removed.  The single call is
 * unchanged.
 */
void X(codelet_t2bv_64) (planner *p) {
     X(kdft_dit_register) (p, t2bv_64, &desc);
}
 
1887
#endif                          /* HAVE_FMA */