~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to dft/simd/common/n1bv_16.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 
3
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 
4
 *
 
5
 * This program is free software; you can redistribute it and/or modify
 
6
 * it under the terms of the GNU General Public License as published by
 
7
 * the Free Software Foundation; either version 2 of the License, or
 
8
 * (at your option) any later version.
 
9
 *
 
10
 * This program is distributed in the hope that it will be useful,
 
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
13
 * GNU General Public License for more details.
 
14
 *
 
15
 * You should have received a copy of the GNU General Public License
 
16
 * along with this program; if not, write to the Free Software
 
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
18
 *
 
19
 */
 
20
 
 
21
/* This file was automatically generated --- DO NOT EDIT */
 
22
/* Generated on Wed Jul 27 06:13:41 EDT 2011 */
 
23
 
 
24
#include "codelet-dft.h"
 
25
 
 
26
#ifdef HAVE_FMA
 
27
 
 
28
/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n1bv_16 -include n1b.h */
 
29
 
 
30
/*
 
31
 * This function contains 72 FP additions, 34 FP multiplications,
 
32
 * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
 
33
 * 54 stack variables, 3 constants, and 32 memory accesses
 
34
 */
 
35
#include "n1b.h"
 
36
 
 
37
/*
 * n1bv_16 (HAVE_FMA variant): straight-line SIMD kernel for a 16-point
 * DFT.  The generator command line recorded in this file gives
 * "-sign 1 -n 16 -simd -fma".
 *
 * Data is read only through `ii` and written only through `io`; `ri` and
 * `ro` are accepted but never referenced in this body.
 *
 *   is, os   - strides handed to WS() to locate element k (k = 0..15) of
 *              the input / output.
 *   v        - loop count; the loop runs while it is > 0 and subtracts VL
 *              on every pass.
 *   ivs, ovs - the input / output pointer advances by VL * ivs and
 *              VL * ovs per pass; both are also forwarded to LD() / ST().
 *
 * Every LD() in the loop body precedes the first ST().
 *
 * NOTE(review): V, R, INT, stride, LD, ST, LDK, DVK, VADD, VSUB, VFMA,
 * VFNMS, VFMAI, VFNMSI, WS, VL, MAKE_VOLATILE_STRIDE and VLEAVE are defined
 * outside this file; their exact semantics are not visible here.
 */
static void n1bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
 
38
{
 
39
     /* Literal values: ~cos(pi/8), ~tan(pi/8) and ~sqrt(2)/2 respectively. */
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
40
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
 
41
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
42
     {
 
43
          INT i;
 
44
          const R *xi;
 
45
          R *xo;
 
46
          /* Only the ii / io pointers are used as the data source / sink. */
          xi = ii;
 
47
          xo = io;
 
48
          /* One 16-point butterfly per pass; VL is consumed from v each time. */
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
 
49
               V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
 
50
               V T16;
 
51
               {
 
52
                    V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
 
53
                    {
 
54
                         V T1, T2, T4, T5, To, Tp, Tr, Ts;
 
55
                         /* Even-offset inputs 0, 8, 4, 12, 2, 10, 14, 6.  The third LD()
                          * argument is &(xi[0]) for even offsets and &(xi[WS(is, 1)]) for
                          * odd ones -- presumably an alignment reference; confirm against
                          * the LD macro. */
                         T1 = LD(&(xi[0]), ivs, &(xi[0]));
 
56
                         T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
 
57
                         T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
 
58
                         T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
 
59
                         To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
 
60
                         Tp = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
 
61
                         Tr = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
 
62
                         Ts = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
 
63
                         {
 
64
                              V T8, TI, Tq, TJ, Tt, T9, Tb, Tc, T3, T6;
 
65
                              /* Odd-offset loads (1, 9, 5, 13, 15, 7, 3, 11) are interleaved
                               * with the pairwise sums/differences of inputs k and k+8. */
                              T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
 
66
                              TH = VSUB(T1, T2);
 
67
                              T3 = VADD(T1, T2);
 
68
                              TU = VSUB(T4, T5);
 
69
                              T6 = VADD(T4, T5);
 
70
                              TI = VSUB(To, Tp);
 
71
                              Tq = VADD(To, Tp);
 
72
                              TJ = VSUB(Tr, Ts);
 
73
                              Tt = VADD(Tr, Ts);
 
74
                              T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
 
75
                              Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
 
76
                              Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
 
77
                              T7 = VSUB(T3, T6);
 
78
                              Tz = VADD(T3, T6);
 
79
                              Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
 
80
                              TK = VADD(TI, TJ);
 
81
                              TV = VSUB(TI, TJ);
 
82
                              TA = VADD(Tq, Tt);
 
83
                              Tu = VSUB(Tq, Tt);
 
84
                              TM = VSUB(T8, T9);
 
85
                              Ta = VADD(T8, T9);
 
86
                              TN = VSUB(Tb, Tc);
 
87
                              Td = VADD(Tb, Tc);
 
88
                              Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
 
89
                              Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
 
90
                              Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
 
91
                         }
 
92
                    }
 
93
                    /* All 16 inputs are now in registers; combine the partial sums. */
                    TF = VADD(Tz, TA);
 
94
                    TB = VSUB(Tz, TA);
 
95
                    T13 = VFNMS(LDK(KP707106781), TK, TH);
 
96
                    TL = VFMA(LDK(KP707106781), TK, TH);
 
97
                    TO = VFNMS(LDK(KP414213562), TN, TM);
 
98
                    TX = VFMA(LDK(KP414213562), TM, TN);
 
99
                    TC = VADD(Ta, Td);
 
100
                    Te = VSUB(Ta, Td);
 
101
                    TP = VSUB(Tf, Tg);
 
102
                    Th = VADD(Tf, Tg);
 
103
                    /* Note the operand order: TQ is input 11 minus input 3. */
                    TQ = VSUB(Tj, Ti);
 
104
                    Tk = VADD(Ti, Tj);
 
105
                    TW = VFMA(LDK(KP707106781), TV, TU);
 
106
                    T16 = VFNMS(LDK(KP707106781), TV, TU);
 
107
               }
 
108
               {
 
109
                    V TY, TR, Tl, TD;
 
110
                    TY = VFMA(LDK(KP414213562), TP, TQ);
 
111
                    TR = VFNMS(LDK(KP414213562), TQ, TP);
 
112
                    Tl = VSUB(Th, Tk);
 
113
                    TD = VADD(Th, Tk);
 
114
                    {
 
115
                         V TS, T17, TZ, T14;
 
116
                         TS = VADD(TO, TR);
 
117
                         T17 = VSUB(TO, TR);
 
118
                         TZ = VSUB(TX, TY);
 
119
                         T14 = VADD(TX, TY);
 
120
                         {
 
121
                              V TE, TG, Tm, Tv;
 
122
                              TE = VSUB(TC, TD);
 
123
                              TG = VADD(TC, TD);
 
124
                              Tm = VADD(Te, Tl);
 
125
                              Tv = VSUB(Te, Tl);
 
126
                              {
 
127
                                   V T18, T1a, TT, T11;
 
128
                                   T18 = VFMA(LDK(KP923879532), T17, T16);
 
129
                                   T1a = VFNMS(LDK(KP923879532), T17, T16);
 
130
                                   TT = VFNMS(LDK(KP923879532), TS, TL);
 
131
                                   T11 = VFMA(LDK(KP923879532), TS, TL);
 
132
                                   {
 
133
                                        V T15, T19, T10, T12;
 
134
                                        T15 = VFNMS(LDK(KP923879532), T14, T13);
 
135
                                        T19 = VFMA(LDK(KP923879532), T14, T13);
 
136
                                        T10 = VFNMS(LDK(KP923879532), TZ, TW);
 
137
                                        T12 = VFMA(LDK(KP923879532), TZ, TW);
 
138
                                        /* First stores: outputs 0, 8, 4 and 12. */
                                        ST(&(xo[0]), VADD(TF, TG), ovs, &(xo[0]));
 
139
                                        ST(&(xo[WS(os, 8)]), VSUB(TF, TG), ovs, &(xo[0]));
 
140
                                        ST(&(xo[WS(os, 4)]), VFMAI(TE, TB), ovs, &(xo[0]));
 
141
                                        ST(&(xo[WS(os, 12)]), VFNMSI(TE, TB), ovs, &(xo[0]));
 
142
                                        {
 
143
                                             V Tw, Ty, Tn, Tx;
 
144
                                             Tw = VFNMS(LDK(KP707106781), Tv, Tu);
 
145
                                             Ty = VFMA(LDK(KP707106781), Tv, Tu);
 
146
                                             Tn = VFNMS(LDK(KP707106781), Tm, T7);
 
147
                                             Tx = VFMA(LDK(KP707106781), Tm, T7);
 
148
                                             /* Odd outputs, stored as VFNMSI/VFMAI pairs (3/13, 11/5,
                                              * 15/1, 7/9) that share the same two operands. */
                                             ST(&(xo[WS(os, 3)]), VFNMSI(T1a, T19), ovs, &(xo[WS(os, 1)]));
 
149
                                             ST(&(xo[WS(os, 13)]), VFMAI(T1a, T19), ovs, &(xo[WS(os, 1)]));
 
150
                                             ST(&(xo[WS(os, 11)]), VFNMSI(T18, T15), ovs, &(xo[WS(os, 1)]));
 
151
                                             ST(&(xo[WS(os, 5)]), VFMAI(T18, T15), ovs, &(xo[WS(os, 1)]));
 
152
                                             ST(&(xo[WS(os, 15)]), VFNMSI(T12, T11), ovs, &(xo[WS(os, 1)]));
 
153
                                             ST(&(xo[WS(os, 1)]), VFMAI(T12, T11), ovs, &(xo[WS(os, 1)]));
 
154
                                             ST(&(xo[WS(os, 9)]), VFMAI(T10, TT), ovs, &(xo[WS(os, 1)]));
 
155
                                             ST(&(xo[WS(os, 7)]), VFNMSI(T10, TT), ovs, &(xo[WS(os, 1)]));
 
156
                                             /* Remaining even outputs: 2/14 and 10/6. */
                                             ST(&(xo[WS(os, 2)]), VFMAI(Ty, Tx), ovs, &(xo[0]));
 
157
                                             ST(&(xo[WS(os, 14)]), VFNMSI(Ty, Tx), ovs, &(xo[0]));
 
158
                                             ST(&(xo[WS(os, 10)]), VFMAI(Tw, Tn), ovs, &(xo[0]));
 
159
                                             ST(&(xo[WS(os, 6)]), VFNMSI(Tw, Tn), ovs, &(xo[0]));
 
160
                                        }
 
161
                                   }
 
162
                              }
 
163
                         }
 
164
                    }
 
165
               }
 
166
          }
 
167
     }
 
168
     VLEAVE();
 
169
}
 
170
 
 
171
/*
 * Descriptor handed to X(kdft_register) together with n1bv_16 (HAVE_FMA
 * variant).  The leading 16 equals the -n value on the generator command
 * line, and {38, 0, 34, 0} repeats the "38 additions, 0 multiplications,
 * 34 fused multiply/add" counts from the generator comment in this branch.
 * NOTE(review): field meanings come from kdft_desc, which is declared
 * outside this file -- confirm the trailing zeros against that definition.
 */
static const kdft_desc desc = { 16, XSIMD_STRING("n1bv_16"), {38, 0, 34, 0}, &GENUS, 0, 0, 0, 0 };
 
172
 
 
173
/*
 * Registration entry point (HAVE_FMA variant): passes planner `p`, the
 * n1bv_16 kernel and its descriptor `desc` to X(kdft_register).
 */
void XSIMD(codelet_n1bv_16) (planner *p) {
 
174
     X(kdft_register) (p, n1bv_16, &desc);
 
175
}
 
176
 
 
177
#else                           /* HAVE_FMA */
 
178
 
 
179
/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 16 -name n1bv_16 -include n1b.h */
 
180
 
 
181
/*
 
182
 * This function contains 72 FP additions, 12 FP multiplications,
 
183
 * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
 
184
 * 30 stack variables, 3 constants, and 32 memory accesses
 
185
 */
 
186
#include "n1b.h"
 
187
 
 
188
/*
 * n1bv_16 (variant compiled when HAVE_FMA is not defined): straight-line
 * SIMD kernel for a 16-point DFT.  The generator command line recorded in
 * this branch gives "-sign 1 -n 16 -simd".
 *
 * Data is read only through `ii` and written only through `io`; `ri` and
 * `ro` are accepted but never referenced in this body.
 *
 *   is, os   - strides handed to WS() to locate element k (k = 0..15) of
 *              the input / output.
 *   v        - loop count; the loop runs while it is > 0 and subtracts VL
 *              on every pass.
 *   ivs, ovs - the input / output pointer advances by VL * ivs and
 *              VL * ovs per pass; both are also forwarded to LD() / ST().
 *
 * The body is organised as four load blocks (four inputs each) followed by
 * four store blocks (four outputs each); every LD() precedes the first ST().
 *
 * NOTE(review): V, R, INT, stride, LD, ST, LDK, DVK, VADD, VSUB, VMUL, VFMA,
 * VFNMS, VBYI, WS, VL, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside
 * this file; their exact semantics are not visible here.
 */
static void n1bv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
 
189
{
 
190
     /* Literal values: ~sin(pi/8), ~cos(pi/8) and ~sqrt(2)/2 respectively. */
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
 
191
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
 
192
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
 
193
     {
 
194
          INT i;
 
195
          const R *xi;
 
196
          R *xo;
 
197
          /* Only the ii / io pointers are used as the data source / sink. */
          xi = ii;
 
198
          xo = io;
 
199
          /* One 16-point butterfly per pass; VL is consumed from v each time. */
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
 
200
               V Tp, T13, Tu, TY, Tm, T14, Tv, TU, T7, T16, Tx, TN, Te, T17, Ty;
 
201
               V TQ;
 
202
               /* Inputs 4, 12, 0, 8: pairwise sums and differences. */
               {
 
203
                    V Tn, To, TX, Ts, Tt, TW;
 
204
                    Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
 
205
                    To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
 
206
                    TX = VADD(Tn, To);
 
207
                    Ts = LD(&(xi[0]), ivs, &(xi[0]));
 
208
                    Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
 
209
                    TW = VADD(Ts, Tt);
 
210
                    Tp = VSUB(Tn, To);
 
211
                    T13 = VADD(TW, TX);
 
212
                    Tu = VSUB(Ts, Tt);
 
213
                    TY = VSUB(TW, TX);
 
214
               }
 
215
               /* Inputs 2, 10, 14, 6; the difference terms are scaled by KP707106781. */
               {
 
216
                    V Ti, TS, Tl, TT;
 
217
                    {
 
218
                         V Tg, Th, Tj, Tk;
 
219
                         Tg = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
 
220
                         Th = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
 
221
                         Ti = VSUB(Tg, Th);
 
222
                         TS = VADD(Tg, Th);
 
223
                         Tj = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
 
224
                         Tk = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
 
225
                         Tl = VSUB(Tj, Tk);
 
226
                         TT = VADD(Tj, Tk);
 
227
                    }
 
228
                    Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
 
229
                    T14 = VADD(TS, TT);
 
230
                    Tv = VMUL(LDK(KP707106781), VADD(Ti, Tl));
 
231
                    TU = VSUB(TS, TT);
 
232
               }
 
233
               /* Inputs 1, 9, 5, 13; differences are combined with the
                * KP923879532 / KP382683432 pair. */
               {
 
234
                    V T3, TL, T6, TM;
 
235
                    {
 
236
                         V T1, T2, T4, T5;
 
237
                         T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
 
238
                         T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
 
239
                         T3 = VSUB(T1, T2);
 
240
                         TL = VADD(T1, T2);
 
241
                         T4 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
 
242
                         T5 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
 
243
                         T6 = VSUB(T4, T5);
 
244
                         TM = VADD(T4, T5);
 
245
                    }
 
246
                    T7 = VFNMS(LDK(KP382683432), T6, VMUL(LDK(KP923879532), T3));
 
247
                    T16 = VADD(TL, TM);
 
248
                    Tx = VFMA(LDK(KP382683432), T3, VMUL(LDK(KP923879532), T6));
 
249
                    TN = VSUB(TL, TM);
 
250
               }
 
251
               /* Inputs 15, 7, 3, 11; same constant pair as the previous block. */
               {
 
252
                    V Ta, TO, Td, TP;
 
253
                    {
 
254
                         V T8, T9, Tb, Tc;
 
255
                         T8 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
 
256
                         T9 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
 
257
                         Ta = VSUB(T8, T9);
 
258
                         TO = VADD(T8, T9);
 
259
                         Tb = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
 
260
                         Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
 
261
                         Td = VSUB(Tb, Tc);
 
262
                         TP = VADD(Tb, Tc);
 
263
                    }
 
264
                    Te = VFMA(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), Td));
 
265
                    T17 = VADD(TO, TP);
 
266
                    Ty = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
 
267
                    TQ = VSUB(TO, TP);
 
268
               }
 
269
               /* Outputs 12, 4, 8, 0: built only from the pairwise sums
                * T13, T14, T16, T17. */
               {
 
270
                    V T15, T18, T19, T1a;
 
271
                    T15 = VSUB(T13, T14);
 
272
                    T18 = VBYI(VSUB(T16, T17));
 
273
                    ST(&(xo[WS(os, 12)]), VSUB(T15, T18), ovs, &(xo[0]));
 
274
                    ST(&(xo[WS(os, 4)]), VADD(T15, T18), ovs, &(xo[0]));
 
275
                    T19 = VADD(T13, T14);
 
276
                    T1a = VADD(T16, T17);
 
277
                    ST(&(xo[WS(os, 8)]), VSUB(T19, T1a), ovs, &(xo[0]));
 
278
                    ST(&(xo[0]), VADD(T19, T1a), ovs, &(xo[0]));
 
279
               }
 
280
               /* Outputs 6, 14, 10, 2. */
               {
 
281
                    V TV, T11, T10, T12, TR, TZ;
 
282
                    TR = VMUL(LDK(KP707106781), VSUB(TN, TQ));
 
283
                    TV = VBYI(VSUB(TR, TU));
 
284
                    T11 = VBYI(VADD(TU, TR));
 
285
                    TZ = VMUL(LDK(KP707106781), VADD(TN, TQ));
 
286
                    T10 = VSUB(TY, TZ);
 
287
                    T12 = VADD(TY, TZ);
 
288
                    ST(&(xo[WS(os, 6)]), VADD(TV, T10), ovs, &(xo[0]));
 
289
                    ST(&(xo[WS(os, 14)]), VSUB(T12, T11), ovs, &(xo[0]));
 
290
                    ST(&(xo[WS(os, 10)]), VSUB(T10, TV), ovs, &(xo[0]));
 
291
                    ST(&(xo[WS(os, 2)]), VADD(T11, T12), ovs, &(xo[0]));
 
292
               }
 
293
               /* Outputs 5, 13, 11, 3. */
               {
 
294
                    V Tr, TB, TA, TC;
 
295
                    {
 
296
                         V Tf, Tq, Tw, Tz;
 
297
                         Tf = VSUB(T7, Te);
 
298
                         Tq = VSUB(Tm, Tp);
 
299
                         Tr = VBYI(VSUB(Tf, Tq));
 
300
                         TB = VBYI(VADD(Tq, Tf));
 
301
                         Tw = VSUB(Tu, Tv);
 
302
                         Tz = VSUB(Tx, Ty);
 
303
                         TA = VSUB(Tw, Tz);
 
304
                         TC = VADD(Tw, Tz);
 
305
                    }
 
306
                    ST(&(xo[WS(os, 5)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
 
307
                    ST(&(xo[WS(os, 13)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
 
308
                    ST(&(xo[WS(os, 11)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
 
309
                    ST(&(xo[WS(os, 3)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
 
310
               }
 
311
               /* Outputs 15, 7, 1, 9. */
               {
 
312
                    V TF, TJ, TI, TK;
 
313
                    {
 
314
                         V TD, TE, TG, TH;
 
315
                         TD = VADD(Tu, Tv);
 
316
                         TE = VADD(T7, Te);
 
317
                         TF = VADD(TD, TE);
 
318
                         TJ = VSUB(TD, TE);
 
319
                         TG = VADD(Tp, Tm);
 
320
                         TH = VADD(Tx, Ty);
 
321
                         TI = VBYI(VADD(TG, TH));
 
322
                         TK = VBYI(VSUB(TH, TG));
 
323
                    }
 
324
                    ST(&(xo[WS(os, 15)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
 
325
                    ST(&(xo[WS(os, 7)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
 
326
                    ST(&(xo[WS(os, 1)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
 
327
                    ST(&(xo[WS(os, 9)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
 
328
               }
 
329
          }
 
330
     }
 
331
     VLEAVE();
 
332
}
 
333
 
 
334
/*
 * Descriptor handed to X(kdft_register) together with n1bv_16 (variant
 * compiled when HAVE_FMA is not defined).  The leading 16 equals the -n
 * value on the generator command line, and {68, 8, 4, 0} repeats the
 * "68 additions, 8 multiplications, 4 fused multiply/add" counts from the
 * generator comment in this branch.
 * NOTE(review): field meanings come from kdft_desc, which is declared
 * outside this file -- confirm the trailing zeros against that definition.
 */
static const kdft_desc desc = { 16, XSIMD_STRING("n1bv_16"), {68, 8, 4, 0}, &GENUS, 0, 0, 0, 0 };
 
335
 
 
336
/*
 * Registration entry point (variant compiled when HAVE_FMA is not defined):
 * passes planner `p`, the n1bv_16 kernel and its descriptor `desc` to
 * X(kdft_register).
 */
void XSIMD(codelet_n1bv_16) (planner *p) {
 
337
     X(kdft_register) (p, n1bv_16, &desc);
 
338
}
 
339
 
 
340
#endif                          /* HAVE_FMA */