~ubuntu-branches/ubuntu/utopic/fftw3/utopic

« back to all changes in this revision

Viewing changes to dft/simd/codelets/n2fv_10.c

  • Committer: Package Import Robot
  • Author(s): Matthias Klose
  • Date: 2011-12-14 13:21:22 UTC
  • mfrom: (3.1.5 sid)
  • Revision ID: package-import@ubuntu.com-20111214132122-l4avyl2kkr7vq5aj
Tags: 3.3-1ubuntu1
* Merge with Debian; remaining changes:
  - Revert the ARM workaround.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
 * Copyright (c) 2003, 2007-8 Matteo Frigo
3
 
 * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
4
 
 *
5
 
 * This program is free software; you can redistribute it and/or modify
6
 
 * it under the terms of the GNU General Public License as published by
7
 
 * the Free Software Foundation; either version 2 of the License, or
8
 
 * (at your option) any later version.
9
 
 *
10
 
 * This program is distributed in the hope that it will be useful,
11
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 
 * GNU General Public License for more details.
14
 
 *
15
 
 * You should have received a copy of the GNU General Public License
16
 
 * along with this program; if not, write to the Free Software
17
 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 
 *
19
 
 */
20
 
 
21
 
/* This file was automatically generated --- DO NOT EDIT */
22
 
/* Generated on Sun Jul 12 06:39:53 EDT 2009 */
23
 
 
24
 
#include "codelet-dft.h"
25
 
 
26
 
#ifdef HAVE_FMA
27
 
 
28
 
/* Generated by: ../../../genfft/gen_notw_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include n2f.h -store-multiple 2 */
29
 
 
30
 
/*
31
 
 * This function contains 42 FP additions, 22 FP multiplications,
32
 
 * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
33
 
 * 53 stack variables, 4 constants, and 25 memory accesses
34
 
 */
35
 
#include "n2f.h"
36
 
 
37
 
/*
 * n2fv_10 -- fused multiply/add schedule.
 *
 * Each loop pass loads ten vectors from xi (at WS(is, 0) .. WS(is, 9)),
 * combines them with the butterfly network below and hands ten result
 * vectors to STM2/STN2 at xo[0], xo[2], ..., xo[18].  The loop starts with
 * i = v and steps i down by VL while advancing xi by VL * ivs and xo by
 * VL * ovs.
 *
 * ii and io are never referenced here, and os is only passed to
 * MAKE_VOLATILE_STRIDE.
 *
 * Naming used below:
 *   inK       vector loaded from xi[WS(is, K)]
 *   sumAB     inA + inB,  difAB = inA - inB
 *   ev_* / od_*  values derived only from the sums / only from the
 *                differences
 *   outK      vector stored at xo[2 * K]
 *
 * NOTE(review): the exact memory effect of STM2 versus STN2 depends on the
 * SIMD backend header, which is not visible here; both calls are kept in
 * the generated order.
 */
static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     INT i;
     const R *xi;
     R *xo;

     for (i = v, xi = ri, xo = ro; i > 0; i = i - VL, xi += (VL * ivs), xo += (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
          V in0, in1, in2, in3, in4, in5, in6, in7, in8, in9;
          V sum05, dif05, sum27, dif27, sum61, dif61, sum83, dif83, sum49, dif49;
          V ev_s1, ev_d1, ev_s2, ev_d2, ev_tot, ev_diff;
          V od_s1, od_d1, od_s2, od_d2, od_tot, od_diff;
          V ev_rot_a, ev_rot_b, od_rot_a, od_rot_b;
          V ev_base, od_base, ev_re_a, ev_re_b, od_re_a, od_re_b;
          V out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

          /* All ten loads are issued before the first store. */
          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
          in6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          in9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));

          /* First stage: pair each input with the one five positions away. */
          sum05 = VADD(in0, in5);
          dif05 = VSUB(in0, in5);
          sum27 = VADD(in2, in7);
          dif27 = VSUB(in2, in7);
          sum61 = VADD(in6, in1);
          dif61 = VSUB(in6, in1);
          sum83 = VADD(in8, in3);
          dif83 = VSUB(in8, in3);
          sum49 = VADD(in4, in9);
          dif49 = VSUB(in4, in9);

          /* Second stage on the sums (feeds out0, out2, out4, out6, out8). */
          ev_s1 = VADD(sum27, sum83);
          ev_d1 = VSUB(sum27, sum83);
          ev_s2 = VADD(sum49, sum61);
          ev_d2 = VSUB(sum49, sum61);
          ev_tot = VADD(ev_s1, ev_s2);
          ev_diff = VSUB(ev_s1, ev_s2);
          ev_rot_b = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), ev_d1, ev_d2));
          ev_rot_a = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), ev_d2, ev_d1));

          /* Second stage on the differences (feeds out1, out3, out5, out7, out9). */
          od_s1 = VADD(dif27, dif83);
          od_d1 = VSUB(dif27, dif83);
          od_s2 = VADD(dif49, dif61);
          od_d2 = VSUB(dif49, dif61);
          od_tot = VADD(od_s1, od_s2);
          od_diff = VSUB(od_s1, od_s2);
          od_rot_a = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), od_d2, od_d1));
          od_rot_b = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), od_d1, od_d2));

          /* Terms that the *_rot_* values are combined with below. */
          ev_base = VFNMS(LDK(KP250000000), ev_tot, sum05);
          od_base = VFNMS(LDK(KP250000000), od_tot, dif05);
          ev_re_b = VFNMS(LDK(KP559016994), ev_diff, ev_base);
          ev_re_a = VFMA(LDK(KP559016994), ev_diff, ev_base);
          od_re_a = VFMA(LDK(KP559016994), od_diff, od_base);
          od_re_b = VFNMS(LDK(KP559016994), od_diff, od_base);

          /* Stores, in the order the generator emitted them. */
          out0 = VADD(sum05, ev_tot);
          STM2(&(xo[0]), out0, ovs, &(xo[0]));
          out5 = VADD(dif05, od_tot);
          STM2(&(xo[10]), out5, ovs, &(xo[2]));

          out4 = VFMAI(ev_rot_a, ev_re_a);
          STM2(&(xo[8]), out4, ovs, &(xo[0]));
          STN2(&(xo[8]), out4, out5, ovs);
          out6 = VFNMSI(ev_rot_a, ev_re_a);
          STM2(&(xo[12]), out6, ovs, &(xo[0]));
          out8 = VFNMSI(ev_rot_b, ev_re_b);
          STM2(&(xo[16]), out8, ovs, &(xo[0]));
          out2 = VFMAI(ev_rot_b, ev_re_b);
          STM2(&(xo[4]), out2, ovs, &(xo[0]));

          out3 = VFNMSI(od_rot_b, od_re_b);
          STM2(&(xo[6]), out3, ovs, &(xo[2]));
          STN2(&(xo[4]), out2, out3, ovs);
          out7 = VFMAI(od_rot_b, od_re_b);
          STM2(&(xo[14]), out7, ovs, &(xo[2]));
          STN2(&(xo[12]), out6, out7, ovs);
          out9 = VFMAI(od_rot_a, od_re_a);
          STM2(&(xo[18]), out9, ovs, &(xo[2]));
          STN2(&(xo[16]), out8, out9, ovs);
          out1 = VFNMSI(od_rot_a, od_re_a);
          STM2(&(xo[2]), out1, ovs, &(xo[2]));
          STN2(&(xo[0]), out0, out1, ovs);
     }
}
143
 
 
144
 
/*
 * Descriptor handed to X(kdft_register) for the HAVE_FMA build of n2fv_10.
 * 10 and "n2fv_10" match the generator's -n and -name arguments, and
 * {24, 4, 18, 0} repeats the "24 additions, 4 multiplications, 18 fused
 * multiply/add" figures from the generated header comment above.
 * NOTE(review): the meaning of the trailing fields (&GENUS, 0, 2, 0, 0) is
 * defined by kdft_desc in the project headers; the 2 presumably mirrors
 * -with-ostride 2 -- confirm against codelet-dft.h.
 */
static const kdft_desc desc = { 10, "n2fv_10", {24, 4, 18, 0}, &GENUS, 0, 2, 0, 0 };
145
 
 
146
 
/*
 * Registration entry point (HAVE_FMA build): passes the n2fv_10 kernel
 * and its descriptor to X(kdft_register) together with planner p.
 */
void X(codelet_n2fv_10) (planner *p)
{
     X(kdft_register) (p, n2fv_10, &desc);
}
149
 
 
150
 
#else                           /* HAVE_FMA */
151
 
 
152
 
/* Generated by: ../../../genfft/gen_notw_c -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include n2f.h -store-multiple 2 */
153
 
 
154
 
/*
155
 
 * This function contains 42 FP additions, 12 FP multiplications,
156
 
 * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
157
 
 * 36 stack variables, 4 constants, and 25 memory accesses
158
 
 */
159
 
#include "n2f.h"
160
 
 
161
 
/*
 * n2fv_10 -- schedule used when HAVE_FMA is not defined.
 *
 * Each loop pass loads ten vectors from xi (at WS(is, 0) .. WS(is, 9)),
 * combines them with the butterfly network below and hands ten result
 * vectors to STM2/STN2 at xo[0], xo[2], ..., xo[18].  The loop starts with
 * i = v and steps i down by VL while advancing xi by VL * ivs and xo by
 * VL * ovs.
 *
 * ii and io are never referenced here, and os is only passed to
 * MAKE_VOLATILE_STRIDE.
 *
 * Naming used below:
 *   inK       vector loaded from xi[WS(is, K)]
 *   sumAB     inA + inB,  difAB = inA - inB
 *   ev_* / od_*  values derived only from the sums / only from the
 *                differences
 *   outK      vector stored at xo[2 * K]
 *
 * NOTE(review): the exact memory effect of STM2 versus STN2 depends on the
 * SIMD backend header, which is not visible here; both calls are kept in
 * the generated order.
 */
static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     INT i;
     const R *xi;
     R *xo;

     for (i = v, xi = ri, xo = ro; i > 0; i = i - VL, xi += (VL * ivs), xo += (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
          V in0, in1, in2, in3, in4, in5, in6, in7, in8, in9;
          V sum05, dif05, sum27, dif27, sum61, dif61, sum83, dif83, sum49, dif49;
          V ev_s1, ev_d1, ev_s2, ev_d2, ev_tot;
          V od_s1, od_d1, od_s2, od_d2, od_tot;
          V ev_rot_a, ev_rot_b, od_rot_a, od_rot_b;
          V ev_base, ev_scaled, ev_re_a, ev_re_b;
          V od_base, od_scaled, od_re_a, od_re_b;
          V out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

          /* All ten loads are issued before the first store. */
          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
          in6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          in9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));

          /* First stage: pair each input with the one five positions away. */
          dif05 = VSUB(in0, in5);
          sum05 = VADD(in0, in5);
          dif27 = VSUB(in2, in7);
          sum27 = VADD(in2, in7);
          dif61 = VSUB(in6, in1);
          sum61 = VADD(in6, in1);
          dif83 = VSUB(in8, in3);
          sum83 = VADD(in8, in3);
          dif49 = VSUB(in4, in9);
          sum49 = VADD(in4, in9);

          /* Second stage on the differences (feeds out1, out3, out5, out7, out9). */
          od_d1 = VSUB(dif27, dif83);
          od_d2 = VSUB(dif49, dif61);
          od_s1 = VADD(dif27, dif83);
          od_s2 = VADD(dif49, dif61);
          od_tot = VADD(od_s1, od_s2);

          /* Second stage on the sums (feeds out0, out2, out4, out6, out8). */
          ev_d1 = VSUB(sum27, sum83);
          ev_d2 = VSUB(sum49, sum61);
          ev_s1 = VADD(sum27, sum83);
          ev_s2 = VADD(sum49, sum61);
          ev_tot = VADD(ev_s1, ev_s2);

          /* Terms for the odd-numbered outputs; VBYI wraps the *_rot_* values. */
          od_rot_a = VBYI(VFMA(LDK(KP951056516), od_d1, VMUL(LDK(KP587785252), od_d2)));
          od_rot_b = VBYI(VFNMS(LDK(KP587785252), od_d1, VMUL(LDK(KP951056516), od_d2)));
          od_scaled = VMUL(LDK(KP559016994), VSUB(od_s1, od_s2));
          od_base = VFNMS(LDK(KP250000000), od_tot, dif05);
          od_re_a = VADD(od_scaled, od_base);
          od_re_b = VSUB(od_base, od_scaled);

          /* Matching terms for the even-numbered outputs. */
          ev_rot_b = VBYI(VFNMS(LDK(KP587785252), ev_d1, VMUL(LDK(KP951056516), ev_d2)));
          ev_rot_a = VBYI(VFMA(LDK(KP951056516), ev_d1, VMUL(LDK(KP587785252), ev_d2)));
          ev_base = VFNMS(LDK(KP250000000), ev_tot, sum05);
          ev_scaled = VMUL(LDK(KP559016994), VSUB(ev_s1, ev_s2));
          ev_re_b = VSUB(ev_base, ev_scaled);
          ev_re_a = VADD(ev_scaled, ev_base);

          /* Stores, in the order the generator emitted them. */
          out5 = VADD(dif05, od_tot);
          STM2(&(xo[10]), out5, ovs, &(xo[2]));
          out0 = VADD(sum05, ev_tot);
          STM2(&(xo[0]), out0, ovs, &(xo[0]));

          out1 = VSUB(od_re_a, od_rot_a);
          STM2(&(xo[2]), out1, ovs, &(xo[2]));
          STN2(&(xo[0]), out0, out1, ovs);
          out7 = VADD(od_rot_b, od_re_b);
          STM2(&(xo[14]), out7, ovs, &(xo[2]));
          out9 = VADD(od_rot_a, od_re_a);
          STM2(&(xo[18]), out9, ovs, &(xo[2]));
          out3 = VSUB(od_re_b, od_rot_b);
          STM2(&(xo[6]), out3, ovs, &(xo[2]));

          out2 = VADD(ev_rot_b, ev_re_b);
          STM2(&(xo[4]), out2, ovs, &(xo[0]));
          STN2(&(xo[4]), out2, out3, ovs);
          out6 = VSUB(ev_re_a, ev_rot_a);
          STM2(&(xo[12]), out6, ovs, &(xo[0]));
          STN2(&(xo[12]), out6, out7, ovs);
          out8 = VSUB(ev_re_b, ev_rot_b);
          STM2(&(xo[16]), out8, ovs, &(xo[0]));
          STN2(&(xo[16]), out8, out9, ovs);
          out4 = VADD(ev_rot_a, ev_re_a);
          STM2(&(xo[8]), out4, ovs, &(xo[0]));
          STN2(&(xo[8]), out4, out5, ovs);
     }
}
264
 
 
265
 
/*
 * Descriptor handed to X(kdft_register) for the non-FMA build of n2fv_10.
 * 10 and "n2fv_10" match the generator's -n and -name arguments, and
 * {36, 6, 6, 0} repeats the "36 additions, 6 multiplications, 6 fused
 * multiply/add" figures from the generated header comment above.
 * NOTE(review): the meaning of the trailing fields (&GENUS, 0, 2, 0, 0) is
 * defined by kdft_desc in the project headers; the 2 presumably mirrors
 * -with-ostride 2 -- confirm against codelet-dft.h.
 */
static const kdft_desc desc = { 10, "n2fv_10", {36, 6, 6, 0}, &GENUS, 0, 2, 0, 0 };
266
 
 
267
 
/*
 * Registration entry point (non-FMA build): passes the n2fv_10 kernel
 * and its descriptor to X(kdft_register) together with planner p.
 */
void X(codelet_n2fv_10) (planner *p)
{
     X(kdft_register) (p, n2fv_10, &desc);
}
270
 
 
271
 
#endif                          /* HAVE_FMA */