/*
 * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
 * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
20
#include "complexfilter.h"
22
#include "fftwindow.h"
27
#if defined (__i386__) || defined (__x86_64__)
29
#if defined (__x86_64__)
30
// Sharpen-only pass over one FFT block, SSE3 path (x86-64 register file).
// Per complex bin x: subtract the grid correction (gridfraction *
// gridsample[x]), form psd = re^2 + im^2 + 1e-15, scale the bin by
//   sfact = 1 + wsharpen[x] * sqrt(psd*sigmaSquaredSharpenMax /
//           ((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)))
// then re-add the grid correction and store back into block->complex.
// sqrt and division are approximated via rcpps/rsqrtps (~12-bit precision).
// The constants are staged in `temp` as 4-wide splats so the asm can load
// them with aligned movaps (byte offsets noted in the loop below).
// NOTE(review): extraction appears to have dropped the `asm volatile (`
// openers, the xmm/"memory" clobber lists, the closing `);`/braces and the
// `#else`/`#endif` pairing with the next definition, and interleaved stray
// line-number tokens — restore from upstream before compiling.
void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
31
fftwf_complex* outcur = block->complex;
32
fftwf_complex* gridsample = grid->complex;
33
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
34
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
35
float *wsharpen = sharpenWindow->getLine(0);
37
for (int i = 0; i < 4; i++) {
38
temp[i+0] = 1e-15f; // 0
39
temp[i+4] = gridfraction; // 16
40
temp[i+8] = sigmaSquaredSharpenMin; // 32
41
temp[i+12] = sigmaSquaredSharpenMax; // 48
42
temp[i+16] = 1.0f; // 64
45
// 8-bins-per-iteration unrolled loop; falls back to the 4-wide loop below
// when `size` is not a multiple of 8.
if ((size & 7) == 0) { // TODO: Benchmark on non-vm platform - slower under VMWARE
48
"movaps 16(%1),%%xmm14\n" // Load gridfraction into xmm14
49
"loop_sharpenonly_sse3_big:\n"
50
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
51
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
52
"movaps 32(%2), %%xmm8\n" // in r0i0 r1i1
53
"movaps 48(%2), %%xmm9\n" //in r2i2 r3i3
54
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
55
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
56
"movaps 32(%3), %%xmm12\n" // grid r0i0 r1i1
57
"movaps 48(%3), %%xmm13\n" // grid r2i2 r3i3
59
"mulps %%xmm14, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
60
"mulps %%xmm14, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
61
"mulps %%xmm14, %%xmm12\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
62
"mulps %%xmm14, %%xmm13\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
63
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
64
"movaps %%xmm5, %%xmm3\n"
65
"movaps %%xmm12, %%xmm10\n" // maintain gridcorrection in memory
66
"movaps %%xmm13, %%xmm11\n"
67
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
68
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
69
"subps %%xmm12, %%xmm8\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
70
"subps %%xmm13, %%xmm9\n" // re2 im2 re3 im3 -
71
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
72
"movaps %%xmm1, %%xmm5\n"
73
"movaps %%xmm8, %%xmm12\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
74
"movaps %%xmm9, %%xmm13\n"
76
"movaps (%1), %%xmm6\n" // Move 1e-15 into xmm6
77
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
78
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
79
"mulps %%xmm12, %%xmm12\n" //r0i0 r1i1 squared
80
"mulps %%xmm13, %%xmm13\n" //r2i2 r3i3 squared
81
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
82
"haddps %%xmm13, %%xmm12\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
83
"addps %%xmm6, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
84
"addps %%xmm6, %%xmm12\n" // add 1e-15 (xmm4: psd for all 4 pixels)
85
"movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
87
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
88
"movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
89
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
90
"movaps %%xmm12, %%xmm13\n" // Copy psd into xmm5
91
"movaps %%xmm7, %%xmm15\n" // Copy sigmaSquaredSharpenMax
92
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
93
"addps %%xmm7, %%xmm12\n" // xmm4 = psd + sigmaSquaredSharpenMax
94
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
95
"mulps %%xmm13, %%xmm15\n" // xmm7 = psd*sigmaSquaredSharpenMax
96
"addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
97
"addps %%xmm6, %%xmm13\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
99
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
100
"mulps %%xmm12, %%xmm13\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
101
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
102
"movaps (%4), %%xmm6\n" // load wsharpen[0->4]
103
"rcpps %%xmm13, %%xmm13\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
104
"movaps 16(%4), %%xmm4\n" // Load wsharpen
105
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
106
"mulps %%xmm13, %%xmm15\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
107
"movaps 64(%1), %%xmm5\n" // Load "1.0"
108
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
109
"rsqrtps %%xmm15, %%xmm15\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
110
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
111
"rcpps %%xmm15, %%xmm15\n" // sqrt (..)
112
"mulps %%xmm6, %%xmm7\n" // multiply wsharpen
113
"mulps %%xmm4, %%xmm15\n" // multiply wsharpen
114
"addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
115
"addps %%xmm5, %%xmm15\n" // + 1.0 xmm7 = sfact
116
"movaps %%xmm7, %%xmm5\n"
117
"movaps %%xmm15, %%xmm13\n"
118
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
119
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
120
"unpcklps %%xmm15, %%xmm15\n" // unpack low to xmm7
121
"unpckhps %%xmm13, %%xmm13\n" // unpack high to xmm5
123
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
124
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
125
"mulps %%xmm15, %%xmm8\n" // re+im *= sfact
126
"mulps %%xmm13, %%xmm9\n" // re+im *= sfact
127
"addps %%xmm2, %%xmm0\n" // add gridcorrection
128
"addps %%xmm3, %%xmm1\n" // add gridcorrection
129
"addps %%xmm10, %%xmm8\n" // add gridcorrection
130
"addps %%xmm11, %%xmm9\n" // add gridcorrection
131
"movaps %%xmm0, (%2)\n" // Store
132
"movaps %%xmm1, 16(%2)\n" // Store
133
"movaps %%xmm8, 32(%2)\n" // Store
134
"movaps %%xmm9, 48(%2)\n" // Store
135
"sub $8, %0\n" // size -=8
136
"add $64, %2\n" // outcur+=64
137
"add $64, %3\n" // gridsample+=64
138
"add $32, %4\n" // wsharpen+=32
140
"jg loop_sharpenonly_sse3_big\n"
141
: /* no output registers */
142
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
143
: /* %0 %1 %2 %3 %4 */
// 4-bins-per-iteration variant: constants are hoisted into xmm11..xmm15
// once before the loop instead of reloaded each iteration.
148
"movaps (%1), %%xmm11\n"
149
"movaps 16(%1), %%xmm12\n"
150
"movaps 32(%1), %%xmm13\n"
151
"movaps 48(%1), %%xmm14\n"
152
"movaps 64(%1), %%xmm15\n"
153
"loop_sharpenonly_sse3:\n"
154
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
155
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
156
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
157
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
159
"mulps %%xmm12, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
160
"mulps %%xmm12, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
161
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
162
"movaps %%xmm5, %%xmm3\n"
163
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
164
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
165
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
166
"movaps %%xmm1, %%xmm5\n"
168
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
169
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
170
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
171
"addps %%xmm11, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
172
"movaps %%xmm14, %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
174
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
175
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
176
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
177
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
178
"addps %%xmm13, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
180
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
181
"movaps (%4), %%xmm6\n" // load wsharpen[0->4]
182
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
183
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
184
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
185
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
186
"mulps %%xmm6, %%xmm7\n" // multiply wsharpen
187
"addps %%xmm15, %%xmm7\n" // + 1.0 xmm7 = sfact
188
"movaps %%xmm7, %%xmm5\n"
189
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
190
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
192
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
193
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
194
"addps %%xmm2, %%xmm0\n" // add gridcorrection
195
"addps %%xmm3, %%xmm1\n" // add gridcorrection
196
"movaps %%xmm0, (%2)\n" // Store
197
"movaps %%xmm1, 16(%2)\n" // Store
198
"sub $4, %0\n" // size -=4
199
"add $32, %2\n" // outcur+=32
200
"add $32, %3\n" // gridsample+=32
201
"add $16, %4\n" // wsharpen+=16
203
"jg loop_sharpenonly_sse3\n"
204
: /* no output registers */
205
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
206
: /* %0 %1 %2 %3 %4 */
212
// Sharpen-only pass, SSE3 path restricted to xmm0-xmm7 — presumably the
// #else (non-__x86_64__) branch of the preprocessor conditional guarding the
// previous definition; the #else/#endif lines were lost in extraction — TODO
// confirm against upstream. Same math as the 64-bit variant: degrid,
// psd = re^2+im^2+1e-15, sfact = 1 + wsharpen[x]*sqrt(psd*ssMax /
// ((psd+ssMin)*(psd+ssMax))), re-add grid correction. Constants are
// reloaded from `temp` each iteration since only 8 xmm registers exist.
// NOTE(review): `asm volatile (` opener, clobber list and closing
// `);`/braces are missing here too.
void DeGridComplexFilter::processSharpenOnlySSE3(ComplexBlock* block) {
213
fftwf_complex* outcur = block->complex;
214
fftwf_complex* gridsample = grid->complex;
215
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
216
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
217
float *wsharpen = sharpenWindow->getLine(0);
219
for (int i = 0; i < 4; i++) {
220
temp[i+0] = 1e-15f; // 0
221
temp[i+4] = gridfraction; // 16
222
temp[i+8] = sigmaSquaredSharpenMin; // 32
223
temp[i+12] = sigmaSquaredSharpenMax; // 48
224
temp[i+16] = 1.0f; // 64
229
"loop_sharpenonly_sse3:\n"
230
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
231
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
232
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
233
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
234
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
236
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
237
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
238
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
239
"movaps %%xmm5, %%xmm3\n"
240
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
241
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
242
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
243
"movaps %%xmm1, %%xmm5\n"
245
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
246
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
247
"movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
248
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
249
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
250
"movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
252
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
253
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
254
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
255
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
256
"addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
258
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
259
"movaps (%4), %%xmm6\n" // load wsharpen[0->4]
260
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
261
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
262
"movaps 64(%1), %%xmm5\n" // Load "1.0"
263
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
264
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
265
"mulps %%xmm6, %%xmm7\n" // multiply wsharpen
266
"addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
267
"movaps %%xmm7, %%xmm5\n"
268
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
269
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
271
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
272
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
273
"addps %%xmm2, %%xmm0\n" // add gridcorrection
274
"addps %%xmm3, %%xmm1\n" // add gridcorrection
275
"movaps %%xmm0, (%2)\n" // Store
276
"movaps %%xmm1, 16(%2)\n" // Store
277
"sub $4, %0\n" // size -=4
278
"add $32, %2\n" // outcur+=32
279
"add $32, %3\n" // gridsample+=32
280
"add $16, %4\n" // wsharpen+=16
282
"jg loop_sharpenonly_sse3\n"
283
: /* no output registers */
284
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
285
: /* %0 %1 %2 %3 %4 */
289
// Sharpen-only pass, plain-SSE fallback (no SSE3 haddps available): the
// horizontal re^2+im^2 sum is built with a shufps pair (even lanes = re^2,
// odd lanes = im^2) followed by addps. Otherwise identical math to the
// SSE3 variants: degrid, psd, sfact = 1 + wsharpen[x]*sqrt(psd*ssMax /
// ((psd+ssMin)*(psd+ssMax))), re-add grid correction.
// NOTE(review): the `asm volatile (` opener, clobber list and closing
// `);`/braces were lost in extraction — restore from upstream.
void DeGridComplexFilter::processSharpenOnlySSE(ComplexBlock* block) {
290
fftwf_complex* outcur = block->complex;
291
fftwf_complex* gridsample = grid->complex;
292
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
293
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
294
float *wsharpen = sharpenWindow->getLine(0);
296
for (int i = 0; i < 4; i++) {
297
temp[i+0] = 1e-15f; // 0
298
temp[i+4] = gridfraction; // 16
299
temp[i+8] = sigmaSquaredSharpenMin; // 32
300
temp[i+12] = sigmaSquaredSharpenMax; // 48
301
temp[i+16] = 1.0f; // 64
306
"loop_sharpenonly_sse:\n"
307
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
308
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
309
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
310
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
311
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
313
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
314
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
315
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
316
"movaps %%xmm5, %%xmm3\n"
317
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
318
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
319
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
320
"movaps %%xmm1, %%xmm5\n"
322
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
323
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
325
"movaps %%xmm4, %%xmm7\n"
326
"shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
327
"shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
328
"movaps 32(%1), %%xmm6\n" // Move sigmaSquaredSharpenMin into xmm6
329
"addps %%xmm7, %%xmm4\n"
330
"movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
331
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
333
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
334
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
335
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
336
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
337
"addps %%xmm6, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
339
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
340
"movaps (%4), %%xmm6\n" // load wsharpen[0->4]
341
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
342
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
343
"movaps 64(%1), %%xmm5\n" // Load "1.0"
344
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
345
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
346
"mulps %%xmm6, %%xmm7\n" // multiply wsharpen
347
"addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
348
"movaps %%xmm7, %%xmm5\n"
349
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
350
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
352
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
353
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
354
"addps %%xmm2, %%xmm0\n" // add gridcorrection
355
"addps %%xmm3, %%xmm1\n" // add gridcorrection
356
"movaps %%xmm0, (%2)\n" // Store
357
"movaps %%xmm1, 16(%2)\n" // Store
358
"sub $4, %0\n" // size -=4
359
"add $32, %2\n" // outcur+=32
360
"add $32, %3\n" // gridsample+=32
361
"add $16, %4\n" // wsharpen+=16
363
"jg loop_sharpenonly_sse\n"
364
: /* no output registers */
365
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
366
: /* %0 %1 %2 %3 %4 */
369
#if defined (__x86_64__)
370
// Combined Wiener-denoise + sharpen pass, SSE3 path (x86-64 register file:
// all constants are hoisted into xmm9-xmm15 before the loop).  Per bin:
// degrid, psd = re^2+im^2+1e-15, then
//   WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit)
//   sfact = WienerFactor * (1 + wsharpen[x]*sqrt(psd*ssMax /
//           ((psd+ssMin)*(psd+ssMax))))
// scale the bin by sfact, re-add grid correction, store.
// NOTE(review): extraction dropped the opening `{`/`asm volatile (` and the
// clobber list / closing braces — restore from upstream before compiling.
void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
372
fftwf_complex* outcur = block->complex;
373
fftwf_complex* gridsample = grid->complex;
374
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
375
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
376
float *wsharpen = sharpenWindow->getLine(0);
378
for (int i = 0; i < 4; i++) {
379
temp[i+0] = 1e-15f; // 0
380
temp[i+4] = gridfraction; // 16
381
temp[i+8] = sigmaSquaredSharpenMin; // 32
382
temp[i+12] = sigmaSquaredSharpenMax; // 48
383
temp[i+16] = 1.0f; // 64
384
temp[i+20] = sigmaSquaredNoiseNormed; // 80
385
temp[i+24] = lowlimit; // 96
390
"movaps (%1), %%xmm15\n" // 1e-15f
391
"movaps 16(%1),%%xmm14\n" // Load gridfraction into xmm14
392
"movaps 32(%1), %%xmm10\n" //xmm10 sigmaSquaredSharpenMin
393
"movaps 48(%1), %%xmm11\n" // Move sigmaSquaredSharpenMax into xmm11
394
"movaps 64(%1), %%xmm9\n" // Load "1.0"
395
"movaps 80(%1), %%xmm13\n" //sigmaSquaredNoiseNormed in xmm13
396
"movaps 96(%1), %%xmm12\n" // xmm12 = lowlimit
397
"loop_wienerdegridsharpen_sse3:\n"
398
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
399
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
400
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
401
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
403
"mulps %%xmm14, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
404
"mulps %%xmm14, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
405
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
406
"movaps %%xmm5, %%xmm3\n"
407
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
408
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
409
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
410
"movaps %%xmm1, %%xmm5\n"
412
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
413
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
414
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
415
"addps %%xmm15, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
417
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
419
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
420
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
421
"subps %%xmm13, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
422
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
423
"maxps %%xmm12, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
425
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
426
"movaps %%xmm11, %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
427
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
428
"addps %%xmm11, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
429
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
430
"addps %%xmm10, %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
432
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
433
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
434
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
435
"rsqrtps %%xmm7, %%xmm7\n" // 1 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
436
"rcpps %%xmm7, %%xmm7\n" // sqrt(...)
437
"mulps (%4), %%xmm7\n" // multiply wsharpen
438
"addps %%xmm9, %%xmm7\n" // + 1.0 xmm7 = sfact
439
"mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
440
"movaps %%xmm7, %%xmm5\n"
441
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
442
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
444
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
445
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
446
"addps %%xmm2, %%xmm0\n" // add gridcorrection
447
"addps %%xmm3, %%xmm1\n" // add gridcorrection
448
"movaps %%xmm0, (%2)\n" // Store
449
"movaps %%xmm1, 16(%2)\n" // Store
450
"sub $4, %0\n" // size -=4
451
"add $32, %2\n" // outcur+=32
452
"add $32, %3\n" // gridsample+=32
453
"add $16, %4\n" // wsharpen+=16
455
"jg loop_wienerdegridsharpen_sse3\n"
456
: /* no output registers */
457
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
458
: /* %0 %1 %2 %3 %4 */
464
// Combined Wiener-denoise + sharpen pass, SSE3 path restricted to xmm0-xmm7
// — presumably the #else (non-__x86_64__) branch of the conditional guarding
// the previous definition; the #else/#endif lines were lost in extraction —
// TODO confirm upstream.  Same math: WienerFactor = MAX((psd-ssnn)/psd,
// lowlimit); sfact = WienerFactor * (1 + wsharpen[x]*sqrt(psd*ssMax /
// ((psd+ssMin)*(psd+ssMax)))).  Constants are re-read from `temp` each
// iteration since only 8 xmm registers are available.
// NOTE(review): `asm volatile (` opener, clobber list and closing
// `);`/braces are missing here as well.
void ComplexWienerFilterDeGrid::processSharpen_SSE3( ComplexBlock* block )
466
fftwf_complex* outcur = block->complex;
467
fftwf_complex* gridsample = grid->complex;
468
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
469
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
470
float *wsharpen = sharpenWindow->getLine(0);
472
for (int i = 0; i < 4; i++) {
473
temp[i+0] = 1e-15f; // 0
474
temp[i+4] = gridfraction; // 16
475
temp[i+8] = sigmaSquaredSharpenMin; // 32
476
temp[i+12] = sigmaSquaredSharpenMax; // 48
477
temp[i+16] = 1.0f; // 64
478
temp[i+20] = sigmaSquaredNoiseNormed; // 80
479
temp[i+24] = lowlimit; // 96
484
"loop_wienerdegridsharpen_sse3:\n"
485
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
486
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
487
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
488
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
489
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
491
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
492
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
493
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
494
"movaps %%xmm5, %%xmm3\n"
495
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
496
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
497
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
498
"movaps %%xmm1, %%xmm5\n"
500
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
501
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
502
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 r4+i4 (all squared) (SSE3!) - xmm 5 free
503
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
505
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
507
"movaps 80(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
508
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
509
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
510
"subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
511
"movaps 96(%1), %%xmm5\n" // xmm5 = lowlimit
512
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
513
"maxps %%xmm5, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
515
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
516
"movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
517
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
518
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
519
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
520
"addps 32(%1), %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
522
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
523
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
524
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
525
"movaps 64(%1), %%xmm5\n" // Load "1.0"
526
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
527
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
528
"mulps (%4), %%xmm7\n" // multiply wsharpen
529
"addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
530
"mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
531
"movaps %%xmm7, %%xmm5\n"
532
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
533
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
535
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
536
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
537
"addps %%xmm2, %%xmm0\n" // add gridcorrection
538
"addps %%xmm3, %%xmm1\n" // add gridcorrection
539
"movaps %%xmm0, (%2)\n" // Store
540
"movaps %%xmm1, 16(%2)\n" // Store
541
"sub $4, %0\n" // size -=4
542
"add $32, %2\n" // outcur+=32
543
"add $32, %3\n" // gridsample+=32
544
"add $16, %4\n" // wsharpen+=16
546
"jg loop_wienerdegridsharpen_sse3\n"
547
: /* no output registers */
548
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
549
: /* %0 %1 %2 %3 %4 */
554
// Combined Wiener-denoise + sharpen pass, plain-SSE fallback: replaces the
// SSE3 haddps horizontal add with a shufps pair (even lanes = re^2, odd
// lanes = im^2) + addps.  Same math as the SSE3 variants:
//   WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit)
//   sfact = WienerFactor * (1 + wsharpen[x]*sqrt(psd*ssMax /
//           ((psd+ssMin)*(psd+ssMax))))
// NOTE(review): `asm volatile (` opener, clobber list and closing
// `);`/braces were lost in extraction — restore from upstream.
void ComplexWienerFilterDeGrid::processSharpen_SSE( ComplexBlock* block )
556
fftwf_complex* outcur = block->complex;
557
fftwf_complex* gridsample = grid->complex;
558
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
559
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
560
float *wsharpen = sharpenWindow->getLine(0);
562
for (int i = 0; i < 4; i++) {
563
temp[i+0] = 1e-15f; // 0
564
temp[i+4] = gridfraction; // 16
565
temp[i+8] = sigmaSquaredSharpenMin; // 32
566
temp[i+12] = sigmaSquaredSharpenMax; // 48
567
temp[i+16] = 1.0f; // 64
568
temp[i+20] = sigmaSquaredNoiseNormed; // 80 (byte offset; the asm reads 80(%1) — original comment said 72)
569
temp[i+24] = lowlimit; // 96
574
"loop_wienerdegridsharpen_sse:\n"
575
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
576
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
577
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
578
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
579
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
581
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
582
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
583
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
584
"movaps %%xmm5, %%xmm3\n"
585
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
586
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
587
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
588
"movaps %%xmm1, %%xmm5\n"
590
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
591
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
592
"movaps %%xmm4, %%xmm7\n"
593
"shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
594
"shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
595
"addps %%xmm7, %%xmm4\n"
598
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
600
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
602
"movaps 80(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
603
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
604
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
605
"subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
606
"movaps 96(%1), %%xmm5\n" // xmm5 = lowlimit
607
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
608
"maxps %%xmm5, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
610
// float sfact = (1 + wsharpen[x]*sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) )) ;
611
"movaps 48(%1), %%xmm7\n" // Move sigmaSquaredSharpenMax into xmm7
612
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
613
"addps %%xmm7, %%xmm4\n" // xmm4 = psd + sigmaSquaredSharpenMax
614
"mulps %%xmm5, %%xmm7\n" // xmm7 = psd*sigmaSquaredSharpenMax
615
"addps 32(%1), %%xmm5\n" //xmm5 = psd + sigmaSquaredSharpenMin //xmm6 free
617
"mulps %%xmm4, %%xmm5\n" // (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) xmm4 free
618
"rcpps %%xmm5, %%xmm5\n" // 1 / (psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax) (stall)
619
"mulps %%xmm5, %%xmm7\n" // psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax)) - xmm5 free
620
"movaps 64(%1), %%xmm5\n" // Load "1.0"
621
"rsqrtps %%xmm7, %%xmm7\n" // 1.0 / sqrt( psd*sigmaSquaredSharpenMax/((psd + sigmaSquaredSharpenMin)*(psd + sigmaSquaredSharpenMax))
622
"rcpps %%xmm7, %%xmm7\n" // sqrt (..)
623
"mulps (%4), %%xmm7\n" // multiply wsharpen
624
"addps %%xmm5, %%xmm7\n" // + 1.0 xmm7 = sfact
625
"mulps %%xmm6, %%xmm7\n" // *= Wienerfactor
626
"movaps %%xmm7, %%xmm5\n"
627
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
628
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
630
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
631
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
632
"addps %%xmm2, %%xmm0\n" // add gridcorrection
633
"addps %%xmm3, %%xmm1\n" // add gridcorrection
634
"movaps %%xmm0, (%2)\n" // Store
635
"movaps %%xmm1, 16(%2)\n" // Store
636
"sub $4, %0\n" // size -=4
637
"add $32, %2\n" // outcur+=32
638
"add $32, %3\n" // gridsample+=32
639
"add $16, %4\n" // wsharpen+=16
641
"jg loop_wienerdegridsharpen_sse\n"
642
: /* no output registers */
643
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample), "r"(wsharpen)
644
: /* %0 %1 %2 %3 %4 */
649
// Wiener-denoise-only pass (no sharpening), plain-SSE fallback.  Per bin:
// degrid, psd = re^2+im^2+1e-15 (shufps-pair horizontal add, no SSE3),
// scale by WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd,
// lowlimit), re-add grid correction, store.  No wsharpen window is used, so
// the asm takes only four operands (%0..%3) and `temp` holds just the four
// splatted constants (offsets 0/16/32/48).
// NOTE(review): `asm volatile (` opener, clobber list and closing
// `);`/braces were lost in extraction — restore from upstream.
void ComplexWienerFilterDeGrid::processNoSharpen_SSE( ComplexBlock* block )
651
fftwf_complex* outcur = block->complex;
652
fftwf_complex* gridsample = grid->complex;
653
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
654
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
656
for (int i = 0; i < 4; i++) {
657
temp[i+0] = 1e-15f; // 0
658
temp[i+4] = gridfraction; // 16
659
temp[i+8] = sigmaSquaredNoiseNormed; // 32
660
temp[i+12] = lowlimit; // 48
665
"loop_wienerdegridnosharpen_sse:\n"
666
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
667
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
668
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
669
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
670
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
672
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
673
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
674
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
675
"movaps %%xmm5, %%xmm3\n"
676
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
677
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
678
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
679
"movaps %%xmm1, %%xmm5\n"
681
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
682
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
683
"movaps %%xmm4, %%xmm7\n"
684
"shufps $136, %%xmm5, %%xmm4\n" // xmm7 r0r1 r2r3 [10 00 10 00 = 136]
685
"shufps $221, %%xmm5, %%xmm7\n" // xmm6 i0i1 i2i3 [11 01 11 01 = 221]
686
"addps %%xmm7, %%xmm4\n"
688
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
690
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
692
"movaps 32(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
693
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
694
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
695
"subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
696
"movaps 48(%1), %%xmm5\n" // xmm5 = lowlimit
697
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
698
"maxps %%xmm6, %%xmm5\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
700
"movaps %%xmm5, %%xmm7\n"
701
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
702
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
704
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
705
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
706
"addps %%xmm2, %%xmm0\n" // add gridcorrection
707
"addps %%xmm3, %%xmm1\n" // add gridcorrection
708
"movaps %%xmm0, (%2)\n" // Store
709
"movaps %%xmm1, 16(%2)\n" // Store
710
"sub $4, %0\n" // size -=4
711
"add $32, %2\n" // outcur+=32
712
"add $32, %3\n" // gridsample+=32
714
"jg loop_wienerdegridnosharpen_sse\n"
715
: /* no output registers */
716
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
721
#if defined (__x86_64__)
722
// SSE3 (x86_64) variant of the limited Wiener/degrid filter: same math as the
// SSE path but uses haddps for the re^2+im^2 reduction and registers xmm8-xmm15.
// Two code paths: when size is a multiple of 8 an unrolled loop handles 8
// complex samples per iteration; otherwise a 4-wide loop is used.
// temp[] layout: +0: 1e-15, +16: gridfraction, +32: sigmaSquaredNoiseNormed,
// +48: lowlimit (each replicated 4x).
// NOTE(review): corrupted dump — original line numbers interleaved as bare
// lines; the `asm volatile (` headers, the `else` branch glue and the closing
// `); }` are among the dropped lines. Code text preserved byte-for-byte.
void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
724
fftwf_complex* outcur = block->complex;
725
fftwf_complex* gridsample = grid->complex;
726
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
727
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
729
for (int i = 0; i < 4; i++) {
730
temp[i+0] = 1e-15f; // 0
731
temp[i+4] = gridfraction; // 16
732
temp[i+8] = sigmaSquaredNoiseNormed; // 32
733
temp[i+12] = lowlimit; // 48
736
if ((size & 7) == 0) { //TODO: Bench me to see if I'm faster
739
"movaps (%1), %%xmm14\n" // xmm14: 1e-15
740
"movaps 16(%1), %%xmm15\n" // Load gridfraction into xmm15
741
"loop_wienerdegridnosharpen_sse3_big:\n"
742
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
743
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
744
"movaps 32(%2), %%xmm8\n" // in r4i4 r5i5
745
"movaps 48(%2), %%xmm9\n" //in r6i6 r7i7
746
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
747
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
748
"movaps 32(%3), %%xmm10\n" // grid r4i4 r5i5
749
"movaps 48(%3), %%xmm11\n" // grid r6i6 r7i7
751
"mulps %%xmm15, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
752
"mulps %%xmm15, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
753
"mulps %%xmm15, %%xmm10\n" //grid r4*gf i4*gf r5*gf i5*gf
754
"mulps %%xmm15, %%xmm11\n" //grid r6*gf i6*gf r7*gf i7*gf
755
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
756
"movaps %%xmm5, %%xmm3\n"
757
"movaps %%xmm10, %%xmm12\n" // maintain gridcorrection in memory
758
"movaps %%xmm11, %%xmm13\n"
759
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
760
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
761
"subps %%xmm10, %%xmm8\n" // re4 im4 re5 im5 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
762
"subps %%xmm11, %%xmm9\n" // re6 im6 re7 im7 -
763
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
764
"movaps %%xmm1, %%xmm5\n"
765
"movaps %%xmm8, %%xmm10\n" // copy re4+im4
766
"movaps %%xmm9, %%xmm11\n"
768
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
769
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
770
"mulps %%xmm10, %%xmm10\n" //r4i4 r5i5 squared
771
"mulps %%xmm11, %%xmm11\n" //r6i6 r7i7 squared
772
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm 5 free
773
"haddps %%xmm11, %%xmm10\n" //r4+i4 r5+i5 r6+i6 r7+i7 (all squared) (SSE3!) - xmm 11 free
775
"addps %%xmm14, %%xmm4\n" // add 1e-15 (xmm4: psd for pixels 0-3)
776
"addps %%xmm14, %%xmm10\n" // add 1e-15 (xmm10: psd for pixels 4-7)
778
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
780
"movaps %%xmm4, %%xmm5\n" // Copy psd into xmm5
781
"movaps %%xmm10, %%xmm11\n" // Copy psd into xmm11
783
"rcpps %%xmm4, %%xmm4\n" // xmm4: (1 / psd)
784
"movaps 32(%1), %%xmm7\n" // xmm7 = sigmaSquaredNoiseNormed
785
"rcpps %%xmm10, %%xmm10\n" // xmm10: (1 / psd)
786
"subps %%xmm7, %%xmm5\n" // xmm5 (psd) - xmm7 (ssnn)
787
"subps %%xmm7, %%xmm11\n" // xmm11 (psd) - xmm7 (ssnn)
788
"movaps 48(%1), %%xmm7\n" // xmm7 = lowlimit
789
"mulps %%xmm4, %%xmm5\n" // xmm5 = (psd - sigmaSquaredNoiseNormed)/psd
790
"mulps %%xmm10, %%xmm11\n" // xmm11 = (psd - sigmaSquaredNoiseNormed)/psd
791
"maxps %%xmm7, %%xmm5\n" // xmm5 = Wienerfactor = MAX(xmm5, lowlimit)
792
"maxps %%xmm7, %%xmm11\n" // xmm11 = Wienerfactor = MAX(xmm11, lowlimit)
794
"movaps %%xmm5, %%xmm7\n"
795
"movaps %%xmm11, %%xmm10\n"
796
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
797
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
798
"unpckhps %%xmm11, %%xmm11\n" // unpack high to xmm11
799
"unpcklps %%xmm10, %%xmm10\n" // unpack low to xmm10
801
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
802
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
803
"mulps %%xmm10, %%xmm8\n" // re+im *= sfact
804
"mulps %%xmm11, %%xmm9\n" // re+im *= sfact
805
"addps %%xmm2, %%xmm0\n" // add gridcorrection
806
"addps %%xmm3, %%xmm1\n" // add gridcorrection
807
"addps %%xmm12, %%xmm8\n" // add gridcorrection
808
"addps %%xmm13, %%xmm9\n" // add gridcorrection
809
"movaps %%xmm0, (%2)\n" // Store
810
"movaps %%xmm1, 16(%2)\n" // Store
811
"movaps %%xmm8, 32(%2)\n" // Store
812
"movaps %%xmm9, 48(%2)\n" // Store
813
"sub $8, %0\n" // size -=8
814
"add $64, %2\n" // outcur+=64
815
"add $64, %3\n" // gridsample+=64
817
"jg loop_wienerdegridnosharpen_sse3_big\n"
818
: /* no output registers */
819
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
825
// 4-wide fallback loop (size not a multiple of 8). Constants are hoisted into
// xmm12-xmm15 before the loop; loop body mirrors the SSE path but uses haddps.
"movaps (%1), %%xmm14\n" // xmm14: 1e-15
826
"movaps 16(%1), %%xmm15\n" // Load gridfraction into xmm15
827
"movaps 32(%1), %%xmm13\n" //sigmaSquaredNoiseNormed in xmm13
828
"movaps 48(%1), %%xmm12\n" // xmm12 = lowlimit
829
"loop_wienerdegridnosharpen_sse3:\n"
830
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
831
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
832
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
833
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
835
"mulps %%xmm15, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
836
"mulps %%xmm15, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
837
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
838
"movaps %%xmm5, %%xmm3\n"
839
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
840
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
841
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
842
"movaps %%xmm1, %%xmm5\n"
844
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
845
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
846
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm 5 free
848
"addps %%xmm14, %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
850
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
852
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
853
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
854
"subps %%xmm13, %%xmm6\n" // xmm6 (psd) - xmm13 (ssnn)
855
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
856
"maxps %%xmm12, %%xmm6\n" // xmm6 = Wienerfactor = MAX(xmm6, lowlimit)
858
"movaps %%xmm6, %%xmm7\n"
859
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
860
"unpckhps %%xmm6, %%xmm6\n" // unpack high to xmm6
862
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
863
"mulps %%xmm6, %%xmm1\n" // re+im *= sfact
864
"addps %%xmm2, %%xmm0\n" // add gridcorrection
865
"addps %%xmm3, %%xmm1\n" // add gridcorrection
866
"movaps %%xmm0, (%2)\n" // Store
867
"movaps %%xmm1, 16(%2)\n" // Store
868
"sub $4, %0\n" // size -=4
869
"add $32, %2\n" // outcur+=32
870
"add $32, %3\n" // gridsample+=32
872
"jg loop_wienerdegridnosharpen_sse3\n"
873
: /* no output registers */
874
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
881
// Second definition of processNoSharpen_SSE3, using only xmm0-xmm7 —
// presumably the 32-bit (#else branch of the `#if defined (__x86_64__)` above)
// variant; the `#else` itself is among the lines dropped from this dump —
// TODO(review): confirm against the original file. Same algorithm as the SSE
// path but with an SSE3 haddps reduction for re^2+im^2.
// temp[] layout: +0: 1e-15, +16: gridfraction, +32: sigmaSquaredNoiseNormed,
// +48: lowlimit (each replicated 4x).
// NOTE(review): corrupted dump — interleaved line numbers and missing
// brace/asm delimiter lines; code text preserved byte-for-byte.
void ComplexWienerFilterDeGrid::processNoSharpen_SSE3( ComplexBlock* block )
883
fftwf_complex* outcur = block->complex;
884
fftwf_complex* gridsample = grid->complex;
885
float gridfraction = degrid*outcur[0][0]/gridsample[0][0];
886
float* temp = block->temp->data; // Get aligned temp area, at least 256 bytes, only used by this thread.
888
for (int i = 0; i < 4; i++) {
889
temp[i+0] = 1e-15f; // 0
890
temp[i+4] = gridfraction; // 16
891
temp[i+8] = sigmaSquaredNoiseNormed; // 32
892
temp[i+12] = lowlimit; // 48
897
"loop_wienerdegridnosharpen_sse3:\n"
898
"movaps 16(%1),%%xmm6\n" // Load gridfraction into xmm6
899
"movaps (%2), %%xmm0\n" // in r0i0 r1i1
900
"movaps 16(%2), %%xmm1\n" //in r2i2 r3i3
901
"movaps (%3), %%xmm4\n" // grid r0i0 r1i1
902
"movaps 16(%3), %%xmm5\n" // grid r2i2 r3i3
904
"mulps %%xmm6, %%xmm4\n" //grid r0*gf i0*gf r1*gf i1*gf (xmm4: gridcorrection0 + 1)
905
"mulps %%xmm6, %%xmm5\n" //grid r2*gf i2*gf r3*gf i3*gf (gridfraction*gridsample[x])
906
"movaps %%xmm4, %%xmm2\n" // maintain gridcorrection in memory
907
"movaps %%xmm5, %%xmm3\n"
908
"subps %%xmm4, %%xmm0\n" // re0 im0 re1 im1 (re = outcur[x][0] - gridcorrection0;, etc) (xmm0 - xmm4)
909
"subps %%xmm5, %%xmm1\n" // re2 im2 re3 im3 -
910
"movaps %%xmm0, %%xmm4\n" // copy re0+im0 ... into xmm4 and 5, xmm0 & 1 retained
911
"movaps %%xmm1, %%xmm5\n"
913
"mulps %%xmm4, %%xmm4\n" //r0i0 r1i1 squared
914
"mulps %%xmm5, %%xmm5\n" //r2i2 r3i3 squared
915
"haddps %%xmm5, %%xmm4\n" //r0+i0 r1+i1 r2+i2 r3+i3 (all squared) (SSE3!) - xmm 5 free
917
"addps (%1), %%xmm4\n" // add 1e-15 (xmm4: psd for all 4 pixels)
919
//WienerFactor = MAX((psd - sigmaSquaredNoiseNormed)/psd, lowlimit); // limited Wiener filter
921
"movaps 32(%1), %%xmm5\n" //sigmaSquaredNoiseNormed in xmm5
922
"movaps %%xmm4, %%xmm6\n" // Copy psd into xmm6
923
"rcpps %%xmm4, %%xmm7\n" // xmm7: (1 / psd)
924
"subps %%xmm5, %%xmm6\n" // xmm6 (psd) - xmm5 (ssnn) xmm5 free
925
"movaps 48(%1), %%xmm5\n" // xmm5 = lowlimit
926
"mulps %%xmm7, %%xmm6\n" // xmm6 = (psd - sigmaSquaredNoiseNormed)/psd
927
"maxps %%xmm6, %%xmm5\n" // xmm5 = Wienerfactor = MAX(xmm6, lowlimit)
929
"movaps %%xmm5, %%xmm7\n"
930
"unpcklps %%xmm7, %%xmm7\n" // unpack low to xmm7
931
"unpckhps %%xmm5, %%xmm5\n" // unpack high to xmm5
933
"mulps %%xmm7, %%xmm0\n" // re+im *= sfact
934
"mulps %%xmm5, %%xmm1\n" // re+im *= sfact
935
"addps %%xmm2, %%xmm0\n" // add gridcorrection
936
"addps %%xmm3, %%xmm1\n" // add gridcorrection
937
"movaps %%xmm0, (%2)\n" // Store
938
"movaps %%xmm1, 16(%2)\n" // Store
939
"sub $4, %0\n" // size -=4
940
"add $32, %2\n" // outcur+=32
941
"add $32, %3\n" // gridsample+=32
943
"jg loop_wienerdegridnosharpen_sse3\n"
944
: /* no output registers */
945
: "r" (size), "r" (temp), "r" (outcur), "r" (gridsample)
951
#endif // defined (__i386__) || defined (__x86_64__)
953
}}// namespace RawStudio::FFTFilter