~ubuntu-branches/ubuntu/vivid/rawstudio/vivid

Viewing changes to plugins/lensfun/lensfun-sse2.c

  • Committer: Bazaar Package Importer
  • Author(s): Bernd Zeimetz
  • Date: 2011-07-28 17:36:32 UTC
  • mfrom: (2.1.11 upstream)
  • Revision ID: james.westby@ubuntu.com-20110728173632-5czluz9ye3c83zc5
Tags: 2.0-1
* [3750b2cf] Merge commit 'upstream/2.0'
* [63637468] Removing Patch, not necessary anymore.
* [2fb580dc] Add new build-dependencies.
* [c57d953b] Run dh_autoreconf due to patches in configure.in
* [13febe39] Add patch to remove the libssl requirement.
* [5ae773fe] Replace libjpeg62-dev with libjpeg8-dev.
* [1969d755] Don't build static libraries.
* [7cfe0a2e] Add a patch to fix the plugin directory path.
  As plugins are shared libraries, they need to go into /usr/lib,
  not into /usr/share. Thanks to Andrew McMillan.
* [c1d0d9dd] Don't install .la files for all plugins and libraries.

/*
 * * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
 * * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

/* Plugin tmpl version 4 */

#include <rawstudio.h>
#include <lensfun.h>

#if defined (__SSE2__)

#include <emmintrin.h>
#include <stdint.h>	/* uintptr_t, used for the alignment test below */

static gfloat twofiftytwo_ps[4] __attribute__ ((aligned (16))) = {256.0f, 256.0f, 256.0f, 0.0f};
static gint _zero12[4] __attribute__ ((aligned (16))) = {0,1,2,0};
static gint _max_coord[4] __attribute__ ((aligned (16))) = {65536,65536,65536,65536};
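
/* Values above: twofiftytwo_ps holds 256.0f per channel (despite its
 * name) and scales float coordinates into 24.8 fixed point; _zero12
 * holds the R/G/B offsets inside a 4-gushort pixel; 65536 is the
 * "untouched" sentinel for the max entries of min_max_xy below. */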
 
gboolean is_sse2_compiled(void)
{
        return TRUE;
}

void
rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos, const gint *current_xy, const gint* min_max_xy)
{
        const gint m_w = (in->w-1);
        const gint m_h = (in->h-1);

        __m128 p0, p1;
        if ((uintptr_t)pos & 15)
        {
                p0 = _mm_loadu_ps(pos);         // y1x1 y0x0
                p1 = _mm_loadu_ps(pos+4);       // ---- y2x2
        } else
        {
                p0 = _mm_load_ps(pos);          // y1x1 y0x0
                p1 = _mm_load_ps(pos+4);        // ---- y2x2
        }
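
        /* pos carries three (x,y) source coordinates, one per colour
           channel, so corrections such as TCA can sample R, G and B
           from slightly different spots; unaligned loads are used only
           when the caller's buffer is not 16-byte aligned. */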
 
        // to x2x2 x1x0
        __m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
        // to y2y2 y1y0
        __m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));

        __m128 fl256 = _mm_load_ps(twofiftytwo_ps);
        xf = _mm_mul_ps(xf, fl256);
        yf = _mm_mul_ps(yf, fl256);
        __m128i x = _mm_cvttps_epi32(xf);
        __m128i y = _mm_cvttps_epi32(yf);

        __m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
        __m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);

        __m128i x_gt, y_gt;

        /* Clamping */
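        /* SSE2 has no blend instruction, so the or/andnot/and pattern
           below is a branchless select: keep each lane where the compare
           failed, substitute the limit (also in 24.8) where it passed. */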
 
        x_gt = _mm_cmpgt_epi32(x, _m_w);
        y_gt = _mm_cmpgt_epi32(y, _m_h);

        x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
        y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
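
        /* Measuring pass bookkeeping: when a sample had to be clamped,
           record the current position (current_xy) in min_max_xy. A max
           entry is written only while it still holds the 65536 sentinel,
           i.e. on the first overflow per channel, while min entries are
           refreshed on every underflow. (The const on min_max_xy is cast
           away by the stores below.) */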
 
        __m128i current_pos = _mm_loadl_epi64((__m128i*)current_xy);
        __m128i current_x = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(0,0,0,0));
        __m128i current_y = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(1,1,1,1));
        __m128i max_x = _mm_load_si128((__m128i*)&min_max_xy[8]);
        __m128i max_y = _mm_load_si128((__m128i*)&min_max_xy[12]);
        __m128i max_coord = _mm_load_si128((__m128i*)_max_coord);
        __m128i eq_max_x = _mm_cmpeq_epi32(max_coord, max_x);
        __m128i eq_max_y = _mm_cmpeq_epi32(max_coord, max_y);
        x_gt = _mm_and_si128(x_gt, eq_max_x);
        y_gt = _mm_and_si128(y_gt, eq_max_y);
        __m128i insert_x = _mm_and_si128(x_gt, current_x);
        __m128i insert_y = _mm_and_si128(y_gt, current_y);
        max_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_gt, max_x));
        max_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_gt, max_y));
        _mm_store_si128((__m128i*)&min_max_xy[8], max_x);
        _mm_store_si128((__m128i*)&min_max_xy[12], max_y);

        __m128i zero = _mm_setzero_si128();
        __m128i x_lt = _mm_cmplt_epi32(x, zero);
        __m128i y_lt = _mm_cmplt_epi32(y, zero);
        x = _mm_andnot_si128(x_lt, x);
        y = _mm_andnot_si128(y_lt, y);
        __m128i min_x = _mm_load_si128((__m128i*)&min_max_xy[0]);
        __m128i min_y = _mm_load_si128((__m128i*)&min_max_xy[4]);
        insert_x = _mm_and_si128(x_lt, current_x);
        insert_y = _mm_and_si128(y_lt, current_y);
        min_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_lt, min_x));
        min_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_lt, min_y));
        _mm_store_si128((__m128i*)&min_max_xy[0], min_x);
        _mm_store_si128((__m128i*)&min_max_xy[4], min_y);
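
        /* Integer coordinates of the bilinear neighbour one pixel to the
           right and below: shift away the 8 fraction bits, then add 1. */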
 
        __m128i one = _mm_set1_epi32(1);
        __m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
        __m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));

        /* Check that 'next' pixels are in bounds */
        _m_w = _mm_srai_epi32(_m_w, 8);
        _m_h = _mm_srai_epi32(_m_h, 8);

        x_gt = _mm_cmpgt_epi32(nx, _m_w);
        y_gt = _mm_cmpgt_epi32(ny, _m_h);

        nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
        ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));

        int xfer[16] __attribute__ ((aligned (16)));

        /* Pitch as pixels */
        __m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >> 2)<<16));
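
        /* rowstride appears to be counted in gushorts, so >>2 (pixelsize
           4) yields the pitch in whole pixels. The same 16-bit value is
           placed in both halves of each lane so the mullo/mulhi pairs
           below can assemble full 32-bit y*pitch products; SSE2 has no
           packed 32-bit multiply (_mm_mullo_epi32 arrived with SSE4.1). */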
 
        /* Remove remainder */
        __m128i tx = _mm_srai_epi32(x, 8);
        __m128i ty = _mm_srai_epi32(y, 8);

        /* Multiply y by pitch */
        ty = _mm_packs_epi32(ty, ty);
        __m128i ty_lo = _mm_mullo_epi16(ty, pitch);
        __m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
        ty = _mm_unpacklo_epi16(ty_lo, ty_hi);

        /* Same for the next pixel */
        ny = _mm_packs_epi32(ny, ny);
        __m128i ny_lo = _mm_mullo_epi16(ny, pitch);
        __m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
        ny = _mm_unpacklo_epi16(ny_lo, ny_hi);

        /* Add pitch and x offset */
        __m128i a_offset = _mm_add_epi32(tx, ty);
        __m128i b_offset = _mm_add_epi32(nx, ty);
        __m128i c_offset = _mm_add_epi32(tx, ny);
        __m128i d_offset = _mm_add_epi32(nx, ny);

        /* Multiply by pixelsize and add RGB offsets */
        __m128i zero12 = _mm_load_si128((__m128i*)_zero12);
        a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
        b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
        c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
        d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));

        _mm_store_si128((__m128i*)xfer, a_offset);
        _mm_store_si128((__m128i*)&xfer[4], b_offset);
        _mm_store_si128((__m128i*)&xfer[8], c_offset);
        _mm_store_si128((__m128i*)&xfer[12], d_offset);

        gushort* pixels[12];

        /* Loop unrolled, allows aggressive instruction reordering */
        /* Red, then G & B */
        pixels[0] = in->pixels + xfer[0];       // a
        pixels[1] = in->pixels + xfer[4];       // b
        pixels[2] = in->pixels + xfer[8];       // c
        pixels[3] = in->pixels + xfer[12];      // d

        pixels[4] = in->pixels + xfer[1+0];     // a
        pixels[5] = in->pixels + xfer[1+4];     // b
        pixels[6] = in->pixels + xfer[1+8];     // c
        pixels[7] = in->pixels + xfer[1+12];    // d

        pixels[8] = in->pixels + xfer[2+0];     // a
        pixels[9] = in->pixels + xfer[2+4];     // b
        pixels[10] = in->pixels + xfer[2+8];    // c
        pixels[11] = in->pixels + xfer[2+12];   // d
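
        /* pixels[0..3] are the four red neighbours (a=top-left,
           b=top-right, c=bottom-left, d=bottom-right), pixels[4..7] the
           green ones, pixels[8..11] the blue; xfer[] holds offsets in
           gushorts. */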
 
        /* Calculate distances */
        __m128i twofiftyfive = _mm_set1_epi32(255);
        __m128i diffx = _mm_and_si128(x, twofiftyfive);
        __m128i diffy = _mm_and_si128(y, twofiftyfive);
        __m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
        __m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);

        /* Calculate weights */
        __m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
        __m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
        __m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
        __m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);
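
        /* Classic bilinear weights built from the 8-bit fractions (dx
           and 255-dx, etc.), halved so the scalar weighted sums below
           stay inside a signed 32-bit int; the +16384 / >>15 pair then
           normalises with rounding. */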
 
        _mm_store_si128((__m128i*)xfer, aw);
        _mm_store_si128((__m128i*)&xfer[4], bw);
        _mm_store_si128((__m128i*)&xfer[8], cw);
        _mm_store_si128((__m128i*)&xfer[12], dw);

        gushort** p = pixels;
        /* Loop unrolled */
        out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3] + 16384) >> 15 );
        p+=4;
        out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
        p+=4;
        out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
}
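
/* Same sampler as rs_image16_bilinear_full_sse2() above, minus the
   min/max bookkeeping, apparently for passes that no longer need the
   measurement. */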
 
void
rs_image16_bilinear_nomeasure_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
{
        const gint m_w = (in->w-1);
        const gint m_h = (in->h-1);

        __m128 p0, p1;
        if ((uintptr_t)pos & 15)
        {
                p0 = _mm_loadu_ps(pos);         // y1x1 y0x0
                p1 = _mm_loadu_ps(pos+4);       // ---- y2x2
        } else
        {
                p0 = _mm_load_ps(pos);          // y1x1 y0x0
                p1 = _mm_load_ps(pos+4);        // ---- y2x2
        }

        // to x2x2 x1x0
        __m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
        // to y2y2 y1y0
        __m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));

        __m128 fl256 = _mm_load_ps(twofiftytwo_ps);
        xf = _mm_mul_ps(xf, fl256);
        yf = _mm_mul_ps(yf, fl256);
        __m128i x = _mm_cvttps_epi32(xf);
        __m128i y = _mm_cvttps_epi32(yf);

        __m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
        __m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);

        __m128i x_gt, y_gt;

        /* Clamping */
        x_gt = _mm_cmpgt_epi32(x, _m_w);
        y_gt = _mm_cmpgt_epi32(y, _m_h);

        x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
        y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));

        __m128i zero = _mm_setzero_si128();
        __m128i x_lt = _mm_cmplt_epi32(x, zero);
        __m128i y_lt = _mm_cmplt_epi32(y, zero);
        x = _mm_andnot_si128(x_lt, x);
        y = _mm_andnot_si128(y_lt, y);

        __m128i one = _mm_set1_epi32(1);
        __m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
        __m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));

        /* Check that 'next' pixels are in bounds */
        _m_w = _mm_srai_epi32(_m_w, 8);
        _m_h = _mm_srai_epi32(_m_h, 8);

        x_gt = _mm_cmpgt_epi32(nx, _m_w);
        y_gt = _mm_cmpgt_epi32(ny, _m_h);

        nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
        ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));

        int xfer[16] __attribute__ ((aligned (16)));

        /* Pitch as pixels */
        __m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >> 2)<<16));

        /* Remove remainder */
        __m128i tx = _mm_srai_epi32(x, 8);
        __m128i ty = _mm_srai_epi32(y, 8);

        /* Multiply y by pitch */
        ty = _mm_packs_epi32(ty, ty);
        __m128i ty_lo = _mm_mullo_epi16(ty, pitch);
        __m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
        ty = _mm_unpacklo_epi16(ty_lo, ty_hi);

        /* Same for the next pixel */
        ny = _mm_packs_epi32(ny, ny);
        __m128i ny_lo = _mm_mullo_epi16(ny, pitch);
        __m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
        ny = _mm_unpacklo_epi16(ny_lo, ny_hi);

        /* Add pitch and x offset */
        __m128i a_offset = _mm_add_epi32(tx, ty);
        __m128i b_offset = _mm_add_epi32(nx, ty);
        __m128i c_offset = _mm_add_epi32(tx, ny);
        __m128i d_offset = _mm_add_epi32(nx, ny);

        /* Multiply by pixelsize and add RGB offsets */
        __m128i zero12 = _mm_load_si128((__m128i*)_zero12);
        a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
        b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
        c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
        d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));

        _mm_store_si128((__m128i*)xfer, a_offset);
        _mm_store_si128((__m128i*)&xfer[4], b_offset);
        _mm_store_si128((__m128i*)&xfer[8], c_offset);
        _mm_store_si128((__m128i*)&xfer[12], d_offset);

        gushort* pixels[12];

        /* Loop unrolled, allows aggressive instruction reordering */
        /* Red, then G & B */
        pixels[0] = in->pixels + xfer[0];       // a
        pixels[1] = in->pixels + xfer[4];       // b
        pixels[2] = in->pixels + xfer[8];       // c
        pixels[3] = in->pixels + xfer[12];      // d

        pixels[4] = in->pixels + xfer[1+0];     // a
        pixels[5] = in->pixels + xfer[1+4];     // b
        pixels[6] = in->pixels + xfer[1+8];     // c
        pixels[7] = in->pixels + xfer[1+12];    // d

        pixels[8] = in->pixels + xfer[2+0];     // a
        pixels[9] = in->pixels + xfer[2+4];     // b
        pixels[10] = in->pixels + xfer[2+8];    // c
        pixels[11] = in->pixels + xfer[2+12];   // d

        /* Calculate distances */
        __m128i twofiftyfive = _mm_set1_epi32(255);
        __m128i diffx = _mm_and_si128(x, twofiftyfive);
        __m128i diffy = _mm_and_si128(y, twofiftyfive);
        __m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
        __m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);

        /* Calculate weights */
        __m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
        __m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
        __m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
        __m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);

        _mm_store_si128((__m128i*)xfer, aw);
        _mm_store_si128((__m128i*)&xfer[4], bw);
        _mm_store_si128((__m128i*)&xfer[8], cw);
        _mm_store_si128((__m128i*)&xfer[12], dw);

        gushort** p = pixels;
        /* Loop unrolled */
        out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3] + 16384) >> 15 );
        p+=4;
        out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
        p+=4;
        out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
}

#else // NO SSE2
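
/* Fallback stubs: they keep the symbols present when built without
   SSE2; callers are expected to consult is_sse2_compiled() first and
   never reach the two empty samplers. */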
 
gboolean is_sse2_compiled(void)
{
        return FALSE;
}

void
rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos, const gint *current_xy, const gint* min_max_xy)
{
}

void
rs_image16_bilinear_nomeasure_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
{
}

#endif // defined (__SSE2__)