/*
 * * Copyright (C) 2006-2011 Anders Brander <anders@brander.dk>,
 * * Anders Kvist <akv@lnxbx.dk> and Klaus Post <klauspost@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

/* Plugin tmpl version 4 */

#include <rawstudio.h>

#if defined (__SSE2__)

#include <emmintrin.h>
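
/* Constants shared by the interpolators below: 256.0f converts float
 * coordinates to 24.8 fixed point, _zero12 holds the R/G/B gushort offsets
 * within a 4-gushort pixel, and _max_coord is the "nothing recorded yet"
 * sentinel used by the clipping measurement stored in min_max_xy. */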
static gfloat two_fifty_six_ps[4] __attribute__ ((aligned (16))) = {256.0f, 256.0f, 256.0f, 0.0f};
static gint _zero12[4] __attribute__ ((aligned (16))) = {0,1,2,0};
static gint _max_coord[4] __attribute__ ((aligned (16))) = {65536,65536,65536,65536};

gboolean is_sse2_compiled(void)
{
	return TRUE;
}

/* Bilinear interpolation of one output pixel. 'pos' holds three (x,y) float
 * pairs, one sample position per color channel. min_max_xy is both read and
 * written (clipped coordinates are recorded into it), so it is not const. */
void
rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos, const gint *current_xy, gint *min_max_xy)
{
	const gint m_w = (in->w-1);
	const gint m_h = (in->h-1);

	__m128 p0, p1;
	if ((uintptr_t)pos & 15)
	{
		p0 = _mm_loadu_ps(pos); // y1x1 y0x0
		p1 = _mm_loadu_ps(pos+4); // ---- y2x2
	}
	else
	{
		p0 = _mm_load_ps(pos); // y1x1 y0x0
		p1 = _mm_load_ps(pos+4); // ---- y2x2
	}
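
	/* Gather the three x and the three y coordinates into separate vectors;
	 * lanes 0-2 are the sample positions for R, G and B respectively */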
	__m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
	__m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));

	/* Scale to 24.8 fixed point and truncate to integer */
	__m128 fl256 = _mm_load_ps(two_fifty_six_ps);
	xf = _mm_mul_ps(xf, fl256);
	yf = _mm_mul_ps(yf, fl256);
	__m128i x = _mm_cvttps_epi32(xf);
	__m128i y = _mm_cvttps_epi32(yf);

	/* Image limits, likewise in 24.8 fixed point */
	__m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
	__m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);

	/* Branch-free clamp to the upper bound: where x > m_w, substitute m_w */
	__m128i x_gt, y_gt;
	x_gt = _mm_cmpgt_epi32(x, _m_w);
	y_gt = _mm_cmpgt_epi32(y, _m_h);

	x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
	y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));
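
	/* Clipping measurement: while a lane's stored maximum still holds the
	 * _max_coord sentinel, the first coordinate that clips against the
	 * right/bottom edge records the current output position into min_max_xy */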
	__m128i current_pos = _mm_loadl_epi64((__m128i*)current_xy);
	__m128i current_x = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(0,0,0,0));
	__m128i current_y = _mm_shuffle_epi32(current_pos,_MM_SHUFFLE(1,1,1,1));
	__m128i max_x = _mm_load_si128((__m128i*)&min_max_xy[8]);
	__m128i max_y = _mm_load_si128((__m128i*)&min_max_xy[12]);
	__m128i max_coord = _mm_load_si128((__m128i*)_max_coord);
	__m128i eq_max_x = _mm_cmpeq_epi32(max_coord, max_x);
	__m128i eq_max_y = _mm_cmpeq_epi32(max_coord, max_y);
	x_gt = _mm_and_si128(x_gt, eq_max_x);
	y_gt = _mm_and_si128(y_gt, eq_max_y);
	__m128i insert_x = _mm_and_si128(x_gt, current_x);
	__m128i insert_y = _mm_and_si128(y_gt, current_y);
	max_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_gt, max_x));
	max_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_gt, max_y));
	_mm_store_si128((__m128i*)&min_max_xy[8], max_x);
	_mm_store_si128((__m128i*)&min_max_xy[12], max_y);

	/* Clamp lower bound to zero; where a coordinate underflowed, record the
	 * current output position as the per-channel minimum */
	__m128i zero = _mm_setzero_si128();
	__m128i x_lt = _mm_cmplt_epi32(x, zero);
	__m128i y_lt = _mm_cmplt_epi32(y, zero);
	x = _mm_andnot_si128(x_lt, x);
	y = _mm_andnot_si128(y_lt, y);
	__m128i min_x = _mm_load_si128((__m128i*)&min_max_xy[0]);
	__m128i min_y = _mm_load_si128((__m128i*)&min_max_xy[4]);
	insert_x = _mm_and_si128(x_lt, current_x);
	insert_y = _mm_and_si128(y_lt, current_y);
	min_x = _mm_or_si128(insert_x, _mm_andnot_si128(x_lt, min_x));
	min_y = _mm_or_si128(insert_y, _mm_andnot_si128(y_lt, min_y));
	_mm_store_si128((__m128i*)&min_max_xy[0], min_x);
	_mm_store_si128((__m128i*)&min_max_xy[4], min_y);

	__m128i one = _mm_set1_epi32(1);
	__m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
	__m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));

	/* Check that 'next' pixels are in bounds */
	_m_w = _mm_srai_epi32(_m_w, 8);
	_m_h = _mm_srai_epi32(_m_h, 8);

	x_gt = _mm_cmpgt_epi32(nx, _m_w);
	y_gt = _mm_cmpgt_epi32(ny, _m_h);

	nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
	ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));

	int xfer[16] __attribute__ ((aligned (16)));

	/* Pitch in pixels, replicated into both 16-bit halves of each lane */
	__m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >> 2)<<16));

	/* Remove the fractional part, leaving integer pixel coordinates */
	__m128i tx = _mm_srai_epi32(x, 8);
	__m128i ty = _mm_srai_epi32(y, 8);
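
	/* 32-bit multiply trick: SSE2 has no 32-bit _mm_mullo. The row index and
	 * the pitch both fit in 16 bits, so the low half from _mm_mullo_epi16 and
	 * the high half from _mm_mulhi_epi16 are interleaved to rebuild the full
	 * 32-bit product y * pitch. */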
	/* Multiply y by pitch */
	ty = _mm_packs_epi32(ty, ty);
	__m128i ty_lo = _mm_mullo_epi16(ty, pitch);
	__m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
	ty = _mm_unpacklo_epi16(ty_lo, ty_hi);

	/* Same for the next row */
	ny = _mm_packs_epi32(ny, ny);
	__m128i ny_lo = _mm_mullo_epi16(ny, pitch);
	__m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
	ny = _mm_unpacklo_epi16(ny_lo, ny_hi);

	/* Add row offset (y * pitch) and x offset */
	__m128i a_offset = _mm_add_epi32(tx, ty);
	__m128i b_offset = _mm_add_epi32(nx, ty);
	__m128i c_offset = _mm_add_epi32(tx, ny);
	__m128i d_offset = _mm_add_epi32(nx, ny);

	/* Multiply by pixelsize (4 gushorts) and add per-channel R/G/B offsets */
	__m128i zero12 = _mm_load_si128((__m128i*)_zero12);
	a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
	b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
	c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
	d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));

	_mm_store_si128((__m128i*)xfer, a_offset);
	_mm_store_si128((__m128i*)&xfer[4], b_offset);
	_mm_store_si128((__m128i*)&xfer[8], c_offset);
	_mm_store_si128((__m128i*)&xfer[12], d_offset);

	gushort* pixels[12];

	/* Loop unrolled, allows aggressive instruction reordering */
	/* Red, then G & B */
	pixels[0] = in->pixels + xfer[0]; // a
	pixels[1] = in->pixels + xfer[4]; // b
	pixels[2] = in->pixels + xfer[8]; // c
	pixels[3] = in->pixels + xfer[12]; // d

	pixels[4] = in->pixels + xfer[1+0]; // a
	pixels[5] = in->pixels + xfer[1+4]; // b
	pixels[6] = in->pixels + xfer[1+8]; // c
	pixels[7] = in->pixels + xfer[1+12]; // d

	pixels[8] = in->pixels + xfer[2+0]; // a
	pixels[9] = in->pixels + xfer[2+4]; // b
	pixels[10] = in->pixels + xfer[2+8]; // c
	pixels[11] = in->pixels + xfer[2+12]; // d

	/* Calculate distances: the low 8 bits of x/y are the fractional part */
	__m128i twofiftyfive = _mm_set1_epi32(255);
	__m128i diffx = _mm_and_si128(x, twofiftyfive);
	__m128i diffy = _mm_and_si128(y, twofiftyfive);
	__m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
	__m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);

	/* Calculate weights; >>1 keeps the sum of all four below 2^15 */
	__m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
	__m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
	__m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
	__m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);

	_mm_store_si128((__m128i*)xfer, aw);
	_mm_store_si128((__m128i*)&xfer[4], bw);
	_mm_store_si128((__m128i*)&xfer[8], cw);
	_mm_store_si128((__m128i*)&xfer[12], dw);
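
	/* Weighted sum per channel: the four weights sum to just under 2^15, so
	 * adding 16384 before shifting right by 15 gives a rounded 16-bit result */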
	gushort** p = pixels;
	/* Red */
	out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3] + 16384) >> 15 );
	p += 4; /* advance to the green pointers */
	/* Green */
	out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
	p += 4; /* advance to the blue pointers */
	/* Blue */
	out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
}
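
/* Same interpolation as rs_image16_bilinear_full_sse2() above, but without
 * the min/max clipping measurement */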
void
rs_image16_bilinear_nomeasure_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
{
	const gint m_w = (in->w-1);
	const gint m_h = (in->h-1);

	__m128 p0, p1;
	if ((uintptr_t)pos & 15)
	{
		p0 = _mm_loadu_ps(pos); // y1x1 y0x0
		p1 = _mm_loadu_ps(pos+4); // ---- y2x2
	}
	else
	{
		p0 = _mm_load_ps(pos); // y1x1 y0x0
		p1 = _mm_load_ps(pos+4); // ---- y2x2
	}

	__m128 xf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(0,0,2,0));
	__m128 yf = _mm_shuffle_ps(p0, p1, _MM_SHUFFLE(1,1,3,1));

	/* Scale to 24.8 fixed point and truncate to integer */
	__m128 fl256 = _mm_load_ps(two_fifty_six_ps);
	xf = _mm_mul_ps(xf, fl256);
	yf = _mm_mul_ps(yf, fl256);
	__m128i x = _mm_cvttps_epi32(xf);
	__m128i y = _mm_cvttps_epi32(yf);

	/* Image limits, likewise in 24.8 fixed point */
	__m128i _m_w = _mm_slli_epi32(_mm_set1_epi32(m_w), 8);
	__m128i _m_h = _mm_slli_epi32(_mm_set1_epi32(m_h), 8);

	/* Branch-free clamp to the upper bound: where x > m_w, substitute m_w */
	__m128i x_gt, y_gt;
	x_gt = _mm_cmpgt_epi32(x, _m_w);
	y_gt = _mm_cmpgt_epi32(y, _m_h);

	x = _mm_or_si128(_mm_andnot_si128(x_gt, x), _mm_and_si128(_m_w, x_gt));
	y = _mm_or_si128(_mm_andnot_si128(y_gt, y), _mm_and_si128(_m_h, y_gt));

	/* Clamp lower bound to zero */
	__m128i zero = _mm_setzero_si128();
	__m128i x_lt = _mm_cmplt_epi32(x, zero);
	__m128i y_lt = _mm_cmplt_epi32(y, zero);
	x = _mm_andnot_si128(x_lt, x);
	y = _mm_andnot_si128(y_lt, y);

	__m128i one = _mm_set1_epi32(1);
	__m128i nx = _mm_add_epi32(one, _mm_srai_epi32(x, 8));
	__m128i ny = _mm_add_epi32(one, _mm_srai_epi32(y, 8));

	/* Check that 'next' pixels are in bounds */
	_m_w = _mm_srai_epi32(_m_w, 8);
	_m_h = _mm_srai_epi32(_m_h, 8);

	x_gt = _mm_cmpgt_epi32(nx, _m_w);
	y_gt = _mm_cmpgt_epi32(ny, _m_h);

	nx = _mm_or_si128(_mm_andnot_si128(x_gt, nx), _mm_and_si128(_m_w, x_gt));
	ny = _mm_or_si128(_mm_andnot_si128(y_gt, ny), _mm_and_si128(_m_h, y_gt));

	int xfer[16] __attribute__ ((aligned (16)));

	/* Pitch in pixels, replicated into both 16-bit halves of each lane */
	__m128i pitch = _mm_set1_epi32(in->rowstride >> 2 | ((in->rowstride >> 2)<<16));

	/* Remove the fractional part, leaving integer pixel coordinates */
	__m128i tx = _mm_srai_epi32(x, 8);
	__m128i ty = _mm_srai_epi32(y, 8);

	/* Multiply y by pitch */
	ty = _mm_packs_epi32(ty, ty);
	__m128i ty_lo = _mm_mullo_epi16(ty, pitch);
	__m128i ty_hi = _mm_mulhi_epi16(ty, pitch);
	ty = _mm_unpacklo_epi16(ty_lo, ty_hi);

	/* Same for the next row */
	ny = _mm_packs_epi32(ny, ny);
	__m128i ny_lo = _mm_mullo_epi16(ny, pitch);
	__m128i ny_hi = _mm_mulhi_epi16(ny, pitch);
	ny = _mm_unpacklo_epi16(ny_lo, ny_hi);

	/* Add row offset (y * pitch) and x offset */
	__m128i a_offset = _mm_add_epi32(tx, ty);
	__m128i b_offset = _mm_add_epi32(nx, ty);
	__m128i c_offset = _mm_add_epi32(tx, ny);
	__m128i d_offset = _mm_add_epi32(nx, ny);

	/* Multiply by pixelsize (4 gushorts) and add per-channel R/G/B offsets */
	__m128i zero12 = _mm_load_si128((__m128i*)_zero12);
	a_offset = _mm_add_epi32(zero12, _mm_slli_epi32(a_offset, 2));
	b_offset = _mm_add_epi32(zero12, _mm_slli_epi32(b_offset, 2));
	c_offset = _mm_add_epi32(zero12, _mm_slli_epi32(c_offset, 2));
	d_offset = _mm_add_epi32(zero12, _mm_slli_epi32(d_offset, 2));

	_mm_store_si128((__m128i*)xfer, a_offset);
	_mm_store_si128((__m128i*)&xfer[4], b_offset);
	_mm_store_si128((__m128i*)&xfer[8], c_offset);
	_mm_store_si128((__m128i*)&xfer[12], d_offset);

	gushort* pixels[12];

	/* Loop unrolled, allows aggressive instruction reordering */
	/* Red, then G & B */
	pixels[0] = in->pixels + xfer[0]; // a
	pixels[1] = in->pixels + xfer[4]; // b
	pixels[2] = in->pixels + xfer[8]; // c
	pixels[3] = in->pixels + xfer[12]; // d

	pixels[4] = in->pixels + xfer[1+0]; // a
	pixels[5] = in->pixels + xfer[1+4]; // b
	pixels[6] = in->pixels + xfer[1+8]; // c
	pixels[7] = in->pixels + xfer[1+12]; // d

	pixels[8] = in->pixels + xfer[2+0]; // a
	pixels[9] = in->pixels + xfer[2+4]; // b
	pixels[10] = in->pixels + xfer[2+8]; // c
	pixels[11] = in->pixels + xfer[2+12]; // d

	/* Calculate distances: the low 8 bits of x/y are the fractional part */
	__m128i twofiftyfive = _mm_set1_epi32(255);
	__m128i diffx = _mm_and_si128(x, twofiftyfive);
	__m128i diffy = _mm_and_si128(y, twofiftyfive);
	__m128i inv_diffx = _mm_andnot_si128(diffx, twofiftyfive);
	__m128i inv_diffy = _mm_andnot_si128(diffy, twofiftyfive);

	/* Calculate weights; >>1 keeps the sum of all four below 2^15 */
	__m128i aw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, inv_diffy),1);
	__m128i bw = _mm_srai_epi32(_mm_mullo_epi16(diffx, inv_diffy),1);
	__m128i cw = _mm_srai_epi32(_mm_mullo_epi16(inv_diffx, diffy),1);
	__m128i dw = _mm_srai_epi32(_mm_mullo_epi16(diffx, diffy),1);

	_mm_store_si128((__m128i*)xfer, aw);
	_mm_store_si128((__m128i*)&xfer[4], bw);
	_mm_store_si128((__m128i*)&xfer[8], cw);
	_mm_store_si128((__m128i*)&xfer[12], dw);

	gushort** p = pixels;
	/* Red */
	out[0] = (gushort) ((xfer[0] * *p[0] + xfer[4] * *p[1] + xfer[8] * *p[2] + xfer[12] * *p[3] + 16384) >> 15 );
	p += 4; /* advance to the green pointers */
	/* Green */
	out[1] = (gushort) ((xfer[1] * *p[0] + xfer[1+4] * *p[1] + xfer[1+8] * *p[2] + xfer[1+12] * *p[3] + 16384) >> 15 );
	p += 4; /* advance to the blue pointers */
	/* Blue */
	out[2] = (gushort) ((xfer[2] * *p[0] + xfer[2+4] * *p[1] + xfer[2+8] * *p[2] + xfer[2+12] * *p[3] + 16384) >> 15 );
}

#else // !defined (__SSE2__)

/* Stubs for builds without SSE2; is_sse2_compiled() lets callers detect this
 * and fall back to the plain C implementations */
gboolean is_sse2_compiled(void)
{
	return FALSE;
}

void
rs_image16_bilinear_full_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos, const gint *current_xy, gint *min_max_xy)
{
}

void
rs_image16_bilinear_nomeasure_sse2(RS_IMAGE16 *in, gushort *out, gfloat *pos)
{
}

#endif // defined (__SSE2__)