/*
 * Copyright © 2013 Soren Sandmann Pedersen
 * Copyright © 2013 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Soren Sandmann (soren.sandmann@gmail.com)
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "pixman-private.h"
#include "pixman-inlines.h"
53
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
54
int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
56
uint32_t *bits = image->bits + y * image->rowstride;
57
__m128i vx = _mm_set_epi16 (
58
- (x + 1), x, - (x + 1), x,
59
- (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
60
__m128i vux = _mm_set_epi16 (
61
- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
62
- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
63
__m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
64
__m128i *b = (__m128i *)line->buffer;
71
vrl1 = _mm_loadl_epi64 (
72
(__m128i *)(bits + pixman_fixed_to_int (x + ux)));
76
vrl0 = _mm_loadl_epi64 (
77
(__m128i *)(bits + pixman_fixed_to_int (x)));
80
/* The weights are based on vx which is a vector of
82
* - (x + 1), x, - (x + 1), x,
83
* - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
85
* so the 16 bit weights end up like this:
87
* iw0, w0, iw0, w0, iw1, w1, iw1, w1
89
* and after shifting and packing, we get these bytes:
91
* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
92
* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
94
* which means the first and the second input pixel
95
* have to be interleaved like this:
97
* la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
98
* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
100
* before maddubsw can be used.
104
vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
105
/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
108
vw = _mm_packus_epi16 (vw, vw);
109
/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
110
* iw0, w0, iw0, w0, iw1, w1, iw1, w1
112
vx = _mm_add_epi16 (vx, vux);
116
vr = _mm_unpacklo_epi16 (vrl1, vrl0);
117
/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
119
s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
120
/* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
122
vr = _mm_unpackhi_epi8 (vr, s);
123
/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
124
* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
127
vr = _mm_maddubs_epi16 (vr, vw);
129
/* When the weight is 0, the inverse weight is
130
* 128 which can't be represented in a signed byte.
131
* As a result maddubsw computes the following:
133
* r = l * -128 + r * 0
135
* rather than the desired
137
* r = l * 128 + r * 0
139
* We fix this by taking the absolute value of the
142
vr = _mm_abs_epi16 (vr);
144
/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
145
_mm_store_si128 (b++, vr);
150
vrl1 = _mm_setzero_si128();
158
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
160
pixman_fixed_t fx, ux;
161
bilinear_info_t *info = iter->data;
162
line_t *line0, *line1;
169
ux = iter->image->common.transform->matrix[0][0];
171
y0 = pixman_fixed_to_int (info->y);
174
line0 = &info->lines[y0 & 0x01];
175
line1 = &info->lines[y1 & 0x01];
179
ssse3_fetch_horizontal (
180
&iter->image->bits, line0, y0, fx, ux, iter->width);
185
ssse3_fetch_horizontal (
186
&iter->image->bits, line1, y1, fx, ux, iter->width);
189
dist_y = pixman_fixed_to_bilinear_weight (info->y);
190
dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
193
dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
195
for (i = 0; i + 3 < iter->width; i += 4)
197
__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
198
__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
199
__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
200
__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
201
__m128i r0, r1, tmp, p;
203
r0 = _mm_mulhi_epu16 (
204
_mm_sub_epi16 (bot0, top0), vw);
205
tmp = _mm_cmplt_epi16 (bot0, top0);
206
tmp = _mm_and_si128 (tmp, vw);
207
r0 = _mm_sub_epi16 (r0, tmp);
208
r0 = _mm_add_epi16 (r0, top0);
209
r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
210
/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
211
r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
212
/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
214
r1 = _mm_mulhi_epu16 (
215
_mm_sub_epi16 (bot1, top1), vw);
216
tmp = _mm_cmplt_epi16 (bot1, top1);
217
tmp = _mm_and_si128 (tmp, vw);
218
r1 = _mm_sub_epi16 (r1, tmp);
219
r1 = _mm_add_epi16 (r1, top1);
220
r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
221
r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
222
/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
224
p = _mm_packus_epi16 (r0, r1);
226
_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
229
while (i < iter->width)
231
__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
232
__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
235
r0 = _mm_mulhi_epu16 (
236
_mm_sub_epi16 (bot0, top0), vw);
237
tmp = _mm_cmplt_epi16 (bot0, top0);
238
tmp = _mm_and_si128 (tmp, vw);
239
r0 = _mm_sub_epi16 (r0, tmp);
240
r0 = _mm_add_epi16 (r0, top0);
241
r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
242
/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
243
r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
244
/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
246
p = _mm_packus_epi16 (r0, r0);
248
if (iter->width - i == 1)
250
*(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
255
_mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
260
info->y += iter->image->common.transform->matrix[1][1];
266
ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
272
ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
274
int width = iter->width;
275
bilinear_info_t *info;
278
/* Reference point is the center of the pixel */
279
v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
280
v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
281
v.vector[2] = pixman_fixed_1;
283
if (!pixman_transform_point_3d (iter->image->common.transform, &v))
286
info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
290
info->x = v.vector[0] - pixman_fixed_1 / 2;
291
info->y = v.vector[1] - pixman_fixed_1 / 2;
293
#define ALIGN(addr) \
294
((void *)((((uintptr_t)(addr)) + 15) & (~15)))
296
/* It is safe to set the y coordinates to -1 initially
297
* because COVER_CLIP_BILINEAR ensures that we will only
298
* be asked to fetch lines in the [0, height) interval
300
info->lines[0].y = -1;
301
info->lines[0].buffer = ALIGN (&(info->data[0]));
302
info->lines[1].y = -1;
303
info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
305
iter->get_scanline = ssse3_fetch_bilinear_cover;
306
iter->fini = ssse3_bilinear_cover_iter_fini;
312
/* Something went wrong, either a bad matrix or OOM; in such cases,
313
* we don't guarantee any particular rendering.
316
FUNC, "Allocation failure or bad matrix, skipping rendering\n");
318
iter->get_scanline = _pixman_iter_get_scanline_noop;
322
static const pixman_iter_info_t ssse3_iters[] =
325
(FAST_PATH_STANDARD_FLAGS |
326
FAST_PATH_SCALE_TRANSFORM |
327
FAST_PATH_BILINEAR_FILTER |
328
FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
329
ITER_NARROW | ITER_SRC,
330
ssse3_bilinear_cover_iter_init,
337
static const pixman_fast_path_t ssse3_fast_paths[] =
342
pixman_implementation_t *
343
_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
345
pixman_implementation_t *imp =
346
_pixman_implementation_create (fallback, ssse3_fast_paths);
348
imp->iter_info = ssse3_iters;