~ubuntu-branches/ubuntu/vivid/freerdp/vivid

« back to all changes in this revision

Viewing changes to libfreerdp/primitives/prim_colors_opt.c

  • Committer: Package Import Robot
  • Author(s): Iain Lane
  • Date: 2014-11-11 12:20:50 UTC
  • mfrom: (1.1.9) (9.1.17 sid)
  • Revision ID: package-import@ubuntu.com-20141111122050-wyr8hrnwco9fcmum
Tags: 1.1.0~git20140921.1.440916e+dfsg1-2ubuntu1
* Merge with Debian unstable, remaining changes
  - Disable ffmpeg support
* Disable gstreamer support, this relies on gstreamer 0.10 and we don't want
  to add any more deps on that.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/* FreeRDP: A Remote Desktop Protocol Client
 
2
 * Optimized Color conversion operations.
 
3
 * vi:ts=4 sw=4:
 
4
 *
 
5
 * Copyright 2011 Stephen Erisman
 
6
 * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
 
7
 * Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
 
8
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 
9
 *
 
10
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 
11
 * not use this file except in compliance with the License. You may obtain
 
12
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 
13
 * Unless required by applicable law or agreed to in writing, software
 
14
 * distributed under the License is distributed on an "AS IS" BASIS,
 
15
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 
16
 * or implied. See the License for the specific language governing
 
17
 * permissions and limitations under the License.
 
18
 */
 
19
 
 
20
#ifdef HAVE_CONFIG_H
 
21
#include "config.h"
 
22
#endif
 
23
 
 
24
#include <freerdp/types.h>
 
25
#include <freerdp/primitives.h>
 
26
#include <winpr/sysinfo.h>
 
27
 
 
28
#ifdef WITH_SSE2
 
29
#include <emmintrin.h>
 
30
#elif defined(WITH_NEON)
 
31
#include <arm_neon.h>
 
32
#endif /* WITH_SSE2 else WITH_NEON */
 
33
 
 
34
#include "prim_internal.h"
 
35
#include "prim_templates.h"
 
36
#include "prim_colors.h"
 
37
 
 
38
#ifdef WITH_SSE2
 
39
 
 
40
#ifdef __GNUC__
/* Ask GCC-compatible compilers to always inline the small SSE2 helpers;
 * expands to nothing for other compilers.
 */
# define GNU_INLINE \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif

/* Distance, in bytes, between consecutive prefetch hints. */
#define CACHE_LINE_BYTES	64

/* Clamp every signed 16-bit lane of _val, in place, to [_min, _max].
 * _val must be a modifiable __m128i; _min and _max are __m128i values.
 */
#define _mm_between_epi16(_val, _min, _max) \
	do { \
		(_val) = _mm_min_epi16((_max), _mm_max_epi16((_val), (_min))); \
	} while (0)
 
51
 
 
52
#ifdef DO_PREFETCH
 
53
/*---------------------------------------------------------------------------*/
 
54
/* Issue a non-temporal prefetch hint once per CACHE_LINE_BYTES for the
 * first num_bytes bytes starting at buffer.
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
	char * buffer,
	int num_bytes)
{
	/* Index counted in 16-byte vectors, as in &((__m128i*) buffer)[idx]. */
	unsigned int idx = 0;

	while (idx < (num_bytes / sizeof(__m128i)))
	{
		_mm_prefetch(buffer + idx * sizeof(__m128i), _MM_HINT_NTA);
		idx += (CACHE_LINE_BYTES / sizeof(__m128i));
	}
}
 
66
#endif /* DO_PREFETCH */
 
67
 
 
68
/*---------------------------------------------------------------------------*/
 
69
/* Convert three planes of 16-bit Y, Cb and Cr samples into three planes of
 * 16-bit R, G and B samples clamped to [0, 255], eight samples per step.
 *
 * pSrc    - Y, Cb and Cr plane pointers.
 * srcStep - bytes between source rows.
 * pDst    - R, G and B plane pointers.
 * dstStep - bytes between destination rows.
 * roi     - width and height of the area to convert.
 *
 * Falls back to general_yCbCrToRGB_16s16s_P3P3() and returns its result
 * when any plane pointer is not 16-byte aligned, the width is not a
 * multiple of 8, or either step is not a multiple of 128.  Otherwise
 * returns PRIMITIVES_SUCCESS.
 */
PRIMITIVES_HIDDEN pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi) /* region of interest */
{
	__m128i lo_clamp, hi_clamp, k_r_cr, k_g_cb, k_g_cr, k_b_cb, y_bias;
	__m128i *y_row, *cb_row, *cr_row, *r_row, *g_row, *b_row;
	int src_row_vecs, dst_row_vecs, row, vecs_per_line;
	ULONG_PTR plane_bits;

	/* A low bit set in any plane address shows up in the OR of them all. */
	plane_bits = (ULONG_PTR) (pSrc[0]) | (ULONG_PTR) (pSrc[1])
		| (ULONG_PTR) (pSrc[2]) | (ULONG_PTR) (pDst[0])
		| (ULONG_PTR) (pDst[1]) | (ULONG_PTR) (pDst[2]);

	if ((plane_bits & 0x0f)
		|| (roi->width & 0x07)
		|| ((srcStep | dstStep) & 127))
	{
		/* Aligned loads and stores are not possible for this input. */
		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	lo_clamp = _mm_setzero_si128();
	hi_clamp = _mm_set1_epi16(255);

	k_r_cr = _mm_set1_epi16(22986);		/*  1.403 << 14 */
	k_g_cb = _mm_set1_epi16(-5636);		/* -0.344 << 14 */
	k_g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
	k_b_cb = _mm_set1_epi16(28999);		/*  1.770 << 14 */
	y_bias = _mm_set1_epi16(4096);		/*  128 << 5 */

	y_row  = (__m128i*) (pSrc[0]);
	cb_row = (__m128i*) (pSrc[1]);
	cr_row = (__m128i*) (pSrc[2]);
	r_row  = (__m128i*) (pDst[0]);
	g_row  = (__m128i*) (pDst[1]);
	b_row  = (__m128i*) (pDst[2]);

	/* Row strides expressed in 16-byte vectors. */
	src_row_vecs = srcStep / sizeof(__m128i);
	dst_row_vecs = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Hint the whole source area into cache before converting it. */
	{
		__m128i *pf_y  = y_row;
		__m128i *pf_cb = cb_row;
		__m128i *pf_cr = cr_row;

		for (row = 0; row < roi->height; row++)
		{
			int v;
			for (v = 0; v < roi->width * sizeof(INT16) / sizeof(__m128i);
				v += (CACHE_LINE_BYTES / sizeof(__m128i)))
			{
				_mm_prefetch((char*)(pf_y + v),  _MM_HINT_NTA);
				_mm_prefetch((char*)(pf_cb + v), _MM_HINT_NTA);
				_mm_prefetch((char*)(pf_cr + v), _MM_HINT_NTA);
			}
			pf_y  += src_row_vecs;
			pf_cb += src_row_vecs;
			pf_cr += src_row_vecs;
		}
	}
#endif /* DO_PREFETCH */

	vecs_per_line = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (row = 0; row < roi->height; ++row)
	{
		int col;

		for (col = 0; col < vecs_per_line; col++)
		{
			/* Fixed-point scheme:
			 *
			 * Each matrix factor is stored as factor * 2^14, the
			 * largest power of two for which every factor still fits
			 * in a signed 16-bit lane.  _mm_mulhi_epi16 keeps the
			 * upper 16 bits of the 32-bit product, so
			 *     HIWORD(c * (factor << 14)) == (c * factor) >> 2.
			 *
			 * The target for red is
			 *     r = (y >> 5) + 128 + ((cr * 1.403) >> 5)
			 * and, with 128 == 4096 >> 5, this is evaluated as
			 *     r = (((y + 4096) >> 2) + HIWORD(cr * 22986)) >> 3
			 * Green and blue follow the same pattern with their own
			 * factors.
			 *
			 * All three source vectors are read before any result is
			 * stored.
			 */
			const __m128i cb = _mm_load_si128(cb_row + col);
			const __m128i cr = _mm_load_si128(cr_row + col);
			__m128i luma = _mm_load_si128(y_row + col);
			__m128i red, green, blue;

			/* luma = (y + 4096) >> 2 */
			luma = _mm_srai_epi16(_mm_add_epi16(luma, y_bias), 2);

			/* red = clamp((luma + HIWORD(cr*22986)) >> 3, 0, 255) */
			red = _mm_add_epi16(luma, _mm_mulhi_epi16(cr, k_r_cr));
			red = _mm_srai_epi16(red, 3);
			red = _mm_min_epi16(hi_clamp, _mm_max_epi16(red, lo_clamp));
			_mm_store_si128(r_row + col, red);

			/* green = clamp((luma + HIWORD(cb*-5636)
			 *                     + HIWORD(cr*-11698)) >> 3, 0, 255)
			 */
			green = _mm_add_epi16(luma, _mm_mulhi_epi16(cb, k_g_cb));
			green = _mm_add_epi16(green, _mm_mulhi_epi16(cr, k_g_cr));
			green = _mm_srai_epi16(green, 3);
			green = _mm_min_epi16(hi_clamp, _mm_max_epi16(green, lo_clamp));
			_mm_store_si128(g_row + col, green);

			/* blue = clamp((luma + HIWORD(cb*28999)) >> 3, 0, 255) */
			blue = _mm_add_epi16(luma, _mm_mulhi_epi16(cb, k_b_cb));
			blue = _mm_srai_epi16(blue, 3);
			blue = _mm_min_epi16(hi_clamp, _mm_max_epi16(blue, lo_clamp));
			_mm_store_si128(b_row + col, blue);
		}

		y_row  += src_row_vecs;
		cb_row += src_row_vecs;
		cr_row += src_row_vecs;
		r_row  += dst_row_vecs;
		g_row  += dst_row_vecs;
		b_row  += dst_row_vecs;
	}

	return PRIMITIVES_SUCCESS;
}
 
204
 
 
205
/*---------------------------------------------------------------------------*/
 
206
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
 
207
 * numbers. See the general code above.
 
208
 */
 
209
PRIMITIVES_HIDDEN pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
 
210
        const INT16 *pSrc[3],
 
211
        int srcStep,
 
212
        INT16 *pDst[3],
 
213
        int dstStep,
 
214
        const prim_size_t *roi) /* region of interest */
 
215
{
 
216
        __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
 
217
        __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
 
218
        int srcbump, dstbump, yp, imax;
 
219
 
 
220
        if (((ULONG_PTR) (pSrc[0]) & 0x0f)
 
221
                        || ((ULONG_PTR) (pSrc[1]) & 0x0f)
 
222
                        || ((ULONG_PTR) (pSrc[2]) & 0x0f)
 
223
                        || ((ULONG_PTR) (pDst[0]) & 0x0f)
 
224
                        || ((ULONG_PTR) (pDst[1]) & 0x0f)
 
225
                        || ((ULONG_PTR) (pDst[2]) & 0x0f)
 
226
                        || (roi->width & 0x07)
 
227
                        || (srcStep & 127)
 
228
                        || (dstStep & 127))
 
229
        {
 
230
                /* We can't maintain 16-byte alignment. */
 
231
                return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
 
232
                        pDst, dstStep, roi);
 
233
        }
 
234
 
 
235
        min = _mm_set1_epi16(-128 << 5);
 
236
        max = _mm_set1_epi16(127 << 5);
 
237
 
 
238
        r_buf  = (__m128i*) (pSrc[0]);
 
239
        g_buf  = (__m128i*) (pSrc[1]);
 
240
        b_buf  = (__m128i*) (pSrc[2]);
 
241
        y_buf  = (__m128i*) (pDst[0]);
 
242
        cb_buf = (__m128i*) (pDst[1]);
 
243
        cr_buf = (__m128i*) (pDst[2]);
 
244
 
 
245
        y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
 
246
        y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
 
247
        y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
 
248
        cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
 
249
        cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
 
250
        cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
 
251
        cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
 
252
        cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
 
253
        cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
 
254
 
 
255
        srcbump = srcStep / sizeof(__m128i);
 
256
        dstbump = dstStep / sizeof(__m128i);
 
257
 
 
258
#ifdef DO_PREFETCH
 
259
        /* Prefetch RGB's. */
 
260
        for (yp=0; yp<roi->height; yp++)
 
261
        {
 
262
                int i;
 
263
                for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
 
264
                        i += (CACHE_LINE_BYTES / sizeof(__m128i)))
 
265
                {
 
266
                        _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
 
267
                        _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
 
268
                        _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
 
269
                }
 
270
                r_buf += srcbump;
 
271
                g_buf += srcbump;
 
272
                b_buf += srcbump;
 
273
        }
 
274
        r_buf = (__m128i*) (pSrc[0]);
 
275
        g_buf = (__m128i*) (pSrc[1]);
 
276
        b_buf = (__m128i*) (pSrc[2]);
 
277
#endif /* DO_PREFETCH */
 
278
 
 
279
        imax = roi->width * sizeof(INT16) / sizeof(__m128i);
 
280
        for (yp=0; yp<roi->height; ++yp)
 
281
        {
 
282
                int i;
 
283
                for (i=0; i<imax; i++)
 
284
                {
 
285
                        /* In order to use SSE2 signed 16-bit integer multiplication we
 
286
                         * need to convert the floating point factors to signed int
 
287
                         * without loosing information.  The result of this multiplication
 
288
                         * is 32 bit and using SSE2 we get either the product's hi or lo
 
289
                         * word.  Thus we will multiply the factors by the highest
 
290
                         * possible 2^n and take the upper 16 bits of the signed 32-bit
 
291
                         * result (_mm_mulhi_epi16).  Since the final result needs to
 
292
                         * be scaled by << 5 and also in in order to keep the precision
 
293
                         * within the upper 16 bits we will also have to scale the RGB
 
294
                         * values used in the multiplication by << 5+(16-n).
 
295
                         */
 
296
                        __m128i r, g, b, y, cb, cr;
 
297
                        r = _mm_load_si128(y_buf+i);
 
298
                        g = _mm_load_si128(g_buf+i);
 
299
                        b = _mm_load_si128(b_buf+i);
 
300
 
 
301
                        /* r<<6; g<<6; b<<6 */
 
302
                        r = _mm_slli_epi16(r, 6);
 
303
                        g = _mm_slli_epi16(g, 6);
 
304
                        b = _mm_slli_epi16(b, 6);
 
305
 
 
306
                        /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
 
307
                        y = _mm_mulhi_epi16(r, y_r);
 
308
                        y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
 
309
                        y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
 
310
                        y = _mm_add_epi16(y, min);
 
311
                        /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
 
312
                        _mm_between_epi16(y, min, max);
 
313
                        _mm_store_si128(y_buf+i, y);
 
314
 
 
315
                        /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
 
316
                        cb = _mm_mulhi_epi16(r, cb_r);
 
317
                        cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
 
318
                        cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
 
319
                        /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
 
320
                        _mm_between_epi16(cb, min, max);
 
321
                        _mm_store_si128(cb_buf+i, cb);
 
322
 
 
323
                        /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
 
324
                        cr = _mm_mulhi_epi16(r, cr_r);
 
325
                        cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
 
326
                        cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
 
327
                        /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
 
328
                        _mm_between_epi16(cr, min, max);
 
329
                        _mm_store_si128(cr_buf+i, cr);
 
330
                }
 
331
                y_buf  += srcbump;
 
332
                cb_buf += srcbump;
 
333
                cr_buf += srcbump;
 
334
                r_buf += dstbump;
 
335
                g_buf += dstbump;
 
336
                b_buf += dstbump;
 
337
        }
 
338
 
 
339
        return PRIMITIVES_SUCCESS;
 
340
}
 
341
 
 
342
/*---------------------------------------------------------------------------*/
 
343
/* Short names for the SSE2 intrinsics used by the planar-to-interleaved
 * conversion.  The PUNPCK* and PACKUSWB wrappers assign their result to
 * _dst_, which must therefore be a modifiable __m128i.
 *
 * Every argument is parenthesised so that pointer expressions such as
 * LOAD128(p + 8) are evaluated before the cast is applied.
 */

/* Aligned 128-bit load from _src_ (any pointer type). */
#define LOAD128(_src_) \
	_mm_load_si128((__m128i *) (_src_))
/* Aligned 128-bit store of _src_ to _dst_ (any pointer type). */
#define STORE128(_dst_, _src_) \
	_mm_store_si128((__m128i *) (_dst_), (_src_))
/* Interleave low bytes: _src_ supplies the even bytes, _dst_ the odd. */
#define PUNPCKLBW(_dst_, _src_) \
	((_dst_) = _mm_unpacklo_epi8((_src_), (_dst_)))
/* Interleave high bytes: _src_ supplies the even bytes, _dst_ the odd. */
#define PUNPCKHBW(_dst_, _src_) \
	((_dst_) = _mm_unpackhi_epi8((_src_), (_dst_)))
/* Interleave low 16-bit words: _src_ supplies the even words. */
#define PUNPCKLWD(_dst_, _src_) \
	((_dst_) = _mm_unpacklo_epi16((_src_), (_dst_)))
/* Interleave high 16-bit words: _src_ supplies the even words. */
#define PUNPCKHWD(_dst_, _src_) \
	((_dst_) = _mm_unpackhi_epi16((_src_), (_dst_)))
/* Pack 16-bit words to bytes with unsigned saturation; _dst_ is the low
 * half of the result and _src_ the high half.
 */
#define PACKUSWB(_dst_, _src_) \
	((_dst_) = _mm_packus_epi16((_dst_), (_src_)))
/* Prefetch the cache line containing _ptr_ (T0 hint). */
#define PREFETCH(_ptr_) \
	_mm_prefetch((const void *) (_ptr_), _MM_HINT_T0)
/* A vector with all 128 bits set. */
#define XMM_ALL_ONES \
	_mm_set1_epi32(-1)
 
361
 
 
362
/* Pack three planes of 16-bit R, G and B samples into one interleaved
 * 32-bit-per-pixel buffer, 16 pixels per step.
 *
 * Each sample is saturated to 0..255 and each pixel is written as the
 * four bytes B, G, R, 0xFF in increasing address order.
 *
 * Falls back to general_RGBToRGB_16s8u_P3AC4R() and returns its result
 * when a pointer is not 16-byte aligned, the width is not a multiple of
 * 16, or a step is not a multiple of 16.  Otherwise returns
 * PRIMITIVES_SUCCESS; a width of zero or less converts nothing.
 */
PRIMITIVES_HIDDEN pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
	INT32 srcStep,			/* bytes between rows in source data */
	BYTE *pDst,				/* 32-bit interleaved pixels: B,G,R,0xFF bytes */
	INT32 dstStep,			/* bytes between rows in dest data */
	const prim_size_t *roi)	/* region of interest */
{
	const UINT16 *r = (const UINT16 *) (pSrc[0]);
	const UINT16 *g = (const UINT16 *) (pSrc[1]);
	const UINT16 *b = (const UINT16 *) (pSrc[2]);
	BYTE *out;
	int srcbump, dstbump, y;

	/* Ensure 16-byte alignment on all pointers,
	 * that width is a multiple of 16,
	 * and that the next row will also remain aligned.
	 * Since this is usually used for 64x64 aligned arrays,
	 * these checks should presumably pass.
	 */
	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
			|| (((ULONG_PTR) pDst & 0x0f) != 0)
			|| (roi->width & 0x0f)
			|| (srcStep & 0x0f)
			|| (dstStep & 0x0f))
	{
		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
	}

	/* The row loop below is a do/while that always converts at least one
	 * group of 16 pixels, and a width of 0 passes the checks above.
	 */
	if (roi->width <= 0)
	{
		return PRIMITIVES_SUCCESS;
	}

	out = (BYTE *) pDst;

	/* Gap between the end of one row and the start of the next, in UINT16
	 * elements for the source and in bytes for the destination.  Computed
	 * in signed arithmetic; srcStep is a multiple of 16 here, so the
	 * division is exact.
	 */
	srcbump = srcStep / (int) sizeof(UINT16) - roi->width;
	dstbump = dstStep - roi->width * (int) sizeof(UINT32);

	for (y=0; y<roi->height; ++y)
	{
		int width = roi->width;
		do {
			__m128i R0, R1, R2, R3, R4;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			R0 = LOAD128(b);  b += 8;		/* R0 = 00B300B200B100B0 */
			R1 = LOAD128(b);  b += 8;		/* R1 = 00B700B600B500B4 */
			PACKUSWB(R0,R1);				/* R0 = B7B6B5B4B3B2B1B0 */
			R1 = LOAD128(g);  g += 8;		/* R1 = 00G300G200G100G0 */
			R2 = LOAD128(g);  g += 8;		/* R2 = 00G700G600G500G4 */
			PACKUSWB(R1,R2);				/* R1 = G7G6G5G4G3G2G1G0 */
			R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
			PUNPCKLBW(R2,R0);				/* R2 = G3B3G2B2G1B1G0B0 */
			PUNPCKHBW(R1,R0);				/* R1 = G7B7G6B6G5B5G4B4 */
			R0 = LOAD128(r);  r += 8;		/* R0 = 00R300R200R100R0 */
			R3 = LOAD128(r);  r += 8;		/* R3 = 00R700R600R500R4 */
			PACKUSWB(R0,R3);				/* R0 = R7R6R5R4R3R2R1R0 */
			R3 = XMM_ALL_ONES;				/* R3 = FFFFFFFFFFFFFFFF */
			R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
			PUNPCKLBW(R4,R0);				/* R4 = FFR3FFR2FFR1FFR0 */
			PUNPCKHBW(R3,R0);				/* R3 = FFR7FFR6FFR5FFR4 */
			R0 = R4;						/* R0 = R4               */
			PUNPCKLWD(R0,R2);				/* R0 = FFR1G1B1FFR0G0B0 */
			PUNPCKHWD(R4,R2);				/* R4 = FFR3G3B3FFR2G2B2 */
			R2 = R3;						/* R2 = R3               */
			PUNPCKLWD(R2,R1);				/* R2 = FFR5G5B5FFR4G4B4 */
			PUNPCKHWD(R3,R1);				/* R3 = FFR7G7B7FFR6G6B6 */
			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0      */
			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2      */
			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4      */
			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6      */
		} while (width -= 16);	/* width is a positive multiple of 16 */
		/* Jump to next row. */
		r += srcbump;
		g += srcbump;
		b += srcbump;
		out += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
 
439
#endif /* WITH_SSE2 */
 
440
 
 
441
/*---------------------------------------------------------------------------*/
 
442
#ifdef WITH_NEON
 
443
/* Convert three planes of 16-bit Y, Cb and Cr samples into three planes of
 * 16-bit R, G and B samples clamped to [0, 255], eight samples per step.
 *
 * pSrc    - Y, Cb and Cr plane pointers.
 * srcStep - bytes between source rows.
 * pDst    - R, G and B plane pointers.
 * dstStep - bytes between destination rows.
 * roi     - width and height of the area to convert.
 *
 * Falls back to general_yCbCrToRGB_16s16s_P3P3() and returns its result
 * when the width is not a multiple of 8 or a step is not a multiple of 16
 * bytes.  Otherwise returns PRIMITIVES_SUCCESS.
 */
PRIMITIVES_HIDDEN pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi) /* region of interest */
{
	/* The loop below handles whole 8-sample vectors only and advances rows
	 * in whole-vector units, so a width that is not a multiple of 8 would
	 * leave the tail of every row unconverted and a step that is not a
	 * multiple of 16 bytes would be truncated.  Use the general version
	 * for such input.
	 */
	if ((roi->width & 0x07) || (srcStep & 0x0f) || (dstStep & 0x0f))
	{
		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	int16x8_t zero = vdupq_n_s16(0);
	int16x8_t max = vdupq_n_s16(255);

	int16x8_t r_cr = vdupq_n_s16(22986);	//  1.403 << 14
	int16x8_t g_cb = vdupq_n_s16(-5636);	// -0.344 << 14
	int16x8_t g_cr = vdupq_n_s16(-11698);	// -0.714 << 14
	int16x8_t b_cb = vdupq_n_s16(28999);	//  1.770 << 14
	int16x8_t c4096 = vdupq_n_s16(4096);

	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
	int16x8_t* b_buf  = (int16x8_t*) pDst[2];

	/* Row strides expressed in 16-byte vectors. */
	int srcbump = srcStep / sizeof(int16x8_t);
	int dstbump = dstStep / sizeof(int16x8_t);
	int yp;

	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/*
				In order to use NEON signed 16-bit integer multiplication we need to convert
				the floating point factors to signed int without losing information.
				The result of this multiplication is 32 bit and we have a NEON instruction
				that returns the hi word of the saturated double.
				Thus we will multiply the factors by the highest possible 2^n, take the
				upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
				shift by 1 to reverse the doubling) and correct this result by multiplying it
				by 2^(16-n).
				For the given factors in the conversion matrix the best possible n is 14.

				Example for calculating r:
				r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
				r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
				r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
				r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			*/

			/* y = (y_buf[i] + 4096) >> 2 */
			int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
			y = vaddq_s16(y, c4096);
			y = vshrq_n_s16(y, 2);
			/* cb = cb_buf[i]; */
			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
			/* cr = cr_buf[i]; */
			int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);

			/* (y + HIWORD(cr*22986)) >> 3 */
			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
			r = vshrq_n_s16(r, 3);
			/* r_buf[i] = MINMAX(r, 0, 255); */
			r = vminq_s16(vmaxq_s16(r, zero), max);
			vst1q_s16((INT16*)&r_buf[i], r);

			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
			g = vshrq_n_s16(g, 3);
			/* g_buf[i] = MINMAX(g, 0, 255); */
			g = vminq_s16(vmaxq_s16(g, zero), max);
			vst1q_s16((INT16*)&g_buf[i], g);

			/* (y + HIWORD(cb*28999)) >> 3 */
			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
			b = vshrq_n_s16(b, 3);
			/* b_buf[i] = MINMAX(b, 0, 255); */
			b = vminq_s16(vmaxq_s16(b, zero), max);
			vst1q_s16((INT16*)&b_buf[i], b);
		}

		y_buf  += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
 
537
#endif /* WITH_NEON */
 
538
 
 
539
 
 
540
/* I don't see a direct IPP version of this, since the input is INT16
 
541
 * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
 
542
 * But that would likely be slower.
 
543
 */
 
544
 
 
545
/* ------------------------------------------------------------------------- */
 
546
/* Install the optimized colour conversion routines into prims when the
 * build enables them (WITH_SSE2 or WITH_NEON) and the running processor
 * reports the matching feature.  Entries are left untouched otherwise.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
#if defined(WITH_SSE2)
	if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
	prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	prims->RGBToRGB_16s8u_P3AC4R  = sse2_RGBToRGB_16s8u_P3AC4R;
#elif defined(WITH_NEON)
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
#endif /* WITH_SSE2 */
}
 
562