/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized Color conversion operations.
 * Copyright 2011 Stephen Erisman
 * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
 * Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */

#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"

#ifdef WITH_SSE2
42
#ifdef __GNUC__
#define GNU_INLINE \
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif

#define CACHE_LINE_BYTES	64

/* Clamp each signed 16-bit lane of _val into [_min, _max].
 * All three arguments are __m128i; _val is modified in place.
 */
#define _mm_between_epi16(_val, _min, _max) \
	do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)

#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
/* Prefetch an entire buffer, one cache line at a time, with a
 * non-temporal hint (the data is streamed once and not reused).
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
	char* buffer,
	int num_bytes)
{
	__m128i* buf = (__m128i*) buffer;
	unsigned int i;

	for (i = 0; i < (num_bytes / sizeof(__m128i));
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
	}
}
#endif /* DO_PREFETCH */

/*---------------------------------------------------------------------------*/
/* Convert planar 16-bit YCbCr to planar 16-bit RGB with SSE2, eight
 * pixels per iteration.  Falls back to the general C version when the
 * required alignment cannot be guaranteed.
 */
PRIMITIVES_HIDDEN pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],		/* 16-bit Y, Cb and Cr planes */
	INT32 srcStep,			/* bytes between rows in source data */
	INT16 *pDst[3],			/* 16-bit R, G and B planes */
	INT32 dstStep,			/* bytes between rows in dest data */
	const prim_size_t *roi)		/* region of interest */
{
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
	int srcbump, dstbump, yp, imax;

	/* The aligned load/store intrinsics below need 16-byte-aligned
	 * pointers on every row; width must also be a multiple of 8 pixels.
	 */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	zero = _mm_setzero_si128();
	max = _mm_set1_epi16(255);

	y_buf  = (__m128i*) (pSrc[0]);
	cb_buf = (__m128i*) (pSrc[1]);
	cr_buf = (__m128i*) (pSrc[2]);
	r_buf  = (__m128i*) (pDst[0]);
	g_buf  = (__m128i*) (pDst[1]);
	b_buf  = (__m128i*) (pDst[2]);

	r_cr  = _mm_set1_epi16(22986);	/*  1.403 << 14 */
	g_cb  = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
	g_cr  = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
	b_cb  = _mm_set1_epi16(28999);	/*  1.770 << 14 */
	c4096 = _mm_set1_epi16(4096);
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&y_buf[i]),  _MM_HINT_NTA);
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
		}
		y_buf  += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
	}
	/* Rewind to the start of each source plane. */
	y_buf  = (__m128i*) (pSrc[0]);
	cb_buf = (__m128i*) (pSrc[1]);
	cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 *  r = (y>>5) + 128 + (cr*1.403)>>5                 // base formula
			 *  r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
			 *  r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5       // simplification
			 *  r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */

			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y, cb, cr, r, g, b;
			y = _mm_load_si128(y_buf + i);
			y = _mm_add_epi16(y, c4096);
			y = _mm_srai_epi16(y, 2);
			/* cb = cb_g_buf[i]; */
			cb = _mm_load_si128(cb_buf + i);
			/* cr = cr_b_buf[i]; */
			cr = _mm_load_si128(cr_buf + i);

			/* (y + HIWORD(cr*22986)) >> 3 */
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
			r = _mm_srai_epi16(r, 3);
			/* r_buf[i] = MINMAX(r, 0, 255); */
			_mm_between_epi16(r, zero, max);
			_mm_store_si128(r_buf + i, r);

			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
			g = _mm_srai_epi16(g, 3);
			/* g_buf[i] = MINMAX(g, 0, 255); */
			_mm_between_epi16(g, zero, max);
			_mm_store_si128(g_buf + i, g);

			/* (y + HIWORD(cb*28999)) >> 3 */
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
			b = _mm_srai_epi16(b, 3);
			/* b_buf[i] = MINMAX(b, 0, 255); */
			_mm_between_epi16(b, zero, max);
			_mm_store_si128(b_buf + i, b);
		}
		/* Advance all six planes to the next row. */
		y_buf  += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf  += dstbump;
		g_buf  += dstbump;
		b_buf  += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
/* Convert planar 16-bit RGB to planar 16-bit YCbCr with SSE2, eight
 * pixels per iteration.  Falls back to the general C version when the
 * required alignment cannot be guaranteed.
 *
 * BUG FIX: the first load in the inner loop read from y_buf (the
 * destination Y plane) instead of r_buf (the source R plane), so the
 * conversion consumed uninitialized/garbage output data.  It now loads
 * from r_buf, matching the comment and the Cb/Cr math below.
 */
PRIMITIVES_HIDDEN pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],		/* 16-bit R, G and B planes */
	INT32 srcStep,			/* bytes between rows in source data */
	INT16 *pDst[3],			/* 16-bit Y, Cb and Cr planes */
	INT32 dstStep,			/* bytes between rows in dest data */
	const prim_size_t *roi)		/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	/* The aligned load/store intrinsics below need 16-byte-aligned
	 * pointers on every row; width must also be a multiple of 8 pixels.
	 */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);

	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);

	y_r  = _mm_set1_epi16(9798);	/*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);	/*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);	/*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);	/* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868);	/* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);	/*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);	/*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714);	/* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);	/* -0.081282 << 15 */

	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	/* Rewind to the start of each source plane. */
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information. The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word. Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16). Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			r = _mm_load_si128(r_buf+i);	/* was y_buf+i: wrong plane */
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);

			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);

			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);

			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);

			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* Advance all six planes to the next row. */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
342
/*---------------------------------------------------------------------------*/
343
#define LOAD128(_src_) \
344
_mm_load_si128((__m128i *) _src_)
345
#define STORE128(_dst_, _src_) \
346
_mm_store_si128((__m128i *) _dst_, _src_)
347
#define PUNPCKLBW(_dst_, _src_) \
348
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
349
#define PUNPCKHBW(_dst_, _src_) \
350
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
351
#define PUNPCKLWD(_dst_, _src_) \
352
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
353
#define PUNPCKHWD(_dst_, _src_) \
354
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
355
#define PACKUSWB(_dst_, _src_) \
356
_dst_ = _mm_packus_epi16(_dst_, _src_)
357
#define PREFETCH(_ptr_) \
358
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
359
#define XMM_ALL_ONES \
360
_mm_set1_epi32(0xFFFFFFFFU)
362
PRIMITIVES_HIDDEN pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
363
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
364
INT32 srcStep, /* bytes between rows in source data */
365
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
366
INT32 dstStep, /* bytes between rows in dest data */
367
const prim_size_t *roi) /* region of interest */
369
const UINT16 *r = (const UINT16 *) (pSrc[0]);
370
const UINT16 *g = (const UINT16 *) (pSrc[1]);
371
const UINT16 *b = (const UINT16 *) (pSrc[2]);
373
int srcbump, dstbump, y;
375
/* Ensure 16-byte alignment on all pointers,
376
* that width is a multiple of 8,
377
* and that the next row will also remain aligned.
378
* Since this is usually used for 64x64 aligned arrays,
379
* these checks should presumably pass.
381
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
382
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
383
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
384
|| (((ULONG_PTR) pDst & 0x0f) != 0)
385
|| (roi->width & 0x0f)
389
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
393
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
394
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
396
for (y=0; y<roi->height; ++y)
398
int width = roi->width;
400
__m128i R0, R1, R2, R3, R4;
401
/* The comments below pretend these are 8-byte registers
402
* rather than 16-byte, for readability.
404
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
405
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
406
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
407
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
408
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
409
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
410
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
411
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
412
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
413
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
414
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
415
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
416
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
417
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
418
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
419
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
420
R0 = R4; /* R0 = R4 */
421
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
422
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
423
R2 = R3; /* R2 = R3 */
424
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
425
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
426
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
427
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
428
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
429
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
430
} while (width -= 16);
431
/* Jump to next row. */
437
return PRIMITIVES_SUCCESS;
439
#endif /* WITH_SSE2 */
441
/*---------------------------------------------------------------------------*/
443
PRIMITIVES_HIDDEN pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
444
const INT16 *pSrc[3],
448
const prim_size_t *roi) /* region of interest */
450
/* TODO: If necessary, check alignments and call the general version. */
452
int16x8_t zero = vdupq_n_s16(0);
453
int16x8_t max = vdupq_n_s16(255);
455
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
456
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
457
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
458
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
459
int16x8_t c4096 = vdupq_n_s16(4096);
461
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
462
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
463
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
464
int16x8_t* r_buf = (int16x8_t*) pDst[0];
465
int16x8_t* g_buf = (int16x8_t*) pDst[1];
466
int16x8_t* b_buf = (int16x8_t*) pDst[2];
468
int srcbump = srcStep / sizeof(int16x8_t);
469
int dstbump = dstStep / sizeof(int16x8_t);
472
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
473
for (yp=0; yp<roi->height; ++yp)
476
for (i=0; i<imax; i++)
479
In order to use NEON signed 16-bit integer multiplication we need to convert
480
the floating point factors to signed int without loosing information.
481
The result of this multiplication is 32 bit and we have a NEON instruction
482
that returns the hi word of the saturated double.
483
Thus we will multiply the factors by the highest possible 2^n, take the
484
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
485
shift by 1 to reverse the doubling) and correct this result by multiplying it
487
For the given factors in the conversion matrix the best possible n is 14.
489
Example for calculating r:
490
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
491
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
492
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
493
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
496
/* y = (y_buf[i] + 4096) >> 2 */
497
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
498
y = vaddq_s16(y, c4096);
499
y = vshrq_n_s16(y, 2);
500
/* cb = cb_buf[i]; */
501
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
502
/* cr = cr_buf[i]; */
503
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
505
/* (y + HIWORD(cr*22986)) >> 3 */
506
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
507
r = vshrq_n_s16(r, 3);
508
/* r_buf[i] = MINMAX(r, 0, 255); */
509
r = vminq_s16(vmaxq_s16(r, zero), max);
510
vst1q_s16((INT16*)&r_buf[i], r);
512
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
513
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
514
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
515
g = vshrq_n_s16(g, 3);
516
/* g_buf[i] = MINMAX(g, 0, 255); */
517
g = vminq_s16(vmaxq_s16(g, zero), max);
518
vst1q_s16((INT16*)&g_buf[i], g);
520
/* (y + HIWORD(cb*28999)) >> 3 */
521
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
522
b = vshrq_n_s16(b, 3);
523
/* b_buf[i] = MINMAX(b, 0, 255); */
524
b = vminq_s16(vmaxq_s16(b, zero), max);
525
vst1q_s16((INT16*)&b_buf[i], b);
535
return PRIMITIVES_SUCCESS;
537
#endif /* WITH_NEON */
540
/* I don't see a direct IPP version of this, since the input is INT16
 * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
 * But that would likely be slower.
 */

/* ------------------------------------------------------------------------- */
/* Install the optimized color-conversion entry points into the
 * primitives table when the CPU supports the corresponding SIMD
 * instruction set; otherwise the table keeps its generic handlers.
 */
void primitives_init_colors_opt(primitives_t* prims)
{
#if defined(WITH_SSE2)
	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		prims->RGBToRGB_16s8u_P3AC4R  = sse2_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	}
#elif defined(WITH_NEON)
	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	}
#endif /* WITH_SSE2 */
}