/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#if defined(WITH_SSE2)
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */

#include "prim_internal.h"
#include "prim_sign.h"
34
/* ------------------------------------------------------------------------- */
#if defined(WITH_SSE2)
/* Write the sign of each 16-bit source value to the destination:
 * -1 for negative, 0 for zero, +1 for positive.
 * Accelerated with SSSE3 PSIGNW (_mm_sign_epi16 applied to a vector of 1s).
 *
 * pSrc - source array of INT16 values
 * pDst - destination array of INT16 values
 * len  - number of INT16 values to process
 *
 * Returns PRIMITIVES_SUCCESS; short or misalignable runs are delegated to
 * general_sign_16s.
 */
PRIMITIVES_HIDDEN pstatus_t ssse3_sign_16s(
	const INT16 *pSrc,
	INT16 *pDst,
	INT32 len)
{
	const INT16 *sptr = (const INT16 *) pSrc;
	INT16 *dptr = (INT16 *) pDst;
	size_t count;

	/* Short runs aren't worth the vector setup cost. */
	if (len < 16)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Check for 16-byte alignment (eventually).  An odd-byte destination
	 * can never be stepped onto a 16-byte boundary one INT16 at a time.
	 */
	if ((ULONG_PTR) pDst & 0x01)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Seek 16-byte alignment of the destination, one value at a time. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	count = len >> 5;	/* / 32 */
	len -= count << 5;	/* * 32 */
	if ((ULONG_PTR) sptr & 0x0f)
	{
		/* Source is unaligned: use lddqu loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_lddqu_si128((const __m128i *) sptr); sptr += 8;
			xmm5 = _mm_lddqu_si128((const __m128i *) sptr); sptr += 8;
			xmm6 = _mm_lddqu_si128((const __m128i *) sptr); sptr += 8;
			xmm7 = _mm_lddqu_si128((const __m128i *) sptr); sptr += 8;
			/* PSIGNW of a vector of ones yields -1/0/+1 per lane. */
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
		}
	}
	else
	{
		/* Source is aligned: use aligned loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_load_si128((const __m128i *) sptr); sptr += 8;
			xmm5 = _mm_load_si128((const __m128i *) sptr); sptr += 8;
			xmm6 = _mm_load_si128((const __m128i *) sptr); sptr += 8;
			xmm7 = _mm_load_si128((const __m128i *) sptr); sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;	/* / 8 */
	len -= count << 3;	/* * 8 */
	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr); sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
	}

	/* Do leftovers one value at a time. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
137
/* ------------------------------------------------------------------------- */
138
void primitives_init_sign_opt(primitives_t *prims)
140
/* Pick tuned versions if possible. */
141
/* I didn't spot an IPP version of this. */
142
#if defined(WITH_SSE2)
143
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)
144
&& IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
146
prims->sign_16s = ssse3_sign_16s;