/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-sse2.h"

/* -------------------------------------------------------------------------------------------------
 * Locals
 */

static __m64 xMask0080;
static __m64 xMask00ff;
static __m64 xMask0101;
static __m64 xMaskAlpha;

static __m64 xMask565rgb;
static __m64 xMask565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;

static __m128i Mask565FixRB;
static __m128i Mask565FixG;

/* -------------------------------------------------------------------------------------------------
 * SSE2 Inlines
 */

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack565to8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, Mask565FixRB);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, Mask565FixG);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
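
/* The two fix-up steps above replicate the high bits of each field into the
 * low bits that a plain shift leaves at zero, so a 5-bit value v becomes the
 * full-range 8-bit value (v << 3) | (v >> 2) and a 6-bit green g becomes
 * (g << 2) | (g >> 4).  For example, pure red 0xf800 expands to 0x00ff0000
 * rather than 0x00f80000. */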

static force_inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack565to8888 (lo);
    hi = unpack565to8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
}
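
/* Packing simply truncates each 8-bit channel to its top 5-6-5 bits and
 * shifts it into place: red moves from bits 16-23 down to 11-15, green from
 * bits 8-15 down to 5-10, blue from bits 0-7 down to 0-4.  E.g. 0x00ff8020
 * packs to 0xfc04 (r = 0x1f, g = 0x20, b = 0x04). */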

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
}

static force_inline uint32_t
packAlpha (__m128i x)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
                                                                  _mm_setzero_si128 ()),
                                                _mm_setzero_si128 ()));
}

static force_inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
}

static force_inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline void
pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
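
/* This is the usual divide-by-255 trick: for t = x*a + 0x80, taking the high
 * 16 bits of t * 0x0101 computes (t + (t >> 8)) >> 8, which is the correctly
 * rounded x*a/255 for all 8-bit x and a.  E.g. x = a = 0xff gives t = 0xfe81
 * and mulhi (t, 0x0101) = 0xff. */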

static force_inline void
pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
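
/* Same rounding scheme as pixMultiply_2x128, applied to the sum of two
 * products: the result is roughly (src*alphaDst + dst*alphaSrc) / 255 per
 * component, with _mm_adds_epu16 saturating the intermediate sum.  This is
 * the building block for the ATOP and XOR operators below. */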

static force_inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static force_inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i t1, t2;

    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);

    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
}
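
/* This is premultiplied Porter-Duff OVER, per component:
 *
 *     dst = src + dst * (255 - srcAlpha) / 255
 *
 * negate gives (255 - alpha), pixMultiply performs the scaled product, and
 * the saturating byte add folds the source back in. */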

static force_inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
}

static force_inline void
inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
}

static force_inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch (addr, _MM_HINT_T0);
}

static force_inline void
cachePrefetchNext (__m128i* addr)
{
    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
}
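
/* addr + 4 in units of __m128i is 64 bytes ahead, i.e. one cache line on the
 * processors this code targets, so the next line is being fetched while the
 * current one is processed. */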

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load128Unaligned (__m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
static force_inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}

/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          xMask0080),
                           xMask0101);
}

static force_inline __m64
pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
                                                         xMask0080),
                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
                           xMask0101);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, xMask00ff);
}

static force_inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pixMultiply_1x64 (*src, *mask),
                      pixMultiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, xMaskAlpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

/* Expand 16 bits positioned at @pos (0-3) of an mmx register into a
 * 00rr 00gg 00bb layout of 16-bit components.
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, xMask565rgb);
    p = _mm_mullo_pi16 (p, xMask565Unpack);

    return _mm_srli_pi16 (p, 8);
}
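
/* The multiply performs the same bit replication as unpack565to8888: each
 * masked field f is multiplied by a constant of the form (1 << a) | (1 << b)
 * so that two shifted copies of f land in the high byte of its 16-bit word,
 * and the final shift right by 8 drops the scaling.  E.g. pixel 0x001f (pure
 * blue) comes out as 0x00000000000000ff. */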

/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */

static force_inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
}
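
/* Throughout this file the per-pixel helpers work on a single unpacked pixel
 * in an MMX register; the coreCombine*sse2 loops below use them to align the
 * destination to 16 bytes and to mop up trailing pixels, while the aligned
 * middle of each scanline is processed four pixels at a time in SSE2
 * registers. */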

static force_inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;
    uint32_t pa;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        /* Check the alpha channel */
        pa = packAlpha (xmmSrcHi);

        if (pa == 0xffffffff)
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else if (pa)
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }
}

static force_inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }
}

static force_inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (dst),
                                               expandAlpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }
}

static force_inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }
}

static force_inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static force_inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static force_inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
}
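
/* Porter-Duff XOR for premultiplied pixels:
 *
 *     result = src * (255 - dstAlpha) / 255 + dst * (255 - srcAlpha) / 255
 *
 * i.e. each pixel keeps only the part not covered by the other. */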

static force_inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrc = load128Unaligned ((__m128i*) ps);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }
}

static force_inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;

        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        save128Aligned ((__m128i*)pd,
                        _mm_adds_epu8 (load128Unaligned ((__m128i*)ps),
                                       load128Aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }
}

static force_inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
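
/* SATURATE adds src into dst, but when the source alpha sa exceeds the free
 * coverage left in the destination (da = 255 - dstAlpha), the source pixel
 * is first scaled by the ratio da/sa (computed with pixman's integer-divide
 * helper FbIntDiv and broadcast through the alpha expansion) so the
 * saturating add cannot overshoot full coverage. */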

static force_inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
{
    uint32_t s, d;
    uint32_t packCmp;

    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDst = load128Aligned ((__m128i*)pd);
        xmmSrc = load128Unaligned ((__m128i*)ps);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
        if (packCmp)
        {
            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));

            pd += 4;
            ps += 4;
        }

        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }
}

static force_inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}

static force_inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expandAlpha_1x64 (s);
    __m64 unpkMask = unpack_32_1x64 (mask);
    __m64 unpkDst = unpack_32_1x64 (dst);

    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
}

static force_inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}

static force_inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline void
coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}

static force_inline void
coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static force_inline void
coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static force_inline void
coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}

static force_inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
    __m64 dest = pixMultiply_1x64 (s, a);
    __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &alphaDst,
                                              &dest, &alphaSrc));
}

static force_inline void
coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline void
coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }
}

/* -------------------------------------------------------------------------------------------------
 * fbComposeSetupSSE2
 */

static force_inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static force_inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
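
/* Note the argument order: _mm_set_pi32/_mm_set_epi32 list elements from most
 * to least significant, so mask0 lands in the upper 32 bits of each 64-bit
 * half.  That is why MaskAlpha below is built as (0x00ff0000, 0x00000000):
 * it selects the 16-bit alpha lane of an unpacked pixel. */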

/* SSE2 code patch for fbcompose.c */

static FASTCALL void
sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverReverseUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineXorUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAddUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineSaturateUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineSrcCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineReverseAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineXorCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAddCsse2 (dst, src, mask, width);
}

void
fbComposeSetupSSE2 (void)
{
    static pixman_bool_t initialized = FALSE;

    if (initialized)
        return;

    initialized = TRUE;

    /* check if we have SSE2 support and initialize accordingly */
    if (pixman_have_sse2())
    {
        /* SSE2 constants */
        Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
        Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
        Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
        Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
        MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
        MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
        MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
        Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
        Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
        Mask0080 = createMask_16_128 (0x0080);
        Mask00ff = createMask_16_128 (0x00ff);
        Mask0101 = createMask_16_128 (0x0101);
        Maskffff = createMask_16_128 (0xffff);
        Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
        MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);

        /* MMX constants */
        xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
        xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

        xMask0080 = createMask_16_64 (0x0080);
        xMask00ff = createMask_16_64 (0x00ff);
        xMask0101 = createMask_16_64 (0x0101);
        xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

        /* SSE code patch for fbcompose.c */
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU;
        pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;

        pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
    }
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx8888
 */

void
fbCompositeSolid_nx8888sse2 (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t xSrc,
                             int16_t ySrc,
                             int16_t xMask,
                             int16_t yMask,
                             int16_t xDst,
                             int16_t yDst,
                             uint16_t width,
                             uint16_t height)
{
    uint32_t src;
    uint32_t *dstLine, *dst, d;
    uint16_t w;
    int dstStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    while (height--)
    {
        dst = dstLine;

        /* call prefetch hint to optimize cache load */
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            dst += 4;
            w -= 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }
}
2446
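/*
 * For reference, a hedged sketch (an assumption, not pixman's own
 * over_2x128) of the OVER math used above on unpacked data: each 16-bit
 * lane holds an 8-bit channel, and dst = src + dst * (255 - srcAlpha) / 255,
 * where the division by 255 uses the usual
 * ((t + 0x80) + ((t + 0x80) >> 8)) >> 8 approximation.
 */
static force_inline __m128i
sketchOver_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    __m128i x00ff = _mm_set1_epi16 (0x00ff);
    __m128i x0080 = _mm_set1_epi16 (0x0080);
    __m128i t;

    t = _mm_mullo_epi16 (dst, _mm_xor_si128 (alpha, x00ff)); /* dst * (255 - alpha) */
    t = _mm_adds_epu16 (t, x0080);                           /* + 0x80 for rounding */
    t = _mm_adds_epu16 (t, _mm_srli_epi16 (t, 8));           /* t += t >> 8         */
    t = _mm_srli_epi16 (t, 8);                               /* / 256               */

    return _mm_adds_epu16 (src, t);
}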
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx0565
 */
fbCompositeSolid_nx0565sse2 (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,

uint16_t *dstLine, *dst, d;

__m128i xmmSrc, xmmAlpha;
__m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

xmmSrc = expandPixel_32_1x128 (src);
xmmAlpha = expandAlpha_1x128 (xmmSrc);

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);

dstLine += dstStride;

while (w && (unsigned long)dst & 15)

*dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                 _mm_movepi64_pi64 (xmmAlpha),
                                                 expand565_16_1x64 (d))));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)dst);

xmmDst = load128Aligned ((__m128i*)dst);

unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);

xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
save128Aligned ((__m128i*)dst, xmmDst);

*dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                 _mm_movepi64_pi64 (xmmAlpha),
                                                 expand565_16_1x64 (d))));
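/*
 * Scalar reference (an assumption, for illustration only) for the 565
 * conversions used above: the pack keeps the top 5/6/5 bits of each
 * channel, and the expand replicates the high bits into the low bits so
 * that 0x1f widens to 0xff exactly -- the same correction Mask565FixRB
 * and Mask565FixG perform in unpack565to8888.
 */
static force_inline uint16_t
sketchPack8888To0565 (uint32_t p)
{
    return (uint16_t)(((p >> 8) & 0xf800) |  /* 5 bits of red   */
                      ((p >> 5) & 0x07e0) |  /* 6 bits of green */
                      ((p >> 3) & 0x001f));  /* 5 bits of blue  */
}

static force_inline uint32_t
sketchExpand0565To8888 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    /* replicate the high bits into the low bits */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}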
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x8888C
 */
fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
                                       pixman_image_t * pSrc,
                                       pixman_image_t * pMask,
                                       pixman_image_t * pDst,

uint32_t *dstLine, d;
uint32_t *maskLine, m;

int dstStride, maskStride;

__m128i xmmSrc, xmmAlpha;
__m128i xmmDst, xmmDstLo, xmmDstHi;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;

__m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
xmmAlpha = expandAlpha_1x128 (xmmSrc);
mmxSrc = _mm_movepi64_pi64 (xmmSrc);
mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

uint32_t *pm = (uint32_t *)maskLine;
uint32_t *pd = (uint32_t *)dstLine;

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);

while (w && (unsigned long)pd & 15)

mmxMask = unpack_32_1x64 (m);
mmxDst = unpack_32_1x64 (d);

*pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);

xmmMask = load128Unaligned ((__m128i*)pm);

packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

/* if all bits in mask are zero, packCmp is equal to 0xffff */
if (packCmp != 0xffff)

xmmDst = load128Aligned ((__m128i*)pd);

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

mmxMask = unpack_32_1x64 (m);
mmxDst = unpack_32_1x64 (d);

*pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
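/*
 * The cmpeq/movemask idiom above tests four mask pixels at once; a
 * hedged stand-alone version of the same check (hypothetical name):
 */
static force_inline int
sketchIsZero_4x32 (__m128i m)
{
    /* each zero 32-bit lane contributes four 0xff bytes to the movemask */
    return _mm_movemask_epi8 (_mm_cmpeq_epi32 (m, _mm_setzero_si128 ())) == 0xffff;
}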
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8x8888
 */
fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,

uint32_t *dstLine, *dst;
uint32_t *srcLine, *src;

int dstStride, srcStride;

__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;
__m128i xmmAlphaLo, xmmAlphaHi;

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
fbComposeGetSolid (pMask, mask, pDst->bits.format);

xmmMask = createMask_16_128 (mask >> 24);

dstLine += dstStride;
srcLine += srcStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)src);

while (w && (unsigned long)dst & 15)

uint32_t s = *src++;

__m64 ms = unpack_32_1x64 (s);
__m64 alpha = expandAlpha_1x64 (ms);
__m64 dest = _mm_movepi64_pi64 (xmmMask);
__m64 alphaDst = unpack_32_1x64 (d);

*dst++ = pack_1x64_32 (inOver_1x64 (&ms,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)src);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)dst);
cachePrefetchNext ((__m128i*)src);

xmmSrc = load128Unaligned ((__m128i*)src);
xmmDst = load128Aligned ((__m128i*)dst);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

uint32_t s = *src++;

__m64 ms = unpack_32_1x64 (s);
__m64 alpha = expandAlpha_1x64 (ms);
__m64 mask = _mm_movepi64_pi64 (xmmMask);
__m64 dest = unpack_32_1x64 (d);

*dst++ = pack_1x64_32 (inOver_1x64 (&ms,
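/*
 * Scalar reference (an assumption, illustration only) for what the inOver
 * helpers compute per channel: the source and its alpha are first
 * multiplied by the mask ("in"), and the result is composited OVER the
 * destination.
 */
static force_inline uint8_t
sketchInOverUn8 (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    uint32_t t;

    t = (uint32_t)s * m + 0x80;            /* src IN mask       */
    s = (uint8_t)((t + (t >> 8)) >> 8);

    t = (uint32_t)sa * m + 0x80;           /* srcAlpha IN mask  */
    sa = (uint8_t)((t + (t >> 8)) >> 8);

    t = (uint32_t)d * (255 - sa) + 0x80;   /* dst * (1 - alpha) */
    t = s + ((t + (t >> 8)) >> 8);         /* OVER              */

    return (uint8_t)(t > 255 ? 255 : t);
}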
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_x888xnx8888
 */
fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,

uint32_t *dstLine, *dst;
uint32_t *srcLine, *src;

int dstStride, srcStride;

__m128i xmmMask, xmmAlpha;
__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
fbComposeGetSolid (pMask, mask, pDst->bits.format);

xmmMask = createMask_16_128 (mask >> 24);
xmmAlpha = Mask00ff;

dstLine += dstStride;
srcLine += srcStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)src);

while (w && (unsigned long)dst & 15)

uint32_t s = (*src++) | 0xff000000;

__m64 src = unpack_32_1x64 (s);
__m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
__m64 mask = _mm_movepi64_pi64 (xmmMask);
__m64 dest = unpack_32_1x64 (d);

*dst++ = pack_1x64_32 (inOver_1x64 (&src,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)src);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)dst);
cachePrefetchNext ((__m128i*)src);

xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
xmmDst = load128Aligned ((__m128i*)dst);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

uint32_t s = (*src++) | 0xff000000;

__m64 src = unpack_32_1x64 (s);
__m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
__m64 mask = _mm_movepi64_pi64 (xmmMask);
__m64 dest = unpack_32_1x64 (d);

*dst++ = pack_1x64_32 (inOver_1x64 (&src,
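/*
 * Note on the x888 path above: the source format carries no alpha, so
 * every pixel is OR'ed with 0xff000000 (Maskff000000 in the vector loop)
 * to force an opaque alpha channel before the usual inOver blend runs.
 */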
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8888
 */
fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
                              pixman_image_t * pSrc,
                              pixman_image_t * pMask,
                              pixman_image_t * pDst,

int dstStride, srcStride;
uint32_t *dstLine, *dst;
uint32_t *srcLine, *src;

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

coreCombineOverUsse2 (dst, src, width);
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x0565
 */
static force_inline uint16_t
fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)

ms = unpack_32_1x64 (src);
return pack565_32_16 (pack_1x64_32 (over_1x64 (ms,
                                               expandAlpha_1x64 (ms),
                                               expand565_16_1x64 (dst))));

fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
                              pixman_image_t * pSrc,
                              pixman_image_t * pMask,
                              pixman_image_t * pDst,

uint16_t *dstLine, *dst, d;
uint32_t *srcLine, *src, s;
int dstStride, srcStride;

__m128i xmmAlphaLo, xmmAlphaHi;
__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

* I copied this code from the MMX version and kept the FIXME.
* If it's a problem there, it's probably a problem here too.

assert (pSrc->pDrawable == pMask->pDrawable);

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

dstLine += dstStride;
srcLine += srcStride;

/* Align dst on a 16-byte boundary */
((unsigned long)dst & 15))

*dst++ = fbCompositeSrc_8888x0565pixel (s, d);

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

/* This is an 8 pixel loop */

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)src);
cachePrefetchNext ((__m128i*)dst);

/* I'm loading unaligned because I'm not sure about the address alignment. */
xmmSrc = load128Unaligned ((__m128i*) src);
xmmDst = load128Aligned ((__m128i*) dst);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

/* I'm loading the next 4 pixels from memory early to optimize the memory read. */
xmmSrc = load128Unaligned ((__m128i*) (src+4));

over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);

save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

*dst++ = fbCompositeSrc_8888x0565pixel (s, d);
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x8888
 */
fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
                                   pixman_image_t * pSrc,
                                   pixman_image_t * pMask,
                                   pixman_image_t * pDst,

uint32_t *dstLine, *dst;
uint8_t *maskLine, *mask;
int dstStride, maskStride;

__m128i xmmSrc, xmmAlpha, xmmDef;
__m128i xmmDst, xmmDstLo, xmmDstHi;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;

__m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

xmmDef = createMask_2x32_128 (src, src);
xmmSrc = expandPixel_32_1x128 (src);
xmmAlpha = expandAlpha_1x128 (xmmSrc);
mmxSrc = _mm_movepi64_pi64 (xmmSrc);
mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && (unsigned long)dst & 15)

uint8_t m = *mask++;

mmxMask = expandPixel_8_1x64 (m);
mmxDest = unpack_32_1x64 (d);

*dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

m = *((uint32_t*)mask);

if (srca == 0xff && m == 0xffffffff)

save128Aligned ((__m128i*)dst, xmmDef);

xmmDst = load128Aligned ((__m128i*) dst);
xmmMask = unpack_32_1x128 (m);
xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

uint8_t m = *mask++;

mmxMask = expandPixel_8_1x64 (m);
mmxDest = unpack_32_1x64 (d);

*dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
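/*
 * Hedged sketch (hypothetical name) of what expandPixel_8_1x64 provides
 * for the scalar paths above, shown here in its 128-bit form: the 8-bit
 * mask coverage value is replicated into every 16-bit channel lane so a
 * single multiply scales all channels at once.
 */
static force_inline __m128i
sketchExpandMask8_1x128 (uint8_t m)
{
    return _mm_set1_epi16 (m);
}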
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x8888
 */
pixmanFillsse2 (uint32_t *bits,

uint32_t byte_width;

if (bpp == 16 && (data >> 16 != (data & 0xffff)))

if (bpp != 16 && bpp != 32)

stride = stride * (int) sizeof (uint32_t) / 2;
byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
byte_width = 2 * width;

stride = stride * (int) sizeof (uint32_t) / 4;
byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
byte_width = 4 * width;

cachePrefetch ((__m128i*)byte_line);
xmmDef = createMask_2x32_128 (data, data);

uint8_t *d = byte_line;
byte_line += stride;

cachePrefetchNext ((__m128i*)d);

while (w >= 2 && ((unsigned long)d & 3))

*(uint16_t *)d = data;

while (w >= 4 && ((unsigned long)d & 15))

*(uint32_t *)d = data;

cachePrefetchNext ((__m128i*)d);

cachePrefetch (((__m128i*)d) + 12);

save128Aligned ((__m128i*)(d), xmmDef);
save128Aligned ((__m128i*)(d+16), xmmDef);
save128Aligned ((__m128i*)(d+32), xmmDef);
save128Aligned ((__m128i*)(d+48), xmmDef);
save128Aligned ((__m128i*)(d+64), xmmDef);
save128Aligned ((__m128i*)(d+80), xmmDef);
save128Aligned ((__m128i*)(d+96), xmmDef);
save128Aligned ((__m128i*)(d+112), xmmDef);

cachePrefetch (((__m128i*)d) + 8);

save128Aligned ((__m128i*)(d), xmmDef);
save128Aligned ((__m128i*)(d+16), xmmDef);
save128Aligned ((__m128i*)(d+32), xmmDef);
save128Aligned ((__m128i*)(d+48), xmmDef);

cachePrefetchNext ((__m128i*)d);

save128Aligned ((__m128i*)(d), xmmDef);
save128Aligned ((__m128i*)(d+16), xmmDef);

save128Aligned ((__m128i*)(d), xmmDef);

cachePrefetchNext ((__m128i*)d);

*(uint32_t *)d = data;

*(uint16_t *)d = data;
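/*
 * Note on pixmanFillsse2 above: the destination is written in a
 * head/body/tail pattern -- scalar 16- and 32-bit stores until d is
 * 16-byte aligned, then prefetched blocks of eight, four, two and one
 * aligned 128-bit stores, then scalar stores for the trailing bytes.
 */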
fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
                                      pixman_image_t * pSrc,
                                      pixman_image_t * pMask,
                                      pixman_image_t * pDst,

uint32_t *dstLine, *dst;
uint8_t *maskLine, *mask;
int dstStride, maskStride;

__m128i xmmSrc, xmmDef;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
                PIXMAN_FORMAT_BPP (pDst->bits.format),
                xDst, yDst, width, height, 0);

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

xmmDef = createMask_2x32_128 (src, src);
xmmSrc = expandPixel_32_1x128 (src);

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && (unsigned long)dst & 15)

uint8_t m = *mask++;

*dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

m = *((uint32_t*)mask);

if (srca == 0xff && m == 0xffffffff)

save128Aligned ((__m128i*)dst, xmmDef);

xmmMask = unpack_32_1x128 (m);
xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

save128Aligned ((__m128i*)dst, _mm_setzero_si128());

uint8_t m = *mask++;

*dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x0565
 */
fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
                                   pixman_image_t * pSrc,
                                   pixman_image_t * pMask,
                                   pixman_image_t * pDst,

uint16_t *dstLine, *dst, d;
uint8_t *maskLine, *mask;
int dstStride, maskStride;

__m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

__m128i xmmSrc, xmmAlpha;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;
__m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

xmmSrc = expandPixel_32_1x128 (src);
xmmAlpha = expandAlpha_1x128 (xmmSrc);
mmxSrc = _mm_movepi64_pi64 (xmmSrc);
mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && (unsigned long)dst & 15)

mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
mmxDest = expand565_16_1x64 (d);

*dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

xmmDst = load128Aligned ((__m128i*) dst);
unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

m = *((uint32_t*)mask);

xmmMask = unpack_32_1x128 (m);
xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

m = *((uint32_t*)mask);

xmmMask = unpack_32_1x128 (m);
xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
mmxDest = expand565_16_1x64 (d);

*dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888RevNPx0565
 */
fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
                                   pixman_image_t * pSrc,
                                   pixman_image_t * pMask,
                                   pixman_image_t * pDst,

uint16_t *dstLine, *dst, d;
uint32_t *srcLine, *src, s;
int dstStride, srcStride;

__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

* I copied this code from the MMX version and kept the FIXME.
* If it's a problem there, it's probably a problem here too.

assert (pSrc->pDrawable == pMask->pDrawable);

dstLine += dstStride;
srcLine += srcStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

while (w && (unsigned long)dst & 15)

ms = unpack_32_1x64 (s);

*dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)src);
cachePrefetchNext ((__m128i*)dst);

xmmSrc = load128Unaligned ((__m128i*)src);
xmmDst = load128Aligned ((__m128i*)dst);

packCmp = packAlpha (xmmSrc);

unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

/* preload next round */
xmmSrc = load128Unaligned ((__m128i*)(src+4));

if (packCmp == 0xffffffff)

invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

packCmp = packAlpha (xmmSrc);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

if (packCmp == 0xffffffff)

invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

ms = unpack_32_1x64 (s);

*dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
3747
/* -------------------------------------------------------------------------------------------------
3748
* fbCompositeSrc_8888RevNPx8888
3752
fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
3753
pixman_image_t * pSrc,
3754
pixman_image_t * pMask,
3755
pixman_image_t * pDst,
3765
uint32_t *dstLine, *dst, d;
3766
uint32_t *srcLine, *src, s;
3767
int dstStride, srcStride;
3771
__m128i xmmSrcLo, xmmSrcHi;
3772
__m128i xmmDstLo, xmmDstHi;
3774
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3775
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3780
* I copy the code from MMX one and keep the fixme.
3781
* If it's a problem there, probably is a problem here.
3783
assert (pSrc->pDrawable == pMask->pDrawable);
3789
dstLine += dstStride;
3791
srcLine += srcStride;
3794
/* call prefetch hint to optimize cache load*/
3795
cachePrefetch ((__m128i*)src);
3796
cachePrefetch ((__m128i*)dst);
3798
while (w && (unsigned long)dst & 15)
3803
*dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3808
/* call prefetch hint to optimize cache load*/
3809
cachePrefetch ((__m128i*)src);
3810
cachePrefetch ((__m128i*)dst);
3814
/* fill cache line with next memory */
3815
cachePrefetchNext ((__m128i*)src);
3816
cachePrefetchNext ((__m128i*)dst);
3818
xmmSrcHi = load128Unaligned((__m128i*)src);
3820
packCmp = packAlpha (xmmSrcHi);
3822
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
3824
if (packCmp == 0xffffffff)
3826
invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3828
save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3832
xmmDstHi = load128Aligned ((__m128i*)dst);
3834
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
3836
overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3838
save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3851
*dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3860
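/*
 * Note on the RevNP paths above (a reading of the code, offered as an
 * assumption): overRevNonPre first swaps R and B (ABGR -> ARGB),
 * multiplies the color channels by the source alpha (the source is not
 * premultiplied), and then applies a normal OVER. When packAlpha
 * reports that all four alphas are 0xff, the blend can be skipped and
 * only the channel swap (invertColors_2x128) is needed, which is the
 * fast path taken in both functions.
 */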
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x0565C
 */
fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
                                       pixman_image_t * pSrc,
                                       pixman_image_t * pMask,
                                       pixman_image_t * pDst,

uint16_t *dstLine, *dst, d;
uint32_t *maskLine, *mask, m;
int dstStride, maskStride;

__m128i xmmSrc, xmmAlpha;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;
__m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

__m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

fbComposeGetSolid (pSrc, src, pDst->bits.format);

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

xmmSrc = expandPixel_32_1x128 (src);
xmmAlpha = expandAlpha_1x128 (xmmSrc);
mmxSrc = _mm_movepi64_pi64 (xmmSrc);
mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

maskLine += maskStride;
dstLine += dstStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && ((unsigned long)dst & 15))

m = *(uint32_t *) mask;

mmxMask = unpack_32_1x64 (m);
mmxDest = expand565_16_1x64 (d);

*dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

xmmMask = load128Unaligned ((__m128i*)mask);
xmmDst = load128Aligned ((__m128i*)dst);

packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

/* preload next round */
xmmMask = load128Unaligned ((__m128i*)(mask+4));

if (packCmp != 0xffff)

inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

if (packCmp != 0xffff)

inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

m = *(uint32_t *) mask;

mmxMask = unpack_32_1x64 (m);
mmxDest = expand565_16_1x64 (d);

*dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * fbCompositeIn_nx8x8
 */
fbCompositeIn_nx8x8sse2 (pixman_op_t op,
                         pixman_image_t * pSrc,
                         pixman_image_t * pMask,
                         pixman_image_t * pDst,

uint8_t *dstLine, *dst;
uint8_t *maskLine, *mask;
int dstStride, maskStride;

__m128i xmmMask, xmmMaskLo, xmmMaskHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

fbComposeGetSolid (pSrc, src, pDst->bits.format);

xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && ((unsigned long)dst & 15))

m = (uint32_t) *mask++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                   unpack_32_1x64 (d)));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

xmmMask = load128Unaligned ((__m128i*)mask);
xmmDst = load128Aligned ((__m128i*)dst);

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

m = (uint32_t) *mask++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                   unpack_32_1x64 (d)));
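/*
 * Scalar reference (an assumption, illustration only) for the a8 IN
 * paths above: one rounded multiply of two 8-bit values, a * b / 255,
 * using the same ((t + 0x80) + ((t + 0x80) >> 8)) >> 8 approximation
 * as the vector code.
 */
static force_inline uint8_t
sketchMulUn8 (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 0x80;

    return (uint8_t)((t + (t >> 8)) >> 8);
}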
/* -------------------------------------------------------------------------------------------------
 */
fbCompositeIn_8x8sse2 (pixman_op_t op,
                       pixman_image_t * pSrc,
                       pixman_image_t * pMask,
                       pixman_image_t * pDst,

uint8_t *dstLine, *dst;
uint8_t *srcLine, *src;
int srcStride, dstStride;

__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);

dstLine += dstStride;
srcLine += srcStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

while (w && ((unsigned long)dst & 15))

s = (uint32_t) *src++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)src);
cachePrefetchNext ((__m128i*)dst);

xmmSrc = load128Unaligned ((__m128i*)src);
xmmDst = load128Aligned ((__m128i*)dst);

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

s = (uint32_t) *src++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8x8
 */
fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,

uint8_t *dstLine, *dst;
uint8_t *maskLine, *mask;
int dstStride, maskStride;

__m128i xmmMask, xmmMaskLo, xmmMaskHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

fbComposeGetSolid (pSrc, src, pDst->bits.format);

xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

while (w && ((unsigned long)dst & 15))

m = (uint32_t) *mask++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                unpack_32_1x64 (d)));

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)mask);
cachePrefetch ((__m128i*)dst);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)mask);
cachePrefetchNext ((__m128i*)dst);

xmmMask = load128Unaligned ((__m128i*)mask);
xmmDst = load128Aligned ((__m128i*)dst);

unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

m = (uint32_t) *mask++;
d = (uint32_t) *dst;

*dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8000x8000
 */
fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
                                 pixman_image_t * pSrc,
                                 pixman_image_t * pMask,
                                 pixman_image_t * pDst,

uint8_t *dstLine, *dst;
uint8_t *srcLine, *src;
int dstStride, srcStride;

fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);

dstLine += dstStride;
srcLine += srcStride;

while (w && (unsigned long)dst & 3)

t = (*dst) + (*src++);
*dst++ = t | (0 - (t >> 8));

coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2);

t = (*dst) + (*src++);
*dst++ = t | (0 - (t >> 8));
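/*
 * Note on the scalar loops above: `t | (0 - (t >> 8))` is a branch-free
 * saturating add. If the 16-bit sum t exceeds 255, t >> 8 is nonzero,
 * 0 - (t >> 8) has all of its low bits set, and the stored byte clamps
 * to 0xff; otherwise the term is zero and t is stored unchanged.
 */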
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8888
 */
fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op,
                                 pixman_image_t * pSrc,
                                 pixman_image_t * pMask,
                                 pixman_image_t * pDst,

uint32_t *dstLine, *dst;
uint32_t *srcLine, *src;
int dstStride, srcStride;

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

dstLine += dstStride;
srcLine += srcStride;

coreCombineAddUsse2 (dst, src, width);
/* -------------------------------------------------------------------------------------------------
 * fbCompositeCopyAreasse2
 */
pixmanBltsse2 (uint32_t *src_bits,
               int src_x, int src_y,
               int dst_x, int dst_y,
               int width, int height)

uint8_t * src_bytes;
uint8_t * dst_bytes;

if (src_bpp != dst_bpp)

src_stride = src_stride * (int) sizeof (uint32_t) / 2;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
byte_width = 2 * width;

else if (src_bpp == 32)

src_stride = src_stride * (int) sizeof (uint32_t) / 4;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
byte_width = 4 * width;

cachePrefetch ((__m128i*)src_bytes);
cachePrefetch ((__m128i*)dst_bytes);

uint8_t *s = src_bytes;
uint8_t *d = dst_bytes;
src_bytes += src_stride;
dst_bytes += dst_stride;

cachePrefetchNext ((__m128i*)s);
cachePrefetchNext ((__m128i*)d);

while (w >= 2 && ((unsigned long)d & 3))

*(uint16_t *)d = *(uint16_t *)s;

while (w >= 4 && ((unsigned long)d & 15))

*(uint32_t *)d = *(uint32_t *)s;

cachePrefetchNext ((__m128i*)s);
cachePrefetchNext ((__m128i*)d);

__m128i xmm0, xmm1, xmm2, xmm3;

/* 128 bytes ahead */
cachePrefetch (((__m128i*)s) + 8);
cachePrefetch (((__m128i*)d) + 8);

xmm0 = load128Unaligned ((__m128i*)(s));
xmm1 = load128Unaligned ((__m128i*)(s+16));
xmm2 = load128Unaligned ((__m128i*)(s+32));
xmm3 = load128Unaligned ((__m128i*)(s+48));

save128Aligned ((__m128i*)(d), xmm0);
save128Aligned ((__m128i*)(d+16), xmm1);
save128Aligned ((__m128i*)(d+32), xmm2);
save128Aligned ((__m128i*)(d+48), xmm3);

cachePrefetchNext ((__m128i*)s);
cachePrefetchNext ((__m128i*)d);

save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s));

cachePrefetchNext ((__m128i*)s);
cachePrefetchNext ((__m128i*)d);

*(uint32_t *)d = *(uint32_t *)s;

*(uint16_t *)d = *(uint16_t *)s;
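/*
 * Note on pixmanBltsse2 above: it only handles the src_bpp == dst_bpp
 * cases (16 or 32 bits per pixel); for anything else it presumably bails
 * out so the caller can take a generic path. Rows are copied with the
 * same alignment pattern as the fill: scalar stores until d is 16-byte
 * aligned, then 64-byte prefetched blocks of unaligned loads and aligned
 * stores, then a scalar tail.
 */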
fbCompositeCopyAreasse2 (pixman_op_t op,
                         pixman_image_t * pSrc,
                         pixman_image_t * pMask,
                         pixman_image_t * pDst,

pixmanBltsse2 (pSrc->bits.bits,
               pSrc->bits.rowstride,
               pDst->bits.rowstride,
               PIXMAN_FORMAT_BPP (pSrc->bits.format),
               PIXMAN_FORMAT_BPP (pDst->bits.format),
               xSrc, ySrc, xDst, yDst, width, height);
/* This code was buggy in the MMX version, and the bug has carried over to this SSE2 version */

fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
                                 pixman_image_t * pSrc,
                                 pixman_image_t * pMask,
                                 pixman_image_t * pDst,

uint32_t *src, *srcLine, s;
uint32_t *dst, *dstLine, d;
uint8_t *mask, *maskLine;

int srcStride, maskStride, dstStride;

__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;
__m128i xmmMask, xmmMaskLo, xmmMaskHi;

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

srcLine += srcStride;
dstLine += dstStride;
maskLine += maskStride;

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)mask);

while (w && (unsigned long)dst & 15)

s = 0xff000000 | *src++;
m = (uint32_t) *mask++;

__m64 ms = unpack_32_1x64 (s);

ms = inOver_1x64 (ms,
                  expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                  unpack_32_1x64 (d));

*dst++ = pack_1x64_32 (ms);

/* call prefetch hint to optimize cache load */
cachePrefetch ((__m128i*)src);
cachePrefetch ((__m128i*)dst);
cachePrefetch ((__m128i*)mask);

/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)src);
cachePrefetchNext ((__m128i*)dst);
cachePrefetchNext ((__m128i*)mask);

m = *(uint32_t*) mask;
xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);

if (m == 0xffffffff)

save128Aligned ((__m128i*)dst, xmmSrc);

xmmDst = load128Aligned ((__m128i*)dst);

xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &Mask00ff, &Mask00ff, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

m = (uint32_t) *mask++;

s = 0xff000000 | *src;

*dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
                                  expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                                  unpack_32_1x64 (d)));
#endif /* USE_SSE2 */