2
* Copyright © 2004, 2005 Red Hat, Inc.
3
* Copyright © 2004 Nicholas Miell
4
* Copyright © 2005 Trolltech AS
6
* Permission to use, copy, modify, distribute, and sell this software and its
7
* documentation for any purpose is hereby granted without fee, provided that
8
* the above copyright notice appear in all copies and that both that
9
* copyright notice and this permission notice appear in supporting
10
* documentation, and that the name of Red Hat not be used in advertising or
11
* publicity pertaining to distribution of the software without specific,
12
* written prior permission. Red Hat makes no representations about the
13
* suitability of this software for any purpose. It is provided "as is"
14
* without express or implied warranty.
16
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
25
* Author: Søren Sandmann (sandmann@redhat.com)
26
* Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27
* MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
29
* Based on work by Owen Taylor
35
#if defined(__amd64__) || defined(__x86_64__)
41
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
44
#include "pixman-mmx.h"
49
/* Store value v through pointer ptr and yield the stored value.
 * Fix: the original definition carried a trailing semicolon, a classic
 * function-like-macro hazard — `if (c) WRITE(p, v); else …` would not
 * compile, and statement uses silently produced an empty statement.
 * The semicolon belongs at the call site, so it is dropped here. */
#define WRITE(ptr,v) (*(ptr) = (v))
54
/* Debug tracepoint: prints the current function name and line number
 * via ErrorF (declared elsewhere in the project). */
#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
59
/* Notes about writing mmx code
61
* give memory operands as the second operand. If you give it as the
62
* first, gcc will first load it into a register, then use that
67
* _mm_mullo_pi16 (x, mmx_constant);
71
* _mm_mullo_pi16 (mmx_constant, x);
73
* Also try to minimize dependencies. i.e. when you need a value, try
74
* to calculate it from a value that was calculated as early as
78
/* --------------- MMX primitives ------------------------------------ */
80
typedef unsigned long long ullong;
83
typedef ullong mmxdatafield;
86
typedef unsigned __int64 ullong;
87
typedef __m64 mmxdatafield;
92
mmxdatafield mmx_4x00ff;
93
mmxdatafield mmx_4x0080;
94
mmxdatafield mmx_565_rgb;
95
mmxdatafield mmx_565_unpack_multiplier;
96
mmxdatafield mmx_565_r;
97
mmxdatafield mmx_565_g;
98
mmxdatafield mmx_565_b;
99
mmxdatafield mmx_mask_0;
100
mmxdatafield mmx_mask_1;
101
mmxdatafield mmx_mask_2;
102
mmxdatafield mmx_mask_3;
103
mmxdatafield mmx_full_alpha;
104
mmxdatafield mmx_ffff0000ffff0000;
105
mmxdatafield mmx_0000ffff00000000;
106
mmxdatafield mmx_000000000000ffff;
109
static const MMXData c =
112
.mmx_4x00ff = 0x00ff00ff00ff00ffULL,
113
.mmx_4x0080 = 0x0080008000800080ULL,
114
.mmx_565_rgb = 0x000001f0003f001fULL,
115
.mmx_565_unpack_multiplier = 0x0000008404100840ULL,
116
.mmx_565_r = 0x000000f800000000ULL,
117
.mmx_565_g = 0x0000000000fc0000ULL,
118
.mmx_565_b = 0x00000000000000f8ULL,
119
.mmx_mask_0 = 0xffffffffffff0000ULL,
120
.mmx_mask_1 = 0xffffffff0000ffffULL,
121
.mmx_mask_2 = 0xffff0000ffffffffULL,
122
.mmx_mask_3 = 0x0000ffffffffffffULL,
123
.mmx_full_alpha = 0x00ff000000000000ULL,
124
.mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL,
125
.mmx_0000ffff00000000 = 0x0000ffff00000000ULL,
126
.mmx_000000000000ffff = 0x000000000000ffffULL,
129
{ 0x00ff00ff00ff00ffUI64 },
130
{ 0x0080008000800080UI64 },
131
{ 0x000001f0003f001fUI64 },
132
{ 0x0000008404100840UI64 },
133
{ 0x000000f800000000UI64 },
134
{ 0x0000000000fc0000UI64 },
135
{ 0x00000000000000f8UI64 },
136
{ 0xffffffffffff0000UI64 },
137
{ 0xffffffff0000ffffUI64 },
138
{ 0xffff0000ffffffffUI64 },
139
{ 0x0000ffffffffffffUI64 },
140
{ 0x00ff000000000000UI64 },
141
{ 0xffff0000ffff0000UI64 },
142
{ 0x0000ffff00000000UI64 },
143
{ 0x000000000000ffffUI64 },
149
/* Map `inline` to the compiler's strong inlining hint.
 * NOTE(review): __forceinline is the MSVC spelling; presumably this line
 * sits inside an _MSC_VER branch — the guarding #if is not visible here,
 * so confirm against the full file. */
#define inline __forceinline
153
#define MC(x) ((__m64) c.mmx_##x)
156
#define MC(x) c.mmx_##x
160
shift (__m64 v, int s)
163
return _mm_slli_si64 (v, s);
165
return _mm_srli_si64 (v, -s);
173
return _mm_xor_si64 (mask, MC(4x00ff));
177
pix_multiply (__m64 a, __m64 b)
181
res = _mm_mullo_pi16 (a, b);
182
res = _mm_adds_pu16 (res, MC(4x0080));
183
res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
184
res = _mm_srli_pi16 (res, 8);
190
pix_add (__m64 a, __m64 b)
192
return _mm_adds_pu8 (a, b);
198
expand_alpha (__m64 pixel)
200
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
204
expand_alpha_rev (__m64 pixel)
206
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
210
invert_colors (__m64 pixel)
212
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
218
expand_alpha (__m64 pixel)
222
t1 = shift (pixel, -48);
224
t1 = _mm_or_si64 (t1, t2);
226
t1 = _mm_or_si64 (t1, t2);
232
expand_alpha_rev (__m64 pixel)
236
/* move alpha to low 16 bits and zero the rest */
237
t1 = shift (pixel, 48);
238
t1 = shift (t1, -48);
241
t1 = _mm_or_si64 (t1, t2);
243
t1 = _mm_or_si64 (t1, t2);
249
invert_colors (__m64 pixel)
255
x = _mm_and_si64 (x, MC(ffff0000ffff0000));
256
y = _mm_and_si64 (y, MC(000000000000ffff));
257
z = _mm_and_si64 (z, MC(0000ffff00000000));
262
x = _mm_or_si64 (x, y);
263
x = _mm_or_si64 (x, z);
271
over (__m64 src, __m64 srca, __m64 dest)
273
return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
277
over_rev_non_pre (__m64 src, __m64 dest)
279
__m64 srca = expand_alpha (src);
280
__m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
282
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
289
return pix_multiply (src, mask);
293
in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
295
src = _mm_or_si64 (src, MC(full_alpha));
297
return over(in (src, mask), mask, dest);
307
return over(in(src, mask), pix_multiply(srca, mask), dest);
310
#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest)
314
load8888 (uint32_t v)
316
return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
320
pack8888 (__m64 lo, __m64 hi)
322
return _mm_packs_pu16 (lo, hi);
325
static inline uint32_t
328
return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
331
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
335
* --- Expanding 565 in the low word ---
337
* m = (m << (32 - 3)) | (m << (16 - 5)) | m;
338
* m = m & (01f0003f001f);
339
* m = m * (008404100840);
342
* Note the trick here - the top word is shifted by another nibble to
343
* avoid it bumping into the middle word
346
expand565 (__m64 pixel, int pos)
351
/* move pixel to low 16 bit and zero the rest */
352
p = shift (shift (p, (3 - pos) * 16), -48);
354
t1 = shift (p, 36 - 11);
355
t2 = shift (p, 16 - 5);
357
p = _mm_or_si64 (t1, p);
358
p = _mm_or_si64 (t2, p);
359
p = _mm_and_si64 (p, MC(565_rgb));
361
pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
362
return _mm_srli_pi16 (pixel, 8);
366
expand8888 (__m64 in, int pos)
369
return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
371
return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
375
pack565 (__m64 pixel, __m64 target, int pos)
381
r = _mm_and_si64 (p, MC(565_r));
382
g = _mm_and_si64 (p, MC(565_g));
383
b = _mm_and_si64 (p, MC(565_b));
385
r = shift (r, - (32 - 8) + pos * 16);
386
g = shift (g, - (16 - 3) + pos * 16);
387
b = shift (b, - (0 + 3) + pos * 16);
390
t = _mm_and_si64 (t, MC(mask_0));
392
t = _mm_and_si64 (t, MC(mask_1));
394
t = _mm_and_si64 (t, MC(mask_2));
396
t = _mm_and_si64 (t, MC(mask_3));
398
p = _mm_or_si64 (r, t);
399
p = _mm_or_si64 (g, p);
401
return _mm_or_si64 (b, p);
406
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
408
x = _mm_mullo_pi16 (x, a);
409
y = _mm_mullo_pi16 (y, b);
410
x = _mm_adds_pu16 (x, MC(4x0080));
411
x = _mm_adds_pu16 (x, y);
412
x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
413
x = _mm_srli_pi16 (x, 8);
418
#define pix_add_mul(x, a, y, b) \
419
( x = _mm_mullo_pi16 (x, a), \
420
y = _mm_mullo_pi16 (y, b), \
421
x = _mm_adds_pu16 (x, MC(4x0080)), \
422
x = _mm_adds_pu16 (x, y), \
423
x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \
424
_mm_srli_pi16 (x, 8) )
427
/* --------------- MMX code paths for fbcompose.c --------------------- */
430
mmxCombineMaskU (uint32_t *src, const uint32_t *mask, int width)
432
const uint32_t *end = mask + width;
434
uint32_t mmask = *mask;
435
uint32_t maska = mmask >> 24;
438
} else if (maska != 0xff) {
439
__m64 a = load8888(mmask);
440
__m64 s = load8888(*src);
442
s = pix_multiply(s, a);
453
mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
455
const uint32_t *end = dest + width;
458
uint32_t ssrc = *src;
459
uint32_t a = ssrc >> 24;
465
sa = expand_alpha(s);
466
*dest = store8888(over(s, sa, load8888(*dest)));
475
mmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
477
const uint32_t *end = dest + width;
482
da = expand_alpha(d);
483
*dest = store8888(over (d, da, load8888(*src)));
491
mmxCombineInU (uint32_t *dest, const uint32_t *src, int width)
493
const uint32_t *end = dest + width;
500
x = pix_multiply(x, a);
501
*dest = store8888(x);
509
mmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
511
const uint32_t *end = dest + width;
518
x = pix_multiply(x, a);
519
*dest = store8888(x);
527
mmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)
529
const uint32_t *end = dest + width;
537
x = pix_multiply(x, a);
538
*dest = store8888(x);
546
mmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
548
const uint32_t *end = dest + width;
556
x = pix_multiply(x, a);
557
*dest = store8888(x);
565
mmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
567
const uint32_t *end = dest + width;
573
sia = expand_alpha(s);
575
da = expand_alpha(d);
576
s = pix_add_mul (s, da, d, sia);
577
*dest = store8888(s);
585
mmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
595
sa = expand_alpha(s);
596
dia = expand_alpha(d);
598
s = pix_add_mul (s, dia, d, sa);
599
*dest = store8888(s);
607
mmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)
609
const uint32_t *end = dest + width;
612
__m64 s, dia, d, sia;
615
sia = expand_alpha(s);
616
dia = expand_alpha(d);
619
s = pix_add_mul (s, dia, d, sia);
620
*dest = store8888(s);
628
mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
630
const uint32_t *end = dest + width;
636
*dest = store8888(s);
644
mmxCombineSaturateU (uint32_t *dest, const uint32_t *src, int width)
646
const uint32_t *end = dest + width;
650
__m64 ms = load8888(s);
651
__m64 md = load8888(d);
652
uint32_t sa = s >> 24;
653
uint32_t da = ~d >> 24;
656
__m64 msa = load8888(FbIntDiv(da, sa) << 24);
657
msa = expand_alpha(msa);
658
ms = pix_multiply(ms, msa);
660
md = pix_add(md, ms);
661
*dest = store8888(md);
670
mmxCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
672
const uint32_t *end = src + width;
674
__m64 a = load8888(*mask);
675
__m64 s = load8888(*src);
676
s = pix_multiply(s, a);
677
*dest = store8888(s);
686
mmxCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
688
const uint32_t *end = src + width;
690
__m64 a = load8888(*mask);
691
__m64 s = load8888(*src);
692
__m64 d = load8888(*dest);
693
__m64 sa = expand_alpha(s);
695
*dest = store8888(in_over (s, sa, a, d));
705
mmxCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
707
const uint32_t *end = src + width;
709
__m64 a = load8888(*mask);
710
__m64 s = load8888(*src);
711
__m64 d = load8888(*dest);
712
__m64 da = expand_alpha(d);
714
*dest = store8888(over (d, da, in (s, a)));
725
mmxCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
727
const uint32_t *end = src + width;
729
__m64 a = load8888(*mask);
730
__m64 s = load8888(*src);
731
__m64 d = load8888(*dest);
732
__m64 da = expand_alpha(d);
733
s = pix_multiply(s, a);
734
s = pix_multiply(s, da);
735
*dest = store8888(s);
744
mmxCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
746
const uint32_t *end = src + width;
748
__m64 a = load8888(*mask);
749
__m64 s = load8888(*src);
750
__m64 d = load8888(*dest);
751
__m64 sa = expand_alpha(s);
752
a = pix_multiply(a, sa);
753
d = pix_multiply(d, a);
754
*dest = store8888(d);
763
mmxCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
765
const uint32_t *end = src + width;
767
__m64 a = load8888(*mask);
768
__m64 s = load8888(*src);
769
__m64 d = load8888(*dest);
770
__m64 da = expand_alpha(d);
772
s = pix_multiply(s, a);
773
s = pix_multiply(s, da);
774
*dest = store8888(s);
783
mmxCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
785
const uint32_t *end = src + width;
787
__m64 a = load8888(*mask);
788
__m64 s = load8888(*src);
789
__m64 d = load8888(*dest);
790
__m64 sa = expand_alpha(s);
791
a = pix_multiply(a, sa);
793
d = pix_multiply(d, a);
794
*dest = store8888(d);
803
mmxCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
805
const uint32_t *end = src + width;
807
__m64 a = load8888(*mask);
808
__m64 s = load8888(*src);
809
__m64 d = load8888(*dest);
810
__m64 da = expand_alpha(d);
811
__m64 sa = expand_alpha(s);
812
s = pix_multiply(s, a);
813
a = pix_multiply(a, sa);
815
d = pix_add_mul (d, a, s, da);
816
*dest = store8888(d);
825
mmxCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
827
const uint32_t *end = src + width;
829
__m64 a = load8888(*mask);
830
__m64 s = load8888(*src);
831
__m64 d = load8888(*dest);
832
__m64 da = expand_alpha(d);
833
__m64 sa = expand_alpha(s);
834
s = pix_multiply(s, a);
835
a = pix_multiply(a, sa);
837
d = pix_add_mul (d, a, s, da);
838
*dest = store8888(d);
847
mmxCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
849
const uint32_t *end = src + width;
851
__m64 a = load8888(*mask);
852
__m64 s = load8888(*src);
853
__m64 d = load8888(*dest);
854
__m64 da = expand_alpha(d);
855
__m64 sa = expand_alpha(s);
856
s = pix_multiply(s, a);
857
a = pix_multiply(a, sa);
860
d = pix_add_mul (d, a, s, da);
861
*dest = store8888(d);
870
mmxCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
872
const uint32_t *end = src + width;
874
__m64 a = load8888(*mask);
875
__m64 s = load8888(*src);
876
__m64 d = load8888(*dest);
877
s = pix_multiply(s, a);
879
*dest = store8888(d);
887
void fbComposeSetupMMX(void)
889
/* check if we have MMX support and initialize accordingly */
890
if (pixman_have_mmx())
892
pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = mmxCombineOverU;
893
pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseU;
894
pixman_composeFunctions.combineU[PIXMAN_OP_IN] = mmxCombineInU;
895
pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseU;
896
pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = mmxCombineOutU;
897
pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseU;
898
pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = mmxCombineAtopU;
899
pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseU;
900
pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = mmxCombineXorU;
901
pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = mmxCombineAddU;
902
pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = mmxCombineSaturateU;
904
pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = mmxCombineSrcC;
905
pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = mmxCombineOverC;
906
pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseC;
907
pixman_composeFunctions.combineC[PIXMAN_OP_IN] = mmxCombineInC;
908
pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseC;
909
pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = mmxCombineOutC;
910
pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseC;
911
pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = mmxCombineAtopC;
912
pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseC;
913
pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = mmxCombineXorC;
914
pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = mmxCombineAddC;
916
pixman_composeFunctions.combineMaskU = mmxCombineMaskU;
921
/* ------------------ MMX code paths called from fbpict.c ----------------------- */
924
fbCompositeSolid_nx8888mmx (pixman_op_t op,
925
pixman_image_t * pSrc,
926
pixman_image_t * pMask,
927
pixman_image_t * pDst,
938
uint32_t *dstLine, *dst;
945
fbComposeGetSolid(pSrc, src, pDst->bits.format);
950
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
952
vsrc = load8888 (src);
953
vsrca = expand_alpha (vsrc);
958
dstLine += dstStride;
963
while (w && (unsigned long)dst & 7)
965
*dst = store8888(over(vsrc, vsrca, load8888(*dst)));
976
vdest = *(__m64 *)dst;
978
dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
979
dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
981
*(__m64 *)dst = pack8888(dest0, dest1);
991
*dst = store8888(over(vsrc, vsrca, load8888(*dst)));
1002
fbCompositeSolid_nx0565mmx (pixman_op_t op,
1003
pixman_image_t * pSrc,
1004
pixman_image_t * pMask,
1005
pixman_image_t * pDst,
1016
uint16_t *dstLine, *dst;
1023
fbComposeGetSolid(pSrc, src, pDst->bits.format);
1028
fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1030
vsrc = load8888 (src);
1031
vsrca = expand_alpha (vsrc);
1036
dstLine += dstStride;
1041
while (w && (unsigned long)dst & 7)
1044
__m64 vdest = expand565 ((__m64)d, 0);
1045
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
1046
*dst = (ullong)vdest;
1056
vdest = *(__m64 *)dst;
1058
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
1059
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
1060
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
1061
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
1063
*(__m64 *)dst = vdest;
1074
__m64 vdest = expand565 ((__m64)d, 0);
1075
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
1076
*dst = (ullong)vdest;
1087
fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op,
1088
pixman_image_t * pSrc,
1089
pixman_image_t * pMask,
1090
pixman_image_t * pDst,
1103
int dstStride, maskStride;
1108
fbComposeGetSolid(pSrc, src, pDst->bits.format);
1114
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1115
fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
1117
vsrc = load8888(src);
1118
vsrca = expand_alpha(vsrc);
1123
uint32_t *p = (uint32_t *)maskLine;
1124
uint32_t *q = (uint32_t *)dstLine;
1126
while (twidth && (unsigned long)q & 7)
1128
uint32_t m = *(uint32_t *)p;
1132
__m64 vdest = load8888(*q);
1133
vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1134
*q = store8888(vdest);
1151
__m64 vdest = *(__m64 *)q;
1153
dest0 = in_over(vsrc, vsrca, load8888(m0),
1154
expand8888 (vdest, 0));
1155
dest1 = in_over(vsrc, vsrca, load8888(m1),
1156
expand8888 (vdest, 1));
1158
*(__m64 *)q = pack8888(dest0, dest1);
1168
uint32_t m = *(uint32_t *)p;
1172
__m64 vdest = load8888(*q);
1173
vdest = in_over(vsrc, vsrca, load8888(m), vdest);
1174
*q = store8888(vdest);
1182
dstLine += dstStride;
1183
maskLine += maskStride;
1190
fbCompositeSrc_8888x8x8888mmx (pixman_op_t op,
1191
pixman_image_t * pSrc,
1192
pixman_image_t * pMask,
1193
pixman_image_t * pDst,
1203
uint32_t *dstLine, *dst;
1204
uint32_t *srcLine, *src;
1207
int dstStride, srcStride;
1213
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1214
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
1216
fbComposeGetSolid (pMask, mask, pDst->bits.format);
1217
mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1218
vmask = load8888 (mask);
1224
dstLine += dstStride;
1226
srcLine += srcStride;
1229
while (w && (unsigned long)dst & 7)
1231
__m64 s = load8888 (*src);
1232
__m64 d = load8888 (*dst);
1234
*dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1243
__m64 vs = *(__m64 *)src;
1244
__m64 vd = *(__m64 *)dst;
1245
__m64 vsrc0 = expand8888 (vs, 0);
1246
__m64 vsrc1 = expand8888 (vs, 1);
1248
*(__m64 *)dst = pack8888 (
1249
in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1250
in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1259
__m64 s = load8888 (*src);
1260
__m64 d = load8888 (*dst);
1262
*dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1274
fbCompositeSrc_x888xnx8888mmx (pixman_op_t op,
1275
pixman_image_t * pSrc,
1276
pixman_image_t * pMask,
1277
pixman_image_t * pDst,
1287
uint32_t *dstLine, *dst;
1288
uint32_t *srcLine, *src;
1291
int dstStride, srcStride;
1297
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1298
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
1299
fbComposeGetSolid (pMask, mask, pDst->bits.format);
1301
mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1302
vmask = load8888 (mask);
1308
dstLine += dstStride;
1310
srcLine += srcStride;
1313
while (w && (unsigned long)dst & 7)
1315
__m64 s = load8888 (*src | 0xff000000);
1316
__m64 d = load8888 (*dst);
1318
*dst = store8888 (in_over (s, srca, vmask, d));
1327
__m64 vd0 = *(__m64 *)(dst + 0);
1328
__m64 vd1 = *(__m64 *)(dst + 2);
1329
__m64 vd2 = *(__m64 *)(dst + 4);
1330
__m64 vd3 = *(__m64 *)(dst + 6);
1331
__m64 vd4 = *(__m64 *)(dst + 8);
1332
__m64 vd5 = *(__m64 *)(dst + 10);
1333
__m64 vd6 = *(__m64 *)(dst + 12);
1334
__m64 vd7 = *(__m64 *)(dst + 14);
1336
__m64 vs0 = *(__m64 *)(src + 0);
1337
__m64 vs1 = *(__m64 *)(src + 2);
1338
__m64 vs2 = *(__m64 *)(src + 4);
1339
__m64 vs3 = *(__m64 *)(src + 6);
1340
__m64 vs4 = *(__m64 *)(src + 8);
1341
__m64 vs5 = *(__m64 *)(src + 10);
1342
__m64 vs6 = *(__m64 *)(src + 12);
1343
__m64 vs7 = *(__m64 *)(src + 14);
1346
in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1347
in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1350
in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1351
in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1354
in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1355
in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1358
in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1359
in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1362
in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1363
in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1366
in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1367
in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1370
in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1371
in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1374
in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1375
in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1377
*(__m64 *)(dst + 0) = vd0;
1378
*(__m64 *)(dst + 2) = vd1;
1379
*(__m64 *)(dst + 4) = vd2;
1380
*(__m64 *)(dst + 6) = vd3;
1381
*(__m64 *)(dst + 8) = vd4;
1382
*(__m64 *)(dst + 10) = vd5;
1383
*(__m64 *)(dst + 12) = vd6;
1384
*(__m64 *)(dst + 14) = vd7;
1393
__m64 s = load8888 (*src | 0xff000000);
1394
__m64 d = load8888 (*dst);
1396
*dst = store8888 (in_over (s, srca, vmask, d));
1408
fbCompositeSrc_8888x8888mmx (pixman_op_t op,
1409
pixman_image_t * pSrc,
1410
pixman_image_t * pMask,
1411
pixman_image_t * pDst,
1421
uint32_t *dstLine, *dst;
1422
uint32_t *srcLine, *src;
1424
int dstStride, srcStride;
1430
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1431
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
1436
dstLine += dstStride;
1438
srcLine += srcStride;
1450
sa = expand_alpha(ms);
1451
*dst = store8888(over(ms, sa, load8888(*dst)));
1460
fbCompositeSrc_8888x0565mmx (pixman_op_t op,
1461
pixman_image_t * pSrc,
1462
pixman_image_t * pMask,
1463
pixman_image_t * pDst,
1473
uint16_t *dstLine, *dst;
1474
uint32_t *srcLine, *src;
1475
int dstStride, srcStride;
1480
fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1481
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
1485
assert (pSrc->pDrawable == pMask->pDrawable);
1491
dstLine += dstStride;
1493
srcLine += srcStride;
1498
while (w && (unsigned long)dst & 7)
1500
__m64 vsrc = load8888 (*src);
1502
__m64 vdest = expand565 ((__m64)d, 0);
1504
vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0);
1506
*dst = (ullong)vdest;
1517
__m64 vsrc0, vsrc1, vsrc2, vsrc3;
1520
vsrc0 = load8888(*(src + 0));
1521
vsrc1 = load8888(*(src + 1));
1522
vsrc2 = load8888(*(src + 2));
1523
vsrc3 = load8888(*(src + 3));
1525
vdest = *(__m64 *)dst;
1527
vdest = pack565(over(vsrc0, expand_alpha(vsrc0), expand565(vdest, 0)), vdest, 0);
1528
vdest = pack565(over(vsrc1, expand_alpha(vsrc1), expand565(vdest, 1)), vdest, 1);
1529
vdest = pack565(over(vsrc2, expand_alpha(vsrc2), expand565(vdest, 2)), vdest, 2);
1530
vdest = pack565(over(vsrc3, expand_alpha(vsrc3), expand565(vdest, 3)), vdest, 3);
1532
*(__m64 *)dst = vdest;
1543
__m64 vsrc = load8888 (*src);
1545
__m64 vdest = expand565 ((__m64)d, 0);
1547
vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0);
1549
*dst = (ullong)vdest;
1561
fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op,
1562
pixman_image_t * pSrc,
1563
pixman_image_t * pMask,
1564
pixman_image_t * pDst,
1575
uint32_t *dstLine, *dst;
1576
uint8_t *maskLine, *mask;
1577
int dstStride, maskStride;
1584
fbComposeGetSolid(pSrc, src, pDst->bits.format);
1590
srcsrc = (unsigned long long)src << 32 | src;
1592
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1593
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
1595
vsrc = load8888 (src);
1596
vsrca = expand_alpha (vsrc);
1601
dstLine += dstStride;
1603
maskLine += maskStride;
1608
while (w && (unsigned long)dst & 7)
1614
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
1615
*dst = store8888(vdest);
1631
if (srca == 0xff && (m0 & m1) == 0xff)
1633
*(unsigned long long *)dst = srcsrc;
1640
vdest = *(__m64 *)dst;
1642
dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
1643
dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
1645
*(__m64 *)dst = pack8888(dest0, dest1);
1661
__m64 vdest = load8888(*dst);
1662
vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
1663
*dst = store8888(vdest);
1676
pixman_fill_mmx (uint32_t *bits,
1687
uint32_t byte_width;
1690
__m64 v1, v2, v3, v4, v5, v6, v7;
1693
if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
1696
if (bpp != 16 && bpp != 32)
1701
stride = stride * sizeof (uint32_t) / 2;
1702
byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1703
byte_width = 2 * width;
1708
stride = stride * sizeof (uint32_t) / 4;
1709
byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1710
byte_width = 4 * width;
1714
fill = ((ullong)xor << 32) | xor;
1715
vfill = (__m64)fill;
1726
: "=y" (v1), "=y" (v2), "=y" (v3),
1727
"=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
1734
uint8_t *d = byte_line;
1735
byte_line += stride;
1738
while (w >= 2 && ((unsigned long)d & 3))
1740
*(uint16_t *)d = xor;
1745
while (w >= 4 && ((unsigned long)d & 7))
1747
*(uint32_t *)d = xor;
1767
"y" (vfill), "y" (v1), "y" (v2), "y" (v3),
1768
"y" (v4), "y" (v5), "y" (v6), "y" (v7)
1771
*(__m64*) (d + 0) = vfill;
1772
*(__m64*) (d + 8) = vfill;
1773
*(__m64*) (d + 16) = vfill;
1774
*(__m64*) (d + 24) = vfill;
1775
*(__m64*) (d + 32) = vfill;
1776
*(__m64*) (d + 40) = vfill;
1777
*(__m64*) (d + 48) = vfill;
1778
*(__m64*) (d + 56) = vfill;
1786
*(uint32_t *)d = xor;
1793
*(uint16_t *)d = xor;
1804
fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op,
1805
pixman_image_t * pSrc,
1806
pixman_image_t * pMask,
1807
pixman_image_t * pDst,
1818
uint32_t *dstLine, *dst;
1819
uint8_t *maskLine, *mask;
1820
int dstStride, maskStride;
1827
fbComposeGetSolid(pSrc, src, pDst->bits.format);
1832
pixman_fill_mmx (pDst->bits.bits, pDst->bits.rowstride, PIXMAN_FORMAT_BPP (pDst->bits.format),
1833
xDst, yDst, width, height, 0);
1837
srcsrc = (ullong)src << 32 | src;
1839
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
1840
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
1842
vsrc = load8888 (src);
1843
vsrca = expand_alpha (vsrc);
1848
dstLine += dstStride;
1850
maskLine += maskStride;
1855
while (w && (unsigned long)dst & 7)
1861
__m64 vdest = in(vsrc, expand_alpha_rev ((__m64)m));
1862
*dst = store8888(vdest);
1882
if (srca == 0xff && (m0 & m1) == 0xff)
1884
*(ullong *)dst = srcsrc;
1891
vdest = *(__m64 *)dst;
1893
dest0 = in(vsrc, expand_alpha_rev ((__m64)m0));
1894
dest1 = in(vsrc, expand_alpha_rev ((__m64)m1));
1896
*(__m64 *)dst = pack8888(dest0, dest1);
1916
__m64 vdest = load8888(*dst);
1917
vdest = in(vsrc, expand_alpha_rev ((__m64)m));
1918
*dst = store8888(vdest);
1935
fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op,
1936
pixman_image_t * pSrc,
1937
pixman_image_t * pMask,
1938
pixman_image_t * pDst,
1949
uint16_t *dstLine, *dst;
1950
uint8_t *maskLine, *mask;
1951
int dstStride, maskStride;
1954
unsigned long long srcsrcsrcsrc, src16;
1958
fbComposeGetSolid(pSrc, src, pDst->bits.format);
1964
fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
1965
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
1967
vsrc = load8888 (src);
1968
vsrca = expand_alpha (vsrc);
1970
src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
1972
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
1973
(ullong)src16 << 16 | (ullong)src16;
1978
dstLine += dstStride;
1980
maskLine += maskStride;
1985
while (w && (unsigned long)dst & 7)
1992
__m64 vd = (__m64)d;
1993
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1994
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
2006
ullong m0, m1, m2, m3;
2012
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2014
*(unsigned long long *)dst = srcsrcsrcsrc;
2016
else if (m0 | m1 | m2 | m3)
2019
__m64 vm0, vm1, vm2, vm3;
2021
vdest = *(__m64 *)dst;
2024
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
2026
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
2028
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
2030
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
2032
*(__m64 *)dst = vdest;
2049
__m64 vd = (__m64)d;
2050
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
2051
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
2064
fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op,
2065
pixman_image_t * pSrc,
2066
pixman_image_t * pMask,
2067
pixman_image_t * pDst,
2077
uint16_t *dstLine, *dst;
2078
uint32_t *srcLine, *src;
2079
int dstStride, srcStride;
2084
fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2085
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2089
assert (pSrc->pDrawable == pMask->pDrawable);
2095
dstLine += dstStride;
2097
srcLine += srcStride;
2102
while (w && (unsigned long)dst & 7)
2104
__m64 vsrc = load8888 (*src);
2106
__m64 vdest = expand565 ((__m64)d, 0);
2108
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
2110
*dst = (ullong)vdest;
2121
uint32_t s0, s1, s2, s3;
2122
unsigned char a0, a1, a2, a3;
2134
if ((a0 & a1 & a2 & a3) == 0xFF)
2137
vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
2138
vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
2139
vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
2140
vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
2142
*(__m64 *)dst = vdest;
2144
else if (a0 | a1 | a2 | a3)
2146
__m64 vdest = *(__m64 *)dst;
2148
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
2149
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
2150
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
2151
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
2153
*(__m64 *)dst = vdest;
2165
__m64 vsrc = load8888 (*src);
2167
__m64 vdest = expand565 ((__m64)d, 0);
2169
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
2171
*dst = (ullong)vdest;
2182
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
2185
fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op,
2186
pixman_image_t * pSrc,
2187
pixman_image_t * pMask,
2188
pixman_image_t * pDst,
2198
uint32_t *dstLine, *dst;
2199
uint32_t *srcLine, *src;
2200
int dstStride, srcStride;
2205
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2206
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2210
assert (pSrc->pDrawable == pMask->pDrawable);
2216
dstLine += dstStride;
2218
srcLine += srcStride;
2221
while (w && (unsigned long)dst & 7)
2223
__m64 s = load8888 (*src);
2224
__m64 d = load8888 (*dst);
2226
*dst = store8888 (over_rev_non_pre (s, d));
2236
unsigned char a0, a1;
2245
if ((a0 & a1) == 0xFF)
2247
d0 = invert_colors(load8888(s0));
2248
d1 = invert_colors(load8888(s1));
2250
*(__m64 *)dst = pack8888 (d0, d1);
2254
__m64 vdest = *(__m64 *)dst;
2256
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
2257
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
2259
*(__m64 *)dst = pack8888 (d0, d1);
2269
__m64 s = load8888 (*src);
2270
__m64 d = load8888 (*dst);
2272
*dst = store8888 (over_rev_non_pre (s, d));
2284
fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op,
2285
pixman_image_t * pSrc,
2286
pixman_image_t * pMask,
2287
pixman_image_t * pDst,
2300
int dstStride, maskStride;
2305
fbComposeGetSolid(pSrc, src, pDst->bits.format);
2311
fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2312
fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
2314
vsrc = load8888 (src);
2315
vsrca = expand_alpha (vsrc);
2320
uint32_t *p = (uint32_t *)maskLine;
2321
uint16_t *q = (uint16_t *)dstLine;
2323
while (twidth && ((unsigned long)q & 7))
2325
uint32_t m = *(uint32_t *)p;
2330
__m64 vdest = expand565 ((__m64)d, 0);
2331
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2342
uint32_t m0, m1, m2, m3;
2349
if ((m0 | m1 | m2 | m3))
2351
__m64 vdest = *(__m64 *)q;
2353
vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
2354
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
2355
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
2356
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
2358
*(__m64 *)q = vdest;
2373
__m64 vdest = expand565((__m64)d, 0);
2374
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
2383
maskLine += maskStride;
2384
dstLine += dstStride;
2391
fbCompositeIn_nx8x8mmx (pixman_op_t op,
2392
pixman_image_t * pSrc,
2393
pixman_image_t * pMask,
2394
pixman_image_t * pDst,
2404
uint8_t *dstLine, *dst;
2405
uint8_t *maskLine, *mask;
2406
int dstStride, maskStride;
2412
fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
2413
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
2415
fbComposeGetSolid(pSrc, src, pDst->bits.format);
2421
vsrc = load8888(src);
2422
vsrca = expand_alpha(vsrc);
2427
dstLine += dstStride;
2429
maskLine += maskStride;
2432
if ((((unsigned long)pDst & 3) == 0) &&
2433
(((unsigned long)pSrc & 3) == 0))
2443
vmask = load8888 (*(uint32_t *)mask);
2444
vdest = load8888 (*(uint32_t *)dst);
2446
*(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
2464
m = FbInU (sa, 0, a, tmp);
2465
r = FbInU (m, 0, d, tmp);
2475
fbCompositeIn_8x8mmx (pixman_op_t op,
2476
pixman_image_t * pSrc,
2477
pixman_image_t * pMask,
2478
pixman_image_t * pDst,
2488
uint8_t *dstLine, *dst;
2489
uint8_t *srcLine, *src;
2490
int srcStride, dstStride;
2493
fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
2494
fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
2499
dstLine += dstStride;
2501
srcLine += srcStride;
2504
if ((((unsigned long)pDst & 3) == 0) &&
2505
(((unsigned long)pSrc & 3) == 0))
2509
uint32_t *s = (uint32_t *)src;
2510
uint32_t *d = (uint32_t *)dst;
2512
*d = store8888 (in (load8888 (*s), load8888 (*d)));
2528
*dst = FbInU (s, 0, d, tmp);
2539
fbCompositeSrcAdd_8888x8x8mmx (pixman_op_t op,
2540
pixman_image_t * pSrc,
2541
pixman_image_t * pMask,
2542
pixman_image_t * pDst,
2552
uint8_t *dstLine, *dst;
2553
uint8_t *maskLine, *mask;
2554
int dstStride, maskStride;
2560
fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
2561
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
2563
fbComposeGetSolid(pSrc, src, pDst->bits.format);
2569
vsrc = load8888(src);
2570
vsrca = expand_alpha(vsrc);
2575
dstLine += dstStride;
2577
maskLine += maskStride;
2580
if ((((unsigned long)pMask & 3) == 0) &&
2581
(((unsigned long)pDst & 3) == 0))
2585
__m64 vmask = load8888 (*(uint32_t *)mask);
2586
__m64 vdest = load8888 (*(uint32_t *)dst);
2588
*(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
2606
m = FbInU (sa, 0, a, tmp);
2607
r = FbAdd (m, d, 0, tmp);
2617
fbCompositeSrcAdd_8000x8000mmx (pixman_op_t op,
2618
pixman_image_t * pSrc,
2619
pixman_image_t * pMask,
2620
pixman_image_t * pDst,
2630
uint8_t *dstLine, *dst;
2631
uint8_t *srcLine, *src;
2632
int dstStride, srcStride;
2639
fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
2640
fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
2645
dstLine += dstStride;
2647
srcLine += srcStride;
2650
while (w && (unsigned long)dst & 7)
2655
s = t | (0 - (t >> 8));
2665
*(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
2676
s = t | (0 - (t >> 8));
2689
fbCompositeSrcAdd_8888x8888mmx (pixman_op_t op,
2690
pixman_image_t * pSrc,
2691
pixman_image_t * pMask,
2692
pixman_image_t * pDst,
2702
uint32_t *dstLine, *dst;
2703
uint32_t *srcLine, *src;
2704
int dstStride, srcStride;
2709
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2710
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2715
dstLine += dstStride;
2717
srcLine += srcStride;
2720
while (w && (unsigned long)dst & 7)
2722
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
2723
_mm_cvtsi32_si64(*dst)));
2731
*(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
2739
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
2740
_mm_cvtsi32_si64(*dst)));
2749
pixman_blt_mmx (uint32_t *src_bits,
2755
int src_x, int src_y,
2756
int dst_x, int dst_y,
2757
int width, int height)
2759
uint8_t * src_bytes;
2760
uint8_t * dst_bytes;
2763
if (src_bpp != dst_bpp)
2768
src_stride = src_stride * sizeof (uint32_t) / 2;
2769
dst_stride = dst_stride * sizeof (uint32_t) / 2;
2770
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
2771
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
2772
byte_width = 2 * width;
2775
} else if (src_bpp == 32) {
2776
src_stride = src_stride * sizeof (uint32_t) / 4;
2777
dst_stride = dst_stride * sizeof (uint32_t) / 4;
2778
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
2779
dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
2780
byte_width = 4 * width;
2790
uint8_t *s = src_bytes;
2791
uint8_t *d = dst_bytes;
2792
src_bytes += src_stride;
2793
dst_bytes += dst_stride;
2796
while (w >= 2 && ((unsigned long)d & 3))
2798
*(uint16_t *)d = *(uint16_t *)s;
2804
while (w >= 4 && ((unsigned long)d & 7))
2806
*(uint32_t *)d = *(uint32_t *)s;
2817
"movq (%1), %%mm0\n"
2818
"movq 8(%1), %%mm1\n"
2819
"movq 16(%1), %%mm2\n"
2820
"movq 24(%1), %%mm3\n"
2821
"movq 32(%1), %%mm4\n"
2822
"movq 40(%1), %%mm5\n"
2823
"movq 48(%1), %%mm6\n"
2824
"movq 56(%1), %%mm7\n"
2826
"movq %%mm0, (%0)\n"
2827
"movq %%mm1, 8(%0)\n"
2828
"movq %%mm2, 16(%0)\n"
2829
"movq %%mm3, 24(%0)\n"
2830
"movq %%mm4, 32(%0)\n"
2831
"movq %%mm5, 40(%0)\n"
2832
"movq %%mm6, 48(%0)\n"
2833
"movq %%mm7, 56(%0)\n"
2837
"%mm0", "%mm1", "%mm2", "%mm3",
2838
"%mm4", "%mm5", "%mm6", "%mm7");
2840
__m64 v0 = *(__m64 *)(s + 0);
2841
__m64 v1 = *(__m64 *)(s + 8);
2842
__m64 v2 = *(__m64 *)(s + 16);
2843
__m64 v3 = *(__m64 *)(s + 24);
2844
__m64 v4 = *(__m64 *)(s + 32);
2845
__m64 v5 = *(__m64 *)(s + 40);
2846
__m64 v6 = *(__m64 *)(s + 48);
2847
__m64 v7 = *(__m64 *)(s + 56);
2848
*(__m64 *)(d + 0) = v0;
2849
*(__m64 *)(d + 8) = v1;
2850
*(__m64 *)(d + 16) = v2;
2851
*(__m64 *)(d + 24) = v3;
2852
*(__m64 *)(d + 32) = v4;
2853
*(__m64 *)(d + 40) = v5;
2854
*(__m64 *)(d + 48) = v6;
2855
*(__m64 *)(d + 56) = v7;
2864
*(uint32_t *)d = *(uint32_t *)s;
2872
*(uint16_t *)d = *(uint16_t *)s;
2885
fbCompositeCopyAreammx (pixman_op_t op,
2886
pixman_image_t * pSrc,
2887
pixman_image_t * pMask,
2888
pixman_image_t * pDst,
2898
pixman_blt_mmx (pSrc->bits.bits,
2900
pSrc->bits.rowstride,
2901
pDst->bits.rowstride,
2902
PIXMAN_FORMAT_BPP (pSrc->bits.format),
2903
PIXMAN_FORMAT_BPP (pDst->bits.format),
2904
xSrc, ySrc, xDst, yDst, width, height);
2908
fbCompositeOver_x888x8x8888mmx (pixman_op_t op,
2909
pixman_image_t * pSrc,
2910
pixman_image_t * pMask,
2911
pixman_image_t * pDst,
2921
uint32_t *src, *srcLine;
2922
uint32_t *dst, *dstLine;
2923
uint8_t *mask, *maskLine;
2924
int srcStride, maskStride, dstStride;
2929
fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2930
fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
2931
fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2936
srcLine += srcStride;
2938
dstLine += dstStride;
2940
maskLine += maskStride;
2950
__m64 s = load8888 (*src | 0xff000000);
2953
*dst = store8888 (s);
2956
__m64 sa = expand_alpha (s);
2957
__m64 vm = expand_alpha_rev ((__m64)m);
2958
__m64 vdest = in_over(s, sa, vm, load8888 (*dst));
2960
*dst = store8888 (vdest);
2975
#endif /* USE_MMX */