 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 * Based on work by Owen Taylor
#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
asm ("pmovmskb %1, %0\n\t"
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
asm ("pmulhuw %1, %0\n\t"
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
asm ("pshufw %2, %1, %0\n\t"
: "y" (__A), "K" (__N)
# define _mm_shuffle_pi16(A, N) \
asm ("pshufw %2, %1, %0\n\t" \
: "y" (A), "K" ((const int8_t)N) \
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
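/* e.g. _MM_SHUFFLE (3, 3, 3, 3) == 0xff, which broadcasts 16-bit lane 3
 * (the alpha lane of an expanded pixel) into all four lanes. */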
/* Notes about writing mmx code
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * _mm_mullo_pi16 (x, mmx_constant);
 * _mm_mullo_pi16 (mmx_constant, x);
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
/* --------------- MMX primitives ------------------------------------- */
/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
# define M64_MEMBER m64_u64
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
# define USE_CVT_INTRINSICS
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
# define M64_MEMBER l_
#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
typedef __m64 mmxdatafield;
mmxdatafield mmx_4x00ff;
mmxdatafield mmx_4x0080;
mmxdatafield mmx_565_rgb;
mmxdatafield mmx_565_unpack_multiplier;
mmxdatafield mmx_565_pack_multiplier;
mmxdatafield mmx_565_r;
mmxdatafield mmx_565_g;
mmxdatafield mmx_565_b;
mmxdatafield mmx_packed_565_rb;
mmxdatafield mmx_packed_565_g;
mmxdatafield mmx_expand_565_g;
mmxdatafield mmx_expand_565_b;
mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
mmxdatafield mmx_mask_0;
mmxdatafield mmx_mask_1;
mmxdatafield mmx_mask_2;
mmxdatafield mmx_mask_3;
mmxdatafield mmx_full_alpha;
mmxdatafield mmx_4x0101;
mmxdatafield mmx_ff000000;
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
static const mmx_data_t c =
MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
# define MC(x) c.mmx_ ## x
static force_inline __m64
#ifdef USE_CVT_INTRINSICS
return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
#elif defined USE_M64_DOUBLE
#else /* USE_M64_CASTS */
static force_inline uint64_t
#ifdef USE_CVT_INTRINSICS
return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
uint64_t res = x.M64_MEMBER;
#elif defined USE_M64_DOUBLE
return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
static force_inline __m64
return _mm_slli_si64 (v, s);
return _mm_srli_si64 (v, -s);
static force_inline __m64
return _mm_xor_si64 (mask, MC (4x00ff));
/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 * prod(a, b) = (a * b + 128) / 255.
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
 * 3DNow!, and unavailable at the time of the book's publication) to perform
 * the multiplication and right shift in a single operation.
 * prod(a, b) = ((a * b + 128) * 257) >> 16.
 * A third way (how pix_multiply() was implemented prior to 14208344) exists
 * also that performs the multiplication by 257 with adds and shifts.
 * Where temp = a * b + 128
 * prod(a, b) = (temp + (temp >> 8)) >> 8.
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
res = _mm_mullo_pi16 (a, b);
res = _mm_adds_pu16 (res, MC (4x0080));
res = _mm_mulhi_pu16 (res, MC (4x0101));
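/* A scalar reference for the formula above (an illustrative sketch, not
 * part of pixman): for 8-bit a and b, ((a * b + 128) * 257) >> 16 is the
 * same rounding as the scalar MUL_UN8 macro used later in this file.
 * e.g. a = b = 255: temp = 65153 and (65153 * 257) >> 16 = 255. */
static inline uint8_t
scalar_mul_un8 (uint8_t a, uint8_t b)
{
    uint32_t temp = (uint32_t)a * b + 128;
    return (uint8_t)((temp * 257) >> 16);
}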
static force_inline __m64
pix_add (__m64 a, __m64 b)
return _mm_adds_pu8 (a, b);
static force_inline __m64
expand_alpha (__m64 pixel)
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
static force_inline __m64
expand_alpha_rev (__m64 pixel)
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
static force_inline __m64
invert_colors (__m64 pixel)
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
static force_inline __m64
return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
__m64 srca = expand_alpha (src);
__m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
static force_inline __m64
in (__m64 src, __m64 mask)
return pix_multiply (src, mask);
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
return over (in (src, mask), pix_multiply (srca, mask), dest);
#define in_over(src, srca, mask, dest) \
over (in (src, mask), pix_multiply (srca, mask), dest)
/* Elemental unaligned loads */
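/* The wrappers below read values that may not be naturally aligned, using
 * the cheapest idiom per target: a plain load on x86 (which tolerates
 * unaligned accesses), _mm_align_si64 on two aligned loads for iwMMXt,
 * and a packed struct elsewhere so the compiler emits an unaligned-safe
 * access. */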
static force_inline __m64 ldq_u(__m64 *p)
/* x86's alignment restrictions are very relaxed. */
#elif defined USE_ARM_IWMMXT
int align = (uintptr_t)p & 7;
aligned_p = (__m64 *)((uintptr_t)p & ~7);
return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
struct __una_u64 { __m64 x __attribute__((packed)); };
const struct __una_u64 *ptr = (const struct __una_u64 *) p;
return (__m64) ptr->x;
static force_inline uint32_t ldl_u(const uint32_t *p)
/* x86's alignment restrictions are very relaxed. */
struct __una_u32 { uint32_t x __attribute__((packed)); };
const struct __una_u32 *ptr = (const struct __una_u32 *) p;
static force_inline __m64
load (const uint32_t *v)
#ifdef USE_LOONGSON_MMI
asm ("lwc1 %0, %1\n\t"
return _mm_cvtsi32_si64 (*v);
static force_inline __m64
load8888 (const uint32_t *v)
#ifdef USE_LOONGSON_MMI
return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
static force_inline __m64
load8888u (const uint32_t *v)
uint32_t l = ldl_u (v);
return load8888 (&l);
static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
return _mm_packs_pu16 (lo, hi);
static force_inline void
store (uint32_t *dest, __m64 v)
#ifdef USE_LOONGSON_MMI
asm ("swc1 %1, %0\n\t"
*dest = _mm_cvtsi64_si32 (v);
static force_inline void
store8888 (uint32_t *dest, __m64 v)
v = pack8888 (v, _mm_setzero_si64 ());
static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
#ifdef USE_LOONGSON_MMI
/* __m64 is double, we can compare directly. */
return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
static force_inline pixman_bool_t
#ifdef USE_LOONGSON_MMI
return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
__m64 ffs = _mm_cmpeq_pi8 (v, v);
return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
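/* In an expanded pixel the alpha value sits in the low byte of lane 3
 * (byte 6), so bit 6 of the byte mask is the alpha == 0xff test. */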
static force_inline pixman_bool_t
return is_equal (v, _mm_setzero_si64 ());
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 * --- Expanding 565 in the low word ---
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
static force_inline __m64
expand565 (__m64 pixel, int pos)
/* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
p = loongson_extract_pi16 (p, pos);
p = shift (shift (p, (3 - pos) * 16), -48);
t1 = shift (p, 36 - 11);
t2 = shift (p, 16 - 5);
p = _mm_or_si64 (t1, p);
p = _mm_or_si64 (t2, p);
p = _mm_and_si64 (p, MC (565_rgb));
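/* After this multiply each 16-bit lane holds its channel in the top byte
 * with the channel's high bits replicated below it, so the final shift
 * right by 8 yields bit-replicated 8-bit channels. */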
pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
return _mm_srli_pi16 (pixel, 8);
/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
__m64 t0, t1, alpha = _mm_setzero_si64 ();
__m64 r = _mm_and_si64 (vin, MC (expand_565_r));
__m64 g = _mm_and_si64 (vin, MC (expand_565_g));
__m64 b = _mm_and_si64 (vin, MC (expand_565_b));
alpha = _mm_cmpeq_pi32 (alpha, alpha);
/* Replicate high bits into empty low bits. */
r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
*vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
*vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
static force_inline __m64
expand8888 (__m64 in, int pos)
return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
static force_inline __m64
expandx888 (__m64 in, int pos)
return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
expand_4xpacked565 (vin, &v0, &v1, full_alpha);
*vout0 = expand8888 (v0, 0);
*vout1 = expand8888 (v0, 1);
*vout2 = expand8888 (v1, 0);
*vout3 = expand8888 (v1, 1);
static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
r = _mm_and_si64 (p, MC (565_r));
g = _mm_and_si64 (p, MC (565_g));
b = _mm_and_si64 (p, MC (565_b));
#ifdef USE_LOONGSON_MMI
r = shift (r, -(32 - 8));
g = shift (g, -(16 - 3));
b = shift (b, -(0 + 3));
p = _mm_or_si64 (r, g);
p = _mm_or_si64 (p, b);
return loongson_insert_pi16 (t, p, pos);
r = shift (r, -(32 - 8) + pos * 16);
g = shift (g, -(16 - 3) + pos * 16);
b = shift (b, -(0 + 3) + pos * 16);
t = _mm_and_si64 (t, MC (mask_0));
t = _mm_and_si64 (t, MC (mask_1));
t = _mm_and_si64 (t, MC (mask_2));
t = _mm_and_si64 (t, MC (mask_3));
p = _mm_or_si64 (r, t);
p = _mm_or_si64 (g, p);
return _mm_or_si64 (b, p);
static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
__m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
__m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
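/* _mm_madd_pi16 scales the red and blue fields of each pixel and sums
 * them, so that once the green field is OR'ed in below, every 32-bit
 * lane holds its 565 value shifted left by 5. */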
__m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
__m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
__m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
__m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
t0 = _mm_or_si64 (t0, g0);
t1 = _mm_or_si64 (t1, g1);
#ifdef USE_ARM_IWMMXT
return _mm_packs_pu32 (t0, t1);
t1 = shift(t1, -5 + 16);
return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
x = pix_multiply (x, a);
y = pix_multiply (y, b);
return pix_add (x, y);
/* MSVC only handles a "pass by register" of up to three SSE intrinsics */
#define pack_4x565(v0, v1, v2, v3) \
pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
#define pix_add_mul(x, a, y, b) \
( x = pix_multiply (x, a), \
y = pix_multiply (y, b), \
/* --------------- MMX code paths for fbcompose.c --------------------- */
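/* combine() fetches one source pixel and, when a mask is present,
 * multiplies it by the mask's expanded alpha; the "_u" combiners below
 * are all built on this helper. */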
static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
__m64 vsrc = load8888 (src);
__m64 m = load8888 (mask);
m = expand_alpha (m);
vsrc = pix_multiply (vsrc, m);
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
if (is_opaque (vsrc))
else if (!is_zero (vsrc))
return over (vsrc, expand_alpha (vsrc),
_mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
mmx_combine_over_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 vsrc = combine (src, mask);
if (is_opaque (vsrc))
store8888 (dest, vsrc);
else if (!is_zero (vsrc))
__m64 sa = expand_alpha (vsrc);
store8888 (dest, over (vsrc, sa, load8888 (dest)));
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 s = combine (src, mask);
da = expand_alpha (d);
store8888 (dest, over (d, da, s));
mmx_combine_in_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 x = combine (src, mask);
a = expand_alpha (a);
x = pix_multiply (x, a);
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 a = combine (src, mask);
a = expand_alpha (a);
x = pix_multiply (x, a);
mmx_combine_out_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 x = combine (src, mask);
a = expand_alpha (a);
x = pix_multiply (x, a);
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 a = combine (src, mask);
a = expand_alpha (a);
x = pix_multiply (x, a);
mmx_combine_atop_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 s = combine (src, mask);
sia = expand_alpha (s);
da = expand_alpha (d);
s = pix_add_mul (s, da, d, sia);
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
__m64 s = combine (src, mask);
sa = expand_alpha (s);
dia = expand_alpha (d);
s = pix_add_mul (s, dia, d, sa);
mmx_combine_xor_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 s = combine (src, mask);
sia = expand_alpha (s);
dia = expand_alpha (d);
s = pix_add_mul (s, dia, d, sia);
store8888 (dest, s);
mmx_combine_add_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 s = combine (src, mask);
d = load8888 (dest);
store8888 (dest, s);
mmx_combine_saturate_u (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = dest + width;
__m64 ms = combine (src, mask);
__m64 md = load8888 (dest);
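/* When the source alpha sa exceeds the destination's remaining alpha
 * headroom da, scale the source by da/sa so the addition below
 * saturates at exactly 1.0. */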
uint32_t quot = DIV_UN8 (da, sa) << 24;
__m64 msa = load8888 (&quot);
msa = expand_alpha (msa);
ms = pix_multiply (ms, msa);
md = pix_add (md, ms);
store8888 (dest, md);
mmx_combine_src_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
s = pix_multiply (s, a);
store8888 (dest, s);
mmx_combine_over_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 sa = expand_alpha (s);
store8888 (dest, in_over (s, sa, a, d));
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
store8888 (dest, over (d, da, in (s, a)));
mmx_combine_in_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
s = pix_multiply (s, a);
s = pix_multiply (s, da);
store8888 (dest, s);
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 sa = expand_alpha (s);
a = pix_multiply (a, sa);
d = pix_multiply (d, a);
store8888 (dest, d);
mmx_combine_out_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
s = pix_multiply (s, a);
s = pix_multiply (s, da);
store8888 (dest, s);
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 sa = expand_alpha (s);
a = pix_multiply (a, sa);
d = pix_multiply (d, a);
store8888 (dest, d);
mmx_combine_atop_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
__m64 sa = expand_alpha (s);
s = pix_multiply (s, a);
a = pix_multiply (a, sa);
d = pix_add_mul (d, a, s, da);
store8888 (dest, d);
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
__m64 sa = expand_alpha (s);
s = pix_multiply (s, a);
a = pix_multiply (a, sa);
d = pix_add_mul (d, a, s, da);
store8888 (dest, d);
mmx_combine_xor_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
__m64 da = expand_alpha (d);
__m64 sa = expand_alpha (s);
s = pix_multiply (s, a);
a = pix_multiply (a, sa);
d = pix_add_mul (d, a, s, da);
store8888 (dest, d);
mmx_combine_add_ca (pixman_implementation_t *imp,
const uint32_t * src,
const uint32_t * mask,
const uint32_t *end = src + width;
__m64 a = load8888 (mask);
__m64 s = load8888 (src);
__m64 d = load8888 (dest);
s = pix_multiply (s, a);
store8888 (dest, d);
/* ------------- MMX code paths called from fbpict.c -------------------- */
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
dst_line += dst_stride;
while (w && (uintptr_t)dst & 7)
store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
vdest = *(__m64 *)dst;
dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
*(__m64 *)dst = pack8888 (dest0, dest1);
store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
dst_line += dst_stride;
while (w && (uintptr_t)dst & 7)
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
*dst = to_uint64 (vdest);
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
v0 = over (vsrc, vsrca, v0);
v1 = over (vsrc, vsrca, v1);
v2 = over (vsrc, vsrca, v2);
v3 = over (vsrc, vsrca, v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
*dst = to_uint64 (vdest);
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *mask_line;
int dst_stride, mask_stride;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
uint32_t *p = (uint32_t *)mask_line;
uint32_t *q = (uint32_t *)dst_line;
while (twidth && (uintptr_t)q & 7)
uint32_t m = *(uint32_t *)p;
__m64 vdest = load8888 (q);
vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
store8888 (q, vdest);
__m64 vdest = *(__m64 *)q;
dest0 = in_over (vsrc, vsrca, load8888 (&m0),
expand8888 (vdest, 0));
dest1 = in_over (vsrc, vsrca, load8888 (&m1),
expand8888 (vdest, 1));
*(__m64 *)q = pack8888 (dest0, dest1);
uint32_t m = *(uint32_t *)p;
__m64 vdest = load8888 (q);
vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
store8888 (q, vdest);
dst_line += dst_stride;
mask_line += mask_stride;
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
vmask = expand_alpha (load8888 (&mask));
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
__m64 s = load8888 (src);
__m64 d = load8888 (dst);
store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
__m64 vs = ldq_u ((__m64 *)src);
__m64 vd = *(__m64 *)dst;
__m64 vsrc0 = expand8888 (vs, 0);
__m64 vsrc1 = expand8888 (vs, 1);
*(__m64 *)dst = pack8888 (
in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
__m64 s = load8888 (src);
__m64 d = load8888 (dst);
store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
vmask = expand_alpha (load8888 (&mask));
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
uint32_t ssrc = *src | 0xff000000;
__m64 s = load8888 (&ssrc);
__m64 d = load8888 (dst);
store8888 (dst, in_over (s, srca, vmask, d));
__m64 vd0 = *(__m64 *)(dst + 0);
__m64 vd1 = *(__m64 *)(dst + 2);
__m64 vd2 = *(__m64 *)(dst + 4);
__m64 vd3 = *(__m64 *)(dst + 6);
__m64 vd4 = *(__m64 *)(dst + 8);
__m64 vd5 = *(__m64 *)(dst + 10);
__m64 vd6 = *(__m64 *)(dst + 12);
__m64 vd7 = *(__m64 *)(dst + 14);
__m64 vs0 = ldq_u ((__m64 *)(src + 0));
__m64 vs1 = ldq_u ((__m64 *)(src + 2));
__m64 vs2 = ldq_u ((__m64 *)(src + 4));
__m64 vs3 = ldq_u ((__m64 *)(src + 6));
__m64 vs4 = ldq_u ((__m64 *)(src + 8));
__m64 vs5 = ldq_u ((__m64 *)(src + 10));
__m64 vs6 = ldq_u ((__m64 *)(src + 12));
__m64 vs7 = ldq_u ((__m64 *)(src + 14));
in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
*(__m64 *)(dst + 0) = vd0;
*(__m64 *)(dst + 2) = vd1;
*(__m64 *)(dst + 4) = vd2;
*(__m64 *)(dst + 6) = vd3;
*(__m64 *)(dst + 8) = vd4;
*(__m64 *)(dst + 10) = vd5;
*(__m64 *)(dst + 12) = vd6;
*(__m64 *)(dst + 14) = vd7;
uint32_t ssrc = *src | 0xff000000;
__m64 s = load8888 (&ssrc);
__m64 d = load8888 (dst);
store8888 (dst, in_over (s, srca, vmask, d));
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
dst_line += dst_stride;
src_line += src_stride;
sa = expand_alpha (ms);
store8888 (dst, over (ms, sa, load8888 (dst)));
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
assert (src_image->drawable == mask_image->drawable);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
__m64 vsrc = load8888 (src);
__m64 vdest = expand565 (to_m64 (d), 0);
over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
*dst = to_uint64 (vdest);
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
__m64 vsrc0, vsrc1, vsrc2, vsrc3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
vsrc0 = load8888 ((src + 0));
vsrc1 = load8888 ((src + 1));
vsrc2 = load8888 ((src + 2));
vsrc3 = load8888 ((src + 3));
v0 = over (vsrc0, expand_alpha (vsrc0), v0);
v1 = over (vsrc1, expand_alpha (vsrc1), v1);
v2 = over (vsrc2, expand_alpha (vsrc2), v2);
v3 = over (vsrc3, expand_alpha (vsrc3), v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
__m64 vsrc = load8888 (src);
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
*dst = to_uint64 (vdest);
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
srcsrc = (uint64_t)src << 32 | src;
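/* Two copies of the solid source packed into one 64-bit word, so fully
 * opaque pixel pairs can be written with a single store. */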
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
dst_line += dst_stride;
mask_line += mask_stride;
while (w && (uintptr_t)dst & 7)
__m64 vdest = in_over (vsrc, vsrca,
expand_alpha_rev (to_m64 (m)),
store8888 (dst, vdest);
if (srca == 0xff && (m0 & m1) == 0xff)
*(uint64_t *)dst = srcsrc;
vdest = *(__m64 *)dst;
dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
expand8888 (vdest, 0));
dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
expand8888 (vdest, 1));
*(__m64 *)dst = pack8888 (dest0, dest1);
__m64 vdest = load8888 (dst);
vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
store8888 (dst, vdest);
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
uint32_t byte_width;
#if defined __GNUC__ && defined USE_X86_MMX
__m64 v1, v2, v3, v4, v5, v6, v7;
if (bpp != 16 && bpp != 32 && bpp != 8)
stride = stride * (int) sizeof (uint32_t) / 1;
byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
filler = (filler & 0xff) * 0x01010101;
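/* e.g. an 8 bpp filler of 0xab becomes 0xabababab, so the same 32-bit
 * (and, widened below, 64-bit) stores serve all three depths. */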
stride = stride * (int) sizeof (uint32_t) / 2;
byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
byte_width = 2 * width;
filler = (filler & 0xffff) * 0x00010001;
stride = stride * (int) sizeof (uint32_t) / 4;
byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
byte_width = 4 * width;
fill = ((uint64_t)filler << 32) | filler;
vfill = to_m64 (fill);
#if defined __GNUC__ && defined USE_X86_MMX
: "=&y" (v1), "=&y" (v2), "=&y" (v3),
"=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
uint8_t *d = byte_line;
byte_line += stride;
if (w >= 1 && ((uintptr_t)d & 1))
*(uint8_t *)d = (filler & 0xff);
if (w >= 2 && ((uintptr_t)d & 3))
*(uint16_t *)d = filler;
while (w >= 4 && ((uintptr_t)d & 7))
*(uint32_t *)d = filler;
#if defined __GNUC__ && defined USE_X86_MMX
"y" (vfill), "y" (v1), "y" (v2), "y" (v3),
"y" (v4), "y" (v5), "y" (v6), "y" (v7)
*(__m64*) (d + 0) = vfill;
*(__m64*) (d + 8) = vfill;
*(__m64*) (d + 16) = vfill;
*(__m64*) (d + 24) = vfill;
*(__m64*) (d + 32) = vfill;
*(__m64*) (d + 40) = vfill;
*(__m64*) (d + 48) = vfill;
*(__m64*) (d + 56) = vfill;
*(uint32_t *)d = filler;
*(uint16_t *)d = filler;
*(uint8_t *)d = (filler & 0xff);
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
uint32_t *src_line, *src, s;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
*dst = convert_8888_to_0565 (s);
__m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
__m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
vdest = pack_4xpacked565 (vsrc0, vsrc1);
*(__m64 *)dst = vdest;
*dst = convert_8888_to_0565 (s);
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
PIXMAN_FORMAT_BPP (dest_image->bits.format),
dest_x, dest_y, width, height, 0);
srcsrc = (uint64_t)src << 32 | src;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
vsrc = load8888 (&src);
dst_line += dst_stride;
mask_line += mask_stride;
while (w && (uintptr_t)dst & 7)
__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
store8888 (dst, vdest);
if (srca == 0xff && (m0 & m1) == 0xff)
*(uint64_t *)dst = srcsrc;
dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
*(__m64 *)dst = pack8888 (dest0, dest1);
*(uint64_t *)dst = 0;
__m64 vdest = load8888 (dst);
vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
store8888 (dst, vdest);
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
__m64 vsrc, vsrca, tmp;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
srcsrcsrcsrc = expand_alpha_rev (tmp);
dst_line += dst_stride;
mask_line += mask_stride;
while (w && (uintptr_t)dst & 7)
__m64 vd = to_m64 (d);
__m64 vdest = in_over (
vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
*dst = to_uint64 (vd);
uint64_t m0, m1, m2, m3;
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
*(__m64 *)dst = srcsrcsrcsrc;
else if (m0 | m1 | m2 | m3)
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
__m64 vm0, vm1, vm2, vm3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
__m64 vd = to_m64 (d);
__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
*dst = to_uint64 (vd);
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
assert (src_image->drawable == mask_image->drawable);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
__m64 vsrc = load8888 (src);
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
*dst = to_uint64 (vdest);
uint32_t s0, s1, s2, s3;
unsigned char a0, a1, a2, a3;
if ((a0 & a1 & a2 & a3) == 0xFF)
__m64 v0 = invert_colors (load8888 (&s0));
__m64 v1 = invert_colors (load8888 (&s1));
__m64 v2 = invert_colors (load8888 (&s2));
__m64 v3 = invert_colors (load8888 (&s3));
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
else if (s0 | s1 | s2 | s3)
__m64 vdest = *(__m64 *)dst;
__m64 v0, v1, v2, v3;
__m64 vsrc0 = load8888 (&s0);
__m64 vsrc1 = load8888 (&s1);
__m64 vsrc2 = load8888 (&s2);
__m64 vsrc3 = load8888 (&s3);
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
v0 = over_rev_non_pre (vsrc0, v0);
v1 = over_rev_non_pre (vsrc1, v1);
v2 = over_rev_non_pre (vsrc2, v2);
v3 = over_rev_non_pre (vsrc3, v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
__m64 vsrc = load8888 (src);
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
*dst = to_uint64 (vdest);
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
assert (src_image->drawable == mask_image->drawable);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
__m64 s = load8888 (src);
__m64 d = load8888 (dst);
store8888 (dst, over_rev_non_pre (s, d));
unsigned char a0, a1;
if ((a0 & a1) == 0xFF)
d0 = invert_colors (load8888 (&s0));
d1 = invert_colors (load8888 (&s1));
*(__m64 *)dst = pack8888 (d0, d1);
__m64 vdest = *(__m64 *)dst;
d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
*(__m64 *)dst = pack8888 (d0, d1);
__m64 s = load8888 (src);
__m64 d = load8888 (dst);
store8888 (dst, over_rev_non_pre (s, d));
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *mask_line;
int dst_stride, mask_stride;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
uint32_t *p = (uint32_t *)mask_line;
uint16_t *q = (uint16_t *)dst_line;
while (twidth && ((uintptr_t)q & 7))
uint32_t m = *(uint32_t *)p;
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
*q = to_uint64 (vdest);
uint32_t m0, m1, m2, m3;
if ((m0 | m1 | m2 | m3))
__m64 vdest = *(__m64 *)q;
__m64 v0, v1, v2, v3;
expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
__m64 vdest = expand565 (to_m64 (d), 0);
vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
*q = to_uint64 (vdest);
mask_line += mask_stride;
dst_line += dst_stride;
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
dst_line += dst_stride;
mask_line += mask_stride;
while (w && (uintptr_t)dst & 7)
m = MUL_UN8 (sa, a, tmp);
d = MUL_UN8 (m, d, tmp);
vmask = load8888u ((uint32_t *)mask);
vdest = load8888 ((uint32_t *)dst);
store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
m = MUL_UN8 (sa, a, tmp);
d = MUL_UN8 (m, d, tmp);
mmx_composite_in_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
int src_stride, dst_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 3)
*dst = MUL_UN8 (s, d, tmp);
uint32_t *s = (uint32_t *)src;
uint32_t *d = (uint32_t *)dst;
store8888 (d, in (load8888u (s), load8888 (d)));
*dst = MUL_UN8 (s, d, tmp);
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
vsrc = load8888 (&src);
vsrca = expand_alpha (vsrc);
dst_line += dst_stride;
mask_line += mask_stride;
while (w && (uintptr_t)dst & 3)
m = MUL_UN8 (sa, a, tmp);
r = ADD_UN8 (m, d, tmp);
vmask = load8888u ((uint32_t *)mask);
vdest = load8888 ((uint32_t *)dst);
store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
m = MUL_UN8 (sa, a, tmp);
r = ADD_UN8 (m, d, tmp);
mmx_composite_add_8_8 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
s = t | (0 - (t >> 8));
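/* Scalar saturating add: t holds the full sum, so (0 - (t >> 8)) becomes
 * an all-ones mask exactly when the sum overflows 8 bits, clamping s at
 * 255. */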
*(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
s = t | (0 - (t >> 8));
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst;
uint16_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
s = convert_0565_to_8888 (s);
d = convert_0565_to_8888 (d);
UN8x4_ADD_UN8x4 (s, d);
*dst = convert_8888_to_0565 (s);
__m64 vdest = *(__m64 *)dst;
__m64 vsrc = ldq_u ((__m64 *)src);
expand_4xpacked565 (vdest, &vd0, &vd1, 0);
expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
vd0 = _mm_adds_pu8 (vd0, vs0);
vd1 = _mm_adds_pu8 (vd1, vs1);
*(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
s = convert_0565_to_8888 (s);
d = convert_0565_to_8888 (d);
UN8x4_ADD_UN8x4 (s, d);
*dst = convert_8888_to_0565 (s);
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
dst_line += dst_stride;
src_line += src_stride;
while (w && (uintptr_t)dst & 7)
store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
load ((const uint32_t *)dst)));
*(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
load ((const uint32_t *)dst)));
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
uint32_t * src_bits,
uint32_t * dst_bits,
uint8_t * src_bytes;
uint8_t * dst_bytes;
if (src_bpp != dst_bpp)
src_stride = src_stride * (int) sizeof (uint32_t) / 2;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 2 * width;
else if (src_bpp == 32)
src_stride = src_stride * (int) sizeof (uint32_t) / 4;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 4 * width;
uint8_t *s = src_bytes;
uint8_t *d = dst_bytes;
src_bytes += src_stride;
dst_bytes += dst_stride;
if (w >= 1 && ((uintptr_t)d & 1))
*(uint8_t *)d = *(uint8_t *)s;
if (w >= 2 && ((uintptr_t)d & 3))
*(uint16_t *)d = *(uint16_t *)s;
while (w >= 4 && ((uintptr_t)d & 7))
*(uint32_t *)d = ldl_u ((uint32_t *)s);
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
"movq (%1), %%mm0\n"
"movq 8(%1), %%mm1\n"
"movq 16(%1), %%mm2\n"
"movq 24(%1), %%mm3\n"
"movq 32(%1), %%mm4\n"
"movq 40(%1), %%mm5\n"
"movq 48(%1), %%mm6\n"
"movq 56(%1), %%mm7\n"
"movq %%mm0, (%0)\n"
"movq %%mm1, 8(%0)\n"
"movq %%mm2, 16(%0)\n"
"movq %%mm3, 24(%0)\n"
"movq %%mm4, 32(%0)\n"
"movq %%mm5, 40(%0)\n"
"movq %%mm6, 48(%0)\n"
"movq %%mm7, 56(%0)\n"
"%mm0", "%mm1", "%mm2", "%mm3",
"%mm4", "%mm5", "%mm6", "%mm7");
__m64 v0 = ldq_u ((__m64 *)(s + 0));
__m64 v1 = ldq_u ((__m64 *)(s + 8));
__m64 v2 = ldq_u ((__m64 *)(s + 16));
__m64 v3 = ldq_u ((__m64 *)(s + 24));
__m64 v4 = ldq_u ((__m64 *)(s + 32));
__m64 v5 = ldq_u ((__m64 *)(s + 40));
__m64 v6 = ldq_u ((__m64 *)(s + 48));
__m64 v7 = ldq_u ((__m64 *)(s + 56));
*(__m64 *)(d + 0) = v0;
*(__m64 *)(d + 8) = v1;
*(__m64 *)(d + 16) = v2;
*(__m64 *)(d + 24) = v3;
*(__m64 *)(d + 32) = v4;
*(__m64 *)(d + 40) = v5;
*(__m64 *)(d + 48) = v6;
*(__m64 *)(d + 56) = v7;
*(uint32_t *)d = ldl_u ((uint32_t *)s);
*(uint16_t *)d = *(uint16_t *)s;
mmx_composite_copy_area (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
mmx_blt (imp, src_image->bits.bits,
dest_image->bits.bits,
src_image->bits.rowstride,
dest_image->bits.rowstride,
PIXMAN_FORMAT_BPP (src_image->bits.format),
PIXMAN_FORMAT_BPP (dest_image->bits.format),
src_x, src_y, dest_x, dest_y, width, height);
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *src, *src_line;
uint32_t *dst, *dst_line;
uint8_t *mask, *mask_line;
int src_stride, mask_stride, dst_stride;
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
src_line += src_stride;
dst_line += dst_stride;
mask_line += mask_stride;
uint32_t ssrc = *src | 0xff000000;
__m64 s = load8888 (&ssrc);
__m64 sa = expand_alpha (s);
__m64 vm = expand_alpha_rev (to_m64 (m));
__m64 vdest = in_over (s, sa, vm, load8888 (dst));
store8888 (dst, vdest);
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
vsrc = load8888 (&src);
dst_line += dst_stride;
while (w && (uintptr_t)dst & 7)
__m64 vdest = load8888 (dst);
store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
__m64 vdest = *(__m64 *)dst;
__m64 dest0 = expand8888 (vdest, 0);
__m64 dest1 = expand8888 (vdest, 1);
dest0 = over (dest0, expand_alpha (dest0), vsrc);
dest1 = over (dest1, expand_alpha (dest1), vsrc);
*(__m64 *)dst = pack8888 (dest0, dest1);
__m64 vdest = load8888 (dst);
store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)
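/* BSHIFT is one full bilinear weight (1 << BILINEAR_INTERPOLATION_BITS);
 * BMSK masks the fractional bits of a coordinate's weight. */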
#define BILINEAR_DECLARE_VARIABLES \
const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
const __m64 mm_zero = _mm_setzero_si64 (); \
__m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
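/* For reference, the scalar computation the macro below vectorizes (an
 * illustrative sketch): with B = 1 << BILINEAR_INTERPOLATION_BITS, each
 * channel of the output pixel is
 *
 *   (tl * (B - wx) * (B - wy) + tr * wx * (B - wy) +
 *    bl * (B - wx) * wy       + br * wx * wy) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where tl/tr and bl/br are the top and bottom neighbours of the sample
 * point and wx, wy are the fractional coordinate weights. */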
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                 \
do {                                                                        \
    /* fetch 2x2 pixel block into 2 mmx registers */                        \
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);         \
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);      \
    /* vertical interpolation */                                            \
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);     \
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);     \
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);     \
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);     \
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);                                   \
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);                                   \
    /* calculate horizontal weights */                                      \
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,           \
                          _mm_srli_pi16 (mm_x,                              \
                                         16 - BILINEAR_INTERPOLATION_BITS))); \
    /* horizontal interpolation */                                          \
    __m64 p = _mm_unpacklo_pi16 (lo, hi);                                   \
    __m64 q = _mm_unpackhi_pi16 (lo, hi);                                   \
    vx += unit_x;                                                           \
    lo = _mm_madd_pi16 (p, mm_wh);                                          \
    hi = _mm_madd_pi16 (q, mm_wh);                                          \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                      \
    /* shift and pack the result */                                         \
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);               \
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);               \
    lo = _mm_packs_pi32 (lo, hi);                                           \
    lo = _mm_packs_pu16 (lo, lo);                                           \
    pix = lo;                                                               \
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()                                           \
do {                                                                        \
    vx += unit_x;                                                           \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                      \
} while (0)
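
/* Per channel, with B = BSHIFT, horizontal fraction
 * wx = mm_x >> (16 - BILINEAR_INTERPOLATION_BITS) and vertical weights
 * wt + wb == B, one interpolated pixel works out to:
 *
 *     pix = ((tl * wt + bl * wb) * (B - wx)
 *            + (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * The vertical pass multiplies unpacked 16-bit channels with
 * _mm_mullo_pi16; the horizontal pass uses _mm_madd_pi16, which
 * multiplies adjacent 16-bit lanes and sums each pair in a single
 * instruction, so lo/hi are re-interleaved with unpacklo/unpackhi
 * beforehand to line the left/right samples up in adjacent lanes.
 */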
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
					    const uint32_t * mask,
					    const uint32_t * src_top,
					    const uint32_t * src_bottom,
					    int32_t          w,
					    int              wt,
					    int              wb,
					    pixman_fixed_t   vx,
					    pixman_fixed_t   unit_x,
					    pixman_fixed_t   max_vx,
					    pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
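
/* FAST_BILINEAR_MAINLOOP_COMMON (from pixman-inlines.h) wraps one
 * scanline function into a complete composite routine, generating the
 * clipping and repeat handling for each repeat mode: COVER (the
 * source fully covers the sampled area), PAD, NONE and NORMAL.  The
 * first argument becomes part of the generated function name, which
 * is what the SIMPLE_BILINEAR_FAST_PATH table entries below refer to.
 */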
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (!is_zero (pix1))
	{
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

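/* The is_zero() test skips the read-modify-write of the destination
 * whenever the interpolated source pixel is fully transparent, which
 * is common when scaling images with transparent borders.  pix2 is
 * only loaded inside the branch, so a zero source pixel costs no
 * destination memory traffic at all.
 */
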
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
					       const uint8_t  * mask,
					       const uint32_t * src_top,
					       const uint32_t * src_bottom,
					       int32_t          w,
					       int              wt,
					       int              wb,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   max_vx,
					       pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    if (m == 0xff && is_opaque (pix1))
	    {
		store (dst, pix1);
	    }
	    else
	    {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    _mm_empty ();
}
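
/* Three cases per pixel, selected by the a8 mask value m: m == 0
 * skips the pixel entirely (BILINEAR_SKIP_ONE_PIXEL still advances vx
 * and mm_x so the interpolation stays in phase); m == 0xff with an
 * opaque interpolated source stores the source directly; everything
 * else goes through the general in_over() path.
 */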

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)

static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    /* Align the destination buffer to 8 bytes. */
    while (w && ((uintptr_t)dst) & 7)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    /* Convert eight pixels (four __m64 loads) per iteration. */
    while (w >= 8)
    {
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
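
/* x8r8g8b8 differs from a8r8g8b8 only in having an undefined alpha
 * byte, so fetching it as a8r8g8b8 is just an OR with 0xff000000 to
 * force every pixel opaque.  ldq_u tolerates the possibly unaligned
 * source, while the destination stores are aligned by the lead-in
 * loop above.
 */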

static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    /* Convert four packed 565 pixels per iteration. */
    while (w >= 4)
    {
	__m64 vsrc = ldq_u ((__m64 *)src);
	__m64 mm0, mm1;

	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

	*(__m64 *)(dst + 0) = mm0;
	*(__m64 *)(dst + 2) = mm1;

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
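
/* Each vector iteration loads four packed 565 pixels and widens them
 * to two __m64s worth of 8888; the final argument of
 * expand_4xpacked565() asks for a fully opaque alpha channel.  The
 * scalar convert_0565_to_8888() replicates the high bits of each
 * component into the low bits, so e.g. the five-bit maximum 0x1f
 * expands to the eight-bit maximum 0xff.
 */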

static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    /* Expand eight a8 pixels per iteration. */
    while (w >= 8)
    {
	__m64 mm0 = ldq_u ((__m64 *)src);

	__m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
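
/* An a8 pixel becomes 8888 with the value in the alpha byte and zero
 * color channels, i.e. a << 24.  The unpack cascade does this for
 * eight pixels at once: passing the zero register as the *first*
 * operand interleaves a zero below each element, shifting every alpha
 * up one byte position per level:
 *
 *     byte a          ->  16-bit a << 8    (unpack with zero low byte)
 *     16-bit a << 8   ->  32-bit a << 24   (unpack with zero low word)
 */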

#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};
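
/* These iterators feed the general compositing path: when an image of
 * a matching format (untransformed and covering the clip, per
 * IMAGE_FLAGS) is used in an operation that has no dedicated fast
 * path, the general code pulls scanlines through the fetchers above,
 * already widened to a8r8g8b8, instead of the slower default
 * fetchers.
 */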

static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),

    { PIXMAN_OP_NONE },
};
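
/* The fast path table is searched linearly and the first entry whose
 * operator, formats and flags all match is used, so more specific
 * entries (e.g. with an a8 mask) should precede more general ones.
 */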

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
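
    /* Only the combiners with MMX implementations are overridden
     * below; any operator left unset here is resolved through the
     * fallback implementation chain and still runs the general code.
     */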
    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */