/* -*- mode: c; tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*-
 *
 * The GIMP -- an image manipulation program
 * Copyright (C) 1995 Spencer Kimball and Peter Mattis
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/* Much of the content of this file is a derivative work of David
 * Monniaux, Copyright (C) 1999, 2001 David Monniaux.
 * Tip-o-the-hat to David for pioneering this effort.
 *
 * All of these functions use the mmx registers and expect them to
 * remain intact across multiple asm() constructs.  This may not work
 * in the future, if the compiler allocates mmx registers for its own
 * purposes.
 */
#include "config.h"

#include <stdio.h>

#include <glib-object.h>

#include "base/base-types.h"
#include "base/cpu-accel.h"

#include "gimp-composite.h"
#include "gimp-composite-mmx.h"

#ifdef COMPILE_MMX_IS_OKAY

#include "gimp-composite-x86.h"
#define pminub(src,dst,tmp) "\tmovq %%" #dst ", %%" #tmp ";" "psubusb %%" #src ", %%" #tmp ";" "psubb %%" #tmp ", %%" #dst "\n"
#define pmaxub(a,b,tmp) "\tmovq %%" #a ", %%" #tmp ";" "psubusb %%" #b ", %%" #tmp ";" "paddb %%" #tmp ", %%" #b "\n"
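/* A plain-C reference sketch of the byte-wise min/max trick used by
 * pminub() and pmaxub() above (illustrative addition, not part of the
 * original code and not compiled in): psubusb yields max (x - y, 0),
 * so dst - max (dst - src, 0) is the unsigned minimum and
 * b + max (a - b, 0) is the unsigned maximum.
 */
#if 0
static unsigned char
scalar_pminub (unsigned char src, unsigned char dst)
{
  unsigned char tmp = (dst > src) ? (dst - src) : 0;  /* psubusb */

  return dst - tmp;                                   /* psubb   */
}

static unsigned char
scalar_pmaxub (unsigned char a, unsigned char b)
{
  unsigned char tmp = (a > b) ? (a - b) : 0;          /* psubusb */

  return b + tmp;                                     /* paddb   */
}
#endif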
void
debug_display_mmx (void)
{
#define mask32(x) ((x) & (unsigned long long) 0xFFFFFFFF)
#define print64(reg) { unsigned long long reg; asm("movq %%" #reg ",%0" : "=m" (reg)); printf(#reg"=%08llx %08llx", mask32(reg>>32), mask32(reg)); }
  printf("--------------------------------------------\n");
  print64(mm0); printf(" "); print64(mm1); printf("\n");
  print64(mm2); printf(" "); print64(mm3); printf("\n");
  print64(mm4); printf(" "); print64(mm5); printf("\n");
  print64(mm6); printf(" "); print64(mm7); printf("\n");
  printf("--------------------------------------------\n");
}
const guint32 rgba8_alpha_mask_64[2] = { 0xFF000000, 0xFF000000 };
const guint32 rgba8_b1_64[2] =         { 0x01010101, 0x01010101 };
const guint32 rgba8_b255_64[2] =       { 0xFFFFFFFF, 0xFFFFFFFF };
const guint32 rgba8_w1_64[2] =         { 0x00010001, 0x00010001 };
const guint32 rgba8_w2_64[2] =         { 0x00020002, 0x00020002 };
const guint32 rgba8_w128_64[2] =       { 0x00800080, 0x00800080 };
const guint32 rgba8_w256_64[2] =       { 0x01000100, 0x01000100 };
const guint32 rgba8_w255_64[2] =       { 0x00FF00FF, 0x00FF00FF };

const guint32 va8_alpha_mask_64[2] =   { 0xFF00FF00, 0xFF00FF00 };
const guint32 va8_b255_64[2] =         { 0xFFFFFFFF, 0xFFFFFFFF };
const guint32 va8_w1_64[2] =           { 0x00010001, 0x00010001 };
const guint32 va8_w255_64[2] =         { 0x00FF00FF, 0x00FF00FF };

/*const static guint32 v8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00};
  const static guint32 v8_mul_shift[2] = { 0x00800080, 0x00800080 };*/
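/* A scalar sketch (added for illustration, not in the original source) of
 * the rounding multiply that rgba8_w128_64 supports: the MMX code computes
 * t = a * b + 128 and then (t + (t >> 8)) >> 8, the usual INT_MULT()
 * approximation of a * b / 255 for 8-bit channels.
 */
#if 0
static unsigned char
scalar_int_mult (unsigned char a, unsigned char b)
{
  unsigned int t = (unsigned int) a * (unsigned int) b + 128;

  return (unsigned char) ((t + (t >> 8)) >> 8);
}
#endif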
void
gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
89
uint64 *d = (uint64 *) _op->D;
90
uint64 *a = (uint64 *) _op->A;
91
uint64 *b = (uint64 *) _op->B;
92
gulong n_pixels = _op->n_pixels;
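  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is the saturating sum MIN (A + B, 255), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */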
94
asm volatile ("movq %0,%%mm0"
96
: "m" (*rgba8_alpha_mask_64)
99
for (; n_pixels >= 2; n_pixels -= 2)
101
asm volatile (" movq %1, %%mm2\n"
103
"\tmovq %%mm2, %%mm4\n"
104
"\tpaddusb %%mm3, %%mm4\n"
105
"\tmovq %%mm0, %%mm1\n"
106
"\tpandn %%mm4, %%mm1\n"
107
"\t" pminub(mm3, mm2, mm4) "\n"
108
"\tpand %%mm0, %%mm2\n"
109
"\tpor %%mm2, %%mm1\n"
113
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
121
asm volatile (" movd %1, %%mm2\n"
123
"\tmovq %%mm2, %%mm4\n"
124
"\tpaddusb %%mm3, %%mm4\n"
125
"\tmovq %%mm0, %%mm1\n"
126
"\tpandn %%mm4, %%mm1\n"
127
"\t" pminub(mm3, mm2, mm4) "\n"
128
"\tpand %%mm0, %%mm2\n"
129
"\tpor %%mm2, %%mm1\n"
133
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
140
void
gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
142
uint64 *d = (uint64 *) _op->D;
143
uint64 *a = (uint64 *) _op->A;
144
uint64 *b = (uint64 *) _op->B;
145
gulong n_pixels = _op->n_pixels;
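  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is CLAMP (255 - ((255 - A) * 256) / (B + 1), 0, 255), and the alpha
   * channel is MIN (alpha_A, alpha_B).
   */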
147
for (; n_pixels >= 2; n_pixels -= 2)
149
asm volatile (" movq %1,%%mm0\n"
153
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
154
"\tpxor %%mm4,%%mm4\n"
155
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
157
"\tmovq %%mm1,%%mm3\n"
158
"\tpxor %%mm5,%%mm5\n"
159
"\tpunpcklbw %%mm5,%%mm3\n"
161
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
163
"\t" pdivwqX(mm4,mm5,mm7) "\n"
166
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
167
"\tpxor %%mm4,%%mm4\n"
168
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
170
"\tmovq %%mm1,%%mm3\n"
171
"\tpxor %%mm5,%%mm5\n"
172
"\tpunpckhbw %%mm5,%%mm3\n"
174
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
175
"\t" pdivwqX(mm4,mm5,mm6) "\n"
178
"\tmovq %%mm4,%%mm5\n"
179
"\tpsubusw %%mm6,%%mm4\n"
180
"\tpsubusw %%mm7,%%mm5\n"
182
"\tpackuswb %%mm4,%%mm5\n"
184
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
186
"\tmovq %6,%%mm7\n" /* mm6 = rgba8_alpha_mask_64 */
187
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
189
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
190
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
194
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
195
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
203
asm volatile (" movd %1,%%mm0\n"
207
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
208
"\tpxor %%mm4,%%mm4\n"
209
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
211
"\tmovq %%mm1,%%mm3\n"
212
"\tpxor %%mm5,%%mm5\n"
213
"\tpunpcklbw %%mm5,%%mm3\n"
215
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
217
"\t" pdivwqX(mm4,mm5,mm7) "\n"
220
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
221
"\tpxor %%mm4,%%mm4\n"
222
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
224
"\tmovq %%mm1,%%mm3\n"
225
"\tpxor %%mm5,%%mm5\n"
226
"\tpunpckhbw %%mm5,%%mm3\n"
228
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
229
"\t" pdivwqX(mm4,mm5,mm6) "\n"
232
"\tmovq %%mm4,%%mm5\n"
233
"\tpsubusw %%mm6,%%mm4\n"
234
"\tpsubusw %%mm7,%%mm5\n"
236
"\tpackuswb %%mm4,%%mm5\n"
238
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
241
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
243
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
244
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
248
: "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
249
: pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
257
void
gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
259
uint64 *d = (uint64 *) _op->D;
260
uint64 *a = (uint64 *) _op->A;
261
uint64 *b = (uint64 *) _op->B;
262
gulong n_pixels = _op->n_pixels;
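  /* Per-pixel reference (illustrative note, added): D is the byte-wise
   * minimum of A and B, applied to the colour channels and the alpha
   * channel alike.
   */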
264
for (; n_pixels >= 2; n_pixels -= 2)
266
asm volatile (" movq %1, %%mm2\n"
268
"\t" pminub(mm3, mm2, mm4) "\n"
272
: "%mm1", "%mm2", "%mm3", "%mm4");
280
asm volatile (" movd %1, %%mm2\n"
282
"\t" pminub(mm3, mm2, mm4) "\n"
286
: "%mm2", "%mm3", "%mm4");
293
void
gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
295
uint64 *d = (uint64 *) _op->D;
296
uint64 *a = (uint64 *) _op->A;
297
uint64 *b = (uint64 *) _op->B;
298
gulong n_pixels = _op->n_pixels;
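  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is the absolute difference |A - B|, computed as the sum of the two
   * saturating differences, and the alpha channel is MIN (alpha_A, alpha_B).
   */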
300
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
302
for (; n_pixels >= 2; n_pixels -= 2)
304
asm volatile (" movq %1, %%mm2\n"
306
"\tmovq %%mm2, %%mm4\n"
307
"\tmovq %%mm3, %%mm5\n"
308
"\tpsubusb %%mm3, %%mm4\n"
309
"\tpsubusb %%mm2, %%mm5\n"
310
"\tpaddb %%mm5, %%mm4\n"
311
"\tmovq %%mm0, %%mm1\n"
312
"\tpandn %%mm4, %%mm1\n"
313
"\t" pminub(mm3,mm2,mm4) "\n"
314
"\tpand %%mm0, %%mm2\n"
315
"\tpor %%mm2, %%mm1\n"
319
: "%mm1", "%mm2", "%mm3", "%mm4");
327
asm volatile (" movd %1, %%mm2\n"
329
"\tmovq %%mm2, %%mm4\n"
330
"\tmovq %%mm3, %%mm5\n"
331
"\tpsubusb %%mm3, %%mm4\n"
332
"\tpsubusb %%mm2, %%mm5\n"
333
"\tpaddb %%mm5, %%mm4\n"
334
"\tmovq %%mm0, %%mm1\n"
335
"\tpandn %%mm4, %%mm1\n"
336
"\t" pminub(mm3,mm2,mm4) "\n"
337
"\tpand %%mm0, %%mm2\n"
338
"\tpor %%mm2, %%mm1\n"
342
: "%mm1", "%mm2", "%mm3", "%mm4");
350
void
xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
352
uint64 *d = (uint64 *) _op->D;
353
uint64 *a = (uint64 *) _op->A;
354
uint64 *b = (uint64 *) _op->B;
355
gulong n_pixels = _op->n_pixels;
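  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is CLAMP ((A * 256) / (B + 1), 0, 255), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */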
357
asm volatile (" movq %0, %%mm0\n"
360
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
363
for (; n_pixels >= 2; n_pixels -= 2)
365
asm volatile (" movq %1,%%mm0\n"
367
"\tpxor %%mm2,%%mm2\n"
368
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
370
"\tmovq %%mm1,%%mm3\n"
371
"\tpxor %%mm5,%%mm5\n"
372
"\tpunpcklbw %%mm5,%%mm3\n"
373
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
375
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
377
"\tpxor %%mm2,%%mm2\n"
378
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
380
"\tmovq %%mm1,%%mm3\n"
381
"\tpxor %%mm6,%%mm6\n"
382
"\tpunpckhbw %%mm6,%%mm3\n"
383
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
385
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
387
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
389
"\t" pminub(mm0,mm1,mm3) "\n"
391
"\tmovq %%mm3,%%mm2\n"
393
"\tpandn %%mm5,%%mm3\n"
395
"\tpand %%mm2,%%mm1\n"
396
"\tpor %%mm1,%%mm3\n"
400
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
401
: pdivwuqX_clobber, "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
409
asm volatile (" movd %1,%%mm0\n"
411
"\tpxor %%mm2,%%mm2\n"
412
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
414
"\tmovq %%mm1,%%mm3\n"
415
"\tpxor %%mm5,%%mm5\n"
416
"\tpunpcklbw %%mm5,%%mm3\n"
417
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
419
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
421
"\tpxor %%mm2,%%mm2\n"
422
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
424
"\tmovq %%mm1,%%mm3\n"
425
"\tpxor %%mm6,%%mm6\n"
426
"\tpunpckhbw %%mm6,%%mm3\n"
427
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
429
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
431
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
433
"\t" pminub(mm0,mm1,mm3) "\n"
435
"\tmovq %%mm3,%%mm2\n"
437
"\tpandn %%mm5,%%mm3\n"
439
"\tpand %%mm2,%%mm1\n"
440
"\tpor %%mm1,%%mm3\n"
444
: "m" (*a), "m" (*b), "m" (*rgba8_alpha_mask_64)
445
: pdivwuqX_clobber, "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
454
void
xxxgimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
456
uint64 *d = (uint64 *) _op->D;
457
uint64 *a = (uint64 *) _op->A;
458
uint64 *b = (uint64 *) _op->B;
459
gulong n_pixels = _op->n_pixels;
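  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is CLAMP ((A * 256) / (256 - B), 0, 255), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */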
461
for (; n_pixels >= 2; n_pixels -= 2)
463
asm volatile (" movq %1,%%mm0\n"
465
"\tmovq %%mm1,%%mm3\n"
466
"\tpxor %%mm2,%%mm2\n"
467
"\tpunpcklbw %%mm2,%%mm3\n"
468
"\tpunpcklbw %%mm0,%%mm2\n"
471
"\tpsubw %%mm3,%%mm4\n"
473
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
475
"\tmovq %%mm1,%%mm3\n"
476
"\tpxor %%mm2,%%mm2\n"
477
"\tpunpckhbw %%mm2,%%mm3\n"
478
"\tpunpckhbw %%mm0,%%mm2\n"
481
"\tpsubw %%mm3,%%mm4\n"
483
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
485
"\tpackuswb %%mm6,%%mm5\n"
488
"\tmovq %%mm1,%%mm7\n"
489
"\t" pminub(mm0,mm7,mm2) "\n"
490
"\tpand %%mm6,%%mm7\n"
491
"\tpandn %%mm5,%%mm6\n"
493
"\tpor %%mm6,%%mm7\n"
497
: "m" (*a), "m" (*b), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
498
: pdivwuqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
506
asm volatile (" movd %0,%%mm0\n"
508
"\tmovq %%mm1,%%mm3\n"
509
"\tpxor %%mm2,%%mm2\n"
510
"\tpunpcklbw %%mm2,%%mm3\n"
511
"\tpunpcklbw %%mm0,%%mm2\n"
514
"\tpsubw %%mm3,%%mm4\n"
516
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
518
"\tmovq %%mm1,%%mm3\n"
519
"\tpxor %%mm2,%%mm2\n"
520
"\tpunpckhbw %%mm2,%%mm3\n"
521
"\tpunpckhbw %%mm0,%%mm2\n"
524
"\tpsubw %%mm3,%%mm4\n"
526
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
528
"\tpackuswb %%mm6,%%mm5\n"
531
"\tmovq %%mm1,%%mm7\n"
532
"\t" pminub(mm0,mm7,mm2) "\n"
533
"\tpand %%mm6,%%mm7\n"
534
"\tpandn %%mm5,%%mm6\n"
536
"\tpor %%mm6,%%mm7\n"
540
: "m" (*a), "m" (*b), "m" (*d), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
541
: pdivwuqX_clobber, "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
549
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
551
uint64 *d = (uint64 *) _op->D;
552
uint64 *a = (uint64 *) _op->A;
553
uint64 *b = (uint64 *) _op->B;
554
gulong n_pixels = _op->n_pixels;
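  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is CLAMP (A - B + 128, 0, 255), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */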
556
asm volatile ("movq %0,%%mm0\n"
560
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
561
: "%mm0", "%mm7", "%mm6");
563
for (; n_pixels >= 2; n_pixels -= 2)
565
asm volatile (" movq %1,%%mm2\n"
567
mmx_low_bytes_to_words(mm2,mm4,mm6)
568
mmx_low_bytes_to_words(mm3,mm5,mm6)
569
"\tpsubw %%mm5,%%mm4\n"
570
"\tpaddw %%mm7,%%mm4\n"
571
"\tmovq %%mm4,%%mm1\n"
573
mmx_high_bytes_to_words(mm2,mm4,mm6)
574
mmx_high_bytes_to_words(mm3,mm5,mm6)
576
"\tpsubw %%mm5,%%mm4\n"
577
"\tpaddw %%mm7,%%mm4\n"
579
"\tpackuswb %%mm4,%%mm1\n"
580
"\tmovq %%mm1,%%mm4\n"
582
"\tmovq %%mm0,%%mm1\n"
583
"\tpandn %%mm4,%%mm1\n"
585
"\t" pminub(mm3,mm2,mm4) "\n"
586
"\tpand %%mm0,%%mm2\n"
588
"\tpor %%mm2,%%mm1\n"
592
: "%mm1", "%mm2", "%mm3", "%mm4");
600
asm volatile (" movd %1, %%mm2\n"
603
mmx_low_bytes_to_words(mm2,mm4,mm6)
604
mmx_low_bytes_to_words(mm3,mm5,mm6)
606
"\tpsubw %%mm5, %%mm4\n"
607
"\tpaddw %%mm7, %%mm4\n"
608
"\tmovq %%mm4, %%mm1\n"
610
"\tpackuswb %%mm6, %%mm1\n"
612
"\tmovq %%mm1, %%mm4\n"
614
"\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n"
616
"\t" pminub(mm3,mm2,mm4) "\n"
617
"\tpand %%mm0, %%mm2\n"
619
"\tpor %%mm2, %%mm1\n"
623
: "%mm1", "%mm2", "%mm3", "%mm4");
630
void
gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
632
uint64 *d = (uint64 *) _op->D;
633
uint64 *a = (uint64 *) _op->A;
634
uint64 *b = (uint64 *) _op->B;
635
gulong n_pixels = _op->n_pixels;
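  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is CLAMP (A + B - 128, 0, 255), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */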
637
asm volatile ("movq %0, %%mm0\n"
638
"pxor %%mm6, %%mm6\n"
641
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
642
: "%mm0", "%mm6", "%mm7");
644
for (; n_pixels >= 2; n_pixels -= 2)
646
asm volatile (" movq %1, %%mm2\n"
649
mmx_low_bytes_to_words(mm2,mm4,mm6)
650
mmx_low_bytes_to_words(mm3,mm5,mm6)
651
"\tpaddw %%mm5, %%mm4\n"
652
"\tpsubw %%mm7, %%mm4\n"
654
mmx_high_bytes_to_words(mm2,mm1,mm6)
655
mmx_high_bytes_to_words(mm3,mm5,mm6)
656
"\tpaddw %%mm5, %%mm1\n"
657
"\tpsubw %%mm7, %%mm1\n"
659
"\tpackuswb %%mm1, %%mm4\n"
661
"\t" pminub(mm3,mm2,mm5) "\n"
662
"\tpand %%mm0, %%mm2\n"
664
"\tmovq %%mm0, %%mm1\n"
665
"\tpandn %%mm4, %%mm1\n"
666
"\tpor %%mm2, %%mm1\n"
670
: "%mm1", "%mm2", "%mm3", "%mm4");
678
asm volatile (" movd %1, %%mm2\n"
681
mmx_low_bytes_to_words(mm2,mm4,mm6)
682
mmx_low_bytes_to_words(mm3,mm5,mm6)
684
"\tpaddw %%mm5, %%mm4\n"
685
"\tpsubw %%mm7, %%mm4\n"
686
"\tmovq %%mm4, %%mm1\n"
687
"\tpackuswb %%mm6, %%mm1\n"
689
"\tmovq %%mm1, %%mm4\n"
691
"\tmovq %%mm0, %%mm1; pandn %%mm4, %%mm1\n"
693
"\t" pminub(mm3,mm2,mm4) "\n"
694
"\tpand %%mm0, %%mm2\n"
696
"\tpor %%mm2, %%mm1\n"
700
: "%mm1", "%mm2", "%mm3", "%mm4");
707
void
gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
709
uint64 *d = (uint64 *) _op->D;
710
uint64 *a = (uint64 *) _op->A;
711
uint64 *b = (uint64 *) _op->B;
712
gulong n_pixels = _op->n_pixels;
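  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is MAX (A, B), while the alpha channel is MIN (alpha_A, alpha_B).
   */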
714
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
716
for (; n_pixels >= 2; n_pixels -= 2)
718
asm volatile (" movq %1, %%mm2\n"
720
"\tmovq %%mm2, %%mm4\n"
721
"\t" pmaxub(mm3,mm4,mm5) "\n"
722
"\tmovq %%mm0, %%mm1\n"
723
"\tpandn %%mm4, %%mm1\n"
724
"\t" pminub(mm2,mm3,mm4) "\n"
725
"\tpand %%mm0, %%mm3\n"
726
"\tpor %%mm3, %%mm1\n"
730
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
738
asm volatile (" movd %1, %%mm2\n"
740
"\tmovq %%mm2, %%mm4\n"
741
"\t" pmaxub(mm3,mm4,mm5) "\n"
743
"\tmovq %%mm0, %%mm1\n"
744
"\tpandn %%mm4, %%mm1\n"
746
"\t" pminub(mm2,mm3,mm4) "\n"
748
"\tpand %%mm0, %%mm3\n"
749
"\tpor %%mm3, %%mm1\n"
753
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
760
void
gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
762
uint64 *d = (uint64 *) _op->D;
763
uint64 *a = (uint64 *) _op->A;
764
uint64 *b = (uint64 *) _op->B;
765
gulong n_pixels = _op->n_pixels;
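  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is INT_MULT (A, B), i.e. roughly A * B / 255 with rounding, and the
   * alpha channel is MIN (alpha_A, alpha_B).
   */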
772
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
773
: "%mm6", "%mm7", "%mm0");
775
for (; n_pixels >= 2; n_pixels -= 2)
777
asm volatile (" movq %1, %%mm2\n"
780
mmx_low_bytes_to_words(mm2,mm1,mm6)
781
mmx_low_bytes_to_words(mm3,mm5,mm6)
782
mmx_int_mult(mm5,mm1,mm7)
784
mmx_high_bytes_to_words(mm2,mm4,mm6)
785
mmx_high_bytes_to_words(mm3,mm5,mm6)
786
mmx_int_mult(mm5,mm4,mm7)
788
"\tpackuswb %%mm4, %%mm1\n"
790
"\tmovq %%mm0, %%mm4\n"
791
"\tpandn %%mm1, %%mm4\n"
792
"\tmovq %%mm4, %%mm1\n"
793
"\t" pminub(mm3,mm2,mm4) "\n"
794
"\tpand %%mm0, %%mm2\n"
795
"\tpor %%mm2, %%mm1\n"
800
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
808
asm volatile (" movd %1, %%mm2\n"
811
mmx_low_bytes_to_words(mm2,mm1,mm6)
812
mmx_low_bytes_to_words(mm3,mm5,mm6)
815
"\tpackuswb %%mm6, %%mm1\n"
817
"\tmovq %%mm0, %%mm4\n"
818
"\tpandn %%mm1, %%mm4\n"
819
"\tmovq %%mm4, %%mm1\n"
820
"\t" pminub(mm3,mm2,mm4) "\n"
821
"\tpand %%mm0, %%mm2\n"
822
"\tpor %%mm2, %%mm1\n"
827
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
838
mmx_low_bytes_to_words(mm3,mm5,mm0)
839
"\tpcmpeqb %%mm4,%%mm4\n"
840
"\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */
841
"\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
842
"\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */
843
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
844
mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */
847
mmx_high_bytes_to_words(mm3,mm5,mm0)
848
"\tpcmpeqb %%mm1,%%mm1\n"
849
"\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */
850
"\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
851
"\tmovq %0,%%mm6\n" /* mm6 = words of value 2 */
852
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
853
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */
855
"\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */
857
mmx_low_bytes_to_words(mm4,mm5,mm0)
858
mmx_low_bytes_to_words(mm2,mm6,mm0)
859
"\tpaddw %%mm6,%%mm5\n"
860
mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */
862
mmx_high_bytes_to_words(mm4,mm1,mm0)
863
mmx_high_bytes_to_words(mm2,mm6,mm0)
864
"\tpaddw %%mm6,%%mm1\n"
865
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */
867
"\tpackuswb %%mm1,%%mm5\n"
870
"\tmovq %%mm0,%%mm1\n"
871
"\tpandn %%mm5,%%mm1\n"
873
"\t" pminub(mm2,mm3,mm4) "\n"
874
"\tpand %%mm0,%%mm3\n"
876
"\tpor %%mm3,%%mm1\n"
879
: "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
885
void
xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
887
uint64 *d = (uint64 *) _op->D;
888
uint64 *a = (uint64 *) _op->A;
889
uint64 *b = (uint64 *) _op->B;
890
gulong n_pixels = _op->n_pixels;
892
asm volatile ("pxor %%mm0,%%mm0\n"
895
: "m" (*rgba8_w128_64) : "%mm0");
897
for (; n_pixels >= 2; n_pixels -= 2)
899
asm volatile (" movq %0,%%mm2\n"
903
mmx_low_bytes_to_words(mm3,mm5,mm0)
904
"\tpcmpeqb %%mm4,%%mm4\n"
905
"\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */
906
"\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
907
"\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
908
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
909
mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */
912
mmx_high_bytes_to_words(mm3,mm5,mm0)
913
"\tpcmpeqb %%mm1,%%mm1\n"
914
"\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */
915
"\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
916
"\tmovq %3,%%mm6\n" /* mm6 = words of value 2 */
917
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
918
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */
920
"\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */
922
mmx_low_bytes_to_words(mm4,mm5,mm0)
923
mmx_low_bytes_to_words(mm2,mm6,mm0)
924
"\tpaddw %%mm6,%%mm5\n"
925
mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */
927
mmx_high_bytes_to_words(mm4,mm1,mm0)
928
mmx_high_bytes_to_words(mm2,mm6,mm0)
929
"\tpaddw %%mm6,%%mm1\n"
930
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */
932
"\tpackuswb %%mm1,%%mm5\n"
935
"\tmovq %%mm0,%%mm1\n"
936
"\tpandn %%mm5,%%mm1\n"
938
"\t" pminub(mm2,mm3,mm4) "\n"
939
"\tpand %%mm0,%%mm3\n"
941
"\tpor %%mm3,%%mm1\n"
944
: "+m" (*a), "+m" (*b), "+m" (*d)
945
: "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
946
: "%mm1", "%mm2", "%mm3", "%mm4");
954
asm volatile (" movd %1,%%mm2\n"
958
mmx_low_bytes_to_words(mm3,mm5,mm0)
959
"\tpcmpeqb %%mm4,%%mm4\n"
960
"\tpsubb %%mm2,%%mm4\n" /* mm4 = 255 - A */
961
"\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
962
"\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
963
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
964
mmx_int_mult(mm6,mm4,mm7) /* mm4 = INT_MULT(mm6, mm4) */
967
mmx_high_bytes_to_words(mm3,mm5,mm0)
968
"\tpcmpeqb %%mm1,%%mm1\n"
969
"\tpsubb %%mm2,%%mm1\n" /* mm1 = 255 - A */
970
"\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
971
"\tmovq %3,%%mm6\n" /* mm6 = words of integer value 2 */
972
"\tpmullw %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
973
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) */
975
"\tpackuswb %%mm1,%%mm4\n" /* mm4 = intermediate value */
977
mmx_low_bytes_to_words(mm4,mm5,mm0)
978
mmx_low_bytes_to_words(mm2,mm6,mm0)
979
"\tpaddw %%mm6,%%mm5\n"
980
mmx_int_mult(mm6,mm5,mm7) /* mm5 = INT_MULT(mm6, mm5) low bytes */
982
mmx_high_bytes_to_words(mm4,mm1,mm0)
983
mmx_high_bytes_to_words(mm2,mm6,mm0)
984
"\tpaddw %%mm6,%%mm1\n"
985
mmx_int_mult(mm6,mm1,mm7) /* mm1 = INT_MULT(mm6, mm1) high bytes */
987
"\tpackuswb %%mm1,%%mm5\n"
990
"\tmovq %%mm0,%%mm1\n"
991
"\tpandn %%mm5,%%mm1\n"
993
"\t" pminub(mm2,mm3,mm4) "\n"
994
"\tpand %%mm0,%%mm3\n"
996
"\tpor %%mm3,%%mm1\n"
1000
: "m" (*a), "m" (*b), "m" (*rgba8_w2_64), "m" (*rgba8_alpha_mask_64)
1001
: "%mm1", "%mm2", "%mm3", "%mm4");
1009
void
gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
1011
uint64 *d = (uint64 *) _op->D;
1012
uint64 *a = (uint64 *) _op->A;
1013
gulong n_pixels = _op->n_pixels;
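  /* Per-pixel reference (illustrative note, added): every byte of D, alpha
   * included, becomes INT_MULT (A, scale), i.e. A scaled by
   * _op->scale.scale / 255.
   */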
1015
asm volatile ("pxor %%mm0,%%mm0\n"
1017
"\tmovl %%eax,%%ebx\n"
1019
"\torl %%ebx,%%eax\n"
1020
"\tmovd %%eax,%%mm5\n"
1021
"\tmovd %%eax,%%mm3\n"
1022
"\tpsllq $32,%%mm5\n"
1023
"\tpor %%mm5,%%mm3\n"
1026
: "m" (_op->scale.scale), "m" (*rgba8_w128_64)
1027
: "%eax", "%ebx", "%mm0", "%mm5", "%mm6", "%mm7");
1029
for (; n_pixels >= 2; n_pixels -= 2)
1031
asm volatile ("movq %1,%%mm2\n"
1032
"\tmovq %%mm2,%%mm1\n"
1033
"\tpunpcklbw %%mm0,%%mm1\n"
1034
"\tmovq %%mm3,%%mm5\n"
1036
"\t" pmulwX(mm5,mm1,mm7) "\n"
1038
"\tmovq %%mm2,%%mm4\n"
1039
"\tpunpckhbw %%mm0,%%mm4\n"
1040
"\tmovq %%mm3,%%mm5\n"
1042
"\t" pmulwX(mm5,mm4,mm7) "\n"
1044
"\tpackuswb %%mm4,%%mm1\n"
1049
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
1056
asm volatile ("movd %1,%%mm2\n"
1057
"\tmovq %%mm2,%%mm1\n"
1058
"\tpunpcklbw %%mm0,%%mm1\n"
1059
"\tmovq %%mm3,%%mm5\n"
1061
"\t" pmulwX(mm5,mm1,mm7) "\n"
1063
"\tpackuswb %%mm0,%%mm1\n"
1067
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
1074
void
gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
1076
uint64 *d = (uint64 *) _op->D;
1077
uint64 *a = (uint64 *) _op->A;
1078
uint64 *b = (uint64 *) _op->B;
1079
gulong n_pixels = _op->n_pixels;
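  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is 255 - INT_MULT (255 - A, 255 - B), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */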
1081
asm volatile ("pxor %%mm6,%%mm6\n"
1085
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
1086
: "%mm0", "%mm6", "%mm7");
1088
for (; n_pixels >= 2; n_pixels -= 2)
1090
asm volatile (" movq %1,%%mm2\n"
1093
"\tpcmpeqb %%mm4,%%mm4\n"
1094
"\tpsubb %%mm2,%%mm4\n"
1095
"\tpcmpeqb %%mm5,%%mm5\n"
1096
"\tpsubb %%mm3,%%mm5\n"
1098
"\tpunpcklbw %%mm6,%%mm4\n"
1099
"\tpunpcklbw %%mm6,%%mm5\n"
1100
"\tpmullw %%mm4,%%mm5\n"
1101
"\tpaddw %%mm7,%%mm5\n"
1102
"\tmovq %%mm5,%%mm1\n"
1103
"\tpsrlw $ 8,%%mm1\n"
1104
"\tpaddw %%mm5,%%mm1\n"
1105
"\tpsrlw $ 8,%%mm1\n"
1107
"\tpcmpeqb %%mm4,%%mm4\n"
1108
"\tpsubb %%mm2,%%mm4\n"
1109
"\tpcmpeqb %%mm5,%%mm5\n"
1110
"\tpsubb %%mm3,%%mm5\n"
1112
"\tpunpckhbw %%mm6,%%mm4\n"
1113
"\tpunpckhbw %%mm6,%%mm5\n"
1114
"\tpmullw %%mm4,%%mm5\n"
1115
"\tpaddw %%mm7,%%mm5\n"
1116
"\tmovq %%mm5,%%mm4\n"
1117
"\tpsrlw $ 8,%%mm4\n"
1118
"\tpaddw %%mm5,%%mm4\n"
1119
"\tpsrlw $ 8,%%mm4\n"
1121
"\tpackuswb %%mm4,%%mm1\n"
1123
"\tpcmpeqb %%mm4,%%mm4\n"
1124
"\tpsubb %%mm1,%%mm4\n"
1126
"\tmovq %%mm0,%%mm1\n"
1127
"\tpandn %%mm4,%%mm1\n"
1129
"\t" pminub(mm2,mm3,mm4) "\n"
1130
"\tpand %%mm0,%%mm3\n"
1132
"\tpor %%mm3,%%mm1\n"
1136
: "m" (*a), "m" (*b)
1137
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
1145
asm volatile (" movd %1,%%mm2\n"
1148
"\tpcmpeqb %%mm4,%%mm4\n"
1149
"\tpsubb %%mm2,%%mm4\n"
1150
"\tpcmpeqb %%mm5,%%mm5\n"
1151
"\tpsubb %%mm3,%%mm5\n"
1153
"\tpunpcklbw %%mm6,%%mm4\n"
1154
"\tpunpcklbw %%mm6,%%mm5\n"
1155
"\tpmullw %%mm4,%%mm5\n"
1156
"\tpaddw %%mm7,%%mm5\n"
1157
"\tmovq %%mm5,%%mm1\n"
1158
"\tpsrlw $ 8,%%mm1\n"
1159
"\tpaddw %%mm5,%%mm1\n"
1160
"\tpsrlw $ 8,%%mm1\n"
1162
"\tpcmpeqb %%mm4,%%mm4\n"
1163
"\tpsubb %%mm2,%%mm4\n"
1164
"\tpcmpeqb %%mm5,%%mm5\n"
1165
"\tpsubb %%mm3,%%mm5\n"
1167
"\tpunpckhbw %%mm6,%%mm4\n"
1168
"\tpunpckhbw %%mm6,%%mm5\n"
1169
"\tpmullw %%mm4,%%mm5\n"
1170
"\tpaddw %%mm7,%%mm5\n"
1171
"\tmovq %%mm5,%%mm4\n"
1172
"\tpsrlw $ 8,%%mm4\n"
1173
"\tpaddw %%mm5,%%mm4\n"
1174
"\tpsrlw $ 8,%%mm4\n"
1176
"\tpackuswb %%mm4,%%mm1\n"
1178
"\tpcmpeqb %%mm4,%%mm4\n"
1179
"\tpsubb %%mm1,%%mm4\n"
1181
"\tmovq %%mm0,%%mm1\n"
1182
"\tpandn %%mm4,%%mm1\n"
1184
"\t" pminub(mm2,mm3,mm4) "\n"
1185
"\tpand %%mm0,%%mm3\n"
1187
"\tpor %%mm3,%%mm1\n"
1191
: "m" (*a), "m" (*b)
1192
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
1195
asm volatile ("emms");
1200
void
gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
1202
uint64 *d = (uint64 *) _op->D;
1203
uint64 *a = (uint64 *) _op->A;
1204
uint64 *b = (uint64 *) _op->B;
1205
gulong n_pixels = _op->n_pixels;
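  /* Per-pixel reference (illustrative note, added): each colour channel of D
   * is the saturating difference MAX (A - B, 0), and the alpha channel is
   * MIN (alpha_A, alpha_B).
   */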
1207
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
1209
for (; n_pixels >= 2; n_pixels -= 2)
1211
asm volatile (" movq %1,%%mm2\n"
1214
"\tmovq %%mm2,%%mm4\n"
1215
"\tpsubusb %%mm3,%%mm4\n"
1217
"\tmovq %%mm0,%%mm1\n"
1218
"\tpandn %%mm4,%%mm1\n"
1220
"\t" pminub(mm3,mm2,mm4) "\n"
1222
"\tpand %%mm0,%%mm2\n"
1223
"\tpor %%mm2,%%mm1\n"
1226
: "m" (*a), "m" (*b)
1227
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
1235
asm volatile (" movd %1,%%mm2\n"
1238
"\tmovq %%mm2,%%mm4\n"
1239
"\tpsubusb %%mm3,%%mm4\n"
1241
"\tmovq %%mm0,%%mm1\n"
1242
"\tpandn %%mm4,%%mm1\n"
1244
"\t" pminub(mm3,mm2,mm4) "\n"
1246
"\tpand %%mm0,%%mm2\n"
1247
"\tpor %%mm2,%%mm1\n"
1250
: "m" (*a), "m" (*b)
1251
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
1254
asm volatile ("emms");
1258
void
gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
1260
uint64 *a = (uint64 *) _op->A;
1261
uint64 *b = (uint64 *) _op->B;
1262
gulong n_pixels = _op->n_pixels;
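  /* Reference behaviour (illustrative note, added): the A and B buffers are
   * exchanged in place, eight bytes at a time, with a single-pixel tail.
   */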
1264
for (; n_pixels >= 2; n_pixels -= 2)
1266
asm volatile (" movq %0,%%mm2\n"
1270
: "+m" (*a), "+m" (*b)
1272
: "%mm1", "%mm2", "%mm3", "%mm4");
1279
asm volatile (" movd %0,%%mm2\n"
1283
: "+m" (*a), "+m" (*b)
1285
: "%mm1", "%mm2", "%mm3", "%mm4");
1294
void
gimp_composite_addition_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
1296
uint64 *d = (uint64 *) _op->D;
1297
uint64 *a = (uint64 *) _op->A;
1298
uint64 *b = (uint64 *) _op->B;
1299
gulong n_pixels = _op->n_pixels;
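  /* Per-pixel reference (illustrative note, added): same formula as the
   * rgba8 version -- saturating add of the value byte, MIN of the alpha
   * bytes -- but on va8 (value, alpha) pixels, four pixels per 8-byte quad.
   */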
1301
asm volatile ("movq %0,%%mm0"
1303
: "m" (*va8_alpha_mask_64)
1306
for (; n_pixels >= 4; n_pixels -= 4)
1308
asm volatile (" movq %1, %%mm2\n"
1309
"\tmovq %2, %%mm3\n"
1310
"\tmovq %%mm2, %%mm4\n"
1311
"\tpaddusb %%mm3, %%mm4\n"
1312
"\tmovq %%mm0, %%mm1\n"
1313
"\tpandn %%mm4, %%mm1\n"
1314
"\t" pminub(mm3, mm2, mm4) "\n"
1315
"\tpand %%mm0, %%mm2\n"
1316
"\tpor %%mm2, %%mm1\n"
1318
"\tmovq %%mm1, %0\n"
1320
"\tmovntq %%mm1, %0\n"
1323
: "m" (*a), "m" (*b)
1324
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
1330
uint32 *a32 = (uint32 *) a;
1331
uint32 *b32 = (uint32 *) b;
1332
uint32 *d32 = (uint32 *) d;
1334
for (; n_pixels >= 2; n_pixels -= 2)
1336
asm volatile (" movd %1, %%mm2\n"
1337
"\tmovd %2, %%mm3\n"
1338
"\tmovq %%mm2, %%mm4\n"
1339
"\tpaddusb %%mm3, %%mm4\n"
1340
"\tmovq %%mm0, %%mm1\n"
1341
"\tpandn %%mm4, %%mm1\n"
1342
"\t" pminub(mm3, mm2, mm4) "\n"
1343
"\tpand %%mm0, %%mm2\n"
1344
"\tpor %%mm2, %%mm1\n"
1345
"\tmovd %%mm1, %0\n"
1347
: "m" (*a32), "m" (*b32)
1348
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
1354
uint16 *a16 = (uint16 *) a32;
1355
uint16 *b16 = (uint16 *) b32;
1356
uint16 *d16 = (uint16 *) d32;
1358
for (; n_pixels >= 1; n_pixels -= 1)
1360
asm volatile (" movw %1, %%ax ; movd %%eax, %%mm2\n"
1361
"\tmovw %2, %%ax ; movd %%eax, %%mm3\n"
1362
"\tmovq %%mm2, %%mm4\n"
1363
"\tpaddusb %%mm3, %%mm4\n"
1364
"\tmovq %%mm0, %%mm1\n"
1365
"\tpandn %%mm4, %%mm1\n"
1366
"\t" pminub(mm3, mm2, mm4) "\n"
1367
"\tpand %%mm0, %%mm2\n"
1368
"\tpor %%mm2, %%mm1\n"
1369
"\tmovd %%mm1, %%eax\n"
1372
: "m" (*a16), "m" (*b16)
1373
: "%eax", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
1386
void
gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
1388
GimpCompositeContext op = *_op;
1392
: "m" (*va8_alpha_mask)
1395
for (; op.n_pixels >= 4; op.n_pixels -= 4)
1397
asm volatile (" movq %0,%%mm0\n"
1401
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
1402
"\tpxor %%mm4,%%mm4\n"
1403
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
1405
"\tmovq %%mm1,%%mm3\n"
1406
"\tpxor %%mm5,%%mm5\n"
1407
"\tpunpcklbw %%mm5,%%mm3\n"
1409
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
1411
"\t" pdivwqX(mm4,mm5,mm7) "\n"
1414
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
1415
"\tpxor %%mm4,%%mm4\n"
1416
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
1418
"\tmovq %%mm1,%%mm3\n"
1419
"\tpxor %%mm5,%%mm5\n"
1420
"\tpunpckhbw %%mm5,%%mm3\n"
1422
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
1423
"\t" pdivwqX(mm4,mm5,mm6) "\n"
1426
"\tmovq %%mm4,%%mm5\n"
1427
"\tpsubusw %%mm6,%%mm4\n"
1428
"\tpsubusw %%mm7,%%mm5\n"
1430
"\tpackuswb %%mm4,%%mm5\n"
1432
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
1435
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
1437
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
1438
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
1442
: "+m" (*op.A), "+m" (*op.B), "+m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
1443
: "%mm1", "%mm2", "%mm3", "%mm4");
1451
asm volatile (" movd %0,%%mm0\n"
1454
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
1455
"\tpxor %%mm4,%%mm4\n"
1456
"\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
1458
"\tmovq %%mm1,%%mm3\n"
1459
"\tpxor %%mm5,%%mm5\n"
1460
"\tpunpcklbw %%mm5,%%mm3\n"
1462
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
1464
"\t" pdivwqX(mm4,mm5,mm7) "\n"
1467
"\tpsubb %%mm0,%%mm2\n" /* mm2 = 255 - A */
1468
"\tpxor %%mm4,%%mm4\n"
1469
"\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256 */
1471
"\tmovq %%mm1,%%mm3\n"
1472
"\tpxor %%mm5,%%mm5\n"
1473
"\tpunpckhbw %%mm5,%%mm3\n"
1475
"\tpaddusw %%mm3,%%mm5\n" /* mm5 = B + 1 */
1476
"\t" pdivwqX(mm4,mm5,mm6) "\n"
1479
"\tmovq %%mm4,%%mm5\n"
1480
"\tpsubusw %%mm6,%%mm4\n"
1481
"\tpsubusw %%mm7,%%mm5\n"
1483
"\tpackuswb %%mm4,%%mm5\n"
1485
"\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
1488
"\tpand %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
1490
"\tpandn %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
1491
"\tpor %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
1495
: "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
1496
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
1503
xxxgimp_composite_coloronly_va8_va8_va8_mmx (GimpCompositeContext *_op)
1505
GimpCompositeContext op = *_op;
1510
xxxgimp_composite_darken_va8_va8_va8_mmx (GimpCompositeContext *_op)
1512
GimpCompositeContext op = *_op;
1516
asm("movl 12(%esp), %edi");
1517
asm("movq v8_alpha_mask, %mm0");
1518
asm("subl $ 4, %ecx");
1519
asm("jl .darken_pixels_1a_1a_last3");
1520
asm("movl $ 8, %ebx");
1521
asm(".darken_pixels_1a_1a_loop:");
1522
asm("movq (%eax), %mm2");
1523
asm("movq (%edx), %mm3");
1525
asm("movq %mm2, %mm4");
1526
asm("psubusb %mm3, %mm4");
1527
asm("psubb %mm4, %mm2");
1528
asm("movq %mm2, %mm1");
1529
asm("movq %mm1, (%edi)");
1530
asm("addl %ebx, %eax");
1531
asm("addl %ebx, %edx");
1532
asm("addl %ebx, %edi");
1533
asm("subl $ 4, %ecx");
1534
asm("jge .darken_pixels_1a_1a_loop");
1536
asm(".darken_pixels_1a_1a_last3:");
1537
asm("test $ 2, %ecx");
1538
asm("jz .darken_pixels_1a_1a_last1");
1539
asm("movd (%eax), %mm2");
1540
asm("movd (%edx), %mm3");
1542
asm("movq %mm2, %mm4");
1543
asm("psubusb %mm3, %mm4");
1544
asm("psubb %mm4, %mm2");
1545
asm("movq %mm2, %mm1");
1546
asm("addl $ 4, %eax");
1547
asm("addl $ 4, %edx");
1548
asm("addl $ 4, %edi");
1550
asm(".darken_pixels_1a_1a_last1:");
1551
asm("test $ 1, %ecx");
1552
asm("jz .darken_pixels_1a_1a_end");
1554
asm("movw (%eax), %bx");
1555
asm("movd %ebx, %mm2");
1556
asm("movw (%edx), %bx");
1557
asm("movd %ebx, %mm3");
1559
asm("movq %mm2, %mm4");
1560
asm("psubusb %mm3, %mm4");
1561
asm("psubb %mm4, %mm2");
1562
asm("movq %mm2, %mm1");
1563
asm("movd %mm1, %ebx");
1564
asm("movw %bx, (%edi)");
1566
asm(".darken_pixels_1a_1a_end:");
1574
xxxgimp_composite_difference_va8_va8_va8_mmx (GimpCompositeContext *_op)
1576
GimpCompositeContext op = *_op;
1580
asm("movl 12(%esp), %edi");
1581
asm("movq v8_alpha_mask, %mm0");
1582
asm("subl $ 4, %ecx");
1583
asm("jl .difference_pixels_1a_1a_last3");
1584
asm("movl $ 8, %ebx");
1585
asm(".difference_pixels_1a_1a_loop:");
1586
asm("movq (%eax), %mm2");
1587
asm("movq (%edx), %mm3");
1589
asm("movq %mm2, %mm4");
1590
asm("movq %mm3, %mm5");
1591
asm("psubusb %mm3, %mm4");
1592
asm("psubusb %mm2, %mm5");
1593
asm("movq %mm0, %mm1");
1594
asm("paddb %mm5, %mm4");
1595
asm("pandn %mm4, %mm1");
1596
asm("psubb %mm4, %mm2");
1597
asm("pand %mm0, %mm2");
1598
asm("por %mm2, %mm1");
1599
asm("movq %mm1, (%edi)");
1600
asm("addl %ebx, %eax");
1601
asm("addl %ebx, %edx");
1602
asm("addl %ebx, %edi");
1603
asm("subl $ 4, %ecx");
1604
asm("jge .difference_pixels_1a_1a_loop");
1606
asm(".difference_pixels_1a_1a_last3:");
1607
asm("test $ 2, %ecx");
1608
asm("jz .difference_pixels_1a_1a_last1");
1609
asm("movd (%eax), %mm2");
1610
asm("movd (%edx), %mm3");
1612
asm("movq %mm2, %mm4");
1613
asm("movq %mm3, %mm5");
1614
asm("psubusb %mm3, %mm4");
1615
asm("psubusb %mm2, %mm5");
1616
asm("movq %mm0, %mm1");
1617
asm("paddb %mm5, %mm4");
1618
asm("pandn %mm4, %mm1");
1619
asm("psubb %mm4, %mm2");
1620
asm("pand %mm0, %mm2");
1621
asm("por %mm2, %mm1");
1622
asm("addl $ 4, %eax");
1623
asm("addl $ 4, %edx");
1624
asm("addl $ 4, %edi");
1626
asm(".difference_pixels_1a_1a_last1:");
1627
asm("test $ 1, %ecx");
1628
asm("jz .difference_pixels_1a_1a_end");
1630
asm("movw (%eax), %bx");
1631
asm("movd %ebx, %mm2");
1632
asm("movw (%edx), %bx");
1633
asm("movd %ebx, %mm3");
1635
asm("movq %mm2, %mm4");
1636
asm("movq %mm3, %mm5");
1637
asm("psubusb %mm3, %mm4");
1638
asm("psubusb %mm2, %mm5");
1639
asm("movq %mm0, %mm1");
1640
asm("paddb %mm5, %mm4");
1641
asm("pandn %mm4, %mm1");
1642
asm("psubb %mm4, %mm2");
1643
asm("pand %mm0, %mm2");
1644
asm("por %mm2, %mm1");
1645
asm("movd %mm1, %ebx");
1646
asm("movw %bx, (%edi)");
1648
asm(".difference_pixels_1a_1a_end:");
1656
xxxgimp_composite_dissolve_va8_va8_va8_mmx (GimpCompositeContext *_op)
1658
GimpCompositeContext op = *_op;
1663
xxxgimp_composite_divide_va8_va8_va8_mmx (GimpCompositeContext *_op)
1665
GimpCompositeContext op = *_op;
1670
xxxgimp_composite_dodge_va8_va8_va8_mmx (GimpCompositeContext *_op)
1672
GimpCompositeContext op = *_op;
1677
xxxgimp_composite_grain_extract_va8_va8_va8_mmx (GimpCompositeContext *_op)
1679
GimpCompositeContext op = *_op;
1684
xxxgimp_composite_grain_merge_va8_va8_va8_mmx (GimpCompositeContext *_op)
1686
GimpCompositeContext op = *_op;
1691
xxxgimp_composite_hardlight_va8_va8_va8_mmx (GimpCompositeContext *_op)
1693
GimpCompositeContext op = *_op;
1698
xxxgimp_composite_hueonly_va8_va8_va8_mmx (GimpCompositeContext *_op)
1700
GimpCompositeContext op = *_op;
1705
xxxgimp_composite_lighten_va8_va8_va8_mmx (GimpCompositeContext *_op)
1707
GimpCompositeContext op = *_op;
1711
asm("movl 12(%esp), %edi");
1712
asm("movq v8_alpha_mask, %mm0");
1713
asm("subl $ 4, %ecx");
1714
asm("jl .lighten_pixels_1a_1a_last3");
1715
asm("movl $ 8, %ebx");
1716
asm(".lighten_pixels_1a_1a_loop:");
1717
asm("movq (%eax), %mm2");
1718
asm("movq (%edx), %mm3");
1720
asm("movq %mm2, %mm4");
1721
asm("psubusb %mm3, %mm4");
1722
asm("paddb %mm4, %mm3");
1723
asm("movq %mm0, %mm1");
1724
asm("pandn %mm3, %mm1");
1726
asm("psubb %mm4, %mm2");
1727
asm("pand %mm0, %mm2");
1728
asm("por %mm2, %mm1");
1729
asm("movq %mm1, (%edi)");
1730
asm("addl %ebx, %eax");
1731
asm("addl %ebx, %edx");
1732
asm("addl %ebx, %edi");
1733
asm("subl $ 4, %ecx");
1734
asm("jge .lighten_pixels_1a_1a_loop");
1736
asm(".lighten_pixels_1a_1a_last3:");
1737
asm("test $ 2, %ecx");
1738
asm("jz .lighten_pixels_1a_1a_last1");
1739
asm("movd (%eax), %mm2");
1740
asm("movd (%edx), %mm3");
1742
asm("movq %mm2, %mm4");
1743
asm("psubusb %mm3, %mm4");
1744
asm("paddb %mm4, %mm3");
1745
asm("movq %mm0, %mm1");
1746
asm("pandn %mm3, %mm1");
1748
asm("psubb %mm4, %mm2");
1749
asm("pand %mm0, %mm2");
1750
asm("por %mm2, %mm1");
1751
asm("addl $ 4, %eax");
1752
asm("addl $ 4, %edx");
1753
asm("addl $ 4, %edi");
1755
asm(".lighten_pixels_1a_1a_last1:");
1756
asm("test $ 1, %ecx");
1757
asm("jz .lighten_pixels_1a_1a_end");
1759
asm("movw (%eax), %bx");
1760
asm("movd %ebx, %mm2");
1761
asm("movw (%edx), %bx");
1762
asm("movd %ebx, %mm3");
1764
asm("movq %mm2, %mm4");
1765
asm("psubusb %mm3, %mm4");
1766
asm("paddb %mm4, %mm3");
1767
asm("movq %mm0, %mm1");
1768
asm("pandn %mm3, %mm1");
1770
asm("psubb %mm4, %mm2");
1771
asm("pand %mm0, %mm2");
1772
asm("por %mm2, %mm1");
1773
asm("movd %mm1, %ebx");
1774
asm("movw %bx, (%edi)");
1776
asm(".lighten_pixels_1a_1a_end:");
1784
xxxgimp_composite_multiply_va8_va8_va8_mmx (GimpCompositeContext *_op)
1786
GimpCompositeContext op = *_op;
1790
asm("movl 12(%esp), %edi");
1791
asm("movq v8_alpha_mask, %mm0");
1792
asm("subl $ 4, %ecx");
1793
asm("jl .multiply_pixels_1a_1a_last3");
1794
asm("movl $ 8, %ebx");
1795
asm(".multiply_pixels_1a_1a_loop:");
1796
asm("movq (%eax), %mm2");
1797
asm("movq (%edx), %mm3");
1800
asm("movq %mm2, %mm1");
1801
asm("punpcklbw %mm6, %mm1");
1802
asm("movq %mm3, %mm5");
1803
asm("punpcklbw %mm6, %mm5");
1804
asm("pmullw %mm5, %mm1");
1805
asm("paddw %mm7, %mm1");
1806
asm("movq %mm1, %mm5");
1807
asm("psrlw $ 8, %mm5");
1808
asm("paddw %mm5, %mm1");
1809
asm("psrlw $ 8, %mm1");
1811
asm("movq %mm2, %mm4");
1812
asm("punpckhbw %mm6, %mm4");
1813
asm("movq %mm3, %mm5");
1814
asm("punpckhbw %mm6, %mm5");
1815
asm("pmullw %mm5, %mm4");
1816
asm("paddw %mm7, %mm4");
1817
asm("movq %mm4, %mm5");
1818
asm("psrlw $ 8, %mm5");
1819
asm("paddw %mm5, %mm4");
1820
asm("psrlw $ 8, %mm4");
1822
asm("packuswb %mm4, %mm1");
1824
asm("movq %mm0, %mm4");
1825
asm("pandn %mm1, %mm4");
1826
asm("movq %mm4, %mm1");
1828
asm("movq %mm2, %mm4");
1829
asm("psubusb %mm3, %mm4");
1830
asm("psubb %mm4, %mm2");
1831
asm("pand %mm0, %mm2");
1832
asm("por %mm2, %mm1");
1833
asm("movq %mm1, (%edi)");
1834
asm("addl %ebx, %eax");
1835
asm("addl %ebx, %edx");
1836
asm("addl %ebx, %edi");
1837
asm("subl $ 4, %ecx");
1838
asm("jge .multiply_pixels_1a_1a_loop");
1840
asm(".multiply_pixels_1a_1a_last3:");
1841
asm("test $ 2, %ecx");
1842
asm("jz .multiply_pixels_1a_1a_last1");
1843
asm("movd (%eax), %mm2");
1844
asm("movd (%edx), %mm3");
1847
asm("movq %mm2, %mm1");
1848
asm("punpcklbw %mm6, %mm1");
1849
asm("movq %mm3, %mm5");
1850
asm("punpcklbw %mm6, %mm5");
1851
asm("pmullw %mm5, %mm1");
1852
asm("paddw %mm7, %mm1");
1853
asm("movq %mm1, %mm5");
1854
asm("psrlw $ 8, %mm5");
1855
asm("paddw %mm5, %mm1");
1856
asm("psrlw $ 8, %mm1");
1858
asm("movq %mm2, %mm4");
1859
asm("punpckhbw %mm6, %mm4");
1860
asm("movq %mm3, %mm5");
1861
asm("punpckhbw %mm6, %mm5");
1862
asm("pmullw %mm5, %mm4");
1863
asm("paddw %mm7, %mm4");
1864
asm("movq %mm4, %mm5");
1865
asm("psrlw $ 8, %mm5");
1866
asm("paddw %mm5, %mm4");
1867
asm("psrlw $ 8, %mm4");
1869
asm("packuswb %mm4, %mm1");
1871
asm("movq %mm0, %mm4");
1872
asm("pandn %mm1, %mm4");
1873
asm("movq %mm4, %mm1");
1875
asm("movq %mm2, %mm4");
1876
asm("psubusb %mm3, %mm4");
1877
asm("psubb %mm4, %mm2");
1878
asm("pand %mm0, %mm2");
1879
asm("por %mm2, %mm1");
1880
asm("addl $ 4, %eax");
1881
asm("addl $ 4, %edx");
1882
asm("addl $ 4, %edi");
1884
asm(".multiply_pixels_1a_1a_last1:");
1885
asm("test $ 1, %ecx");
1886
asm("jz .multiply_pixels_1a_1a_end");
1888
asm("movw (%eax), %bx");
1889
asm("movd %ebx, %mm2");
1890
asm("movw (%edx), %bx");
1891
asm("movd %ebx, %mm3");
1894
asm("movq %mm2, %mm1");
1895
asm("punpcklbw %mm6, %mm1");
1896
asm("movq %mm3, %mm5");
1897
asm("punpcklbw %mm6, %mm5");
1898
asm("pmullw %mm5, %mm1");
1899
asm("paddw %mm7, %mm1");
1900
asm("movq %mm1, %mm5");
1901
asm("psrlw $ 8, %mm5");
1902
asm("paddw %mm5, %mm1");
1903
asm("psrlw $ 8, %mm1");
1905
asm("movq %mm2, %mm4");
1906
asm("punpckhbw %mm6, %mm4");
1907
asm("movq %mm3, %mm5");
1908
asm("punpckhbw %mm6, %mm5");
1909
asm("pmullw %mm5, %mm4");
1910
asm("paddw %mm7, %mm4");
1911
asm("movq %mm4, %mm5");
1912
asm("psrlw $ 8, %mm5");
1913
asm("paddw %mm5, %mm4");
1914
asm("psrlw $ 8, %mm4");
1916
asm("packuswb %mm4, %mm1");
1918
asm("movq %mm0, %mm4");
1919
asm("pandn %mm1, %mm4");
1920
asm("movq %mm4, %mm1");
1922
asm("movq %mm2, %mm4");
1923
asm("psubusb %mm3, %mm4");
1924
asm("psubb %mm4, %mm2");
1925
asm("pand %mm0, %mm2");
1926
asm("por %mm2, %mm1");
1927
asm("movd %mm1, %ebx");
1928
asm("movw %bx, (%edi)");
1930
asm(".multiply_pixels_1a_1a_end:");
1938
xxxgimp_composite_overlay_va8_va8_va8_mmx (GimpCompositeContext *_op)
1940
GimpCompositeContext op = *_op;
1944
asm("movl 12(%esp), %edi");
1945
asm("movq v8_alpha_mask, %mm0");
1946
asm("subl $ 4, %ecx");
1947
asm("jl .overlay_pixels_1a_1a_last3");
1948
asm("movl $ 8, %ebx");
1949
asm(".overlay_pixels_1a_1a_loop:");
1950
asm("movq (%eax), %mm2");
1951
asm("movq (%edx), %mm3");
1952
asm("call op_overlay");
1953
asm("movq %mm1, (%edi)");
1954
asm("addl %ebx, %eax");
1955
asm("addl %ebx, %edx");
1956
asm("addl %ebx, %edi");
1957
asm("subl $ 4, %ecx");
1958
asm("jge .overlay_pixels_1a_1a_loop");
1960
asm(".overlay_pixels_1a_1a_last3:");
1961
asm("test $ 2, %ecx");
1962
asm("jz .overlay_pixels_1a_1a_last1");
1963
asm("movd (%eax), %mm2");
1964
asm("movd (%edx), %mm3");
1965
asm("call op_overlay");
1966
asm("addl $ 4, %eax");
1967
asm("addl $ 4, %edx");
1968
asm("addl $ 4, %edi");
1970
asm(".overlay_pixels_1a_1a_last1:");
1971
asm("test $ 1, %ecx");
1972
asm("jz .overlay_pixels_1a_1a_end");
1974
asm("movw (%eax), %bx");
1975
asm("movd %ebx, %mm2");
1976
asm("movw (%edx), %bx");
1977
asm("movd %ebx, %mm3");
1978
asm("call op_overlay");
1979
asm("movd %mm1, %ebx");
1980
asm("movw %bx, (%edi)");
1982
asm(".overlay_pixels_1a_1a_end:");
1990
xxxgimp_composite_replace_va8_va8_va8_mmx (GimpCompositeContext *_op)
1992
GimpCompositeContext op = *_op;
1997
xxxgimp_composite_saturationonly_va8_va8_va8_mmx (GimpCompositeContext *_op)
1999
GimpCompositeContext op = *_op;
2004
xxxgimp_composite_screen_va8_va8_va8_mmx (GimpCompositeContext *_op)
2006
GimpCompositeContext op = *_op;
2010
asm("movl 12(%esp), %edi");
2011
asm("movq v8_alpha_mask, %mm0");
2012
asm("subl $ 4, %ecx");
2013
asm("jl .screen_pixels_1a_1a_last3");
2014
asm("movl $ 8, %ebx");
2015
asm(".screen_pixels_1a_1a_loop:");
2016
asm("movq (%eax), %mm2");
2017
asm("movq (%edx), %mm3");
2020
asm("pcmpeqb %mm4, %mm4");
2021
asm("psubb %mm2, %mm4");
2022
asm("pcmpeqb %mm5, %mm5");
2023
asm("psubb %mm3, %mm5");
2025
asm("movq %mm4, %mm1");
2026
asm("punpcklbw %mm6, %mm1");
2027
asm("movq %mm5, %mm3");
2028
asm("punpcklbw %mm6, %mm3");
2029
asm("pmullw %mm3, %mm1");
2030
asm("paddw %mm7, %mm1");
2031
asm("movq %mm1, %mm3");
2032
asm("psrlw $ 8, %mm3");
2033
asm("paddw %mm3, %mm1");
2034
asm("psrlw $ 8, %mm1");
2036
asm("movq %mm4, %mm2");
2037
asm("punpckhbw %mm6, %mm2");
2038
asm("movq %mm5, %mm3");
2039
asm("punpckhbw %mm6, %mm3");
2040
asm("pmullw %mm3, %mm2");
2041
asm("paddw %mm7, %mm2");
2042
asm("movq %mm2, %mm3");
2043
asm("psrlw $ 8, %mm3");
2044
asm("paddw %mm3, %mm2");
2045
asm("psrlw $ 8, %mm2");
2047
asm("packuswb %mm2, %mm1");
2049
asm("pcmpeqb %mm3, %mm3");
2050
asm("psubb %mm1, %mm3");
2052
asm("movq %mm0, %mm1");
2053
asm("pandn %mm3, %mm1");
2055
asm("movq %mm2, %mm4");
2056
asm("psubusb %mm5, %mm2");
2057
asm("paddb %mm2, %mm5");
2058
asm("pcmpeqb %mm3, %mm3");
2059
asm("psubb %mm5, %mm3");
2061
asm("pand %mm0, %mm3");
2062
asm("por %mm3, %mm1");
2063
asm("movq %mm1, (%edi)");
2064
asm("addl %ebx, %eax");
2065
asm("addl %ebx, %edx");
2066
asm("addl %ebx, %edi");
2067
asm("subl $ 4, %ecx");
2068
asm("jge .screen_pixels_1a_1a_loop");
2070
asm(".screen_pixels_1a_1a_last3:");
2071
asm("test $ 2, %ecx");
2072
asm("jz .screen_pixels_1a_1a_last1");
2073
asm("movd (%eax), %mm2");
2074
asm("movd (%edx), %mm3");
2077
asm("pcmpeqb %mm4, %mm4");
2078
asm("psubb %mm2, %mm4");
2079
asm("pcmpeqb %mm5, %mm5");
2080
asm("psubb %mm3, %mm5");
2082
asm("movq %mm4, %mm1");
2083
asm("punpcklbw %mm6, %mm1");
2084
asm("movq %mm5, %mm3");
2085
asm("punpcklbw %mm6, %mm3");
2086
asm("pmullw %mm3, %mm1");
2087
asm("paddw %mm7, %mm1");
2088
asm("movq %mm1, %mm3");
2089
asm("psrlw $ 8, %mm3");
2090
asm("paddw %mm3, %mm1");
2091
asm("psrlw $ 8, %mm1");
2093
asm("movq %mm4, %mm2");
2094
asm("punpckhbw %mm6, %mm2");
2095
asm("movq %mm5, %mm3");
2096
asm("punpckhbw %mm6, %mm3");
2097
asm("pmullw %mm3, %mm2");
2098
asm("paddw %mm7, %mm2");
2099
asm("movq %mm2, %mm3");
2100
asm("psrlw $ 8, %mm3");
2101
asm("paddw %mm3, %mm2");
2102
asm("psrlw $ 8, %mm2");
2104
asm("packuswb %mm2, %mm1");
2106
asm("pcmpeqb %mm3, %mm3");
2107
asm("psubb %mm1, %mm3");
2109
asm("movq %mm0, %mm1");
2110
asm("pandn %mm3, %mm1");
2112
asm("movq %mm2, %mm4");
2113
asm("psubusb %mm5, %mm2");
2114
asm("paddb %mm2, %mm5");
2115
asm("pcmpeqb %mm3, %mm3");
2116
asm("psubb %mm5, %mm3");
2118
asm("pand %mm0, %mm3");
2119
asm("por %mm3, %mm1");
2120
asm("addl $ 4, %eax");
2121
asm("addl $ 4, %edx");
2122
asm("addl $ 4, %edi");
2124
asm(".screen_pixels_1a_1a_last1:");
2125
asm("test $ 1, %ecx");
2126
asm("jz .screen_pixels_1a_1a_end");
2128
asm("movw (%eax), %bx");
2129
asm("movd %ebx, %mm2");
2130
asm("movw (%edx), %bx");
2131
asm("movd %ebx, %mm3");
2134
asm("pcmpeqb %mm4, %mm4");
2135
asm("psubb %mm2, %mm4");
2136
asm("pcmpeqb %mm5, %mm5");
2137
asm("psubb %mm3, %mm5");
2139
asm("movq %mm4, %mm1");
2140
asm("punpcklbw %mm6, %mm1");
2141
asm("movq %mm5, %mm3");
2142
asm("punpcklbw %mm6, %mm3");
2143
asm("pmullw %mm3, %mm1");
2144
asm("paddw %mm7, %mm1");
2145
asm("movq %mm1, %mm3");
2146
asm("psrlw $ 8, %mm3");
2147
asm("paddw %mm3, %mm1");
2148
asm("psrlw $ 8, %mm1");
2150
asm("movq %mm4, %mm2");
2151
asm("punpckhbw %mm6, %mm2");
2152
asm("movq %mm5, %mm3");
2153
asm("punpckhbw %mm6, %mm3");
2154
asm("pmullw %mm3, %mm2");
2155
asm("paddw %mm7, %mm2");
2156
asm("movq %mm2, %mm3");
2157
asm("psrlw $ 8, %mm3");
2158
asm("paddw %mm3, %mm2");
2159
asm("psrlw $ 8, %mm2");
2161
asm("packuswb %mm2, %mm1");
2163
asm("pcmpeqb %mm3, %mm3");
2164
asm("psubb %mm1, %mm3");
2166
asm("movq %mm0, %mm1");
2167
asm("pandn %mm3, %mm1");
2169
asm("movq %mm2, %mm4");
2170
asm("psubusb %mm5, %mm2");
2171
asm("paddb %mm2, %mm5");
2172
asm("pcmpeqb %mm3, %mm3");
2173
asm("psubb %mm5, %mm3");
2175
asm("pand %mm0, %mm3");
2176
asm("por %mm3, %mm1");
2177
asm("movd %mm1, %ebx");
2178
asm("movw %bx, (%edi)");
2180
asm(".screen_pixels_1a_1a_end:");
2188
xxxgimp_composite_softlight_va8_va8_va8_mmx (GimpCompositeContext *_op)
2190
GimpCompositeContext op = *_op;
2195
xxxgimp_composite_subtract_va8_va8_va8_mmx (GimpCompositeContext *_op)
2197
GimpCompositeContext op = *_op;
2201
asm("movl 12(%esp), %edi");
2202
asm("movq v8_alpha_mask, %mm0");
2203
asm("subl $ 4, %ecx");
2204
asm("jl .substract_pixels_1a_1a_last3");
2205
asm("movl $ 8, %ebx");
2206
asm(".substract_pixels_1a_1a_loop:");
2207
asm("movq (%eax), %mm2");
2208
asm("movq (%edx), %mm3");
2210
asm("movq %mm2, %mm4");
2211
asm("psubusb %mm3, %mm4");
2212
asm("movq %mm0, %mm1");
2213
asm("pandn %mm4, %mm1");
2214
asm("psubb %mm4, %mm2");
2215
asm("pand %mm0, %mm2");
2216
asm("por %mm2, %mm1");
2217
asm("movq %mm1, (%edi)");
2218
asm("addl %ebx, %eax");
2219
asm("addl %ebx, %edx");
2220
asm("addl %ebx, %edi");
2221
asm("subl $ 4, %ecx");
2222
asm("jge .substract_pixels_1a_1a_loop");
2224
asm(".substract_pixels_1a_1a_last3:");
2225
asm("test $ 2, %ecx");
2226
asm("jz .substract_pixels_1a_1a_last1");
2227
asm("movd (%eax), %mm2");
2228
asm("movd (%edx), %mm3");
2230
asm("movq %mm2, %mm4");
2231
asm("psubusb %mm3, %mm4");
2232
asm("movq %mm0, %mm1");
2233
asm("pandn %mm4, %mm1");
2234
asm("psubb %mm4, %mm2");
2235
asm("pand %mm0, %mm2");
2236
asm("por %mm2, %mm1");
2237
asm("addl $ 4, %eax");
2238
asm("addl $ 4, %edx");
2239
asm("addl $ 4, %edi");
2241
asm(".substract_pixels_1a_1a_last1:");
2242
asm("test $ 1, %ecx");
2243
asm("jz .substract_pixels_1a_1a_end");
2245
asm("movw (%eax), %bx");
2246
asm("movd %ebx, %mm2");
2247
asm("movw (%edx), %bx");
2248
asm("movd %ebx, %mm3");
2250
asm("movq %mm2, %mm4");
2251
asm("psubusb %mm3, %mm4");
2252
asm("movq %mm0, %mm1");
2253
asm("pandn %mm4, %mm1");
2254
asm("psubb %mm4, %mm2");
2255
asm("pand %mm0, %mm2");
2256
asm("por %mm2, %mm1");
2257
asm("movd %mm1, %ebx");
2258
asm("movw %bx, (%edi)");
2260
asm(".substract_pixels_1a_1a_end:");
2267
xxxgimp_composite_swap_va8_va8_va8_mmx (GimpCompositeContext *_op)
2269
GimpCompositeContext op = *_op;
2274
xxxgimp_composite_valueonly_va8_va8_va8_mmx (GimpCompositeContext *_op)
2276
GimpCompositeContext op = *_op;
2281
#endif /* COMPILE_MMX_IS_OKAY */

gboolean
gimp_composite_mmx_init (void)
{
#ifdef COMPILE_MMX_IS_OKAY
  if (cpu_accel () & CPU_ACCEL_X86_MMX)