void
gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong  n_pixels = _op->n_pixels;

  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movq      %1,%%mm0\n"
                    "\tmovq      %2,%%mm1\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n" /* mm7 = (255 - A) * 256 / (B + 1), low bytes */

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n" /* mm6 = (255 - A) * 256 / (B + 1), high bytes */

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n" /* mm4 = 255 - high quotients */
                    "\tpsubusw   %%mm7,%%mm5\n" /* mm5 = 255 - low quotients  */

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1), clobbers mm3 */

                    "\tmovq      %6,%%mm7\n"    /* mm7 = rgba8_alpha_mask_64 */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask    */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5   */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1           */

                    "\tmovq      %%mm7,%0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      d++;
      a++;
      b++;
    }
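  /* Any remaining single pixel is handled below with movd loads/stores;
     the arithmetic is identical to the two-pixel loop above. */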
  if (n_pixels)
    {
      asm volatile ("  movd      %1,%%mm0\n"
                    "\tmovd      %2,%%mm1\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1), clobbers mm3 */

                    "\tmovq      %6,%%mm7\n"    /* mm7 = rgba8_alpha_mask_64 */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask    */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5   */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1           */

                    "\tmovd      %%mm7,%0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

  asm("emms");
}
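/* For reference: a scalar sketch of the per-channel math the asm above
 * implements (helper name hypothetical, not part of the original file).
 * Each colour channel becomes 255 - (255 - A) * 256 / (B + 1), saturated
 * at zero; the destination alpha becomes min(A, B).
 */
static inline guint8
burn_channel_scalar_sketch (guint8 A, guint8 B)
{
  gint v = 255 - ((255 - A) * 256) / (B + 1); /* the + 1 avoids division by zero */

  return (guint8) CLAMP (v, 0, 255);
}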
void
gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
  /* ... */
                    "\tmovd      %%mm2,%1\n"
                    : "+m" (*a), "+m" (*b)
                    : /* no inputs */
                    : "%mm1", "%mm2", "%mm3", "%mm4");
  /* ... */
}
void
gimp_composite_addition_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong  n_pixels = _op->n_pixels;

  asm volatile ("movq    %0,%%mm0"
                : /* no outputs */
                : "m" (*va8_alpha_mask_64)
                : "%mm0");

  for (; n_pixels >= 4; n_pixels -= 4)
    {
      asm volatile ("  movq    %1, %%mm2\n"
                    "\tmovq    %2, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"      /* mm4 = A + B, saturated       */
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"      /* mm1 = sum, value bytes only  */
                    "\t" pminub(mm3, mm2, mm4) "\n" /* mm2 = min(A,B), clobbers mm4 */
                    "\tpand    %%mm0, %%mm2\n"      /* mm2 = min, alpha bytes only  */
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovq    %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a++;
      b++;
      d++;
    }

  uint32 *a32 = (uint32 *) a;
  uint32 *b32 = (uint32 *) b;
  uint32 *d32 = (uint32 *) d;

  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movd    %1, %%mm2\n"
                    "\tmovd    %2, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %0\n"
                    : "=m" (*d32)
                    : "m" (*a32), "m" (*b32)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a32++;
      b32++;
      d32++;
    }

  uint16 *a16 = (uint16 *) a32;
  uint16 *b16 = (uint16 *) b32;
  uint16 *d16 = (uint16 *) d32;

  for (; n_pixels >= 1; n_pixels -= 1)
    {
      asm volatile ("  movw    %1, %%ax ; movd    %%eax, %%mm2\n"
                    "\tmovw    %2, %%ax ; movd    %%eax, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %%eax\n"
                    "\tmovw    %%ax, %0\n"
                    : "=m" (*d16)
                    : "m" (*a16), "m" (*b16)
                    : "%eax", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a16++;
      b16++;
      d16++;
    }

  asm("emms");
}
void
gimp_composite_subtract_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong  n_pixels = _op->n_pixels;

  asm volatile ("movq    %0,%%mm0"
                : /* no outputs */
                : "m" (*va8_alpha_mask_64)
                : "%mm0");

  for (; n_pixels >= 4; n_pixels -= 4)
    {
      asm volatile ("  movq    %1, %%mm2\n"
                    "\tmovq    %2, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpsubusb %%mm3, %%mm4\n"      /* mm4 = A - B, clamped at 0    */
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"      /* mm1 = difference, value bytes */
                    "\t" pminub(mm3, mm2, mm4) "\n" /* mm2 = min(A,B), clobbers mm4 */
                    "\tpand    %%mm0, %%mm2\n"      /* mm2 = min, alpha bytes only  */
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovq    %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a++;
      b++;
      d++;
    }

  uint32 *a32 = (uint32 *) a;
  uint32 *b32 = (uint32 *) b;
  uint32 *d32 = (uint32 *) d;

  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movd    %1, %%mm2\n"
                    "\tmovd    %2, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpsubusb %%mm3, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %0\n"
                    : "=m" (*d32)
                    : "m" (*a32), "m" (*b32)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a32++;
      b32++;
      d32++;
    }

  uint16 *a16 = (uint16 *) a32;
  uint16 *b16 = (uint16 *) b32;
  uint16 *d16 = (uint16 *) d32;

  for (; n_pixels >= 1; n_pixels -= 1)
    {
      asm volatile ("  movw    %1, %%ax ; movd    %%eax, %%mm2\n"
                    "\tmovw    %2, %%ax ; movd    %%eax, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpsubusb %%mm3, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %%eax\n"
                    "\tmovw    %%ax, %0\n"
                    : "=m" (*d16)
                    : "m" (*a16), "m" (*b16)
                    : "%eax", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
      a16++;
      b16++;
      d16++;
    }

  asm("emms");
}
void
gimp_composite_multiply_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong  n_pixels = _op->n_pixels;

  asm volatile ("movq    %0,%%mm0\n"
                "movq    %1,%%mm7\n"
                "pxor    %%mm6,%%mm6\n"
                : /* no outputs */
                : "m" (*va8_alpha_mask_64), "m" (*va8_w128_64)
                : "%mm6", "%mm7", "%mm0");

  for (; n_pixels >= 4; n_pixels -= 4) /* one quad = 4 va8 pixels */
    {
      asm volatile ("  movq    %1, %%mm2\n"
                    "\tmovq    %2, %%mm3\n"

                    mmx_low_bytes_to_words(mm2,mm1,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
                    mmx_int_mult(mm5,mm1,mm7)     /* mm1 = low words of A*B/255  */

                    mmx_high_bytes_to_words(mm2,mm4,mm6)
                    mmx_high_bytes_to_words(mm3,mm5,mm6)
                    mmx_int_mult(mm5,mm4,mm7)     /* mm4 = high words of A*B/255 */

                    "\tpackuswb %%mm4, %%mm1\n"

                    "\tmovq    %%mm0, %%mm4\n"
                    "\tpandn   %%mm1, %%mm4\n"
                    "\tmovq    %%mm4, %%mm1\n"    /* mm1 = product, value bytes   */
                    "\t" pminub(mm3,mm2,mm4) "\n" /* mm2 = min(A,B), clobbers mm4 */
                    "\tpand    %%mm0, %%mm2\n"    /* mm2 = min, alpha bytes only  */
                    "\tpor     %%mm2, %%mm1\n"

                    "\tmovq    %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      a++;
      b++;
      d++;
    }

  if (n_pixels >= 2)
    {
      asm volatile ("  movd    %1, %%mm2\n"
                    "\tmovd    %2, %%mm3\n"

                    mmx_low_bytes_to_words(mm2,mm1,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
                    mmx_int_mult(mm5,mm1,mm7)

                    "\tpackuswb %%mm6, %%mm1\n" /* mm6 = 0: high half unused */

                    "\tmovq    %%mm0, %%mm4\n"
                    "\tpandn   %%mm1, %%mm4\n"
                    "\tmovq    %%mm4, %%mm1\n"
                    "\t" pminub(mm3,mm2,mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"

                    "\tmovd    %%mm1, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }

  asm("emms");
}
void
gimp_composite_burn_va8_va8_va8_mmx (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm volatile ("movq    %0,%%mm0" /* preload alpha mask; destination register assumed */
                : /* no outputs */
                : "m" (*va8_alpha_mask)
                : "%mm0");

  for (; op.n_pixels >= 4; op.n_pixels -= 4) /* one quad = 4 va8 pixels */
    {
      asm volatile ("  movq      %0,%%mm0\n"
                    "\tmovq      %1,%%mm1\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1), clobbers mm3 */

                    "\tmovq      %6,%%mm7\n"    /* mm7 = va8_alpha_mask    */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask  */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1         */

                    "\tmovq      %%mm7,%2\n"
                    : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D)
                    : "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }

  if (op.n_pixels >= 2)
    {
      asm volatile ("  movd      %0,%%mm0\n"
                    "\tmovd      %1,%%mm1\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1), clobbers mm3 */

                    "\tmovq      %6,%%mm7\n"    /* mm7 = va8_alpha_mask    */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask  */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1         */

                    "\tmovd      %%mm7,%2\n"
                    : /* no outputs */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255_64), "m" (*va8_alpha_mask)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

  asm("emms");
}
void
xxxgimp_composite_coloronly_va8_va8_va8_mmx (GimpCompositeContext *_op)