void
gimp_composite_burn_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong n_pixels = _op->n_pixels;

  for (; n_pixels >= 2; n_pixels -= 2)
    {
      asm volatile ("  movq         %1,%%mm0\n"
                    "\tmovq         %2,%%mm1\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n" /* mm7 = (255 - A) * 256 / (B + 1), low half */

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n" /* mm6 = (255 - A) * 256 / (B + 1), high half */

                    "\tmovq         %5,%%mm4\n" /* mm4 = 255 in each word */
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq         %6,%%mm7\n" /* mm7 = rgba8_alpha_mask_64 */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovq      %%mm7,%0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      a++;
      b++;
      d++;
    }

  if (n_pixels > 0)
    {
      asm volatile ("  movd         %1,%%mm0\n"
                    "\tmovd         %2,%%mm1\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq         %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq         %6,%%mm7\n" /* mm7 = rgba8_alpha_mask_64 */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovd      %%mm7,%0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b), "m" (*rgba8_b255_64), "m" (*rgba8_w1_64), "m" (*rgba8_w255_64), "m" (*rgba8_alpha_mask_64)
                    : pdivwqX_clobber, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

  asm("emms");
}
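/* For reference, the asm above computes the burn ratio per 8-bit channel:
 * D = 255 - (255 - A) * 256 / (B + 1), saturated at 0, with the destination
 * alpha replaced by min (alpha(A), alpha(B)).  A minimal scalar sketch of
 * the same math (hypothetical helper, for illustration only):
 */
static inline guint8
burn_scalar (guint8 a, guint8 b)
{
  gint w = ((255 - a) << 8) / (b + 1);     /* (255 - A) * 256 / (B + 1) */

  return (guint8) (w > 255 ? 0 : 255 - w); /* psubusw saturates at 0 */
}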
void
gimp_composite_darken_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  uint64 *d = (uint64 *) _op->D;
  uint64 *a = (uint64 *) _op->A;
  uint64 *b = (uint64 *) _op->B;
  gulong n_pixels = _op->n_pixels;

  /* ... two-pixel loop ... */

  if (n_pixels > 0)
    {
      asm volatile ("  movd         %1,%%mm1\n"
                    "\tmovd         %2,%%mm2\n"
                    "\t" pminub(mm2,mm1,mm3) "\n" /* mm1 = min(mm1,mm2) clobber mm3 */
                    "\tmovd      %%mm1,%0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm1", "%mm2", "%mm3", "%mm4");
    }

  asm("emms");
}
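/* Darken is a bytewise unsigned minimum: D = MIN (A, B) on every channel,
 * which is what pminub computes for all eight packed bytes at once.  Scalar
 * sketch (hypothetical helper, for illustration only; MIN is glib's):
 */
static inline guint8
darken_scalar (guint8 a, guint8 b)
{
  return MIN (a, b);
}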
void
xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm volatile ("  movq    %0, %%mm0\n"
                "\tmovq    %1, %%mm7\n" /* mm7 = rgba8_w1_64, kept live for the loop */
                : /* empty */
                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
                : "%mm0", "%mm7");

  for (; op.n_pixels >= 2; op.n_pixels -= 2)
    {
      asm volatile ("  movq         %1,%%mm0\n"
                    "\tmovq         %2,%%mm1\n"

                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */

                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */

                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm6,%%mm6\n"
                    "\tpunpckhbw %%mm6,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */

                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */

                    "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */

                    "\tpminub    %%mm0,%%mm1\n" /* mm1 = min(A,B), for the alpha bytes */
                    "\tmovq         %3,%%mm3\n" /* mm3 = rgba8_alpha_mask_64 */
                    "\tmovq      %%mm3,%%mm2\n"

                    "\tpandn     %%mm5,%%mm3\n"

                    "\tpand      %%mm2,%%mm1\n"
                    "\tpor       %%mm1,%%mm3\n"

                    "\tmovq      %%mm3,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }

  if (op.n_pixels > 0)
    {
      asm volatile ("  movd         %1,%%mm0\n"
                    "\tmovd         %2,%%mm1\n"

                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */

                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */

                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm6,%%mm6\n"
                    "\tpunpckhbw %%mm6,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */

                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */

                    "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */

                    "\tpminub    %%mm0,%%mm1\n"
                    "\tmovq         %3,%%mm3\n"
                    "\tmovq      %%mm3,%%mm2\n"

                    "\tpandn     %%mm5,%%mm3\n"

                    "\tpand      %%mm2,%%mm1\n"
                    "\tpor       %%mm1,%%mm3\n"

                    "\tmovd      %%mm3,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }

  asm("emms");
}
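/* Per channel the loops above compute D = (A * 256) / (B + 1), an 8-bit
 * approximation of A/B where the +1 avoids division by zero, and replace
 * the alpha byte with min (alpha(A), alpha(B)).  Scalar sketch
 * (hypothetical helper, for illustration only):
 */
static inline guint8
divide_scalar (guint8 a, guint8 b)
{
  gint q = (a << 8) / (b + 1);         /* (A*256)/(B+1) */

  return (guint8) (q > 255 ? 255 : q); /* packuswb saturates at 255 */
}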
void
xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  for (; op.n_pixels >= 2; op.n_pixels -= 2)
    {
      asm volatile ("  movq         %1,%%mm0\n"
                    "\tmovq         %2,%%mm1\n"
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm2,%%mm3\n"
                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */

                    "\tmovq         %3,%%mm4\n" /* mm4 = rgba8_w256_64 */
                    "\tpsubw     %%mm3,%%mm4\n" /* mm4 = 256 - B */

                    "\t" pdivwuqX(mm2,mm4,mm5) "\n" /* mm5 = (A*256)/(256-B) */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm2,%%mm3\n"
                    "\tpunpckhbw %%mm0,%%mm2\n"

                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"

                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"

                    "\tpackuswb  %%mm6,%%mm5\n"

                    "\tmovq         %4,%%mm6\n" /* mm6 = rgba8_alpha_mask_64 */
                    "\tmovq      %%mm1,%%mm7\n"
                    "\t" pminub(mm0,mm7,mm2) "\n"
                    "\tpand      %%mm6,%%mm7\n"
                    "\tpandn     %%mm5,%%mm6\n"

                    "\tpor       %%mm6,%%mm7\n"

                    "\tmovq      %%mm7,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
                    : "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }

  if (op.n_pixels > 0)
    {
      asm volatile ("  movd         %1,%%mm0\n"
                    "\tmovd         %2,%%mm1\n"
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm2,%%mm3\n"
                    "\tpunpcklbw %%mm0,%%mm2\n"

                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"

                    "\t" pdivwuqX(mm2,mm4,mm5) "\n"

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm2,%%mm3\n"
                    "\tpunpckhbw %%mm0,%%mm2\n"

                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"

                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"

                    "\tpackuswb  %%mm6,%%mm5\n"

                    "\tmovq         %4,%%mm6\n"
                    "\tmovq      %%mm1,%%mm7\n"
                    "\tpminub    %%mm0,%%mm7\n"
                    "\tpand      %%mm6,%%mm7\n"
                    "\tpandn     %%mm5,%%mm6\n"

                    "\tpor       %%mm6,%%mm7\n"

                    "\tmovd      %%mm7,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }

  asm("emms");
}
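/* Per channel the loops above compute D = (A * 256) / (256 - B), the dodge
 * ratio, again with alpha = min (alpha(A), alpha(B)).  Since B <= 255, the
 * divisor 256 - B is never zero.  Scalar sketch (hypothetical helper, for
 * illustration only):
 */
static inline guint8
dodge_scalar (guint8 a, guint8 b)
{
  gint q = (a << 8) / (256 - b);       /* (A*256)/(256-B) */

  return (guint8) (q > 255 ? 255 : q); /* packuswb saturates at 255 */
}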
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
  /* ... */
}
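/* Grain extract, per channel: D = CLAMP (A - B + 128, 0, 255).  A scalar
 * sketch of that math (hypothetical helper, for illustration only; CLAMP
 * comes from glib):
 */
static inline guint8
grain_extract_scalar (guint8 a, guint8 b)
{
  gint v = (gint) a - (gint) b + 128;

  return (guint8) CLAMP (v, 0, 255);
}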
void
xxxgimp_composite_burn_va8_va8_va8_sse (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm volatile ("  movq    %0,%%mm7\n" /* preload the alpha mask */
                : /* empty */
                : "m" (*va8_alpha_mask)
                : "%mm7");

  for (; op.n_pixels >= 4; op.n_pixels -= 4)
    {
      asm volatile ("  movq      (%0),%%mm0; addl $8,%0\n"
                    "\tmovq      (%1),%%mm1; addl $8,%1\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq         %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq         %6,%%mm7\n" /* mm7 = va8_alpha_mask */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovq      %%mm7,(%2); addl $8,%2\n"
                    : "+r" (op.A), "+r" (op.B), "+r" (op.D)
                    : "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
                    : "%mm1", "%mm2", "%mm3", "%mm4");
    }

  if (op.n_pixels > 0)
    {
      asm volatile ("  movd      (%0),%%mm0\n"
                    "\tmovd      (%1),%%mm1\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255 - A) * 256 */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq         %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n"
                    "\tpsubusw   %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq         %6,%%mm7\n" /* mm7 = va8_alpha_mask */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = mm1 & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovd      %%mm7,(%2)\n"
                    : /* empty */
                    : "r" (op.A), "r" (op.B), "r" (op.D), "m" (*va8_b255), "m" (*va8_w1), "m" (*va8_w255), "m" (*va8_alpha_mask)
                    : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

  asm("emms");
}
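/* va8 packs two bytes per pixel (value, then alpha), so one 64-bit register
 * holds four pixels and the rgba8 burn arithmetic above carries over
 * unchanged; only the masks and the pixel stride differ.  A sketch of the
 * assumed layout (hypothetical type, for illustration only):
 */
typedef struct
{
  guint8 v; /* value */
  guint8 a; /* alpha */
} Va8PixelSketch;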
void
xxxgimp_composite_coloronly_va8_va8_va8_sse (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;