1
#if defined(Q_CC_GNU) && defined(__i386__)
9
// Runtime CPU feature detection via CPUID (GCC inline asm, i386 build only);
// returns a bitmask of supported features.  NOTE(review): this chunk is an
// elided extraction -- interior source lines are missing -- so the comments
// below describe only the visible lines.
static uint detectCPUFeatures() {
11
/* see p. 118 of amd64 instruction set manual Vol3 */
16
// Toggle EFLAGS bit 21 (0x00200000): if the bit can be flipped, the CPUID
// instruction is available on this processor.
"xor $0x00200000, %%eax\n"
25
// Request CPUID leaf 1 (standard feature flags, reported in EDX).
"mov $0x00000001, %%eax\n"
32
: "%eax", "%ecx", "%edx"
36
// result now contains the standard feature bits
37
// Bit positions below match Intel's CPUID leaf-1 EDX layout: 15 = CMOV,
// 23 = MMX, 25 = SSE, 26 = SSE2 (SSE2 is what qInitDrawhelperAsm tests for).
// The bodies of these ifs are not visible in this elided chunk.
if (result & (1 << 15))
39
if (result & (1 << 23))
41
if (result & (1 << 25))
43
if (result & (1 << 26))
49
// Fill 'len' 32-bit words at 'target' with 'value' using aligned 16-byte SSE
// stores; a scalar loop handles the remaining words.  NOTE(review): elided
// extraction -- interior lines are missing.
static void sse_memfill(uint *target, uint value, int len)
51
uint *end = target + len;
54
// Words needed to reach the next 16-byte boundary: low four address bits,
// divided by sizeof(uint).
int align = (((ulong)target) & 0xf) >> 2;
66
// Broadcast 'value' into all four dwords of xmm0 (movd + pshufd $0).
asm("movd %1, %%xmm1\n"
67
"pshufd $0x0, %%xmm1, %%xmm0\n"
70
// Main loop (local label 1): one aligned 16-byte store per iteration.
"1: movdqa %%xmm0, (%%eax)\n"
79
: "%eax", "%edx", "%xmm0", "%xmm1"
82
// Scalar tail: fill whatever words are left, one at a time.
while (target < end) {
88
// Blend one 8-bit channel, non-premultiplied source:
//   t = (s*a + t*ra) / 255   (qt_div_255 approximates the division by 255).
// Fixes over the original: every argument is parenthesized so expression
// arguments (e.g. "x + 1") keep their intended precedence, and the body is a
// do { } while (0) so the macro acts as a single statement and is safe in an
// unbraced if/else.  's', 'a' and 'ra' are evaluated once each; 't' twice --
// avoid side-effecting arguments.
#define qt_alpha_pixel(s, t, a, ra) do { int tmp = (s)*(a) + (t)*(ra); (t) = qt_div_255(tmp); } while (0)
89
// Blend one 8-bit channel, premultiplied source ('s' already carries alpha):
//   t = (s + t*ra) / 255   (qt_div_255 approximates the division by 255).
// Fixes over the original: arguments are parenthesized against precedence
// surprises, and do { } while (0) makes the macro a single statement (safe in
// unbraced if/else).  's' and 'ra' are evaluated once; 't' twice -- avoid
// side-effecting arguments.
#define qt_alpha_pixel_pm(s, t, ra) do { int tmp = (s) + (t)*(ra); (t) = qt_div_255(tmp); } while (0)
91
// Fill a span with 'color', alpha-blended over the existing target pixels.
// The source is premultiplied by the effective alpha (color.a scaled by the
// span's coverage); spans longer than one pixel go through an SSE2 loop, and
// the scalar qt_alpha_pixel_pm macros handle the remainder.  NOTE(review):
// elided extraction -- interior lines are missing.
static void blend_color_sse(ARGB *target, const QSpan *span, ARGB color)
96
int alpha = qt_div_255(color.a * span->coverage);
97
// Premultiply each source channel by the effective alpha.
int pr = alpha * color.r;
98
int pg = alpha * color.g;
99
int pb = alpha * color.b;
101
int rev_alpha = 255 - alpha;
104
// Packed 16-bit premultiplied source, channel order (b, g, r, 0)...
= { (ushort)pb, (ushort)pg, (ushort)pr, 0 };
106
// ...and a mask used to force the result's alpha channel fully opaque.
= { 0, 0, 0, 0xffff };
107
if (span->len > 1 ) {
110
xmm0: premultiplied src
116
asm("pxor %%xmm7, %%xmm7\n" // clear xmm7
117
"movlps %2, %%xmm0\n" // src to xmm0
118
"movlhps %%xmm0, %%xmm0\n"
119
// NOTE(review): this loads the alpha mask (%5), not the src -- the
// copied "src to xmm0" comment looks stale.
"movlps %5, %%xmm6\n" // src to xmm0
120
"movlhps %%xmm6, %%xmm6\n"
121
"movd %3, %%xmm1\n" // rev_alpha to xmm1
122
// #### should work without the line below
123
"punpcklbw %%xmm7, %%xmm1\n"
124
"pshuflw $0, %%xmm1, %%xmm1\n"
125
"movlhps %%xmm1, %%xmm1\n" // spread rev_alpha over all channels
127
// Per-iteration body: blend pixels at (%1) as  target*rev_alpha + src,
// then renormalize (>> 8) and repack to 8 bits per channel.
"prefetchnta 128(%1)\n"
128
"movlps (%1), %%xmm2\n" // target to xmm2
129
"punpcklbw %%xmm7, %%xmm2\n" // to xmm1
130
"pmullw %%xmm1, %%xmm2\n" // target * ralpha
131
"paddw %%xmm0, %%xmm2\n" // sum to xmm1
132
"por %%xmm6, %%xmm2\n" // make sure alpha is set to 0xff
133
"psrlw $8, %%xmm2\n" // shift right
134
"packuswb %%xmm2, %%xmm2\n" // pack to 8 bits
135
"movlps %%xmm2, (%1)\n"
146
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
150
// Scalar tail: blend the remaining pixel(s) channel by channel.
qt_alpha_pixel_pm(pr, target->r, rev_alpha);
151
qt_alpha_pixel_pm(pg, target->g, rev_alpha);
152
qt_alpha_pixel_pm(pb, target->b, rev_alpha);
157
// CMOV_PIX(pixel, out, mask, image_bits, offset):
// branch-free texel fetch -- ANDs %1 (presumably 'out', the caller's bounds
// bitmask) with the stringized 'mask' and, only when the result is non-zero,
// CMOVNZ-loads the 32-bit texel at image_bits + offset*4 into EDX; otherwise
// EDX keeps its previous value.  NOTE(review): the macro continues past this
// elided chunk -- the store into 'pixel' and the asm operand list are not
// visible here, so the operand numbering above is unconfirmed.
#define CMOV_PIX(pixel, out, mask, image_bits, offset) \
158
asm ("mov %1, %%edx\n" \
159
"and $"#mask",%%edx\n" \
160
"cmovnz (%2,%3,0x4), %%edx\n" \
169
// Sample a transformed source image along one span with bilinear filtering
// and alpha-blend the result (scaled by the span's coverage) onto 'target'.
// Source coordinates step in 16.16 fixed point (fdx/fdy per pixel).
// NOTE(review): elided extraction -- interior lines are missing; comments
// describe only what is visible.
static void blend_transformed_bilinear_sse(ARGB *target, const QSpan *span,
170
qreal ix, qreal iy, qreal dx, qreal dy,
171
ARGB *image_bits, int image_width, int image_height)
173
// 16.16 fixed-point scale for the source-coordinate walk.
const int fixed_scale = 1 << 16;
174
int x = int((ix + dx * span->x) * fixed_scale);
175
int y = int((iy + dy * span->x) * fixed_scale);
177
int fdx = (int)(dx * fixed_scale);
178
int fdy = (int)(dy * fixed_scale);
181
set up constant xmm registers:
184
xmm5 : alpha mask on xmm5
188
// Mask that forces the blended pixel's alpha channel fully opaque.
const ushort mask[4] = { 0, 0, 0, 0xffff };
189
uint coverage = span->coverage;
190
// Loop-invariant register setup: xmm7 = zero (for unpacking), xmm6 =
// coverage broadcast to all channels, xmm5 = alpha mask, xmm4 = 0x00ff words.
asm("pxor %%xmm7, %%xmm7\n" // clear xmm7
191
"movd %0, %%xmm6\n" // coverage to xmm6
192
"punpcklbw %%xmm7, %%xmm6\n"
193
"pshuflw $0, %%xmm6, %%xmm6\n" // spread over all channels
194
"movlps %1, %%xmm5\n" // mask to xmm5
195
"movlhps %%xmm5, %%xmm5\n"
197
"pcmpeqb %%xmm4, %%xmm4\n"
198
"psrlw $8, %%xmm4\n" // 0x255 in xmm4
202
: "%xmm5", "%xmm6", "%xmm7"
205
// Per-pixel loop over the span.
for (int i = 0; i < span->len; ++i) {
206
// Integer texel coordinates (truncate the 16.16 fixed-point position).
const int x1 = (x >> 16);
207
const int y1 = (y >> 16);
209
// Fractional parts scaled to 0..255 for the bilinear weights.
const int distx = ((x - (x1 << 16)) >> 8);
210
const int disty = ((y - (y1 << 16)) >> 8);
211
const int idistx = 256 - distx;
212
const int idisty = 256 - disty;
214
const long y1_offset = y1 * image_width;
215
const long y2_offset = y1_offset + image_width;
227
const int x2 = x1 + 1;
228
const int y2 = y1 + 1;
235
// Bounds bitmask: bit0 = x1 in range, bit1 = x2, bit2 = y1, bit3 = y2.
// The bitwise '&' of comparison results is intentional (each is 0 or 1).
register const uint out = (x1 >= 0 & x1 < image_width)
236
| ((x2 >= 0 & x2 < image_width) << 1)
237
| ((y1 >= 0 & y1 < image_height) << 2)
238
| ((y2 >= 0 & y2 < image_height) << 3);
239
// Branch-free fetch of the four neighboring texels; each is loaded only
// when its bounds bits (the hex mask) are set.  NOTE(review): the trailing
// comments look stale -- e.g. 0x5 is bits 0|2 (X1,Y1), not X1Out|Y2Out.
CMOV_PIX(left.tl, out, 0x5, image_bits, y1_offset + x1); // X1Out|Y2Out
240
CMOV_PIX(left.bl, out, 0x8, image_bits, y2_offset + x1); // X1Out|Y2Out
241
CMOV_PIX(right.tr, out, 0x6, image_bits, y1_offset + x2); // X2Out|Y1Out
242
CMOV_PIX(right.br, out, 0xa, image_bits, y2_offset + x2); // X2Out|Y2Out
254
// Bilinear interpolation: weight the left/right texel pairs by
// distx/idistx, sum, then weight rows by disty/idisty and sum.
asm("movlps %0, %%xmm0\n" // left to xmm0
255
"punpcklbw %%xmm7, %%xmm0\n"
257
"movlps %1, %%xmm1\n"
258
"punpcklbw %%xmm7, %%xmm1\n"
261
"pshuflw $0, %%xmm2, %%xmm2\n"
262
"movlhps %%xmm2, %%xmm2\n" // spread distx
263
"pmullw %%xmm2, %%xmm1\n"
266
"pshuflw $0, %%xmm2, %%xmm2\n"
267
"movlhps %%xmm2, %%xmm2\n" // spread distx
268
"pmullw %%xmm2, %%xmm0\n"
270
"paddw %%xmm1, %%xmm0\n" // now contains xtop and xbottom
274
"pshuflw $0, %%xmm2, %%xmm2\n"
276
"pshuflw $0, %%xmm3, %%xmm3\n"
277
"movlhps %%xmm2, %%xmm3\n" // disty and idisty in mm2
279
"pmullw %%xmm3, %%xmm0\n"
281
"movhlps %%xmm0, %%xmm1\n"
282
"paddw %%xmm1, %%xmm0\n"
283
"psrlw $8, %%xmm0\n" // src is now in xmm0, ready for blend
285
// blend operation follows
292
// Blend: alpha = src.a * coverage; result = src*alpha + target*(0xff-alpha),
// renormalized (>> 8), alpha forced opaque, repacked to 8-bit channels.
"movd (%6), %%xmm1\n" // target to mm1
293
"punpcklbw %%xmm7, %%xmm1\n"
294
"pshuflw $255, %%xmm0, %%xmm2\n" // spread alpha over all channels
295
"pmullw %%xmm6, %%xmm2\n" // alpha *= coverage
296
"psrlw $8, %%xmm2\n" // shift right
297
"pmullw %%xmm2, %%xmm0\n" // src *= alpha
298
"movdqa %%xmm4, %%xmm3\n"
299
"psubw %%xmm2, %%xmm3\n" // 0x255 - alpha in xmm3
300
"pmullw %%xmm3, %%xmm1\n" // target *= ralpha
301
"paddw %%xmm1, %%xmm0\n" // sum to xmm1
302
"por %%xmm5, %%xmm0\n" // make sure alpha is set to 0xff
303
"psrlw $8, %%xmm0\n" // shift right
304
"packuswb %%xmm0, %%xmm0\n" // pack to 8 bits
305
"movd %%xmm0, (%6)\n"
314
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
316
// qDebug("target = %x", *((uint *)target));
323
#elif defined Q_CC_MSVC
326
#endif // Q_CC_GCC and Q_CC_MSVC
328
// Install the SSE-accelerated drawing routines into the DrawHelper function
// table when the runtime CPU reports SSE2 support (GCC/i386 builds only).
// NOTE(review): elided extraction -- interior lines are missing.
void qInitDrawhelperAsm()
330
// Cached across calls; 0 presumably means "not yet detected" -- the guard
// that skips re-detection is not visible in this chunk.
static uint features = 0;
334
#if defined (Q_CC_GNU) && defined (__i386__)
335
features = detectCPUFeatures();
337
if (features & SSE2) {
338
dh[DrawHelper_RGB32]->blendColor = blend_color_sse;
339
dh[DrawHelper_RGB32]->blendTransformedBilinear = blend_transformed_bilinear_sse;