244
273
name = format.short_name()
246
275
print 'static void'
247
print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
276
print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)' % (name, src_suffix, src_native_type)
249
278
if format.layout == PLAIN \
250
279
and format.colorspace == 'rgb' \
251
280
and format.block_size() <= 32 \
252
281
and format.is_pot() \
253
282
and not format.is_mixed() \
254
and format.channels[0].type == UNSIGNED:
255
emit_unrolled_write_code(format, src_channel)
283
and (format.channels[0].type == UNSIGNED \
284
or format.channels[1].type == UNSIGNED):
285
emit_unrolled_unswizzle_code(format, src_channel)
257
emit_tile_pixel_write_code(format, src_channel)
287
emit_tile_pixel_unswizzle_code(format, src_channel)
262
def generate_read(formats, dst_channel, dst_native_type, dst_suffix):
292
def generate_ssse3():
294
#if defined(PIPE_ARCH_SSE)
296
#include "util/u_sse.h"
299
lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
300
const uint8_t *src, unsigned src_stride,
301
unsigned x0, unsigned y0)
305
__m128i *pdst = (__m128i*) dst;
306
const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
307
unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
308
unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
310
const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
311
const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
312
const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
313
const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
315
const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
316
const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
317
const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
318
const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
320
const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
321
const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
322
const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
323
const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
325
const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
326
const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
327
const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
328
const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
330
for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
331
__m128i line0 = *(__m128i*)ysrc0;
332
const uint8_t *ysrc = ysrc0 + src_stride;
333
ysrc0 += tile_stridey;
335
for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
336
__m128i r, g, b, a, line1;
337
line1 = *(__m128i*)ysrc;
338
PIPE_READ_WRITE_BARRIER();
340
r = _mm_shuffle_epi8(line0, shuffle00);
341
g = _mm_shuffle_epi8(line0, shuffle01);
342
b = _mm_shuffle_epi8(line0, shuffle02);
343
a = _mm_shuffle_epi8(line0, shuffle03);
345
line0 = *(__m128i*)ysrc;
346
PIPE_READ_WRITE_BARRIER();
348
r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
349
g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
350
b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
351
a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
353
line1 = *(__m128i*)ysrc;
354
PIPE_READ_WRITE_BARRIER();
355
ysrc -= tile_stridex;
356
r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
357
g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
358
b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
359
a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
361
if (x + 1 < TILE_SIZE) {
362
line0 = *(__m128i*)ysrc;
366
PIPE_READ_WRITE_BARRIER();
367
r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
368
g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
369
b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
370
a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
382
lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
383
uint8_t *dst, unsigned dst_stride,
384
unsigned x0, unsigned y0)
387
const __m128i *psrc = (__m128i*) src;
388
const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
389
uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
390
__m128i c0 = *psrc++;
393
const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
394
const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
395
const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
396
const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
398
const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
399
const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
400
const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
401
const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
403
const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
404
const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
405
const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
406
const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
408
const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
409
const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
410
const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
411
const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
413
for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
414
__m128i *tile = (__m128i*) pdst;
415
pdst += dst_stride * TILE_VECTOR_HEIGHT;
416
for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
417
uint8_t *linep = (uint8_t*) (tile++);
418
__m128i line0, line1, line2, line3;
420
c1 = *psrc++; /* r */
421
PIPE_READ_WRITE_BARRIER();
422
line0 = _mm_shuffle_epi8(c0, shuffle00);
423
line1 = _mm_shuffle_epi8(c0, shuffle01);
424
line2 = _mm_shuffle_epi8(c0, shuffle02);
425
line3 = _mm_shuffle_epi8(c0, shuffle03);
427
c0 = *psrc++; /* g */
428
PIPE_READ_WRITE_BARRIER();
429
line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
430
line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
431
line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
432
line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
434
c1 = *psrc++; /* b */
435
PIPE_READ_WRITE_BARRIER();
436
line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
437
line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
438
line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
439
line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
442
c0 = *psrc++; /* a */
443
PIPE_READ_WRITE_BARRIER();
444
line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
445
line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
446
line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
447
line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
449
*(__m128i*) (linep) = line0;
450
*(__m128i*) (((char*)linep) + dst_stride) = line1;
451
*(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
452
*(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
457
#endif /* PIPE_ARCH_SSSE3 */
461
def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
263
462
'''Generate the dispatch function to read pixels from any format'''
265
464
for format in formats: