113
67
block_full_4(task, tri, x + ix, y + iy);
118
* Pass the 4x4 pixel block to the shader function.
119
* Determination of which of the 16 pixels lies inside the triangle
120
* will be done as part of the fragment shader.
123
do_block_4(struct lp_rasterizer_task *task,
124
const struct lp_rast_triangle *tri,
126
int c1, int c2, int c3)
131
lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3);
136
* Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out
137
* of the triangle's bounds.
140
do_block_16(struct lp_rasterizer_task *task,
141
const struct lp_rast_triangle *tri,
143
int c0, int c1, int c2)
70
#if !defined(PIPE_ARCH_SSE)
71
static INLINE unsigned
72
build_mask(int c, int dcdx, int dcdy)
81
mask |= ((c0 + 0 * dcdy) >> 31) & (1 << 0);
82
mask |= ((c0 + 1 * dcdy) >> 31) & (1 << 2);
83
mask |= ((c0 + 2 * dcdy) >> 31) & (1 << 8);
84
mask |= ((c0 + 3 * dcdy) >> 31) & (1 << 10);
85
mask |= ((c1 + 0 * dcdy) >> 31) & (1 << 1);
86
mask |= ((c1 + 1 * dcdy) >> 31) & (1 << 3);
87
mask |= ((c1 + 2 * dcdy) >> 31) & (1 << 9);
88
mask |= ((c1 + 3 * dcdy) >> 31) & (1 << 11);
89
mask |= ((c2 + 0 * dcdy) >> 31) & (1 << 4);
90
mask |= ((c2 + 1 * dcdy) >> 31) & (1 << 6);
91
mask |= ((c2 + 2 * dcdy) >> 31) & (1 << 12);
92
mask |= ((c2 + 3 * dcdy) >> 31) & (1 << 14);
93
mask |= ((c3 + 0 * dcdy) >> 31) & (1 << 5);
94
mask |= ((c3 + 1 * dcdy) >> 31) & (1 << 7);
95
mask |= ((c3 + 2 * dcdy) >> 31) & (1 << 13);
96
mask |= ((c3 + 3 * dcdy) >> 31) & (1 << 15);
102
static INLINE unsigned
103
build_mask_linear(int c, int dcdx, int dcdy)
112
mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
113
mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
114
mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
115
mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
116
mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
117
mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
118
mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
119
mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
120
mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
121
mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
122
mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
123
mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
124
mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
125
mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
126
mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
127
mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
141
*outmask |= build_mask_linear(c, dcdx, dcdy);
142
*partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
146
#include <emmintrin.h>
147
#include "util/u_sse.h"
158
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
159
__m128i xdcdy = _mm_set1_epi32(dcdy);
161
/* Get values across the quad
163
__m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
164
__m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
165
__m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
168
__m128i cstep01, cstep23, result;
170
cstep01 = _mm_packs_epi32(cstep0, cstep1);
171
cstep23 = _mm_packs_epi32(cstep2, cstep3);
172
result = _mm_packs_epi16(cstep01, cstep23);
174
*outmask |= _mm_movemask_epi8(result);
179
__m128i cio4 = _mm_set1_epi32(cdiff);
180
__m128i cstep01, cstep23, result;
182
cstep0 = _mm_add_epi32(cstep0, cio4);
183
cstep1 = _mm_add_epi32(cstep1, cio4);
184
cstep2 = _mm_add_epi32(cstep2, cio4);
185
cstep3 = _mm_add_epi32(cstep3, cio4);
187
cstep01 = _mm_packs_epi32(cstep0, cstep1);
188
cstep23 = _mm_packs_epi32(cstep2, cstep3);
189
result = _mm_packs_epi16(cstep01, cstep23);
191
*partmask |= _mm_movemask_epi8(result);
196
static INLINE unsigned
197
build_mask_linear(int c, int dcdx, int dcdy)
199
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
200
__m128i xdcdy = _mm_set1_epi32(dcdy);
202
/* Get values across the quad
204
__m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
205
__m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
206
__m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
208
/* pack pairs of results into epi16
210
__m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
211
__m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
213
/* pack into epi8, preserving sign bits
215
__m128i result = _mm_packs_epi16(cstep01, cstep23);
217
/* extract sign bits to create mask
219
return _mm_movemask_epi8(result);
222
static INLINE unsigned
223
build_mask(int c, int dcdx, int dcdy)
225
__m128i step = _mm_setr_epi32(0, dcdx, dcdy, dcdx + dcdy);
226
__m128i c0 = _mm_set1_epi32(c);
228
/* Get values across the quad
230
__m128i cstep0 = _mm_add_epi32(c0, step);
232
/* Scale up step for moving between quads.
234
__m128i step4 = _mm_add_epi32(step, step);
236
/* Get values for the remaining quads:
238
__m128i cstep1 = _mm_add_epi32(cstep0,
239
_mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
240
__m128i cstep2 = _mm_add_epi32(cstep0,
241
_mm_shuffle_epi32(step4, _MM_SHUFFLE(2,2,2,2)));
242
__m128i cstep3 = _mm_add_epi32(cstep2,
243
_mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
245
/* pack pairs of results into epi16
247
__m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
248
__m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
250
/* pack into epi8, preserving sign bits
252
__m128i result = _mm_packs_epi16(cstep01, cstep23);
254
/* extract sign bits to create mask
256
return _mm_movemask_epi8(result);
266
#include "lp_rast_tri_tmp.h"
270
#include "lp_rast_tri_tmp.h"
274
#include "lp_rast_tri_tmp.h"
278
#include "lp_rast_tri_tmp.h"
282
#include "lp_rast_tri_tmp.h"
286
#include "lp_rast_tri_tmp.h"
290
#include "lp_rast_tri_tmp.h"
294
#include "lp_rast_tri_tmp.h"
297
/* Special case for 3 plane triangle which is contained entirely
298
* within a 16x16 block.
301
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
302
const union lp_rast_cmd_arg arg)
304
const struct lp_rast_triangle *tri = arg.triangle.tri;
305
const struct lp_rast_plane *plane = tri->plane;
306
unsigned mask = arg.triangle.plane_mask;
307
const int x = task->x + (mask & 0xf) * 16;
308
const int y = task->y + (mask >> 4) * 16;
309
unsigned outmask, inmask, partmask, partial_mask;
155
eo[0] = tri->eo1 * 4;
156
eo[1] = tri->eo2 * 4;
157
eo[2] = tri->eo3 * 4;
163
for (j = 0; j < 3; j++) {
164
const int *step = tri->inputs.step[j];
165
const int cx = c[j] + eo[j];
167
/* Mask has bits set whenever we are outside any of the edges.
169
for (i = 0; i < 16; i++) {
170
int out = cx + step[i] * 4;
171
mask |= (out >> 31) & (1 << i);
175
mask = ~mask & 0xffff;
177
int i = ffs(mask) - 1;
178
int px = x + pos_table4[i][0];
179
int py = y + pos_table4[i][1];
180
int cx1 = c0 + tri->inputs.step[0][i] * 4;
181
int cx2 = c1 + tri->inputs.step[1][i] * 4;
182
int cx3 = c2 + tri->inputs.step[2][i] * 4;
186
/* Don't bother testing if the 4x4 block is entirely in/out of
187
* the triangle. It's a little faster to do it in the jit code.
189
LP_COUNT(nr_non_empty_4);
190
do_block_4(task, tri, px, py, cx1, cx2, cx3);
196
* Scan the tile in chunks and figure out which pixels to rasterize
200
lp_rast_triangle(struct lp_rasterizer_task *task,
201
const union lp_rast_cmd_arg arg)
203
const struct lp_rast_triangle *tri = arg.triangle;
204
const int x = task->x, y = task->y;
205
int ei[3], eo[3], c[3];
206
unsigned outmask, inmask, partial_mask;
209
c[0] = tri->c1 + tri->dx12 * y - tri->dy12 * x;
210
c[1] = tri->c2 + tri->dx23 * y - tri->dy23 * x;
211
c[2] = tri->c3 + tri->dx31 * y - tri->dy31 * x;
213
eo[0] = tri->eo1 * 16;
214
eo[1] = tri->eo2 * 16;
215
eo[2] = tri->eo3 * 16;
217
ei[0] = tri->ei1 * 16;
218
ei[1] = tri->ei2 * 16;
219
ei[2] = tri->ei3 * 16;
224
for (j = 0; j < 3; j++) {
225
const int *step = tri->inputs.step[j];
226
const int cox = c[j] + eo[j];
227
const int cio = ei[j]- eo[j];
229
/* Outmask has bits set whenever we are outside any of the
232
/* Inmask has bits set whenever we are inside all of the edges.
234
for (i = 0; i < 16; i++) {
235
int out = cox + step[i] * 16;
237
outmask |= (out >> 31) & (1 << i);
238
inmask &= ~((in >> 31) & (1 << i));
242
assert((outmask & inmask) == 0);
313
outmask = 0; /* outside one or more trivial reject planes */
314
partmask = 0; /* outside one or more trivial accept planes */
316
for (j = 0; j < 3; j++) {
317
c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
320
const int dcdx = -plane[j].dcdx * 4;
321
const int dcdy = plane[j].dcdy * 4;
322
const int cox = plane[j].eo * 4;
323
const int cio = plane[j].ei * 4 - 1;
325
build_masks(c[j] + cox,
328
&outmask, /* sign bits from c[i][0..15] + cox */
329
&partmask); /* sign bits from c[i][0..15] + cio */
244
333
if (outmask == 0xffff)
247
/* Invert mask, so that bits are set whenever we are at least
248
* partially inside all of the edges:
250
partial_mask = ~inmask & ~outmask & 0xffff;
336
/* Mask of sub-blocks which are inside all trivial accept planes:
338
inmask = ~partmask & 0xffff;
340
/* Mask of sub-blocks which are inside all trivial reject planes,
341
* but outside at least one trivial accept plane:
343
partial_mask = partmask & ~outmask;
345
assert((partial_mask & inmask) == 0);
252
347
/* Iterate over partials:
254
349
while (partial_mask) {
255
350
int i = ffs(partial_mask) - 1;
256
int px = x + pos_table16[i][0];
257
int py = y + pos_table16[i][1];
258
int cx1 = c[0] + tri->inputs.step[0][i] * 16;
259
int cx2 = c[1] + tri->inputs.step[1][i] * 16;
260
int cx3 = c[2] + tri->inputs.step[2][i] * 16;
351
int ix = (i & 3) * 4;
352
int iy = (i >> 2) * 4;
262
357
partial_mask &= ~(1 << i);
264
LP_COUNT(nr_partially_covered_16);
265
do_block_16(task, tri, px, py, cx1, cx2, cx3);
359
for (j = 0; j < 3; j++)
362
+ plane[j].dcdy * iy);
364
do_block_4_3(task, tri, plane, px, py, cx);
268
367
/* Iterate over fulls:
271
370
int i = ffs(inmask) - 1;
272
int px = x + pos_table16[i][0];
273
int py = y + pos_table16[i][1];
371
int ix = (i & 3) * 4;
372
int iy = (i >> 2) * 4;
275
376
inmask &= ~(1 << i);
277
LP_COUNT(nr_fully_covered_16);
278
block_full_16(task, tri, px, py);
378
block_full_4(task, tri, px, py);