1
/**************************************************************************
3
* Copyright 2009-2010 VMware, Inc.
6
* Permission is hereby granted, free of charge, to any person obtaining a
7
* copy of this software and associated documentation files (the
8
* "Software"), to deal in the Software without restriction, including
9
* without limitation the rights to use, copy, modify, merge, publish,
10
* distribute, sub license, and/or sell copies of the Software, and to
11
* permit persons to whom the Software is furnished to do so, subject to
12
* the following conditions:
14
* The above copyright notice and this permission notice (including the
15
* next paragraph) shall be included in all copies or substantial portions
18
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26
**************************************************************************/
30
* Depth/stencil testing to LLVM IR translation.
32
* To be done accurately/efficiently the depth/stencil test must be done with
33
* the same type/format of the depth/stencil buffer, which implies massaging
34
* the incoming depths to fit into place. Using a more straightforward
35
* type/format for depth/stencil values internally and only converting when
36
* flushing would avoid this, but it would most likely result in depth fighting
39
* Since we're using linear layout for everything, but we need to deal with
40
* 2x2 quads, we need to load/store multiple values and swizzle them into
41
* place (we could avoid this by doing depth/stencil testing in linear format,
42
* which would be easy for late depth/stencil test as we could do that after
43
* the fragment shader loop just as we do for color buffers, but more tricky
44
* for early depth test as we'd need both masks and interpolated depth in
48
* @author Jose Fonseca <jfonseca@vmware.com>
49
* @author Brian Paul <brianp@vmware.com>
52
#include "pipe/p_state.h"
53
#include "util/format/u_format.h"
54
#include "util/u_cpu_detect.h"
56
#include "gallivm/lp_bld_type.h"
57
#include "gallivm/lp_bld_arit.h"
58
#include "gallivm/lp_bld_bitarit.h"
59
#include "gallivm/lp_bld_const.h"
60
#include "gallivm/lp_bld_conv.h"
61
#include "gallivm/lp_bld_logic.h"
62
#include "gallivm/lp_bld_flow.h"
63
#include "gallivm/lp_bld_intr.h"
64
#include "gallivm/lp_bld_debug.h"
65
#include "gallivm/lp_bld_swizzle.h"
66
#include "gallivm/lp_bld_pack.h"
68
#include "lp_bld_depth.h"
69
#include "lp_state_fs.h"
72
/** Used to select fields from pipe_stencil_state */
82
* Do the stencil test comparison (compare FB stencil values against ref value).
83
* This will be used twice when generating two-sided stencil code.
84
* \param stencil the front/back stencil state
85
* \param stencilRef the stencil reference value, replicated as a vector
86
* \param stencilVals vector of stencil values from framebuffer
87
* \return vector mask of pass/fail values (~0 or 0)
90
lp_build_stencil_test_single(struct lp_build_context *bld,
91
const struct pipe_stencil_state *stencil,
92
LLVMValueRef stencilRef,
93
LLVMValueRef stencilVals)
95
LLVMBuilderRef builder = bld->gallivm->builder;
96
const unsigned stencilMax = 255; /* XXX fix */
97
struct lp_type type = bld->type;
101
* SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
102
* are between 0..255 so ensure we generate the fastest comparisons for
105
if (type.width <= 8) {
111
assert(stencil->enabled);
113
if (stencil->valuemask != stencilMax) {
114
/* compute stencilRef = stencilRef & valuemask */
115
LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
116
stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
117
/* compute stencilVals = stencilVals & valuemask */
118
stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
121
res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
128
* Do the one or two-sided stencil test comparison.
129
* \sa lp_build_stencil_test_single
130
* \param front_facing an integer vector mask, indicating front (~0) or back
131
* (0) facing polygon. If NULL, assume front-facing.
134
lp_build_stencil_test(struct lp_build_context *bld,
135
const struct pipe_stencil_state stencil[2],
136
LLVMValueRef stencilRefs[2],
137
LLVMValueRef stencilVals,
138
LLVMValueRef front_facing)
142
assert(stencil[0].enabled);
144
/* do front face test */
145
res = lp_build_stencil_test_single(bld, &stencil[0],
146
stencilRefs[0], stencilVals);
148
if (stencil[1].enabled && front_facing != NULL) {
149
/* do back face test */
150
LLVMValueRef back_res;
152
back_res = lp_build_stencil_test_single(bld, &stencil[1],
153
stencilRefs[1], stencilVals);
155
res = lp_build_select(bld, front_facing, res, back_res);
163
* Apply the stencil operator (add/sub/keep/etc) to the given vector
165
* \return new stencil values vector
168
lp_build_stencil_op_single(struct lp_build_context *bld,
169
const struct pipe_stencil_state *stencil,
171
LLVMValueRef stencilRef,
172
LLVMValueRef stencilVals)
175
LLVMBuilderRef builder = bld->gallivm->builder;
176
struct lp_type type = bld->type;
178
LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
185
stencil_op = stencil->fail_op;
188
stencil_op = stencil->zfail_op;
191
stencil_op = stencil->zpass_op;
194
assert(0 && "Invalid stencil_op mode");
195
stencil_op = PIPE_STENCIL_OP_KEEP;
198
switch (stencil_op) {
199
case PIPE_STENCIL_OP_KEEP:
201
/* we can return early for this case */
203
case PIPE_STENCIL_OP_ZERO:
206
case PIPE_STENCIL_OP_REPLACE:
209
case PIPE_STENCIL_OP_INCR:
210
res = lp_build_add(bld, stencilVals, bld->one);
211
res = lp_build_min(bld, res, max);
213
case PIPE_STENCIL_OP_DECR:
214
res = lp_build_sub(bld, stencilVals, bld->one);
215
res = lp_build_max(bld, res, bld->zero);
217
case PIPE_STENCIL_OP_INCR_WRAP:
218
res = lp_build_add(bld, stencilVals, bld->one);
219
res = LLVMBuildAnd(builder, res, max, "");
221
case PIPE_STENCIL_OP_DECR_WRAP:
222
res = lp_build_sub(bld, stencilVals, bld->one);
223
res = LLVMBuildAnd(builder, res, max, "");
225
case PIPE_STENCIL_OP_INVERT:
226
res = LLVMBuildNot(builder, stencilVals, "");
227
res = LLVMBuildAnd(builder, res, max, "");
230
assert(0 && "bad stencil op mode");
239
* Do the one or two-sided stencil test op/update.
242
lp_build_stencil_op(struct lp_build_context *bld,
243
const struct pipe_stencil_state stencil[2],
245
LLVMValueRef stencilRefs[2],
246
LLVMValueRef stencilVals,
248
LLVMValueRef front_facing)
251
LLVMBuilderRef builder = bld->gallivm->builder;
254
assert(stencil[0].enabled);
256
/* do front face op */
257
res = lp_build_stencil_op_single(bld, &stencil[0], op,
258
stencilRefs[0], stencilVals);
260
if (stencil[1].enabled && front_facing != NULL) {
261
/* do back face op */
262
LLVMValueRef back_res;
264
back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
265
stencilRefs[1], stencilVals);
267
res = lp_build_select(bld, front_facing, res, back_res);
270
if (stencil[0].writemask != 0xff ||
271
(stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
272
/* mask &= stencil[0].writemask */
273
LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
274
stencil[0].writemask);
275
if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
276
LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
277
stencil[1].writemask);
278
writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
281
mask = LLVMBuildAnd(builder, mask, writemask, "");
282
/* res = (res & mask) | (stencilVals & ~mask) */
283
res = lp_build_select_bitwise(bld, mask, res, stencilVals);
286
/* res = mask ? res : stencilVals */
287
res = lp_build_select(bld, mask, res, stencilVals);
296
* Return a type that matches the depth/stencil format.
299
lp_depth_type(const struct util_format_description *format_desc,
305
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
306
assert(format_desc->block.width == 1);
307
assert(format_desc->block.height == 1);
309
memset(&type, 0, sizeof type);
310
type.width = format_desc->block.bits;
312
z_swizzle = format_desc->swizzle[0];
314
if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
315
type.floating = TRUE;
316
assert(z_swizzle == 0);
317
assert(format_desc->channel[z_swizzle].size == 32);
319
else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
320
assert(format_desc->block.bits <= 32);
321
assert(format_desc->channel[z_swizzle].normalized);
322
if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
323
/* Prefer signed integers when possible, as SSE has less support
324
* for unsigned comparison;
333
type.length = length;
340
* Compute bitmask and bit shift to apply to the incoming fragment Z values
341
* and the Z buffer values needed before doing the Z comparison.
343
* Note that we leave the Z bits in the position that we find them
344
* in the Z buffer (typically 0xffffff00 or 0x00ffffff). That lets us
345
* get by with fewer bit twiddling steps.
348
get_z_shift_and_mask(const struct util_format_description *format_desc,
349
unsigned *shift, unsigned *width, unsigned *mask)
354
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
355
assert(format_desc->block.width == 1);
356
assert(format_desc->block.height == 1);
358
/* 64-bit d/s format is special: 32 bits have already been extracted */
359
total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
361
z_swizzle = format_desc->swizzle[0];
363
if (z_swizzle == PIPE_SWIZZLE_NONE)
366
*width = format_desc->channel[z_swizzle].size;
367
/* & 31 is for the same reason as the 32-bit limit above */
368
*shift = format_desc->channel[z_swizzle].shift & 31;
370
if (*width == total_bits) {
373
*mask = ((1 << *width) - 1) << *shift;
381
* Compute bitmask and bit shift to apply to the framebuffer pixel values
382
* to put the stencil bits in the least significant position.
386
get_s_shift_and_mask(const struct util_format_description *format_desc,
387
unsigned *shift, unsigned *mask)
392
s_swizzle = format_desc->swizzle[1];
394
if (s_swizzle == PIPE_SWIZZLE_NONE)
397
/* just special case 64bit d/s format */
398
if (format_desc->block.bits > 32) {
399
/* XXX big-endian? */
400
assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
406
*shift = format_desc->channel[s_swizzle].shift;
407
sz = format_desc->channel[s_swizzle].size;
408
*mask = (1U << sz) - 1U;
415
* Perform the occlusion test and increase the counter.
416
* Test the depth mask. Add the number of channels which have a non-zero mask
417
* into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
418
* The counter will add 4.
419
* TODO: could get that out of the fs loop.
421
* \param type holds element type of the mask vector.
422
* \param maskvalue is the depth test mask.
423
* \param counter is a pointer to the 64-bit occlusion counter.
426
lp_build_occlusion_count(struct gallivm_state *gallivm,
428
LLVMValueRef maskvalue,
429
LLVMValueRef counter)
431
LLVMBuilderRef builder = gallivm->builder;
432
LLVMContextRef context = gallivm->context;
433
LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
434
LLVMValueRef count, newcount;
436
assert(type.length <= 16);
437
assert(type.floating);
439
if(util_get_cpu_caps()->has_sse && type.length == 4) {
440
const char *movmskintr = "llvm.x86.sse.movmsk.ps";
441
const char *popcntintr = "llvm.ctpop.i32";
442
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
443
lp_build_vec_type(gallivm, type), "");
444
bits = lp_build_intrinsic_unary(builder, movmskintr,
445
LLVMInt32TypeInContext(context), bits);
446
count = lp_build_intrinsic_unary(builder, popcntintr,
447
LLVMInt32TypeInContext(context), bits);
448
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
450
else if(util_get_cpu_caps()->has_avx && type.length == 8) {
451
const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
452
const char *popcntintr = "llvm.ctpop.i32";
453
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
454
lp_build_vec_type(gallivm, type), "");
455
bits = lp_build_intrinsic_unary(builder, movmskintr,
456
LLVMInt32TypeInContext(context), bits);
457
count = lp_build_intrinsic_unary(builder, popcntintr,
458
LLVMInt32TypeInContext(context), bits);
459
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
463
LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
464
LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
465
LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
466
LLVMValueRef shufflev, countd;
467
LLVMValueRef shuffles[16];
468
const char *popcntintr = NULL;
470
countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
472
for (i = 0; i < type.length; i++) {
473
#if UTIL_ARCH_LITTLE_ENDIAN
474
shuffles[i] = lp_build_const_int32(gallivm, 4*i);
476
shuffles[i] = lp_build_const_int32(gallivm, (4*i) + 3);
480
shufflev = LLVMConstVector(shuffles, type.length);
481
countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
482
countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
486
* this is bad on cpus without popcount (on x86 supported by intel
487
* nehalem, amd barcelona, and up - not tied to sse42).
488
* Would be much faster to just sum the 4 elements of the vector with
489
* some horizontal add (shuffle/add/shuffle/add after the initial and).
491
switch (type.length) {
493
popcntintr = "llvm.ctpop.i32";
496
popcntintr = "llvm.ctpop.i64";
499
popcntintr = "llvm.ctpop.i128";
504
count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
506
if (type.length > 8) {
507
count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
509
else if (type.length < 8) {
510
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
513
newcount = LLVMBuildLoad(builder, counter, "origcount");
514
newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
515
LLVMBuildStore(builder, newcount, counter);
520
* Load depth/stencil values.
521
* The stored values are linear, swizzle them.
523
* \param z_src_type the data type of the fragment depth/stencil values
524
* \param format_desc description of the depth/stencil surface
525
* \param is_1d whether this resource has only one dimension
526
* \param loop_counter the current loop iteration
527
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block
528
* \param depth_stride stride of the depth/stencil buffer
529
* \param z_fb contains z values loaded from fb (may include padding)
530
* \param s_fb contains s values loaded from fb (may include padding)
533
lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
534
struct lp_type z_src_type,
535
const struct util_format_description *format_desc,
537
LLVMValueRef depth_ptr,
538
LLVMValueRef depth_stride,
541
LLVMValueRef loop_counter)
543
LLVMBuilderRef builder = gallivm->builder;
544
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
545
LLVMValueRef zs_dst1, zs_dst2;
546
LLVMValueRef zs_dst_ptr;
547
LLVMValueRef depth_offset1, depth_offset2;
548
LLVMTypeRef load_ptr_type;
549
unsigned depth_bytes = format_desc->block.bits / 8;
550
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
551
struct lp_type zs_load_type = zs_type;
553
zs_load_type.length = zs_load_type.length / 2;
554
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
556
if (z_src_type.length == 4) {
558
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
559
lp_build_const_int32(gallivm, 1), "");
560
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
561
lp_build_const_int32(gallivm, 2), "");
562
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
564
depth_offset1 = LLVMBuildMul(builder, looplsb,
565
lp_build_const_int32(gallivm, depth_bytes * 2), "");
566
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
568
/* just concatenate the loaded 2x2 values into 4-wide vector */
569
for (i = 0; i < 4; i++) {
570
shuffles[i] = lp_build_const_int32(gallivm, i);
575
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
576
lp_build_const_int32(gallivm, 1), "");
577
assert(z_src_type.length == 8);
578
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
580
* We load 2x4 values, and need to swizzle them (order
581
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
583
for (i = 0; i < 8; i++) {
584
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
588
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
590
/* Load current z/stencil values from z/stencil buffer */
591
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
592
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
593
zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
595
zs_dst2 = lp_build_undef(gallivm, zs_load_type);
598
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
599
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
600
zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
603
*z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
604
LLVMConstVector(shuffles, zs_type.length), "");
607
if (format_desc->block.bits == 8) {
608
/* Extend stencil-only 8 bit values (S8_UINT) */
609
*s_fb = LLVMBuildZExt(builder, *s_fb,
610
lp_build_int_vec_type(gallivm, z_src_type), "");
613
if (format_desc->block.bits < z_src_type.width) {
614
/* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
615
*z_fb = LLVMBuildZExt(builder, *z_fb,
616
lp_build_int_vec_type(gallivm, z_src_type), "");
619
else if (format_desc->block.bits > 32) {
620
/* rely on llvm to handle too wide vector we have here nicely */
622
struct lp_type typex2 = zs_type;
623
struct lp_type s_type = zs_type;
624
LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
625
LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
628
typex2.width = typex2.width / 2;
629
typex2.length = typex2.length * 2;
630
s_type.width = s_type.width / 2;
633
tmp = LLVMBuildBitCast(builder, *z_fb,
634
lp_build_vec_type(gallivm, typex2), "");
636
for (i = 0; i < zs_type.length; i++) {
637
shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
638
shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
640
*z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
641
LLVMConstVector(shuffles1, zs_type.length), "");
642
*s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
643
LLVMConstVector(shuffles2, zs_type.length), "");
644
*s_fb = LLVMBuildBitCast(builder, *s_fb,
645
lp_build_vec_type(gallivm, s_type), "");
646
lp_build_name(*s_fb, "s_dst");
649
lp_build_name(*z_fb, "z_dst");
650
lp_build_name(*s_fb, "s_dst");
651
lp_build_name(*z_fb, "z_dst");
655
* Store depth/stencil values.
656
* Incoming values are swizzled (typically n 2x2 quads), stored linear.
657
* If there's a mask it will do select/store otherwise just store.
659
* \param z_src_type the data type of the fragment depth/stencil values
660
* \param format_desc description of the depth/stencil surface
661
* \param is_1d whether this resource has only one dimension
662
* \param mask_value the alive/dead pixel mask for the quad (vector)
663
* \param z_fb z values read from fb (with padding)
664
* \param s_fb s values read from fb (with padding)
665
* \param loop_counter the current loop iteration
666
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block
667
* \param depth_stride stride of the depth/stencil buffer
668
* \param z_value the depth values to store (with padding)
669
* \param s_value the stencil values to store (with padding)
672
lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
673
struct lp_type z_src_type,
674
const struct util_format_description *format_desc,
676
LLVMValueRef mask_value,
679
LLVMValueRef loop_counter,
680
LLVMValueRef depth_ptr,
681
LLVMValueRef depth_stride,
682
LLVMValueRef z_value,
683
LLVMValueRef s_value)
685
struct lp_build_context z_bld;
686
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
687
LLVMBuilderRef builder = gallivm->builder;
688
LLVMValueRef zs_dst1, zs_dst2;
689
LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
690
LLVMValueRef depth_offset1, depth_offset2;
691
LLVMTypeRef load_ptr_type;
692
unsigned depth_bytes = format_desc->block.bits / 8;
693
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
694
struct lp_type z_type = zs_type;
695
struct lp_type zs_load_type = zs_type;
697
zs_load_type.length = zs_load_type.length / 2;
698
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
700
z_type.width = z_src_type.width;
702
lp_build_context_init(&z_bld, gallivm, z_type);
705
* This is far from ideal, at least for late depth write we should do this
706
* outside the fs loop to avoid all the swizzle stuff.
708
if (z_src_type.length == 4) {
709
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
710
lp_build_const_int32(gallivm, 1), "");
711
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
712
lp_build_const_int32(gallivm, 2), "");
713
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
715
depth_offset1 = LLVMBuildMul(builder, looplsb,
716
lp_build_const_int32(gallivm, depth_bytes * 2), "");
717
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
721
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
722
lp_build_const_int32(gallivm, 1), "");
723
assert(z_src_type.length == 8);
724
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
726
* We store 2x4 values, and need to swizzle them (order
727
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
729
for (i = 0; i < 8; i++) {
730
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
734
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
736
zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
737
zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
738
zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
739
zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
741
if (format_desc->block.bits > 32) {
742
s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
746
z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
747
if (format_desc->block.bits > 32) {
748
s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
749
s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
753
if (zs_type.width < z_src_type.width) {
754
/* Truncate ZS values (e.g., when writing to Z16_UNORM) */
755
z_value = LLVMBuildTrunc(builder, z_value,
756
lp_build_int_vec_type(gallivm, zs_type), "");
759
if (format_desc->block.bits <= 32) {
760
if (z_src_type.length == 4) {
761
zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
762
zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
765
assert(z_src_type.length == 8);
766
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
767
LLVMConstVector(&shuffles[0],
768
zs_load_type.length), "");
769
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
770
LLVMConstVector(&shuffles[4],
771
zs_load_type.length), "");
775
if (z_src_type.length == 4) {
776
zs_dst1 = lp_build_interleave2(gallivm, z_type,
777
z_value, s_value, 0);
778
zs_dst2 = lp_build_interleave2(gallivm, z_type,
779
z_value, s_value, 1);
783
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
784
assert(z_src_type.length == 8);
785
for (i = 0; i < 8; i++) {
786
shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
787
shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
790
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
791
LLVMConstVector(&shuffles[0],
792
z_src_type.length), "");
793
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
794
LLVMConstVector(&shuffles[8],
795
z_src_type.length), "");
797
zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
798
lp_build_vec_type(gallivm, zs_load_type), "");
799
zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
800
lp_build_vec_type(gallivm, zs_load_type), "");
803
LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
805
LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
810
* Generate code for performing depth and/or stencil tests.
811
* We operate on a vector of values (typically n 2x2 quads).
813
* \param depth the depth test state
814
* \param stencil the front/back stencil state
815
* \param z_src_type the data type of the fragment depth/stencil values
816
* \param format_desc description of the depth/stencil surface
817
* \param mask the alive/dead pixel mask for the quad (vector)
818
* \param cov_mask coverage mask
819
* \param stencil_refs the front/back stencil ref values (scalar)
820
* \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)
821
* \param zs_dst the depth/stencil values in framebuffer
822
* \param face contains boolean value indicating front/back facing polygon
825
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
826
const struct lp_depth_state *depth,
827
const struct pipe_stencil_state stencil[2],
828
struct lp_type z_src_type,
829
const struct util_format_description *format_desc,
830
struct lp_build_mask_context *mask,
831
LLVMValueRef *cov_mask,
832
LLVMValueRef stencil_refs[2],
837
LLVMValueRef *z_value,
838
LLVMValueRef *s_value,
841
LLVMBuilderRef builder = gallivm->builder;
842
struct lp_type z_type;
843
struct lp_build_context z_bld;
844
struct lp_build_context s_bld;
845
struct lp_type s_type;
846
unsigned z_shift = 0, z_width = 0, z_mask = 0;
847
LLVMValueRef z_dst = NULL;
848
LLVMValueRef stencil_vals = NULL;
849
LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
850
LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
851
LLVMValueRef current_mask = mask ? lp_build_mask_value(mask) : *cov_mask;
852
LLVMValueRef front_facing = NULL;
853
boolean have_z, have_s;
856
* Depths are expected to be between 0 and 1, even if they are stored in
857
* floats. Setting these bits here will ensure that the lp_build_conv() call
858
* below won't try to unnecessarily clamp the incoming values.
860
if(z_src_type.floating) {
861
z_src_type.sign = FALSE;
862
z_src_type.norm = TRUE;
865
assert(!z_src_type.sign);
866
assert(z_src_type.norm);
869
/* Pick the type matching the depth-stencil format. */
870
z_type = lp_depth_type(format_desc, z_src_type.length);
872
/* Pick the intermediate type for depth operations. */
873
z_type.width = z_src_type.width;
874
assert(z_type.length == z_src_type.length);
876
/* FIXME: for non-float depth/stencil might generate better code
877
* if we'd always split it up to use 128bit operations.
878
* For stencil we'd almost certainly want to pack to 8xi16 values,
879
* for z just run twice.
882
/* Sanity checking */
884
ASSERTED const unsigned z_swizzle = format_desc->swizzle[0];
885
ASSERTED const unsigned s_swizzle = format_desc->swizzle[1];
887
assert(z_swizzle != PIPE_SWIZZLE_NONE ||
888
s_swizzle != PIPE_SWIZZLE_NONE);
890
assert(depth->enabled || stencil[0].enabled);
892
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
893
assert(format_desc->block.width == 1);
894
assert(format_desc->block.height == 1);
896
if (stencil[0].enabled) {
897
assert(s_swizzle < 4);
898
assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
899
assert(format_desc->channel[s_swizzle].pure_integer);
900
assert(!format_desc->channel[s_swizzle].normalized);
901
assert(format_desc->channel[s_swizzle].size == 8);
904
if (depth->enabled) {
905
assert(z_swizzle < 4);
906
if (z_type.floating) {
907
assert(z_swizzle == 0);
908
assert(format_desc->channel[z_swizzle].type ==
909
UTIL_FORMAT_TYPE_FLOAT);
910
assert(format_desc->channel[z_swizzle].size == 32);
913
assert(format_desc->channel[z_swizzle].type ==
914
UTIL_FORMAT_TYPE_UNSIGNED);
915
assert(format_desc->channel[z_swizzle].normalized);
916
assert(!z_type.fixed);
922
/* Setup build context for Z vals */
923
lp_build_context_init(&z_bld, gallivm, z_type);
925
/* Setup build context for stencil vals */
926
s_type = lp_int_type(z_type);
927
lp_build_context_init(&s_bld, gallivm, s_type);
929
/* Compute and apply the Z/stencil bitmasks and shifts.
932
unsigned s_shift, s_mask;
937
have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
938
have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
941
if (z_mask != 0xffffffff) {
942
z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
946
* Align the framebuffer Z's LSB to the right.
949
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
950
z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
951
} else if (z_bitmask) {
952
z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
954
lp_build_name(z_dst, "z_dst");
960
LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
961
stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
962
stencil_shift = shift; /* used below */
965
if (s_mask != 0xffffffff) {
966
LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
967
stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
970
lp_build_name(stencil_vals, "s_dst");
974
if (stencil[0].enabled) {
979
* XXX: the scalar expansion below produces atrocious code
980
* (basically producing a 64bit scalar value, then moving the 2
981
* 32bit pieces separately to simd, plus 4 shuffles, which is
982
* seriously lame). But the scalar-simd transitions are always
983
* tricky, so no big surprise there.
984
* This here would be way better, however llvm has some serious
985
* trouble later using it in the select, probably because it will
986
* recognize the expression as constant and move the simd value
987
* away (out of the loop) - and then it will suddenly try
988
* constructing i1 high-bit masks out of it later...
989
* (Try piglit stencil-twoside.)
990
* Note this is NOT due to using SExt/Trunc, it fails exactly the
991
* same even when using native compare/select.
992
* I cannot reproduce this problem when using stand-alone compiler
993
* though, suggesting some problem with optimization passes...
994
* (With stand-alone compilation, the construction of this mask
995
* value, no matter if the easy 3 instruction here or the complex
996
* 16+ one below, never gets separated from where it's used.)
997
* The scalar code still has the same problem, but the generated
998
* code looks a bit better at least for some reason, even if
999
* mostly by luck (the fundamental issue clearly is the same).
1001
front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
1002
/* front_facing = face != 0 ? ~0 : 0 */
1003
front_facing = lp_build_compare(gallivm, s_bld.type,
1005
front_facing, s_bld.zero);
1007
LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
1009
/* front_facing = face != 0 ? ~0 : 0 */
1010
front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
1011
front_facing = LLVMBuildSExt(builder, front_facing,
1012
LLVMIntTypeInContext(gallivm->context,
1013
s_bld.type.length*s_bld.type.width),
1015
front_facing = LLVMBuildBitCast(builder, front_facing,
1016
s_bld.int_vec_type, "");
1021
s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
1022
stencil_refs, stencil_vals,
1025
/* apply stencil-fail operator */
1027
LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
1028
stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
1029
stencil_refs, stencil_vals,
1030
s_fail_mask, front_facing);
1034
if (depth->enabled) {
1036
* Convert fragment Z to the desired type, aligning the LSB to the right.
1039
assert(z_type.width == z_src_type.width);
1040
assert(z_type.length == z_src_type.length);
1041
assert(lp_check_value(z_src_type, z_src));
1042
if (z_src_type.floating) {
1044
* Convert from floating point values
1047
if (!z_type.floating) {
1048
z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
1055
* Convert from unsigned normalized values.
1058
assert(!z_src_type.sign);
1059
assert(!z_src_type.fixed);
1060
assert(z_src_type.norm);
1061
assert(!z_type.floating);
1062
if (z_src_type.width > z_width) {
1063
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
1064
z_src_type.width - z_width);
1065
z_src = LLVMBuildLShr(builder, z_src, shift, "");
1068
assert(lp_check_value(z_type, z_src));
1070
lp_build_name(z_src, "z_src");
1072
/* compare src Z to dst Z, returning 'pass' mask */
1073
z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
1075
/* mask off bits that failed stencil test */
1077
current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
1080
if (!stencil[0].enabled && mask) {
1081
/* We can potentially skip all remaining operations here, but only
1082
* if stencil is disabled because we still need to update the stencil
1083
* buffer values. Don't need to update Z buffer values.
1085
lp_build_mask_update(mask, z_pass);
1088
lp_build_mask_check(mask);
1092
if (depth->writemask) {
1093
LLVMValueRef z_pass_mask;
1095
/* mask off bits that failed Z test */
1096
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
1098
/* Mix the old and new Z buffer values.
1099
* z_dst[i] = z_pass_mask[i] ? z_src[i] : z_dst[i]
1101
z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
1104
if (stencil[0].enabled) {
1105
/* update stencil buffer values according to z pass/fail result */
1106
LLVMValueRef z_fail_mask, z_pass_mask;
1108
/* apply Z-fail operator */
1109
z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
1110
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
1111
stencil_refs, stencil_vals,
1112
z_fail_mask, front_facing);
1114
/* apply Z-pass operator */
1115
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
1116
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
1117
stencil_refs, stencil_vals,
1118
z_pass_mask, front_facing);
1122
/* No depth test: apply Z-pass operator to stencil buffer values which
1123
* passed the stencil test.
1125
s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
1126
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
1127
stencil_refs, stencil_vals,
1128
s_pass_mask, front_facing);
1131
/* Put Z and stencil bits in the right place */
1132
if (have_z && z_shift) {
1133
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
1134
z_dst = LLVMBuildShl(builder, z_dst, shift, "");
1136
if (stencil_vals && stencil_shift)
1137
stencil_vals = LLVMBuildShl(builder, stencil_vals,
1140
/* Finally, merge the z/stencil values */
1141
if (format_desc->block.bits <= 32) {
1142
if (have_z && have_s)
1143
*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
1147
*z_value = stencil_vals;
1148
*s_value = *z_value;
1152
*s_value = stencil_vals;
1157
lp_build_mask_update(mask, s_pass_mask);
1159
if (depth->enabled && stencil[0].enabled)
1160
lp_build_mask_update(mask, z_pass);
1162
LLVMValueRef tmp_mask = *cov_mask;
1164
tmp_mask = LLVMBuildAnd(builder, tmp_mask, s_pass_mask, "");
1166
/* for multisample we don't do the stencil optimisation so update always */
1168
tmp_mask = LLVMBuildAnd(builder, tmp_mask, z_pass, "");
1169
*cov_mask = tmp_mask;