/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "compiler/nir/nir_worklist.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_util.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_dynarray.h"

struct ntt_insn {
   enum tgsi_opcode opcode;
   struct ureg_dst dst[2];
   struct ureg_src src[4];
   enum tgsi_texture_type tex_target;
   enum tgsi_return_type tex_return_type;
   struct tgsi_texture_offset tex_offset;

   unsigned mem_qualifier;
   enum pipe_format mem_format;

   bool is_tex : 1;
   bool is_mem : 1;
   bool precise : 1;
};

struct ntt_block {
   /* Array of struct ntt_insn */
   struct util_dynarray insns;
   int start_ip;
   int end_ip;
};

struct ntt_reg_interval {
   unsigned start, end;
};

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   const struct nir_to_tgsi_options *options;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool native_integers;
   bool has_txf_lz;

   bool addr_declared[3];
   struct ureg_dst addr_reg[3];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_src *ssa_temp;

   struct ntt_reg_interval *liveness;

   /* Map from nir_block to ntt_block */
   struct hash_table *blocks;
   struct ntt_block *cur_block;
   unsigned current_if_else;

   /* Whether we're currently emitting instructions for a precise NIR instruction. */
   bool precise;

   unsigned num_temps;
   unsigned first_non_array_temp;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   uint32_t first_ubo;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

static struct ureg_dst
ntt_temp(struct ntt_compile *c)
{
   return ureg_dst_register(TGSI_FILE_TEMPORARY, c->num_temps++);
}

static struct ntt_block *
ntt_block_from_nir(struct ntt_compile *c, struct nir_block *block)
{
   struct hash_entry *entry = _mesa_hash_table_search(c->blocks, block);
   return entry->data;
}

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
static void ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list);

static struct ntt_insn *
ntt_insn(struct ntt_compile *c, enum tgsi_opcode opcode,
         struct ureg_dst dst,
         struct ureg_src src0, struct ureg_src src1,
         struct ureg_src src2, struct ureg_src src3)
{
   struct ntt_insn insn = {
      .opcode = opcode,
      .dst = { dst, ureg_dst_undef() },
      .src = { src0, src1, src2, src3 },
      .precise = c->precise,
   };
   util_dynarray_append(&c->cur_block->insns, struct ntt_insn, insn);
   return util_dynarray_top_ptr(&c->cur_block->insns, struct ntt_insn);
}
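
/* A minimal usage sketch: the typed helpers generated by the macros below all
 * funnel into ntt_insn(), so a call like ntt_MOV(c, dst, src0) appends a MOV
 * with the remaining sources as ureg_src_undef() to c->cur_block's
 * instruction array.
 */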

#define OP00(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c)                     \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP01(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_src src0)                      \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, ureg_dst_undef(), src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP10(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_dst dst)                       \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, dst, ureg_src_undef(), ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP11(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_dst dst,                       \
                            struct ureg_src src0)                      \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, ureg_src_undef(), ureg_src_undef(), ureg_src_undef()); \
}

#define OP12(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_dst dst,                       \
                            struct ureg_src src0,                      \
                            struct ureg_src src1)                      \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, ureg_src_undef(), ureg_src_undef()); \
}

#define OP13(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_dst dst,                       \
                            struct ureg_src src0,                      \
                            struct ureg_src src1,                      \
                            struct ureg_src src2)                      \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, ureg_src_undef()); \
}

#define OP14(op)                                                        \
static inline void ntt_##op(struct ntt_compile *c,                     \
                            struct ureg_dst dst,                       \
                            struct ureg_src src0,                      \
                            struct ureg_src src1,                      \
                            struct ureg_src src2,                      \
                            struct ureg_src src3)                      \
{                                                                       \
   ntt_insn(c, TGSI_OPCODE_##op, dst, src0, src1, src2, src3); \
}

/* We hand-craft our tex instructions */
#define OP12_TEX(op)
#define OP14_TEX(op)

/* Use a template include to generate a correctly-typed ntt_OP()
 * function for each TGSI opcode:
 */
#include "gallium/auxiliary/tgsi/tgsi_opcode_tmp.h"

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats. Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}
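
/* For example, on a non-native-integers driver a constant 16 feeding an
 * intrinsic may have been rewritten to 16.0f (0x41800000); the fui(1.0)
 * threshold catches the float encoding and uif() recovers the original 16.
 */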

static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}
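
/* Each 64-bit channel occupies a pair of 32-bit TGSI channels, so e.g. a
 * NIR writemask of 0x3 (.xy of a 64-bit vec2) becomes the TGSI writemask
 * 0xf (.xyzw), and 0x2 (.y) becomes 0xc (.zw).
 */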

static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}
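
/* 0x3ff00000_00000000 is the IEEE 754 encoding of 1.0 as a double: the low
 * word goes in .x/.z and the high word in .y/.w, giving a vec2 of double
 * 1.0 in the packed 64-bit layout.
 */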

/* Per-channel masks of def/use within the block, and the per-channel
 * livein/liveout for the block as a whole.
 */
struct ntt_live_reg_block_state {
   uint8_t *def, *use, *livein, *liveout, *defin, *defout;
};

struct ntt_live_reg_state {
   unsigned bitset_words;

   struct ntt_reg_interval *regs;

   /* Used in propagate_across_edge() */
   BITSET_WORD *tmp_live;

   struct ntt_live_reg_block_state *blocks;

   nir_block_worklist worklist;
};

static void
ntt_live_reg_mark_use(struct ntt_compile *c, struct ntt_live_reg_block_state *bs,
                      int ip, unsigned index, unsigned used_mask)
{
   bs->use[index] |= used_mask & ~bs->def[index];

   c->liveness[index].start = MIN2(c->liveness[index].start, ip);
   c->liveness[index].end = MAX2(c->liveness[index].end, ip);
}
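
/* Only upward-exposed reads become uses: if the block has already written
 * TEMP[n].xy (def = 0x3), a later read of TEMP[n].xyzw adds only .zw to
 * use[], since .xy is screened off by the preceding def.
 */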

static void
ntt_live_reg_setup_def_use(struct ntt_compile *c, nir_function_impl *impl, struct ntt_live_reg_state *state)
{
   for (int i = 0; i < impl->num_blocks; i++) {
      state->blocks[i].def = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defin = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].defout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].use = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].livein = rzalloc_array(state->blocks, uint8_t, c->num_temps);
      state->blocks[i].liveout = rzalloc_array(state->blocks, uint8_t, c->num_temps);
   }

   int ip = 0;
   nir_foreach_block(block, impl) {
      struct ntt_live_reg_block_state *bs = &state->blocks[block->index];
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);

      ntt_block->start_ip = ip;

      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
         const struct tgsi_opcode_info *opcode_info =
            tgsi_get_opcode_info(insn->opcode);

         /* Set up use[] for the srcs.
          *
          * Uses are the channels of the reg read in the block that don't have a
          * preceding def to screen them off. Note that we don't do per-element
          * tracking of array regs, so they're never screened off.
          */
         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->src[i].Index;

            uint32_t used_mask = tgsi_util_get_src_usage_mask(insn->opcode, i,
                                                              insn->dst->WriteMask,
                                                              insn->src[i].SwizzleX,
                                                              insn->src[i].SwizzleY,
                                                              insn->src[i].SwizzleZ,
                                                              insn->src[i].SwizzleW,
                                                              insn->tex_target,
                                                              insn->is_tex);

            assert(!insn->src[i].Indirect || index < c->first_non_array_temp);
            ntt_live_reg_mark_use(c, bs, ip, index, used_mask);
         }

         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY)
            ntt_live_reg_mark_use(c, bs, ip, insn->tex_offset.Index, 0xf);

         /* Set up def[] for the dsts.
          *
          * Defs are the unconditionally-written (not R/M/W) channels of the reg in
          * the block that don't have a preceding use.
          */
         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File != TGSI_FILE_TEMPORARY)
               continue;
            int index = insn->dst[i].Index;
            uint32_t writemask = insn->dst[i].WriteMask;

            bs->def[index] |= writemask & ~bs->use[index];
            bs->defout[index] |= writemask;

            assert(!insn->dst[i].Indirect || index < c->first_non_array_temp);
            c->liveness[index].start = MIN2(c->liveness[index].start, ip);
            c->liveness[index].end = MAX2(c->liveness[index].end, ip);
         }

         ip++;
      }

      ntt_block->end_ip = ip;
   }
}

static void
ntt_live_regs(struct ntt_compile *c, nir_function_impl *impl)
{
   nir_metadata_require(impl, nir_metadata_block_index);

   c->liveness = rzalloc_array(c, struct ntt_reg_interval, c->num_temps);

   struct ntt_live_reg_state state = {
      .blocks = rzalloc_array(impl, struct ntt_live_reg_block_state, impl->num_blocks),
   };

   /* The intervals start out with start > end (indicating unused) */
   for (int i = 0; i < c->num_temps; i++)
      c->liveness[i].start = ~0;

   ntt_live_reg_setup_def_use(c, impl, &state);

   /* Make a forward-order worklist of all the blocks. */
   nir_block_worklist_init(&state.worklist, impl->num_blocks, NULL);
   nir_foreach_block(block, impl) {
      nir_block_worklist_push_tail(&state.worklist, block);
   }

   /* Propagate defin/defout down the CFG to calculate the live variables
    * potentially defined along any possible control flow path. We'll use this
    * to keep things like conditional defs of the reg (or array regs where we
    * don't track defs!) from making the reg's live range extend back to the
    * start of the program.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
         nir_block *succ = block->successors[j];
         if (!succ || succ->index == impl->num_blocks)
            continue;

         for (int i = 0; i < c->num_temps; i++) {
            uint8_t new_def = state.blocks[block->index].defout[i] & ~state.blocks[succ->index].defin[i];

            if (new_def) {
               state.blocks[succ->index].defin[i] |= new_def;
               state.blocks[succ->index].defout[i] |= new_def;
               nir_block_worklist_push_tail(&state.worklist, succ);
            }
         }
      }
   }

   /* Make a reverse-order worklist of all the blocks. */
   nir_foreach_block(block, impl) {
      nir_block_worklist_push_head(&state.worklist, block);
   }

   /* We're now ready to work through the worklist and update the liveness sets
    * of each of the blocks. As long as we keep the worklist up-to-date as we
    * go, everything will get covered.
    */
   while (!nir_block_worklist_is_empty(&state.worklist)) {
      /* We pop them off in the reverse order we pushed them on. This way
       * the first walk of the instructions is backwards so we only walk
       * once in the case of no control flow.
       */
      nir_block *block = nir_block_worklist_pop_head(&state.worklist);
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
      struct ntt_live_reg_block_state *bs = &state.blocks[block->index];

      for (int i = 0; i < c->num_temps; i++) {
         /* Collect livein from our successors to include in our liveout. */
         for (int j = 0; j < ARRAY_SIZE(block->successors); j++) {
            nir_block *succ = block->successors[j];
            if (!succ || succ->index == impl->num_blocks)
               continue;

            struct ntt_live_reg_block_state *sbs = &state.blocks[succ->index];

            uint8_t new_liveout = sbs->livein[i] & ~bs->liveout[i];
            if (new_liveout) {
               if (state.blocks[block->index].defout[i])
                  c->liveness[i].end = MAX2(c->liveness[i].end, ntt_block->end_ip);
               bs->liveout[i] |= sbs->livein[i];
            }
         }

         /* Propagate use requests from either our block's uses or our
          * non-screened-off liveout up to our predecessors.
          */
         uint8_t new_livein = ((bs->use[i] | (bs->liveout[i] & ~bs->def[i])) &
                               ~bs->livein[i]);
         if (new_livein) {
            bs->livein[i] |= new_livein;
            set_foreach(block->predecessors, entry) {
               nir_block *pred = (void *)entry->key;
               nir_block_worklist_push_tail(&state.worklist, pred);
            }

            if (new_livein & state.blocks[block->index].defin[i])
               c->liveness[i].start = MIN2(c->liveness[i].start, ntt_block->start_ip);
         }
      }
   }

   ralloc_free(state.blocks);
   nir_block_worklist_fini(&state.worklist);
}

static void
ntt_ra_check(struct ntt_compile *c, unsigned *ra_map, BITSET_WORD *released, int ip, unsigned index)
{
   if (index < c->first_non_array_temp)
      return;

   if (c->liveness[index].start == ip && ra_map[index] == ~0)
      ra_map[index] = ureg_DECL_temporary(c->ureg).Index;

   if (c->liveness[index].end == ip && !BITSET_TEST(released, index)) {
      ureg_release_temporary(c->ureg, ureg_dst_register(TGSI_FILE_TEMPORARY, ra_map[index]));
      BITSET_SET(released, index);
   }
}

static void
ntt_allocate_regs(struct ntt_compile *c, nir_function_impl *impl)
{
   ntt_live_regs(c, impl);

   unsigned *ra_map = ralloc_array(c, unsigned, c->num_temps);
   unsigned *released = rzalloc_array(c, BITSET_WORD, BITSET_WORDS(c->num_temps));

   /* No RA on NIR array regs */
   for (int i = 0; i < c->first_non_array_temp; i++)
      ra_map[i] = i;

   for (int i = c->first_non_array_temp; i < c->num_temps; i++)
      ra_map[i] = ~0;

   int ip = 0;
   nir_foreach_block(block, impl) {
      struct ntt_block *ntt_block = ntt_block_from_nir(c, block);

      for (int i = 0; i < c->num_temps; i++)
         ntt_ra_check(c, ra_map, released, ip, i);

      util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
         const struct tgsi_opcode_info *opcode_info =
            tgsi_get_opcode_info(insn->opcode);

         for (int i = 0; i < opcode_info->num_src; i++) {
            if (insn->src[i].File == TGSI_FILE_TEMPORARY) {
               ntt_ra_check(c, ra_map, released, ip, insn->src[i].Index);
               insn->src[i].Index = ra_map[insn->src[i].Index];
            }
         }

         if (insn->is_tex && insn->tex_offset.File == TGSI_FILE_TEMPORARY) {
            ntt_ra_check(c, ra_map, released, ip, insn->tex_offset.Index);
            insn->tex_offset.Index = ra_map[insn->tex_offset.Index];
         }

         for (int i = 0; i < opcode_info->num_dst; i++) {
            if (insn->dst[i].File == TGSI_FILE_TEMPORARY) {
               ntt_ra_check(c, ra_map, released, ip, insn->dst[i].Index);
               insn->dst[i].Index = ra_map[insn->dst[i].Index];
            }
         }

         ip++;
      }

      for (int i = 0; i < c->num_temps; i++)
         ntt_ra_check(c, ra_map, released, ip, i);
   }
}

/**
 * Try to find an iadd of a constant value with a non-constant value in the
 * nir_src's first component, returning the constant offset and replacing *src
 * with the non-constant component.
 */
static uint32_t
ntt_extract_const_src_offset(nir_src *src)
{
   if (!src->is_ssa)
      return 0;

   nir_ssa_scalar s = nir_get_ssa_scalar(src->ssa, 0);

   while (nir_ssa_scalar_is_alu(s)) {
      nir_alu_instr *alu = nir_instr_as_alu(s.def->parent_instr);

      for (int i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
         if (!alu->src[i].src.is_ssa)
            return 0;
      }

      if (alu->op == nir_op_iadd) {
         for (int i = 0; i < 2; i++) {
            nir_const_value *v = nir_src_as_const_value(alu->src[i].src);
            if (v && !alu->src[i].negate && !alu->src[i].abs) {
               *src = alu->src[1 - i].src;
               return v[alu->src[i].swizzle[s.comp]].u32;
            }
         }

         return 0;
      }

      /* We'd like to reuse nir_ssa_scalar_chase_movs(), but it assumes SSA and that
       * seems reasonable for something used in inner loops of the compiler.
       */
      if (!nir_alu_instr_is_copy(alu))
         return 0;

      if (alu->op == nir_op_mov) {
         s.def = alu->src[0].src.ssa;
         s.comp = alu->src[0].swizzle[s.comp];
      } else if (nir_op_is_vec(alu->op)) {
         s.def = alu->src[s.comp].src.ssa;
         s.comp = alu->src[s.comp].swizzle[0];
      } else {
         return 0;
      }
   }

   return 0;
}
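
/* For example, given a src pointing at iadd(ssa_5, 16), this returns 16 and
 * rewrites *src to point at ssa_5, letting callers fold the constant into a
 * TGSI Index field instead of emitting address math.
 */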

static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * virglrenderer, at least).
 */
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}
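
/* Tracing the 64-bit path: one double at start_component 2 gives usage_mask
 * 0x4 (.z), which shifts down to 0x1 (.x) and then expands to
 * TGSI_WRITEMASK_XY, since the double occupies two 32-bit channels.
 */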

/* TGSI varying declarations have a component usage mask associated (used by
 * virglrenderer, at least).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}

static struct ureg_dst
ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* z write is to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1; /* stencil write is to the .y channel in TGSI */
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = semantics.invariant;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask;
   if (nir_intrinsic_has_write_mask(instr))
      write_mask = nir_intrinsic_write_mask(instr);
   else
      write_mask = ((1 << instr->num_components) - 1) << *frac;

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }

   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return true;
}

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
                                                semantic_name,
                                                semantic_index,
                                                interpolation,
                                                sample_loc,
                                                var->data.driver_location,
                                                usage_mask,
                                                array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ntt_temp(c);
         if (c->native_integers) {
            /* NIR is ~0 front and 0 back, while TGSI is +1 front */
            ntt_SGE(c, temp, decl, ureg_imm1f(c->ureg, 0));
         } else {
            /* tgsi docs say that floating point FACE will be positive for
             * frontface and negative for backface, but realistically
             * GLSL-to-TGSI had been doing MOV_SAT to turn it into 0.0 vs 1.0.
             * Copy that behavior, since some drivers (r300) have been doing a
             * 0.0 vs 1.0 backface (and I don't think anybody has a non-1.0
             * frontface).
             */
            temp.Saturate = true;
            ntt_MOV(c, temp, decl);
         }
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static int
ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
{
   return a->data.location - b->data.location;
}

/**
 * Workaround for virglrenderer requiring that TGSI FS output color variables
 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
 */
static void
ntt_setup_outputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);

   nir_foreach_shader_out_variable(var, c->s) {
      if (var->data.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(var->data.location,
                                       &semantic_name, &semantic_index);

      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   }
}

static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
      else
         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
      else
         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
      else
         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      if (is_shadow)
         return TGSI_TEXTURE_SHADOWRECT;
      else
         return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
      return TGSI_RETURN_TYPE_SINT;
   case GLSL_TYPE_UINT:
      return TGSI_RETURN_TYPE_UINT;
   case GLSL_TYPE_FLOAT:
      return TGSI_RETURN_TYPE_FLOAT;
   default:
      unreachable("unexpected texture type");
   }
}

static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   nir_foreach_uniform_variable(var, c->s) {
      if (glsl_type_is_sampler(glsl_without_array(var->type)) ||
          glsl_type_is_texture(glsl_without_array(var->type))) {
         /* Don't use this size for the check for samplers -- arrays of structs
          * containing samplers should be ignored, and just the separate lowered
          * sampler uniform decl used.
          */
         int size = glsl_type_get_sampler_count(var->type) +
                    glsl_type_get_texture_count(var->type);

         const struct glsl_type *stype = glsl_without_array(var->type);
         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
                                                                            glsl_sampler_type_is_array(stype),
                                                                            glsl_sampler_type_is_shadow(stype));
         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
         for (int i = 0; i < size; i++) {
            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
                                   target, ret_type, ret_type, ret_type, ret_type);
            ureg_DECL_sampler(c->ureg, var->data.binding + i);
         }
      } else if (glsl_contains_atomic(var->type)) {
         uint32_t offset = var->data.offset / 4;
         uint32_t size = glsl_atomic_size(var->type) / 4;
         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
      }

      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
       * size declaration happens with other UBOs below.
       */
   }

   nir_foreach_image_variable(var, c->s) {
      int image_count = glsl_type_get_image_count(var->type);
      const struct glsl_type *itype = glsl_without_array(var->type);
      enum tgsi_texture_type tex_type =
         tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
                                            glsl_sampler_type_is_array(itype), false);

      for (int i = 0; i < image_count; i++) {
         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                        var->data.binding + i,
                                                        tex_type,
                                                        var->data.image.format,
                                                        !(var->data.access & ACCESS_NON_WRITEABLE),
                                                        false);
      }
   }

   c->first_ubo = ~0;

   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      if (ubo == -1)
         continue;

      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
         c->first_ubo = MIN2(c->first_ubo, ubo);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);

      int array_size = 1;
      if (glsl_type_is_interface(glsl_without_array(var->type)))
         array_size = MAX2(1, glsl_get_aoa_size(var->type));

      for (int i = 0; i < array_size; i++) {
         /* Even if multiple NIR variables are in the same uniform block, their
          * explicit size is the size of the block.
          */
         if (ubo_sizes[ubo + i])
            assert(ubo_sizes[ubo + i] == size);

         ubo_sizes[ubo + i] = size;
      }
   }

   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
      if (ubo_sizes[i])
         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
   }

   for (int i = 0; i < c->s->info.num_ssbos; i++) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters.
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, i, atomic);
   }
}

static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   assert(c->num_temps == 0);
   /* Permanently allocate all the array regs at the start. */
   foreach_list_typed(nir_register, nir_reg, node, list) {
      if (nir_reg->num_array_elems != 0) {
         struct ureg_dst decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems, true);
         c->reg_temp[nir_reg->index] = decl;
         assert(c->num_temps == decl.Index);
         c->num_temps += nir_reg->num_array_elems;
      }
   }
   c->first_non_array_temp = c->num_temps;

   /* After that, allocate non-array regs in our virtual space that we'll
    * register-allocate before ureg emit.
    */
   foreach_list_typed(nir_register, nir_reg, node, list) {
      if (nir_reg->num_array_elems == 0) {
         struct ureg_dst decl;
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);

         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                          nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ntt_temp(c), write_mask);
         }
         c->reg_temp[nir_reg->index] = decl;
      }
   }
}

static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}
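
/* For 64-bit constants each component is split into two uints, low word
 * first: a single double 1.0 becomes values[] = { 0x00000000, 0x3ff00000 }
 * with num_components doubled to 2.
 */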

static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr, int addr_index)
{
   assert(addr_index < ARRAY_SIZE(c->addr_reg));

   for (int i = 0; i <= addr_index; i++) {
      if (!c->addr_declared[i]) {
         c->addr_reg[i] = ureg_writemask(ureg_DECL_address(c->ureg),
                                         TGSI_WRITEMASK_X);
         c->addr_declared[i] = true;
      }
   }

   if (c->native_integers)
      ntt_UARL(c, c->addr_reg[addr_index], addr);
   else
      ntt_ARL(c, c->addr_reg[addr_index], addr);
   return ureg_scalar(ureg_src(c->addr_reg[addr_index]), 0);
}
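
/* UARL copies an integer source straight into the address register, while
 * TGSI ARL is defined to floor() a float source, which is why the
 * non-native-integers path differs.
 */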

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return c->ssa_temp[src.ssa->index];
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset, 0));
      }
      return ureg_src(reg_temp);
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   if (nir_src_bit_size(src.src) == 64) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}
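
/* E.g. with write_mask .yw (0xa), first_chan is .y, producing swizzle
 * (Y, Y, Y, W): the unwritten .x and .z lanes just re-read .y.
 */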

static struct ureg_dst
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ntt_temp(c);

   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);

   return ureg_writemask(dst, writemask);
}

static struct ureg_dst
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset, 0));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Indirect && !src.DimIndirect) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = src;
         return;
      }
   }

   ntt_MOV(c, ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ntt_MOV(c, dst, src);
   }
}

static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   int i;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op != TGSI_OPCODE_POW)
      src1 = src0;

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         ntt_insn(c, tgsi_op,
                  ureg_writemask(dst, 1 << i),
                  ureg_scalar(src0, i),
                  ureg_scalar(src1, i),
                  ureg_src_undef(), ureg_src_undef());
      }
   }
}
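
/* TGSI scalar ops replicate one source channel, so this expands e.g. an RCP
 * over a .xz writemask into two instructions:
 *    RCP dst.x, src.xxxx
 *    RCP dst.z, src.zzzz
 */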

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   c->precise = instr->exact;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   for (; i < ARRAY_SIZE(src); i++)
      src[i] = ureg_src_undef();

   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },

      /* These bitwise ops don't care about 32 vs 64 types, so they have the
       * same TGSI opcode.
       */
      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },

      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
   };

   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
    * of .xy.  Store to a temp and move it to the real dst.
    */
   bool tgsi_64bit_compare = src_64 && !dst_64 &&
      (num_srcs == 2 ||
       nir_op_infos[instr->op].output_type == nir_type_bool32) &&
      (dst.WriteMask != TGSI_WRITEMASK_X);

   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
    * channels and will need to get fixed up.
    */
   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
                                  num_srcs == 1 && !tgsi_64bit_compare &&
                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));

   struct ureg_dst real_dst = ureg_dst_undef();
   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
      real_dst = dst;
      dst = ntt_temp(c);
   }

   bool table_op64 = src_64;
   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ntt_insn(c, op_map[instr->op][table_op64],
               dst, src[0], src[1], src[2], src[3]);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntt_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */
      switch (instr->op) {
      case nir_op_u2u64:
         ntt_AND(c, dst, ureg_swizzle(src[0],
                                      TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                      TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                 ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
         break;

      case nir_op_i2i32:
      case nir_op_u2u32:
         ntt_MOV(c, dst, ureg_swizzle(src[0],
                                      TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                      TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
         break;

      case nir_op_fabs:
         if (c->options->lower_fabs)
            ntt_MAX(c, dst, src[0], ureg_negate(src[0]));
         else
            ntt_MOV(c, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         if (dst_64) {
            ntt_MIN(c, dst, src[0], ntt_64bit_1f(c));
            ntt_MAX(c, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
         } else {
            ntt_MOV(c, ureg_saturate(dst), src[0]);
         }
         break;

      case nir_op_fneg:
         ntt_MOV(c, dst, ureg_negate(src[0]));
         break;

         /* NOTE: TGSI 32-bit math ops have the old "one source channel
          * replicated to all dst channels" behavior, while 64 is normal mapping
          * of src channels to dst.
          */
      case nir_op_frcp:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], ureg_src_undef());
         break;

      case nir_op_frsq:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fsqrt:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fexp2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], ureg_src_undef());
         break;

      case nir_op_flog2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], ureg_src_undef());
         break;

      case nir_op_b2f32:
         ntt_AND(c, dst, src[0], ureg_imm1f(c->ureg, 1.0));
         break;

      case nir_op_b2f64:
         ntt_AND(c, dst,
                 ureg_swizzle(src[0],
                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                 ntt_64bit_1f(c));
         break;

      case nir_op_f2b32:
         if (src_64)
            ntt_DSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
         else
            ntt_FSNE(c, dst, src[0], ureg_imm1f(c->ureg, 0));
         break;

      case nir_op_i2b32:
         if (src_64) {
            ntt_U64SNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
         } else
            ntt_USNE(c, dst, src[0], ureg_imm1u(c->ureg, 0));
         break;

      case nir_op_b2i32:
         ntt_AND(c, dst, src[0], ureg_imm1u(c->ureg, 1));
         break;

      case nir_op_b2i64:
         ntt_AND(c, dst,
                 ureg_swizzle(src[0],
                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                 ureg_imm4u(c->ureg, 1, 0, 1, 0));
         break;

      case nir_op_fsin:
         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fcos:
         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], ureg_src_undef());
         break;

      case nir_op_fsub:
         assert(!dst_64);
         ntt_ADD(c, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_isub:
         assert(!dst_64);
         ntt_UADD(c, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ntt_LRP(c, dst, src[2], src[1], src[0]);
         break;

      case nir_op_pack_64_2x32_split:
         ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
                 ureg_swizzle(src[0],
                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         ntt_MOV(c, ureg_writemask(dst, TGSI_WRITEMASK_YW),
                 ureg_swizzle(src[1],
                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_unpack_64_2x32_split_x:
         ntt_MOV(c, dst, ureg_swizzle(src[0],
                                      TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                      TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
         break;

      case nir_op_unpack_64_2x32_split_y:
         ntt_MOV(c, dst, ureg_swizzle(src[0],
                                      TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
                                      TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
         break;

      case nir_op_b32csel:
         if (nir_src_bit_size(instr->src[1].src) == 64) {
            ntt_UCMP(c, dst, ureg_swizzle(src[0],
                                          TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                          TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                     src[1], src[2]);
         } else {
            ntt_UCMP(c, dst, src[0], src[1], src[2]);
         }
         break;

      case nir_op_fcsel:
         /* NIR fcsel is src0 != 0 ? src1 : src2.
          * TGSI CMP is src0 < 0 ? src1 : src2.
          *
          * However, fcsel so far as I can find only appears on bools-as-floats
          * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
          * important to not have an abs here, as i915g has to make extra
          * instructions to do the abs.
          */
         if (c->options->lower_cmp) {
            /* If the HW doesn't support TGSI CMP (r300 VS), then lower it to a
             * LRP on the boolean 1.0/0.0 value, instead of requiring the
             * backend to turn the src0 into 1.0/0.0 first.
             *
             * We don't use this in general because on some hardware (i915 FS)
             * the LRP gets expanded to MUL/MAD.
             */
            ntt_LRP(c, dst, src[0], src[1], src[2]);
         } else {
            ntt_CMP(c, dst, ureg_negate(src[0]), src[1], src[2]);
         }
         break;

         /* It would be nice if we could get this left as scalar in NIR, since
          * the TGSI op is scalar.
          */
      case nir_op_frexp_sig:
      case nir_op_frexp_exp: {
         assert(src_64);
         struct ureg_dst temp = ntt_temp(c);

         for (int chan = 0; chan < 2; chan++) {
            int wm = 1 << chan;

            if (!(instr->dest.write_mask & wm))
               continue;

            struct ureg_dst dsts[2] = { temp, temp };
            if (instr->op == nir_op_frexp_sig) {
               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
            } else {
               dsts[1] = ureg_writemask(dst, wm);
            }

            struct ureg_src chan_src = ureg_swizzle(src[0],
                                                    chan * 2, chan * 2 + 1,
                                                    chan * 2, chan * 2 + 1);

            struct ntt_insn *insn = ntt_insn(c, TGSI_OPCODE_DFRACEXP,
                                             dsts[0],
                                             chan_src,
                                             ureg_src_undef(),
                                             ureg_src_undef(),
                                             ureg_src_undef());
            insn->dst[1] = dsts[1];
         }
         break;
      }

      case nir_op_ldexp:
         assert(dst_64); /* 32bit handled in table. */
         ntt_DLDEXP(c, dst, src[0],
                    ureg_swizzle(src[1],
                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   /* 64-bit op fixup movs */
   if (!ureg_dst_is_undef(real_dst)) {
      if (tgsi_64bit_compare) {
         ntt_MOV(c, real_dst,
                 ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
      } else {
         assert(tgsi_64bit_downconvert);
         uint8_t swizzle[] = {0, 0, 0, 0};
         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
         if (second_bit)
            swizzle[ffs(second_bit) - 1] = 1;
         ntt_MOV(c, real_dst, ureg_swizzle(ureg_src(dst),
                                           swizzle[0], swizzle[1],
                                           swizzle[2], swizzle[3]));
      }
   }

   c->precise = false;
}

static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src, int addr_reg)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntt_src_as_uint(c, src);
      return usrc;
   }
   return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src), addr_reg));
}

static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntt_src_as_uint(c, src);
      return dst;
   }
   return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src), 0));
}

static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   }

   return ureg_src_dimension_indirect(usrc,
                                      ntt_reladdr(c, ntt_get_src(c, src), 1),
                                      0);
}

static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
   }
   return ureg_dst_dimension_indirect(udst,
                                      ntt_reladdr(c, ntt_get_src(c, src), 1),
                                      0);
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}
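
/* E.g. frac=1 with num_components=2 yields swizzle (y, z, z, z): the two
 * loaded channels move down to .xy and the trailing lanes clamp to the last
 * valid component.
 */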

static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   struct ureg_dst addr_temp = ureg_dst_undef();

   if (nir_src_is_const(instr->src[0])) {
      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
   } else {
      /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indirect
       * address.
       *
       * Many nir intrinsics have a base address const value for the start of
       * their array indirection, but load_ubo doesn't.  We fake it by
       * subtracting it off here.
       */
      addr_temp = ntt_temp(c);
      ntt_UADD(c, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
      src = ureg_src_dimension_indirect(src,
                                        ntt_reladdr(c, ureg_src(addr_temp), 1),
                                        c->first_ubo);
   }

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */
      src.Index = nir_intrinsic_base(instr);

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1]), 0));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
       */
      struct ntt_insn *insn =
         ntt_insn(c, TGSI_OPCODE_LOAD,
                  ntt_get_dest(c, &instr->dest),
                  src, ntt_get_src(c, instr->src[1]),
                  ureg_src_undef(), ureg_src_undef());
      insn->is_mem = true;
      insn->tex_target = 0;
      insn->mem_qualifier = 0;
      insn->mem_format = 0; /* unused */
   }
}

static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
{
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   unsigned qualifier = 0;

   if (access & ACCESS_COHERENT)
      qualifier |= TGSI_MEMORY_COHERENT;
   if (access & ACCESS_VOLATILE)
      qualifier |= TGSI_MEMORY_VOLATILE;
   if (access & ACCESS_RESTRICT)
      qualifier |= TGSI_MEMORY_RESTRICT;

   return qualifier;
}
1884
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1885
nir_variable_mode mode)
1887
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1888
instr->intrinsic == nir_intrinsic_store_shared);
1889
bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1890
instr->intrinsic == nir_intrinsic_load_ssbo ||
1891
instr->intrinsic == nir_intrinsic_load_shared);
1893
struct ureg_src src[4];
1896
struct ureg_dst addr_temp = ureg_dst_undef();
1898
struct ureg_src memory;
1900
case nir_var_mem_ssbo:
1901
memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1902
instr->src[is_store ? 1 : 0], 2);
1905
case nir_var_mem_shared:
1906
memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1909
case nir_var_uniform: { /* HW atomic buffers */
1910
nir_src src = instr->src[0];
1911
uint32_t offset = ntt_extract_const_src_offset(&src) / 4;
1912
memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, offset);
1913
/* ntt_ureg_src_indirect, except dividing by 4 */
1914
if (nir_src_is_const(src)) {
1915
memory.Index += nir_src_as_uint(src) / 4;
1917
addr_temp = ntt_temp(c);
1918
ntt_USHR(c, addr_temp, ntt_get_src(c, src), ureg_imm1i(c->ureg, 2));
1919
memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp), 2));
1921
      memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
      next_src = 0;
      break;
   }

   default:
      unreachable("unknown memory type");
   }

   if (is_store) {
      src[num_src++] = ntt_get_src(c, instr->src[next_src + 1]); /* offset */
      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
   } else {
      src[num_src++] = memory;
      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
         src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* offset */
         switch (instr->intrinsic) {
         case nir_intrinsic_atomic_counter_inc:
            src[num_src++] = ureg_imm1i(c->ureg, 1);
            break;
         case nir_intrinsic_atomic_counter_post_dec:
            src[num_src++] = ureg_imm1i(c->ureg, -1);
            break;
         default:
            if (!is_load)
               src[num_src++] = ntt_get_src(c, instr->src[next_src++]); /* value */
            break;
         }
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_shared_atomic_add:
      opcode = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_shared_atomic_fadd:
      opcode = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
      opcode = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
      opcode = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
      opcode = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
      opcode = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_shared_atomic_and:
      opcode = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_shared_atomic_or:
      opcode = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
      opcode = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_shared_atomic_exchange:
      opcode = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_atomic_counter_comp_swap:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_comp_swap:
      opcode = TGSI_OPCODE_ATOMCAS;
      src[num_src++] = ntt_get_src(c, instr->src[next_src++]);
      break;
   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
      opcode = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
      opcode = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_get_ssbo_size:
      opcode = TGSI_OPCODE_RESQ;
      break;
   default:
      unreachable("unknown memory op");
   }

   unsigned qualifier = 0;
   if (mode == nir_var_mem_ssbo &&
       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
      qualifier = ntt_get_access_qualifier(instr);
   }

   struct ureg_dst dst;
   if (is_store) {
      dst = ureg_dst(memory);

      unsigned write_mask = nir_intrinsic_write_mask(instr);
      if (nir_src_bit_size(instr->src[0]) == 64)
         write_mask = ntt_64bit_write_mask(write_mask);
      dst = ureg_writemask(dst, write_mask);
   } else {
      dst = ntt_get_dest(c, &instr->dest);
   }

   struct ntt_insn *insn = ntt_insn(c, opcode, dst, src[0], src[1], src[2], src[3]);
   insn->tex_target = TGSI_TEXTURE_BUFFER;
   insn->mem_qualifier = qualifier;
   insn->mem_format = 0; /* unused */
   insn->is_mem = true;
}
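/* Worked example for the path above (illustrative; register numbers are made
 * up): a NIR "ssbo_atomic_add" on buffer 0 becomes roughly
 *
 *    ATOMUADD TEMP[0].x, BUFFER[0], offset.xxxx, value.xxxx
 *
 * with BUFFER[0] coming through the "memory" src and the offset/value
 * gathered by the num_src logic, while get_ssbo_size instead emits RESQ with
 * only the memory src.
 */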
static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   unsigned op;
   struct ureg_src srcs[4];
   int num_src = 0;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);

   struct ureg_dst temp = ureg_dst_undef();

   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);

   struct ureg_src resource =
      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
                            instr->src[0], 2);

   struct ureg_dst dst;
   if (instr->intrinsic == nir_intrinsic_image_store) {
      dst = ureg_dst(resource);
   } else {
      srcs[num_src++] = resource;
      dst = ntt_get_dest(c, &instr->dest);
   }
   struct ureg_dst opcode_dst = dst;

   if (instr->intrinsic != nir_intrinsic_image_size && instr->intrinsic != nir_intrinsic_image_samples) {
      struct ureg_src coord = ntt_get_src(c, instr->src[1]);

      if (dim == GLSL_SAMPLER_DIM_MS) {
         temp = ntt_temp(c);
         ntt_MOV(c, temp, coord);
         ntt_MOV(c, ureg_writemask(temp, TGSI_WRITEMASK_W),
                 ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
         coord = ureg_src(temp);
      }
      srcs[num_src++] = coord;

      if (instr->intrinsic != nir_intrinsic_image_load) {
         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_image_load:
      op = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_image_store:
      op = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_image_size:
      op = TGSI_OPCODE_RESQ;
      break;
   case nir_intrinsic_image_samples:
      op = TGSI_OPCODE_RESQ;
      opcode_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
      break;
   case nir_intrinsic_image_atomic_add:
      op = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_image_atomic_fadd:
      op = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_image_atomic_imin:
      op = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_image_atomic_umin:
      op = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_image_atomic_imax:
      op = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_image_atomic_umax:
      op = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_image_atomic_and:
      op = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_image_atomic_or:
      op = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_image_atomic_xor:
      op = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_image_atomic_exchange:
      op = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      op = TGSI_OPCODE_ATOMCAS;
      break;
   default:
      unreachable("bad op");
   }

   struct ntt_insn *insn = ntt_insn(c, op, opcode_dst, srcs[0], srcs[1], srcs[2], srcs[3]);
   insn->tex_target = target;
   insn->mem_qualifier = ntt_get_access_qualifier(instr);
   insn->mem_format = nir_intrinsic_format(instr);
   insn->is_mem = true;

   if (instr->intrinsic == nir_intrinsic_image_samples)
      ntt_MOV(c, dst, ureg_scalar(ureg_src(opcode_dst), 3));
}
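/* MSAA example for the coord packing above (illustrative): an image_load from
 * a 2DMS image moves the coordinate into a temp and packs the sample index
 * into .w, roughly:
 *
 *    MOV  TEMP[t], coord
 *    MOV  TEMP[t].w, sample_index.xxxx
 *    LOAD TEMP[d], IMAGE[0], TEMP[t]    (target 2D_MSAA)
 *
 * since TGSI carries the sample index in the coordinate's .w channel.
 */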
static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   bool is_64 = nir_dest_bit_size(instr->dest) == 64;

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      /* XXX: ArrayID is used in r600 gs inputs */
      uint32_t array_id = 0;

      input = ureg_DECL_input_layout(c->ureg,
                                     semantic_name,
                                     semantic_index,
                                     base,
                                     ntt_tgsi_usage_mask(frac,
                                                         instr->num_components,
                                                         is_64),
                                     array_id,
                                     semantics.num_slots);
   } else {
      input = c->input_index_map[base];
   }

   if (is_64)
      num_components *= 2;

   input = ntt_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[0], 0);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_per_vertex_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);
      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntt_ureg_src_indirect(c, input, instr->src[1], 0);

      nir_intrinsic_instr *bary_instr =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
      case nir_intrinsic_load_barycentric_sample:
         /* For these, we know that the barycentric load matches the
          * interpolation on the input declaration, so we can use it directly.
          */
         ntt_store(c, &instr->dest, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntt_store(c, &instr->dest, input);
         } else {
            ntt_INTERP_CENTROID(c, ntt_get_dest(c, &instr->dest), input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         /* We stored the sample in the fake "bary" dest. */
         ntt_INTERP_SAMPLE(c, ntt_get_dest(c, &instr->dest), input,
                           ntt_get_src(c, instr->src[0]));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ntt_INTERP_OFFSET(c, ntt_get_dest(c, &instr->dest), input,
                           ntt_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}
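/* Interpolation example (illustrative): a
 * load_interpolated_input(load_barycentric_centroid) on an input that was not
 * declared centroid emits
 *
 *    INTERP_CENTROID TEMP[d], IN[n]
 *
 * while barycentric_pixel/sample loads read IN[n] directly, and the
 * at_sample/at_offset variants pass the stored argument through
 * INTERP_SAMPLE/INTERP_OFFSET.
 */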
static void
ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ntt_get_src(c, instr->src[0]);

   if (src.File == TGSI_FILE_OUTPUT) {
      /* If our src is the output file, that's an indication that we were able
       * to emit the output stores in the generating instructions and we have
       * nothing to do here.
       */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = { 0, 0, 0, 0 };
   for (int i = frac; i < 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ntt_MOV(c, out, src);
}
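/* Worked example for the swizzle above: a vec2 store to the zw components of
 * an output has frac == 2 and out.WriteMask == TGSI_WRITEMASK_ZW, so swizzle[]
 * becomes { 0, 0, 0, 1 } and the MOV reads src.xxxy, lining source channels 0
 * and 1 up with OUT[n].z and OUT[n].w.
 */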
static void
ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);

   /* ntt_try_store_in_tgsi_output() optimization is not valid if normal
    * load_output is present.
    */
   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
          (c->s->info.stage != MESA_SHADER_FRAGMENT || semantics.fb_fetch_output));

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
   }

   if (semantics.fb_fetch_output)
      ntt_FBFETCH(c, ntt_get_dest(c, &instr->dest), ureg_src(out));
   else
      ntt_MOV(c, ntt_get_dest(c, &instr->dest), ureg_src(out));
}
static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
   sv = ntt_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
    */
   if (!c->native_integers) {
      switch (instr->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_instance_id:
         ntt_U2F(c, ntt_get_dest(c, &instr->dest), sv);
         return;

      default:
         break;
      }
   }

   ntt_store(c, &instr->dest, sv);
}
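/* Example of the virglrenderer workaround above: load_num_workgroups has
 * three components, so write_mask is 0x7 and ntt_swizzle_for_write_mask()
 * keeps the sysval's undefined .w channel from ever being referenced, which
 * would otherwise fail GLSL compilation on the host.
 */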
static void
ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
      ntt_emit_load_ubo(c, instr);
      break;

   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_sample_id:
   case nir_intrinsic_load_sample_pos:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_tess_coord:
   case nir_intrinsic_load_patch_vertices_in:
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_tess_level_outer:
   case nir_intrinsic_load_tess_level_inner:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      ntt_emit_load_sysval(c, instr);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
      ntt_emit_load_input(c, instr);
      break;

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      ntt_emit_store_output(c, instr);
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
      ntt_emit_load_output(c, instr);
      break;

   case nir_intrinsic_discard:
      ntt_KILL(c);
      break;

   case nir_intrinsic_discard_if: {
      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);

      if (c->native_integers) {
         struct ureg_dst temp = ureg_writemask(ntt_temp(c), 1);
         ntt_AND(c, temp, cond, ureg_imm1f(c->ureg, 1.0));
         ntt_KILL_IF(c, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
      } else {
         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
         ntt_KILL_IF(c, ureg_negate(cond));
      }
      break;
   }

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_get_ssbo_size:
      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      ntt_emit_mem(c, instr, nir_var_mem_shared);
      break;

   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_atomic_counter_comp_swap:
      ntt_emit_mem(c, instr, nir_var_uniform);
      break;

   case nir_intrinsic_atomic_counter_pre_dec:
      unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
      break;

   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_image_atomic_fadd:
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
      ntt_emit_image_load_store(c, instr);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_tcs_patch:
      ntt_BARRIER(c);
      break;

   case nir_intrinsic_memory_barrier:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg,
                               TGSI_MEMBAR_SHADER_BUFFER |
                               TGSI_MEMBAR_ATOMIC_BUFFER |
                               TGSI_MEMBAR_SHADER_IMAGE |
                               TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_memory_barrier_atomic_counter:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_buffer:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_image:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
      break;

   case nir_intrinsic_memory_barrier_shared:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_group_memory_barrier:
      ntt_MEMBAR(c, ureg_imm1u(c->ureg,
                               TGSI_MEMBAR_SHADER_BUFFER |
                               TGSI_MEMBAR_ATOMIC_BUFFER |
                               TGSI_MEMBAR_SHADER_IMAGE |
                               TGSI_MEMBAR_SHARED |
                               TGSI_MEMBAR_THREAD_GROUP));
      break;

   case nir_intrinsic_end_primitive:
      ntt_ENDPRIM(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   case nir_intrinsic_emit_vertex:
      ntt_EMIT(c, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

      /* In TGSI we don't actually generate the barycentric coords, and emit
       * interp intrinsics later.  However, we do need to store the
       * load_barycentric_at_* argument so that we can use it at that point.
       */
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      break;
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
      break;

   case nir_intrinsic_shader_clock:
      ntt_CLOCK(c, ntt_get_dest(c, &instr->dest));
      break;

   default:
      fprintf(stderr, "Unknown intrinsic: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      break;
   }
}
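/* Note on the discard_if path above: with native integers the NIR boolean is
 * ~0u/0u, so ANDing with the bit pattern of 1.0f yields 1.0f or 0.0f, and
 * KILL_IF on the negated value (-1.0f < 0) kills exactly when the condition
 * was true.  An illustrative (made-up) sequence:
 *
 *    AND     TEMP[t].x, cond.xxxx, IMM[1.0f]
 *    KILL_IF -TEMP[t].xxxx
 */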
struct ntt_tex_operand_state {
   struct ureg_src srcs[4];
   unsigned i;
};

static void
ntt_push_tex_arg(struct ntt_compile *c,
                 nir_tex_instr *instr,
                 nir_tex_src_type tex_src_type,
                 struct ntt_tex_operand_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   s->srcs[s->i++] = ntt_get_src(c, instr->src[tex_src].src);
}
static void
ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
{
   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
   unsigned tex_opcode;

   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
   if (sampler_src >= 0) {
      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr, 2));
   }

   switch (instr->op) {
   case nir_texop_tex:
      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
          MAX2(instr->coord_components, 2) + instr->is_shadow)
         tex_opcode = TGSI_OPCODE_TXP;
      else
         tex_opcode = TGSI_OPCODE_TEX;
      break;
   case nir_texop_txf:
   case nir_texop_txf_ms:
      tex_opcode = TGSI_OPCODE_TXF;

      if (c->has_txf_lz) {
         int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
         if (lod_src >= 0 &&
             nir_src_is_const(instr->src[lod_src].src) &&
             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
            tex_opcode = TGSI_OPCODE_TXF_LZ;
         }
      }
      break;
   case nir_texop_txl:
      tex_opcode = TGSI_OPCODE_TXL;
      break;
   case nir_texop_txb:
      tex_opcode = TGSI_OPCODE_TXB;
      break;
   case nir_texop_txd:
      tex_opcode = TGSI_OPCODE_TXD;
      break;
   case nir_texop_txs:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_tg4:
      tex_opcode = TGSI_OPCODE_TG4;
      break;
   case nir_texop_query_levels:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_lod:
      tex_opcode = TGSI_OPCODE_LODQ;
      break;
   case nir_texop_texture_samples:
      tex_opcode = TGSI_OPCODE_TXQS;
      break;
   default:
      unreachable("unsupported tex op");
   }

   struct ntt_tex_operand_state s = { .i = 0 };
   ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
   ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);

   /* non-coord arg for TXQ */
   if (tex_opcode == TGSI_OPCODE_TXQ) {
      ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
       * scalar
       */
      s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
   }

   if (s.i > 1) {
      if (tex_opcode == TGSI_OPCODE_TEX)
         tex_opcode = TGSI_OPCODE_TEX2;
      if (tex_opcode == TGSI_OPCODE_TXB)
         tex_opcode = TGSI_OPCODE_TXB2;
      if (tex_opcode == TGSI_OPCODE_TXL)
         tex_opcode = TGSI_OPCODE_TXL2;
   }

   if (instr->op == nir_texop_txd) {
      /* Derivs appear in their own src args */
      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
   }

   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
      if (c->screen->get_param(c->screen,
                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
         sampler = ureg_scalar(sampler, instr->component);
         s.srcs[s.i++] = ureg_src_undef();
      } else {
         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
      }
   }

   s.srcs[s.i++] = sampler;

   enum tgsi_return_type tex_type;
   switch (instr->dest_type) {
   case nir_type_float32:
      tex_type = TGSI_RETURN_TYPE_FLOAT;
      break;
   case nir_type_int32:
      tex_type = TGSI_RETURN_TYPE_SINT;
      break;
   case nir_type_uint32:
      tex_type = TGSI_RETURN_TYPE_UINT;
      break;
   default:
      unreachable("unknown texture type");
   }

   struct tgsi_texture_offset tex_offset = {
      .File = TGSI_FILE_NULL
   };

   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
   if (tex_offset_src >= 0) {
      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);

      tex_offset.File = offset.File;
      tex_offset.Index = offset.Index;
      tex_offset.SwizzleX = offset.SwizzleX;
      tex_offset.SwizzleY = offset.SwizzleY;
      tex_offset.SwizzleZ = offset.SwizzleZ;
      tex_offset.Padding = 0;
   }

   struct ureg_dst tex_dst;
   if (instr->op == nir_texop_query_levels)
      tex_dst = ureg_writemask(ntt_temp(c), TGSI_WRITEMASK_W);
   else
      tex_dst = dst;

   while (s.i < 4)
      s.srcs[s.i++] = ureg_src_undef();

   struct ntt_insn *insn = ntt_insn(c, tex_opcode, tex_dst, s.srcs[0], s.srcs[1], s.srcs[2], s.srcs[3]);
   insn->tex_target = target;
   insn->tex_return_type = tex_type;
   insn->tex_offset = tex_offset;
   insn->is_tex = true;

   if (instr->op == nir_texop_query_levels)
      ntt_MOV(c, dst, ureg_scalar(ureg_src(tex_dst), 3));
}
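/* Operand-packing example (illustrative): a biased shadow sample on a 2D
 * array sampler fills backend1 with coord.xyz plus the comparator, so the
 * bias spills into backend2 and the opcode is promoted TXB -> TXB2:
 *
 *    TXB2 TEMP[d], TEMP[backend1], TEMP[backend2], SAMP[0]
 *
 * The names here are made up; the real srcs come from ntt_push_tex_arg().
 */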
static void
ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
      ntt_BRK(c);
      break;

   case nir_jump_continue:
      ntt_CONT(c);
      break;

   default:
      fprintf(stderr, "Unknown jump instruction: ");
      nir_print_instr(&jump->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
{
   /* Nothing to do but make sure that we have some storage to deref. */
   (void)ntt_get_ssa_def_decl(c, &instr->def);
}
static void
ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_deref:
      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
      break;

   case nir_instr_type_alu:
      ntt_emit_alu(c, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_load_const:
      /* Nothing to do here, as load consts are done directly from
       * ntt_get_src() (since many constant NIR srcs will often get folded
       * directly into a register file index instead of as a TGSI src).
       */
      break;

   case nir_instr_type_tex:
      ntt_emit_texture(c, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      ntt_emit_jump(c, nir_instr_as_jump(instr));
      break;

   case nir_instr_type_ssa_undef:
      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
      break;

   default:
      fprintf(stderr, "Unknown NIR instr type: ");
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}
static void
ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
{
   if (c->native_integers)
      ntt_UIF(c, c->if_cond);
   else
      ntt_IF(c, c->if_cond);

   ntt_emit_cf_list(c, &if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      ntt_ELSE(c);
      ntt_emit_cf_list(c, &if_stmt->else_list);
   }

   ntt_ENDIF(c);
}

static void
ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
{
   ntt_BGNLOOP(c);
   ntt_emit_cf_list(c, &loop->body);
   ntt_ENDLOOP(c);
}
static void
ntt_emit_block(struct ntt_compile *c, nir_block *block)
{
   struct ntt_block *ntt_block = ntt_block_from_nir(c, block);
   c->cur_block = ntt_block;

   nir_foreach_instr(instr, block) {
      ntt_emit_instr(c, instr);

      /* Sanity check that we didn't accidentally ureg_OPCODE() instead of ntt_OPCODE(). */
      if (ureg_get_instruction_number(c->ureg) != 0) {
         fprintf(stderr, "Emitted ureg insn during: ");
         nir_print_instr(instr, stderr);
         fprintf(stderr, "\n");
         unreachable("emitted ureg insn");
      }
   }

   /* Set up the if condition for ntt_emit_if(), which we have to do before
    * freeing up the temps (the "if" is treated as inside the block for liveness
    * purposes, despite not being an instruction)
    *
    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
    * looks at all of .xyzw.  No harm in working around the bug.
    */
   nir_if *nif = nir_block_get_following_if(block);
   if (nif)
      c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
}
static void
ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntt_emit_block(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntt_emit_if(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         ntt_emit_loop(c, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}
static void
ntt_emit_block_ureg(struct ntt_compile *c, struct nir_block *block)
{
   struct ntt_block *ntt_block = ntt_block_from_nir(c, block);

   /* Emit the ntt insns to tgsi_ureg. */
   util_dynarray_foreach(&ntt_block->insns, struct ntt_insn, insn) {
      const struct tgsi_opcode_info *opcode_info =
         tgsi_get_opcode_info(insn->opcode);

      switch (insn->opcode) {
      case TGSI_OPCODE_UIF:
         ureg_UIF(c->ureg, insn->src[0], &c->cf_label);
         break;

      case TGSI_OPCODE_IF:
         ureg_IF(c->ureg, insn->src[0], &c->cf_label);
         break;

      case TGSI_OPCODE_ELSE:
         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
         ureg_ELSE(c->ureg, &c->cf_label);
         c->current_if_else = c->cf_label;
         break;

      case TGSI_OPCODE_ENDIF:
         ureg_fixup_label(c->ureg, c->current_if_else, ureg_get_instruction_number(c->ureg));
         ureg_ENDIF(c->ureg);
         break;

      case TGSI_OPCODE_BGNLOOP:
         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
          * with a need.
          */
         ureg_BGNLOOP(c->ureg, &c->cf_label);
         break;

      case TGSI_OPCODE_ENDLOOP:
         ureg_ENDLOOP(c->ureg, &c->cf_label);
         break;

      default:
         if (insn->is_tex) {
            ureg_tex_insn(c->ureg, insn->opcode,
                          insn->dst, opcode_info->num_dst,
                          insn->tex_target, insn->tex_return_type,
                          &insn->tex_offset,
                          insn->tex_offset.File != TGSI_FILE_NULL ? 1 : 0,
                          insn->src, opcode_info->num_src);
         } else if (insn->is_mem) {
            ureg_memory_insn(c->ureg, insn->opcode,
                             insn->dst, opcode_info->num_dst,
                             insn->src, opcode_info->num_src,
                             insn->mem_qualifier,
                             insn->tex_target,
                             insn->mem_format);
         } else {
            ureg_insn(c->ureg, insn->opcode,
                      insn->dst, opcode_info->num_dst,
                      insn->src, opcode_info->num_src,
                      insn->precise);
         }
         break;
      }
   }
}
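/* Sketch of the label fixup above (instruction numbers are made up): if
 * ureg_IF() is emitted as instruction 10 with a placeholder jump label,
 * c->current_if_else remembers that label, and when the matching ELSE/ENDIF
 * is reached at, say, instruction 14, ureg_fixup_label() rewrites the
 * placeholder to 14 so the conditional branch skips the then-block.
 */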
static void
ntt_emit_if_ureg(struct ntt_compile *c, nir_if *if_stmt)
{
   /* Note: the last block emitted our IF opcode. */

   int if_stack = c->current_if_else;
   c->current_if_else = c->cf_label;

   /* Either the then or else block includes the ENDIF, which will fix up the
    * IF(/ELSE)'s label for jumping
    */
   ntt_emit_cf_list_ureg(c, &if_stmt->then_list);
   ntt_emit_cf_list_ureg(c, &if_stmt->else_list);

   c->current_if_else = if_stack;
}

static void
ntt_emit_cf_list_ureg(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntt_emit_block_ureg(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntt_emit_if_ureg(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
          * does reference BGNLOOP's.  Follow the former behavior unless something comes up
          * with a need.
          */
         ntt_emit_cf_list_ureg(c, &nir_cf_node_as_loop(node)->body);
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}
static void
ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
{
   c->impl = impl;

   c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);

   /* Set up the struct ntt_blocks to put insns in */
   c->blocks = _mesa_pointer_hash_table_create(c);
   nir_foreach_block(block, impl) {
      struct ntt_block *ntt_block = rzalloc(c->blocks, struct ntt_block);
      util_dynarray_init(&ntt_block->insns, ntt_block);
      _mesa_hash_table_insert(c->blocks, block, ntt_block);
   }

   ntt_setup_registers(c, &impl->registers);

   c->cur_block = ntt_block_from_nir(c, nir_start_block(impl));
   ntt_setup_inputs(c);
   ntt_setup_outputs(c);
   ntt_setup_uniforms(c);

   /* Emit the ntt insns */
   ntt_emit_cf_list(c, &impl->body);

   ntt_allocate_regs(c, impl);

   /* Turn the ntt insns into actual TGSI tokens */
   ntt_emit_cf_list_ureg(c, &impl->body);

   ralloc_free(c->liveness);
   c->liveness = NULL;
}

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}
/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
 * can handle for 64-bit values in TGSI.
 */
static bool
ntt_should_vectorize_instr(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_ibitfield_extract:
   case nir_op_ubitfield_extract:
   case nir_op_bitfield_insert:
      /* virglrenderer only looks at the .x channel of the offset/bits operands
       * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
       * offset/bits operands.
       *
       * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
       */
      return false;

   default:
      break;
   }

   int src_bit_size = nir_src_bit_size(alu->src[0].src);
   int dst_bit_size = nir_dest_bit_size(alu->dest.dest);

   if (src_bit_size == 64 || dst_bit_size == 64) {
      /* Avoid vectorizing 64-bit instructions at all.  Despite tgsi.rst
       * claiming support, virglrenderer generates bad shaders on the host when
       * presented with them.  Maybe we can make virgl avoid tickling the
       * virglrenderer bugs, but given that glsl-to-TGSI didn't generate vector
       * 64-bit instrs in the first place, I don't see much reason to care about
       * this.
       */
      return false;
   }

   return true;
}
static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should always be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}
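/* Worked example for the check above: with align == 8, a load could start at
 * byte 8 of a 16-byte TGSI vec4, i.e. component 2, so worst_start_component +
 * num_components is 2 + 3 > 4 for a vec3 (rejected) but 2 + 2 == 4 for a vec2
 * (accepted).
 */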
static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = 0;

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
      indirect_mask |= nir_var_shader_in;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
      indirect_mask |= nir_var_shader_out;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}
static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool progress;
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned control_flow_depth =
      screen->get_shader_param(screen, pipe_stage,
                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
      NIR_PASS(progress, s, nir_opt_dead_write_vars);

      NIR_PASS(progress, s, nir_opt_if, true);
      NIR_PASS(progress, s, nir_opt_peephole_select,
               control_flow_depth == 0 ? ~0 : 8, true, true);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ntt_should_vectorize_io,
      };
      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
      NIR_PASS(progress, s, nir_opt_shrink_stores, true);
      NIR_PASS(progress, s, nir_opt_shrink_vectors);
      NIR_PASS(progress, s, nir_opt_trivial_continues);
      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
      NIR_PASS(progress, s, nir_opt_undef);
      NIR_PASS(progress, s, nir_opt_loop_unroll);

      /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
       * and ALU ops for it.
       */
      static const nir_opt_offsets_options offset_options = {
         .ubo_vec4_max = ~0,

         /* No const offset in TGSI for shared accesses. */
         .shared_max = 0,

         /* unused intrinsics */
         .uniform_max = 0,
         .buffer_max = 0,
      };
      NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
   } while (progress);

   NIR_PASS_V(s, nir_lower_var_copies);
}
/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
 * scalarize vec3/vec4s, should probably fix that.
 */
static bool
scalarize_64bit(const nir_instr *instr, const void *data)
{
   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
           nir_src_bit_size(alu->src[0].src) == 64);
}
static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_ssbo:
      break;

   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (nir_dest_bit_size(instr->dest) != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->dest.ssa.num_components = 2;
      second->dest.ssa.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_ssa_scalar channels[4] = {
         nir_get_ssa_scalar(&first->dest.ssa, 0),
         nir_get_ssa_scalar(&first->dest.ssa, 1),
         nir_get_ssa_scalar(&second->dest.ssa, 0),
         nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
      };
      nir_ssa_def *new = nir_vec_scalars(b, channels, instr->num_components);
      nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_ssa_def *src0 = instr->src[0].ssa;
      nir_ssa_scalar channels[4] = { 0 };
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_get_ssa_scalar(src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_instr_rewrite_src(&first->instr, &first->src[0],
                            nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
      nir_instr_rewrite_src(&second->instr, &second->src[0],
                            nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
                                                            second->num_components)));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
      offset_src = 1;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }

   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_ssa_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
                            nir_src_for_ssa(second_offset));
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}
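/* Example of the split above (illustrative): a 64-bit vec4 load_ssbo becomes
 * two vec2 clones, the second reading at offset + 16 bytes (or + 1 vec4 slot
 * for load_ubo_vec4), and the four result channels are re-vec'd for the
 * original instruction's users.  A 64-bit vec3 store with write mask 0x7
 * splits into write masks 0x3 and 0x1 on the two halves.
 */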
static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first =
      nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_ssa_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_ssa_def *new = nir_vec(b, channels, num_components);
   nir_ssa_def_rewrite_uses(&instr->def, new);
   nir_instr_remove(&instr->instr);
   return true;
}
static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
                                      void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));

   default:
      return false;
   }
}

static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}
struct ntt_lower_tex_state {
   nir_ssa_scalar channels[8];
   unsigned i;
};

static void
nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
                                nir_tex_instr *instr,
                                nir_tex_src_type tex_src_type,
                                struct ntt_lower_tex_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   assert(instr->src[tex_src].src.is_ssa);

   nir_ssa_def *def = instr->src[tex_src].src.ssa;
   for (int i = 0; i < def->num_components; i++) {
      s->channels[s->i++] = nir_get_ssa_scalar(def, i);
   }

   nir_tex_instr_remove_src(instr, tex_src);
}
/**
 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
 * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
 * manage it on our own, and may lead to more vectorization.
 */
static bool
nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
      return false;

   /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted
    * implicit lod in shader stages that don't have quad-based derivatives.
    * TGSI doesn't want that, it requires that the backend do implicit LOD 0
    * for those stages.
    */
   if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) {
      int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod);
      nir_src *lod_src = &tex->src[lod_index].src;
      if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) {
         nir_tex_instr_remove_src(tex, lod_index);
         tex->op = nir_texop_tex;
      }
   }

   b->cursor = nir_before_instr(instr);

   struct ntt_lower_tex_state s = {0};

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.i = MAX2(s.i, 2);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
   s.i = MAX2(s.i, 3);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);

   /* No need to pack undefs in unused channels of the tex instr */
   while (!s.channels[s.i - 1].def)
      s.i--;

   /* Instead of putting undefs in the unused slots of the vecs, just put in
    * another used channel.  Otherwise, we'll get unnecessary moves into
    * registers.
    */
   assert(s.channels[0].def != NULL);
   for (int i = 1; i < s.i; i++) {
      if (!s.channels[i].def)
         s.channels[i] = s.channels[0];
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec_scalars(b, s.channels, MIN2(s.i, 4))));
   if (s.i > 4)
      nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec_scalars(b, &s.channels[4], s.i - 4)));

   return true;
}
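/* Packing example (illustrative): a projected, shadow-compared 2D sample
 * gathers backend1 = vec4(coord.x, coord.y, comparator, projector), which is
 * exactly the layout ntt_emit_texture() later measures against
 * coord_components + is_shadow when deciding to emit TXP instead of TEX.
 */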
static bool
nir_to_tgsi_lower_tex(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_tex_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}
static void
ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
                    const struct nir_to_tgsi_options *ntt_options)
{
   const struct nir_shader_compiler_options *options = s->options;
   bool lower_fsqrt =
      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);

   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);

   if (!options->lower_extract_byte ||
       !options->lower_extract_word ||
       !options->lower_insert_byte ||
       !options->lower_insert_word ||
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
       !options->lower_rotate ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
       options->lower_fsqrt != lower_fsqrt ||
       options->force_indirect_unrolling != no_indirects_mask) {
      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
      *new_options = *s->options;

      new_options->lower_extract_byte = true;
      new_options->lower_extract_word = true;
      new_options->lower_insert_byte = true;
      new_options->lower_insert_word = true;
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
      new_options->lower_rotate = true;
      new_options->lower_uniforms_to_ubo = true;
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;
      new_options->force_indirect_unrolling = no_indirects_mask;

      s->options = new_options;
   }
}
static bool
ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
}

static nir_ssa_def *
ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
{
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *old_result = &intr->dest.ssa;
   intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;

   return nir_iadd_imm(b, old_result, -1);
}

static bool
ntt_lower_atomic_pre_dec(nir_shader *s)
{
   return nir_shader_lower_instructions(s,
                                        ntt_lower_atomic_pre_dec_filter,
                                        ntt_lower_atomic_pre_dec_lower, NULL);
}
/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
static void
nir_to_tgsi_lower_txp(nir_shader *s)
{
   nir_lower_tex_options lower_tex_options = {
      .lower_txp = 0,
   };

   nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;
         nir_tex_instr *tex = nir_instr_as_tex(instr);

         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
            continue;

         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;

         /* We can do TXP for any tex (not txg) where we can fit all the
          * coordinates and comparator and projector in one vec4 without any
          * other modifiers to add on.
          *
          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
          * if we get any funny projectors then we just blow them all away.
          */
         if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
      }
   }

   /* nir_lower_tex must be run even if no options are set, because we need the
    * LOD to be set for query_levels and for non-fragment shaders.
    */
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
}
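/* Decision examples for the loop above (illustrative): a plain textureProj()
 * on a sampler2D keeps its projector and later becomes TXP, while a projected
 * sample with an explicit LOD, an offset, or a 3-component coordinate plus a
 * comparator can't fit TXP's single vec4, so its sampler dim is added to
 * lower_txp and nir_lower_tex() emits the division instead.
 */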
static bool
nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
}

static nir_ssa_def *
nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
{
   nir_variable *var = *(nir_variable **)data;
   if (!var) {
      var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
      b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
      var->data.driver_location = b->shader->num_outputs++;

      *(nir_variable **)data = var;
   }

   nir_io_semantics semantics = {
      .location = var->data.location,
      .num_slots = 1
   };
   return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
                         .base = var->data.driver_location,
                         .io_semantics = semantics);
}

static bool
nir_lower_primid_sysval_to_input(nir_shader *s)
{
   nir_variable *input = NULL;

   return nir_shader_lower_instructions(s,
                                        nir_lower_primid_sysval_to_input_filter,
                                        nir_lower_primid_sysval_to_input_lower, &input);
}
const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   static const struct nir_to_tgsi_options default_ntt_options = {0};
   return nir_to_tgsi_options(s, screen, &default_ntt_options);
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *nir_to_tgsi_options(struct nir_shader *s,
                                struct pipe_screen *screen,
                                const struct nir_to_tgsi_options *options)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s, options);

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);
   NIR_PASS_V(s, nir_lower_regs_to_ssa);

   nir_to_tgsi_lower_txp(s);
   NIR_PASS_V(s, nir_to_tgsi_lower_tex);

   /* While TGSI can represent PRIMID as either an input or a system value,
    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
    * depend on that.
    */
   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);

   if (s->info.num_abos)
      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   nir_move_options move_all =
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

   NIR_PASS_V(s, nir_opt_move, move_all);

   /* Only lower 32-bit floats.  The only other modifier type officially
    * supported by TGSI is 32-bit integer negates, but even those are broken on
    * virglrenderer, so skip lowering all integer and f64 float mods.
    *
    * The options->lower_fabs requests that we not have native source modifiers
    * for fabs, and instead emit MAX(a,-a) for nir_op_fabs.
    */
   nir_lower_to_source_mods_flags source_mods = nir_lower_fneg_source_mods;
   if (!options->lower_fabs)
      source_mods |= nir_lower_fabs_source_mods;
   NIR_PASS_V(s, nir_lower_to_source_mods, source_mods);

   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);

   /* locals_to_regs will leave dead derefs that are good to clean up. */
   NIR_PASS_V(s, nir_lower_locals_to_regs);
   NIR_PASS_V(s, nir_opt_dce);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;
   c->options = options;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }

   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (NIR_DEBUG(TGSI)) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}
static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fdot_replicates = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
};

/* Returns the default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}