37
gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
38
struct radv_shader_info *info)
37
gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info)
40
39
switch (nir->info.stage) {
41
40
case MESA_SHADER_VERTEX: {
134
gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
135
struct radv_shader_info *info, bool consider_force_vrs)
132
gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info,
133
bool consider_force_vrs)
137
135
switch (instr->intrinsic) {
138
136
case nir_intrinsic_load_barycentric_sample:
170
168
info->ps.needs_sample_positions = true;
171
case nir_intrinsic_load_provoking_vtx_amd:
172
info->ps.load_provoking_vtx = true;
174
case nir_intrinsic_load_sample_positions_amd:
175
info->ps.needs_sample_positions = true;
177
case nir_intrinsic_load_rasterization_primitive_amd:
178
info->ps.load_rasterization_prim = true;
173
180
case nir_intrinsic_load_local_invocation_id:
174
181
case nir_intrinsic_load_workgroup_id: {
175
182
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
198
205
case nir_intrinsic_image_deref_load:
199
206
case nir_intrinsic_image_deref_sparse_load:
200
207
case nir_intrinsic_image_deref_store:
201
case nir_intrinsic_image_deref_atomic_add:
202
case nir_intrinsic_image_deref_atomic_imin:
203
case nir_intrinsic_image_deref_atomic_umin:
204
case nir_intrinsic_image_deref_atomic_imax:
205
case nir_intrinsic_image_deref_atomic_umax:
206
case nir_intrinsic_image_deref_atomic_and:
207
case nir_intrinsic_image_deref_atomic_or:
208
case nir_intrinsic_image_deref_atomic_xor:
209
case nir_intrinsic_image_deref_atomic_exchange:
210
case nir_intrinsic_image_deref_atomic_comp_swap:
211
case nir_intrinsic_image_deref_atomic_fmin:
212
case nir_intrinsic_image_deref_atomic_fmax:
208
case nir_intrinsic_image_deref_atomic:
209
case nir_intrinsic_image_deref_atomic_swap:
213
210
case nir_intrinsic_image_deref_size:
214
211
case nir_intrinsic_image_deref_samples: {
216
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
212
nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
217
213
mark_sampler_desc(var, info);
258
gather_info_block(const nir_shader *nir, const nir_block *block, struct radv_shader_info *info,
259
bool consider_force_vrs)
260
gather_info_block(const nir_shader *nir, const nir_block *block, struct radv_shader_info *info, bool consider_force_vrs)
261
262
nir_foreach_instr (instr, block) {
262
263
switch (instr->type) {
321
assign_outinfo_param(struct radv_vs_output_info *outinfo, gl_varying_slot idx,
322
unsigned *total_param_exports, unsigned extra_offset)
322
assign_outinfo_param(struct radv_vs_output_info *outinfo, gl_varying_slot idx, unsigned *total_param_exports,
323
unsigned extra_offset)
324
325
if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
325
326
outinfo->vs_output_param_offset[idx] = extra_offset + (*total_param_exports)++;
329
assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask,
330
unsigned *total_param_exports, unsigned extra_offset)
330
assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask, unsigned *total_param_exports,
331
unsigned extra_offset)
332
u_foreach_bit64(idx, mask) {
333
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER ||
334
idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT)
333
u_foreach_bit64 (idx, mask) {
334
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
335
idx == VARYING_SLOT_VIEWPORT)
335
336
assign_outinfo_param(outinfo, idx, total_param_exports, extra_offset);
340
radv_get_wave_size(struct radv_device *device, gl_shader_stage stage,
341
const struct radv_shader_info *info)
341
radv_get_wave_size(struct radv_device *device, gl_shader_stage stage, const struct radv_shader_info *info,
342
const struct radv_shader_stage_key *stage_key)
344
if (stage_key->subgroup_required_size)
345
return stage_key->subgroup_required_size * 32;
343
347
if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
345
349
else if (stage == MESA_SHADER_COMPUTE)
358
radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage,
359
const struct radv_shader_info *info)
362
radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage, const struct radv_shader_info *info,
363
const struct radv_shader_stage_key *stage_key)
361
if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
362
return info->cs.subgroup_size;
363
else if (gl_shader_stage_is_rt(stage))
364
return device->physical_device->rt_wave_size;
365
if (stage_key->subgroup_required_size)
366
return stage_key->subgroup_required_size * 32;
381
384
info->vs.vb_desc_usage_mask |= BITFIELD_BIT(key->vs.vertex_attribute_bindings[location]);
383
info->vs.input_slot_usage_mask |=
384
BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false));
386
info->vs.input_slot_usage_mask |= BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false));
385
387
} else if (glsl_type_is_matrix(type) || glsl_type_is_array(type)) {
386
388
const struct glsl_type *elem = glsl_get_array_element(type);
387
389
unsigned stride = glsl_count_attribute_slots(elem, false);
403
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
404
const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
405
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
406
struct radv_shader_info *info)
406
408
if (pipeline_key->vs.has_prolog && nir->info.inputs_read) {
407
409
info->vs.has_prolog = true;
418
420
info->vs.needs_base_instance |= info->vs.has_prolog;
419
421
info->vs.needs_draw_id |= info->vs.has_prolog;
421
nir_foreach_shader_in_variable(var, nir)
422
gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type,
423
nir_foreach_shader_in_variable (var, nir)
424
gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type, pipeline_key, info);
425
426
if (info->vs.dynamic_inputs)
426
427
info->vs.vb_desc_usage_mask = BITFIELD_MASK(util_last_bit(info->vs.vb_desc_usage_mask));
437
gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
438
const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
438
gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
439
struct radv_shader_info *info)
440
441
info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
444
445
info->num_tess_patches =
445
446
get_tcs_num_patches(pipeline_key->tcs.tess_input_vertices, nir->info.tess.tcs_vertices_out,
446
447
info->tcs.num_linked_inputs, info->tcs.num_linked_outputs,
447
info->tcs.num_linked_patch_outputs,
448
device->physical_device->hs.tess_offchip_block_dw_size,
449
device->physical_device->rad_info.gfx_level,
450
device->physical_device->rad_info.family);
448
info->tcs.num_linked_patch_outputs, device->physical_device->hs.tess_offchip_block_dw_size,
449
device->physical_device->rad_info.gfx_level, device->physical_device->rad_info.family);
452
451
/* LDS size used by VS+TCS for storing TCS inputs and outputs. */
453
452
info->tcs.num_lds_blocks =
454
calculate_tess_lds_size(device->physical_device->rad_info.gfx_level,
455
pipeline_key->tcs.tess_input_vertices,
456
nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
457
info->num_tess_patches, info->tcs.num_linked_outputs,
458
info->tcs.num_linked_patch_outputs);
453
calculate_tess_lds_size(device->physical_device->rad_info.gfx_level, pipeline_key->tcs.tess_input_vertices,
454
nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->num_tess_patches,
455
info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs);
480
477
info->gs.input_prim = nir->info.gs.input_primitive;
481
478
info->gs.output_prim = nir->info.gs.output_primitive;
482
479
info->gs.invocations = nir->info.gs.invocations;
483
info->gs.max_stream =
484
nir->info.gs.active_stream_mask ? util_last_bit(nir->info.gs.active_stream_mask) - 1 : 0;
480
info->gs.max_stream = nir->info.gs.active_stream_mask ? util_last_bit(nir->info.gs.active_stream_mask) - 1 : 0;
486
nir_foreach_shader_out_variable(var, nir) {
482
nir_foreach_shader_out_variable (var, nir) {
487
483
unsigned num_components = glsl_get_component_slots(var->type);
488
484
unsigned stream = var->data.stream;
534
530
ngg_info->prim_amp_factor = nir->info.mesh.max_primitives_out;
535
531
ngg_info->vgt_esgs_ring_itemsize = 1;
537
unsigned min_ngg_workgroup_size =
538
ac_compute_ngg_workgroup_size(ngg_info->hw_max_esverts, ngg_info->max_gsprims,
539
ngg_info->max_out_verts, ngg_info->prim_amp_factor);
533
unsigned min_ngg_workgroup_size = ac_compute_ngg_workgroup_size(ngg_info->hw_max_esverts, ngg_info->max_gsprims,
534
ngg_info->max_out_verts, ngg_info->prim_amp_factor);
541
unsigned api_workgroup_size =
542
ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
536
unsigned api_workgroup_size = ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
544
538
info->workgroup_size = MAX2(min_ngg_workgroup_size, api_workgroup_size);
566
560
info->ps.can_discard = nir->info.fs.uses_discard;
567
info->ps.early_fragment_test = nir->info.fs.early_fragment_tests ||
568
(nir->info.fs.early_and_late_fragment_tests &&
569
nir->info.fs.depth_layout == FRAG_DEPTH_LAYOUT_NONE &&
570
nir->info.fs.stencil_front_layout == FRAG_STENCIL_LAYOUT_NONE &&
571
nir->info.fs.stencil_back_layout == FRAG_STENCIL_LAYOUT_NONE);
561
info->ps.early_fragment_test =
562
nir->info.fs.early_fragment_tests ||
563
(nir->info.fs.early_and_late_fragment_tests && nir->info.fs.depth_layout == FRAG_DEPTH_LAYOUT_NONE &&
564
nir->info.fs.stencil_front_layout == FRAG_STENCIL_LAYOUT_NONE &&
565
nir->info.fs.stencil_back_layout == FRAG_STENCIL_LAYOUT_NONE);
572
566
info->ps.post_depth_coverage = nir->info.fs.post_depth_coverage;
573
567
info->ps.depth_layout = nir->info.fs.depth_layout;
574
568
info->ps.uses_sample_shading = nir->info.fs.uses_sample_shading;
585
579
info->ps.reads_frag_shading_rate = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_SHADING_RATE);
586
580
info->ps.reads_front_face = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
587
581
info->ps.reads_barycentric_model = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL);
588
info->ps.reads_fully_covered =
589
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FULLY_COVERED);
582
info->ps.reads_fully_covered = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FULLY_COVERED);
591
bool uses_persp_or_linear_interp = info->ps.reads_persp_center ||
592
info->ps.reads_persp_centroid ||
593
info->ps.reads_persp_sample ||
594
info->ps.reads_linear_center ||
595
info->ps.reads_linear_centroid ||
596
info->ps.reads_linear_sample;
584
bool uses_persp_or_linear_interp = info->ps.reads_persp_center || info->ps.reads_persp_centroid ||
585
info->ps.reads_persp_sample || info->ps.reads_linear_center ||
586
info->ps.reads_linear_centroid || info->ps.reads_linear_sample;
598
588
info->ps.allow_flat_shading =
599
!(uses_persp_or_linear_interp || info->ps.needs_sample_positions ||
600
info->ps.reads_frag_shading_rate || info->ps.writes_memory ||
601
nir->info.fs.needs_quad_helper_invocations ||
589
!(uses_persp_or_linear_interp || info->ps.needs_sample_positions || info->ps.reads_frag_shading_rate ||
590
info->ps.writes_memory || nir->info.fs.needs_quad_helper_invocations ||
602
591
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
603
592
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_POINT_COORD) ||
604
593
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
606
595
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) ||
607
596
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION));
598
info->ps.pops_is_per_sample =
599
info->ps.pops && (nir->info.fs.sample_interlock_ordered || nir->info.fs.sample_interlock_unordered);
609
601
info->ps.spi_ps_input = radv_compute_spi_ps_input(pipeline_key, info);
611
603
info->ps.has_epilog = pipeline_key->ps.has_epilog && info->ps.colors_written;
613
info->ps.writes_mrt0_alpha =
614
(pipeline_key->ps.alpha_to_coverage_via_mrtz && (info->ps.color0_written & 0x8)) &&
615
(info->ps.writes_z || info->ps.writes_stencil || info->ps.writes_sample_mask);
605
info->ps.writes_mrt0_alpha = (pipeline_key->ps.alpha_to_coverage_via_mrtz && (info->ps.color0_written & 0x8)) &&
606
(info->ps.writes_z || info->ps.writes_stencil || info->ps.writes_sample_mask);
617
608
info->ps.mrt0_is_dual_src = pipeline_key->ps.epilog.mrt0_is_dual_src;
619
610
info->ps.spi_shader_col_format = pipeline_key->ps.epilog.spi_shader_col_format;
621
nir_foreach_shader_in_variable(var, nir) {
622
unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
612
nir_foreach_shader_in_variable (var, nir) {
613
const struct glsl_type *type = var->data.per_vertex ? glsl_get_array_element(var->type) : var->type;
614
unsigned attrib_count = glsl_count_attribute_slots(type, false);
623
615
int idx = var->data.location;
635
627
unsigned component_count = var->data.location_frac + glsl_get_length(var->type);
636
628
attrib_count = (component_count + 3) / 4;
638
mark_16bit_ps_input(info, var->type, var->data.driver_location);
630
mark_16bit_ps_input(info, type, var->data.driver_location);
641
633
uint64_t mask = ((1ull << attrib_count) - 1);
645
637
info->ps.flat_shaded_mask |= mask << var->data.driver_location;
646
638
else if (var->data.interpolation == INTERP_MODE_EXPLICIT)
647
639
info->ps.explicit_shaded_mask |= mask << var->data.driver_location;
640
else if (var->data.per_vertex)
641
info->ps.per_vertex_shaded_mask |= mask << var->data.driver_location;
650
644
if (var->data.location >= VARYING_SLOT_VAR0) {
654
648
info->ps.input_mask |= mask << (var->data.location - VARYING_SLOT_VAR0);
652
/* Disable VRS and use the rates from PS_ITER_SAMPLES if:
654
* - The fragment shader reads gl_SampleMaskIn because the 16-bit sample coverage mask isn't enough for MSAA8x and
655
* 2x2 coarse shading.
656
* - On GFX10.3, if the fragment shader requests a fragment interlock execution mode even if the ordered section was
657
* optimized out, to consistently implement fragmentShadingRateWithFragmentShaderInterlock = VK_FALSE.
659
info->ps.force_sample_iter_shading_rate =
660
(info->ps.reads_sample_mask_in && !info->ps.needs_poly_line_smooth) ||
661
(device->physical_device->rad_info.gfx_level == GFX10_3 &&
662
(nir->info.fs.sample_interlock_ordered || nir->info.fs.sample_interlock_unordered ||
663
nir->info.fs.pixel_interlock_ordered || nir->info.fs.pixel_interlock_unordered));
665
/* DB_SHADER_CONTROL based on other fragment shader info fields. */
667
unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
668
if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
669
conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
670
else if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
671
conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
674
info->ps.early_fragment_test || !info->ps.writes_memory ? V_02880C_EARLY_Z_THEN_LATE_Z : V_02880C_LATE_Z;
676
/* It shouldn't be needed to export gl_SampleMask when MSAA is disabled, but this appears to break Project Cars
677
* (DXVK). See https://bugs.freedesktop.org/show_bug.cgi?id=109401
679
const bool mask_export_enable = info->ps.writes_sample_mask;
681
const bool disable_rbplus =
682
device->physical_device->rad_info.has_rbplus && !device->physical_device->rad_info.rbplus_allowed;
684
info->ps.db_shader_control =
685
S_02880C_Z_EXPORT_ENABLE(info->ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->ps.writes_stencil) |
686
S_02880C_KILL_ENABLE(info->ps.can_discard) | S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
687
S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
688
S_02880C_DEPTH_BEFORE_SHADER(info->ps.early_fragment_test) |
689
S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(info->ps.post_depth_coverage) |
690
S_02880C_EXEC_ON_HIER_FAIL(info->ps.writes_memory) | S_02880C_EXEC_ON_NOOP(info->ps.writes_memory) |
691
S_02880C_DUAL_QUAD_DISABLE(disable_rbplus) | S_02880C_PRIMITIVE_ORDERED_PIXEL_SHADER(info->ps.pops);
671
gather_shader_info_cs(struct radv_device *device, const nir_shader *nir,
672
const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
706
gather_shader_info_cs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
707
struct radv_shader_info *info)
674
709
info->cs.uses_ray_launch_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_RAY_LAUNCH_SIZE_ADDR_AMD);
676
unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
677
unsigned req_subgroup_size = subgroup_size;
678
bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
680
711
unsigned default_wave_size = device->physical_device->cs_wave_size;
681
712
if (info->cs.uses_rt)
682
713
default_wave_size = device->physical_device->rt_wave_size;
685
subgroup_size = default_wave_size;
687
unsigned local_size =
688
nir->info.workgroup_size[0] * nir->info.workgroup_size[1] * nir->info.workgroup_size[2];
715
unsigned local_size = nir->info.workgroup_size[0] * nir->info.workgroup_size[1] * nir->info.workgroup_size[2];
690
717
/* Games don't always request full subgroups when they should, which can cause bugs if cswave32
693
if (default_wave_size == 32 && nir->info.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
694
local_size % RADV_SUBGROUP_SIZE == 0)
695
require_full_subgroups = true;
697
if (require_full_subgroups && !req_subgroup_size) {
698
/* don't use wave32 pretending to be wave64 */
699
subgroup_size = RADV_SUBGROUP_SIZE;
720
const bool require_full_subgroups =
721
pipeline_key->stage_info[MESA_SHADER_COMPUTE].subgroup_require_full ||
722
(default_wave_size == 32 && nir->info.uses_wide_subgroup_intrinsics && local_size % RADV_SUBGROUP_SIZE == 0);
724
const unsigned required_subgroup_size = pipeline_key->stage_info[MESA_SHADER_COMPUTE].subgroup_required_size * 32;
726
if (required_subgroup_size) {
727
info->cs.subgroup_size = required_subgroup_size;
728
} else if (require_full_subgroups) {
729
info->cs.subgroup_size = RADV_SUBGROUP_SIZE;
730
} else if (device->physical_device->rad_info.gfx_level >= GFX10 && local_size <= 32) {
731
/* Use wave32 for small workgroups. */
732
info->cs.subgroup_size = 32;
734
info->cs.subgroup_size = default_wave_size;
702
info->cs.subgroup_size = subgroup_size;
704
737
if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
705
info->cs.regalloc_hang_bug =
706
info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2] > 256;
738
info->cs.regalloc_hang_bug = info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2] > 256;
726
758
/* Task->Mesh dispatch is linear when Y = Z = 1.
727
759
* GFX11 CP can optimize this case with a field in its draw packets.
729
info->cs.linear_taskmesh_dispatch = nir->info.mesh.ts_mesh_dispatch_dimensions[1] == 1 &&
730
nir->info.mesh.ts_mesh_dispatch_dimensions[2] == 1;
761
info->cs.linear_taskmesh_dispatch =
762
nir->info.mesh.ts_mesh_dispatch_dimensions[1] == 1 && nir->info.mesh.ts_mesh_dispatch_dimensions[2] == 1;
767
799
assert(info->stage != MESA_SHADER_MESH);
768
800
return R_00B130_SPI_SHADER_USER_DATA_VS_0;
769
801
case MESA_SHADER_TESS_CTRL:
770
return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
771
: R_00B430_SPI_SHADER_USER_DATA_HS_0;
802
return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : R_00B430_SPI_SHADER_USER_DATA_HS_0;
772
803
case MESA_SHADER_GEOMETRY:
773
return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
774
: R_00B230_SPI_SHADER_USER_DATA_GS_0;
804
return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B230_SPI_SHADER_USER_DATA_GS_0;
775
805
case MESA_SHADER_FRAGMENT:
776
806
return R_00B030_SPI_SHADER_USER_DATA_PS_0;
777
807
case MESA_SHADER_COMPUTE:
778
808
case MESA_SHADER_TASK:
779
809
case MESA_SHADER_RAYGEN:
810
case MESA_SHADER_CALLABLE:
811
case MESA_SHADER_CLOSEST_HIT:
812
case MESA_SHADER_MISS:
813
case MESA_SHADER_INTERSECTION:
814
case MESA_SHADER_ANY_HIT:
780
815
return R_00B900_COMPUTE_USER_DATA_0;
782
817
unreachable("invalid shader stage");
796
radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir,
797
gl_shader_stage next_stage,
798
const struct radv_pipeline_layout *layout,
799
const struct radv_pipeline_key *pipeline_key,
800
const enum radv_pipeline_type pipeline_type,
801
bool consider_force_vrs,
831
radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir, gl_shader_stage next_stage,
832
const struct radv_pipeline_layout *layout, const struct radv_pipeline_key *pipeline_key,
833
const enum radv_pipeline_type pipeline_type, bool consider_force_vrs,
802
834
struct radv_shader_info *info)
804
836
info->stage = nir->info.stage;
828
860
uint64_t special_mask = BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) |
829
861
BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) |
830
862
BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
831
uint64_t per_prim_mask =
832
nir->info.outputs_written & nir->info.per_primitive_outputs & ~special_mask;
833
uint64_t per_vtx_mask =
834
nir->info.outputs_written & ~nir->info.per_primitive_outputs & ~special_mask;
863
uint64_t per_prim_mask = nir->info.outputs_written & nir->info.per_primitive_outputs & ~special_mask;
864
uint64_t per_vtx_mask = nir->info.outputs_written & ~nir->info.per_primitive_outputs & ~special_mask;
836
866
/* Mesh multivew is only lowered in ac_nir_lower_ngg, so we have to fake it here. */
837
867
if (nir->info.stage == MESA_SHADER_MESH && pipeline_key->has_multiview_view_index) {
873
903
outinfo->pos_exports = util_bitcount(pos_written);
875
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
876
sizeof(outinfo->vs_output_param_offset));
905
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset));
878
907
unsigned total_param_exports = 0;
948
info->wave_size = radv_get_wave_size(device, nir->info.stage, info);
949
info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info);
977
const struct radv_shader_stage_key *stage_key = &pipeline_key->stage_info[nir->info.stage];
978
info->wave_size = radv_get_wave_size(device, nir->info.stage, info, stage_key);
979
info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info, stage_key);
951
981
switch (nir->info.stage) {
952
982
case MESA_SHADER_COMPUTE:
953
983
case MESA_SHADER_TASK:
954
info->workgroup_size =
955
ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
984
info->workgroup_size = ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
957
986
/* Allow the compiler to assume that the shader always has full subgroups,
958
987
* meaning that the initial EXEC mask is -1 in all waves (all lanes enabled).
959
988
* This assumption is incorrect for ray tracing and internal (meta) shaders
960
989
* because they can use unaligned dispatch.
962
info->cs.uses_full_subgroups =
963
pipeline_type != RADV_PIPELINE_RAY_TRACING &&
964
!nir->info.internal &&
965
(info->workgroup_size % info->wave_size) == 0;
991
info->cs.uses_full_subgroups = pipeline_type != RADV_PIPELINE_RAY_TRACING && !nir->info.internal &&
992
(info->workgroup_size % info->wave_size) == 0;
967
994
case MESA_SHADER_MESH:
968
995
/* Already computed in gather_shader_info_mesh(). */
1020
1047
struct radv_legacy_gs_info *out = &gs_stage->info.gs_ring_info;
1022
1049
const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
1023
const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
1024
gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
1050
const bool uses_adjacency =
1051
gs_info->gs.input_prim == MESA_PRIM_LINES_ADJACENCY || gs_info->gs.input_prim == MESA_PRIM_TRIANGLES_ADJACENCY;
1026
1053
/* All these are in dwords: */
1027
1054
/* We can't allow using the whole LDS, because GS waves compete with
1113
1139
out->vgt_esgs_ring_itemsize = esgs_itemsize;
1114
1140
assert(max_prims_per_subgroup <= max_out_prims);
1116
unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
1117
es_verts_per_subgroup, gs_inst_prims_in_subgroup);
1142
unsigned workgroup_size =
1143
ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size, es_verts_per_subgroup, gs_inst_prims_in_subgroup);
1118
1144
es_info->workgroup_size = workgroup_size;
1119
1145
gs_info->workgroup_size = workgroup_size;
1161
1184
if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
1162
1185
if (es_stage->nir->info.tess.point_mode)
1163
return SHADER_PRIM_POINTS;
1186
return MESA_PRIM_POINTS;
1164
1187
if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
1165
return SHADER_PRIM_LINES;
1166
return SHADER_PRIM_TRIANGLES;
1188
return MESA_PRIM_LINES;
1189
return MESA_PRIM_TRIANGLES;
1169
return SHADER_PRIM_TRIANGLES;
1192
return MESA_PRIM_TRIANGLES;
1183
1206
const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
1185
1208
const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
1186
const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
1187
input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
1209
const bool uses_adjacency = input_prim == MESA_PRIM_LINES_ADJACENCY || input_prim == MESA_PRIM_TRIANGLES_ADJACENCY;
1189
1211
/* All these are in dwords: */
1190
1212
/* We can't allow using the whole LDS, because GS waves compete with
1202
1224
/* All these are per subgroup: */
1203
1225
const unsigned min_esverts = gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */
1204
gfx_level >= GFX10_3 ? 29 : 24;
1226
gfx_level >= GFX10_3 ? 29
1205
1228
bool max_vert_out_per_gs_instance = false;
1206
1229
unsigned max_esverts_base = 128;
1207
1230
unsigned max_gsprims_base = 128; /* default prim group size clamp */
1305
1328
max_esverts = align(max_esverts, wavesize);
1306
1329
max_esverts = MIN2(max_esverts, max_esverts_base);
1307
1330
if (esvert_lds_size)
1309
MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
1331
max_esverts = MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
1310
1332
max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
1312
1334
/* Hardware restriction: minimum value of max_esverts */
1325
1347
* for triangles.
1327
1349
unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
1328
max_gsprims = MIN2(max_gsprims,
1329
(max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
1350
max_gsprims = MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
1331
1352
clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
1332
1353
assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
1386
1406
assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
1388
1408
unsigned workgroup_size =
1389
ac_compute_ngg_workgroup_size(
1390
max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
1409
ac_compute_ngg_workgroup_size(max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
1391
1410
if (gs_stage) {
1392
1411
gs_info->workgroup_size = workgroup_size;
1398
1417
gfx10_get_ngg_query_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
1399
struct radv_pipeline_stage *gs_stage,
1400
const struct radv_pipeline_key *pipeline_key)
1418
struct radv_pipeline_stage *gs_stage, const struct radv_pipeline_key *pipeline_key)
1402
1420
struct radv_shader_info *info = gs_stage ? &gs_stage->info : &es_stage->info;
1404
info->gs.has_ngg_pipeline_stat_query =
1405
device->physical_device->emulate_ngg_gs_query_pipeline_stat && !!gs_stage;
1422
info->gs.has_ngg_pipeline_stat_query = device->physical_device->emulate_ngg_gs_query_pipeline_stat && !!gs_stage;
1406
1423
info->has_ngg_xfb_query = gs_stage ? !!gs_stage->nir->xfb_info : !!es_stage->nir->xfb_info;
1407
1424
info->has_ngg_prim_query = pipeline_key->primitives_generated_query || info->has_ngg_xfb_query;
1411
1428
radv_determine_ngg_settings(struct radv_device *device, struct radv_pipeline_stage *es_stage,
1412
struct radv_pipeline_stage *fs_stage,
1413
const struct radv_pipeline_key *pipeline_key)
1429
struct radv_pipeline_stage *fs_stage, const struct radv_pipeline_key *pipeline_key)
1415
1431
assert(es_stage->stage == MESA_SHADER_VERTEX || es_stage->stage == MESA_SHADER_TESS_EVAL);
1416
assert(fs_stage->stage == MESA_SHADER_FRAGMENT);
1432
assert(!fs_stage || fs_stage->stage == MESA_SHADER_FRAGMENT);
1418
uint64_t ps_inputs_read = fs_stage->nir->info.inputs_read;
1434
uint64_t ps_inputs_read = fs_stage ? fs_stage->nir->info.inputs_read : 0;
1420
1436
unsigned num_vertices_per_prim = 0;
1421
1437
if (es_stage->stage == MESA_SHADER_VERTEX) {
1422
1438
num_vertices_per_prim = radv_get_num_vertices_per_prim(pipeline_key);
1423
1439
} else if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
1424
num_vertices_per_prim = es_stage->nir->info.tess.point_mode ? 1 :
1425
es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2 : 3;
1440
num_vertices_per_prim = es_stage->nir->info.tess.point_mode ? 1
1441
: es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2
1428
1445
/* TODO: Enable culling for LLVM. */
1429
es_stage->info.has_ngg_culling =
1430
radv_consider_culling(device->physical_device, es_stage->nir, ps_inputs_read,
1431
num_vertices_per_prim, &es_stage->info) &&
1432
!radv_use_llvm_for_stage(device, es_stage->stage);
1446
es_stage->info.has_ngg_culling = radv_consider_culling(device->physical_device, es_stage->nir, ps_inputs_read,
1447
num_vertices_per_prim, &es_stage->info) &&
1448
!radv_use_llvm_for_stage(device, es_stage->stage);
1434
1450
nir_function_impl *impl = nir_shader_get_entrypoint(es_stage->nir);
1435
1451
es_stage->info.has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
1437
1453
/* NGG passthrough mode should be disabled when culling and when the vertex shader
1438
1454
* exports the primitive ID.
1440
es_stage->info.is_ngg_passthrough = !es_stage->info.has_ngg_culling &&
1441
!(es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id);
1456
es_stage->info.is_ngg_passthrough = !es_stage->info.has_ngg_culling && !(es_stage->stage == MESA_SHADER_VERTEX &&
1457
es_stage->info.outinfo.export_prim_id);
1445
radv_link_shaders_info(struct radv_device *device,
1446
struct radv_pipeline_stage *producer, struct radv_pipeline_stage *consumer,
1447
const struct radv_pipeline_key *pipeline_key)
1461
radv_link_shaders_info(struct radv_device *device, struct radv_pipeline_stage *producer,
1462
struct radv_pipeline_stage *consumer, const struct radv_pipeline_key *pipeline_key)
1449
1464
/* Export primitive ID and clip/cull distances if read by the FS, or export unconditionally when
1450
1465
* the next stage is unknown (with graphics pipeline library).
1452
if (!consumer || consumer->stage == MESA_SHADER_FRAGMENT) {
1467
if ((consumer && consumer->stage == MESA_SHADER_FRAGMENT) ||
1468
!(pipeline_key->lib_flags & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1453
1469
struct radv_vs_output_info *outinfo = &producer->info.outinfo;
1454
1470
const bool ps_prim_id_in = !consumer || consumer->info.ps.prim_id_input;
1455
1471
const bool ps_clip_dists_in = !consumer || !!consumer->info.ps.num_input_clips_culls;
1457
if (ps_prim_id_in &&
1458
(producer->stage == MESA_SHADER_VERTEX || producer->stage == MESA_SHADER_TESS_EVAL)) {
1473
if (ps_prim_id_in && (producer->stage == MESA_SHADER_VERTEX || producer->stage == MESA_SHADER_TESS_EVAL)) {
1459
1474
/* Mark the primitive ID as output when it's implicitly exported by VS or TES. */
1460
1475
if (outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED)
1461
1476
outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
1488
1503
/* Compute the ESGS item size for VS or TES as ES. */
1489
1504
producer->info.esgs_itemsize = num_outputs_written * 16;
1506
/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
1507
* conflicts, i.e. each vertex will start on a different bank.
1509
if (device->physical_device->rad_info.gfx_level >= GFX9 && producer->info.esgs_itemsize)
1510
producer->info.esgs_itemsize += 4;
1492
1513
/* Compute NGG info (GFX10+) or GS info. */
1493
1514
if (producer->info.is_ngg) {
1494
struct radv_pipeline_stage *gs_stage =
1495
consumer && consumer->stage == MESA_SHADER_GEOMETRY ? consumer : NULL;
1515
struct radv_pipeline_stage *gs_stage = consumer && consumer->stage == MESA_SHADER_GEOMETRY ? consumer : NULL;
1497
1517
gfx10_get_ngg_info(device, producer, gs_stage);
1498
1518
gfx10_get_ngg_query_info(device, producer, gs_stage, pipeline_key);
1500
1520
/* Determine other NGG settings like culling for VS or TES without GS. */
1501
if (!gs_stage && consumer) {
1502
1522
radv_determine_ngg_settings(device, producer, consumer, pipeline_key);
1504
1524
} else if (consumer && consumer->stage == MESA_SHADER_GEOMETRY) {
1520
1539
vs_stage->info.workgroup_size = 256;
1521
1540
tcs_stage->info.workgroup_size = 256;
1523
vs_stage->info.workgroup_size =
1524
ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level,
1525
MESA_SHADER_VERTEX, tcs_stage->info.num_tess_patches,
1526
pipeline_key->tcs.tess_input_vertices,
1527
tcs_stage->info.tcs.tcs_vertices_out);
1542
vs_stage->info.workgroup_size = ac_compute_lshs_workgroup_size(
1543
device->physical_device->rad_info.gfx_level, MESA_SHADER_VERTEX, tcs_stage->info.num_tess_patches,
1544
pipeline_key->tcs.tess_input_vertices, tcs_stage->info.tcs.tcs_vertices_out);
1529
tcs_stage->info.workgroup_size =
1530
ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level,
1531
MESA_SHADER_TESS_CTRL, tcs_stage->info.num_tess_patches,
1532
pipeline_key->tcs.tess_input_vertices,
1533
tcs_stage->info.tcs.tcs_vertices_out);
1546
tcs_stage->info.workgroup_size = ac_compute_lshs_workgroup_size(
1547
device->physical_device->rad_info.gfx_level, MESA_SHADER_TESS_CTRL, tcs_stage->info.num_tess_patches,
1548
pipeline_key->tcs.tess_input_vertices, tcs_stage->info.tcs.tcs_vertices_out);
1535
1550
if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
1536
1551
/* When the number of TCS input and output vertices are the same (typically 3):
1545
1560
vs_stage->info.vs.tcs_in_out_eq =
1546
1561
device->physical_device->rad_info.gfx_level >= GFX9 &&
1547
1562
pipeline_key->tcs.tess_input_vertices == tcs_stage->info.tcs.tcs_vertices_out &&
1548
vs_stage->nir->info.float_controls_execution_mode ==
1549
tcs_stage->nir->info.float_controls_execution_mode;
1563
vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode;
1551
1565
if (vs_stage->info.vs.tcs_in_out_eq)
1552
1566
vs_stage->info.vs.tcs_temp_only_input_mask =
1553
tcs_stage->nir->info.inputs_read &
1554
vs_stage->nir->info.outputs_written &
1567
tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written &
1555
1568
~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read &
1556
~tcs_stage->nir->info.inputs_read_indirectly &
1557
~vs_stage->nir->info.outputs_accessed_indirectly;
1569
~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly;
1605
1617
dst_info->tes = src_info->tes;
1608
if (dst->stage == MESA_SHADER_GEOMETRY) {
1609
dst_info->is_ngg = src_info->is_ngg;
1620
if (dst->stage == MESA_SHADER_GEOMETRY)
1610
1621
dst_info->gs.es_type = src->stage;
1614
1624
static const gl_shader_stage graphics_shader_order[] = {
1616
MESA_SHADER_TESS_CTRL,
1617
MESA_SHADER_TESS_EVAL,
1618
MESA_SHADER_GEOMETRY,
1625
MESA_SHADER_VERTEX, MESA_SHADER_TESS_CTRL, MESA_SHADER_TESS_EVAL, MESA_SHADER_GEOMETRY,
1627
MESA_SHADER_TASK, MESA_SHADER_MESH,
1626
1632
struct radv_pipeline_stage *stages)
1628
1634
/* Walk backwards to link */
1629
struct radv_pipeline_stage *next_stage =
1630
stages[MESA_SHADER_FRAGMENT].nir ? &stages[MESA_SHADER_FRAGMENT] : NULL;
1635
struct radv_pipeline_stage *next_stage = stages[MESA_SHADER_FRAGMENT].nir ? &stages[MESA_SHADER_FRAGMENT] : NULL;
1632
1637
for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) {
1633
1638
gl_shader_stage s = graphics_shader_order[i];
1647
1652
/* Merge shader info for VS+GS or TES+GS. */
1648
1653
if (stages[MESA_SHADER_GEOMETRY].nir) {
1649
gl_shader_stage pre_stage =
1650
stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
1654
gl_shader_stage pre_stage = stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
1652
1656
radv_nir_shader_info_merge(&stages[pre_stage], &stages[MESA_SHADER_GEOMETRY]);