~mmach/netext73/mesa-ryzen

« back to all changes in this revision

Viewing changes to src/amd/vulkan/radv_shader_info.c

  • Committer: mmach
  • Date: 2023-11-02 21:31:35 UTC
  • Revision ID: netbit73@gmail.com-20231102213135-18d4tzh7tj0uz752
2023-11-02 22:11:57

Show diffs side-by-side

added added

removed removed

Lines of Context:
34
34
}
35
35
 
36
36
static void
37
 
gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
38
 
                                 struct radv_shader_info *info)
 
37
gather_intrinsic_load_input_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info)
39
38
{
40
39
   switch (nir->info.stage) {
41
40
   case MESA_SHADER_VERTEX: {
112
111
}
113
112
 
114
113
static void
115
 
gather_push_constant_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
116
 
                          struct radv_shader_info *info)
 
114
gather_push_constant_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info)
117
115
{
118
116
   info->loads_push_constants = true;
119
117
 
131
129
}
132
130
 
133
131
static void
134
 
gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
135
 
                      struct radv_shader_info *info, bool consider_force_vrs)
 
132
gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, struct radv_shader_info *info,
 
133
                      bool consider_force_vrs)
136
134
{
137
135
   switch (instr->intrinsic) {
138
136
   case nir_intrinsic_load_barycentric_sample:
170
168
         info->ps.needs_sample_positions = true;
171
169
      break;
172
170
   }
 
171
   case nir_intrinsic_load_provoking_vtx_amd:
 
172
      info->ps.load_provoking_vtx = true;
 
173
      break;
 
174
   case nir_intrinsic_load_sample_positions_amd:
 
175
      info->ps.needs_sample_positions = true;
 
176
      break;
 
177
   case nir_intrinsic_load_rasterization_primitive_amd:
 
178
      info->ps.load_rasterization_prim = true;
 
179
      break;
173
180
   case nir_intrinsic_load_local_invocation_id:
174
181
   case nir_intrinsic_load_workgroup_id: {
175
182
      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
198
205
   case nir_intrinsic_image_deref_load:
199
206
   case nir_intrinsic_image_deref_sparse_load:
200
207
   case nir_intrinsic_image_deref_store:
201
 
   case nir_intrinsic_image_deref_atomic_add:
202
 
   case nir_intrinsic_image_deref_atomic_imin:
203
 
   case nir_intrinsic_image_deref_atomic_umin:
204
 
   case nir_intrinsic_image_deref_atomic_imax:
205
 
   case nir_intrinsic_image_deref_atomic_umax:
206
 
   case nir_intrinsic_image_deref_atomic_and:
207
 
   case nir_intrinsic_image_deref_atomic_or:
208
 
   case nir_intrinsic_image_deref_atomic_xor:
209
 
   case nir_intrinsic_image_deref_atomic_exchange:
210
 
   case nir_intrinsic_image_deref_atomic_comp_swap:
211
 
   case nir_intrinsic_image_deref_atomic_fmin:
212
 
   case nir_intrinsic_image_deref_atomic_fmax:
 
208
   case nir_intrinsic_image_deref_atomic:
 
209
   case nir_intrinsic_image_deref_atomic_swap:
213
210
   case nir_intrinsic_image_deref_size:
214
211
   case nir_intrinsic_image_deref_samples: {
215
 
      nir_variable *var =
216
 
         nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
 
212
      nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
217
213
      mark_sampler_desc(var, info);
218
214
      break;
219
215
   }
232
228
   case nir_intrinsic_bvh64_intersect_ray_amd:
233
229
      info->cs.uses_rt = true;
234
230
      break;
 
231
   case nir_intrinsic_load_poly_line_smooth_enabled:
 
232
      info->ps.needs_poly_line_smooth = true;
 
233
      break;
 
234
   case nir_intrinsic_begin_invocation_interlock:
 
235
      info->ps.pops = true;
 
236
      break;
235
237
   default:
236
238
      break;
237
239
   }
255
257
}
256
258
 
257
259
static void
258
 
gather_info_block(const nir_shader *nir, const nir_block *block, struct radv_shader_info *info,
259
 
                  bool consider_force_vrs)
 
260
gather_info_block(const nir_shader *nir, const nir_block *block, struct radv_shader_info *info, bool consider_force_vrs)
260
261
{
261
262
   nir_foreach_instr (instr, block) {
262
263
      switch (instr->type) {
318
319
}
319
320
 
320
321
static void
321
 
assign_outinfo_param(struct radv_vs_output_info *outinfo, gl_varying_slot idx,
322
 
                     unsigned *total_param_exports, unsigned extra_offset)
 
322
assign_outinfo_param(struct radv_vs_output_info *outinfo, gl_varying_slot idx, unsigned *total_param_exports,
 
323
                     unsigned extra_offset)
323
324
{
324
325
   if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
325
326
      outinfo->vs_output_param_offset[idx] = extra_offset + (*total_param_exports)++;
326
327
}
327
328
 
328
329
static void
329
 
assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask,
330
 
                      unsigned *total_param_exports, unsigned extra_offset)
 
330
assign_outinfo_params(struct radv_vs_output_info *outinfo, uint64_t mask, unsigned *total_param_exports,
 
331
                      unsigned extra_offset)
331
332
{
332
 
   u_foreach_bit64(idx, mask) {
333
 
      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER ||
334
 
          idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT)
 
333
   u_foreach_bit64 (idx, mask) {
 
334
      if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
 
335
          idx == VARYING_SLOT_VIEWPORT)
335
336
         assign_outinfo_param(outinfo, idx, total_param_exports, extra_offset);
336
337
   }
337
338
}
338
339
 
339
340
static uint8_t
340
 
radv_get_wave_size(struct radv_device *device,  gl_shader_stage stage,
341
 
                   const struct radv_shader_info *info)
 
341
radv_get_wave_size(struct radv_device *device, gl_shader_stage stage, const struct radv_shader_info *info,
 
342
                   const struct radv_shader_stage_key *stage_key)
342
343
{
 
344
   if (stage_key->subgroup_required_size)
 
345
      return stage_key->subgroup_required_size * 32;
 
346
 
343
347
   if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
344
348
      return 64;
345
349
   else if (stage == MESA_SHADER_COMPUTE)
355
359
}
356
360
 
357
361
static uint8_t
358
 
radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage,
359
 
                         const struct radv_shader_info *info)
 
362
radv_get_ballot_bit_size(struct radv_device *device, gl_shader_stage stage, const struct radv_shader_info *info,
 
363
                         const struct radv_shader_stage_key *stage_key)
360
364
{
361
 
   if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
362
 
      return info->cs.subgroup_size;
363
 
   else if (gl_shader_stage_is_rt(stage))
364
 
      return device->physical_device->rt_wave_size;
 
365
   if (stage_key->subgroup_required_size)
 
366
      return stage_key->subgroup_required_size * 32;
 
367
 
365
368
   return 64;
366
369
}
367
370
 
380
383
      else
381
384
         info->vs.vb_desc_usage_mask |= BITFIELD_BIT(key->vs.vertex_attribute_bindings[location]);
382
385
 
383
 
      info->vs.input_slot_usage_mask |=
384
 
         BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false));
 
386
      info->vs.input_slot_usage_mask |= BITFIELD_RANGE(location, glsl_count_attribute_slots(type, false));
385
387
   } else if (glsl_type_is_matrix(type) || glsl_type_is_array(type)) {
386
388
      const struct glsl_type *elem = glsl_get_array_element(type);
387
389
      unsigned stride = glsl_count_attribute_slots(elem, false);
400
402
}
401
403
 
402
404
static void
403
 
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
404
 
                      const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
 
405
gather_shader_info_vs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
 
406
                      struct radv_shader_info *info)
405
407
{
406
408
   if (pipeline_key->vs.has_prolog && nir->info.inputs_read) {
407
409
      info->vs.has_prolog = true;
409
411
   }
410
412
 
411
413
   /* Use per-attribute vertex descriptors to prevent faults and for correct bounds checking. */
412
 
   info->vs.use_per_attribute_vb_descs = device->robust_buffer_access || info->vs.dynamic_inputs;
 
414
   info->vs.use_per_attribute_vb_descs = pipeline_key->vertex_robustness1 || info->vs.dynamic_inputs;
413
415
 
414
416
   /* We have to ensure consistent input register assignments between the main shader and the
415
417
    * prolog.
418
420
   info->vs.needs_base_instance |= info->vs.has_prolog;
419
421
   info->vs.needs_draw_id |= info->vs.has_prolog;
420
422
 
421
 
   nir_foreach_shader_in_variable(var, nir)
422
 
      gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type,
423
 
                                pipeline_key, info);
 
423
   nir_foreach_shader_in_variable (var, nir)
 
424
      gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type, pipeline_key, info);
424
425
 
425
426
   if (info->vs.dynamic_inputs)
426
427
      info->vs.vb_desc_usage_mask = BITFIELD_MASK(util_last_bit(info->vs.vb_desc_usage_mask));
434
435
}
435
436
 
436
437
static void
437
 
gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,
438
 
                       const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
 
438
gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
 
439
                       struct radv_shader_info *info)
439
440
{
440
441
   info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
441
442
 
444
445
      info->num_tess_patches =
445
446
         get_tcs_num_patches(pipeline_key->tcs.tess_input_vertices, nir->info.tess.tcs_vertices_out,
446
447
                             info->tcs.num_linked_inputs, info->tcs.num_linked_outputs,
447
 
                             info->tcs.num_linked_patch_outputs,
448
 
                             device->physical_device->hs.tess_offchip_block_dw_size,
449
 
                             device->physical_device->rad_info.gfx_level,
450
 
                             device->physical_device->rad_info.family);
 
448
                             info->tcs.num_linked_patch_outputs, device->physical_device->hs.tess_offchip_block_dw_size,
 
449
                             device->physical_device->rad_info.gfx_level, device->physical_device->rad_info.family);
451
450
 
452
451
      /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
453
452
      info->tcs.num_lds_blocks =
454
 
         calculate_tess_lds_size(device->physical_device->rad_info.gfx_level,
455
 
                                 pipeline_key->tcs.tess_input_vertices,
456
 
                                 nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
457
 
                                 info->num_tess_patches, info->tcs.num_linked_outputs,
458
 
                                 info->tcs.num_linked_patch_outputs);
 
453
         calculate_tess_lds_size(device->physical_device->rad_info.gfx_level, pipeline_key->tcs.tess_input_vertices,
 
454
                                 nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->num_tess_patches,
 
455
                                 info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs);
459
456
   }
460
457
}
461
458
 
480
477
   info->gs.input_prim = nir->info.gs.input_primitive;
481
478
   info->gs.output_prim = nir->info.gs.output_primitive;
482
479
   info->gs.invocations = nir->info.gs.invocations;
483
 
   info->gs.max_stream =
484
 
      nir->info.gs.active_stream_mask ? util_last_bit(nir->info.gs.active_stream_mask) - 1 : 0;
 
480
   info->gs.max_stream = nir->info.gs.active_stream_mask ? util_last_bit(nir->info.gs.active_stream_mask) - 1 : 0;
485
481
 
486
 
   nir_foreach_shader_out_variable(var, nir) {
 
482
   nir_foreach_shader_out_variable (var, nir) {
487
483
      unsigned num_components = glsl_get_component_slots(var->type);
488
484
      unsigned stream = var->data.stream;
489
485
 
534
530
   ngg_info->prim_amp_factor = nir->info.mesh.max_primitives_out;
535
531
   ngg_info->vgt_esgs_ring_itemsize = 1;
536
532
 
537
 
   unsigned min_ngg_workgroup_size =
538
 
      ac_compute_ngg_workgroup_size(ngg_info->hw_max_esverts, ngg_info->max_gsprims,
539
 
                                    ngg_info->max_out_verts, ngg_info->prim_amp_factor);
 
533
   unsigned min_ngg_workgroup_size = ac_compute_ngg_workgroup_size(ngg_info->hw_max_esverts, ngg_info->max_gsprims,
 
534
                                                                   ngg_info->max_out_verts, ngg_info->prim_amp_factor);
540
535
 
541
 
   unsigned api_workgroup_size =
542
 
      ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
 
536
   unsigned api_workgroup_size = ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
543
537
 
544
538
   info->workgroup_size = MAX2(min_ngg_workgroup_size, api_workgroup_size);
545
539
}
564
558
   }
565
559
 
566
560
   info->ps.can_discard = nir->info.fs.uses_discard;
567
 
   info->ps.early_fragment_test = nir->info.fs.early_fragment_tests ||
568
 
                                  (nir->info.fs.early_and_late_fragment_tests &&
569
 
                                   nir->info.fs.depth_layout == FRAG_DEPTH_LAYOUT_NONE &&
570
 
                                   nir->info.fs.stencil_front_layout == FRAG_STENCIL_LAYOUT_NONE &&
571
 
                                   nir->info.fs.stencil_back_layout == FRAG_STENCIL_LAYOUT_NONE);
 
561
   info->ps.early_fragment_test =
 
562
      nir->info.fs.early_fragment_tests ||
 
563
      (nir->info.fs.early_and_late_fragment_tests && nir->info.fs.depth_layout == FRAG_DEPTH_LAYOUT_NONE &&
 
564
       nir->info.fs.stencil_front_layout == FRAG_STENCIL_LAYOUT_NONE &&
 
565
       nir->info.fs.stencil_back_layout == FRAG_STENCIL_LAYOUT_NONE);
572
566
   info->ps.post_depth_coverage = nir->info.fs.post_depth_coverage;
573
567
   info->ps.depth_layout = nir->info.fs.depth_layout;
574
568
   info->ps.uses_sample_shading = nir->info.fs.uses_sample_shading;
585
579
   info->ps.reads_frag_shading_rate = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_SHADING_RATE);
586
580
   info->ps.reads_front_face = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
587
581
   info->ps.reads_barycentric_model = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL);
588
 
   info->ps.reads_fully_covered =
589
 
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FULLY_COVERED);
 
582
   info->ps.reads_fully_covered = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FULLY_COVERED);
590
583
 
591
 
   bool uses_persp_or_linear_interp = info->ps.reads_persp_center ||
592
 
                                      info->ps.reads_persp_centroid ||
593
 
                                      info->ps.reads_persp_sample ||
594
 
                                      info->ps.reads_linear_center ||
595
 
                                      info->ps.reads_linear_centroid ||
596
 
                                      info->ps.reads_linear_sample;
 
584
   bool uses_persp_or_linear_interp = info->ps.reads_persp_center || info->ps.reads_persp_centroid ||
 
585
                                      info->ps.reads_persp_sample || info->ps.reads_linear_center ||
 
586
                                      info->ps.reads_linear_centroid || info->ps.reads_linear_sample;
597
587
 
598
588
   info->ps.allow_flat_shading =
599
 
      !(uses_persp_or_linear_interp || info->ps.needs_sample_positions ||
600
 
        info->ps.reads_frag_shading_rate || info->ps.writes_memory ||
601
 
        nir->info.fs.needs_quad_helper_invocations ||
 
589
      !(uses_persp_or_linear_interp || info->ps.needs_sample_positions || info->ps.reads_frag_shading_rate ||
 
590
        info->ps.writes_memory || nir->info.fs.needs_quad_helper_invocations ||
602
591
        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
603
592
        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_POINT_COORD) ||
604
593
        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
606
595
        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) ||
607
596
        BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION));
608
597
 
 
598
   info->ps.pops_is_per_sample =
 
599
      info->ps.pops && (nir->info.fs.sample_interlock_ordered || nir->info.fs.sample_interlock_unordered);
 
600
 
609
601
   info->ps.spi_ps_input = radv_compute_spi_ps_input(pipeline_key, info);
610
602
 
611
603
   info->ps.has_epilog = pipeline_key->ps.has_epilog && info->ps.colors_written;
612
604
 
613
 
   info->ps.writes_mrt0_alpha =
614
 
      (pipeline_key->ps.alpha_to_coverage_via_mrtz && (info->ps.color0_written & 0x8)) &&
615
 
      (info->ps.writes_z || info->ps.writes_stencil || info->ps.writes_sample_mask);
 
605
   info->ps.writes_mrt0_alpha = (pipeline_key->ps.alpha_to_coverage_via_mrtz && (info->ps.color0_written & 0x8)) &&
 
606
                                (info->ps.writes_z || info->ps.writes_stencil || info->ps.writes_sample_mask);
616
607
 
617
608
   info->ps.mrt0_is_dual_src = pipeline_key->ps.epilog.mrt0_is_dual_src;
618
609
 
619
610
   info->ps.spi_shader_col_format = pipeline_key->ps.epilog.spi_shader_col_format;
620
611
 
621
 
   nir_foreach_shader_in_variable(var, nir) {
622
 
      unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
 
612
   nir_foreach_shader_in_variable (var, nir) {
 
613
      const struct glsl_type *type = var->data.per_vertex ? glsl_get_array_element(var->type) : var->type;
 
614
      unsigned attrib_count = glsl_count_attribute_slots(type, false);
623
615
      int idx = var->data.location;
624
616
 
625
617
      switch (idx) {
635
627
         unsigned component_count = var->data.location_frac + glsl_get_length(var->type);
636
628
         attrib_count = (component_count + 3) / 4;
637
629
      } else {
638
 
         mark_16bit_ps_input(info, var->type, var->data.driver_location);
 
630
         mark_16bit_ps_input(info, type, var->data.driver_location);
639
631
      }
640
632
 
641
633
      uint64_t mask = ((1ull << attrib_count) - 1);
645
637
            info->ps.flat_shaded_mask |= mask << var->data.driver_location;
646
638
         else if (var->data.interpolation == INTERP_MODE_EXPLICIT)
647
639
            info->ps.explicit_shaded_mask |= mask << var->data.driver_location;
 
640
         else if (var->data.per_vertex)
 
641
            info->ps.per_vertex_shaded_mask |= mask << var->data.driver_location;
648
642
      }
649
643
 
650
644
      if (var->data.location >= VARYING_SLOT_VAR0) {
654
648
            info->ps.input_mask |= mask << (var->data.location - VARYING_SLOT_VAR0);
655
649
      }
656
650
   }
 
651
 
 
652
   /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
 
653
    *
 
654
    * - The fragment shader reads gl_SampleMaskIn because the 16-bit sample coverage mask isn't enough for MSAA8x and
 
655
    *   2x2 coarse shading.
 
656
    * - On GFX10.3, if the fragment shader requests a fragment interlock execution mode even if the ordered section was
 
657
    *   optimized out, to consistently implement fragmentShadingRateWithFragmentShaderInterlock = VK_FALSE.
 
658
    */
 
659
   info->ps.force_sample_iter_shading_rate =
 
660
      (info->ps.reads_sample_mask_in && !info->ps.needs_poly_line_smooth) ||
 
661
      (device->physical_device->rad_info.gfx_level == GFX10_3 &&
 
662
       (nir->info.fs.sample_interlock_ordered || nir->info.fs.sample_interlock_unordered ||
 
663
        nir->info.fs.pixel_interlock_ordered || nir->info.fs.pixel_interlock_unordered));
 
664
 
 
665
   /* DB_SHADER_CONTROL based on other fragment shader info fields. */
 
666
 
 
667
   unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
 
668
   if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
 
669
      conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
 
670
   else if (info->ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
 
671
      conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
 
672
 
 
673
   unsigned z_order =
 
674
      info->ps.early_fragment_test || !info->ps.writes_memory ? V_02880C_EARLY_Z_THEN_LATE_Z : V_02880C_LATE_Z;
 
675
 
 
676
   /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled, but this appears to break Project Cars
 
677
    * (DXVK). See https://bugs.freedesktop.org/show_bug.cgi?id=109401
 
678
    */
 
679
   const bool mask_export_enable = info->ps.writes_sample_mask;
 
680
 
 
681
   const bool disable_rbplus =
 
682
      device->physical_device->rad_info.has_rbplus && !device->physical_device->rad_info.rbplus_allowed;
 
683
 
 
684
   info->ps.db_shader_control =
 
685
      S_02880C_Z_EXPORT_ENABLE(info->ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(info->ps.writes_stencil) |
 
686
      S_02880C_KILL_ENABLE(info->ps.can_discard) | S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
 
687
      S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
 
688
      S_02880C_DEPTH_BEFORE_SHADER(info->ps.early_fragment_test) |
 
689
      S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(info->ps.post_depth_coverage) |
 
690
      S_02880C_EXEC_ON_HIER_FAIL(info->ps.writes_memory) | S_02880C_EXEC_ON_NOOP(info->ps.writes_memory) |
 
691
      S_02880C_DUAL_QUAD_DISABLE(disable_rbplus) | S_02880C_PRIMITIVE_ORDERED_PIXEL_SHADER(info->ps.pops);
657
692
}
658
693
 
659
694
static void
668
703
}
669
704
 
670
705
static void
671
 
gather_shader_info_cs(struct radv_device *device, const nir_shader *nir,
672
 
                      const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info)
 
706
gather_shader_info_cs(struct radv_device *device, const nir_shader *nir, const struct radv_pipeline_key *pipeline_key,
 
707
                      struct radv_shader_info *info)
673
708
{
674
709
   info->cs.uses_ray_launch_size = BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_RAY_LAUNCH_SIZE_ADDR_AMD);
675
710
 
676
 
   unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
677
 
   unsigned req_subgroup_size = subgroup_size;
678
 
   bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
679
 
 
680
711
   unsigned default_wave_size = device->physical_device->cs_wave_size;
681
712
   if (info->cs.uses_rt)
682
713
      default_wave_size = device->physical_device->rt_wave_size;
683
714
 
684
 
   if (!subgroup_size)
685
 
      subgroup_size = default_wave_size;
686
 
 
687
 
   unsigned local_size =
688
 
      nir->info.workgroup_size[0] * nir->info.workgroup_size[1] * nir->info.workgroup_size[2];
 
715
   unsigned local_size = nir->info.workgroup_size[0] * nir->info.workgroup_size[1] * nir->info.workgroup_size[2];
689
716
 
690
717
   /* Games don't always request full subgroups when they should, which can cause bugs if cswave32
691
718
    * is enabled.
692
719
    */
693
 
   if (default_wave_size == 32 && nir->info.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
694
 
       local_size % RADV_SUBGROUP_SIZE == 0)
695
 
      require_full_subgroups = true;
696
 
 
697
 
   if (require_full_subgroups && !req_subgroup_size) {
698
 
      /* don't use wave32 pretending to be wave64 */
699
 
      subgroup_size = RADV_SUBGROUP_SIZE;
 
720
   const bool require_full_subgroups =
 
721
      pipeline_key->stage_info[MESA_SHADER_COMPUTE].subgroup_require_full ||
 
722
      (default_wave_size == 32 && nir->info.uses_wide_subgroup_intrinsics && local_size % RADV_SUBGROUP_SIZE == 0);
 
723
 
 
724
   const unsigned required_subgroup_size = pipeline_key->stage_info[MESA_SHADER_COMPUTE].subgroup_required_size * 32;
 
725
 
 
726
   if (required_subgroup_size) {
 
727
      info->cs.subgroup_size = required_subgroup_size;
 
728
   } else if (require_full_subgroups) {
 
729
      info->cs.subgroup_size = RADV_SUBGROUP_SIZE;
 
730
   } else if (device->physical_device->rad_info.gfx_level >= GFX10 && local_size <= 32) {
 
731
      /* Use wave32 for small workgroups. */
 
732
      info->cs.subgroup_size = 32;
 
733
   } else {
 
734
      info->cs.subgroup_size = default_wave_size;
700
735
   }
701
736
 
702
 
   info->cs.subgroup_size = subgroup_size;
703
 
 
704
737
   if (device->physical_device->rad_info.has_cs_regalloc_hang_bug) {
705
 
      info->cs.regalloc_hang_bug =
706
 
         info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2] > 256;
 
738
      info->cs.regalloc_hang_bug = info->cs.block_size[0] * info->cs.block_size[1] * info->cs.block_size[2] > 256;
707
739
   }
708
740
}
709
741
 
726
758
   /* Task->Mesh dispatch is linear when Y = Z = 1.
727
759
    * GFX11 CP can optimize this case with a field in its draw packets.
728
760
    */
729
 
   info->cs.linear_taskmesh_dispatch = nir->info.mesh.ts_mesh_dispatch_dimensions[1] == 1 &&
730
 
                                       nir->info.mesh.ts_mesh_dispatch_dimensions[2] == 1;
 
761
   info->cs.linear_taskmesh_dispatch =
 
762
      nir->info.mesh.ts_mesh_dispatch_dimensions[1] == 1 && nir->info.mesh.ts_mesh_dispatch_dimensions[2] == 1;
731
763
}
732
764
 
733
765
static uint32_t
767
799
      assert(info->stage != MESA_SHADER_MESH);
768
800
      return R_00B130_SPI_SHADER_USER_DATA_VS_0;
769
801
   case MESA_SHADER_TESS_CTRL:
770
 
      return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
771
 
                               : R_00B430_SPI_SHADER_USER_DATA_HS_0;
 
802
      return gfx_level == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0 : R_00B430_SPI_SHADER_USER_DATA_HS_0;
772
803
   case MESA_SHADER_GEOMETRY:
773
 
      return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
774
 
                               : R_00B230_SPI_SHADER_USER_DATA_GS_0;
 
804
      return gfx_level == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0 : R_00B230_SPI_SHADER_USER_DATA_GS_0;
775
805
   case MESA_SHADER_FRAGMENT:
776
806
      return R_00B030_SPI_SHADER_USER_DATA_PS_0;
777
807
   case MESA_SHADER_COMPUTE:
778
808
   case MESA_SHADER_TASK:
779
809
   case MESA_SHADER_RAYGEN:
 
810
   case MESA_SHADER_CALLABLE:
 
811
   case MESA_SHADER_CLOSEST_HIT:
 
812
   case MESA_SHADER_MISS:
 
813
   case MESA_SHADER_INTERSECTION:
 
814
   case MESA_SHADER_ANY_HIT:
780
815
      return R_00B900_COMPUTE_USER_DATA_0;
781
816
   default:
782
817
      unreachable("invalid shader stage");
793
828
}
794
829
 
795
830
void
796
 
radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir,
797
 
                          gl_shader_stage next_stage,
798
 
                          const struct radv_pipeline_layout *layout,
799
 
                          const struct radv_pipeline_key *pipeline_key,
800
 
                          const enum radv_pipeline_type pipeline_type,
801
 
                          bool consider_force_vrs,
 
831
radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir, gl_shader_stage next_stage,
 
832
                          const struct radv_pipeline_layout *layout, const struct radv_pipeline_key *pipeline_key,
 
833
                          const enum radv_pipeline_type pipeline_type, bool consider_force_vrs,
802
834
                          struct radv_shader_info *info)
803
835
{
804
836
   info->stage = nir->info.stage;
828
860
      uint64_t special_mask = BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) |
829
861
                              BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) |
830
862
                              BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
831
 
      uint64_t per_prim_mask =
832
 
         nir->info.outputs_written & nir->info.per_primitive_outputs & ~special_mask;
833
 
      uint64_t per_vtx_mask =
834
 
         nir->info.outputs_written & ~nir->info.per_primitive_outputs & ~special_mask;
 
863
      uint64_t per_prim_mask = nir->info.outputs_written & nir->info.per_primitive_outputs & ~special_mask;
 
864
      uint64_t per_vtx_mask = nir->info.outputs_written & ~nir->info.per_primitive_outputs & ~special_mask;
835
865
 
836
866
      /* Mesh multivew is only lowered in ac_nir_lower_ngg, so we have to fake it here. */
837
867
      if (nir->info.stage == MESA_SHADER_MESH && pipeline_key->has_multiview_view_index) {
872
902
 
873
903
      outinfo->pos_exports = util_bitcount(pos_written);
874
904
 
875
 
      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
876
 
             sizeof(outinfo->vs_output_param_offset));
 
905
      memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, sizeof(outinfo->vs_output_param_offset));
877
906
 
878
907
      unsigned total_param_exports = 0;
879
908
 
945
974
      break;
946
975
   }
947
976
 
948
 
   info->wave_size = radv_get_wave_size(device, nir->info.stage, info);
949
 
   info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info);
 
977
   const struct radv_shader_stage_key *stage_key = &pipeline_key->stage_info[nir->info.stage];
 
978
   info->wave_size = radv_get_wave_size(device, nir->info.stage, info, stage_key);
 
979
   info->ballot_bit_size = radv_get_ballot_bit_size(device, nir->info.stage, info, stage_key);
950
980
 
951
981
   switch (nir->info.stage) {
952
982
   case MESA_SHADER_COMPUTE:
953
983
   case MESA_SHADER_TASK:
954
 
      info->workgroup_size =
955
 
         ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
 
984
      info->workgroup_size = ac_compute_cs_workgroup_size(nir->info.workgroup_size, false, UINT32_MAX);
956
985
 
957
986
      /* Allow the compiler to assume that the shader always has full subgroups,
958
987
       * meaning that the initial EXEC mask is -1 in all waves (all lanes enabled).
959
988
       * This assumption is incorrect for ray tracing and internal (meta) shaders
960
989
       * because they can use unaligned dispatch.
961
990
       */
962
 
      info->cs.uses_full_subgroups =
963
 
         pipeline_type != RADV_PIPELINE_RAY_TRACING &&
964
 
         !nir->info.internal &&
965
 
         (info->workgroup_size % info->wave_size) == 0;
 
991
      info->cs.uses_full_subgroups = pipeline_type != RADV_PIPELINE_RAY_TRACING && !nir->info.internal &&
 
992
                                     (info->workgroup_size % info->wave_size) == 0;
966
993
      break;
967
994
   case MESA_SHADER_MESH:
968
995
      /* Already computed in gather_shader_info_mesh(). */
1020
1047
   struct radv_legacy_gs_info *out = &gs_stage->info.gs_ring_info;
1021
1048
 
1022
1049
   const unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
1023
 
   const bool uses_adjacency = gs_info->gs.input_prim == SHADER_PRIM_LINES_ADJACENCY ||
1024
 
                               gs_info->gs.input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
 
1050
   const bool uses_adjacency =
 
1051
      gs_info->gs.input_prim == MESA_PRIM_LINES_ADJACENCY || gs_info->gs.input_prim == MESA_PRIM_TRIANGLES_ADJACENCY;
1025
1052
 
1026
1053
   /* All these are in dwords: */
1027
1054
   /* We can't allow using the whole LDS, because GS waves compete with
1046
1073
    * Make sure we don't go over the maximum value.
1047
1074
    */
1048
1075
   if (gs_info->gs.vertices_out > 0) {
1049
 
      max_gs_prims =
1050
 
         MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
 
1076
      max_gs_prims = MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
1051
1077
   }
1052
1078
   assert(max_gs_prims > 0);
1053
1079
 
1113
1139
   out->vgt_esgs_ring_itemsize = esgs_itemsize;
1114
1140
   assert(max_prims_per_subgroup <= max_out_prims);
1115
1141
 
1116
 
   unsigned workgroup_size = ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size,
1117
 
      es_verts_per_subgroup, gs_inst_prims_in_subgroup);
 
1142
   unsigned workgroup_size =
 
1143
      ac_compute_esgs_workgroup_size(gfx_level, es_info->wave_size, es_verts_per_subgroup, gs_inst_prims_in_subgroup);
1118
1144
   es_info->workgroup_size = workgroup_size;
1119
1145
   gs_info->workgroup_size = workgroup_size;
1120
1146
 
1122
1148
}
1123
1149
 
1124
1150
static void
1125
 
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
1126
 
                         bool use_adjacency)
 
1151
clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim, bool use_adjacency)
1127
1152
{
1128
1153
   unsigned max_reuse = max_esverts - min_verts_per_prim;
1129
1154
   if (use_adjacency)
1132
1157
}
1133
1158
 
1134
1159
static unsigned
1135
 
radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage,
1136
 
                            const struct radv_pipeline_stage *gs_stage)
 
1160
radv_get_num_input_vertices(const struct radv_pipeline_stage *es_stage, const struct radv_pipeline_stage *gs_stage)
1137
1161
{
1138
1162
   if (gs_stage) {
1139
1163
      return gs_stage->nir->info.gs.vertices_in;
1151
1175
}
1152
1176
 
1153
1177
static unsigned
1154
 
radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage,
1155
 
                                 const struct radv_pipeline_stage *gs_stage)
 
1178
radv_get_pre_rast_input_topology(const struct radv_pipeline_stage *es_stage, const struct radv_pipeline_stage *gs_stage)
1156
1179
{
1157
1180
   if (gs_stage) {
1158
1181
      return gs_stage->nir->info.gs.input_primitive;
1160
1183
 
1161
1184
   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
1162
1185
      if (es_stage->nir->info.tess.point_mode)
1163
 
         return SHADER_PRIM_POINTS;
 
1186
         return MESA_PRIM_POINTS;
1164
1187
      if (es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
1165
 
         return SHADER_PRIM_LINES;
1166
 
      return SHADER_PRIM_TRIANGLES;
 
1188
         return MESA_PRIM_LINES;
 
1189
      return MESA_PRIM_TRIANGLES;
1167
1190
   }
1168
1191
 
1169
 
   return SHADER_PRIM_TRIANGLES;
 
1192
   return MESA_PRIM_TRIANGLES;
1170
1193
}
1171
1194
 
1172
1195
static void
1183
1206
   const unsigned gs_num_invocations = gs_stage ? MAX2(gs_info->gs.invocations, 1) : 1;
1184
1207
 
1185
1208
   const unsigned input_prim = radv_get_pre_rast_input_topology(es_stage, gs_stage);
1186
 
   const bool uses_adjacency = input_prim == SHADER_PRIM_LINES_ADJACENCY ||
1187
 
                               input_prim == SHADER_PRIM_TRIANGLES_ADJACENCY;
 
1209
   const bool uses_adjacency = input_prim == MESA_PRIM_LINES_ADJACENCY || input_prim == MESA_PRIM_TRIANGLES_ADJACENCY;
1188
1210
 
1189
1211
   /* All these are in dwords: */
1190
1212
   /* We can't allow using the whole LDS, because GS waves compete with
1201
1223
 
1202
1224
   /* All these are per subgroup: */
1203
1225
   const unsigned min_esverts = gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */
1204
 
                                gfx_level >= GFX10_3 ? 29 : 24;
 
1226
                                   gfx_level >= GFX10_3 ? 29
 
1227
                                                        : 24;
1205
1228
   bool max_vert_out_per_gs_instance = false;
1206
1229
   unsigned max_esverts_base = 128;
1207
1230
   unsigned max_gsprims_base = 128; /* default prim group size clamp */
1305
1328
         max_esverts = align(max_esverts, wavesize);
1306
1329
         max_esverts = MIN2(max_esverts, max_esverts_base);
1307
1330
         if (esvert_lds_size)
1308
 
            max_esverts =
1309
 
               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
 
1331
            max_esverts = MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
1310
1332
         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
1311
1333
 
1312
1334
         /* Hardware restriction: minimum value of max_esverts */
1325
1347
             * for triangles.
1326
1348
             */
1327
1349
            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
1328
 
            max_gsprims = MIN2(max_gsprims,
1329
 
                               (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
 
1350
            max_gsprims = MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
1330
1351
         }
1331
1352
         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
1332
1353
         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
1346
1367
   }
1347
1368
 
1348
1369
   unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
1349
 
                               : gs_stage
1350
 
                                  ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
1351
 
                                  : max_esverts;
 
1370
                               : gs_stage ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
 
1371
                                          : max_esverts;
1352
1372
   assert(max_out_vertices <= 256);
1353
1373
 
1354
1374
   unsigned prim_amp_factor = 1;
1386
1406
   assert(out->hw_max_esverts >= min_esverts); /* HW limitation */
1387
1407
 
1388
1408
   unsigned workgroup_size =
1389
 
      ac_compute_ngg_workgroup_size(
1390
 
         max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
 
1409
      ac_compute_ngg_workgroup_size(max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
1391
1410
   if (gs_stage) {
1392
1411
      gs_info->workgroup_size = workgroup_size;
1393
1412
   }
1396
1415
 
1397
1416
static void
1398
1417
gfx10_get_ngg_query_info(const struct radv_device *device, struct radv_pipeline_stage *es_stage,
1399
 
                         struct radv_pipeline_stage *gs_stage,
1400
 
                         const struct radv_pipeline_key *pipeline_key)
 
1418
                         struct radv_pipeline_stage *gs_stage, const struct radv_pipeline_key *pipeline_key)
1401
1419
{
1402
1420
   struct radv_shader_info *info = gs_stage ? &gs_stage->info : &es_stage->info;
1403
1421
 
1404
 
   info->gs.has_ngg_pipeline_stat_query =
1405
 
      device->physical_device->emulate_ngg_gs_query_pipeline_stat && !!gs_stage;
 
1422
   info->gs.has_ngg_pipeline_stat_query = device->physical_device->emulate_ngg_gs_query_pipeline_stat && !!gs_stage;
1406
1423
   info->has_ngg_xfb_query = gs_stage ? !!gs_stage->nir->xfb_info : !!es_stage->nir->xfb_info;
1407
1424
   info->has_ngg_prim_query = pipeline_key->primitives_generated_query || info->has_ngg_xfb_query;
1408
1425
}
1409
1426
 
1410
1427
static void
1411
1428
radv_determine_ngg_settings(struct radv_device *device, struct radv_pipeline_stage *es_stage,
1412
 
                            struct radv_pipeline_stage *fs_stage,
1413
 
                            const struct radv_pipeline_key *pipeline_key)
 
1429
                            struct radv_pipeline_stage *fs_stage, const struct radv_pipeline_key *pipeline_key)
1414
1430
{
1415
1431
   assert(es_stage->stage == MESA_SHADER_VERTEX || es_stage->stage == MESA_SHADER_TESS_EVAL);
1416
 
   assert(fs_stage->stage == MESA_SHADER_FRAGMENT);
 
1432
   assert(!fs_stage || fs_stage->stage == MESA_SHADER_FRAGMENT);
1417
1433
 
1418
 
   uint64_t ps_inputs_read = fs_stage->nir->info.inputs_read;
 
1434
   uint64_t ps_inputs_read = fs_stage ? fs_stage->nir->info.inputs_read : 0;
1419
1435
 
1420
1436
   unsigned num_vertices_per_prim = 0;
1421
1437
   if (es_stage->stage == MESA_SHADER_VERTEX) {
1422
1438
      num_vertices_per_prim = radv_get_num_vertices_per_prim(pipeline_key);
1423
1439
   } else if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
1424
 
      num_vertices_per_prim = es_stage->nir->info.tess.point_mode ? 1 :
1425
 
         es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2 : 3;
 
1440
      num_vertices_per_prim = es_stage->nir->info.tess.point_mode                                   ? 1
 
1441
                              : es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2
 
1442
                                                                                                    : 3;
1426
1443
   }
1427
1444
 
1428
1445
   /* TODO: Enable culling for LLVM. */
1429
 
   es_stage->info.has_ngg_culling =
1430
 
      radv_consider_culling(device->physical_device, es_stage->nir, ps_inputs_read,
1431
 
                            num_vertices_per_prim, &es_stage->info) &&
1432
 
      !radv_use_llvm_for_stage(device, es_stage->stage);
 
1446
   es_stage->info.has_ngg_culling = radv_consider_culling(device->physical_device, es_stage->nir, ps_inputs_read,
 
1447
                                                          num_vertices_per_prim, &es_stage->info) &&
 
1448
                                    !radv_use_llvm_for_stage(device, es_stage->stage);
1433
1449
 
1434
1450
   nir_function_impl *impl = nir_shader_get_entrypoint(es_stage->nir);
1435
1451
   es_stage->info.has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
1437
1453
   /* NGG passthrough mode should be disabled when culling and when the vertex shader
1438
1454
    * exports the primitive ID.
1439
1455
    */
1440
 
   es_stage->info.is_ngg_passthrough = !es_stage->info.has_ngg_culling &&
1441
 
      !(es_stage->stage == MESA_SHADER_VERTEX && es_stage->info.outinfo.export_prim_id);
 
1456
   es_stage->info.is_ngg_passthrough = !es_stage->info.has_ngg_culling && !(es_stage->stage == MESA_SHADER_VERTEX &&
 
1457
                                                                            es_stage->info.outinfo.export_prim_id);
1442
1458
}
1443
1459
 
1444
1460
static void
1445
 
radv_link_shaders_info(struct radv_device *device,
1446
 
                       struct radv_pipeline_stage *producer, struct radv_pipeline_stage *consumer,
1447
 
                       const struct radv_pipeline_key *pipeline_key)
 
1461
radv_link_shaders_info(struct radv_device *device, struct radv_pipeline_stage *producer,
 
1462
                       struct radv_pipeline_stage *consumer, const struct radv_pipeline_key *pipeline_key)
1448
1463
{
1449
1464
   /* Export primitive ID and clip/cull distances if read by the FS, or export unconditionally when
1450
1465
    * the next stage is unknown (with graphics pipeline library).
1451
1466
    */
1452
 
   if (!consumer || consumer->stage == MESA_SHADER_FRAGMENT) {
 
1467
   if ((consumer && consumer->stage == MESA_SHADER_FRAGMENT) ||
 
1468
       !(pipeline_key->lib_flags & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1453
1469
      struct radv_vs_output_info *outinfo = &producer->info.outinfo;
1454
1470
      const bool ps_prim_id_in = !consumer || consumer->info.ps.prim_id_input;
1455
1471
      const bool ps_clip_dists_in = !consumer || !!consumer->info.ps.num_input_clips_culls;
1456
1472
 
1457
 
      if (ps_prim_id_in &&
1458
 
          (producer->stage == MESA_SHADER_VERTEX || producer->stage == MESA_SHADER_TESS_EVAL)) {
 
1473
      if (ps_prim_id_in && (producer->stage == MESA_SHADER_VERTEX || producer->stage == MESA_SHADER_TESS_EVAL)) {
1459
1474
         /* Mark the primitive ID as output when it's implicitly exported by VS or TES. */
1460
1475
         if (outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED)
1461
1476
            outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
1487
1502
 
1488
1503
         /* Compute the ESGS item size for VS or TES as ES. */
1489
1504
         producer->info.esgs_itemsize = num_outputs_written * 16;
 
1505
 
 
1506
         /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
 
1507
          * conflicts, i.e. each vertex will start on a different bank.
 
1508
          */
 
1509
         if (device->physical_device->rad_info.gfx_level >= GFX9 && producer->info.esgs_itemsize)
 
1510
            producer->info.esgs_itemsize += 4;
1490
1511
      }
1491
1512
 
1492
1513
      /* Compute NGG info (GFX10+) or GS info. */
1493
1514
      if (producer->info.is_ngg) {
1494
 
         struct radv_pipeline_stage *gs_stage =
1495
 
            consumer && consumer->stage == MESA_SHADER_GEOMETRY ? consumer : NULL;
 
1515
         struct radv_pipeline_stage *gs_stage = consumer && consumer->stage == MESA_SHADER_GEOMETRY ? consumer : NULL;
1496
1516
 
1497
1517
         gfx10_get_ngg_info(device, producer, gs_stage);
1498
1518
         gfx10_get_ngg_query_info(device, producer, gs_stage, pipeline_key);
1499
1519
 
1500
1520
         /* Determine other NGG settings like culling for VS or TES without GS. */
1501
 
         if (!gs_stage && consumer) {
 
1521
         if (!gs_stage) {
1502
1522
            radv_determine_ngg_settings(device, producer, consumer, pipeline_key);
1503
1523
         }
1504
1524
      } else if (consumer && consumer->stage == MESA_SHADER_GEOMETRY) {
1506
1526
      }
1507
1527
   }
1508
1528
 
1509
 
   if (producer->stage == MESA_SHADER_VERTEX &&
1510
 
       consumer && consumer->stage == MESA_SHADER_TESS_CTRL) {
 
1529
   if (producer->stage == MESA_SHADER_VERTEX && consumer && consumer->stage == MESA_SHADER_TESS_CTRL) {
1511
1530
      struct radv_pipeline_stage *vs_stage = producer;
1512
1531
      struct radv_pipeline_stage *tcs_stage = consumer;
1513
1532
 
1520
1539
         vs_stage->info.workgroup_size = 256;
1521
1540
         tcs_stage->info.workgroup_size = 256;
1522
1541
      } else {
1523
 
         vs_stage->info.workgroup_size =
1524
 
            ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level,
1525
 
                                           MESA_SHADER_VERTEX, tcs_stage->info.num_tess_patches,
1526
 
                                           pipeline_key->tcs.tess_input_vertices,
1527
 
                                           tcs_stage->info.tcs.tcs_vertices_out);
 
1542
         vs_stage->info.workgroup_size = ac_compute_lshs_workgroup_size(
 
1543
            device->physical_device->rad_info.gfx_level, MESA_SHADER_VERTEX, tcs_stage->info.num_tess_patches,
 
1544
            pipeline_key->tcs.tess_input_vertices, tcs_stage->info.tcs.tcs_vertices_out);
1528
1545
 
1529
 
         tcs_stage->info.workgroup_size =
1530
 
            ac_compute_lshs_workgroup_size(device->physical_device->rad_info.gfx_level,
1531
 
                                           MESA_SHADER_TESS_CTRL, tcs_stage->info.num_tess_patches,
1532
 
                                           pipeline_key->tcs.tess_input_vertices,
1533
 
                                           tcs_stage->info.tcs.tcs_vertices_out);
 
1546
         tcs_stage->info.workgroup_size = ac_compute_lshs_workgroup_size(
 
1547
            device->physical_device->rad_info.gfx_level, MESA_SHADER_TESS_CTRL, tcs_stage->info.num_tess_patches,
 
1548
            pipeline_key->tcs.tess_input_vertices, tcs_stage->info.tcs.tcs_vertices_out);
1534
1549
 
1535
1550
         if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
1536
1551
            /* When the number of TCS input and output vertices are the same (typically 3):
1545
1560
            vs_stage->info.vs.tcs_in_out_eq =
1546
1561
               device->physical_device->rad_info.gfx_level >= GFX9 &&
1547
1562
               pipeline_key->tcs.tess_input_vertices == tcs_stage->info.tcs.tcs_vertices_out &&
1548
 
               vs_stage->nir->info.float_controls_execution_mode ==
1549
 
                  tcs_stage->nir->info.float_controls_execution_mode;
 
1563
               vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode;
1550
1564
 
1551
1565
            if (vs_stage->info.vs.tcs_in_out_eq)
1552
1566
               vs_stage->info.vs.tcs_temp_only_input_mask =
1553
 
                  tcs_stage->nir->info.inputs_read &
1554
 
                  vs_stage->nir->info.outputs_written &
 
1567
                  tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written &
1555
1568
                  ~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read &
1556
 
                  ~tcs_stage->nir->info.inputs_read_indirectly &
1557
 
                  ~vs_stage->nir->info.outputs_accessed_indirectly;
 
1569
                  ~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly;
1558
1570
         }
1559
1571
      }
1560
1572
   }
1605
1617
      dst_info->tes = src_info->tes;
1606
1618
   }
1607
1619
 
1608
 
   if (dst->stage == MESA_SHADER_GEOMETRY) {
1609
 
      dst_info->is_ngg = src_info->is_ngg;
 
1620
   if (dst->stage == MESA_SHADER_GEOMETRY)
1610
1621
      dst_info->gs.es_type = src->stage;
1611
 
   }
1612
1622
}
1613
1623
 
1614
1624
static const gl_shader_stage graphics_shader_order[] = {
1615
 
   MESA_SHADER_VERTEX,
1616
 
   MESA_SHADER_TESS_CTRL,
1617
 
   MESA_SHADER_TESS_EVAL,
1618
 
   MESA_SHADER_GEOMETRY,
 
1625
   MESA_SHADER_VERTEX, MESA_SHADER_TESS_CTRL, MESA_SHADER_TESS_EVAL, MESA_SHADER_GEOMETRY,
1619
1626
 
1620
 
   MESA_SHADER_TASK,
1621
 
   MESA_SHADER_MESH,
 
1627
   MESA_SHADER_TASK,   MESA_SHADER_MESH,
1622
1628
};
1623
1629
 
1624
1630
void
1626
1632
                          struct radv_pipeline_stage *stages)
1627
1633
{
1628
1634
   /* Walk backwards to link */
1629
 
   struct radv_pipeline_stage *next_stage =
1630
 
      stages[MESA_SHADER_FRAGMENT].nir ? &stages[MESA_SHADER_FRAGMENT] : NULL;
 
1635
   struct radv_pipeline_stage *next_stage = stages[MESA_SHADER_FRAGMENT].nir ? &stages[MESA_SHADER_FRAGMENT] : NULL;
1631
1636
 
1632
1637
   for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) {
1633
1638
      gl_shader_stage s = graphics_shader_order[i];
1646
1651
 
1647
1652
      /* Merge shader info for VS+GS or TES+GS. */
1648
1653
      if (stages[MESA_SHADER_GEOMETRY].nir) {
1649
 
         gl_shader_stage pre_stage =
1650
 
            stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
 
1654
         gl_shader_stage pre_stage = stages[MESA_SHADER_TESS_EVAL].nir ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
1651
1655
 
1652
1656
         radv_nir_shader_info_merge(&stages[pre_stage], &stages[MESA_SHADER_GEOMETRY]);
1653
1657
      }