/*
 * format translation
 */
uint32_t si_translate_colorformat(enum amd_gfx_level gfx_level,
1656
enum pipe_format format)
1658
const struct util_format_description *desc = util_format_description(format);
1660
#define HAS_SIZE(x, y, z, w) \
1661
(desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
1662
desc->channel[2].size == (z) && desc->channel[3].size == (w))
1664
if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
1665
return V_028C70_COLOR_10_11_11;
1667
if (gfx_level >= GFX10_3 &&
1668
format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */
1669
return V_028C70_COLOR_5_9_9_9;
1671
if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
1672
return V_028C70_COLOR_INVALID;
1674
/* hw cannot support mixed formats (except depth/stencil, since
1675
* stencil is not written to). */
1676
if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
1677
return V_028C70_COLOR_INVALID;
1679
int first_non_void = util_format_get_first_non_void_channel(format);
1681
/* Reject SCALED formats because we don't implement them for CB. */
1682
if (first_non_void >= 0 && first_non_void <= 3 &&
1683
(desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_UNSIGNED ||
1684
desc->channel[first_non_void].type == UTIL_FORMAT_TYPE_SIGNED) &&
1685
!desc->channel[first_non_void].normalized &&
1686
!desc->channel[first_non_void].pure_integer)
1687
return V_028C70_COLOR_INVALID;
1689
switch (desc->nr_channels) {
1691
switch (desc->channel[0].size) {
1693
return V_028C70_COLOR_8;
1695
return V_028C70_COLOR_16;
1697
return V_028C70_COLOR_32;
1701
if (desc->channel[0].size == desc->channel[1].size) {
1702
switch (desc->channel[0].size) {
1704
return V_028C70_COLOR_8_8;
1706
return V_028C70_COLOR_16_16;
1708
return V_028C70_COLOR_32_32;
1710
} else if (HAS_SIZE(8, 24, 0, 0)) {
1711
return V_028C70_COLOR_24_8;
1712
} else if (HAS_SIZE(24, 8, 0, 0)) {
1713
return V_028C70_COLOR_8_24;
1717
if (HAS_SIZE(5, 6, 5, 0)) {
1718
return V_028C70_COLOR_5_6_5;
1719
} else if (HAS_SIZE(32, 8, 24, 0)) {
1720
return V_028C70_COLOR_X24_8_32_FLOAT;
1724
if (desc->channel[0].size == desc->channel[1].size &&
1725
desc->channel[0].size == desc->channel[2].size &&
1726
desc->channel[0].size == desc->channel[3].size) {
1727
switch (desc->channel[0].size) {
1729
return V_028C70_COLOR_4_4_4_4;
1731
return V_028C70_COLOR_8_8_8_8;
1733
return V_028C70_COLOR_16_16_16_16;
1735
return V_028C70_COLOR_32_32_32_32;
1737
} else if (HAS_SIZE(5, 5, 5, 1)) {
1738
return V_028C70_COLOR_1_5_5_5;
1739
} else if (HAS_SIZE(1, 5, 5, 5)) {
1740
return V_028C70_COLOR_5_5_5_1;
1741
} else if (HAS_SIZE(10, 10, 10, 2)) {
1742
return V_028C70_COLOR_2_10_10_10;
1743
} else if (HAS_SIZE(2, 10, 10, 10)) {
1744
return V_028C70_COLOR_10_10_10_2;
1748
return V_028C70_COLOR_INVALID;
1751
1667
static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
5742
5592
si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
5743
5593
si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
5745
if (sctx->gfx_level == GFX6) {
5746
/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID and is now per pipe,
5747
* so it should be handled in the kernel if we want to use something other than
5748
* the default value.
5749
* TODO: This should be: (number of compute units) * 4 * (waves per simd) - 1
5751
si_pm4_set_reg(pm4, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */);
5754
5595
if (sctx->gfx_level >= GFX7) {
5755
5596
si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
5756
5597
si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
5758
/* Disable profiling on compute chips. */
5759
if (!sscreen->info.has_graphics) {
5760
si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
5761
si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
5765
if (!sscreen->info.has_graphics && sscreen->info.family >= CHIP_GFX940) {
5766
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_TG_CHUNK_SIZE, 0);
5767
si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_PGM_RSRC3, 0);
5770
if (sctx->gfx_level >= GFX9 && sctx->gfx_level < GFX11)
5771
si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, sctx->gfx_level >= GFX10 ? 0x20 : 0);
5773
if (sscreen->info.family == CHIP_MI100 ||
5774
sscreen->info.family == CHIP_MI200) {
5775
si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
5776
si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
5777
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
5778
si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
5781
if (sctx->gfx_level >= GFX10) {
5782
si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
5783
si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
5784
si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
5785
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
5787
if (sctx->gfx_level < GFX11)
5788
si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
5790
si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
5793
if (sctx->gfx_level >= GFX11) {
5794
si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
5795
si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
5796
si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
5797
si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
5799
/* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
5800
* Only these values are valid: 0 (disabled), 64, 128, 256, 512
5801
* Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
5803
si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
5600
if (sctx->gfx_level >= GFX9)
5601
si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0);
5806
5603
/* Set the pointer to border colors. MI200 doesn't support border colors. */
5807
5604
if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) {
6001
5778
si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
6004
if (sctx->gfx_level >= GFX10) {
6005
si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
6006
si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
6007
si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
6008
si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
6009
si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
6010
si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
6011
si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
6012
si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0);
6013
si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0);
6014
si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0);
6015
si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0);
6016
si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0);
6018
si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
6019
S_00B0C0_SOFT_GROUPING_EN(1) |
6020
S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
6022
/* Enable CMASK/HTILE/DCC caching in L2 for small chips. */
6023
unsigned meta_write_policy, meta_read_policy;
6024
unsigned no_alloc = sctx->gfx_level >= GFX11 ? V_02807C_CACHE_NOA_GFX11:
6025
V_02807C_CACHE_NOA_GFX10;
6026
if (sscreen->info.max_render_backends <= 4) {
6027
meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
6028
meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */
6030
meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */
6031
meta_read_policy = no_alloc; /* don't cache reads that miss */
5782
si_pm4_finalize(pm4);
5783
sctx->cs_preamble_state = pm4;
5784
sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */
5787
static void cdna_init_compute_preamble_state(struct si_context *sctx)
5789
struct si_screen *sscreen = sctx->screen;
5790
uint64_t border_color_va =
5791
sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
5792
uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
5793
S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
5795
struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 48, true);
5799
/* Compute registers. */
5800
/* Disable profiling on compute chips. */
5801
si_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
5802
si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
5803
si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
5804
si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
5805
si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
5806
si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
5807
si_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0);
5809
if (sscreen->info.family >= CHIP_GFX940) {
5810
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_TG_CHUNK_SIZE, 0);
5811
si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_PGM_RSRC3, 0);
5813
si_pm4_set_reg(pm4, R_00B894_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
5814
si_pm4_set_reg(pm4, R_00B898_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
5815
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
5816
si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
5819
si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0);
5821
/* Set the pointer to border colors. Only MI100 supports border colors. */
5822
if (sscreen->info.family == CHIP_MI100) {
5823
si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
5824
si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI,
5825
S_030E04_ADDRESS(border_color_va >> 40));
5828
si_pm4_finalize(pm4);
5829
sctx->cs_preamble_state = pm4;
5830
sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */
5833
static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
5835
struct si_screen *sscreen = sctx->screen;
5836
uint64_t border_color_va =
5837
sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
5838
uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
5839
S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
5840
unsigned meta_write_policy, meta_read_policy;
5841
unsigned no_alloc = sctx->gfx_level >= GFX11 ? V_02807C_CACHE_NOA_GFX11:
5842
V_02807C_CACHE_NOA_GFX10;
5843
/* Enable CMASK/HTILE/DCC caching in L2 for small chips. */
5844
if (sscreen->info.max_render_backends <= 4) {
5845
meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
5846
meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */
5848
meta_write_policy = V_02807C_CACHE_STREAM; /* write combine */
5849
meta_read_policy = no_alloc; /* don't cache reads that miss */
5852
/* We need more space because the preamble is large. */
5853
struct si_pm4_state *pm4 = si_pm4_create_sized(sscreen, 214, sctx->has_graphics);
5857
if (sctx->has_graphics && !sctx->shadowing.registers) {
5858
si_pm4_cmd_add(pm4, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
5859
si_pm4_cmd_add(pm4, CC0_UPDATE_LOAD_ENABLES(1));
5860
si_pm4_cmd_add(pm4, CC1_UPDATE_SHADOW_ENABLES(1));
5862
if (sscreen->dpbb_allowed) {
5863
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
5864
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
6034
si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
6035
S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) |
6036
S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) |
6037
S_02807C_HTILE_WR_POLICY(meta_write_policy) |
6038
S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) |
6039
S_02807C_Z_RD_POLICY(no_alloc) |
6040
S_02807C_S_RD_POLICY(no_alloc) |
6041
S_02807C_HTILE_RD_POLICY(meta_read_policy));
6044
if (sctx->gfx_level >= GFX11)
6045
gl2_cc = S_028410_DCC_WR_POLICY_GFX11(meta_write_policy) |
6046
S_028410_COLOR_WR_POLICY_GFX11(V_028410_CACHE_STREAM) |
6047
S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_GFX11);
6049
gl2_cc = S_028410_CMASK_WR_POLICY(meta_write_policy) |
6050
S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM) |
6051
S_028410_DCC_WR_POLICY_GFX10(meta_write_policy) |
6052
S_028410_COLOR_WR_POLICY_GFX10(V_028410_CACHE_STREAM) |
6053
S_028410_CMASK_RD_POLICY(meta_read_policy) |
6054
S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_GFX10) |
6055
S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_GFX10);
6057
si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
6059
S_028410_DCC_RD_POLICY(meta_read_policy));
6061
si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
6062
si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
6064
/* Break up a pixel wave if it contains deallocs for more than
6065
* half the parameter cache.
6067
* To avoid a deadlock where pixel waves aren't launched
6068
* because they're waiting for more pixels while the frontend
6069
* is stuck waiting for PC space, the maximum allowed value is
6070
* the size of the PC minus the largest possible allocation for
6071
* a single primitive shader subgroup.
5867
si_pm4_cmd_add(pm4, PKT3(PKT3_CLEAR_STATE, 0, 0));
5868
si_pm4_cmd_add(pm4, 0);
5871
/* Non-graphics uconfig registers. */
5872
if (sctx->gfx_level < GFX11)
5873
si_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0x20);
5874
si_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
5875
si_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40));
5877
/* Compute registers. */
5878
si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sscreen->info.address32_hi >> 8));
5879
si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
5880
si_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
5882
si_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
5883
si_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
5885
si_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
5886
si_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
5887
si_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
5888
si_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
5890
if (sctx->gfx_level >= GFX11) {
5891
si_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, compute_cu_en);
5892
si_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, compute_cu_en);
5893
si_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, compute_cu_en);
5894
si_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, compute_cu_en);
5896
/* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
5897
* Only these values are valid: 0 (disabled), 64, 128, 256, 512
5898
* Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
6073
si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL,
6074
S_028C50_MAX_DEALLOCS_IN_WAVE(sctx->gfx_level >= GFX11 ? 16 : 512));
6076
if (sctx->gfx_level < GFX11) {
6077
/* Reuse for legacy (non-NGG) only. */
6078
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
6081
if (!has_clear_state) {
6082
si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
6083
sscreen->info.pa_sc_tile_steering_override);
6087
si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
6088
si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
6089
si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
6090
si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
6091
si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
5900
si_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
5902
si_pm4_set_reg(pm4, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
6094
if (sctx->gfx_level >= GFX10 && sctx->gfx_level <= GFX10_3) {
6095
/* Logical CUs 16 - 31 */
6096
si_pm4_set_reg_idx3(sscreen, pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
6097
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16),
5905
si_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
5907
if (!sctx->has_graphics)
5910
/* Shader registers - PS. */
5911
unsigned cu_mask_ps = sctx->gfx_level >= GFX10_3 ? gfx103_get_cu_mask_ps(sscreen) : ~0u;
5912
if (sctx->gfx_level < GFX11) {
5913
si_pm4_set_reg_idx3(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS,
5914
ac_apply_cu_en(S_00B004_CU_EN(cu_mask_ps >> 16), /* CUs 16-31 */
6098
5915
C_00B004_CU_EN, 16, &sscreen->info));
6099
si_pm4_set_reg_idx3(sscreen, pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
6100
ac_apply_cu_en(S_00B104_CU_EN(0xffff),
5917
si_pm4_set_reg_idx3(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
5918
ac_apply_cu_en(S_00B01C_CU_EN(cu_mask_ps) |
5919
S_00B01C_WAVE_LIMIT(0x3F) |
5920
S_00B01C_LDS_GROUP_SIZE(sctx->gfx_level >= GFX11),
5921
C_00B01C_CU_EN, 0, &sscreen->info));
5922
si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
5923
S_00B0C0_SOFT_GROUPING_EN(1) |
5924
S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
5925
si_pm4_set_reg(pm4, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0);
5926
si_pm4_set_reg(pm4, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0);
5927
si_pm4_set_reg(pm4, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0);
5928
si_pm4_set_reg(pm4, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0);
5930
/* Shader registers - VS. */
5931
if (sctx->gfx_level < GFX11) {
5932
si_pm4_set_reg_idx3(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS,
5933
ac_apply_cu_en(S_00B104_CU_EN(0xffff), /* CUs 16-31 */
6101
5934
C_00B104_CU_EN, 16, &sscreen->info));
6102
si_pm4_set_reg_idx3(sscreen, pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
6103
ac_apply_cu_en(S_00B404_CU_EN(0xffff),
6104
C_00B404_CU_EN, 16, &sscreen->info));
6106
5935
si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
6107
5936
si_pm4_set_reg(pm4, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0);
6108
5937
si_pm4_set_reg(pm4, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0);
6110
5939
si_pm4_set_reg(pm4, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0);
6113
if (sctx->gfx_level >= GFX10_3) {
5942
/* Shader registers - GS. */
5943
si_pm4_set_reg(pm4, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0);
5944
si_pm4_set_reg(pm4, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0);
5945
si_pm4_set_reg(pm4, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0);
5946
si_pm4_set_reg(pm4, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0);
5947
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
5948
S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
5950
/* Shader registers - HS. */
5951
if (sctx->gfx_level < GFX11) {
5952
si_pm4_set_reg_idx3(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS,
5953
ac_apply_cu_en(S_00B404_CU_EN(0xffff), /* CUs 16-31 */
5954
C_00B404_CU_EN, 16, &sscreen->info));
5956
si_pm4_set_reg_idx3(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
5957
ac_apply_cu_en(S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F),
5958
C_00B41C_CU_EN, 0, &sscreen->info));
5959
si_pm4_set_reg(pm4, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0);
5960
si_pm4_set_reg(pm4, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0);
5961
si_pm4_set_reg(pm4, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0);
5962
si_pm4_set_reg(pm4, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0);
5963
si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS,
5964
S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8));
5966
/* Context registers. */
5967
if (sctx->gfx_level < GFX11) {
5968
si_pm4_set_reg(pm4, R_028038_DB_DFSM_CONTROL, S_028038_PUNCHOUT_MODE(V_028038_FORCE_OFF));
5970
si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
5971
S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM) |
5972
S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM) |
5973
S_02807C_HTILE_WR_POLICY(meta_write_policy) |
5974
S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM) |
5975
S_02807C_Z_RD_POLICY(no_alloc) |
5976
S_02807C_S_RD_POLICY(no_alloc) |
5977
S_02807C_HTILE_RD_POLICY(meta_read_policy));
5978
si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
5979
si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
5981
si_pm4_set_reg(pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
5982
(sctx->gfx_level >= GFX11 ?
5983
S_028410_DCC_WR_POLICY_GFX11(meta_write_policy) |
5984
S_028410_COLOR_WR_POLICY_GFX11(V_028410_CACHE_STREAM) |
5985
S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_GFX11)
5987
S_028410_CMASK_WR_POLICY(meta_write_policy) |
5988
S_028410_FMASK_WR_POLICY(V_028410_CACHE_STREAM) |
5989
S_028410_DCC_WR_POLICY_GFX10(meta_write_policy) |
5990
S_028410_COLOR_WR_POLICY_GFX10(V_028410_CACHE_STREAM) |
5991
S_028410_CMASK_RD_POLICY(meta_read_policy) |
5992
S_028410_FMASK_RD_POLICY(V_028410_CACHE_NOA_GFX10) |
5993
S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_GFX10)) |
5994
S_028410_DCC_RD_POLICY(meta_read_policy));
5996
if (sctx->gfx_level >= GFX10_3)
6114
5997
si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL, 0xff);
5999
/* If any sample location uses the -8 coordinate, the EXCLUSION fields should be set to 0. */
6000
si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL,
6001
S_02882C_XMAX_RIGHT_EXCLUSION(1) |
6002
S_02882C_YMAX_BOTTOM_EXCLUSION(1));
6003
si_pm4_set_reg(pm4, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
6004
S_028830_SMALL_PRIM_FILTER_ENABLE(1));
6005
if (sctx->gfx_level >= GFX10_3) {
6115
6006
/* The rate combiners have no effect if they are disabled like this:
6116
6007
* VERTEX_RATE: BYPASS_VTX_RATE_COMBINER = 1
6117
6008
* PRIMITIVE_RATE: BYPASS_PRIM_RATE_COMBINER = 1
6126
6017
S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE));
6129
if (sctx->gfx_level >= GFX11) {
6130
si_pm4_set_reg(pm4, R_028C54_PA_SC_BINNER_CNTL_2, 0);
6131
si_pm4_set_reg(pm4, R_028620_PA_RATE_CNTL,
6132
S_028620_VERTEX_RATE(2) | S_028620_PRIM_RATE(1));
6134
uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends);
6020
si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
6021
si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1);
6022
si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
6023
sctx->gfx_level >= GFX11 ?
6024
S_028B50_ACCUM_ISOLINE(128) |
6025
S_028B50_ACCUM_TRI(128) |
6026
S_028B50_ACCUM_QUAD(128) |
6027
S_028B50_DONUT_SPLIT_GFX9(24) |
6028
S_028B50_TRAP_SPLIT(6)
6030
S_028B50_ACCUM_ISOLINE(12) |
6031
S_028B50_ACCUM_TRI(30) |
6032
S_028B50_ACCUM_QUAD(24) |
6033
S_028B50_DONUT_SPLIT_GFX9(24) |
6034
S_028B50_TRAP_SPLIT(6));
6036
si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
6037
S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
6038
S_028C48_MAX_PRIM_PER_BATCH(1023));
6039
/* Break up a pixel wave if it contains deallocs for more than
6040
* half the parameter cache.
6042
* To avoid a deadlock where pixel waves aren't launched
6043
* because they're waiting for more pixels while the frontend
6044
* is stuck waiting for PC space, the maximum allowed value is
6045
* the size of the PC minus the largest possible allocation for
6046
* a single primitive shader subgroup.
6048
si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL,
6049
S_028C50_MAX_DEALLOCS_IN_WAVE(sctx->gfx_level >= GFX11 ? 16 : 512));
6050
if (sctx->gfx_level < GFX11)
6051
si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); /* Reuse for legacy (non-NGG) only. */
6053
/* Uconfig registers. */
6054
si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
6055
si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
6056
if (sctx->gfx_level >= GFX11) {
6057
/* This is changed by draws for indexed draws, but we need to set DISABLE_FOR_AUTO_INDEX
6058
* here, which disables primitive restart for all non-indexed draws, so that those draws
6059
* won't have to set this state.
6061
si_pm4_set_reg(pm4, R_03092C_GE_MULTI_PRIM_IB_RESET_EN, S_03092C_DISABLE_FOR_AUTO_INDEX(1));
6063
si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
6064
si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
6065
si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
6066
si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
6068
si_pm4_set_reg(pm4, R_030A00_PA_SU_LINE_STIPPLE_VALUE, 0);
6069
si_pm4_set_reg(pm4, R_030A04_PA_SC_LINE_STIPPLE_STATE, 0);
6071
if (sctx->gfx_level >= GFX11) {
6072
uint64_t rb_mask = BITFIELD64_MASK(sscreen->info.max_render_backends);
6136
6074
si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 2, 0));
6137
6075
si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));