/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "si_build_pm4.h"
#include "si_compute.h"

#include "util/u_memory.h"
#include "tgsi/tgsi_from_mesa.h"
static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable);
static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}
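/* Sketch of the resulting BO layout implied by the size computation above
 * (the exact packing lives in ac_thread_trace_get_info_offset() and
 * ac_thread_trace_get_data_offset(), used below): the info structs for all
 * SEs come first, then one data buffer per SE, each region aligned to
 * 1 << SQTT_BUFFER_ALIGN_SHIFT:
 *
 *    [info SE0 .. info SEn-1] [data SE0] [data SE1] ... [data SEn-1]
 */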
static bool
si_se_is_disabled(struct si_context* sctx, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return sctx->screen->info.cu_mask[se][0] == 0;
}
static void
si_emit_thread_trace_start(struct si_context* sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (si_se_is_disabled(sctx, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CUs. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                                               V_008D18_REG_INCLUDE_SHDEC |
                                                               V_008D18_REG_INCLUDE_GFXUDEC |
                                                               V_008D18_REG_INCLUDE_CONTEXT |
                                                               V_008D18_REG_INCLUDE_COMP |
                                                               V_008D18_REG_INCLUDE_CONFIG) |
                                          S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->chip_class >= GFX10_3 ? 4 : 0) |
                                          S_008D1C_AUTO_FLUSH_MODE(sctx->chip_class == GFX10_3));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}
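/* Two notes on the programming sequence above. First, GRBM_GFX_INDEX with
 * SE_INDEX(se) steers all subsequent register writes to a single shader
 * engine, which is why broadcast mode must be restored at the end. Second,
 * the buffer base/size are written pre-shifted by SQTT_BUFFER_ALIGN_SHIFT,
 * which is the reason si_thread_trace_init_bo() aligns both the size and
 * the allocation to that boundary.
 */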
static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};
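/* These are the three per-SE registers that si_copy_thread_trace_info_regs()
 * below copies back into the ac_thread_trace_info struct once a trace has
 * stopped (hence its 3-DWORD copy loop): the write pointer, the status, and
 * a counter (trace counter on GFX9, dropped-token counter on GFX10+).
 */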
static void
si_copy_thread_trace_info_regs(struct si_context* sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->chip_class) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported chip_class");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                  COPY_DATA_WR_CONFIRM);
      radeon_emit(thread_trace_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }
   radeon_end();
}
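/* Each COPY_DATA packet above has the CP read one privileged register
 * (COPY_DATA_PERF source), write the value through L2 (COPY_DATA_TC_L2
 * destination) into the info struct in the SQTT BO, and wait for the write
 * to land (COPY_DATA_WR_CONFIRM) before executing the next packet.
 */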
static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   for (unsigned se = 0; se < max_se; se++) {
      if (si_se_is_disabled(sctx, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_008D20_FINISH_DONE); /* mask */
         radeon_emit(4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_008D20_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(~C_030CE8_BUSY); /* mask */
         radeon_emit(4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}
static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_emit_thread_trace_start(sctx, cs, family);
}
static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}
static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles RING_GFX). */
   sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);

   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[RING_GFX]);
      sctx->thread_trace->stop_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);
}
static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}
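/* The start/stop command streams flushed above are recorded once by
 * si_thread_trace_init_cs() and resubmitted as-is for every capture. Only
 * the stop submission takes a fence (last_sqtt_fence), which
 * si_handle_thread_trace() waits on before mapping the BO and reading the
 * trace back.
 */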
static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL,
                                                  PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                         "because the buffer is too small. The "
                         "hardware needs %d KB but the "
                         "buffer size is %d KB.\n",
                         expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                         "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+ compute_unit really means WGP */
      thread_trace_se.compute_unit =
         sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}
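/* Note: info->cur_offset counts 32-byte units, which is why the available
 * size above is computed as (cur_offset * 32) / 1024 KB before being
 * compared against the hardware's expected buffer size.
 */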
bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

   if (sctx->chip_class < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
              "the RGP documentation for the list of "
              "supported GPUs!\n");
      return false;
   }

   if (sctx->chip_class > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
              "for this GPU!\n");
      return false;
   }

   /* Default buffer size set to 32MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx))
      return false;

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}
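/* Hypothetical invocations, assuming SQTT capture is enabled in the driver
 * build (the environment variable semantics follow directly from the code
 * above; the application name is a placeholder):
 *
 *    # capture frame 250 with a 64 MB per-SE buffer (size is in KB)
 *    AMD_THREAD_TRACE_BUFFER_SIZE=65536 AMD_THREAD_TRACE_TRIGGER=250 ./app
 *
 *    # non-numeric trigger = trigger file; capture whenever it appears
 *    AMD_THREAD_TRACE_TRIGGER=/tmp/sqtt_trigger ./app &
 *    touch /tmp/sqtt_trigger
 */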
void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;

   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;
}
static uint64_t num_frames = 0;

void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;
      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called
          * for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert(sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace, NULL);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}
static void
si_emit_thread_trace_userdata(struct si_context* sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);
      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}
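/* Markers are streamed two DWORDs at a time into the
 * SQ_THREAD_TRACE_USERDATA_2/3 register pair; the hardware records each
 * write as a token in the trace stream, which is how RGP reassembles the
 * marker structs emitted by the helpers below.
 */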
static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->chip_class >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}
static uint32_t num_events = 0;

void
si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}
void
si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}
void
si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}
void
si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}
void
si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert(str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert(str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      /* Zero the buffer so the alignment padding after the string is not
       * uninitialized stack data. */
      memset(buffer, 0, sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
   }
}
bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}
static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->ge.as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->ge.as_es)
         return RGP_HW_STAGE_ES;
      else if (key->ge.as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}
static bool
si_sqtt_add_code_object(struct si_context* sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}
bool
si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert(!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}
void
si_sqtt_describe_pipeline_bind(struct si_context* sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}