/*
 * Copyright © 2021 Igalia S.L.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
#include <vulkan/vulkan_core.h>
26
#include "tu_autotune.h"
27
#include "tu_private.h"
/* How the autotuner works:
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the number before and after in GPU memory.
 * - To store the values each command buffer holds GPU memory which
 *   expands with more renderpasses being written.
 * - For each renderpass we create tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on tu_renderpass_result would be added to the
 *   tu_renderpass_history entry which aggregates results for a
 *   given renderpass key.
 *
 * At submission time:
 * - Process results whose fence was signalled.
 * - Free per-submission data which we now don't need.
 *
 * - Create a command stream to write a fence value. This way we would
 *   know when we could safely read the results.
 * - We cannot rely on the command buffer's lifetime when referencing
 *   its resources since the buffer could be destroyed before we process
 *   the results.
 * - For each command buffer:
 *   - Reference its GPU memory.
 *   - Move if ONE_TIME_SUBMIT or copy all tu_renderpass_result to the queue.
 *
 * Since the command buffers could be recorded on different threads
 * we have to maintain some amount of locking for the history table;
 * however we change the table only in a single thread at submission
 * time, so in most cases there will be no locking.
 */
static void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

/* Enable verbose logging of autotuner decisions. */
#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries on autotuner finish,
 * could be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many last renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128
76
* Tracks results for a given renderpass key
78
struct tu_renderpass_history {
81
/* We would delete old history entries */
85
* List of recent fd_renderpass_result's
87
struct list_head results;
93
/* Holds per-submission cs which writes the fence. */
94
struct tu_submission_data {
95
struct list_head node;
98
struct tu_cs fence_cs;
99
uint32_t buffers_count;
102
static struct tu_submission_data *
103
create_submission_data(struct tu_device *dev, struct tu_autotune *at)
105
struct tu_submission_data *submission_data =
106
calloc(1, sizeof(struct tu_submission_data));
107
submission_data->fence = at->fence_counter;
109
struct tu_cs* fence_cs = &submission_data->fence_cs;
110
tu_cs_init(fence_cs, dev, TU_CS_MODE_GROW, 5);
111
tu_cs_begin(fence_cs);
113
tu_cs_emit_pkt7(fence_cs, CP_EVENT_WRITE, 4);
114
tu_cs_emit(fence_cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
115
tu_cs_emit_qw(fence_cs, dev->global_bo->iova + gb_offset(autotune_fence));
116
tu_cs_emit(fence_cs, at->fence_counter);
120
list_addtail(&submission_data->node, &at->pending_submission_data);
122
return submission_data;
126
free_submission_data(struct tu_submission_data *data)
128
list_del(&data->node);
129
tu_cs_finish(&data->fence_cs);
/* Fold a single field into an XXH64 streaming hash state. */
#define APPEND_TO_HASH(state, field) \
   XXH64_update(state, &field, sizeof(field));
138
hash_renderpass_instance(const struct tu_render_pass *pass,
139
const struct tu_framebuffer *framebuffer,
140
const struct tu_cmd_buffer *cmd) {
141
XXH64_state_t hash_state;
142
XXH64_reset(&hash_state, 0);
144
APPEND_TO_HASH(&hash_state, framebuffer->width);
145
APPEND_TO_HASH(&hash_state, framebuffer->height);
146
APPEND_TO_HASH(&hash_state, framebuffer->layers);
148
APPEND_TO_HASH(&hash_state, pass->attachment_count);
149
XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));
151
for (unsigned i = 0; i < pass->attachment_count; i++) {
152
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
153
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
154
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk_format);
155
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->layer_count);
156
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->level_count);
159
APPEND_TO_HASH(&hash_state, pass->subpass_count);
160
for (unsigned i = 0; i < pass->subpass_count; i++) {
161
APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
162
APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
163
APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
164
APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
167
return XXH64_digest(&hash_state);
171
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
173
tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
174
list_del(&result->node);
179
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
181
tu_autotune_free_results_locked(dev, &history->results);
186
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
188
bool has_history = false;
190
/* If the lock contantion would be found in the wild -
191
* we could use try_lock here.
193
u_rwlock_rdlock(&at->ht_lock);
194
struct hash_entry *entry =
195
_mesa_hash_table_search(at->ht, &rp_key);
197
struct tu_renderpass_history *history = entry->data;
198
if (history->num_results > 0) {
199
*avg_samples = p_atomic_read(&history->avg_samples);
203
u_rwlock_rdunlock(&at->ht_lock);
208
static struct tu_renderpass_result *
209
create_history_result(struct tu_autotune *at, uint64_t rp_key)
211
struct tu_renderpass_result *result = calloc(1, sizeof(*result));
212
result->rp_key = rp_key;
218
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
219
struct tu_renderpass_result *result)
221
list_delinit(&result->node);
222
list_add(&result->node, &history->results);
224
if (history->num_results < MAX_HISTORY_RESULTS) {
225
history->num_results++;
227
/* Once above the limit, start popping old results off the
230
struct tu_renderpass_result *old_result =
231
list_last_entry(&history->results, struct tu_renderpass_result, node);
232
mtx_lock(&dev->autotune_mutex);
233
free_result(dev, old_result);
234
mtx_unlock(&dev->autotune_mutex);
237
/* Do calculations here to avoid locking history in tu_autotune_use_bypass */
238
uint32_t total_samples = 0;
239
list_for_each_entry(struct tu_renderpass_result, result,
240
&history->results, node) {
241
total_samples += result->samples_passed;
244
float avg_samples = (float)total_samples / (float)history->num_results;
245
p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
249
process_results(struct tu_autotune *at)
251
struct tu_device *dev = at->device;
252
struct tu6_global *global = dev->global_bo->map;
253
uint32_t current_fence = global->autotune_fence;
255
list_for_each_entry_safe(struct tu_renderpass_result, result,
256
&at->pending_results, node) {
257
if (result->fence > current_fence)
260
struct tu_renderpass_history *history = result->history;
261
result->samples_passed =
262
result->samples->samples_end - result->samples->samples_start;
264
history_add_result(dev, history, result);
267
list_for_each_entry_safe(struct tu_submission_data, submission_data,
268
&at->pending_submission_data, node) {
269
if (submission_data->fence > current_fence)
272
free_submission_data(submission_data);
277
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
279
bool one_time_submit = cmdbuf->usage_flags &
280
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
282
if (one_time_submit) {
283
/* We can just steal the list since it won't be resubmitted again */
284
list_splicetail(&cmdbuf->renderpass_autotune_results,
285
&at->pending_results);
286
list_inithead(&cmdbuf->renderpass_autotune_results);
288
list_for_each_entry_safe(struct tu_renderpass_result, result,
289
&cmdbuf->renderpass_autotune_results, node) {
290
/* TODO: copying each result isn't nice */
291
struct tu_renderpass_result *copy = malloc(sizeof(*result));
293
tu_bo_get_ref(copy->bo.bo);
294
list_addtail(©->node, &at->pending_results);
300
tu_autotune_on_submit(struct tu_device *dev,
301
struct tu_autotune *at,
302
struct tu_cmd_buffer **cmd_buffers,
303
uint32_t cmd_buffer_count)
305
/* We are single-threaded here */
309
/* pre-increment so zero isn't valid fence */
310
uint32_t new_fence = ++at->fence_counter;
311
uint32_t result_buffers = 0;
313
/* Create history entries here to minimize work and locking being
314
* done on renderpass end.
316
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
317
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
318
list_for_each_entry_safe(struct tu_renderpass_result, result,
319
&cmdbuf->renderpass_autotune_results, node) {
320
struct tu_renderpass_history *history;
321
struct hash_entry *entry =
322
_mesa_hash_table_search(at->ht, &result->rp_key);
324
history = calloc(1, sizeof(*history));
325
history->key = result->rp_key;
326
list_inithead(&history->results);
328
u_rwlock_wrlock(&at->ht_lock);
329
_mesa_hash_table_insert(at->ht, &history->key, history);
330
u_rwlock_wrunlock(&at->ht_lock);
332
history = (struct tu_renderpass_history *) entry->data;
335
history->last_fence = new_fence;
337
result->fence = new_fence;
338
result->history = history;
341
if (!list_is_empty(&cmdbuf->renderpass_autotune_results)) {
346
struct tu_submission_data *submission_data =
347
create_submission_data(dev, at);
348
submission_data->buffers_count = result_buffers;
350
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
351
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
352
if (list_is_empty(&cmdbuf->renderpass_autotune_results))
355
queue_pending_results(at, cmdbuf);
358
#if TU_AUTOTUNE_DEBUG_LOG != 0
359
mesa_logi("Total history entries: %u", at->ht->entries);
362
/* Cleanup old entries from history table. The assumption
363
* here is that application doesn't hold many old unsubmitted
364
* command buffers, otherwise this table may grow big.
366
hash_table_foreach(at->ht, entry) {
367
struct tu_renderpass_history *history = entry->data;
368
if (history->last_fence == 0 ||
369
(new_fence - history->last_fence) <= MAX_HISTORY_LIFETIME)
372
#if TU_AUTOTUNE_DEBUG_LOG != 0
373
mesa_logi("Removed old history entry %016"PRIx64"", history->key);
376
u_rwlock_wrlock(&at->ht_lock);
377
_mesa_hash_table_remove_key(at->ht, &history->key);
378
u_rwlock_wrunlock(&at->ht_lock);
380
mtx_lock(&dev->autotune_mutex);
381
free_history(dev, history);
382
mtx_unlock(&dev->autotune_mutex);
385
return &submission_data->fence_cs;
/* Hash-table key compare: keys are 64-bit renderpass-instance hashes. */
static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}
/* Hash-table hash: the key is already a high-quality 64-bit hash, so
 * simply truncate to its low 32 bits. */
static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}
401
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
405
at->ht = _mesa_hash_table_create(NULL,
407
renderpass_key_equals);
408
u_rwlock_init(&at->ht_lock);
410
list_inithead(&at->pending_results);
411
list_inithead(&at->pending_submission_data);
417
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
419
#if TU_AUTOTUNE_LOG_AT_FINISH != 0
420
while (!list_is_empty(&at->pending_results)) {
424
hash_table_foreach(at->ht, entry) {
425
struct tu_renderpass_history *history = entry->data;
427
mesa_logi("%016"PRIx64" \tavg_passed=%u results=%u",
428
history->key, history->avg_samples, history->num_results);
432
tu_autotune_free_results(dev, &at->pending_results);
434
mtx_lock(&dev->autotune_mutex);
435
hash_table_foreach(at->ht, entry) {
436
struct tu_renderpass_history *history = entry->data;
437
free_history(dev, history);
439
mtx_unlock(&dev->autotune_mutex);
441
list_for_each_entry_safe(struct tu_submission_data, submission_data,
442
&at->pending_submission_data, node) {
443
free_submission_data(submission_data);
446
_mesa_hash_table_destroy(at->ht, NULL);
447
u_rwlock_destroy(&at->ht_lock);
451
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
452
uint32_t cmd_buffer_count)
454
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
455
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
456
if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
464
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
466
list_for_each_entry_safe(struct tu_renderpass_result, result,
468
free_result(dev, result);
473
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
475
mtx_lock(&dev->autotune_mutex);
476
tu_autotune_free_results_locked(dev, results);
477
mtx_unlock(&dev->autotune_mutex);
481
fallback_use_bypass(const struct tu_render_pass *pass,
482
const struct tu_framebuffer *framebuffer,
483
const struct tu_cmd_buffer *cmd_buffer)
485
if (cmd_buffer->state.drawcall_count > 5)
488
for (unsigned i = 0; i < pass->subpass_count; i++) {
489
if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
497
tu_autotune_use_bypass(struct tu_autotune *at,
498
struct tu_cmd_buffer *cmd_buffer,
499
struct tu_renderpass_result **autotune_result)
501
const struct tu_render_pass *pass = cmd_buffer->state.pass;
502
const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
504
for (unsigned i = 0; i < pass->subpass_count; i++) {
505
const struct tu_subpass *subpass = &pass->subpasses[i];
506
/* GMEM works much faster in this case */
507
if (subpass->raster_order_attachment_access)
510
/* Would be very slow in sysmem mode because we have to enable
511
* SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE)
513
if (subpass->feedback_loop_color || subpass->feedback_loop_ds)
517
/* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
518
* we would have to allocate GPU memory at the submit time and copy
520
* Native games ususally don't use it, Zink and DXVK don't use it,
521
* D3D12 doesn't have such concept.
523
bool simultaneous_use =
524
cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
526
if (!at->enabled || simultaneous_use)
527
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
529
/* We use 64bit hash as a key since we don't fear rare hash collision,
530
* the worst that would happen is sysmem being selected when it should
531
* have not, and with 64bit it would be extremely rare.
533
* Q: Why not make the key from framebuffer + renderpass pointers?
534
* A: At least DXVK creates new framebuffers each frame while keeping
535
* renderpasses the same. Also we want to support replaying a single
536
* frame in a loop for testing.
538
uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);
540
*autotune_result = create_history_result(at, renderpass_key);
542
uint32_t avg_samples = 0;
543
if (get_history(at, renderpass_key, &avg_samples)) {
544
/* TODO we should account for load/stores/clears/resolves especially
545
* with low drawcall count and ~fb_size samples passed, in D3D11 games
546
* we are seeing many renderpasses like:
547
* - color attachment load
548
* - single fullscreen draw
549
* - color attachment store
552
/* Low sample count could mean there was only a clear.. or there was
553
* a clear plus draws that touch no or few samples
555
if (avg_samples < 500) {
556
#if TU_AUTOTUNE_DEBUG_LOG != 0
557
mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
558
renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
563
/* Cost-per-sample is an estimate for the average number of reads+
564
* writes for a given passed sample.
566
float sample_cost = cmd_buffer->state.total_drawcalls_cost;
567
sample_cost /= cmd_buffer->state.drawcall_count;
569
float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
571
bool select_sysmem = single_draw_cost < 6000.0;
573
#if TU_AUTOTUNE_DEBUG_LOG != 0
574
mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
575
"sample_cost=%f, single_draw_cost=%f selecting %s",
576
renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
577
sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
580
return select_sysmem;
583
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
587
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
589
struct tu_renderpass_result *autotune_result)
591
if (!autotune_result)
594
struct tu_device *dev = cmd->device;
596
static const uint32_t size = sizeof(struct tu_renderpass_samples);
598
mtx_lock(&dev->autotune_mutex);
599
VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
600
mtx_unlock(&dev->autotune_mutex);
601
if (ret != VK_SUCCESS) {
602
autotune_result->bo.iova = 0;
606
uint64_t result_iova = autotune_result->bo.iova;
608
autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);
610
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
612
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
614
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
615
tu_cs_emit(cs, ZPASS_DONE);
618
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
620
struct tu_renderpass_result *autotune_result)
622
if (!autotune_result)
625
if (!autotune_result->bo.iova)
628
uint64_t result_iova = autotune_result->bo.iova +
629
offsetof(struct tu_renderpass_samples, samples_end);
631
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
633
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
635
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
636
tu_cs_emit(cs, ZPASS_DONE);