/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * This file replaces libdrm's radeon_cs_gem with our own implementation.
 * It's optimized specifically for Radeon DRM.
 * Adding buffers and space checking are faster and simpler than their
 * counterparts in libdrm (the time complexity of all the functions
 * is O(1) in nearly all scenarios, thanks to hashing).
 *
 * cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
 * also adds the size of 'buf' to the used_gart and used_vram winsys variables
 * based on the domains, which are simply OR'd for accounting purposes.
 * The addition is skipped if the reloc is already present in the list, but it
 * still accounts for any newly referenced domains.
 *
 * cs_validate is then called, which just checks:
 *    used_vram/gart < vram/gart_size * 0.8
 * The 0.8 factor allows for some memory fragmentation. If the validation
 * fails, the pipe driver flushes the CS and tries the validation again,
 * i.e. it validates only that one operation. If it fails again, it drops
 * the operation on the floor and prints an error message to stderr.
 * (done in the pipe driver)
 *
 * cs_write_reloc(cs, buf) just writes a reloc that has been added using
 * cs_add_buffer. The read_domain and write_domain parameters have been removed,
 * because we already specify them in cs_add_buffer.
 */
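/*
 * Illustrative sketch (not part of this file): the pipe-driver pattern the
 * comment above describes, written against the current winsys entry points
 * (cs_add_buffer now takes a usage/priority mask and a domain mask rather
 * than separate read/write domains). driver_flush_cs() and emit_copy() are
 * hypothetical helpers used only for this example.
 *
 *    ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
 *    if (!ws->cs_validate(cs)) {
 *       driver_flush_cs(ctx);                // submit what we have so far
 *       ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
 *       if (!ws->cs_validate(cs)) {
 *          fprintf(stderr, "radeon: buffer does not fit, dropping operation\n");
 *          return;                           // drop the operation on the floor
 *       }
 *    }
 *    emit_copy(cs, buf);                     // finally write the packets
 */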
#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
ctx->ws = (struct radeon_drm_winsys*)ws;
ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
return (struct radeon_winsys_ctx*)ctx;

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;
unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

if (ctx->gpu_reset_counter == latest) {
return PIPE_NO_RESET;

ctx->gpu_reset_counter = latest;
return PIPE_UNKNOWN_CONTEXT_RESET;
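/* The reset counter exposed by the kernel is global to the GPU, so all this
 * can tell us is that some reset happened since this context last checked.
 * The legacy radeon kernel interface does not report which context caused
 * the hang, which is why the status is PIPE_UNKNOWN_CONTEXT_RESET rather
 * than a guilty/innocent distinction. */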
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
struct radeon_drm_winsys *ws)
csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
csc->chunks[0].length_dw = 0;
csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
csc->chunks[1].length_dw = 0;
csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
csc->chunks[2].length_dw = 2;
csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
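/* Submission layout: the DRM_RADEON_CS ioctl consumes an array of chunks.
 * Chunk 0 holds the IB dwords (csc->buf), chunk 1 the relocation entries
 * (csc->relocs), and chunk 2 two flag dwords (csc->flags). chunk_array stores
 * user-space pointers to those three descriptors, and csc->cs.chunks points
 * at the array; this is the structure drmCommandWriteRead() passes to the
 * kernel in radeon_drm_cs_emit_ioctl_oneshot() below. */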
for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
csc->reloc_indices_hashlist[i] = -1;

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
for (i = 0; i < csc->num_relocs; i++) {
p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
for (i = 0; i < csc->num_slab_buffers; ++i) {
p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);

csc->num_validated_relocs = 0;
csc->num_slab_buffers = 0;
csc->chunks[0].length_dw = 0;
csc->chunks[1].length_dw = 0;

for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
csc->reloc_indices_hashlist[i] = -1;

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
radeon_cs_context_cleanup(csc);
FREE(csc->slab_buffers);
FREE(csc->relocs_bo);

radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
struct radeon_winsys_ctx *ctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
bool stop_exec_on_failure)
struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
struct radeon_drm_cs *cs;

cs = CALLOC_STRUCT(radeon_drm_cs);
util_queue_fence_init(&cs->flush_completed);
cs->flush_cs = flush;
cs->flush_data = flush_ctx;

if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
radeon_destroy_cs_context(&cs->csc1);

/* Set the first command buffer as current. */
cs->ring_type = ring_type;

memset(rcs, 0, sizeof(*rcs));
rcs->current.buf = cs->csc->buf;
rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);

p_atomic_inc(&ws->num_cs);
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
struct radeon_bo_item *buffers;
unsigned num_buffers;
int i = csc->reloc_indices_hashlist[hash];

buffers = csc->relocs_bo;
num_buffers = csc->num_relocs;
buffers = csc->slab_buffers;
num_buffers = csc->num_slab_buffers;

/* not found or found */
if (i == -1 || (i < num_buffers && buffers[i].bo == bo))

/* Hash collision, look for the BO in the list of relocs linearly. */
for (i = num_buffers - 1; i >= 0; i--) {
if (buffers[i].bo == bo) {
/* Put this reloc in the hash list.
 * This will prevent additional hash collisions if there are
 * several consecutive lookup_buffer calls for the same buffer.
 *
 * Example: Assuming buffers A,B,C collide in the hash list,
 * the following sequence of relocs:
 *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
 * will collide here: ^ and here:   ^,
 * meaning that we should get very few collisions in the end. */
csc->reloc_indices_hashlist[hash] = i;
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
struct radeon_bo *bo)
struct radeon_cs_context *csc = cs->csc;
struct drm_radeon_cs_reloc *reloc;
unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);

i = radeon_lookup_buffer(csc, bo);

/* For async DMA, every add_buffer call must add a buffer to the list
 * no matter how many duplicates there are. This is due to the fact that
 * the DMA CS checker doesn't use NOP packets for offset patching,
 * but always uses the i-th buffer from the list to patch the i-th
 * offset. If there are N offsets in a DMA CS, there must also be N
 * buffers in the relocation list.
 *
 * This doesn't have to be done if virtual memory is enabled,
 * because there is no offset patching with virtual memory.
 */
if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {

/* New relocation, check if the backing array is large enough. */
if (csc->num_relocs >= csc->max_relocs) {
csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
csc->relocs_bo = realloc(csc->relocs_bo, size);

size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
csc->relocs = realloc(csc->relocs, size);

csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
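/* Growth policy: the arrays grow by at least 16 entries, or by ~1.3x once
 * they are larger (e.g. 64 -> 83 relocations), keeping add_buffer amortized
 * O(1). Since realloc() may move csc->relocs, the relocation chunk's
 * user-space pointer must be refreshed here too, or the kernel would be
 * handed a stale pointer on the next submission. */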
/* Initialize the new relocation. */
csc->relocs_bo[csc->num_relocs].bo = NULL;
csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
p_atomic_inc(&bo->num_cs_references);
reloc = &csc->relocs[csc->num_relocs];
reloc->handle = bo->handle;
reloc->read_domains = 0;
reloc->write_domain = 0;

csc->reloc_indices_hashlist[hash] = csc->num_relocs;

csc->chunks[1].length_dw += RELOC_DWORDS;

return csc->num_relocs++;
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
struct radeon_bo *bo)
struct radeon_cs_context *csc = cs->csc;
struct radeon_bo_item *item;

idx = radeon_lookup_buffer(csc, bo);

real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

/* Check if the backing array is large enough. */
if (csc->num_slab_buffers >= csc->max_slab_buffers) {
unsigned new_max = MAX2(csc->max_slab_buffers + 16,
(unsigned)(csc->max_slab_buffers * 1.3));
struct radeon_bo_item *new_buffers =
REALLOC(csc->slab_buffers,
csc->max_slab_buffers * sizeof(*new_buffers),
new_max * sizeof(*new_buffers));
fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");

csc->max_slab_buffers = new_max;
csc->slab_buffers = new_buffers;

/* Initialize the new relocation. */
idx = csc->num_slab_buffers++;
item = &csc->slab_buffers[idx];

item->u.slab.real_idx = real_idx;
radeon_ws_bo_reference(&item->bo, bo);
p_atomic_inc(&bo->num_cs_references);

hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
csc->reloc_indices_hashlist[hash] = idx;
static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
struct pb_buffer *buf,
enum radeon_bo_domain domains)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_bo *bo = (struct radeon_bo*)buf;
enum radeon_bo_domain added_domains;

/* If VRAM is just stolen system memory, allow both VRAM and
 * GTT, whichever has free space. If a buffer is evicted from
 * VRAM to GTT, it will stay there.
 */
if (!cs->ws->info.has_dedicated_vram)
domains |= RADEON_DOMAIN_GTT;

enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
struct drm_radeon_cs_reloc *reloc;

index = radeon_lookup_or_add_slab_buffer(cs, bo);
index = cs->csc->slab_buffers[index].u.slab.real_idx;
index = radeon_lookup_or_add_real_buffer(cs, bo);

reloc = &cs->csc->relocs[index];
added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
reloc->read_domains |= rd;
reloc->write_domain |= wd;

/* The priority must be in [0, 15]. It's used by the kernel memory management. */
unsigned priority = usage & RADEON_ALL_PRIORITIES;
unsigned bo_priority = util_last_bit(priority) / 2;
reloc->flags = MAX2(reloc->flags, bo_priority);
cs->csc->relocs_bo[index].u.real.priority_usage |= priority;
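/* Worked example: 'priority' is the priority bitmask extracted from 'usage'.
 * util_last_bit() returns 1 + the index of the highest set bit (0 for 0), so
 * a usage whose highest priority bit is bit 7 gives bo_priority = 8 / 2 = 4,
 * within the kernel's [0, 15] range. MAX2() keeps the highest priority the
 * buffer has ever been added with, while the raw bitmask is accumulated in
 * priority_usage and reported through cs_get_buffer_list(). */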
if (added_domains & RADEON_DOMAIN_VRAM)
rcs->used_vram_kb += bo->base.size / 1024;
else if (added_domains & RADEON_DOMAIN_GTT)
rcs->used_gart_kb += bo->base.size / 1024;

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
struct pb_buffer *buf)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;
cs->csc->num_validated_relocs = cs->csc->num_relocs;

/* Remove lately-added buffers. The validation failed with them
 * and the CS is about to be flushed because of that. Keep only
 * the already-validated buffers. */
for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
cs->csc->num_relocs = cs->csc->num_validated_relocs;

/* Flush if there are any relocs. Clean up otherwise. */
if (cs->csc->num_relocs) {
cs->flush_cs(cs->flush_data,
RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
radeon_cs_context_cleanup(cs->csc);
rcs->used_vram_kb = 0;
rcs->used_gart_kb = 0;

assert(rcs->current.cdw == 0);
if (rcs->current.cdw != 0) {
fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
assert(rcs->current.cdw <= rcs->current.max_dw);
return rcs->current.max_dw - rcs->current.cdw >= dw;

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
struct radeon_bo_list_item *list)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

for (i = 0; i < cs->csc->num_relocs; i++) {
list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;

return cs->csc->num_relocs;
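/* Usage note (assumption, following the usual gallium winsys convention):
 * callers are expected to query the count first and then fetch the entries,
 * e.g.:
 *
 *    unsigned n = ws->cs_get_buffer_list(cs, NULL);
 *    struct radeon_bo_list_item *items = MALLOC(n * sizeof(*items));
 *    ws->cs_get_buffer_list(cs, items);
 */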
void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;

r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
&csc->cs, sizeof(struct drm_radeon_cs));
fprintf(stderr, "radeon: Not enough memory for command submission.\n");
else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
for (i = 0; i < csc->chunks[0].length_dw; i++) {
fprintf(stderr, "0x%08X\n", csc->buf[i]);
fprintf(stderr, "radeon: The kernel rejected CS, "
"see dmesg for more information (%i).\n", r);

for (i = 0; i < csc->num_relocs; i++)
p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
for (i = 0; i < csc->num_slab_buffers; i++)
p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

radeon_cs_context_cleanup(csc);
/*
 * Make sure previous submissions of this cs are completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

/* Wait for any pending ioctl of this CS to complete. */
if (util_queue_is_initialized(&cs->ws->cs_queue))
util_queue_fence_wait(&cs->flush_completed);
/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a bo participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
assert(fence->num_cs_references);

/* Clean up older fences */
for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
if (bo->u.slab.fences[src]->num_cs_references) {
bo->u.slab.fences[dst] = bo->u.slab.fences[src];
radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
bo->u.slab.num_fences = dst;

/* Check available space for the new fence */
if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
unsigned new_max_fences = bo->u.slab.max_fences + 1;
struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
bo->u.slab.max_fences * sizeof(*new_fences),
new_max_fences * sizeof(*new_fences));
fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");

bo->u.slab.fences = new_fences;
bo->u.slab.max_fences = new_max_fences;

/* Add the new fence */
bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
bo->u.slab.num_fences++;
static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
struct pipe_fence_handle **pfence)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_cs_context *tmp;

switch (cs->ring_type) {
/* pad DMA ring to 8 DWs */
if (cs->ws->info.chip_class <= GFX6) {
while (rcs->current.cdw & 7)
radeon_emit(rcs, 0xf0000000); /* NOP packet */
while (rcs->current.cdw & 7)
radeon_emit(rcs, 0x00000000); /* NOP packet */
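/* Padding arithmetic: "cdw & 7" stays non-zero until cdw is a multiple of 8,
 * so e.g. a 13-dword DMA IB is padded to 16 dwords. Only the NOP encoding
 * differs between generations (0xf0000000 up to GFX6, 0x00000000 afterwards,
 * as above); the 8-dword alignment requirement is the same. */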
/* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
 * r6xx requires at least 4 dw alignment to avoid a hw bug.
 */
if (cs->ws->info.gfx_ib_pad_with_type2) {
while (rcs->current.cdw & 7)
radeon_emit(rcs, 0x80000000); /* type2 nop packet */
while (rcs->current.cdw & 7)
radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
while (rcs->current.cdw & 15)
radeon_emit(rcs, 0x80000000); /* type2 nop packet */

if (rcs->current.cdw > rcs->current.max_dw) {
fprintf(stderr, "radeon: command stream overflowed\n");

if (pfence || cs->csc->num_slab_buffers) {
struct pipe_fence_handle *fence;

if (cs->next_fence) {
fence = cs->next_fence;
cs->next_fence = NULL;
fence = radeon_cs_create_fence(rcs);

radeon_fence_reference(pfence, fence);

mtx_lock(&cs->ws->bo_fence_lock);
for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
p_atomic_inc(&bo->num_active_ioctls);
radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
mtx_unlock(&cs->ws->bo_fence_lock);

radeon_fence_reference(&fence, NULL);
radeon_fence_reference(&cs->next_fence, NULL);

radeon_drm_cs_sync_flush(rcs);
/* Swap command streams. */

/* If the CS is not empty and not overflowed, emit it in a separate thread. */
if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
!cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
unsigned i, num_relocs;

num_relocs = cs->cst->num_relocs;

cs->cst->chunks[0].length_dw = rcs->current.cdw;

for (i = 0; i < num_relocs; i++) {
/* Update the number of active asynchronous CS ioctls for the buffer. */
p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);

switch (cs->ring_type) {
cs->cst->flags[0] = 0;
cs->cst->flags[1] = RADEON_CS_RING_DMA;
cs->cst->cs.num_chunks = 3;
if (cs->ws->info.r600_has_virtual_memory) {
cs->cst->flags[0] |= RADEON_CS_USE_VM;

cs->cst->flags[0] = 0;
cs->cst->flags[1] = RADEON_CS_RING_UVD;
cs->cst->cs.num_chunks = 3;

cs->cst->flags[0] = 0;
cs->cst->flags[1] = RADEON_CS_RING_VCE;
cs->cst->cs.num_chunks = 3;

cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
cs->cst->flags[1] = RADEON_CS_RING_GFX;
cs->cst->cs.num_chunks = 3;

if (cs->ws->info.r600_has_virtual_memory) {
cs->cst->flags[0] |= RADEON_CS_USE_VM;
cs->cst->cs.num_chunks = 3;
if (flags & PIPE_FLUSH_END_OF_FRAME) {
cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
cs->cst->cs.num_chunks = 3;
if (cs->ring_type == RING_COMPUTE) {
cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
cs->cst->cs.num_chunks = 3;

if (util_queue_is_initialized(&cs->ws->cs_queue)) {
util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
if (!(flags & PIPE_FLUSH_ASYNC))
radeon_drm_cs_sync_flush(rcs);
radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
radeon_cs_context_cleanup(cs->cst);

/* Prepare a new CS. */
rcs->current.buf = cs->csc->buf;
rcs->current.cdw = 0;
rcs->used_vram_kb = 0;
rcs->used_gart_kb = 0;

if (cs->ring_type == RING_GFX)
cs->ws->num_gfx_IBs++;
else if (cs->ring_type == RING_DMA)
cs->ws->num_sdma_IBs++;

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

radeon_drm_cs_sync_flush(rcs);
util_queue_fence_destroy(&cs->flush_completed);
radeon_cs_context_cleanup(&cs->csc1);
radeon_cs_context_cleanup(&cs->csc2);
p_atomic_dec(&cs->ws->num_cs);
radeon_destroy_cs_context(&cs->csc1);
radeon_destroy_cs_context(&cs->csc2);
radeon_fence_reference(&cs->next_fence, NULL);
static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
struct pb_buffer *_buf,
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_bo *bo = (struct radeon_bo*)_buf;

if (!bo->num_cs_references)

index = radeon_lookup_buffer(cs->csc, bo);
index = cs->csc->slab_buffers[index].u.slab.real_idx;

if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct pb_buffer *fence;

/* Create a fence, which is a dummy BO. */
fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
RADEON_FLAG_NO_SUBALLOC
| RADEON_FLAG_NO_INTERPROCESS_SHARING);

/* Add the fence as a dummy relocation. */
cs->ws->base.cs_add_buffer(rcs, fence,
RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);

return (struct pipe_fence_handle*)fence;

static bool radeon_fence_wait(struct radeon_winsys *ws,
struct pipe_fence_handle *fence,
return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
RADEON_USAGE_READWRITE);
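/* Design note: a fence in this winsys is just a 1-byte dummy BO added to the
 * CS with READWRITE usage, so the kernel keeps it busy exactly as long as
 * that submission is executing. Waiting on a fence therefore reduces to
 * buffer_wait() on the dummy BO, and fence reference counting reuses
 * pb_reference() below. */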
static void radeon_fence_reference(struct pipe_fence_handle **dst,
struct pipe_fence_handle *src)
pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct pipe_fence_handle *fence = NULL;

if (cs->next_fence) {
radeon_fence_reference(&fence, cs->next_fence);

fence = radeon_cs_create_fence(rcs);

radeon_fence_reference(&cs->next_fence, fence);

radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
struct pipe_fence_handle *fence,
unsigned dependency_flags)
/* TODO: Handle the following unlikely multi-threaded scenario:
 *
 *   Thread 1 / Context 1          Thread 2 / Context 2
 *   --------------------          --------------------
 *   f = cs_get_next_fence()
 *                                 cs_add_fence_dependency(f)
 *
 * We currently assume that this does not happen because we don't support
 * asynchronous flushes on Radeon.
 */
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
ws->base.ctx_create = radeon_drm_ctx_create;
ws->base.ctx_destroy = radeon_drm_ctx_destroy;
ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
ws->base.cs_create = radeon_drm_cs_create;
ws->base.cs_destroy = radeon_drm_cs_destroy;
ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
ws->base.cs_validate = radeon_drm_cs_validate;
ws->base.cs_check_space = radeon_drm_cs_check_space;
ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
ws->base.cs_flush = radeon_drm_cs_flush;
ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
ws->base.fence_wait = radeon_fence_wait;
ws->base.fence_reference = radeon_fence_reference;