/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

#define MIN_MAX_JOBS 128
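
/* These macros assume a nir_builder named "b" is in scope, so the shader
 * construction below reads like structured control flow. A minimal usage
 * sketch (not from a specific call site):
 *
 *     LOOP {
 *             IF (exit_cond)
 *                     BREAK;
 *             ENDIF
 *             ...loop body...
 *     }
 */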

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags, index_size;
        bool index_min_max_search;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};
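
/* Both layouts mirror the GL indirect draw command layouts
 * (DrawArraysIndirectCommand and DrawElementsIndirectCommand), so an
 * application-provided indirect buffer can be read in place by the
 * patching shader.
 */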

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. Those are stored in a UBO. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to an uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
};
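
/* This struct is copied verbatim into a UBO (see get_ubos()) and read back
 * with get_input_field(), which derives UBO offsets from offsetof() on the
 * CPU-side struct, so any layout change must keep both sides in sync.
 */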

static nir_ssa_def *
get_input_data(nir_builder *b, unsigned offset, unsigned size)
{
        assert(!(offset & 0x3));
        assert(size && !(size & 0x3));

        return nir_load_ubo(b, 1, size,
                            nir_imm_int(b, 0),
                            nir_imm_int(b, offset),
                            .align_mul = 4,
                            .align_offset = 0,
                            .range_base = 0,
                            .range = ~0);
}

#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          (val), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "",
                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
                                                       ",idvs" : "");
        }

        nir_builder *b = &builder->b;
        nir_variable_create(b->shader, nir_var_mem_ubo,
                            glsl_uint_type(), "inputs");
        b->shader->info.num_ubos++;

        extract_inputs(builder);
}

static void
update_dcd(struct indirect_draw_shader_builder *builder,
           nir_ssa_def *job_ptr,
           unsigned draw_offset)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        update_dcd(builder, job_ptr, draw_offset);

        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
                assert(type == MALI_JOB_TYPE_TILER);

                update_dcd(builder, job_ptr,
                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
        }
}

static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}
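
/* What split_div() computes, roughly: with r = floor(log2(div)), it derives
 * d ~= 2^(32 + r) / div (rounded), plus a rounding bit e (placed at bit 5 of
 * r_e) when the division remainder exceeds div / 2, so the hardware can
 * evaluate i / div as a multiply-high-and-shift. This is a hedged sketch of
 * the intent, mirroring the CPU-side NPOT divisor setup used for direct
 * draws.
 */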

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Word 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                     nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                        get_address(b, builder->attribs.attrib_bufs,
                                    nir_imul_imm(b, attrib_idx,
                                                 2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                        get_address(b, builder->attribs.attribs,
                                    nir_imul_imm(b, attrib_idx,
                                                 pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                 MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                                 builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}
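
/* The vec2 stored in jobs.invocation holds the two Mali job "invocation"
 * words: the low word packs the maximum vertex index with the maximum
 * instance index shifted up by the split position, and the high word
 * encodes that split position (starting at bit 22) together with a
 * constant tag (2 << 28), roughly matching what the CPU-side helper emits
 * for a vertex-count x instance-count grid.
 */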

static nir_ssa_def *
nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
{
        assert(pot != 0 && util_is_power_of_two_or_zero(pot));

        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
}

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}
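
/* Padded counts have the form base * 2^exp with base an odd number no
 * larger than 11, which is what the arithmetic above converges to: val is
 * rounded up to the next such value, and the packed form is
 * exp | ((base >> 1) << 5), ready for the instancing divisor fields.
 */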

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);

        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                update_job(builder, MALI_JOB_TYPE_VERTEX);

        update_job(builder, MALI_JOB_TYPE_TILER);
}

static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset, we need to ignore any data that's
                 * outside the requested range. We also handle ranges that are
                 * covering less than 2 words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data, we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}
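
/* Negating min gives PRIMITIVE.base_vertex_offset: the attribute buffers
 * are rebased so that the smallest referenced index maps to slot 0, while
 * offset_start (min + index_bias) feeds DRAW.offset_start and the raw
 * instance size covers the inclusive [min, max] index range.
 */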

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                        set_null_job(builder, builder->jobs.vertex_job);

                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                nir_ssa_def *count = builder->instance_size.raw;

                /* IDVS requires padding to a multiple of 4 */
                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
                        count = nir_align_pot(b, count, 4);

                builder->instance_size.padded =
                        get_padded_count(b, count,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0), 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));

        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}
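
/* The scan above is parallelized across MIN_MAX_JOBS invocations: each
 * thread starts at its own 32-bit word past the aligned start and strides
 * by MIN_MAX_JOBS words, publishing its local result through the atomic
 * umin/umax helpers. panfrost_emit_index_min_max_search() dispatches a
 * matching MIN_MAX_JOBS x 1 x 1 grid.
 */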

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);

                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);
#if PAN_ARCH <= 5
                address |= shader_info.midgard.first_tag;
#endif

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}

static mali_ptr
get_tls(const struct panfrost_device *dev)
{
        return dev->indirect_draw_shaders.states->ptr.gpu +
               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
}

static mali_ptr
get_ubos(struct pan_pool *pool,
         const struct indirect_draw_inputs *inputs)
{
        struct panfrost_ptr inputs_buf =
                pan_pool_alloc_aligned(pool, sizeof(*inputs), 16);

        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));

        struct panfrost_ptr ubos_buf =
                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
                cfg.pointer = inputs_buf.gpu;
        }

        return ubos_buf.gpu;
}

static mali_ptr
get_push_uniforms(struct pan_pool *pool,
                  const struct pan_indirect_draw_shader *shader,
                  const struct indirect_draw_inputs *inputs)
{
        if (!shader->push.count)
                return 0;

        struct panfrost_ptr push_consts_buf =
                pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);
        uint32_t *out = push_consts_buf.cpu;
        uint8_t *in = (uint8_t *)inputs;

        for (unsigned i = 0; i < shader->push.count; ++i)
                memcpy(out + i, in + shader->push.words[i].offset, 4);

        return push_consts_buf.gpu;
}

static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        };

        /* FIXME: Currently allocating 512M of growable memory, meaning that we
         * only allocate what we really use, the problem is:
         * - allocation happens 2M at a time, which might be more than we
         *   actually need
         * - the memory is attached to the device to speed up subsequent
         *   indirect draws, but that also means it's never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx,
                                   mali_ptr ubos)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, true);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *ctx = min_max_ctx_ptr.cpu;

                ctx->min = UINT32_MAX;
                ctx->max = 0;

                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, false);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        mali_ptr ubos = get_ubos(pool, &inputs);

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
        }

        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx, ubos);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}

void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}