~mmach/netext73/mesa-haswell

Viewing changes to src/panfrost/lib/pan_indirect_draw.c

  • Committer: mmach
  • Date: 2022-09-22 19:56:13 UTC
  • Revision ID: netbit73@gmail.com-20220922195613-wtik9mmy20tmor0i
/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

#define MIN_MAX_JOBS 128
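
/* Note: the macros above emit NIR control flow through the local nir_builder
 * "b". The C statements between IF()/ENDIF always run at shader-build time,
 * but the instructions they emit land in the then-branch because
 * nir_push_if()/nir_pop_if() move the builder cursor. A typical use:
 *
 *     IF (nir_ieq_imm(b, x, 0)) {
 *             ... emit then-branch instructions ...
 *     } ELSE {
 *             ... emit else-branch instructions ...
 *     } ENDIF
 *
 * The braces are purely cosmetic; get_index_min_max() below uses the
 * brace-less form to the same effect.
 */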

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. These are stored in a UBO. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
};

static nir_ssa_def *
get_input_data(nir_builder *b, unsigned offset, unsigned size)
{
        assert(!(offset & 0x3));
        assert(size && !(size & 0x3));

        return nir_load_ubo(b, 1, size,
                            nir_imm_int(b, 0),
                            nir_imm_int(b, offset),
                            .align_mul = 4,
                            .align_offset = 0,
                            .range_base = 0,
                            .range = ~0);
}

#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)
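
/* Note: get_input_field() derives both the UBO offset and the load size from
 * the struct layout itself, so the CPU-side indirect_draw_inputs definition is
 * the single source of truth. For instance, get_input_field(b, draw_buf)
 * expands to a 64-bit nir_load_ubo() at
 * offsetof(struct indirect_draw_inputs, draw_buf).
 */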

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "",
                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
                                                       ",idvs" : "");
        }

        nir_builder *b = &builder->b;
        nir_variable_create(b->shader, nir_var_mem_ubo,
                            glsl_uint_type(), "inputs");
        b->shader->info.num_ubos++;

        extract_inputs(builder);
}

static void
update_dcd(struct indirect_draw_shader_builder *builder,
           nir_ssa_def *job_ptr,
           unsigned draw_offset)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        update_dcd(builder, job_ptr, draw_offset);

        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
                assert(type == MALI_JOB_TYPE_TILER);

                update_dcd(builder, job_ptr,
                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
        }
}

static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64-bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}
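
/* Note on split_div(): the divisor is split into a shift r = floor(log2(div)),
 * a 32-bit fixed-point reciprocal d ~= (2^(32+r) + div/2) / div, and a rounding
 * bit e, letting the hardware evaluate "index / div" as a multiply-and-shift.
 * This appears to be the draw-time NIR counterpart of the CPU-side magic
 * divisor computation used for NPOT instance divisors.
 */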

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                        nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}
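
/* Note: both fixups above patch the attribute's offset word (WORD(1)) in
 * place: the start_instance term adds stride * start_instance / instance_div
 * so per-instance data starts at the requested base instance, while the
 * offset_start term backs out stride * offset_start, compensating for the
 * DRAW.offset_start that update_dcd() writes into the job descriptor.
 */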

/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                         get_address(b, builder->attribs.attrib_bufs,
                                     nir_imul_imm(b, attrib_idx,
                                                  2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                         get_address(b, builder->attribs.attribs,
                                     nir_imul_imm(b, attrib_idx,
                                                  pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                        MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                        builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}
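
/* Note: varying buffers are carved out of a per-device heap at draw time:
 * each buffer receives stride * padded_instance_size * instance_count bytes
 * (64-byte aligned), and the bumped heap pointer is stored back into the
 * draw context so the next indirect draw in the batch allocates right after
 * this one.
 */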

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}
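
/* Note: the two invocation words pack the vertex and instance counts (minus
 * one) into a single bit-field split at bit "split", with the split position
 * itself recorded in the second word; this mirrors what
 * panfrost_pack_work_groups_compute() does on the CPU, as noted above.
 */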

static nir_ssa_def *
nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
{
        assert(pot != 0 && util_is_power_of_two_or_zero(pot));

        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
}

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}
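
/* Note: the padded count produced above is of the form base << exp with base
 * odd and <= 11, which the hardware's instancing divisors can encode; e.g. a
 * raw count of 37 pads to 40 = 5 << 3. The packed form is
 * exp | (base >> 1) << 5, ready to be ORed into descriptors as an instance
 * size.
 */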

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);

        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                update_job(builder, MALI_JOB_TYPE_VERTEX);

        update_job(builder, MALI_JOB_TYPE_TILER);
}


static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}
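
/* Note: word 4 of the job header stores the job type in bits [1:7] (hence
 * the 0xffffff01 mask and the << 1 shift), so rewriting it to
 * MALI_JOB_TYPE_NULL makes the GPU skip the job while leaving the rest of
 * the job chain intact.
 */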

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset, we need to ignore any data that's
                 * outside the requested range. We also handle ranges covering
                 * less than 2 words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data; we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}
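
/* Note: for indexed draws the effective vertex range is [min, max] from the
 * min/max search: the raw per-instance size becomes max - min + 1, attribute
 * fetch starts at offset_start = min + index_bias, and base_vertex_offset =
 * -min rebases the indices against the start of that range.
 */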

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                        set_null_job(builder, builder->jobs.vertex_job);

                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                nir_ssa_def *count = builder->instance_size.raw;

                /* IDVS requires padding to a multiple of 4 */
                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
                        count = nir_align_pot(b, count, 4);

                builder->instance_size.padded =
                        get_padded_count(b, count,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}
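
/* Note: the three guarded stores above patch the first_vertex, base_vertex
 * and base_instance sysvals directly in memory; the input pointers are
 * presumably left at zero when the current vertex shader does not read the
 * corresponding built-in, which is what the non-null checks test for.
 */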

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));


        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}
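
/* Note: the search runs as MIN_MAX_JOBS parallel invocations, each scanning
 * one aligned 32-bit word and then striding MIN_MAX_JOBS words forward;
 * per-thread results are merged through the global atomics in update_min()/
 * update_max(). Restart indices are excluded by keeping the previous min/max
 * whenever the scanned value matches restart_index.
 */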

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}
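
/* Note: the cache key for draw-patching shaders is the flags word itself,
 * with its index-size field re-encoded as log2(index_size) + 1; the min/max
 * search variants live in a separate id range starting at
 * PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX.
 */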

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);

#if PAN_ARCH <= 5
                address |= shader_info.midgard.first_tag;
#endif

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}
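
/* Note: shaders are built lazily on first use and memoized per device. The
 * NIR build and compile happen outside the lock; only the RSD upload is
 * serialized, and the !draw_shader->rsd re-check makes a racing thread skip
 * the redundant upload.
 */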
1195
 
 
1196
 
static mali_ptr
1197
 
get_tls(const struct panfrost_device *dev)
1198
 
{
1199
 
        return dev->indirect_draw_shaders.states->ptr.gpu +
1200
 
               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1201
 
}
static mali_ptr
get_ubos(struct pan_pool *pool,
         const struct indirect_draw_inputs *inputs)
{
        struct panfrost_ptr inputs_buf =
                pan_pool_alloc_aligned(pool, sizeof(*inputs), 16);

        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));

        struct panfrost_ptr ubos_buf =
                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
                cfg.pointer = inputs_buf.gpu;
        }

        return ubos_buf.gpu;
}
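
/* Builds the push constant buffer by copying, word by word, the fields the
 * compiler decided to push (shader->push.words[]). Returns 0 when the
 * variant has no push constants.
 */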
static mali_ptr
get_push_uniforms(struct pan_pool *pool,
                  const struct pan_indirect_draw_shader *shader,
                  const struct indirect_draw_inputs *inputs)
{
        if (!shader->push.count)
                return 0;

        struct panfrost_ptr push_consts_buf =
                pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);
        uint32_t *out = push_consts_buf.cpu;
        uint8_t *in = (uint8_t *)inputs;

        for (unsigned i = 0; i < shader->push.count; ++i)
                memcpy(out + i, in + shader->push.words[i].offset, 4);

        return push_consts_buf.gpu;
}
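
/* Lazily allocates the device-wide resources shared by all indirect draws:
 * the BO holding the renderer states plus the thread storage descriptor, and
 * the growable heap backing the varyings.
 */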
static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        };

        /* FIXME: We allocate 512M of growable memory, so only the pages that
         * are actually touched get backed, but two problems remain:
         * - backing happens 2M at a time, which might be more than we
         *   actually need
         * - the heap is attached to the device to speed up subsequent
         *   indirect draws, which also means it's never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}
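
/* Emits a compute job scanning the index buffer for the smallest and largest
 * index, which the draw-patching shader needs to derive the vertex count of
 * an indexed draw. Returns the job index (0 for non-indexed draws, i.e. no
 * dependency).
 */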
static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx,
                                   mali_ptr ubos)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, true);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
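
        /* The scan is spread across MIN_MAX_JOBS workgroups, each reducing
         * its slice of the index buffer into the shared min_max_context (see
         * get_index_min_max()).
         */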
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}
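
/* Emits the compute job that patches the provided vertex/tiler jobs
 * according to the content of the indirect draw buffer, preceded, for
 * indexed draws, by an index min/max search job. Returns the patch job
 * index so the caller can order dependent jobs after it.
 */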
unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };
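
        /* Indexed draws also get a min/max context: it starts as an empty
         * range (min = UINT32_MAX, max = 0) that the search job widens as it
         * scans the index buffer.
         */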
        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *ctx = min_max_ctx_ptr.cpu;

                ctx->min = UINT32_MAX;
                ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, false);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        mali_ptr ubos = get_ubos(pool, &inputs);

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
        }
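
        /* Chain the jobs: the patch job must wait on the index min/max
         * search (local_dep) and on the previous indirect draw's patch job
         * (global_dep).
         */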
        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx, ubos);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}
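
/* Device-init hook: only the lock and the binary pool are set up here;
 * everything else is allocated lazily on first use.
 */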
void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}
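
/* panfrost_bo_unreference() is a no-op on NULL, so cleanup is safe even if
 * no indirect draw was ever emitted and the BOs were never allocated.
 */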
void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}