/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "nir.h"
#include "nir_builder.h"
#include "util/u_math.h"

/**
 * \file nir_lower_subgroups.c
 */
static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   /* Extract the requested 32-bit half of the 64-bit source. */
   nir_ssa_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      nir_src_copy(&intr->src[1], &intrin->src[1]);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}
static nir_ssa_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->dest.ssa, &intr_y->dest.ssa);
}
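
/* For illustration (not part of the pass): with this lowering, a 64-bit
 * subgroup operation such as
 *
 *    %r = read_invocation %v (64-bit), %id
 *
 * becomes roughly
 *
 *    %lo = unpack_64_2x32_split_x %v
 *    %hi = unpack_64_2x32_split_y %v
 *    %r  = pack_64_2x32_split (read_invocation %lo, %id),
 *                             (read_invocation %hi, %id)
 */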
static nir_ssa_def *
ballot_type_to_uint(nir_builder *b, nir_ssa_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}
static nir_ssa_def *
uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate. This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot. This comes up in Zink
    * for OpenGL on Vulkan. It's the job of the driver calling this lowering
    * pass to ensure that it has restricted subgroup sizes sufficiently that
    * we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_channels(b, value, nir_component_mask(num_components));

   return value;
}
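
/* A worked example, for illustration: converting a native 4 x 32-bit ballot
 * to the single 64-bit ballot GL_ARB_shader_ballot expects (bit_size = 64,
 * num_components = 1): total_bits is 64, which is no larger than the 128
 * source bits, so no padding happens; nir_bitcast_vector turns the uvec4
 * into a 2 x 64-bit vector; and nir_channels then keeps only component 0.
 */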
static nir_ssa_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin,
                            bool lower_to_32bit)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->dest.ssa.num_components > 1);

   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0],
                                        intrin->num_components);
   nir_ssa_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));

      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         /* invocation */
         nir_src_copy(&chan_intrin->src[1], &intrin->src[1]);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      if (lower_to_32bit && chan_intrin->src[0].ssa->bit_size == 64) {
         reads[i] = lower_subgroup_op_to_32bit(b, chan_intrin);
      } else {
         nir_builder_instr_insert(b, &chan_intrin->instr);
         reads[i] = &chan_intrin->dest.ssa;
      }
   }

   return nir_vec(b, reads, intrin->num_components);
}
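
/* For illustration: a read_invocation of a vec3 is rewritten as three
 * single-component read_invocations whose results are recombined with a
 * vec3, so backends only ever see scalar subgroup operations.
 */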
static nir_ssa_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   nir_ssa_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_ssa_dest_init(&chan_intrin->instr, &chan_intrin->dest,
                        1, intrin->dest.ssa.bit_size, NULL);
      chan_intrin->num_components = 1;
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      nir_builder_instr_insert(b, &chan_intrin->instr);

      /* AND the per-channel votes together: the vector votes "equal" only
       * if every channel does.
       */
      if (result) {
         result = nir_iand(b, result, &chan_intrin->dest.ssa);
      } else {
         result = &chan_intrin->dest.ssa;
      }
   }

   return result;
}
static nir_ssa_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_ssa_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_ssa_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_ssa_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}
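
/* Note on the encoding used below (an assumption documented here for
 * clarity, not taken from this file): masked_swizzle_amd applies
 *
 *    new_invocation = ((invocation & and_mask) | or_mask) ^ xor_mask
 *
 * with and_mask in bits [4:0] of the swizzle mask, or_mask in bits [9:5],
 * and xor_mask in bits [14:10]. (mask << 10) | 0x1f therefore keeps all
 * five lane bits and XORs them with the constant shuffle mask, which
 * matches shuffle_xor within a group of 32 invocations.
 */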
static nir_ssa_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin,
                         const nir_lower_subgroups_options *options)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   /* The swizzle's XOR mask is only 5 bits wide. */
   if (mask >= 32)
      return NULL;

   nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create(
      b->shader, nir_intrinsic_masked_swizzle_amd);
   swizzle->num_components = intrin->num_components;
   nir_src_copy(&swizzle->src[0], &intrin->src[0]);
   nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f);
   nir_ssa_dest_init(&swizzle->instr, &swizzle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   if (options->lower_to_scalar && swizzle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, swizzle, options->lower_shuffle_to_32bit);
   } else if (options->lower_shuffle_to_32bit && swizzle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, swizzle);
   }
   nir_builder_instr_insert(b, &swizzle->instr);
   return &swizzle->dest.ssa;
}
/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */
static nir_ssa_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {
      nir_ssa_def *result =
         lower_shuffle_to_swizzle(b, intrin, options);
      if (result)
         return result;
   }

   nir_ssa_def *index = nir_load_subgroup_invocation(b);
   bool is_shuffle = false;
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      assert(intrin->src[1].is_ssa);
      index = nir_ixor(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_up:
      assert(intrin->src[1].is_ssa);
      index = nir_isub(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_shuffle_down:
      assert(intrin->src[1].is_ssa);
      index = nir_iadd(b, index, intrin->src[1].ssa);
      is_shuffle = true;
      break;
   case nir_intrinsic_quad_broadcast:
      assert(intrin->src[1].is_ssa);
      index = nir_ior(b, nir_iand(b, index, nir_imm_int(b, ~0x3)),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   default:
      unreachable("Invalid intrinsic");
   }

   nir_intrinsic_instr *shuffle =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle);
   shuffle->num_components = intrin->num_components;
   nir_src_copy(&shuffle->src[0], &intrin->src[0]);
   shuffle->src[1] = nir_src_for_ssa(index);
   nir_ssa_dest_init(&shuffle->instr, &shuffle->dest,
                     intrin->dest.ssa.num_components,
                     intrin->dest.ssa.bit_size, NULL);

   bool lower_to_32bit = options->lower_shuffle_to_32bit && is_shuffle;
   if (options->lower_to_scalar && shuffle->num_components > 1) {
      return lower_subgroup_op_to_scalar(b, shuffle, lower_to_32bit);
   } else if (lower_to_32bit && shuffle->src[0].ssa->bit_size == 64) {
      return lower_subgroup_op_to_32bit(b, shuffle);
   }
   nir_builder_instr_insert(b, &shuffle->instr);
   return &shuffle->dest.ssa;
}
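
/* For illustration of the quad swaps above: invocation 6 sits at position 2
 * of its quad, so quad_swap_horizontal computes index 6 ^ 1 = 7,
 * quad_swap_vertical computes 6 ^ 2 = 4, and quad_swap_diagonal computes
 * 6 ^ 3 = 5, all of which stay within the same quad.
 */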
static const struct glsl_type *
glsl_type_for_ssa(nir_ssa_def *def)
{
   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type() :
      glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}
/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_ssa_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   assert(intrin->src[1].is_ssa);
   nir_ssa_def *val = intrin->src[0].ssa;
   nir_ssa_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. It only
    * won't be assigned the correct value already if the invocation it's
    * reading from isn't already killed off, that is, if it's later than its
    * own ID. Invocations where id <= gl_SubgroupInvocationID will be
    * assigned their result in the first if, and invocations where id >
    * gl_SubgroupInvocationID will be assigned their result in the second if.
    *
    * We do this more complicated loop rather than looping over all id's
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_ssa_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b); {
      nir_ssa_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_ssa_def *first_val = nir_read_first_invocation(b, val);
      nir_ssa_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id)); {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      } nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1)); {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id)); {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         } nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      } nir_pop_if(b, nif2);
   } nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}
static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}
/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 */
static nir_ssa_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_ssa_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* This only works if all the high bits are the same as bit 1. */
   assert((val >> 2) == (val & 0x2 ? -1 : 0));

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which 1 would be shifted into
    * already has the right value and all we have to do is fixup the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assert above. For example, if
    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component count.
    */
   nir_const_value min_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i].i32 = (i + 1) * options->ballot_bit_size;
   nir_ssa_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}
static nir_ssa_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_ssa_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}
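
/* A worked example, for illustration: with a single 64-bit ballot component
 * and gl_SubgroupInvocationID = 3, eq_mask is 1 << 3 = 0b1000, ge_mask is
 * ~0ull << 3 = ...11111000, and gt_mask is ~1ull << 3 = ...11110000. For
 * ge/gt, bits at or above the run-time subgroup size are still set here;
 * the callers AND in build_subgroup_mask() below to clear them.
 */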
/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_ssa_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_ssa_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
               nir_isub_imm(b, options->ballot_bit_size,
                            subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 for each component whose index is less than the subgroup size
    * divided by the ballot bitsize, and 0 otherwise. For example, with a
    * target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases. The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4] = { 0 };
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i].i32 = i * options->ballot_bit_size;
   nir_ssa_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_ssa_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}
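
/* A worked example of build_subgroup_mask above, for illustration: with a
 * 2 x 32-bit ballot and a run-time subgroup size of 16, "result" is
 * ~0u >> (32 - 16) = 0xffff, min_idx is { 0, 32 }, and the bcsel yields
 * { 0xffff, 0 }: component 0 survives because 0 < 16, component 1 is zeroed
 * because 32 < 16 is false.
 */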
static nir_ssa_def *
vec_bit_count(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_bit_count(b, value);
   nir_ssa_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}
static nir_ssa_def *
vec_find_lsb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_find_lsb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   /* Walk from the top component down so the lowest set bit wins. */
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}
static nir_ssa_def *
vec_find_msb(nir_builder *b, nir_ssa_def *value)
{
   nir_ssa_def *vec_result = nir_ufind_msb(b, value);
   nir_ssa_def *result = nir_imm_int(b, -1);
   /* Walk from the bottom component up so the highest set bit wins. */
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_ssa_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige(b, channel, nir_imm_int(b, 0)),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}
static nir_ssa_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   /* Emit all four constant-index quad broadcasts and select between them
    * with the dynamic index.
    */
   nir_ssa_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_intrinsic_instr *qbcst =
         nir_intrinsic_instr_create(b->shader, nir_intrinsic_quad_broadcast);

      qbcst->num_components = intrin->num_components;
      qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i));
      nir_src_copy(&qbcst->src[0], &intrin->src[0]);
      nir_ssa_dest_init(&qbcst->instr, &qbcst->dest,
                        intrin->dest.ssa.num_components,
                        intrin->dest.ssa.bit_size, NULL);

      nir_ssa_def *qbcst_dst = NULL;

      if (options->lower_to_scalar && qbcst->num_components > 1) {
         qbcst_dst = lower_subgroup_op_to_scalar(b, qbcst, false);
      } else {
         nir_builder_instr_insert(b, &qbcst->instr);
         qbcst_dst = &qbcst->dest.ssa;
      }

      if (i)
         dst = nir_bcsel(b, nir_ieq(b, intrin->src[1].ssa,
                                    nir_imm_int(b, i)),
                         qbcst_dst, dst);
      else
         dst = qbcst_dst;
   }

   return dst;
}
static nir_ssa_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->dest.ssa.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}
static nir_ssa_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return nir_ssa_for_src(b, intrin->src[0], 1);
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (options->lower_vote_eq)
         return lower_vote_eq(b, intrin);

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;
   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_ssa_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         /* The complements of gt/ge only set bits at or below the current
          * invocation, which is always below the subgroup size, so they
          * don't need the extra "and" with build_subgroup_mask().
          */
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }
   case nir_intrinsic_ballot: {
      if (intrin->dest.ssa.num_components == options->ballot_components &&
          intrin->dest.ssa.bit_size == options->ballot_bit_size)
         return NULL;

      nir_ssa_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->dest.ssa.num_components,
                                 intrin->dest.ssa.bit_size);
   }
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          * "Find the most significant bit set to 1 in Value, considering
          * only the bits in Value required to represent all bits of the
          * group's invocations. If none of the considered bits is set to
          * 1, the result is undefined."
          *
          * It has similar text for the other three. This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits. If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits. In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * whatever we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         assert(intrin->src[1].is_ssa);
         nir_ssa_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated in
             * the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_i2b(b, nir_iand_imm(b, nir_ushr(b, int_val, idx), 1));
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }
   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      /* Inclusive counts set bits at or below the current invocation;
       * exclusive counts set bits strictly below it.
       */
      nir_ssa_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      assert(intrin->src[0].is_ssa);
      nir_ssa_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                                 options);

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }
   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }
   case nir_intrinsic_shuffle:
      if (options->lower_shuffle)
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle)
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, options->lower_shuffle_to_32bit);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;
   case nir_intrinsic_reduce: {
      nir_ssa_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation-defined */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (options->lower_to_scalar && intrin->num_components > 1)
         ret = lower_subgroup_op_to_scalar(b, intrin, false);
      return ret;
   }

   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin, false);
      break;

   default:
      break;
   }

   return NULL;
}
bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   return nir_shader_lower_instructions(shader,
                                        lower_subgroups_filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}
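
/* A minimal usage sketch (illustrative only; the option values here are
 * hypothetical and would come from the driver's knowledge of its hardware):
 *
 *    const nir_lower_subgroups_options opts = {
 *       .subgroup_size = 32,
 *       .ballot_bit_size = 32,
 *       .ballot_components = 1,
 *       .lower_to_scalar = true,
 *       .lower_subgroup_masks = true,
 *    };
 *    bool progress = nir_lower_subgroups(shader, &opts);
 */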