1
/**************************************************************************
3
* Copyright 2010 VMware, Inc.
6
* Permission is hereby granted, free of charge, to any person obtaining a
7
* copy of this software and associated documentation files (the
8
* "Software"), to deal in the Software without restriction, including
9
* without limitation the rights to use, copy, modify, merge, publish,
10
* distribute, sub license, and/or sell copies of the Software, and to
11
* permit persons to whom the Software is furnished to do so, subject to
12
* the following conditions:
14
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20
* USE OR OTHER DEALINGS IN THE SOFTWARE.
22
* The above copyright notice and this permission notice (including the
23
* next paragraph) shall be included in all copies or substantial portions
26
**************************************************************************/
29
#include "util/u_debug.h"
30
#include "util/u_cpu_detect.h"
31
#include "util/u_math.h"
32
#include "lp_bld_debug.h"
33
#include "lp_bld_const.h"
34
#include "lp_bld_format.h"
35
#include "lp_bld_gather.h"
36
#include "lp_bld_swizzle.h"
37
#include "lp_bld_type.h"
38
#include "lp_bld_init.h"
39
#include "lp_bld_intr.h"
40
#include "lp_bld_pack.h"
44
* Get the pointer to one element from scatter positions in memory.
46
* @sa lp_build_gather()
49
lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
51
LLVMValueRef base_ptr,
58
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
64
LLVMValueRef index = lp_build_const_int32(gallivm, i);
65
offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
68
ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
75
* Gather one element from scatter positions in memory.
77
* @sa lp_build_gather()
80
lp_build_gather_elem(struct gallivm_state *gallivm,
85
LLVMValueRef base_ptr,
88
boolean vector_justify)
90
LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
91
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
92
LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
96
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
98
ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
99
ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
100
res = LLVMBuildLoad(gallivm->builder, ptr, "");
103
* On some archs we probably really want to avoid having to deal
104
* with alignments lower than 4 bytes (if fetch size is a power of
105
* two >= 32). On x86 it doesn't matter, however.
106
* We should be able to guarantee full alignment for any kind of texture
107
* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
108
* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
109
* but I don't think that's quite what we wanted).
110
* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
111
* looks like a good fit, but it seems this cap bit (and OpenGL) aren't
112
* enforcing what we want (which is what d3d10 does, the offset needs to
113
* be aligned to element size, but GL has bytes regardless of element
114
* size which would only leave us with minimum alignment restriction of 16
115
* which doesn't make much sense if the type isn't 4x32bit). Due to
116
* translation of offsets to first_elem in sampler_views it actually seems
117
* gallium could not do anything else except 16 no matter what...
120
LLVMSetAlignment(res, 1);
121
} else if (!util_is_power_of_two_or_zero(src_width)) {
123
* Full alignment is impossible, assume the caller really meant
124
* the individual elements were aligned (e.g. 3x32bit format).
125
* And yes the generated code may otherwise crash, llvm will
126
* really assume 128bit alignment with a 96bit fetch (I suppose
127
* that makes sense as it can just assume the upper 32bit to be
129
* Maybe the caller should be able to explicitly set this, but
130
* this should cover all the 3-channel formats.
132
if (((src_width / 24) * 24 == src_width) &&
133
util_is_power_of_two_or_zero(src_width / 24)) {
134
LLVMSetAlignment(res, src_width / 24);
136
LLVMSetAlignment(res, 1);
140
assert(src_width <= dst_width);
141
if (src_width < dst_width) {
142
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
143
if (vector_justify) {
144
#if UTIL_ARCH_BIG_ENDIAN
145
res = LLVMBuildShl(gallivm->builder, res,
146
LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
156
* Gather one element from scatter positions in memory.
157
* Nearly the same as above, however the individual elements
158
* may be vectors themselves, and fetches may be float type.
159
* Can also do pad vector instead of ZExt.
161
* @sa lp_build_gather()
164
lp_build_gather_elem_vec(struct gallivm_state *gallivm,
167
LLVMTypeRef src_type,
168
struct lp_type dst_type,
170
LLVMValueRef base_ptr,
171
LLVMValueRef offsets,
173
boolean vector_justify)
175
LLVMValueRef ptr, res;
176
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
177
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
179
ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
180
ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
181
res = LLVMBuildLoad(gallivm->builder, ptr, "");
184
* On some archs we probably really want to avoid having to deal
185
* with alignments lower than 4 bytes (if fetch size is a power of
186
* two >= 32). On x86 it doesn't matter, however.
187
* We should be able to guarantee full alignment for any kind of texture
188
* fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
189
* (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
190
* but I don't think that's quite what we wanted).
191
* For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
192
* looks like a good fit, but it seems this cap bit (and OpenGL) aren't
193
* enforcing what we want (which is what d3d10 does, the offset needs to
194
* be aligned to element size, but GL has bytes regardless of element
195
* size which would only leave us with minimum alignment restriction of 16
196
* which doesn't make much sense if the type isn't 4x32bit). Due to
197
* translation of offsets to first_elem in sampler_views it actually seems
198
* gallium could not do anything else except 16 no matter what...
201
LLVMSetAlignment(res, 1);
202
} else if (!util_is_power_of_two_or_zero(src_width)) {
204
* Full alignment is impossible, assume the caller really meant
205
* the individual elements were aligned (e.g. 3x32bit format).
206
* And yes the generated code may otherwise crash, llvm will
207
* really assume 128bit alignment with a 96bit fetch (I suppose
208
* that makes sense as it can just assume the upper 32bit to be
210
* Maybe the caller should be able to explicitly set this, but
211
* this should cover all the 3-channel formats.
213
if (((src_width / 24) * 24 == src_width) &&
214
util_is_power_of_two_or_zero(src_width / 24)) {
215
LLVMSetAlignment(res, src_width / 24);
217
LLVMSetAlignment(res, 1);
221
assert(src_width <= dst_type.width * dst_type.length);
222
if (src_width < dst_type.width * dst_type.length) {
223
if (dst_type.length > 1) {
224
res = lp_build_pad_vector(gallivm, res, dst_type.length);
226
* vector_justify hopefully a non-issue since we only deal
227
* with src_width >= 32 here?
230
LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
233
* Only valid if src_ptr_type is int type...
235
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
237
#if UTIL_ARCH_BIG_ENDIAN
238
if (vector_justify) {
239
res = LLVMBuildShl(gallivm->builder, res,
240
LLVMConstInt(dst_elem_type,
241
dst_type.width - src_width, 0), "");
243
if (src_width == 48) {
244
/* Load 3x16 bit vector.
245
* The sequence of loads on big-endian hardware proceeds as follows.
246
* 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence
247
* of three fields appears in the order X, Y, Z.
249
* Load 32-bit word: 0.0.X.Y
250
* Load 16-bit halfword: 0.0.0.Z
251
* Rotate left: 0.X.Y.0
252
* Bitwise OR: 0.X.Y.Z
254
* The order in which we need the fields in the result is 0.Z.Y.X,
255
* the same as on little-endian; permute 16-bit fields accordingly
256
* within 64-bit register:
258
LLVMValueRef shuffles[4] = {
259
lp_build_const_int32(gallivm, 2),
260
lp_build_const_int32(gallivm, 1),
261
lp_build_const_int32(gallivm, 0),
262
lp_build_const_int32(gallivm, 3),
264
res = LLVMBuildBitCast(gallivm->builder, res,
265
lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
266
res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
267
res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
279
lp_build_gather_avx2(struct gallivm_state *gallivm,
282
struct lp_type dst_type,
283
LLVMValueRef base_ptr,
284
LLVMValueRef offsets)
286
LLVMBuilderRef builder = gallivm->builder;
287
LLVMTypeRef src_type, src_vec_type;
289
struct lp_type res_type = dst_type;
290
res_type.length *= length;
292
if (dst_type.floating) {
293
src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
294
LLVMFloatTypeInContext(gallivm->context);
296
src_type = LLVMIntTypeInContext(gallivm->context, src_width);
298
src_vec_type = LLVMVectorType(src_type, length);
300
/* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
301
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
305
* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
306
* will not use the AVX2 gather intrinsics (even with llvm 4.0), at
307
* least with Haswell. See
308
* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
309
* And the generated code doing the emulation is quite a bit worse
310
* than what we get by doing it ourselves too.
312
LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
313
LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
314
LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
315
LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
316
LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
317
LLVMValueRef src_ptr;
319
base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");
321
/* Rescale offsets from bytes to elements */
322
LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
323
scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
324
assert(LLVMTypeOf(offsets) == i32_vec_type);
325
offsets = LLVMBuildSDiv(builder, offsets, scale, "");
327
src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
330
snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
331
length, dst_type.floating ? "f" : "i", src_width);
332
LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
333
LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
334
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
336
LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };
338
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
340
LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
341
const char *intrinsic = NULL;
344
assert(src_width == 32 || src_width == 64);
345
if (src_width == 32) {
346
assert(length == 4 || length == 8);
348
assert(length == 2 || length == 4);
351
static const char *intrinsics[2][2][2] = {
353
{{"llvm.x86.avx2.gather.d.d",
354
"llvm.x86.avx2.gather.d.d.256"},
355
{"llvm.x86.avx2.gather.d.q",
356
"llvm.x86.avx2.gather.d.q.256"}},
358
{{"llvm.x86.avx2.gather.d.ps",
359
"llvm.x86.avx2.gather.d.ps.256"},
360
{"llvm.x86.avx2.gather.d.pd",
361
"llvm.x86.avx2.gather.d.pd.256"}},
364
if ((src_width == 32 && length == 8) ||
365
(src_width == 64 && length == 4)) {
368
intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
370
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
371
LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
372
mask = LLVMConstBitCast(mask, src_vec_type);
373
LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);
375
LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };
377
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
379
res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
386
* Gather elements from scatter positions in memory into a single vector.
387
* Use for fetching texels from a texture.
388
* For SSE, typical values are length=4, src_width=32, dst_width=32.
390
* When src_width < dst_width, the return value can be justified in
392
* "integer justification" is used when the caller treats the destination
393
* as a packed integer bitmask, as described by the channels' "shift" and
395
* "vector justification" is used when the caller casts the destination
396
* to a vector and needs channel X to be in vector element 0.
398
* @param length length of the offsets
399
* @param src_width src element width in bits
400
* @param dst_type result element type (src will be expanded to fit,
401
* but truncation is not allowed)
402
* (this may be a vector, must be power-of-two sized)
403
* @param aligned whether the data is guaranteed to be aligned (to src_width)
404
* @param base_ptr base pointer, needs to be a i8 pointer type.
405
* @param offsets vector with offsets
406
* @param vector_justify select vector rather than integer justification
409
lp_build_gather(struct gallivm_state *gallivm,
412
struct lp_type dst_type,
414
LLVMValueRef base_ptr,
415
LLVMValueRef offsets,
416
boolean vector_justify)
419
boolean need_expansion = src_width < dst_type.width * dst_type.length;
421
struct lp_type fetch_type, fetch_dst_type;
422
LLVMTypeRef src_type;
424
assert(src_width <= dst_type.width * dst_type.length);
427
* This is quite a mess...
428
* Figure out if the fetch should be done as:
429
* a) scalar or vector
432
* As an example, for a 96bit fetch expanded into 4x32bit, it is better
433
* to use (3x32bit) vector type (then pad the vector). Otherwise, the
434
* zext will cause extra instructions.
435
* However, the same isn't true for 3x16bit (the codegen for that is
436
* completely worthless on x86 simd, and for 3x8bit it is way worse
437
* still, don't try that... (To get really good code out of llvm for
438
* these cases, the only way is to decompose the fetches manually
439
* into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
440
* case requires sse41, otherwise simple scalar zext is way better.
441
* But probably not important enough, so don't bother.)
442
* Also, we try to honor the floating bit of destination (but isn't
443
* possible if caller asks for instance for 2x32bit dst_type with
444
* 48bit fetch - the idea would be to use 3x16bit fetch, pad and
445
* cast to 2x32f type, so the fetch is always int and on top of that
446
* we avoid the vec pad and use scalar zext due to the above mentioned
448
* Note this is optimized for x86 sse2 and up backend. Could be tweaked
449
* for other archs if necessary...
451
if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
452
(dst_type.length > 1)) {
453
/* use vector fetch (if dst_type is vector) */
455
if (dst_type.floating) {
456
fetch_type = lp_type_float_vec(dst_type.width, src_width);
458
fetch_type = lp_type_int_vec(dst_type.width, src_width);
460
/* intentionally not using lp_build_vec_type here */
461
src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
463
fetch_dst_type = fetch_type;
464
fetch_dst_type.length = dst_type.length;
466
/* use scalar fetch */
468
if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
469
fetch_type = lp_type_float(src_width);
471
fetch_type = lp_type_int(src_width);
473
src_type = lp_build_vec_type(gallivm, fetch_type);
474
fetch_dst_type = fetch_type;
475
fetch_dst_type.width = dst_type.width * dst_type.length;
480
res = lp_build_gather_elem_vec(gallivm, length,
481
src_width, src_type, fetch_dst_type,
482
aligned, base_ptr, offsets, 0,
484
return LLVMBuildBitCast(gallivm->builder, res,
485
lp_build_vec_type(gallivm, dst_type), "");
487
* Excluding expansion from these paths because if you need it for
488
* 32bit/64bit fetches you're doing it wrong (this is gather, not
489
* conversion) and it would be awkward for floats.
491
} else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
492
src_width == 32 && (length == 4 || length == 8)) {
493
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
496
* This looks bad on paper wrt throughput/latency on Haswell.
497
* Even on Broadwell it doesn't look stellar.
498
* Albeit no measurements were done (but tested to work).
499
* Should definitely enable on Skylake.
500
* (In general, should be more of a win if the fetch is 256bit wide -
501
* this is true for the 32bit case above too.)
503
} else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
504
src_width == 64 && (length == 2 || length == 4)) {
505
return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
510
LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
512
boolean vec_zext = FALSE;
513
struct lp_type res_type, gather_res_type;
514
LLVMTypeRef res_t, gather_res_t;
516
res_type = fetch_dst_type;
517
res_type.length *= length;
518
gather_res_type = res_type;
520
if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
522
* Note that llvm is never able to optimize zext/insert combos
523
* directly (i.e. zero the simd reg, then place the elements into
524
* the appropriate place directly). (I think this has to do with
525
* scalar/vector transition.) And scalar 16->32bit zext simd loads
526
* aren't possible (instead loading to scalar reg first).
527
* No idea about other archs...
528
* We could do this manually, but instead we just use a vector
529
* zext, which is simple enough (and, in fact, llvm might optimize
531
* (We're not trying that with other bit widths as that might not be
532
* easier, in particular with 8 bit values at least with only sse2.)
534
assert(vec_fetch == FALSE);
535
gather_res_type.width /= 2;
536
fetch_dst_type = fetch_type;
537
src_type = lp_build_vec_type(gallivm, fetch_type);
540
res_t = lp_build_vec_type(gallivm, res_type);
541
gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
542
res = LLVMGetUndef(gather_res_t);
543
for (i = 0; i < length; ++i) {
544
LLVMValueRef index = lp_build_const_int32(gallivm, i);
545
elems[i] = lp_build_gather_elem_vec(gallivm, length,
546
src_width, src_type, fetch_dst_type,
547
aligned, base_ptr, offsets, i,
550
res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
554
res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
555
if (vector_justify) {
556
#if UTIL_ARCH_BIG_ENDIAN
557
unsigned sv = dst_type.width - src_width;
558
res = LLVMBuildShl(gallivm->builder, res,
559
lp_build_const_int_vec(gallivm, res_type, sv), "");
565
* Do bitcast now otherwise llvm might get some funny ideas wrt
568
for (i = 0; i < length; i++) {
569
elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
570
lp_build_vec_type(gallivm, dst_type), "");
572
res = lp_build_concat(gallivm, elems, dst_type, length);
574
struct lp_type really_final_type = dst_type;
575
assert(res_type.length * res_type.width ==
576
dst_type.length * dst_type.width * length);
577
really_final_type.length *= length;
578
res = LLVMBuildBitCast(gallivm->builder, res,
579
lp_build_vec_type(gallivm, really_final_type), "");
587
lp_build_gather_values(struct gallivm_state * gallivm,
588
LLVMValueRef * values,
589
unsigned value_count)
591
LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
592
LLVMBuilderRef builder = gallivm->builder;
593
LLVMValueRef vec = LLVMGetUndef(vec_type);
596
for (i = 0; i < value_count; i++) {
597
LLVMValueRef index = lp_build_const_int32(gallivm, i);
598
vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");