//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#define DEBUG_TYPE "PeepholeOpt"
#ifdef DEBUG
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#else
#define DEBUGME 0
#endif
#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
STATISTIC(PointerAssignments, "Number of dynamic pointer "
                              "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
// The peephole optimization pass is used to do simple last-minute
// optimizations that are required for correct code or to remove redundant
// functions.

using namespace llvm;

namespace {

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If this is correct, then we set all pointers to exist in the
  // arena. This is a workaround for the aliasing of pointers in a
  // struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because I don't want to invalidate any pointers while in the
  // safeNestedForEach function, I push atomic conversions to a vector and
  // handle them later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
  // this case we need to expand them. These functions check for 24-bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified, then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards, the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. We
  // can translate the accurate divide to a normal divide on all other cards.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur, and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);
  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F) {
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace
namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm) {
  return new AMDGPUPeepholeOpt(tm);
}
} // namespace llvm
AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) {
  mDebug = DEBUGME;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
}

const char *
AMDGPUPeepholeOpt::getPassName() const {
  return "AMDGPU PeepHole Optimization Pass";
}
static bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch (Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer, we need to take the conservative approach and place
    // all pointers into the arena until more advanced detection is
    // implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  }
  if (mConvertAtomics) {
    return;
  }
}
bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                   this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  return mChanged;
}
bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // it at the end of processing, after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    }
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
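  //
  // For example (an illustrative sketch, not from the original source):
  //   %lo = and i32 %a, 255        ; B = 0x00FF
  //   %hi = and i32 %d, 65280      ; E = 0xFF00, so B & E == 0
  //   %r  = or i32 %lo, %hi
  // matches the first simplified pattern and can be collapsed below into a
  // single __amdil_ubit_insert_u32 call with width = 8 and offset = 8.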
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
    // TODO: Handle vectors.
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    return false;
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is
  // a value smaller than 32 and C is a mask. If C is a constant value, then
  // the following transformation can occur. For signed integers, it turns
  // into the function call dst = __amdil_ibit_extract(log2(C), B, A). For
  // unsigned integers, it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The function
  // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec
  // of the stream SDK for Evergreen hardware.
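  //
  // For example (an illustrative sketch): the unsigned sequence
  //   %s = lshr i32 %a, 5
  //   %r = and i32 %s, 15          ; C = 0xF, a 4-bit mask
  // is rewritten below as one call that extracts 4 bits of %a starting at
  // bit 5.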
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // XXX Support vector types
  if (isVector) {
    return false;
  }

  // This only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If it is a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant
  // integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case.
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shift val is greater than the bitcount, then break
      // out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left,
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case.
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then
    // any bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left, then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDGPU.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}
bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
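  // For example (illustrative): with A = 0xF0, the call selects the bits of
  // B where A is 1 and the bits of C where A is 0, i.e.
  // (0xF0 & B) | (~0xF0 & C).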
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
        "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
      "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}
bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)
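  // For example (illustrative): bfm(8, 4) = ((1 << 8) - 1) << 4 = 0xFF0,
  // i.e. an 8-bit mask placed at bit offset 4.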
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
      lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
      newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}
bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}
void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
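  //
  // For example (illustrative): a call to __amdil_imul24(%a, %b) is replaced
  // below with a plain 32-bit 'mul i32 %a, %b' instruction.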
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
          CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}
bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}
bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}
void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
        CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}
bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None) {
    return false;
  }
  if (!CI) {
    return false;
  }
  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 2;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If we don't have an initializer, or we have an initializer and
  // the initializer is not a 32bit integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}
bool
AMDGPUPeepholeOpt::doInitialization(Module &M) {
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M) {
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}
size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  }
  return size;
}
size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}
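// For example (illustrative): for a struct { i32; [4 x float]; } the loop
// above sums 4 + 16 = 20 bytes; note that it does not account for padding
// or alignment.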
size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}