//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
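///
/// For illustration only (a hypothetical input, not code in this file), a
/// promotable aggregate such as:
///
///   %agg = alloca { i32, i32 }
///   %f0 = getelementptr inbounds { i32, i32 }* %agg, i32 0, i32 0
///   store i32 1, i32* %f0
///   %v = load i32* %f0
///
/// is sliced into its scalar elements and promoted, leaving %v replaced by
/// the SSA value 'i32 1' and the alloca deleted.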
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TimeValue.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#if __cplusplus >= 201103L && !defined(NDEBUG)
// We only use this for a debug check in C++11
#include <random>
#endif

using namespace llvm;

#define DEBUG_TYPE "sroa"
STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false),
                                     cl::Hidden);

/// Hidden option to enable randomly shuffling the slices to help uncover
/// instability in their order.
static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
                                             cl::init(false), cl::Hidden);

/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
                                        cl::Hidden);
namespace {
/// \brief A custom IRBuilder inserter which prefixes all names if they are
/// preserved.
template <bool preserveNames = true>
class IRBuilderPrefixedInserter
    : public IRBuilderDefaultInserter<preserveNames> {
  std::string Prefix;

public:
  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }

protected:
  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
                    BasicBlock::iterator InsertPt) const {
    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
  }
};

// Specialization for not preserving the name is trivial.
template <>
class IRBuilderPrefixedInserter<false>
    : public IRBuilderDefaultInserter<false> {
public:
  void SetNamePrefix(const Twine &P) {}
};

/// \brief Provide a typedef for IRBuilder that drops names in release builds.
#ifndef NDEBUG
typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
    IRBuilderTy;
#else
typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
    IRBuilderTy;
#endif
}
namespace {
/// \brief A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
  /// \brief The beginning offset of the range.
  uint64_t BeginOffset;

  /// \brief The ending offset, not included in the range.
  uint64_t EndOffset;

  /// \brief Storage for both the use of this slice and whether it can be
  /// split.
  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;

public:
  Slice() : BeginOffset(), EndOffset() {}
  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
      : BeginOffset(BeginOffset), EndOffset(EndOffset),
        UseAndIsSplittable(U, IsSplittable) {}

  uint64_t beginOffset() const { return BeginOffset; }
  uint64_t endOffset() const { return EndOffset; }

  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }

  Use *getUse() const { return UseAndIsSplittable.getPointer(); }

  bool isDead() const { return getUse() == nullptr; }
  void kill() { UseAndIsSplittable.setPointer(nullptr); }

  /// \brief Support for ordering ranges.
  ///
  /// This provides an ordering over ranges such that start offsets are
  /// always increasing, and within equal start offsets, the end offsets are
  /// decreasing. Thus the spanning range comes first in a cluster with the
  /// same start position.
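  ///
  /// For example (illustrative), the slices [0,16), [4,12), and [0,8) sort
  /// as [0,16), [0,8), [4,12): begin offsets ascend, and for the shared
  /// begin offset 0 the wider slice comes first.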
  bool operator<(const Slice &RHS) const {
    if (beginOffset() < RHS.beginOffset())
      return true;
    if (beginOffset() > RHS.beginOffset())
      return false;
    if (isSplittable() != RHS.isSplittable())
      return !isSplittable();
    if (endOffset() > RHS.endOffset())
      return true;
    return false;
  }

  /// \brief Support comparison with a single offset to allow binary searches.
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
                                              uint64_t RHSOffset) {
    return LHS.beginOffset() < RHSOffset;
  }
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                              const Slice &RHS) {
    return LHSOffset < RHS.beginOffset();
  }

  bool operator==(const Slice &RHS) const {
    return isSplittable() == RHS.isSplittable() &&
           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
  }
  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};
} // end anonymous namespace
namespace llvm {
template <typename T> struct isPodLike;
template <> struct isPodLike<Slice> { static const bool value = true; };
}
/// \brief Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
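///
/// For instance (illustrative), an 'alloca [8 x i8]' written by an i32 store
/// at offset 0 and copied out in full by a memcpy yields an unsplittable
/// slice [0,4) for the store and a splittable slice [0,8) for the transfer.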
class AllocaSlices {
public:
  /// \brief Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// \brief Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }

  /// \brief Support for iterating over the slices.
  typedef SmallVectorImpl<Slice>::iterator iterator;
  typedef iterator_range<iterator> range;
  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
  typedef iterator_range<const_iterator> const_range;
  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// \brief Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// \brief Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// are accurately maintained.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    Slices.append(NewSlices.begin(), NewSlices.end());
    auto SliceI = Slices.begin() + OldSize;
    std::sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }
  // Forward declare an iterator to befriend it.
  class partition_iterator;

  /// \brief A partition of the slices.
  ///
  /// An ephemeral representation for a range of slices which can be viewed as
  /// a partition of the alloca. This range represents a span of the alloca's
  /// memory which cannot be split, and provides access to all of the slices
  /// overlapping some part of the partition.
  ///
  /// Objects of this type are produced by traversing the alloca's slices, but
  /// are only ephemeral and not persistent.
  class Partition {
  private:
    friend class AllocaSlices;
    friend class AllocaSlices::partition_iterator;

    /// \brief The beginning and ending offsets of the alloca for this
    /// partition.
    uint64_t BeginOffset, EndOffset;

    /// \brief The start and end iterators of this partition.
    iterator SI, SJ;

    /// \brief A collection of split slice tails overlapping the partition.
    SmallVector<Slice *, 4> SplitTails;

    /// \brief Raw constructor builds an empty partition starting and ending at
    /// the given iterator.
    Partition(iterator SI) : SI(SI), SJ(SI) {}

  public:
    /// \brief The start offset of this partition.
    ///
    /// All of the contained slices start at or after this offset.
    uint64_t beginOffset() const { return BeginOffset; }

    /// \brief The end offset of this partition.
    ///
    /// All of the contained slices end at or before this offset.
    uint64_t endOffset() const { return EndOffset; }

    /// \brief The size of the partition.
    ///
    /// Note that this can never be zero.
    uint64_t size() const {
      assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
      return EndOffset - BeginOffset;
    }

    /// \brief Test whether this partition contains no slices, and merely spans
    /// a region occupied by split slices.
    bool empty() const { return SI == SJ; }
    /// \name Iterate slices that start within the partition.
    /// These may be splittable or unsplittable. They have a begin offset >= the
    /// partition begin offset.
    // FIXME: We should probably define a "concat_iterator" helper and use that
    // to stitch together pointee_iterators over the split tails and the
    // contiguous iterators of the partition. That would give a much nicer
    // interface here. We could then additionally expose filtered iterators for
    // split, unsplit, and unsplittable slices based on the usage patterns.
    iterator begin() const { return SI; }
    iterator end() const { return SJ; }

    /// \brief Get the sequence of split slice tails.
    ///
    /// These tails are of slices which start before this partition but are
    /// split and overlap into the partition. We accumulate these while forming
    /// partitions.
    ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
  };
  /// \brief An iterator over partitions of the alloca's slices.
  ///
  /// This iterator implements the core algorithm for partitioning the alloca's
  /// slices. It is a forward iterator as we don't support backtracking for
  /// efficiency reasons, and re-use a single storage area to maintain the
  /// current set of split slices.
  ///
  /// It is templated on the slice iterator type to use so that it can operate
  /// with either const or non-const slice iterators.
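  ///
  /// As a sketch (illustrative): given slices [0,16) (splittable) and [4,8)
  /// (unsplittable), iteration yields the partitions [0,4), [4,8), and
  /// [8,16), the last covering only the tail of the split slice.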
  class partition_iterator
      : public iterator_facade_base<partition_iterator,
                                    std::forward_iterator_tag, Partition> {
    friend class AllocaSlices;

    /// \brief Most of the state for walking the partitions is held in a class
    /// with a nice interface for examining them.
    Partition P;

    /// \brief We need to keep the end of the slices to know when to stop.
    AllocaSlices::iterator SE;

    /// \brief We also need to keep track of the maximum split end offset seen.
    /// FIXME: Do we really?
    uint64_t MaxSplitSliceEndOffset;

    /// \brief Sets the partition to be empty at given iterator, and sets the
    /// end iterator.
    partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
        : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
      // If not already at the end, advance our state to form the initial
      // partition.
      if (SI != SE)
        advance();
    }
    /// \brief Advance the iterator to the next partition.
    ///
    /// Requires that the iterator not be at the end of the slices.
    void advance() {
      assert((P.SI != SE || !P.SplitTails.empty()) &&
             "Cannot advance past the end of the slices!");
      // Clear out any split uses which have ended.
      if (!P.SplitTails.empty()) {
        if (P.EndOffset >= MaxSplitSliceEndOffset) {
          // If we've finished all splits, this is easy.
          P.SplitTails.clear();
          MaxSplitSliceEndOffset = 0;
        } else {
          // Remove the uses which have ended in the prior partition. This
          // cannot change the max split slice end because we just checked that
          // the prior partition ended prior to that max.
          P.SplitTails.erase(
              std::remove_if(
                  P.SplitTails.begin(), P.SplitTails.end(),
                  [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
              P.SplitTails.end());
          assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
                             [&](Slice *S) {
                               return S->endOffset() == MaxSplitSliceEndOffset;
                             }) &&
                 "Could not find the current max split slice offset!");
          assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
                             [&](Slice *S) {
                               return S->endOffset() <= MaxSplitSliceEndOffset;
                             }) &&
                 "Max split slice end offset is not actually the max!");
        }
      }
      // If P.SI is already at the end, then we've cleared the split tail and
      // now have an end iterator.
      if (P.SI == SE) {
        assert(P.SplitTails.empty() && "Failed to clear the split slices!");
        return;
      }
      // If we had a non-empty partition previously, set up the state for
      // subsequent partitions.
      if (P.SI != P.SJ) {
        // Accumulate all the splittable slices which started in the old
        // partition into the split list.
        for (Slice &S : P)
          if (S.isSplittable() && S.endOffset() > P.EndOffset) {
            P.SplitTails.push_back(&S);
            MaxSplitSliceEndOffset =
                std::max(S.endOffset(), MaxSplitSliceEndOffset);
          }

        // Start from the end of the previous partition.
        P.SI = P.SJ;

        // If P.SI is now at the end, we at most have a tail of split slices.
        if (P.SI == SE) {
          P.BeginOffset = P.EndOffset;
          P.EndOffset = MaxSplitSliceEndOffset;
          return;
        }
        // If we have split slices and the next slice is after a gap and is
        // not splittable immediately form an empty partition for the split
        // slices up until the next slice begins.
        if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
            !P.SI->isSplittable()) {
          P.BeginOffset = P.EndOffset;
          P.EndOffset = P.SI->beginOffset();
          return;
        }
      }
      // OK, we need to consume new slices. Set the end offset based on the
      // current slice, and step SJ past it. The beginning offset of the
      // partition is the beginning offset of the next slice unless we have
      // pre-existing split slices that are continuing, in which case we begin
      // at the prior end offset.
      P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
      P.EndOffset = P.SI->endOffset();
      ++P.SJ;
      // There are two strategies to form a partition based on whether the
      // partition starts with an unsplittable slice or a splittable slice.
      if (!P.SI->isSplittable()) {
        // When we're forming an unsplittable region, it must always start at
        // the first slice and will extend through its end.
        assert(P.BeginOffset == P.SI->beginOffset());

        // Form a partition including all of the overlapping slices with this
        // unsplittable slice.
        while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
          if (!P.SJ->isSplittable())
            P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
          ++P.SJ;
        }

        // We have a partition across a set of overlapping unsplittable
        // slices.
        return;
      }
      // If we're starting with a splittable slice, then we need to form
      // a synthetic partition spanning it and any other overlapping splittable
      // slices.
      assert(P.SI->isSplittable() && "Forming a splittable partition!");

      // Collect all of the overlapping splittable slices.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
             P.SJ->isSplittable()) {
        P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // Back up P.EndOffset if we ended the span early when encountering an
      // unsplittable slice. This synthesizes the early end offset of
      // a partition spanning only splittable slices.
      if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        assert(!P.SJ->isSplittable());
        P.EndOffset = P.SJ->beginOffset();
      }
    }
    bool operator==(const partition_iterator &RHS) const {
      assert(SE == RHS.SE &&
             "End iterators don't match between compared partition iterators!");

      // The observed positions of partitions are marked by the P.SI iterator
      // and the emptiness of the split slices. The latter is only relevant
      // when P.SI == SE, as the end iterator will additionally have an empty
      // split slices list, but the prior may have the same P.SI and a tail of
      // split slices.
      if (P.SI == RHS.P.SI &&
          P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
        assert(P.SJ == RHS.P.SJ &&
               "Same set of slices formed two different sized partitions!");
        assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
               "Same slice position with differently sized non-empty split "
               "slice tails!");
        return true;
      }
      return false;
    }

    partition_iterator &operator++() {
      advance();
      return *this;
    }

    Partition &operator*() { return P; }
  };
  /// \brief A forward range over the partitions of the alloca's slices.
  ///
  /// This accesses an iterator range over the partitions of the alloca's
  /// slices. It computes these partitions on the fly based on the overlapping
  /// offsets of the slices and the ability to split them. It will visit "empty"
  /// partitions to cover regions of the alloca only accessed via split
  /// slices.
  iterator_range<partition_iterator> partitions() {
    return make_range(partition_iterator(begin(), end()),
                      partition_iterator(end(), end()));
  }
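  // Illustrative usage (a hypothetical caller, not code from this file):
  //   for (Partition &P : AS.partitions())
  //     dbgs() << "partition: [" << P.beginOffset() << "," << P.endOffset()
  //            << ")\n";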
  /// \brief Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// \brief Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif
private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;
  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// \brief Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// \brief The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;
  /// \brief The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// \brief Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with undef as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// \brief Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
  /// want to swap this particular input for undef to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;
};
}
static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes this does (rarely) happen
  // for vector selects.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}
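// For example (illustrative), 'select i1 true, i32* %a, i32* %b' folds to
// %a, and 'select i1 %c, i32* %p, i32* %p' folds to %p regardless of %c.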
/// \brief A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}
/// \brief Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
  friend class PtrUseVisitor<SliceBuilder>;
  friend class InstVisitor<SliceBuilder>;
  typedef PtrUseVisitor<SliceBuilder> Base;

  const uint64_t AllocSize;
  AllocaSlices &AS;

  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
      : PtrUseVisitor<SliceBuilder>(DL),
        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I).second)
      AS.DeadUsers.push_back(&I);
  }
  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                   << " which has zero size or starts outside of the "
                   << AllocSize << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to remain within the " << AllocSize << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
  }
  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    return Base::visitBitCastInst(BC);
  }
  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    if (SROAStrictInbounds && GEPI.isInBounds()) {
      // FIXME: This is a manually un-factored variant of the basic code inside
      // of GEPs with checking of the inbounds invariant specified in the
      // langref in a very strict sense. If we ever want to enable
      // SROAStrictInbounds, this code should be factored cleanly into
      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
      // by writing out the code here where we have the underlying allocation
      // size readily available.
      APInt GEPOffset = Offset;
      const DataLayout &DL = GEPI.getModule()->getDataLayout();
      for (gep_type_iterator GTI = gep_type_begin(GEPI),
                             GTE = gep_type_end(GEPI);
           GTI != GTE; ++GTI) {
        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
        if (!OpC)
          break;

        // Handle a struct index, which adds its field offset to the pointer.
        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
          unsigned ElementIdx = OpC->getZExtValue();
          const StructLayout *SL = DL.getStructLayout(STy);
          GEPOffset +=
              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
        } else {
          // For array or vector indices, scale the index by the size of the
          // type.
          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
          GEPOffset += Index * APInt(Offset.getBitWidth(),
                                     DL.getTypeAllocSize(GTI.getIndexedType()));
        }

        // If this index has computed an intermediate pointer which is not
        // inbounds, then the result of the GEP is a poison value and we can
        // delete it and all uses.
        if (GEPOffset.ugt(AllocSize))
          return markAsDead(GEPI);
      }
    }

    return Base::visitGetElementPtrInst(GEPI);
  }
  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of non-volatile loads and stores where the type is an
    // integer type. These may be used to implement 'memcpy' or other "transfer
    // of bits" patterns.
    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;

    insertUse(I, Offset, Size, IsSplittable);
  }
  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    if (!IsOffsetKnown)
      return PI.setAborted(&LI);

    const DataLayout &DL = LI.getModule()->getDataLayout();
    uint64_t Size = DL.getTypeStoreSize(LI.getType());
    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
  }
  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    const DataLayout &DL = SI.getModule()->getDataLayout();
    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                   << " which extends past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }
  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    insertUse(II, Offset, Length ? Length->getLimitedValue()
                                 : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }
  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if (Length && Length->getValue() == 0)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    // Because we can visit these intrinsics twice, also check to see if the
    // first time marked this instruction as dead. If so, skip it.
    if (VisitedDeadInsts.count(&II))
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // This side of the transfer is completely out-of-bounds, and so we can
    // nuke the entire transfer. However, we also need to nuke the other side
    // if already added to our partitions.
    // FIXME: Yet another place we really should bypass this when
    // instrumenting for ASan.
    if (Offset.uge(AllocSize)) {
      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
          MemTransferSliceMap.find(&II);
      if (MTPI != MemTransferSliceMap.end())
        AS.Slices[MTPI->second].kill();
      return markAsDead(II);
    }

    uint64_t RawOffset = Offset.getLimitedValue();
    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;

    // Check for the special case where the same exact value is used for both
    // the source and dest of the transfer.
    if (*U == II.getRawDest() && *U == II.getRawSource()) {
      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return markAsDead(II);

      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    }

    // If we have seen both source and destination for a mem transfer, then
    // they both point to the same alloca.
    bool Inserted;
    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    std::tie(MTPI, Inserted) =
        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    unsigned PrevIdx = MTPI->second;
    if (!Inserted) {
      Slice &PrevP = AS.Slices[PrevIdx];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
        PrevP.kill();
        return markAsDead(II);
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      PrevP.makeUnsplittable();
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);

    // Check that we ended up with a valid index in the map.
    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
           "Map index doesn't point back to a slice with this user.");
  }
  // Disable SROA for any intrinsics except for lifetime invariants.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  void visitIntrinsicInst(IntrinsicInst &II) {
    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
        II.getIntrinsicID() == Intrinsic::lifetime_end) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
                               Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return;
    }

    Base::visitIntrinsicInst(II);
  }
  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for slicing purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    const DataLayout &DL = Root->getModule()->getDataLayout();
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      std::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I)) {
        return I;
      }

      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    } while (!Uses.empty());

    return nullptr;
  }
  void visitPHINodeOrSelectInst(Instruction &I) {
    assert(isa<PHINode>(I) || isa<SelectInst>(I));
    if (I.use_empty())
      return markAsDead(I);

    // TODO: We could use SimplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires to change the current
    // dead-operand-tracking mechanism. For instance, suppose neither loading
    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    // trap either. However, if we simply replace %U with undef using the
    // current dead-operand-tracking mechanism, "load (select undef, undef,
    // %other)" may trap because the select may return the first operand
    // "undef".
    if (Value *Result = foldPHINodeOrSelectInst(I)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the PHI/select as if we had RAUW'ed it.
        enqueueUsers(I);
      else
        // Otherwise the operand to the PHI/select is dead, and we can replace
        // it with undef.
        AS.DeadOperands.push_back(U);

      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&I);

    // See if we already have computed info on this node.
    uint64_t &Size = PHIOrSelectSizes[&I];
    if (!Size) {
      // This is a new PHI/Select, check for an unsafe use of it.
      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
        return PI.setAborted(UnsafeI);
    }

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with undef.
    // FIXME: This should instead be escaped in the event we're instrumenting
    // for address sanitization.
    if (Offset.uge(AllocSize)) {
      AS.DeadOperands.push_back(U);
      return;
    }

    insertUse(I, Offset, Size);
  }
  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }

  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }

  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
    :
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      AI(AI),
#endif
      PointerEscapingInstr(nullptr) {
  SliceBuilder PB(DL, AI, *this);
  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
  if (PtrI.isEscaped() || PtrI.isAborted()) {
    // FIXME: We should sink the escape vs. abort info into the caller nicely,
    // possibly by just storing the PtrInfo in the AllocaSlices.
    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
                                                  : PtrI.getAbortingInst();
    assert(PointerEscapingInstr && "Did not track a bad instruction");
    return;
  }

  Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
                              [](const Slice &S) { return S.isDead(); }),
               Slices.end());

#if __cplusplus >= 201103L && !defined(NDEBUG)
  if (SROARandomShuffleSlices) {
    std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
    std::shuffle(Slices.begin(), Slices.end(), MT);
  }
#endif

  // Sort the uses. This arranges for the offsets to be in ascending order,
  // and the sizes to be in descending order.
  std::sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                         StringRef Indent) const {
  printSlice(OS, I, Indent);
  OS << "\n";
  printUse(OS, I, Indent);
}

void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
                              StringRef Indent) const {
  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
     << " slice #" << (I - begin())
     << (I->isSplittable() ? " (splittable)" : "");
}

void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
                            StringRef Indent) const {
  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
}

void AllocaSlices::print(raw_ostream &OS) const {
  if (PointerEscapingInstr) {
    OS << "Can't analyze slices for alloca: " << AI << "\n"
       << "  A pointer to this alloca escaped by:\n"
       << "  " << *PointerEscapingInstr << "\n";
    return;
  }

  OS << "Slices of alloca: " << AI << "\n";
  for (const_iterator I = begin(), E = end(); I != E; ++I)
    print(OS, I);
}

LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
  print(dbgs(), I);
}
LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }

#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
namespace {
/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
///
/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
/// the loads and stores of an alloca instruction, as well as updating its
/// debug information. This is used when a domtree is unavailable and thus
/// mem2reg in its full form can't be used to handle promotion of allocas to
/// scalar values.
class AllocaPromoter : public LoadAndStorePromoter {
  AllocaInst &AI;
  DIBuilder &DIB;

  SmallVector<DbgDeclareInst *, 4> DDIs;
  SmallVector<DbgValueInst *, 4> DVIs;
public:
  AllocaPromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                 AllocaInst &AI, DIBuilder &DIB)
      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}

  void run(const SmallVectorImpl<Instruction *> &Insts) {
    // Retain the debug information attached to the alloca for use when
    // rewriting loads and stores.
    if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
      if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
        for (User *U : DINode->users())
          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
            DDIs.push_back(DDI);
          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
            DVIs.push_back(DVI);
      }
    }

    LoadAndStorePromoter::run(Insts);

    // While we have the debug information, clear it off of the alloca. The
    // caller takes care of deleting the alloca.
    while (!DDIs.empty())
      DDIs.pop_back_val()->eraseFromParent();
    while (!DVIs.empty())
      DVIs.pop_back_val()->eraseFromParent();
  }
  bool
  isInstInList(Instruction *I,
               const SmallVectorImpl<Instruction *> &Insts) const override {
    Value *Ptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(I))
      Ptr = LI->getOperand(0);
    else
      Ptr = cast<StoreInst>(I)->getPointerOperand();

    // Only used to detect cycles, which will be rare and quickly found as
    // we're walking up a chain of defs rather than down through uses.
    SmallPtrSet<Value *, 4> Visited;

    do {
      if (Ptr == &AI)
        return true;

      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
        Ptr = BCI->getOperand(0);
      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
        Ptr = GEPI->getPointerOperand();
      else
        return false;
    } while (Visited.insert(Ptr).second);

    return false;
  }
  void updateDebugInfo(Instruction *Inst) const override {
    for (DbgDeclareInst *DDI : DDIs)
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
    for (DbgValueInst *DVI : DVIs) {
      Value *Arg = nullptr;
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        // If an argument is zero extended then use argument directly. The ZExt
        // may be zapped by an optimization pass in future.
        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(SExt->getOperand(0));
        if (!Arg)
          Arg = SI->getValueOperand();
      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        Arg = LI->getPointerOperand();
      } else {
        continue;
      }
      DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(),
                                  DVI->getExpression(), DVI->getDebugLoc(),
                                  Inst);
    }
  }
};
} // end anon namespace
namespace {
/// \brief An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
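///
/// As an illustrative sketch of step 2 (not code from this file), a memset:
///
///   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4, i32 4, i1 false)
///
/// over a 4-byte slice can be rewritten into a plain 'store i32 0' onto the
/// new alloca, which mem2reg can then promote.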
class SROA : public FunctionPass {
  const bool RequiresDomTree;

  LLVMContext *C;
  DominatorTree *DT;
  AssumptionCache *AC;

  /// \brief Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;
  /// \brief A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient.
  SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;

  /// \brief Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;

  /// \brief A collection of alloca instructions we can directly promote.
  std::vector<AllocaInst *> PromotableAllocas;

  /// \brief A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;

  /// \brief A worklist of select instructions to speculate prior to promoting
  /// allocas.
  ///
  /// All of these select instructions have been checked for the safety of
  /// speculation and by being speculated will allow promoting allocas
  /// currently in the promotable queue.
  SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
public:
  SROA(bool RequiresDomTree = true)
      : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
        DT(nullptr) {
    initializeSROAPass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  const char *getPassName() const override { return "SROA"; }
  static char ID;
private:
  friend class PHIOrSelectSpeculator;
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                               AllocaSlices::Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};
}

char SROA::ID = 0;

FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  return new SROA(RequiresDomTree);
}
INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                    false)
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
static Type *findCommonType(AllocaSlices::const_iterator B,
                            AllocaSlices::const_iterator E,
                            uint64_t EndOffset) {
  Type *Ty = nullptr;
  bool TyIsCommon = true;
  IntegerType *ITy = nullptr;

  // Note that we need to look at *every* alloca slice's Use to ensure we
  // always get consistent results regardless of the order of slices.
  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
    Use *U = I->getUse();
    if (isa<IntrinsicInst>(*U->getUser()))
      continue;
    if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
      continue;

    Type *UserTy = nullptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
      UserTy = LI->getType();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
      UserTy = SI->getValueOperand()->getType();
    }

    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
      // If the type is larger than the partition, skip it. We only encounter
      // this for split integer operations where we want to use the type of the
      // entity causing the split. Also skip if the type is not a byte width
      // multiple.
      if (UserITy->getBitWidth() % 8 != 0 ||
          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
        continue;

      // Track the largest bitwidth integer type used in this way in case there
      // is no common type.
      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
        ITy = UserITy;
    }

    // To avoid depending on the order of slices, Ty and TyIsCommon must not
    // depend on types skipped above.
    if (!UserTy || (Ty && Ty != UserTy))
      TyIsCommon = false; // Give up on anything but an iN type.
    else
      Ty = UserTy;
  }

  return TyIsCommon ? Ty : ITy;
}
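// For instance (illustrative): if every slice in [B, E) is loaded and stored
// as i32 over the same [0,4) range, the common type is i32; mixing an i32
// load with a float load leaves no common type, and the largest integer
// type seen (i32) is returned instead.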
/// PHI instructions that use an alloca and are subsequently loaded can be
/// rewritten to load both input pointers in the pred blocks and then PHI the
/// results, allowing the load of the alloca to be promoted.
/// From this:
///   %P2 = phi [i32* %Alloca, i32* %Other]
///   %V = load i32* %P2
/// to:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   ...
///   %V2 = load i32* %Other
///   ...
///   %V = phi [i32 %V1, i32 %V2]
///
/// We can do this to a select if its only uses are loads and if the operands
/// to the select can be loaded unconditionally.
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
static bool isSafePHIToSpeculate(PHINode &PN) {
  // For now, we can only do this promotion if the load is in the same block
  // as the PHI, and if there are no stores between the phi and load.
  // TODO: Allow recursive phi users.
  // TODO: Allow stores.
  BasicBlock *BB = PN.getParent();
  unsigned MaxAlign = 0;
  bool HaveLoad = false;
  for (User *U : PN.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(U);
    if (!LI || !LI->isSimple())
      return false;

    // For now we only allow loads in the same block as the PHI. This is
    // a common case that happens when instcombine merges two loads through
    // a PHI.
    if (LI->getParent() != BB)
      return false;

    // Ensure that there are no instructions between the PHI and the load that
    // could store.
    for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
      if (BBI->mayWriteToMemory())
        return false;

    MaxAlign = std::max(MaxAlign, LI->getAlignment());
    HaveLoad = true;
  }

  if (!HaveLoad)
    return false;

  const DataLayout &DL = PN.getModule()->getDataLayout();
  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if it is a critical edge.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
    Value *InVal = PN.getIncomingValue(Idx);

    // If the value is produced by the terminator of the predecessor (an
    // invoke) or it has side-effects, there is no valid place to put a load
    // in the predecessor.
    if (TI == InVal || TI->mayHaveSideEffects())
      return false;

    // If the predecessor has a single successor, then the edge isn't
    // critical.
    if (TI->getNumSuccessors() == 1)
      continue;

    // If this pointer is always safe to load, or if we can prove that there
    // is already a load in the block, then we can move the load to the pred
    // block.
    if (isDereferenceablePointer(InVal, DL) ||
        isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
      continue;

    return false;
  }

  return true;
}
static void speculatePHINodeLoads(PHINode &PN) {
  DEBUG(dbgs() << "    original: " << PN << "\n");

  Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
  IRBuilderTy PHIBuilder(&PN);
  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                        PN.getName() + ".sroa.speculated");

  // Get the AA tags and alignment to use from one of the loads. It doesn't
  // matter which one we get and if any differ.
  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());

  AAMDNodes AATags;
  SomeLoad->getAAMetadata(AATags);
  unsigned Align = SomeLoad->getAlignment();

  // Rewrite all loads of the PN to use the new PHI.
  while (!PN.use_empty()) {
    LoadInst *LI = cast<LoadInst>(PN.user_back());
    LI->replaceAllUsesWith(NewPN);
    LI->eraseFromParent();
  }

  // Inject loads into all of the pred blocks.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    BasicBlock *Pred = PN.getIncomingBlock(Idx);
    TerminatorInst *TI = Pred->getTerminator();
    Value *InVal = PN.getIncomingValue(Idx);
    IRBuilderTy PredBuilder(TI);

    LoadInst *Load = PredBuilder.CreateLoad(
        InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
    ++NumLoadsSpeculated;
    Load->setAlignment(Align);
    if (AATags)
      Load->setAAMetadata(AATags);
    NewPN->addIncoming(Load, Pred);
  }

  DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
  PN.eraseFromParent();
}
/// Select instructions that use an alloca and are subsequently loaded can be
/// rewritten to load both input pointers and then select between the result,
/// allowing the load of the alloca to be promoted.
/// From this:
///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
///   %V = load i32* %P2
/// to:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   %V2 = load i32* %Other
///   %V = select i1 %cond, i32 %V1, i32 %V2
///
/// We can do this to a select if its only uses are loads and if the operand
/// to the select can be loaded unconditionally.
static bool isSafeSelectToSpeculate(SelectInst &SI) {
  Value *TValue = SI.getTrueValue();
  Value *FValue = SI.getFalseValue();
  const DataLayout &DL = SI.getModule()->getDataLayout();
  bool TDerefable = isDereferenceablePointer(TValue, DL);
  bool FDerefable = isDereferenceablePointer(FValue, DL);

  for (User *U : SI.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(U);
    if (!LI || !LI->isSimple())
      return false;

    // Both operands to the select need to be dereferencable, either
    // absolutely (e.g. allocas) or at this point because we can see other
    // accesses to it.
    if (!TDerefable &&
        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
      return false;
    if (!FDerefable &&
        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
      return false;
  }

  return true;
}
static void speculateSelectInstLoads(SelectInst &SI) {
  DEBUG(dbgs() << "    original: " << SI << "\n");

  IRBuilderTy IRB(&SI);
  Value *TV = SI.getTrueValue();
  Value *FV = SI.getFalseValue();
  // Replace the loads of the select with a select of two loads.
  while (!SI.use_empty()) {
    LoadInst *LI = cast<LoadInst>(SI.user_back());
    assert(LI->isSimple() && "We only speculate simple loads");

    IRB.SetInsertPoint(LI);
    LoadInst *TL =
        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
    LoadInst *FL =
        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
    NumLoadsSpeculated += 2;

    // Transfer alignment and AA info if present.
    TL->setAlignment(LI->getAlignment());
    FL->setAlignment(LI->getAlignment());

    AAMDNodes Tags;
    LI->getAAMetadata(Tags);
    if (Tags) {
      TL->setAAMetadata(Tags);
      FL->setAAMetadata(Tags);
    }

    Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
                                LI->getName() + ".sroa.speculated");

    DEBUG(dbgs() << "          speculated to: " << *V << "\n");
    LI->replaceAllUsesWith(V);
    LI->eraseFromParent();
  }
  SI.eraseFromParent();
}
/// \brief Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
  if (Indices.empty())
    return BasePtr;

  // A single zero index is a no-op, so check for this and avoid building a GEP
  // in that case.
  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
    return BasePtr;

  return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
                               NamePrefix + "sroa_idx");
}
/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
/// zero-indices down through type layers until we find one the same as
/// TargetTy. If we can't find one with the same type, we at least try to use
/// one with the same size. If none of that works, we just produce the GEP as
/// indicated by Indices to have the correct offset.
static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
                                    Value *BasePtr, Type *Ty, Type *TargetTy,
                                    SmallVectorImpl<Value *> &Indices,
                                    Twine NamePrefix) {
  if (Ty == TargetTy)
    return buildGEP(IRB, BasePtr, Indices, NamePrefix);

  // Pointer size to use for the indices.
  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());

  // See if we can descend into a struct and locate a field with the correct
  // type.
  unsigned NumLayers = 0;
  Type *ElementTy = Ty;
  do {
    if (ElementTy->isPointerTy())
      break;

    if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
      ElementTy = ArrayTy->getElementType();
      Indices.push_back(IRB.getIntN(PtrSize, 0));
    } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
      ElementTy = VectorTy->getElementType();
      Indices.push_back(IRB.getInt32(0));
    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
      if (STy->element_begin() == STy->element_end())
        break; // Nothing left to descend into.
      ElementTy = *STy->element_begin();
      Indices.push_back(IRB.getInt32(0));
    } else {
      break;
    }
    ++NumLayers;
  } while (ElementTy != TargetTy);
  if (ElementTy != TargetTy)
    Indices.erase(Indices.end() - NumLayers, Indices.end());

  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
/// \brief Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                       Value *Ptr, Type *Ty, APInt &Offset,
                                       Type *TargetTy,
                                       SmallVectorImpl<Value *> &Indices,
                                       Twine NamePrefix) {
  if (Offset == 0)
    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
                                 NamePrefix);

  // We can't recurse through pointer types.
  if (Ty->isPointerTy())
    return nullptr;

  // We try to analyze GEPs over vectors here, but note that these GEPs are
  // extremely poorly defined currently. The long-term goal is to remove GEPing
  // over a vector from the IR completely.
  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
    unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
    if (ElementSizeInBits % 8 != 0) {
      // GEPs over non-multiple of 8 size vector elements are invalid.
      return nullptr;
    }
    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
    APInt NumSkippedElements = Offset.sdiv(ElementSize);
    if (NumSkippedElements.ugt(VecTy->getNumElements()))
      return nullptr;
    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
                                    Offset, TargetTy, Indices, NamePrefix);
  }

  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    Type *ElementTy = ArrTy->getElementType();
    APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
    APInt NumSkippedElements = Offset.sdiv(ElementSize);
    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
      return nullptr;

    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                    Indices, NamePrefix);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);
  uint64_t StructOffset = Offset.getZExtValue();
  if (StructOffset >= SL->getSizeInBytes())
    return nullptr;
  unsigned Index = SL->getElementContainingOffset(StructOffset);
  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
  Type *ElementTy = STy->getElementType(Index);
  if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
    return nullptr; // The offset points into alignment padding.

  Indices.push_back(IRB.getInt32(Index));
  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                  Indices, NamePrefix);
}
/// \brief Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
/// composite types to arrive at the appropriate offset and element type for
/// a pointer. TargetTy is the element type the returned GEP should point-to if
/// possible. We recurse by decreasing Offset, adding the appropriate index to
/// Indices, and setting Ty to the result subtype.
///
/// If no natural GEP can be constructed, this function returns null.
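///
/// For example, with an illustrative Ptr of type i32* and a byte Offset of
/// 20, the element size is 4, so 5 whole elements are skipped and this
/// produces the equivalent of:
///   %gep = getelementptr inbounds i32, i32* %ptr, i64 5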
static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
                                      Value *Ptr, APInt Offset, Type *TargetTy,
                                      SmallVectorImpl<Value *> &Indices,
                                      Twine NamePrefix) {
  PointerType *Ty = cast<PointerType>(Ptr->getType());

  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
  // an i8 pointer.
  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
    return nullptr;

  Type *ElementTy = Ty->getElementType();
  if (!ElementTy->isSized())
    return nullptr; // We can't GEP through an unsized element.
  APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
  if (ElementSize == 0)
    return nullptr; // Zero-length arrays can't help us build a natural GEP.
  APInt NumSkippedElements = Offset.sdiv(ElementSize);

  Offset -= NumSkippedElements * ElementSize;
  Indices.push_back(IRB.getInt(NumSkippedElements));
  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                  Indices, NamePrefix);
}

/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
/// and produces the pointer type desired. Where it cannot, it will try to use
/// the natural GEP to arrive at the offset and bitcast to the type. Where that
/// fails, it will try to use an existing i8* and GEP to the byte offset and
/// bitcast to the type.
///
/// The strategy for finding the more natural GEPs is to peel off layers of the
/// pointer, walking back through bit casts and GEPs, searching for a base
/// pointer from which we can compute a natural GEP with the desired
/// properties. The algorithm tries to fold as many constant indices into
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
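///
/// As an illustrative example, adjusting an alloca of type { i32, i32 } by
/// 4 bytes toward an i32* result folds to the single natural GEP:
///   %p = getelementptr inbounds { i32, i32 }, { i32, i32 }* %a, i64 0, i32 1
/// whereas an offset that lands in padding instead falls back to an i8*
/// bitcast, a raw byte GEP, and a final bitcast to the desired type.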
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
                             APInt Offset, Type *PointerTy, Twine NamePrefix) {
  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(Ptr);
  SmallVector<Value *, 4> Indices;

  // We may end up computing an offset pointer that has the wrong type. If we
  // never are able to compute one directly that has the correct type, we'll
  // fall back to it, so keep it and the base it was computed from around here.
  Value *OffsetPtr = nullptr;
  Value *OffsetBasePtr;

  // Remember any i8 pointer we come across to re-use if we need to do a raw
  // byte offset.
  Value *Int8Ptr = nullptr;
  APInt Int8PtrOffset(Offset.getBitWidth(), 0);

  Type *TargetTy = PointerTy->getPointerElementType();

  do {
    // First fold any existing GEPs into the offset.
    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
      APInt GEPOffset(Offset.getBitWidth(), 0);
      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
        break;
      Offset += GEPOffset;
      Ptr = GEP->getPointerOperand();
      if (!Visited.insert(Ptr).second)
        break;
    }

    // See if we can perform a natural GEP here.
    Indices.clear();
    if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
                                           Indices, NamePrefix)) {
      // If we have a new natural pointer at the offset, clear out any old
      // offset pointer we computed. Unless it is the base pointer or
      // a non-instruction, we built a GEP we don't need. Zap it.
      if (OffsetPtr && OffsetPtr != OffsetBasePtr)
        if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
          assert(I->use_empty() && "Built a GEP with uses somehow!");
          I->eraseFromParent();
        }
      OffsetPtr = P;
      OffsetBasePtr = Ptr;
      // If we also found a pointer of the right type, we're done.
      if (P->getType() == PointerTy)
        return P;
    }

    // Stash this pointer if we've found an i8*.
    if (Ptr->getType()->isIntegerTy(8)) {
      Int8Ptr = Ptr;
      Int8PtrOffset = Offset;
    }

    // Peel off a layer of the pointer and update the offset appropriately.
    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
      Ptr = cast<Operator>(Ptr)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
      if (GA->mayBeOverridden())
        break;
      Ptr = GA->getAliasee();
    } else {
      break;
    }
    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(Ptr).second);

  if (!OffsetPtr) {
    if (!Int8Ptr) {
      Int8Ptr = IRB.CreateBitCast(
          Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
          NamePrefix + "sroa_raw_cast");
      Int8PtrOffset = Offset;
    }

    OffsetPtr = Int8PtrOffset == 0
                    ? Int8Ptr
                    : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
                                            IRB.getInt(Int8PtrOffset),
                                            NamePrefix + "sroa_raw_idx");
  }
  Ptr = OffsetPtr;

  // On the off chance we were targeting i8*, guard the bitcast here.
  if (Ptr->getType() != PointerTy)
    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");

  return Ptr;
}

/// \brief Compute the adjusted alignment for a load or store from an offset.
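///
/// For example, an access that is 8-byte aligned but is being rewritten at a
/// byte offset of 4 within its slice can only be assumed 4-byte aligned,
/// since MinAlign(8, 4) == 4.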
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
                                     const DataLayout &DL) {
  unsigned Alignment;
  Type *Ty;
  if (auto *LI = dyn_cast<LoadInst>(I)) {
    Alignment = LI->getAlignment();
    Ty = LI->getType();
  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
    Alignment = SI->getAlignment();
    Ty = SI->getValueOperand()->getType();
  } else {
    llvm_unreachable("Only loads and stores are allowed!");
  }

  if (!Alignment)
    Alignment = DL.getABITypeAlignment(Ty);

  return MinAlign(Alignment, Offset);
}

/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
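///
/// For example, on a target with 64-bit pointers, i64 <-> <2 x i32> and
/// i8* <-> i64 are viable conversions here, while i32 -> i64 is rejected
/// because the integer bit-widths differ.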
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
  if (OldTy == NewTy)
    return true;

  // For integer types, we can't handle any bit-width differences. This would
  // break both vector conversions with extension and introduce endianness
  // issues when in conjunction with loads and stores.
  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
               cast<IntegerType>(NewTy)->getBitWidth() &&
           "We can't have the same bitwidth for different int types");
    return false;
  }

  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
    return false;
  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
    return false;

  // We can convert pointers to integers and vice-versa. Same for vectors
  // of pointers and integers.
  OldTy = OldTy->getScalarType();
  NewTy = NewTy->getScalarType();
  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
    if (NewTy->isPointerTy() && OldTy->isPointerTy())
      return true;
    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
      return true;
    return false;
  }

  return true;
}

/// \brief Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                           Type *NewTy) {
  Type *OldTy = V->getType();
  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");

  if (OldTy == NewTy)
    return V;

  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
         "Integer types must be the exact same to convert.");

  // See if we need inttoptr for this type pair. A cast involving both scalars
  // and vectors requires an additional bitcast.
  if (OldTy->getScalarType()->isIntegerTy() &&
      NewTy->getScalarType()->isPointerTy()) {
    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
                                NewTy);

    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
                                NewTy);

    return IRB.CreateIntToPtr(V, NewTy);
  }

  // See if we need ptrtoint for this type pair. A cast involving both scalars
  // and vectors requires an additional bitcast.
  if (OldTy->getScalarType()->isPointerTy() &&
      NewTy->getScalarType()->isIntegerTy()) {
    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                               NewTy);

    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                               NewTy);

    return IRB.CreatePtrToInt(V, NewTy);
  }

  return IRB.CreateBitCast(V, NewTy);
}

/// \brief Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
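///
/// For example, with a candidate type <4 x i32> (ElementSize == 4), a slice
/// covering partition bytes [4, 12) maps to BeginIndex 1 and EndIndex 3 (a
/// two-element subvector), while a slice covering bytes [2, 6) is rejected
/// because its bounds are not element aligned.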
static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
                                            const Slice &S, VectorType *Ty,
                                            uint64_t ElementSize,
                                            const DataLayout &DL) {
  // First validate the slice offsets.
  uint64_t BeginOffset =
      std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset ||
      BeginIndex >= Ty->getNumElements())
    return false;
  uint64_t EndOffset =
      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
    return false;

  assert(EndIndex > BeginIndex && "Empty vector!");
  uint64_t NumElements = EndIndex - BeginIndex;
  Type *SliceTy = (NumElements == 1)
                      ? Ty->getElementType()
                      : VectorType::get(Ty->getElementType(), NumElements);

  Type *SplitIntTy =
      Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);

  Use *U = S.getUse();

  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile())
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
        II->getIntrinsicID() != Intrinsic::lifetime_end)
      return false;
  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
    // Disable vector promotion when there are loads or stores of an FCA.
    return false;
  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    Type *LTy = LI->getType();
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(LTy->isIntegerTy());
      LTy = SplitIntTy;
    }
    if (!canConvertValue(DL, SliceTy, LTy))
      return false;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    if (SI->isVolatile())
      return false;
    Type *STy = SI->getValueOperand()->getType();
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(STy->isIntegerTy());
      STy = SplitIntTy;
    }
    if (!canConvertValue(DL, STy, SliceTy))
      return false;
  } else {
    return false;
  }

  return true;
}

/// \brief Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
                                           const DataLayout &DL) {
  // Collect the candidate types for vector-based promotion. Also track whether
  // we have different element types.
  SmallVector<VectorType *, 4> CandidateTys;
  Type *CommonEltTy = nullptr;
  bool HaveCommonEltTy = true;
  auto CheckCandidateType = [&](Type *Ty) {
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      CandidateTys.push_back(VTy);
      if (!CommonEltTy)
        CommonEltTy = VTy->getElementType();
      else if (CommonEltTy != VTy->getElementType())
        HaveCommonEltTy = false;
    }
  };
  // Consider any loads or stores that are the exact size of the slice.
  for (const Slice &S : P)
    if (S.beginOffset() == P.beginOffset() &&
        S.endOffset() == P.endOffset()) {
      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
        CheckCandidateType(LI->getType());
      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
        CheckCandidateType(SI->getValueOperand()->getType());
    }

  // If we didn't find a vector type, nothing to do here.
  if (CandidateTys.empty())
    return nullptr;

  // Remove non-integer vector types if we had multiple common element types.
  // FIXME: It'd be nice to replace them with integer vector types, but we can't
  // do that until all the backends are known to produce good code for all
  // integer vector types.
  if (!HaveCommonEltTy) {
    CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
                                      [](VectorType *VTy) {
                                        return !VTy->getElementType()->isIntegerTy();
                                      }),
                       CandidateTys.end());

    // If there were no integer vector types, give up.
    if (CandidateTys.empty())
      return nullptr;

    // Rank the remaining candidate vector types. This is easy because we know
    // they're all integer vectors. We sort by ascending number of elements.
    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return RHSTy->getNumElements() < LHSTy->getNumElements();
    };
    std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
    CandidateTys.erase(
        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
        CandidateTys.end());
  } else {
    // The only way to have the same element type in every vector type is to
    // have the same vector type. Check that and remove all but one.
#ifndef NDEBUG
    for (VectorType *VTy : CandidateTys) {
      assert(VTy->getElementType() == CommonEltTy &&
             "Unaccounted for element type!");
      assert(VTy == CandidateTys[0] &&
             "Different vector types with the same element type!");
    }
#endif
    CandidateTys.resize(1);
  }

  // Try each vector type, and return the one which works.
  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());

    // While the definition of LLVM vectors is bitpacked, we don't support sizes
    // that aren't byte sized.
    if (ElementSize % 8)
      return false;
    assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
           "vector size not a multiple of element size?");
    ElementSize /= 8;

    for (const Slice &S : P)
      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
        return false;

    for (const Slice *S : P.splitSliceTails())
      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
        return false;

    return true;
  };
  for (VectorType *VTy : CandidateTys)
    if (CheckVectorTypeForPromotion(VTy))
      return VTy;

  return nullptr;
}

/// \brief Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
static bool isIntegerWideningViableForSlice(const Slice &S,
                                            uint64_t AllocBeginOffset,
                                            Type *AllocaTy,
                                            const DataLayout &DL,
                                            bool &WholeAllocaOp) {
  uint64_t Size = DL.getTypeStoreSize(AllocaTy);

  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;

  // We can't reasonably handle cases where the load or store extends past
  // the end of the alloca's type and into its padding.
  if (RelEnd > Size)
    return false;

  Use *U = S.getUse();

  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    // We can't handle loads that extend past the allocated memory.
    if (DL.getTypeStoreSize(LI->getType()) > Size)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, AllocaTy, LI->getType())) {
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    Type *ValueTy = SI->getValueOperand()->getType();
    if (SI->isVolatile())
      return false;
    // We can't handle stores that extend past the allocated memory.
    if (DL.getTypeStoreSize(ValueTy) > Size)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, ValueTy, AllocaTy)) {
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
        II->getIntrinsicID() != Intrinsic::lifetime_end)
      return false;
  } else {
    return false;
  }

  return true;
}

/// \brief Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
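///
/// For example, an alloca of type i32 accessed by a full i32 load plus i16
/// loads at byte offsets 0 and 2 is viable here: the covering i32 access
/// provides the whole-alloca operation, and the narrow accesses can be
/// rewritten as shifts and truncates of the wide value.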
static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy,
                                    const DataLayout &DL) {
  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
  // Don't create integer types larger than the maximum bitwidth.
  if (SizeInBits > IntegerType::MAX_INT_BITS)
    return false;

  // Don't try to handle allocas with bit-padding.
  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
    return false;

  // We need to ensure that an integer type with the appropriate bitwidth can
  // be converted to the alloca type, whatever that is. We don't want to force
  // the alloca itself to have an integer type if there is a more suitable one.
  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
  if (!canConvertValue(DL, AllocaTy, IntTy) ||
      !canConvertValue(DL, IntTy, AllocaTy))
    return false;

  // While examining uses, we ensure that the alloca has a covering load or
  // store. We don't want to widen the integer operations only to fail to
  // promote due to some other unsplittable entry (which we may make splittable
  // later). However, if there are only splittable uses, go ahead and assume
  // that we cover the alloca.
  // FIXME: We shouldn't consider split slices that happen to start in the
  // partition here...
  bool WholeAllocaOp =
      P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);

  for (const Slice &S : P)
    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  for (const Slice *S : P.splitSliceTails())
    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  return WholeAllocaOp;
}

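/// Extract the bits corresponding to a field at a byte Offset within a wider
/// integer. For example (little-endian), pulling an i16 at byte offset 2 out
/// of an i64 lshr's the value by 16 and truncates to i16; on a big-endian
/// target the shift amount is 8 * (8 - 2 - 2) == 32 instead.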
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                             IntegerType *Ty, uint64_t Offset,
                             const Twine &Name) {
  DEBUG(dbgs() << " start: " << *V << "\n");
  IntegerType *IntTy = cast<IntegerType>(V->getType());
  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
         "Element extends past full value");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
  if (ShAmt) {
    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
    DEBUG(dbgs() << " shifted: " << *V << "\n");
  }
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot extract to a larger integer!");
  if (Ty != IntTy) {
    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
    DEBUG(dbgs() << " trunced: " << *V << "\n");
  }
  return V;
}

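/// Insert V's bits into Old at a byte Offset, masking out the destination
/// bits first. For example (little-endian), inserting an i16 at byte offset 2
/// of an i64 zext's V to i64, shl's it by 16, clears bits [16, 32) of Old
/// with a mask, and or's the two together.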
static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
                            Value *V, uint64_t Offset, const Twine &Name) {
  IntegerType *IntTy = cast<IntegerType>(Old->getType());
  IntegerType *Ty = cast<IntegerType>(V->getType());
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot insert a larger integer!");
  DEBUG(dbgs() << " start: " << *V << "\n");
  if (Ty != IntTy) {
    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
    DEBUG(dbgs() << " extended: " << *V << "\n");
  }
  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
         "Element store outside of alloca store");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
  if (ShAmt) {
    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
    DEBUG(dbgs() << " shifted: " << *V << "\n");
  }

  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
    DEBUG(dbgs() << " masked: " << *Old << "\n");
    V = IRB.CreateOr(Old, V, Name + ".insert");
    DEBUG(dbgs() << " inserted: " << *V << "\n");
  }
  return V;
}

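/// Extract elements [BeginIndex, EndIndex) from a vector. For example,
/// extracting elements [1, 3) of a <4 x i32> emits:
///   %vec.extract = shufflevector <4 x i32> %V, <4 x i32> undef,
///                                <2 x i32> <i32 1, i32 2>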
static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
                            unsigned EndIndex, const Twine &Name) {
  VectorType *VecTy = cast<VectorType>(V->getType());
  unsigned NumElements = EndIndex - BeginIndex;
  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

  if (NumElements == VecTy->getNumElements())
    return V;

  if (NumElements == 1) {
    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
                                 Name + ".extract");
    DEBUG(dbgs() << " extract: " << *V << "\n");
    return V;
  }

  SmallVector<Constant *, 8> Mask;
  Mask.reserve(NumElements);
  for (unsigned i = BeginIndex; i != EndIndex; ++i)
    Mask.push_back(IRB.getInt32(i));
  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
                              ConstantVector::get(Mask), Name + ".extract");
  DEBUG(dbgs() << " shuffle: " << *V << "\n");
  return V;
}

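/// Insert V (a vector or a single element) into Old at BeginIndex. For
/// example, inserting a <2 x i32> at index 1 of a <4 x i32> first widens V
/// with the shuffle mask <undef, 0, 1, undef> and then selects between the
/// widened value and Old with the constant mask <false, true, true, false>.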
static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
                           unsigned BeginIndex, const Twine &Name) {
  VectorType *VecTy = cast<VectorType>(Old->getType());
  assert(VecTy && "Can only insert a vector into a vector");

  VectorType *Ty = dyn_cast<VectorType>(V->getType());
  if (!Ty) {
    // Single element to insert.
    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
                                Name + ".insert");
    DEBUG(dbgs() << " insert: " << *V << "\n");
    return V;
  }

  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
         "Too many elements!");
  if (Ty->getNumElements() == VecTy->getNumElements()) {
    assert(V->getType() == VecTy && "Vector type mismatch");
    return V;
  }
  unsigned EndIndex = BeginIndex + Ty->getNumElements();

  // When inserting a smaller vector into the larger to store, we first
  // use a shuffle vector to widen it with undef elements, and then
  // a second shuffle vector to select between the loaded vector and the
  // incoming vector.
  SmallVector<Constant *, 8> Mask;
  Mask.reserve(VecTy->getNumElements());
  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
    if (i >= BeginIndex && i < EndIndex)
      Mask.push_back(IRB.getInt32(i - BeginIndex));
    else
      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
                              ConstantVector::get(Mask), Name + ".expand");
  DEBUG(dbgs() << " shuffle: " << *V << "\n");

  Mask.clear();
  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
    Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));

  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");

  DEBUG(dbgs() << " blend: " << *V << "\n");
  return V;
}

namespace {
/// \brief Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
/// lives here.
class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
  typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;

  const DataLayout &DL;
  AllocaSlices &AS;
  SROA &Pass;
  AllocaInst &OldAI, &NewAI;
  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
  Type *NewAllocaTy;

  // This is a convenience and flag variable that will be null unless the new
  // alloca's integer operations should be widened to this integer type due to
  // passing isIntegerWideningViable above. If it is non-null, the desired
  // integer type will be stored here for easy access during rewriting.
  IntegerType *IntTy;

  // If we are rewriting an alloca partition which can be written as pure
  // vector operations, we stash extra information here. When VecTy is
  // non-null, we have some strict guarantees about the rewritten alloca:
  //   - The new alloca is exactly the size of the vector type here.
  //   - The accesses all either map to the entire vector or to a single
  //     element.
  //   - The set of accessing instructions is only one of those handled above
  //     in isVectorPromotionViable. Generally these are the same access kinds
  //     which are promotable via mem2reg.
  VectorType *VecTy;
  Type *ElementTy;
  uint64_t ElementSize;

  // The original offset of the slice currently being rewritten relative to
  // the original alloca.
  uint64_t BeginOffset, EndOffset;
  // The new offsets of the slice currently being rewritten relative to the
  // original alloca.
  uint64_t NewBeginOffset, NewEndOffset;

  uint64_t SliceSize;
  bool IsSplittable;
  bool IsSplit;
  Use *OldUse;
  Instruction *OldPtr;

  // Track post-rewrite users which are PHI nodes and Selects.
  SmallPtrSetImpl<PHINode *> &PHIUsers;
  SmallPtrSetImpl<SelectInst *> &SelectUsers;

  // Utility IR builder, whose name prefix is setup for each visited use, and
  // the insertion point is set to point to the user.
  IRBuilderTy IRB;

public:
  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                      AllocaInst &OldAI, AllocaInst &NewAI,
                      uint64_t NewAllocaBeginOffset,
                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                      VectorType *PromotableVecTy,
                      SmallPtrSetImpl<PHINode *> &PHIUsers,
                      SmallPtrSetImpl<SelectInst *> &SelectUsers)
      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
        NewAllocaBeginOffset(NewAllocaBeginOffset),
        NewAllocaEndOffset(NewAllocaEndOffset),
        NewAllocaTy(NewAI.getAllocatedType()),
        IntTy(IsIntegerPromotable
                  ? Type::getIntNTy(
                        NewAI.getContext(),
                        DL.getTypeSizeInBits(NewAI.getAllocatedType()))
                  : nullptr),
        VecTy(PromotableVecTy),
        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
        IRB(NewAI.getContext(), ConstantFolder()) {
    if (VecTy) {
      assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
             "Only multiple-of-8 sized vector elements are viable");
      ++NumVectorized;
    }
    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
  }
  bool visit(AllocaSlices::const_iterator I) {
    bool CanSROA = true;
    BeginOffset = I->beginOffset();
    EndOffset = I->endOffset();
    IsSplittable = I->isSplittable();
    IsSplit =
        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
    DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
    DEBUG(AS.printSlice(dbgs(), I, ""));
    DEBUG(dbgs() << "\n");

    // Compute the intersecting offset range.
    assert(BeginOffset < NewAllocaEndOffset);
    assert(EndOffset > NewAllocaBeginOffset);
    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);

    SliceSize = NewEndOffset - NewBeginOffset;

    OldUse = I->getUse();
    OldPtr = cast<Instruction>(OldUse->get());

    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
    IRB.SetInsertPoint(OldUserI);
    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
    IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");

    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
    if (VecTy || IntTy)
      assert(CanSROA);
    return CanSROA;
  }

private:
  // Make sure the other visit overloads are visible.
  using Base::visit;

  // Every instruction which can end up as a user must have a rewrite rule.
  bool visitInstruction(Instruction &I) {
    DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }

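  /// Compute a pointer to the given slice of the new alloca.
  ///
  /// As an illustrative example of the debug-name recovery below, an old
  /// pointer named "x.sroa.0.16.copyload" is reduced to "copyload" before the
  /// new slice's naming prefix is applied.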
  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
    // Note that the offset computation can use BeginOffset or NewBeginOffset
    // interchangeably for unsplit slices.
    assert(IsSplit || BeginOffset == NewBeginOffset);
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;

#ifndef NDEBUG
    StringRef OldName = OldPtr->getName();
    // Skip through the last '.sroa.' component of the name.
    size_t LastSROAPrefix = OldName.rfind(".sroa.");
    if (LastSROAPrefix != StringRef::npos) {
      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
      // Look for an SROA slice index.
      size_t IndexEnd = OldName.find_first_not_of("0123456789");
      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
        // Strip the index and look for the offset.
        OldName = OldName.substr(IndexEnd + 1);
        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
          // Strip the offset.
          OldName = OldName.substr(OffsetEnd + 1);
      }
    }
    // Strip any SROA suffixes as well.
    OldName = OldName.substr(0, OldName.find(".sroa_"));
#endif

    return getAdjustedPtr(IRB, DL, &NewAI,
                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
#ifndef NDEBUG
                          Twine(OldName) + "."
#else
                          Twine()
#endif
                          );
  }

  /// \brief Compute suitable alignment to access this slice of the *new*
  /// alloca.
  ///
  /// You can optionally pass a type to this routine and if that type's ABI
  /// alignment is itself suitable, this will return zero.
  unsigned getSliceAlign(Type *Ty = nullptr) {
    unsigned NewAIAlign = NewAI.getAlignment();
    if (!NewAIAlign)
      NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
    unsigned Align =
        MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
    return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
  }

  unsigned getIndex(uint64_t Offset) {
    assert(VecTy && "Can only call getIndex when rewriting a vector");
    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
    uint32_t Index = RelOffset / ElementSize;
    assert(Index * ElementSize == RelOffset);
    return Index;
  }

  void deleteIfTriviallyDead(Value *V) {
    Instruction *I = cast<Instruction>(V);
    if (isInstructionTriviallyDead(I))
      Pass.DeadInsts.insert(I);
  }

  Value *rewriteVectorizedLoadInst() {
    unsigned BeginIndex = getIndex(NewBeginOffset);
    unsigned EndIndex = getIndex(NewEndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");

    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
  }

  Value *rewriteIntegerLoad(LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
    V = convertValue(DL, IRB, V, IntTy);
    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset)
      V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset,
                         "extract");
    return V;
  }

  bool visitLoadInst(LoadInst &LI) {
    DEBUG(dbgs() << " original: " << LI << "\n");
    Value *OldOp = LI.getOperand(0);
    assert(OldOp == OldPtr);

    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                             : LI.getType();
    const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
    bool IsPtrAdjusted = false;
    Value *V;
    if (VecTy) {
      V = rewriteVectorizedLoadInst();
    } else if (IntTy && LI.getType()->isIntegerTy()) {
      V = rewriteIntegerLoad(LI);
    } else if (NewBeginOffset == NewAllocaBeginOffset &&
               NewEndOffset == NewAllocaEndOffset &&
               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
                 TargetTy->isIntegerTy()))) {
      LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                              LI.isVolatile(), LI.getName());
      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
      V = NewLI;

      // If this is an integer load past the end of the slice (which means the
      // bytes outside the slice are undef or this load is dead) just forcibly
      // fix the integer size with correct handling of endianness.
      if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
        if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
          if (AITy->getBitWidth() < TITy->getBitWidth()) {
            V = IRB.CreateZExt(V, TITy, "load.ext");
            if (DL.isBigEndian())
              V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
                                "endian_shift");
          }
    } else {
      Type *LTy = TargetTy->getPointerTo();
      LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
                                              getSliceAlign(TargetTy),
                                              LI.isVolatile(), LI.getName());
      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());

      V = NewLI;
      IsPtrAdjusted = true;
    }
    V = convertValue(DL, IRB, V, TargetTy);

    if (IsSplit) {
      assert(!LI.isVolatile());
      assert(LI.getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
             "Split load isn't smaller than original load");
      assert(LI.getType()->getIntegerBitWidth() ==
                 DL.getTypeStoreSizeInBits(LI.getType()) &&
             "Non-byte-multiple bit width");
      // Move the insertion point just past the load so that we can refer to it.
      IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
      // Create a placeholder value with the same type as LI to use as the
      // basis for the new value. This allows us to replace the uses of LI with
      // the computed value, and then replace the placeholder with LI, leaving
      // LI only used for this computation.
      Value *Placeholder =
          new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                        "insert");
      LI.replaceAllUsesWith(V);
      Placeholder->replaceAllUsesWith(&LI);
      delete Placeholder;
    } else {
      LI.replaceAllUsesWith(V);
    }

    Pass.DeadInsts.insert(&LI);
    deleteIfTriviallyDead(OldOp);
    DEBUG(dbgs() << " to: " << *V << "\n");
    return !LI.isVolatile() && !IsPtrAdjusted;
  }

  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
    if (V->getType() != VecTy) {
      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
      Type *SliceTy = (NumElements == 1)
                          ? ElementTy
                          : VectorType::get(ElementTy, NumElements);
      if (V->getType() != SliceTy)
        V = convertValue(DL, IRB, V, SliceTy);

      // Mix in the existing elements.
      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      V = insertVector(IRB, Old, V, BeginIndex, "vec");
    }
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
    Pass.DeadInsts.insert(&SI);

    (void)Store;
    DEBUG(dbgs() << " to: " << *Store << "\n");
    return true;
  }

  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
    assert(IntTy && "We cannot extract an integer from the alloca");
    assert(!SI.isVolatile());
    if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
    }
    V = convertValue(DL, IRB, V, NewAllocaTy);
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
    Pass.DeadInsts.insert(&SI);
    (void)Store;
    DEBUG(dbgs() << " to: " << *Store << "\n");
    return true;
  }

  bool visitStoreInst(StoreInst &SI) {
    DEBUG(dbgs() << " original: " << SI << "\n");
    Value *OldOp = SI.getOperand(1);
    assert(OldOp == OldPtr);

    Value *V = SI.getValueOperand();

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after promoting this alloca.
    if (V->getType()->isPointerTy())
      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
        Pass.PostPromotionWorklist.insert(AI);

    if (SliceSize < DL.getTypeStoreSize(V->getType())) {
      assert(!SI.isVolatile());
      assert(V->getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(V->getType()->getIntegerBitWidth() ==
                 DL.getTypeStoreSizeInBits(V->getType()) &&
             "Non-byte-multiple bit width");
      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
                         "extract");
    }

    if (VecTy)
      return rewriteVectorizedStoreInst(V, SI, OldOp);
    if (IntTy && V->getType()->isIntegerTy())
      return rewriteIntegerStore(V, SI);

    const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
    StoreInst *NewSI;
    if (NewBeginOffset == NewAllocaBeginOffset &&
        NewEndOffset == NewAllocaEndOffset &&
        (canConvertValue(DL, V->getType(), NewAllocaTy) ||
         (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
          V->getType()->isIntegerTy()))) {
      // If this is an integer store past the end of slice (and thus the bytes
      // past that point are irrelevant or this is unreachable), truncate the
      // value prior to storing.
      if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
        if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
          if (VITy->getBitWidth() > AITy->getBitWidth()) {
            if (DL.isBigEndian())
              V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
                                 "endian_shift");
            V = IRB.CreateTrunc(V, AITy, "load.trunc");
          }

      V = convertValue(DL, IRB, V, NewAllocaTy);
      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                     SI.isVolatile());
    } else {
      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
      NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
                                     SI.isVolatile());
    }
    if (SI.isVolatile())
      NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
    Pass.DeadInsts.insert(&SI);
    deleteIfTriviallyDead(OldOp);

    DEBUG(dbgs() << " to: " << *NewSI << "\n");
    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
  }

  /// \brief Compute an integer value from splatting an i8 across the given
  /// number of bytes.
  ///
  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
  /// call this routine.
  /// FIXME: Heed the advice above.
  ///
  /// \param V The i8 value to splat.
  /// \param Size The number of bytes in the output (assuming i8 is one byte)
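  ///
  /// For example, splatting the byte 0xAB across 4 bytes zext's it to i32 and
  /// multiplies by 0x01010101 (the all-ones i32 udiv'd by the zero-extended
  /// all-ones i8), which yields 0xABABABAB.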
  Value *getIntegerSplat(Value *V, unsigned Size) {
    assert(Size > 0 && "Expected a positive number of bytes.");
    IntegerType *VTy = cast<IntegerType>(V->getType());
    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
    if (Size == 1)
      return V;

    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
    V = IRB.CreateMul(
        IRB.CreateZExt(V, SplatIntTy, "zext"),
        ConstantExpr::getUDiv(
            Constant::getAllOnesValue(SplatIntTy),
            ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
                                  SplatIntTy)),
        "isplat");
    return V;
  }

  /// \brief Compute a vector splat for a given element value.
  Value *getVectorSplat(Value *V, unsigned NumElements) {
    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
    DEBUG(dbgs() << " splat: " << *V << "\n");
    return V;
  }

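  /// Rewrite a memset intrinsic that targets this slice. For example, a
  /// 4-byte memset of the byte 0xAB over an alloca widened to i32 becomes a
  /// single "store i32 0xABABABAB" built from the splat helpers above.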
  bool visitMemSetInst(MemSetInst &II) {
    DEBUG(dbgs() << " original: " << II << "\n");
    assert(II.getRawDest() == OldPtr);

    // If the memset has a variable size, it cannot be split, just adjust the
    // pointer to the new alloca.
    if (!isa<Constant>(II.getLength())) {
      assert(!IsSplit);
      assert(NewBeginOffset == BeginOffset);
      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
      Type *CstTy = II.getAlignmentCst()->getType();
      II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));

      deleteIfTriviallyDead(OldPtr);
      return false;
    }

    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    Type *AllocaTy = NewAI.getAllocatedType();
    Type *ScalarTy = AllocaTy->getScalarType();

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memset.
    if (!VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(AllocaTy) ||
         !AllocaTy->isSingleValueType() ||
         !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
         DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
      CallInst *New = IRB.CreateMemSet(
          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
          getSliceAlign(), II.isVolatile());
      (void)New;
      DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    // If we can represent this as a simple value, we have to build the actual
    // value to store, which requires expanding the byte present in memset to
    // a sensible representation for the alloca type. This is essentially
    // splatting the byte to a sufficiently wide integer, splatting it across
    // any desired vector width, and bitcasting to the final type.
    Value *V;

    if (VecTy) {
      // If this is a memset of a vectorized alloca, insert it.
      assert(ElementTy == ScalarTy);

      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

      Value *Splat =
          getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
      Splat = convertValue(DL, IRB, Splat, ElementTy);
      if (NumElements > 1)
        Splat = getVectorSplat(Splat, NumElements);

      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
    } else if (IntTy) {
      // If this is a memset on an alloca where we can widen stores, insert the
      // splat integer.
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(II.getValue(), Size);

      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                    EndOffset != NewAllocaEndOffset)) {
        Value *Old =
            IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
        Old = convertValue(DL, IRB, Old, IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = convertValue(DL, IRB, V, AllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
        V = getVectorSplat(V, AllocaVecTy->getNumElements());

      V = convertValue(DL, IRB, V, AllocaTy);
    }

    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                        II.isVolatile());
    (void)New;
    DEBUG(dbgs() << " to: " << *New << "\n");
    return !II.isVolatile();
  }

  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    DEBUG(dbgs() << " original: " << II << "\n");

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    unsigned SliceAlign = getSliceAlign();

    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      if (IsDest)
        II.setDest(AdjustedPtr);
      else
        II.setSource(AdjustedPtr);

      if (II.getAlignment() > SliceAlign) {
        Type *CstTy = II.getAlignmentCst()->getType();
        II.setAlignment(
            ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
      }

      DEBUG(dbgs() << " to: " << II << "\n");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy =
        !VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
         !NewAI.getAllocatedType()->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // just a minor optimization.
    if (EmitMemCpy && &OldAI == &NewAI) {
      // Ensure the start lines up.
      assert(NewBeginOffset == BeginOffset);

      // Rewrite the size as needed.
      if (NewEndOffset != EndOffset)
        II.setLength(ConstantInt::get(II.getLength()->getType(),
                                      NewEndOffset - NewBeginOffset));
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI =
            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
      assert(AI != &OldAI && AI != &NewAI &&
             "Splittable transfers cannot reach the same alloca on both ends.");
      Pass.Worklist.insert(AI);
    }

    Type *OtherPtrTy = OtherPtr->getType();
    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();

    // Compute the relative offset for the other pointer within the transfer.
    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
    unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
                                   OtherOffset.zextOrTrunc(64).getZExtValue());

    if (EmitMemCpy) {
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                OtherPtr->getName() + ".");

      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);

      CallInst *New = IRB.CreateMemCpy(
          IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
          MinAlign(SliceAlign, OtherAlign), II.isVolatile());
      (void)New;
      DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                         NewEndOffset == NewAllocaEndOffset;
    uint64_t Size = NewEndOffset - NewBeginOffset;
    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
    unsigned NumElements = EndIndex - BeginIndex;
    IntegerType *SubIntTy =
        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;

    // Reset the other pointer type to match the register type we're going to
    // use, but using the address space of the original other pointer.
    if (VecTy && !IsWholeAlloca) {
      if (NumElements == 1)
        OtherPtrTy = VecTy->getElementType();
      else
        OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);

      OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
    } else if (IntTy && !IsWholeAlloca) {
      OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
    } else {
      OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
    }

    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                   OtherPtr->getName() + ".");
    unsigned SrcAlign = OtherAlign;
    Value *DstPtr = &NewAI;
    unsigned DstAlign = SliceAlign;
    if (!IsDest) {
      std::swap(SrcPtr, DstPtr);
      std::swap(SrcAlign, DstAlign);
    }

    Value *Src;
    if (VecTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      Src = convertValue(DL, IRB, Src, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
    } else {
      Src =
          IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
    }

    if (VecTy && !IsWholeAlloca && IsDest) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && IsDest) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
      Src = convertValue(DL, IRB, Src, NewAllocaTy);
    }

    StoreInst *Store = cast<StoreInst>(
        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
    (void)Store;
    DEBUG(dbgs() << " to: " << *Store << "\n");
    return !II.isVolatile();
  }

  bool visitIntrinsicInst(IntrinsicInst &II) {
    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
           II.getIntrinsicID() == Intrinsic::lifetime_end);
    DEBUG(dbgs() << " original: " << II << "\n");
    assert(II.getArgOperand(1) == OldPtr);

    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    ConstantInt *Size =
        ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                         NewEndOffset - NewBeginOffset);
    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr, Size);
    else
      New = IRB.CreateLifetimeEnd(Ptr, Size);

    (void)New;
    DEBUG(dbgs() << " to: " << *New << "\n");
    return true;
  }

  bool visitPHINode(PHINode &PN) {
    DEBUG(dbgs() << " original: " << PN << "\n");
    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilderTy PtrBuilder(IRB);
    if (isa<PHINode>(OldPtr))
      PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt());
    else
      PtrBuilder.SetInsertPoint(OldPtr);
    PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());

    Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);

    DEBUG(dbgs() << " to: " << PN << "\n");
    deleteIfTriviallyDead(OldPtr);

    // PHIs can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    PHIUsers.insert(&PN);
    return true;
  }

  bool visitSelectInst(SelectInst &SI) {
    DEBUG(dbgs() << " original: " << SI << "\n");
    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
           "Pointer isn't an operand!");
    assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");

    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    if (SI.getOperand(1) == OldPtr)
      SI.setOperand(1, NewPtr);
    if (SI.getOperand(2) == OldPtr)
      SI.setOperand(2, NewPtr);

    DEBUG(dbgs() << " to: " << SI << "\n");
    deleteIfTriviallyDead(OldPtr);

    // Selects can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    SelectUsers.insert(&SI);
    return true;
  }
};
}

namespace {
/// \brief Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;

  const DataLayout &DL;

  /// Queue of pointer uses to analyze and potentially rewrite.
  SmallVector<Use *, 8> Queue;

  /// Set to prevent us from cycling with phi nodes and loops.
  SmallPtrSet<User *, 8> Visited;

  /// The current pointer use being rewritten. This is used to dig up the used
  /// value (as opposed to the user).
  Use *U;

public:
  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}

  /// Rewrite loads and stores through a pointer and all pointers derived from
  /// it.
  bool rewrite(Instruction &I) {
    DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(cast<Instruction>(U->getUser()));
    }
    return Changed;
  }

private:
  /// Enqueue all the users of the given instruction for further processing.
  /// This uses a set to de-duplicate users.
  void enqueueUsers(Instruction &I) {
    for (Use &U : I.uses())
      if (Visited.insert(U.getUser()).second)
        Queue.push_back(&U);
  }

  // Conservative default is to not rewrite anything.
  bool visitInstruction(Instruction &I) { return false; }

3224
  /// \brief Generic recursive split emission class.
  template <typename Derived> class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilderTy IRB;

    /// The indices to be used with insert- or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;

    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;

    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// Initialize the splitter with an insertion point, Ptr and start with a
    /// single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
  public:
    /// \brief Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType())
        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };
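  // To make the shared index list concrete (an illustrative sketch): when
  // splitting a value of type {i32, [2 x float]}, visiting the second array
  // element pushes Indices = {1, 1} while GEPIndices holds {i32 0, i32 1,
  // i32 1}. The same {1, 1} list drives insertvalue/extractvalue, and the GEP
  // form (with its extra leading zero) addresses that element in memory.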
  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *GEP =
          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      DEBUG(dbgs() << "          to: " << *Load << "\n");
    }
  };
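  // For a load %x of type {i32, i32}, emitFunc produces roughly the following
  // (illustrative IR; the actual names come from the ".fca" prefix built in
  // visitLoadInst below):
  //
  //   %x.fca.0.gep = getelementptr inbounds {i32, i32}* %ptr, i32 0, i32 0
  //   %x.fca.0.load = load i32* %x.fca.0.gep
  //   %x.fca.0.insert = insertvalue {i32, i32} undef, i32 %x.fca.0.load, 0
  //
  // and likewise for index 1, building the aggregate back up in Agg.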
  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
    DEBUG(dbgs() << "    original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U);
    Value *V = UndefValue::get(LI.getType());
    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }
  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      Value *Store = IRB.CreateStore(
          IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
      (void)Store;
      DEBUG(dbgs() << "          to: " << *Store << "\n");
    }
  };
  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    DEBUG(dbgs() << "    original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    SI.eraseFromParent();
    return true;
  }
  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};
/// \brief Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
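///
/// For example (illustrative, under a typical data layout): a type such as
/// { { [1 x i32] } } is stripped all the way down to i32, since every wrapper
/// layer has the same size and alloc size as i32, whereas { i32, i8 } is left
/// alone because stripping to i32 would change both sizes.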
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
      TypeSize > DL.getTypeSizeInBits(InnerTy))
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}
/// \brief Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset cause either end of a type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
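///
/// For example (an illustrative case): for Ty = {i32, i32, i32}, Offset = 4
/// and Size = 8 this computes the sub-struct {i32, i32}, while Offset = 2
/// gives up and returns nullptr because no element boundary matches.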
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
                              uint64_t Size) {
  if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(DL, Ty);
  if (Offset > DL.getTypeAllocSize(Ty) ||
      (DL.getTypeAllocSize(Ty) - Offset) < Size)
    return nullptr;
  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return nullptr;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
      if (NumSkippedElements >= ArrTy->getNumElements())
        return nullptr;
    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
      if (NumSkippedElements >= VecTy->getNumElements())
        return nullptr;
    }
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return nullptr;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(DL, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(DL, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return nullptr;
    return ArrayType::get(ElementTy, NumElements);
  }
  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return nullptr;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return nullptr;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return nullptr; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return nullptr;
    return getTypePartition(DL, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(DL, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return nullptr; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return nullptr;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy =
      StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
  const StructLayout *SubSL = DL.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return nullptr; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}
/// \brief Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
/// start rewriting the accesses to partitions of the alloca we lose the
/// necessary information to correctly split apart paired loads and stores
/// which both point into this alloca. The case to consider is something like
/// the following:
///
///   %a = alloca [12 x i8]
///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
///   %iptr1 = bitcast i8* %gep1 to i64*
///   %iptr2 = bitcast i8* %gep2 to i64*
///   %fptr1 = bitcast i8* %gep1 to float*
///   %fptr2 = bitcast i8* %gep2 to float*
///   %fptr3 = bitcast i8* %gep3 to float*
///   store float 0.0, float* %fptr1
///   store float 1.0, float* %fptr2
///   %v = load i64* %iptr1
///   store i64 %v, i64* %iptr2
///   %f1 = load float* %fptr2
///   %f2 = load float* %fptr3
///
/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
/// promote everything so we recover the 2 SSA values that should have been
/// there all along.
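///
/// Concretely (an illustrative sketch of the result), the i64 load and store
/// above are each pre-split into two 4-byte halves aligned to the partition
/// boundaries, after which every access is partition-sized and mem2reg can
/// promote the whole alloca: %f1 becomes 0.0 and %f2 becomes 1.0 with no
/// memory traffic left.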
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector<LoadInst *, 4> Loads;
  SmallVector<StoreInst *, 4> Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector<uint64_t> Splits;
  };
  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
DEBUG(dbgs() << " Searching for candidate loads and stores\n");
3591
for (auto &P : AS.partitions()) {
3592
for (Slice &S : P) {
3593
Instruction *I = cast<Instruction>(S.getUse()->getUser());
3594
if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
3595
// If this was a load we have to track that it can't participate in any
3597
if (auto *LI = dyn_cast<LoadInst>(I))
3598
UnsplittableLoads.insert(LI);
3601
assert(P.endOffset() > S.beginOffset() &&
3602
"Empty or backwards partition!");
3604
// Determine if this is a pre-splittable slice.
3605
if (auto *LI = dyn_cast<LoadInst>(I)) {
3606
assert(!LI->isVolatile() && "Cannot split volatile loads!");
3608
// The load must be used exclusively to store into other pointers for
3609
// us to be able to arbitrarily pre-split it. The stores must also be
3610
// simple to avoid changing semantics.
3611
auto IsLoadSimplyStored = [](LoadInst *LI) {
3612
for (User *LU : LI->users()) {
3613
auto *SI = dyn_cast<StoreInst>(LU);
3614
if (!SI || !SI->isSimple())
3619
if (!IsLoadSimplyStored(LI)) {
3620
UnsplittableLoads.insert(LI);
3624
Loads.push_back(LI);
3625
} else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
3627
S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
3629
auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
3630
if (!StoredLoad || !StoredLoad->isSimple())
3632
assert(!SI->isVolatile() && "Cannot split volatile stores!");
3634
Stores.push_back(SI);
3636
// Other uses cannot be pre-split.
3640
// Record the initial split.
3641
DEBUG(dbgs() << " Candidate: " << *I << "\n");
3642
auto &Offsets = SplitOffsetsMap[I];
3643
assert(Offsets.Splits.empty() &&
3644
"Should not have splits the first time we see an instruction!");
3646
Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
3649
    // Now scan the already split slices, and add a split for any of them which
    // we're going to pre-split.
    for (Slice *S : P.splitSliceTails()) {
      auto SplitOffsetsMapI =
          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
      if (SplitOffsetsMapI == SplitOffsetsMap.end())
        continue;
      auto &Offsets = SplitOffsetsMapI->second;

      assert(Offsets.S == S && "Found a mismatched slice!");
      assert(!Offsets.Splits.empty() &&
             "Cannot have an empty set of splits on the second partition!");
      assert(Offsets.Splits.back() ==
                 P.beginOffset() - Offsets.S->beginOffset() &&
             "Previous split does not end where this one begins!");

      // Record each split. The last partition's end isn't needed as the size
      // of the slice dictates that.
      if (S->endOffset() > P.endOffset())
        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
    }
  }
  // We may have split loads where some of their stores are split stores. For
  // such loads and stores, we can only pre-split them if their splits exactly
  // match relative to their starting offset. We have to verify this prior to
  // any rewriting.
  Stores.erase(
      std::remove_if(Stores.begin(), Stores.end(),
                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
                       // Lookup the load we are storing in our map of split
                       // offsets.
                       auto *LI = cast<LoadInst>(SI->getValueOperand());
                       // If it was completely unsplittable, then we're done,
                       // and this store can't be pre-split.
                       if (UnsplittableLoads.count(LI))
                         return true;

                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
                       if (LoadOffsetsI == SplitOffsetsMap.end())
                         return false; // Unrelated loads are definitely safe.
                       auto &LoadOffsets = LoadOffsetsI->second;

                       // Now lookup the store's offsets.
                       auto &StoreOffsets = SplitOffsetsMap[SI];

                       // If the relative offsets of each split in the load and
                       // store match exactly, then we can split them and we
                       // don't need to remove them here.
                       if (LoadOffsets.Splits == StoreOffsets.Splits)
                         return false;

                       DEBUG(dbgs()
                             << "    Mismatched splits for load and store:\n"
                             << "      " << *LI << "\n"
                             << "      " << *SI << "\n");

                       // We've found a store and load that we need to split
                       // with mismatched relative splits. Just give up on them
                       // and remove both instructions from our list of
                       // candidates.
                       UnsplittableLoads.insert(LI);
                       return true;
                     }),
      Stores.end());
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                              [&UnsplittableLoads](StoreInst *SI) {
                                auto *LI =
                                    cast<LoadInst>(SI->getValueOperand());
                                return UnsplittableLoads.count(LI);
                              }),
               Stores.end());
  // Once we've established all the loads that can't be split for some reason,
  // filter any that made it into our list out.
  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
                             [&UnsplittableLoads](LoadInst *LI) {
                               return UnsplittableLoads.count(LI);
                             }),
              Loads.end());
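  // As a concrete sketch of why this second pass over Stores is needed:
  // suppose load %v feeds stores %s1 and %s2, and %s1's splits match %v's
  // while %s2's do not. Rejecting %s2 marks %v unsplittable, and only this
  // second pass notices that %s1 must be dropped as well, since the split
  // halves of %v that %s1 would store will never be created.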
  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;

  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector<Slice, 4> NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order in order to handle when one of the loads
  // is the value operand to one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
  std::vector<LoadInst *> SplitLoads;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  for (LoadInst *LI : Loads) {
    SplitLoads.clear();

    IntegerType *Ty = cast<IntegerType>(LI->getType());
    uint64_t LoadSize = Ty->getBitWidth() / 8;
    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");

    auto &Offsets = SplitOffsetsMap[LI];
    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + LoadSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
    IRB.SetInsertPoint(BasicBlock::iterator(LI));

    DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");

    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
      LoadInst *PLoad = IRB.CreateAlignedLoad(
          getAdjustedPtr(IRB, DL, BasePtr,
                         APInt(DL.getPointerSizeInBits(), PartOffset),
                         PartPtrTy, BasePtr->getName() + "."),
          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
          LI->getName());

      // Append this load onto the list of split loads so we can find it later
      // to rewrite the stores.
      SplitLoads.push_back(PLoad);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
                   << "\n");

      // See if we've handled all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
    }
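    // To make the split walk above concrete (illustrative numbers): a 12-byte
    // load with Offsets.Splits == {4, 8} is emitted as three part loads
    // covering [0,4), [4,8) and [8,12) relative to the load's base offset,
    // with PartSize recomputed from the next split (or the total load size)
    // on each iteration.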
    // Now that we have the split loads, do the slow walk over all uses of the
    // load and rewrite them as split stores, or save the split loads to use
    // below if the store is going to be split there anyways.
    bool DeferredStores = false;
    for (User *LU : LI->users()) {
      StoreInst *SI = cast<StoreInst>(LU);
      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
        DeferredStores = true;
        DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
        continue;
      }

      Value *StoreBasePtr = SI->getPointerOperand();
      IRB.SetInsertPoint(BasicBlock::iterator(SI));

      DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");

      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
        LoadInst *PLoad = SplitLoads[Idx];
        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
        auto *PartPtrTy =
            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());

        StoreInst *PStore = IRB.CreateAlignedStore(
            PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
                                  APInt(DL.getPointerSizeInBits(), PartOffset),
                                  PartPtrTy, StoreBasePtr->getName() + "."),
            getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
        (void)PStore;
        DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
      }

      // We want to immediately iterate on any allocas impacted by splitting
      // this store, and we have to track any promotable alloca (indicated by
      // a direct store) as needing to be resplit because it is no longer
      // promotable.
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     StoreBasePtr->stripInBoundsOffsets())) {
        Worklist.insert(OtherAI);
      }

      // Mark the original store as dead.
      DeadInsts.insert(SI);
    }

    // Save the split loads if there are deferred stores among the users.
    if (DeferredStores)
      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));

    // Mark the original load as dead and kill the original slice.
    DeadInsts.insert(LI);
    Offsets.S->kill();
  }
  // Second, we rewrite all of the split stores. At this point, we know that
  // all loads from this alloca have been split already. For stores of such
  // loads, we can simply look up the pre-existing split loads. For stores of
  // other loads, we split those loads first and then write split stores of
  // them.
  for (StoreInst *SI : Stores) {
    auto *LI = cast<LoadInst>(SI->getValueOperand());
    IntegerType *Ty = cast<IntegerType>(LI->getType());
    uint64_t StoreSize = Ty->getBitWidth() / 8;
    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");

    auto &Offsets = SplitOffsetsMap[SI];
    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + StoreSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Value *LoadBasePtr = LI->getPointerOperand();
    Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());

    DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");

    // Check whether we have an already split load.
    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
    std::vector<LoadInst *> *SplitLoads = nullptr;
    if (SplitLoadsMapI != SplitLoadsMap.end()) {
      SplitLoads = &SplitLoadsMapI->second;
      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
             "Too few split loads for the number of splits in the store!");
    } else {
      DEBUG(dbgs() << "          of load: " << *LI << "\n");
    }

    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());

      // Either lookup a split load or create one.
      LoadInst *PLoad;
      if (SplitLoads) {
        PLoad = (*SplitLoads)[Idx];
      } else {
        IRB.SetInsertPoint(BasicBlock::iterator(LI));
        PLoad = IRB.CreateAlignedLoad(
            getAdjustedPtr(IRB, DL, LoadBasePtr,
                           APInt(DL.getPointerSizeInBits(), PartOffset),
                           PartPtrTy, LoadBasePtr->getName() + "."),
            getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
            LI->getName());
      }

      // And store this partition.
      IRB.SetInsertPoint(BasicBlock::iterator(SI));
      StoreInst *PStore = IRB.CreateAlignedStore(
          PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
                                APInt(DL.getPointerSizeInBits(), PartOffset),
                                PartPtrTy, StoreBasePtr->getName() + "."),
          getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
                   << "\n");
      if (!SplitLoads)
        DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");

      // See if we've finished all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
    }
    // We want to immediately iterate on any allocas impacted by splitting
    // this load, which is only relevant if it isn't a load of this alloca and
    // thus we didn't already split the loads above. We also have to keep track
    // of any promotable allocas we split loads on as they can no longer be
    // promoted.
    if (!SplitLoads) {
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     LoadBasePtr->stripInBoundsOffsets())) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        Worklist.insert(OtherAI);
      }
    }

    // Mark the original store as dead now that we've split it up and kill its
    // slice. Note that we leave the original load in place unless this store
    // was its only use. It may in turn be split up if it is an alloca load
    // for some other alloca, but it may be a normal load. This may introduce
    // redundant loads, but where those can be merged the rest of the optimizer
    // should handle the merging, and this uncovers SSA splits which is more
    // important. In practice, the original loads will almost always be fully
    // split and removed eventually, and the splits will be merged by any
    // trivial CSE, including instcombine.
    if (LI->hasOneUse()) {
      assert(*LI->user_begin() == SI && "Single use isn't this store!");
      DeadInsts.insert(LI);
    }
    DeadInsts.insert(SI);
    Offsets.S->kill();
  }
  // Remove the killed slices that have been pre-split.
  AS.erase(std::remove_if(AS.begin(), AS.end(),
                          [](const Slice &S) { return S.isDead(); }),
           AS.end());

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  DEBUG(dbgs() << "  Pre-split slices:\n");
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    DEBUG(AS.print(dbgs(), I, "    "));

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  PromotableAllocas.erase(
      std::remove_if(
          PromotableAllocas.begin(), PromotableAllocas.end(),
          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
      PromotableAllocas.end());

  return true;
}
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                   AllocaSlices::Partition &P) {
  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *SliceTy = nullptr;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
    if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
      SliceTy = CommonUseTy;
  if (!SliceTy)
    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size()))
      SliceTy = TypePartitionTy;
  if ((!SliceTy || (SliceTy->isArrayTy() &&
                    SliceTy->getArrayElementType()->isIntegerTy())) &&
      DL.isLegalInteger(P.size() * 8))
    SliceTy = Type::getIntNTy(*C, P.size() * 8);
  if (!SliceTy)
    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
  assert(DL.getTypeAllocSize(SliceTy) >= P.size());

  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);

  VectorType *VecTy =
      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
  if (VecTy)
    SliceTy = VecTy;
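  // An illustrative example of the fallback chain above: a 4-byte partition
  // only ever accessed as a float gets SliceTy = float; with conflicting
  // access types we instead try to carve a matching subtype out of the
  // alloca's own type via getTypePartition, prefer a legal i32 over a pure
  // integer-array result, and settle for [4 x i8] as a last resort.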
  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (SliceTy == AI.getAllocatedType()) {
    assert(P.beginOffset() == 0 &&
           "Non-zero begin offset but same alloca type");
    NewAI = &AI;
    // FIXME: We should be able to bail at this point with "nothing changed".
    // FIXME: We might want to defer PHI speculation until after here.
    // FIXME: return nullptr;
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, P.beginOffset());
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= DL.getABITypeAlignment(SliceTy))
      Alignment = 0;
    NewAI = new AllocaInst(
        SliceTy, nullptr, Alignment,
        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
    ++NumNewAllocas;
  }
DEBUG(dbgs() << "Rewriting alloca partition "
4087
<< "[" << P.beginOffset() << "," << P.endOffset()
4088
<< ") to: " << *NewAI << "\n");
4090
// Track the high watermark on the worklist as it is only relevant for
4091
// promoted allocas. We will reset it to this point if the alloca is not in
4092
// fact scheduled for promotion.
4093
unsigned PPWOldSize = PostPromotionWorklist.size();
4094
unsigned NumUses = 0;
4095
SmallPtrSet<PHINode *, 8> PHIUsers;
4096
SmallPtrSet<SelectInst *, 8> SelectUsers;
4098
AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4099
P.endOffset(), IsIntegerPromotable, VecTy,
4100
PHIUsers, SelectUsers);
4101
bool Promotable = true;
4102
for (Slice *S : P.splitSliceTails()) {
4103
Promotable &= Rewriter.visit(S);
4106
for (Slice &S : P) {
4107
Promotable &= Rewriter.visit(&S);
4111
NumAllocaPartitionUses += NumUses;
4112
MaxUsesPerAllocaPartition =
4113
std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
4115
  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
                                            E = PHIUsers.end();
       I != E; ++I)
    if (!isSafePHIToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }
  for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
                                               E = SelectUsers.end();
       I != E; ++I)
    if (!isSafeSelectToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  if (Promotable) {
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.push_back(NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration.
      for (PHINode *PHIUser : PHIUsers)
        SpeculatablePHIs.insert(PHIUser);
      for (SelectInst *SelectUser : SelectUsers)
        SpeculatableSelects.insert(SelectUser);
      Worklist.insert(NewAI);
    }
  } else {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    if (NewAI != &AI)
      Worklist.insert(NewAI);

    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();
  }

  return NewAI;
}
/// \brief Walks the slices of an alloca and forms partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities, mark any
  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
  // to split these during pre-splitting, we want to force them to be
  // rewritten into a partition.
  bool IsSorted = true;
  for (Slice &S : AS) {
    if (!S.isSplittable())
      continue;

    // FIXME: We currently leave whole-alloca splittable loads and stores. This
    // used to be the only splittable loads and stores and we need to be
    // confident that the above handling of splittable loads and stores is
    // completely sufficient before we forcibly disable the remaining handling.
    if (S.beginOffset() == 0 &&
        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
      continue;

    if (isa<LoadInst>(S.getUse()->getUser()) ||
        isa<StoreInst>(S.getUse()->getUser())) {
      S.makeUnsplittable();
      IsSorted = false;
    }
  }
  if (!IsSorted)
    std::sort(AS.begin(), AS.end());
  /// \brief Describes the allocas introduced by rewritePartition
  /// in order to migrate the debug info.
  struct Piece {
    AllocaInst *Alloca;
    uint64_t Offset;
    uint64_t Size;
    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
        : Alloca(AI), Offset(O), Size(S) {}
  };
  SmallVector<Piece, 4> Pieces;

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
      Changed = true;
      if (NewAI != &AI) {
        uint64_t SizeOfByte = 8;
        uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
        // Don't include any padding.
        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
      }
    }
    ++NumPartitions;
  }
  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca =
      std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
    auto *Var = DbgDecl->getVariable();
    auto *Expr = DbgDecl->getExpression();
    DIBuilder DIB(*AI.getParent()->getParent()->getParent(),
                  /*AllowUnresolved*/ false);
    bool IsSplit = Pieces.size() > 1;
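    // A sketch of the intent here (illustrative): if the source variable was
    // an 8-byte struct split into two 4-byte partitions, each new alloca gets
    // a dbg.declare whose expression carries a bit-piece (offset, size) pair,
    // e.g. (0, 32) and (32, 32), so the debugger can reassemble the variable
    // from its scalar replacements.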
    for (auto Piece : Pieces) {
      // Create a piece expression describing the new partition or reuse AI's
      // expression if there is only one partition.
      auto *PieceExpr = Expr;
      if (IsSplit || Expr->isBitPiece()) {
        // If this alloca is already a scalar replacement of a larger aggregate,
        // Piece.Offset describes the offset inside the scalar.
        uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
        uint64_t Start = Offset + Piece.Offset;
        uint64_t Size = Piece.Size;
        if (Expr->isBitPiece()) {
          uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
          if (Start >= AbsEnd)
            // No need to describe a SROAed padding.
            continue;
          Size = std::min(Size, AbsEnd - Start);
        }
        PieceExpr = DIB.createBitPieceExpression(Start, Size);
      }

      // Remove any existing dbg.declare intrinsic describing the same alloca.
      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
        OldDDI->eraseFromParent();

      DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
                        &AI);
    }
  }
  return Changed;
}
/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with an undef value.
  U = UndefValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.insert(OldI);
    }
}
/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(DL);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.insert(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return Changed;

  Changed |= splitAlloca(AI, AS);

  DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());

  DEBUG(dbgs() << "  Speculating Selects\n");
  while (!SpeculatableSelects.empty())
    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());

  return Changed;
}
/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.insert(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      DeletedAllocas.insert(AI);
      if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
        DbgDecl->eraseFromParent();
    }

    ++NumDeleted;
    I->eraseFromParent();
  }
}
static void enqueueUsersInWorklist(Instruction &I,
                                   SmallVectorImpl<Instruction *> &Worklist,
                                   SmallPtrSetImpl<Instruction *> &Visited) {
  for (User *U : I.users())
    if (Visited.insert(cast<Instruction>(U)).second)
      Worklist.push_back(cast<Instruction>(U));
}
/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
/// promotion occurred.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (DT && !ForceSSAUpdater) {
    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
    PromotableAllocas.clear();
    return true;
  }

  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
  SSAUpdater SSA;
  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
  SmallVector<Instruction *, 64> Insts;

  // We need a worklist to walk the uses of each alloca.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  SmallVector<Instruction *, 32> DeadInsts;

  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
    AllocaInst *AI = PromotableAllocas[Idx];
    Insts.clear();
    Worklist.clear();
    Visited.clear();
    DeadInsts.clear();

    enqueueUsersInWorklist(*AI, Worklist, Visited);

    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();

      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
      // leading to them) here. Eventually it should use them to optimize the
      // scalar values produced.
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      // Push the loads and stores we find onto the list. SROA will already
      // have validated that all loads and stores are viable candidates for
      // promotion.
      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        assert(LI->getType() == AI->getAllocatedType());
        Insts.push_back(LI);
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
        Insts.push_back(SI);
        continue;
      }

      // For everything else, we know that only no-op bitcasts and GEPs will
      // make it this far, just recurse through them and recall them for later
      // removal.
      DeadInsts.push_back(I);
      enqueueUsersInWorklist(*I, Worklist, Visited);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    while (!DeadInsts.empty())
      DeadInsts.pop_back_val()->eraseFromParent();
    AI->eraseFromParent();
  }

  PromotableAllocas.clear();
  return true;
}
bool SROA::runOnFunction(Function &F) {
  if (skipOptnoneFunction(F))
    return false;

  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  DominatorTreeWrapperPass *DTWP =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
        Worklist.remove_if(IsInSet);
        PostPromotionWorklist.remove_if(IsInSet);
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsInSet),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  return Changed;
}

void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AssumptionCacheTracker>();
  if (RequiresDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}