1
//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
3
// The LLVM Compiler Infrastructure
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
8
//===----------------------------------------------------------------------===//
10
// This file computes an optimal sequence of instructions for doing all shuffles
11
// of two 4-element vectors. With a release build and when configured to emit
12
// an altivec instruction table, this takes about 30s to run on a 2.7Ghz
15
//===----------------------------------------------------------------------===//
23
// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
24
// an element from that value of the input vectors. A value of 8 means the
25
// entry is undefined.
27
// Mask manipulation functions.
28
static inline unsigned short MakeMask(unsigned V0, unsigned V1,
29
unsigned V2, unsigned V3) {
30
return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
33
/// getMaskElt - Return element N of the specified mask.
34
static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
35
return (Mask >> ((3-Elt)*4)) & 0xF;
38
static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
39
unsigned FieldShift = ((3-Elt)*4);
40
return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
43
// Reject elements where the values are 9-15.
44
static bool isValidMask(unsigned short Mask) {
45
unsigned short UndefBits = Mask & 0x8888;
46
return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
49
/// hasUndefElements - Return true if any of the elements in the mask are undefs
51
static bool hasUndefElements(unsigned short Mask) {
52
return (Mask & 0x8888) != 0;
55
/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
56
/// including undef values..
57
static bool isOnlyLHSMask(unsigned short Mask) {
58
return (Mask & 0x4444) == 0;
61
/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
62
/// refer to the LHS only (for when one argument value is passed into the same
65
static unsigned short getLHSOnlyMask(unsigned short Mask) {
66
return Mask & 0xBBBB; // Keep only LHS and Undefs.
70
/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
71
/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
72
static unsigned getCompressedMask(unsigned short Mask) {
73
return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
74
getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
77
static void PrintMask(unsigned i, std::ostream &OS) {
78
OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
79
<< "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
80
<< "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
81
<< "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
85
/// ShuffleVal - This represents a shufflevector operation.
87
unsigned Cost; // Number of instrs used to generate this value.
88
Operator *Op; // The Operation used to generate this value.
89
unsigned short Arg0, Arg1; // Input operands for this value.
91
ShuffleVal() : Cost(1000000) {}
95
/// ShufTab - This is the actual shuffle table that we are trying to generate.
97
static ShuffleVal ShufTab[65536];
99
/// TheOperators - All of the operators that this target supports.
100
static std::vector<Operator*> TheOperators;
102
/// Operator - This is a vector operation that is available for use.
104
unsigned short ShuffleMask;
105
unsigned short OpNum;
109
Operator(unsigned short shufflemask, const char *name, unsigned opnum,
111
: ShuffleMask(shufflemask), OpNum(opnum), Name(name), Cost(cost) {
112
TheOperators.push_back(this);
115
assert(TheOperators.back() == this);
116
TheOperators.pop_back();
119
bool isOnlyLHSOperator() const {
120
return isOnlyLHSMask(ShuffleMask);
123
const char *getName() const { return Name; }
124
unsigned getCost() const { return Cost; }
126
unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
127
// Extract the elements from LHSMask and RHSMask, as appropriate.
129
for (unsigned i = 0; i != 4; ++i) {
130
unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
133
ResElt = getMaskElt(LHSMask, SrcElt);
135
ResElt = getMaskElt(RHSMask, SrcElt-4);
137
assert(SrcElt == 8 && "Bad src elt!");
140
Result |= ResElt << (4*i);
146
static const char *getZeroCostOpName(unsigned short Op) {
147
if (ShufTab[Op].Arg0 == 0x0123)
149
else if (ShufTab[Op].Arg0 == 0x4567)
152
assert(0 && "bad zero cost operation");
157
static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
158
unsigned short ThisOp = Vals[ValNo];
159
std::cerr << "t" << ValNo;
160
PrintMask(ThisOp, std::cerr);
161
std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
163
if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
164
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
165
PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
167
// Figure out what tmp # it is.
168
for (unsigned i = 0; ; ++i)
169
if (Vals[i] == ShufTab[ThisOp].Arg0) {
170
std::cerr << "t" << i;
175
if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
177
if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
178
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
179
PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
181
// Figure out what tmp # it is.
182
for (unsigned i = 0; ; ++i)
183
if (Vals[i] == ShufTab[ThisOp].Arg1) {
184
std::cerr << "t" << i;
192
static unsigned getNumEntered() {
194
for (unsigned i = 0; i != 65536; ++i)
195
Count += ShufTab[i].Cost < 100;
199
static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
201
if (ShufTab[Elt].Cost == 0) return;
203
// If this value has already been evaluated, it is free. FIXME: match undefs.
204
for (unsigned i = 0, e = NumVals; i != e; ++i)
205
if (Vals[i] == Elt) return;
207
// Otherwise, get the operands of the value, then add it.
208
unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
209
if (ShufTab[Arg0].Cost)
210
EvaluateOps(Arg0, Vals, NumVals);
211
if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
212
EvaluateOps(Arg1, Vals, NumVals);
214
Vals[NumVals++] = Elt;
219
// Seed the table with accesses to the LHS and RHS.
220
ShufTab[0x0123].Cost = 0;
221
ShufTab[0x0123].Op = 0;
222
ShufTab[0x0123].Arg0 = 0x0123;
223
ShufTab[0x4567].Cost = 0;
224
ShufTab[0x4567].Op = 0;
225
ShufTab[0x4567].Arg0 = 0x4567;
227
// Seed the first-level of shuffles, shuffles whose inputs are the input to
228
// the vectorshuffle operation.
229
bool MadeChange = true;
230
unsigned OpCount = 0;
234
std::cerr << "Starting iteration #" << OpCount << " with "
235
<< getNumEntered() << " entries established.\n";
237
// Scan the table for two reasons: First, compute the maximum cost of any
238
// operation left in the table. Second, make sure that values with undefs
239
// have the cheapest alternative that they match.
240
unsigned MaxCost = ShufTab[0].Cost;
241
for (unsigned i = 1; i != 0x8889; ++i) {
242
if (!isValidMask(i)) continue;
243
if (ShufTab[i].Cost > MaxCost)
244
MaxCost = ShufTab[i].Cost;
246
// If this value has an undef, make it be computed the cheapest possible
247
// way of any of the things that it matches.
248
if (hasUndefElements(i)) {
249
// This code is a little bit tricky, so here's the idea: consider some
250
// permutation, like 7u4u. To compute the lowest cost for 7u4u, we
251
// need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
252
// there are 3 undefs, the number rises to 729 entries we have to scan,
253
// and for the 4 undef case, we have to scan the whole table.
255
// Instead of doing this huge amount of scanning, we process the table
256
// entries *in order*, and use the fact that 'u' is 8, larger than any
257
// valid index. Given an entry like 7u4u then, we only need to scan
258
// 7[0-7]4u - 8 entries. We can get away with this, because we already
259
// know that each of 704u, 714u, 724u, etc contain the minimum value of
260
// all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
274
unsigned MinCost = ShufTab[i].Cost;
276
// Scan the 8 entries.
277
for (unsigned j = 0; j != 8; ++j) {
278
unsigned NewElt = setMaskElt(i, UndefIdx, j);
279
if (ShufTab[NewElt].Cost < MinCost) {
280
MinCost = ShufTab[NewElt].Cost;
285
// If we found something cheaper than what was here before, use it.
288
ShufTab[i] = ShufTab[MinVal];
293
for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
294
if (!isValidMask(LHS)) continue;
295
if (ShufTab[LHS].Cost > 1000) continue;
297
// If nothing involving this operand could possibly be cheaper than what
298
// we already have, don't consider it.
299
if (ShufTab[LHS].Cost + 1 >= MaxCost)
302
for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
303
Operator *Op = TheOperators[opnum];
305
// Evaluate op(LHS,LHS)
306
unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
308
unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
309
if (Cost < ShufTab[ResultMask].Cost) {
310
ShufTab[ResultMask].Cost = Cost;
311
ShufTab[ResultMask].Op = Op;
312
ShufTab[ResultMask].Arg0 = LHS;
313
ShufTab[ResultMask].Arg1 = LHS;
317
// If this is a two input instruction, include the op(x,y) cases. If
318
// this is a one input instruction, skip this.
319
if (Op->isOnlyLHSOperator()) continue;
321
for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
322
if (!isValidMask(RHS)) continue;
323
if (ShufTab[RHS].Cost > 1000) continue;
325
// If nothing involving this operand could possibly be cheaper than
326
// what we already have, don't consider it.
327
if (ShufTab[RHS].Cost + 1 >= MaxCost)
331
// Evaluate op(LHS,RHS)
332
unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
334
if (ShufTab[ResultMask].Cost <= OpCount ||
335
ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
336
ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
339
// Figure out the cost to evaluate this, knowing that CSE's only need
340
// to be evaluated once.
341
unsigned short Vals[30];
342
unsigned NumVals = 0;
343
EvaluateOps(LHS, Vals, NumVals);
344
EvaluateOps(RHS, Vals, NumVals);
346
unsigned Cost = NumVals + Op->getCost();
347
if (Cost < ShufTab[ResultMask].Cost) {
348
ShufTab[ResultMask].Cost = Cost;
349
ShufTab[ResultMask].Op = Op;
350
ShufTab[ResultMask].Arg0 = LHS;
351
ShufTab[ResultMask].Arg1 = RHS;
359
std::cerr << "Finished Table has " << getNumEntered()
360
<< " entries established.\n";
362
unsigned CostArray[10] = { 0 };
364
// Compute a cost histogram.
365
for (unsigned i = 0; i != 65536; ++i) {
366
if (!isValidMask(i)) continue;
367
if (ShufTab[i].Cost > 9)
370
++CostArray[ShufTab[i].Cost];
373
for (unsigned i = 0; i != 9; ++i)
375
std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
377
std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
380
// Build up the table to emit.
381
std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
382
std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
384
for (unsigned i = 0; i != 0x8889; ++i) {
385
if (!isValidMask(i)) continue;
387
// CostSat - The cost of this operation saturated to two bits.
388
unsigned CostSat = ShufTab[i].Cost;
389
if (CostSat > 4) CostSat = 4;
390
if (CostSat == 0) CostSat = 1;
391
--CostSat; // Cost is now between 0-3.
393
unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
394
assert(OpNum < 16 && "Too few bits to encode operation!");
396
unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
397
unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
399
// Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
400
// LHS, and 13 bits of RHS = 32 bits.
401
unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
403
std::cout << " " << Val << "U,\t// ";
404
PrintMask(i, std::cout);
405
std::cout << ": Cost " << ShufTab[i].Cost;
406
std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
408
if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
409
std::cout << getZeroCostOpName(ShufTab[i].Arg0);
411
PrintMask(ShufTab[i].Arg0, std::cout);
414
if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
416
if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
417
std::cout << getZeroCostOpName(ShufTab[i].Arg1);
419
PrintMask(ShufTab[i].Arg1, std::cout);
424
std::cout << " 0\n};\n";
427
// Print out the table.
428
for (unsigned i = 0; i != 0x8889; ++i) {
429
if (!isValidMask(i)) continue;
430
if (ShufTab[i].Cost < 1000) {
431
PrintMask(i, std::cerr);
432
std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
434
unsigned short Vals[30];
435
unsigned NumVals = 0;
436
EvaluateOps(i, Vals, NumVals);
438
for (unsigned j = 0, e = NumVals; j != e; ++j)
439
PrintOperation(j, Vals);
447
#ifdef GENERATE_ALTIVEC
449
///===---------------------------------------------------------------------===//
450
/// The altivec instruction definitions. This is the altivec-specific part of
452
///===---------------------------------------------------------------------===//
454
// Note that the opcode numbers here must match those in the PPC backend.
456
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
468
struct vmrghw : public Operator {
469
vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
472
struct vmrglw : public Operator {
473
vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
476
template<unsigned Elt>
477
struct vspltisw : public Operator {
478
vspltisw(const char *N, unsigned Opc)
479
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
482
vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
483
vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
484
vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
485
vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
488
struct vsldoi : public Operator {
489
vsldoi(const char *Name, unsigned Opc)
490
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
494
vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
495
vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
496
vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
500
#define GENERATE_NEON
504
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
513
OP_VUZPL, // VUZP, left result
514
OP_VUZPR, // VUZP, right result
515
OP_VZIPL, // VZIP, left result
516
OP_VZIPR, // VZIP, right result
517
OP_VTRNL, // VTRN, left result
518
OP_VTRNR // VTRN, right result
521
struct vrev : public Operator {
522
vrev() : Operator(0x1032, "vrev", OP_VREV) {}
525
template<unsigned Elt>
526
struct vdup : public Operator {
527
vdup(const char *N, unsigned Opc)
528
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
531
vdup<0> the_vdup0("vdup0", OP_VDUP0);
532
vdup<1> the_vdup1("vdup1", OP_VDUP1);
533
vdup<2> the_vdup2("vdup2", OP_VDUP2);
534
vdup<3> the_vdup3("vdup3", OP_VDUP3);
537
struct vext : public Operator {
538
vext(const char *Name, unsigned Opc)
539
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
543
vext<1> the_vext1("vext1", OP_VEXT1);
544
vext<2> the_vext2("vext2", OP_VEXT2);
545
vext<3> the_vext3("vext3", OP_VEXT3);
547
struct vuzpl : public Operator {
548
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
551
struct vuzpr : public Operator {
552
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
555
struct vzipl : public Operator {
556
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
559
struct vzipr : public Operator {
560
vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
563
struct vtrnl : public Operator {
564
vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
567
struct vtrnr : public Operator {
568
vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}