//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//===----------------------------------------------------------------------===//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "x86-isel"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86ShuffleDecode.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace dwarf;
STATISTIC(NumTailCalls, "Number of tail calls");
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
if (is64Bit) return new X8664_MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
} else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
return new X8632_ELFTargetObjectFile(TM);
} else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
: TargetLowering(TM, createTLOF(TM)) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
RegInfo = TM.getRegisterInfo();
// Set up the TargetLowering object.
// X86 is weird, it always uses i8 for shift amounts and setcc results.
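// (Variable shift counts are taken in the CL register, and the SETcc family
// of instructions produces an 8-bit 0/1 result.)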
setShiftAmountType(MVT::i8);
setBooleanContents(ZeroOrOneBooleanContent);
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
if (Subtarget->isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
} else if (Subtarget->isTargetMingw()) {
// MS runtime is weird: it exports _setjmp, but only plain longjmp (no _longjmp)!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(true);
// Set up the register classes.
addRegisterClass(MVT::i8, X86::GR8RegisterClass);
addRegisterClass(MVT::i16, X86::GR16RegisterClass);
addRegisterClass(MVT::i32, X86::GR32RegisterClass);
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, X86::GR64RegisterClass);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
} else if (!UseSoftFloat) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this operation.
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
// SSE has no i16 to fp conversion, only i32
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have this operation.
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
if (X86ScalarSSEf32) {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed conversion.
setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
} else if (!UseSoftFloat) {
if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
if (Subtarget->hasMMX() && !DisableMMX)
setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom);
setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
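// (On x86 a single DIV/IDIV leaves the quotient in *AX and the remainder in
// *DX, and the one-operand MUL/IMUL writes the full double-width product to
// DX:AX / EDX:EAX, which is exactly what the two-result nodes map onto.)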
setOperationAction(ISD::MULHS , MVT::i8 , Expand);
setOperationAction(ISD::MULHU , MVT::i8 , Expand);
setOperationAction(ISD::SDIV , MVT::i8 , Expand);
setOperationAction(ISD::UDIV , MVT::i8 , Expand);
setOperationAction(ISD::SREM , MVT::i8 , Expand);
setOperationAction(ISD::UREM , MVT::i8 , Expand);
setOperationAction(ISD::MULHS , MVT::i16 , Expand);
setOperationAction(ISD::MULHU , MVT::i16 , Expand);
setOperationAction(ISD::SDIV , MVT::i16 , Expand);
setOperationAction(ISD::UDIV , MVT::i16 , Expand);
setOperationAction(ISD::SREM , MVT::i16 , Expand);
setOperationAction(ISD::UREM , MVT::i16 , Expand);
setOperationAction(ISD::MULHS , MVT::i32 , Expand);
setOperationAction(ISD::MULHU , MVT::i32 , Expand);
setOperationAction(ISD::SDIV , MVT::i32 , Expand);
setOperationAction(ISD::UDIV , MVT::i32 , Expand);
setOperationAction(ISD::SREM , MVT::i32 , Expand);
setOperationAction(ISD::UREM , MVT::i32 , Expand);
setOperationAction(ISD::MULHS , MVT::i64 , Expand);
setOperationAction(ISD::MULHU , MVT::i64 , Expand);
setOperationAction(ISD::SDIV , MVT::i64 , Expand);
setOperationAction(ISD::UDIV , MVT::i64 , Expand);
setOperationAction(ISD::SREM , MVT::i64 , Expand);
setOperationAction(ISD::UREM , MVT::i64 , Expand);
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
setOperationAction(ISD::BR_CC , MVT::Other, Expand);
setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
if (Subtarget->is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
setOperationAction(ISD::SELECT , MVT::i8 , Custom);
setOperationAction(ISD::SELECT , MVT::i16 , Custom);
setOperationAction(ISD::SELECT , MVT::i32 , Custom);
setOperationAction(ISD::SELECT , MVT::f32 , Custom);
setOperationAction(ISD::SELECT , MVT::f64 , Custom);
setOperationAction(ISD::SELECT , MVT::f80 , Custom);
setOperationAction(ISD::SETCC , MVT::i8 , Custom);
setOperationAction(ISD::SETCC , MVT::i16 , Custom);
setOperationAction(ISD::SETCC , MVT::i32 , Custom);
setOperationAction(ISD::SETCC , MVT::f32 , Custom);
setOperationAction(ISD::SETCC , MVT::f64 , Custom);
setOperationAction(ISD::SETCC , MVT::f80 , Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::SELECT , MVT::i64 , Custom);
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
if (Subtarget->is64Bit())
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
// 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
if (Subtarget->hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
// We may not have a libcall for MEMBARRIER so we should lower this.
setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and they
// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
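// For example, an IR sequence of fence / atomicrmw / fence can be lowered to
// just the single locked instruction, since the lock prefix already orders
// the surrounding memory operations.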
setShouldFoldAtomicFences(true);
// Expand certain atomics
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
if (!Subtarget->is64Bit()) {
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
// FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() &&
!Subtarget->isTargetELF() &&
!Subtarget->isTargetCygMing()) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
if (Subtarget->is64Bit()) {
setExceptionPointerRegister(X86::RAX);
setExceptionSelectorRegister(X86::RDX);
setExceptionPointerRegister(X86::EAX);
setExceptionSelectorRegister(X86::EDX);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
setOperationAction(ISD::VAARG , MVT::Other, Expand);
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
if (Subtarget->is64Bit())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
if (Subtarget->isTargetCygMing())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
if (!UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, X86::FR32RegisterClass);
addRegisterClass(MVT::f64, X86::FR64RegisterClass);
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS , MVT::f64, Custom);
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f64, Custom);
setOperationAction(ISD::FNEG , MVT::f32, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
// Expand FP immediates into loads from the stack, except for the special
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
} else if (!UseSoftFloat && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, X86::FR32RegisterClass);
addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
// Special cases we handle for FP constants.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
} else if (!UseSoftFloat) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
setOperationAction(ISD::UNDEF, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
// Long double always uses X87.
addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
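// Materialize +/-0.0 and +/-1.0 in x87 extended precision below; these can
// be loaded with FLD0/FLD1 (plus FCHS for the negated values), so they are
// cheap enough to treat as legal f80 immediates.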
APFloat TmpFlt(+0.0);
TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
addLegalFPImmediate(TmpFlt); // FLD0
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
setOperationAction(ISD::FSIN , MVT::f80 , Expand);
setOperationAction(ISD::FCOS , MVT::f80 , Expand);
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
(MVT::SimpleValueType)InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
setOperationAction(ISD::ADD, MVT::v8i8, Legal);
setOperationAction(ISD::ADD, MVT::v4i16, Legal);
setOperationAction(ISD::ADD, MVT::v2i32, Legal);
setOperationAction(ISD::ADD, MVT::v1i64, Legal);
setOperationAction(ISD::SUB, MVT::v8i8, Legal);
setOperationAction(ISD::SUB, MVT::v4i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i32, Legal);
setOperationAction(ISD::SUB, MVT::v1i64, Legal);
setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
setOperationAction(ISD::MUL, MVT::v4i16, Legal);
setOperationAction(ISD::AND, MVT::v8i8, Promote);
AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
setOperationAction(ISD::AND, MVT::v4i16, Promote);
AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
setOperationAction(ISD::AND, MVT::v2i32, Promote);
AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
setOperationAction(ISD::AND, MVT::v1i64, Legal);
setOperationAction(ISD::OR, MVT::v8i8, Promote);
AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
setOperationAction(ISD::OR, MVT::v4i16, Promote);
AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
setOperationAction(ISD::OR, MVT::v2i32, Promote);
AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
setOperationAction(ISD::OR, MVT::v1i64, Legal);
setOperationAction(ISD::XOR, MVT::v8i8, Promote);
AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
setOperationAction(ISD::XOR, MVT::v4i16, Promote);
AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
setOperationAction(ISD::XOR, MVT::v2i32, Promote);
AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
setOperationAction(ISD::XOR, MVT::v1i64, Legal);
setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
if (!UseSoftFloat && Subtarget->hasSSE1()) {
addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
if (!UseSoftFloat && Subtarget->hasSSE2()) {
addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
setOperationAction(ISD::ADD, MVT::v16i8, Legal);
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
setOperationAction(ISD::ADD, MVT::v4i32, Legal);
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
setOperationAction(ISD::SUB, MVT::v4i32, Legal);
setOperationAction(ISD::SUB, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f64, Legal);
setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
EVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
// Do not attempt to custom lower non-128-bit vectors
if (!VT.is128BitVector())
setOperationAction(ISD::BUILD_VECTOR,
VT.getSimpleVT().SimpleTy, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE,
VT.getSimpleVT().SimpleTy, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT,
VT.getSimpleVT().SimpleTy, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
// Do not attempt to promote non-128-bit vectors
if (!VT.is128BitVector())
setOperationAction(ISD::AND, SVT, Promote);
AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
setOperationAction(ISD::OR, SVT, Promote);
AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
setOperationAction(ISD::XOR, SVT, Promote);
AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
setOperationAction(ISD::LOAD, SVT, Promote);
AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
setOperationAction(ISD::SELECT, SVT, Promote);
AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
if (!DisableMMX && Subtarget->hasMMX()) {
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
if (Subtarget->hasSSE41()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// Can turn SHL into an integer multiply.
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
// i8 and i16 vectors are custom, because the source register and source
// memory operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
// information.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
if (Subtarget->hasSSE42()) {
setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
if (!UseSoftFloat && Subtarget->hasAVX()) {
addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
setOperationAction(ISD::FADD, MVT::v8f32, Legal);
setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
//setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
//setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
//setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
//setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);
// Operations to consider commented out: v16i16, v32i8
//setOperationAction(ISD::ADD, MVT::v16i16, Legal);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
//setOperationAction(ISD::SUB, MVT::v32i8, Legal);
//setOperationAction(ISD::SUB, MVT::v16i16, Legal);
setOperationAction(ISD::SUB, MVT::v8i32, Custom);
setOperationAction(ISD::SUB, MVT::v4i64, Custom);
//setOperationAction(ISD::MUL, MVT::v16i16, Legal);
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
// setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
// setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);
// setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
// setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
// setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
// Not sure we want to do this since there are no 256-bit integer
// operations in AVX.
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
// This includes 256-bit vectors
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
EVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
// Not sure we want to do this since there are no 256-bit integer
// operations in AVX.
// Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
// Including 256-bit vectors
for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
EVT VT = (MVT::SimpleValueType)i;
if (!VT.is256BitVector()) {
setOperationAction(ISD::AND, VT, Promote);
AddPromotedToType (ISD::AND, VT, MVT::v4i64);
setOperationAction(ISD::OR, VT, Promote);
AddPromotedToType (ISD::OR, VT, MVT::v4i64);
setOperationAction(ISD::XOR, VT, Promote);
AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
setOperationAction(ISD::LOAD, VT, Promote);
AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
if (Subtarget->is64Bit()) {
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, 0);
setLibcallName(RTLIB::SRL_I128, 0);
setLibcallName(RTLIB::SRA_I128, 0);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
computeRegisterProperties();
// FIXME: These should be based on subtarget info. Plus, the values should
// be smaller when we are optimizing for size.
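// (maxStoresPerMemset = 16 means an @llvm.memset is expanded inline as long
// as it can be done in at most 16 stores; anything larger falls back to the
// memset libcall, and similarly for memcpy/memmove below.)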
maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
setPrefLoopAlignment(16);
benefitFromCodePlacementOpt = true;
MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
} else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
unsigned EltAlign = 0;
getMaxByValAlign(STy->getElementType(i), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = TD->getABITypeAlignment(Ty);
if (Subtarget->hasSSE1())
getMaxByValAlign(Ty, Align);
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero it
/// means there is no need to check it against an alignment requirement,
/// probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool NonScalarIntSafe,
MachineFunction &MF) const {
// FIXME: This turns off use of xmm stores for memset/memcpy on targets like
// linux. This is because the stack realignment code can't handle certain
// cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction();
if (NonScalarIntSafe &&
!F->hasFnAttr(Attribute::NoImplicitFloat)) {
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
if (Subtarget->hasSSE2())
if (Subtarget->hasSSE1())
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
Subtarget->getStackAlignment() >= 8 &&
Subtarget->hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
if (Subtarget->is64Bit() && Size >= 8)
/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
/// getPICBaseSymbol - Return the X86-32 PIC base.
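/// The symbol is <PrivateGlobalPrefix><FunctionNumber>$pb, e.g. "L0$pb" on
/// Darwin.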
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
MCContext &Ctx) const {
const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
Twine(MF->getFunctionNumber())+"$pb");
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid, MCContext &Ctx) const {
assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
return MCSymbolRefExpr::Create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget->is64Bit())
// This doesn't have DebugLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget->isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
/// getFunctionAlignment - Return the Log2 alignment of this function.
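/// A value of 4 requests 16-byte (2^4) alignment; functions marked
/// OptimizeForSize get no extra alignment.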
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
const TargetRegisterClass *RRC = 0;
switch (VT.getSimpleVT().SimpleTy) {
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = (Subtarget->is64Bit()
? X86::GR64RegisterClass : X86::GR32RegisterClass);
case MVT::v8i8: case MVT::v4i16:
case MVT::v2i32: case MVT::v1i64:
RRC = X86::VR64RegisterClass;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
RRC = X86::VR128RegisterClass;
return std::make_pair(RRC, Cost);
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
switch (RC->getID()) {
case X86::GR32RegClassID:
case X86::GR64RegClassID:
case X86::VR128RegClassID:
return Subtarget->is64Bit() ? 10 : 4;
case X86::VR64RegClassID:
bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
unsigned &Offset) const {
if (!Subtarget->isTargetLinux())
if (Subtarget->is64Bit()) {
// %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
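// X86GenCallingConv.inc is generated by TableGen from X86CallingConv.td and
// provides the CC_X86* and RetCC_X86* functions referenced below.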
1268
#include "X86GenCallingConv.inc"
1271
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1272
const SmallVectorImpl<ISD::OutputArg> &Outs,
1273
LLVMContext &Context) const {
1274
SmallVector<CCValAssign, 16> RVLocs;
1275
CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1277
return CCInfo.CheckReturn(Outs, RetCC_X86);
1281
X86TargetLowering::LowerReturn(SDValue Chain,
1282
CallingConv::ID CallConv, bool isVarArg,
1283
const SmallVectorImpl<ISD::OutputArg> &Outs,
1284
const SmallVectorImpl<SDValue> &OutVals,
1285
DebugLoc dl, SelectionDAG &DAG) const {
1286
MachineFunction &MF = DAG.getMachineFunction();
1287
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1289
SmallVector<CCValAssign, 16> RVLocs;
1290
CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1291
RVLocs, *DAG.getContext());
1292
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1294
// Add the regs to the liveout set for the function.
1295
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1296
for (unsigned i = 0; i != RVLocs.size(); ++i)
1297
if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1298
MRI.addLiveOut(RVLocs[i].getLocReg());
1302
SmallVector<SDValue, 6> RetOps;
1303
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1304
// Operand #1 = Bytes To Pop
1305
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1308
// Copy the result values into the output registers.
1309
for (unsigned i = 0; i != RVLocs.size(); ++i) {
1310
CCValAssign &VA = RVLocs[i];
1311
assert(VA.isRegLoc() && "Can only return in registers!");
1312
SDValue ValToCopy = OutVals[i];
1313
EVT ValVT = ValToCopy.getValueType();
1315
// If this is x86-64, and we disabled SSE, we can't return FP values
1316
if ((ValVT == MVT::f32 || ValVT == MVT::f64) &&
1317
(Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
1318
report_fatal_error("SSE register return with SSE disabled");
1320
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
1321
// llvm-gcc has never done it right and no one has noticed, so this
1322
// should be OK for now.
1323
if (ValVT == MVT::f64 &&
1324
(Subtarget->is64Bit() && !Subtarget->hasSSE2()))
1325
report_fatal_error("SSE2 register return with SSE2 disabled");
1327
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
1328
// the RET instruction and handled by the FP Stackifier.
1329
if (VA.getLocReg() == X86::ST0 ||
1330
VA.getLocReg() == X86::ST1) {
1331
// If this is a copy from an xmm register to ST(0), use an FPExtend to
1332
// change the value to the FP stack register class.
1333
if (isScalarFPTypeInSSEReg(VA.getValVT()))
1334
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1335
RetOps.push_back(ValToCopy);
1336
// Don't emit a copytoreg.
1340
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1341
// which is returned in RAX / RDX.
1342
if (Subtarget->is64Bit()) {
1343
if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1344
ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1345
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1346
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
1349
// If we don't have SSE2 available, convert to v4f32 so the generated
1350
// register is legal.
1351
if (!Subtarget->hasSSE2())
1352
ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy);
1357
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1358
Flag = Chain.getValue(1);
1361
// The x86-64 ABI for returning structs by value requires that we copy
1362
// the sret argument into %rax for the return. We saved the argument into
1363
// a virtual register in the entry block, so now we copy the value out
1365
if (Subtarget->is64Bit() &&
1366
DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1367
MachineFunction &MF = DAG.getMachineFunction();
1368
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1369
unsigned Reg = FuncInfo->getSRetReturnReg();
1371
"SRetReturnReg should have been set in LowerFormalArguments().");
1372
SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1374
Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1375
Flag = Chain.getValue(1);
1377
// RAX now acts like a return value.
1378
MRI.addLiveOut(X86::RAX);
1381
RetOps[0] = Chain; // Update chain.
1383
// Add the flag if we have it.
1385
RetOps.push_back(Flag);
1387
return DAG.getNode(X86ISD::RET_FLAG, dl,
1388
MVT::Other, &RetOps[0], RetOps.size());
1391
/// LowerCallResult - Lower the result values of a call into the
1392
/// appropriate copies out of appropriate physical registers.
1395
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1396
CallingConv::ID CallConv, bool isVarArg,
1397
const SmallVectorImpl<ISD::InputArg> &Ins,
1398
DebugLoc dl, SelectionDAG &DAG,
1399
SmallVectorImpl<SDValue> &InVals) const {
1401
// Assign locations to each value returned by this call.
1402
SmallVector<CCValAssign, 16> RVLocs;
1403
bool Is64Bit = Subtarget->is64Bit();
1404
CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1405
RVLocs, *DAG.getContext());
1406
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1408
// Copy all of the result registers out of their specified physreg.
1409
for (unsigned i = 0; i != RVLocs.size(); ++i) {
1410
CCValAssign &VA = RVLocs[i];
1411
EVT CopyVT = VA.getValVT();
1413
// If this is x86-64, and we disabled SSE, we can't return FP values
1414
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1415
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1416
report_fatal_error("SSE register return with SSE disabled");
1421
// If this is a call to a function that returns an fp value on the floating
// point stack, we must guarantee that the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpGET_ST0 instructions instead.
1426
if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1427
// If we prefer to use the value in xmm registers, copy it out as f80 and
1428
// use a truncate to move it from fp stack reg to xmm reg.
1429
if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1430
bool isST0 = VA.getLocReg() == X86::ST0;
unsigned Opc;
1432
if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
1433
if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
1434
if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
1435
SDValue Ops[] = { Chain, InFlag };
1436
Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
1438
Val = Chain.getValue(0);
1440
// Round the f80 to the right size, which also moves it to the appropriate
// xmm register.
1442
if (CopyVT != VA.getValVT())
1443
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1444
// This truncation won't change the value.
1445
DAG.getIntPtrConstant(1));
1446
} else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1447
// For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1448
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1449
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1450
MVT::v2i64, InFlag).getValue(1);
1451
Val = Chain.getValue(0);
1452
Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1453
Val, DAG.getConstant(0, MVT::i64));
1455
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1456
MVT::i64, InFlag).getValue(1);
1457
Val = Chain.getValue(0);
1459
Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1461
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1462
CopyVT, InFlag).getValue(1);
1463
Val = Chain.getValue(0);
1465
InFlag = Chain.getValue(2);
1466
InVals.push_back(Val);
1473
//===----------------------------------------------------------------------===//
1474
// C & StdCall & Fast Calling Convention implementation
1475
//===----------------------------------------------------------------------===//
1476
// The StdCall calling convention is the standard convention for most of the
// Windows API. It differs from the C calling convention only slightly: the
// callee cleans up the stack instead of the caller, and symbols are
// decorated (name-mangled). It does not support any vector arguments.
// For info on the fast calling convention see the Fast Calling Convention
// (tail call) implementation in LowerX86_32FastCCCallTo.
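//
// Illustrative example (not part of the original source): a 32-bit stdcall
// function declared in IR as
//   declare x86_stdcallcc i32 @f(i32, i32)
// is emitted under the decorated symbol _f@8, and the callee removes its
// 8 bytes of arguments with "ret 8".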
1483
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
1485
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1489
return Outs[0].Flags.isSRet();
1492
/// ArgsAreStructReturn - Determines whether a function uses struct
1493
/// return semantics.
1495
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1499
return Ins[0].Flags.isSRet();
1502
/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
1504
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1505
if (Subtarget->is64Bit()) {
1506
if (CC == CallingConv::GHC)
1507
return CC_X86_64_GHC;
1508
else if (Subtarget->isTargetWin64())
1509
return CC_X86_Win64_C;
1514
if (CC == CallingConv::X86_FastCall)
1515
return CC_X86_32_FastCall;
1516
else if (CC == CallingConv::X86_ThisCall)
1517
return CC_X86_32_ThisCall;
1518
else if (CC == CallingConv::Fast)
1519
return CC_X86_32_FastCC;
1520
else if (CC == CallingConv::GHC)
1521
return CC_X86_32_GHC;
1526
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1527
/// by "Src" to address "Dst" with size and alignment information specified by
1528
/// the specific parameter attribute. The copy will be passed as a byval
1529
/// function parameter.
1531
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1532
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1534
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1535
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1536
/*isVolatile*/false, /*AlwaysInline=*/true,
1540
/// IsTailCallConvention - Return true if the calling convention is one that
1541
/// supports tail call optimization.
1542
static bool IsTailCallConvention(CallingConv::ID CC) {
1543
return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1546
/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1547
/// a tailcall target by changing its ABI.
1548
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1549
return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1553
X86TargetLowering::LowerMemArgument(SDValue Chain,
1554
CallingConv::ID CallConv,
1555
const SmallVectorImpl<ISD::InputArg> &Ins,
1556
DebugLoc dl, SelectionDAG &DAG,
1557
const CCValAssign &VA,
1558
MachineFrameInfo *MFI,
1560
// Create the nodes corresponding to a load from this parameter slot.
1561
ISD::ArgFlagsTy Flags = Ins[i].Flags;
1562
bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1563
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1566
// If the value is passed by pointer, we have the address passed instead of
// the value itself.
1568
if (VA.getLocInfo() == CCValAssign::Indirect)
1569
ValVT = VA.getLocVT();
1571
ValVT = VA.getValVT();
1573
// FIXME: For now, all byval parameter objects are marked mutable. This can be
1574
// changed with more analysis.
1575
// In case of tail call optimization, mark all arguments mutable, since they
// could be overwritten by the lowering of arguments in case of a tail call.
1577
if (Flags.isByVal()) {
1578
int FI = MFI->CreateFixedObject(Flags.getByValSize(),
1579
VA.getLocMemOffset(), isImmutable);
1580
return DAG.getFrameIndex(FI, getPointerTy());
1582
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1583
VA.getLocMemOffset(), isImmutable);
1584
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1585
return DAG.getLoad(ValVT, dl, Chain, FIN,
1586
PseudoSourceValue::getFixedStack(FI), 0,
1592
X86TargetLowering::LowerFormalArguments(SDValue Chain,
1593
CallingConv::ID CallConv,
1595
const SmallVectorImpl<ISD::InputArg> &Ins,
1598
SmallVectorImpl<SDValue> &InVals)
1600
MachineFunction &MF = DAG.getMachineFunction();
1601
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1603
const Function* Fn = MF.getFunction();
1604
if (Fn->hasExternalLinkage() &&
1605
Subtarget->isTargetCygMing() &&
1606
Fn->getName() == "main")
1607
FuncInfo->setForceFramePointer(true);
1609
MachineFrameInfo *MFI = MF.getFrameInfo();
1610
bool Is64Bit = Subtarget->is64Bit();
1611
bool IsWin64 = Subtarget->isTargetWin64();
1613
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1614
"Var args not supported with calling convention fastcc or ghc");
1616
// Assign locations to all of the incoming arguments.
1617
SmallVector<CCValAssign, 16> ArgLocs;
1618
CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1619
ArgLocs, *DAG.getContext());
1620
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1622
unsigned LastVal = ~0U;
1624
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1625
CCValAssign &VA = ArgLocs[i];
1626
// TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1628
assert(VA.getValNo() != LastVal &&
1629
"Don't support value assigned to multiple locs yet");
1630
LastVal = VA.getValNo();
1632
if (VA.isRegLoc()) {
1633
EVT RegVT = VA.getLocVT();
1634
TargetRegisterClass *RC = NULL;
1635
if (RegVT == MVT::i32)
1636
RC = X86::GR32RegisterClass;
1637
else if (Is64Bit && RegVT == MVT::i64)
1638
RC = X86::GR64RegisterClass;
1639
else if (RegVT == MVT::f32)
1640
RC = X86::FR32RegisterClass;
1641
else if (RegVT == MVT::f64)
1642
RC = X86::FR64RegisterClass;
1643
else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
1644
RC = X86::VR256RegisterClass;
1645
else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1646
RC = X86::VR128RegisterClass;
1647
else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1648
RC = X86::VR64RegisterClass;
1650
llvm_unreachable("Unknown argument type!");
1652
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1653
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1655
// If this is an 8 or 16-bit value, it is really passed promoted to 32
1656
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
1658
if (VA.getLocInfo() == CCValAssign::SExt)
1659
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1660
DAG.getValueType(VA.getValVT()));
1661
else if (VA.getLocInfo() == CCValAssign::ZExt)
1662
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1663
DAG.getValueType(VA.getValVT()));
1664
else if (VA.getLocInfo() == CCValAssign::BCvt)
1665
ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1667
if (VA.isExtInLoc()) {
1668
// Handle MMX values passed in XMM regs.
1669
if (RegVT.isVector()) {
1670
ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1671
ArgValue, DAG.getConstant(0, MVT::i64));
1672
ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1674
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1677
assert(VA.isMemLoc());
1678
ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1681
// If value is passed via pointer - do a load.
1682
if (VA.getLocInfo() == CCValAssign::Indirect)
1683
ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
1686
InVals.push_back(ArgValue);
1689
// The x86-64 ABI for returning structs by value requires that we copy
1690
// the sret argument into %rax for the return. Save the argument into
1691
// a virtual register so that we can access it from the return points.
1692
if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1693
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1694
unsigned Reg = FuncInfo->getSRetReturnReg();
1696
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1697
FuncInfo->setSRetReturnReg(Reg);
1699
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1700
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1703
unsigned StackSize = CCInfo.getNextStackOffset();
1704
// Align stack specially for tail calls.
1705
if (FuncIsMadeTailCallSafe(CallConv))
1706
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1708
// If the function takes a variable number of arguments, make a frame index for
1709
// the start of the first vararg value... for expansion of llvm.va_start.
1711
if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1712
CallConv != CallingConv::X86_ThisCall)) {
1713
FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1716
unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1718
// FIXME: We should really autogenerate these arrays
1719
static const unsigned GPR64ArgRegsWin64[] = {
1720
X86::RCX, X86::RDX, X86::R8, X86::R9
1722
static const unsigned XMMArgRegsWin64[] = {
1723
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1725
static const unsigned GPR64ArgRegs64Bit[] = {
1726
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1728
static const unsigned XMMArgRegs64Bit[] = {
1729
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1730
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1732
const unsigned *GPR64ArgRegs, *XMMArgRegs;
1735
TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1736
GPR64ArgRegs = GPR64ArgRegsWin64;
1737
XMMArgRegs = XMMArgRegsWin64;
1739
TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1740
GPR64ArgRegs = GPR64ArgRegs64Bit;
1741
XMMArgRegs = XMMArgRegs64Bit;
1743
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1745
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1748
bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1749
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1750
"SSE register cannot be used when SSE is disabled!");
1751
assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1752
"SSE register cannot be used when SSE is disabled!");
1753
if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1754
// Kernel mode asks for SSE to be disabled, so don't push them on the stack.
1756
TotalNumXMMRegs = 0;
1758
// For X86-64, if there are vararg parameters that are passed via
1759
// registers, then we must store them to their spots on the stack so they
1760
// may be loaded by dereferencing the result of va_next.
1761
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1762
FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
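// Illustrative example (not part of the original source): for a SysV x86-64
// vararg function the register save area is 6*8 + 8*16 = 176 bytes; if two
// GPRs and one XMM register were consumed by named arguments, the gp_offset
// recorded above is 2*8 = 16 and the fp_offset is 6*8 + 1*16 = 64.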
1763
FuncInfo->setRegSaveFrameIndex(
1764
MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1767
// Store the integer parameter registers.
1768
SmallVector<SDValue, 8> MemOps;
1769
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1771
unsigned Offset = FuncInfo->getVarArgsGPOffset();
1772
for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1773
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1774
DAG.getIntPtrConstant(Offset));
1775
unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1776
X86::GR64RegisterClass);
1777
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1779
DAG.getStore(Val.getValue(1), dl, Val, FIN,
1780
PseudoSourceValue::getFixedStack(
1781
FuncInfo->getRegSaveFrameIndex()),
1782
Offset, false, false, 0);
1783
MemOps.push_back(Store);
1787
if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1788
// Now store the XMM (fp + vector) parameter registers.
1789
SmallVector<SDValue, 11> SaveXMMOps;
1790
SaveXMMOps.push_back(Chain);
1792
unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1793
SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1794
SaveXMMOps.push_back(ALVal);
1796
SaveXMMOps.push_back(DAG.getIntPtrConstant(
1797
FuncInfo->getRegSaveFrameIndex()));
1798
SaveXMMOps.push_back(DAG.getIntPtrConstant(
1799
FuncInfo->getVarArgsFPOffset()));
1801
for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1802
unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1803
X86::VR128RegisterClass);
1804
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1805
SaveXMMOps.push_back(Val);
1807
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1809
&SaveXMMOps[0], SaveXMMOps.size()));
1812
if (!MemOps.empty())
1813
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1814
&MemOps[0], MemOps.size());
1818
// Some CCs need callee pop.
1819
if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
1820
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1822
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1823
// If this is an sret function, the return should pop the hidden pointer.
1824
if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
1825
FuncInfo->setBytesToPopOnReturn(4);
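// Illustrative example (not part of the original source): a 32-bit function
// returning a struct through the hidden sret pointer therefore ends with
// "ret $4", popping the hidden pointer slot on behalf of its caller.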
1829
// RegSaveFrameIndex is X86-64 only.
1830
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1831
if (CallConv == CallingConv::X86_FastCall ||
1832
CallConv == CallingConv::X86_ThisCall)
1833
// fastcc functions can't have varargs.
1834
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1841
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1842
SDValue StackPtr, SDValue Arg,
1843
DebugLoc dl, SelectionDAG &DAG,
1844
const CCValAssign &VA,
1845
ISD::ArgFlagsTy Flags) const {
1846
const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1847
unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1848
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1849
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1850
if (Flags.isByVal()) {
1851
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1853
return DAG.getStore(Chain, dl, Arg, PtrOff,
1854
PseudoSourceValue::getStack(), LocMemOffset,
1858
/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1859
/// optimization is performed and it is required.
1861
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1862
SDValue &OutRetAddr, SDValue Chain,
1863
bool IsTailCall, bool Is64Bit,
1864
int FPDiff, DebugLoc dl) const {
1865
// Adjust the Return address stack slot.
1866
EVT VT = getPointerTy();
1867
OutRetAddr = getReturnAddressFrameIndex(DAG);
1869
// Load the "old" Return address.
1870
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
1871
return SDValue(OutRetAddr.getNode(), 1);
1874
/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1875
/// optimization is performed and it is required (FPDiff!=0).
1877
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1878
SDValue Chain, SDValue RetAddrFrIdx,
1879
bool Is64Bit, int FPDiff, DebugLoc dl) {
1880
// Store the return address to the appropriate stack slot.
1881
if (!FPDiff) return Chain;
1882
// Calculate the new stack slot for the return address.
1883
int SlotSize = Is64Bit ? 8 : 4;
1884
int NewReturnAddrFI =
1885
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
1886
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1887
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1888
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1889
PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1895
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1896
CallingConv::ID CallConv, bool isVarArg,
1898
const SmallVectorImpl<ISD::OutputArg> &Outs,
1899
const SmallVectorImpl<SDValue> &OutVals,
1900
const SmallVectorImpl<ISD::InputArg> &Ins,
1901
DebugLoc dl, SelectionDAG &DAG,
1902
SmallVectorImpl<SDValue> &InVals) const {
1903
MachineFunction &MF = DAG.getMachineFunction();
1904
bool Is64Bit = Subtarget->is64Bit();
1905
bool IsStructRet = CallIsStructReturn(Outs);
1906
bool IsSibcall = false;
1909
// Check if it's really possible to do a tail call.
1910
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1911
isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1912
Outs, OutVals, Ins, DAG);
1914
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
1916
if (!GuaranteedTailCallOpt && isTailCall)
1923
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1924
"Var args not supported with calling convention fastcc or ghc");
1926
// Analyze operands of the call, assigning locations to each operand.
1927
SmallVector<CCValAssign, 16> ArgLocs;
1928
CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1929
ArgLocs, *DAG.getContext());
1930
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1932
// Get a count of how many bytes are to be pushed on the stack.
1933
unsigned NumBytes = CCInfo.getNextStackOffset();
1935
if (IsSibcall)
  // This is a sibcall. The memory operands are already available in the
  // caller's own stack.
  NumBytes = 0;
1938
else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1939
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1942
if (isTailCall && !IsSibcall) {
1943
// Lower arguments at fp - stackoffset + fpdiff.
1944
unsigned NumBytesCallerPushed =
1945
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1946
FPDiff = NumBytesCallerPushed - NumBytes;
1948
// Set the delta of movement of the return-address stack slot,
// but only if the delta is greater than the previous delta.
1950
if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1951
MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1955
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1957
SDValue RetAddrFrIdx;
1958
// Load the return address for tail calls.
1959
if (isTailCall && FPDiff)
1960
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1961
Is64Bit, FPDiff, dl);
1963
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1964
SmallVector<SDValue, 8> MemOpChains;
1967
// Walk the register/memloc assignments, inserting copies/loads. In the case
1968
// of tail call optimization, arguments are handled later.
1969
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1970
CCValAssign &VA = ArgLocs[i];
1971
EVT RegVT = VA.getLocVT();
1972
SDValue Arg = OutVals[i];
1973
ISD::ArgFlagsTy Flags = Outs[i].Flags;
1974
bool isByVal = Flags.isByVal();
1976
// Promote the value if needed.
1977
switch (VA.getLocInfo()) {
1978
default: llvm_unreachable("Unknown loc info!");
1979
case CCValAssign::Full: break;
1980
case CCValAssign::SExt:
1981
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1983
case CCValAssign::ZExt:
1984
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1986
case CCValAssign::AExt:
1987
if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1988
// Special case: passing MMX values in XMM registers.
1989
Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1990
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1991
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1993
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1995
case CCValAssign::BCvt:
1996
Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1998
case CCValAssign::Indirect: {
1999
// Store the argument.
2000
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2001
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2002
Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2003
PseudoSourceValue::getFixedStack(FI), 0,
2010
if (VA.isRegLoc()) {
2011
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2012
if (isVarArg && Subtarget->isTargetWin64()) {
2013
// The Win64 ABI requires an argument passed in an XMM register to also be
// copied to the corresponding shadow integer register if the callee is a
// varargs function.
2015
unsigned ShadowReg = 0;
2016
switch (VA.getLocReg()) {
2017
case X86::XMM0: ShadowReg = X86::RCX; break;
2018
case X86::XMM1: ShadowReg = X86::RDX; break;
2019
case X86::XMM2: ShadowReg = X86::R8; break;
2020
case X86::XMM3: ShadowReg = X86::R9; break;
2023
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2025
} else if (!IsSibcall && (!isTailCall || isByVal)) {
2026
assert(VA.isMemLoc());
2027
if (StackPtr.getNode() == 0)
2028
StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
2029
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2030
dl, DAG, VA, Flags));
2034
if (!MemOpChains.empty())
2035
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2036
&MemOpChains[0], MemOpChains.size());
2038
// Build a sequence of copy-to-reg nodes chained together with token chain
2039
// and flag operands which copy the outgoing args into registers.
2041
// Tail call byval lowering might overwrite argument registers so in case of
2042
// tail call optimization the copies to registers are lowered later.
2044
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2045
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2046
RegsToPass[i].second, InFlag);
2047
InFlag = Chain.getValue(1);
2050
if (Subtarget->isPICStyleGOT()) {
2051
// ELF / PIC requires GOT in the EBX register before function calls via PLT
2054
Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
2055
DAG.getNode(X86ISD::GlobalBaseReg,
2056
DebugLoc(), getPointerTy()),
2058
InFlag = Chain.getValue(1);
2060
// If we are tail calling and generating PIC/GOT style code, load the
// address of the callee into ECX. The value in ECX is used as the target of
// the tail jump. This is done to circumvent the EBX/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of the GOT into EBX and then call target@PLT. But for tail calls
// EBX would be restored (since EBX is callee saved) before jumping to the
// target.
// Note: The actual moving to ECX is done further down.
2069
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2070
if (G && !G->getGlobal()->hasHiddenVisibility() &&
2071
!G->getGlobal()->hasProtectedVisibility())
2072
Callee = LowerGlobalAddress(Callee, DAG);
2073
else if (isa<ExternalSymbolSDNode>(Callee))
2074
Callee = LowerExternalSymbol(Callee, DAG);
2078
if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
2079
// From AMD64 ABI document:
2080
// For calls that may call functions that use varargs or stdargs
2081
// (prototype-less calls or calls to functions containing ellipsis (...) in
2082
// the declaration) %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
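//
// Illustrative example (not part of the original source): for a call such as
// printf("%f\n", x) on x86-64, the double argument occupies XMM0, so %al may
// be set to 1; any value from 1 to 8 would also be a legal upper bound here.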
2087
// Count the number of XMM registers allocated.
2088
static const unsigned XMMArgRegs[] = {
2089
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2090
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2092
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2093
assert((Subtarget->hasSSE1() || !NumXMMRegs)
2094
&& "SSE registers cannot be used when SSE is disabled");
2096
Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2097
DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2098
InFlag = Chain.getValue(1);
2102
// For tail calls lower the arguments to the 'real' stack slot.
2104
// Force all the incoming stack arguments to be loaded from the stack
2105
// before any new outgoing arguments are stored to the stack, because the
2106
// outgoing stack slots may alias the incoming argument stack slots, and
2107
// the alias isn't otherwise explicit. This is slightly more conservative
2108
// than necessary, because it means that each store effectively depends
2109
// on every argument instead of just those arguments it would clobber.
2110
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2112
SmallVector<SDValue, 8> MemOpChains2;
2115
// Do not flag preceding copytoreg stuff together with the following stuff.
2117
if (GuaranteedTailCallOpt) {
2118
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2119
CCValAssign &VA = ArgLocs[i];
2122
assert(VA.isMemLoc());
2123
SDValue Arg = OutVals[i];
2124
ISD::ArgFlagsTy Flags = Outs[i].Flags;
2125
// Create frame index.
2126
int32_t Offset = VA.getLocMemOffset()+FPDiff;
2127
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2128
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2129
FIN = DAG.getFrameIndex(FI, getPointerTy());
2131
if (Flags.isByVal()) {
2132
// Copy relative to framepointer.
2133
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2134
if (StackPtr.getNode() == 0)
2135
StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2137
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2139
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2143
// Store relative to framepointer.
2144
MemOpChains2.push_back(
2145
DAG.getStore(ArgChain, dl, Arg, FIN,
2146
PseudoSourceValue::getFixedStack(FI), 0,
2152
if (!MemOpChains2.empty())
2153
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2154
&MemOpChains2[0], MemOpChains2.size());
2156
// Copy arguments to their registers.
2157
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2158
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2159
RegsToPass[i].second, InFlag);
2160
InFlag = Chain.getValue(1);
2164
// Store the return address to the appropriate stack slot.
2165
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2169
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2170
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2171
// In the 64-bit large code model, we have to make all calls
2172
// through a register, since the call instruction's 32-bit
2173
// pc-relative offset may not be large enough to hold the whole address.
2175
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2176
// If the callee is a GlobalAddress node (quite common, every direct call
2177
// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
2180
// We should use extra load for direct calls to dllimported functions in
2182
const GlobalValue *GV = G->getGlobal();
2183
if (!GV->hasDLLImportLinkage()) {
2184
unsigned char OpFlags = 0;
2186
// On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2187
// external symbols must go through the PLT in PIC mode. If the symbol
2188
// has hidden or protected visibility, or if it is static or local, then
2189
// we don't need to use the PLT - we can directly call it.
2190
if (Subtarget->isTargetELF() &&
2191
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2192
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2193
OpFlags = X86II::MO_PLT;
2194
} else if (Subtarget->isPICStyleStubAny() &&
2195
(GV->isDeclaration() || GV->isWeakForLinker()) &&
2196
Subtarget->getDarwinVers() < 9) {
2197
// PC-relative references to external symbols should go through $stub,
2198
// unless we're building with the leopard linker or later, which
2199
// automatically synthesizes these stubs.
2200
OpFlags = X86II::MO_DARWIN_STUB;
2203
Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2204
G->getOffset(), OpFlags);
2206
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2207
unsigned char OpFlags = 0;
2209
// On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2210
// symbols should go through the PLT.
2211
if (Subtarget->isTargetELF() &&
2212
getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2213
OpFlags = X86II::MO_PLT;
2214
} else if (Subtarget->isPICStyleStubAny() &&
2215
Subtarget->getDarwinVers() < 9) {
2216
// PC-relative references to external symbols should go through $stub,
2217
// unless we're building with the leopard linker or later, which
2218
// automatically synthesizes these stubs.
2219
OpFlags = X86II::MO_DARWIN_STUB;
2222
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2226
// Returns a chain & a flag for retval copy to use.
2227
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2228
SmallVector<SDValue, 8> Ops;
2230
if (!IsSibcall && isTailCall) {
2231
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2232
DAG.getIntPtrConstant(0, true), InFlag);
2233
InFlag = Chain.getValue(1);
2236
Ops.push_back(Chain);
2237
Ops.push_back(Callee);
2240
Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2242
// Add argument registers to the end of the list so that they are known live
// into the call.
2244
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2245
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2246
RegsToPass[i].second.getValueType()));
2248
// Add an implicit use of the GOT pointer in EBX.
2249
if (!isTailCall && Subtarget->isPICStyleGOT())
2250
Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2252
// Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2253
if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
2254
Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2256
if (InFlag.getNode())
2257
Ops.push_back(InFlag);
2261
//// If this is the first return lowered for this function, add the regs
2262
//// to the liveout set for the function.
2263
// This isn't right, although it's probably harmless on x86; liveouts
2264
// should be computed from returns not tail calls. Consider a void
2265
// function making a tail call to a function returning int.
2266
return DAG.getNode(X86ISD::TC_RETURN, dl,
2267
NodeTys, &Ops[0], Ops.size());
2270
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2271
InFlag = Chain.getValue(1);
2273
// Create the CALLSEQ_END node.
2274
unsigned NumBytesForCalleeToPush;
2275
if (Subtarget->IsCalleePop(isVarArg, CallConv))
2276
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2277
else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2278
// If this is a call to a struct-return function, the callee
2279
// pops the hidden struct pointer, so we have to push it back.
2280
// This is common for Darwin/X86, Linux & Mingw32 targets.
2281
NumBytesForCalleeToPush = 4;
2283
NumBytesForCalleeToPush = 0; // Callee pops nothing.
2285
// Returns a flag for retval copy to use.
2287
Chain = DAG.getCALLSEQ_END(Chain,
2288
DAG.getIntPtrConstant(NumBytes, true),
2289
DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2292
InFlag = Chain.getValue(1);
2295
// Handle result values, copying them out of physregs into vregs that we
// return.
2297
return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2298
Ins, dl, DAG, InVals);
2302
//===----------------------------------------------------------------------===//
2303
// Fast Calling Convention (tail call) implementation
2304
//===----------------------------------------------------------------------===//
2306
// Like StdCall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization
// is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
// On the X86_64 architecture with GOT-style position independent code only
// local (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's
// dyld for example)
// If a tail-called callee has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after
// the original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
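//
// Illustrative stack layout sketch (a reconstruction, not guaranteed to match
// the original diagram exactly):
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR
//     move area ]
//   (possible EBP)
//   ESI
//   EDI
//   EBX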
2333
/// GetAlignedArgumentStackSize - Align the stack size to, e.g., 16n + 12 bytes
/// for a 16-byte alignment requirement, leaving room for the return address
/// slot.
2336
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2337
SelectionDAG& DAG) const {
2338
MachineFunction &MF = DAG.getMachineFunction();
2339
const TargetMachine &TM = MF.getTarget();
2340
const TargetFrameInfo &TFI = *TM.getFrameInfo();
2341
unsigned StackAlignment = TFI.getStackAlignment();
2342
uint64_t AlignMask = StackAlignment - 1;
2343
int64_t Offset = StackSize;
2344
uint64_t SlotSize = TD->getPointerSize();
2345
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2346
// Number smaller than 12 so just add the difference.
2347
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2349
// Mask out lower bits, add stackalignment once plus the 12 bytes.
2350
Offset = ((~AlignMask) & Offset) + StackAlignment +
2351
(StackAlignment-SlotSize);
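// Worked example (illustrative, not part of the original source), assuming
// StackAlignment = 16 and SlotSize = 4, so the result is always 16n + 12 and
// pushing the 4-byte return address restores 16-byte alignment:
//   StackSize = 20 -> 20 & 15 = 4  <= 12 -> 20 + (12 - 4)        = 28 = 16 + 12
//   StackSize = 30 -> 30 & 15 = 14 >  12 -> (30 & ~15) + 16 + 12 = 44 = 32 + 12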
2356
/// MatchingStackOffset - Return true if the given stack call argument is
2357
/// already available in the same position (relatively) of the caller's
2358
/// incoming argument stack.
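///
/// Illustrative example (assumption, not from the original source): if
/// void f(int a, int b) simply tail-calls g(a, b), each outgoing argument is
/// either still in its incoming register or is a load from the caller's own
/// fixed incoming-argument slot with the same offset and size, so no stores
/// are needed and the call can be emitted as a sibcall.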
2360
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2361
MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2362
const X86InstrInfo *TII) {
2363
unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2365
if (Arg.getOpcode() == ISD::CopyFromReg) {
2366
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2367
if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2369
MachineInstr *Def = MRI->getVRegDef(VR);
2372
if (!Flags.isByVal()) {
2373
if (!TII->isLoadFromStackSlot(Def, FI))
2376
unsigned Opcode = Def->getOpcode();
2377
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2378
Def->getOperand(1).isFI()) {
2379
FI = Def->getOperand(1).getIndex();
2380
Bytes = Flags.getByValSize();
2384
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2385
if (Flags.isByVal())
2386
// ByVal argument is passed in as a pointer but it's now being
2387
// dereferenced. e.g.
2388
// define @foo(%struct.X* %A) {
2389
// tail call @bar(%struct.X* byval %A)
2392
SDValue Ptr = Ld->getBasePtr();
2393
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2396
FI = FINode->getIndex();
2400
assert(FI != INT_MAX);
2401
if (!MFI->isFixedObjectIndex(FI))
2403
return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2406
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2407
/// for tail call optimization. Targets which want to do tail call
2408
/// optimization should implement this function.
2410
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2411
CallingConv::ID CalleeCC,
2413
bool isCalleeStructRet,
2414
bool isCallerStructRet,
2415
const SmallVectorImpl<ISD::OutputArg> &Outs,
2416
const SmallVectorImpl<SDValue> &OutVals,
2417
const SmallVectorImpl<ISD::InputArg> &Ins,
2418
SelectionDAG& DAG) const {
2419
if (!IsTailCallConvention(CalleeCC) &&
2420
CalleeCC != CallingConv::C)
2423
// If -tailcallopt is specified, make fastcc functions tail-callable.
2424
const MachineFunction &MF = DAG.getMachineFunction();
2425
const Function *CallerF = DAG.getMachineFunction().getFunction();
2426
CallingConv::ID CallerCC = CallerF->getCallingConv();
2427
bool CCMatch = CallerCC == CalleeCC;
2429
if (GuaranteedTailCallOpt) {
2430
if (IsTailCallConvention(CalleeCC) && CCMatch)
2435
// Look for obvious safe cases to perform tail call optimization that do not
2436
// require ABI changes. This is what gcc calls sibcall.
2438
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2439
// emit a special epilogue.
2440
if (RegInfo->needsStackRealignment(MF))
2443
// Do not sibcall optimize vararg calls unless the call site is not passing
// any arguments.
2445
if (isVarArg && !Outs.empty())
2448
// Also avoid sibcall optimization if either caller or callee uses struct
2449
// return semantics.
2450
if (isCalleeStructRet || isCallerStructRet)
2453
// If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
2454
// Therefore, if it's not used by the call it is not safe to optimize this
// into a sibcall.
2456
bool Unused = false;
2457
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2464
SmallVector<CCValAssign, 16> RVLocs;
2465
CCState CCInfo(CalleeCC, false, getTargetMachine(),
2466
RVLocs, *DAG.getContext());
2467
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2468
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2469
CCValAssign &VA = RVLocs[i];
2470
if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2475
// If the calling conventions do not match, then we'd better make sure the
2476
// results are returned in the same way as what the caller expects.
2478
SmallVector<CCValAssign, 16> RVLocs1;
2479
CCState CCInfo1(CalleeCC, false, getTargetMachine(),
2480
RVLocs1, *DAG.getContext());
2481
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2483
SmallVector<CCValAssign, 16> RVLocs2;
2484
CCState CCInfo2(CallerCC, false, getTargetMachine(),
2485
RVLocs2, *DAG.getContext());
2486
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2488
if (RVLocs1.size() != RVLocs2.size())
2490
for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2491
if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2493
if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2495
if (RVLocs1[i].isRegLoc()) {
2496
if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2499
if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2505
// If the callee takes no arguments then go on to check the results of the
// call.
2507
if (!Outs.empty()) {
2508
// Check if stack adjustment is needed. For now, do not do this if any
2509
// argument is passed on the stack.
2510
SmallVector<CCValAssign, 16> ArgLocs;
2511
CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2512
ArgLocs, *DAG.getContext());
2513
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2514
if (CCInfo.getNextStackOffset()) {
2515
MachineFunction &MF = DAG.getMachineFunction();
2516
if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2518
if (Subtarget->isTargetWin64())
2519
// Win64 ABI has additional complications.
2522
// Check if the arguments are already laid out in the right way as
2523
// the caller's fixed stack objects.
2524
MachineFrameInfo *MFI = MF.getFrameInfo();
2525
const MachineRegisterInfo *MRI = &MF.getRegInfo();
2526
const X86InstrInfo *TII =
2527
((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2528
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2529
CCValAssign &VA = ArgLocs[i];
2530
SDValue Arg = OutVals[i];
2531
ISD::ArgFlagsTy Flags = Outs[i].Flags;
2532
if (VA.getLocInfo() == CCValAssign::Indirect)
2534
if (!VA.isRegLoc()) {
2535
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2542
// If the tailcall address may be in a register, then make sure it's
2543
// possible to register allocate for it. In 32-bit, the call address can
2544
// only target EAX, EDX, or ECX since the tail call must be scheduled after
2545
// callee-saved registers are restored. These happen to be the same
2546
// registers used to pass 'inreg' arguments so watch out for those.
2547
if (!Subtarget->is64Bit() &&
2548
!isa<GlobalAddressSDNode>(Callee) &&
2549
!isa<ExternalSymbolSDNode>(Callee)) {
2550
unsigned NumInRegs = 0;
2551
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2552
CCValAssign &VA = ArgLocs[i];
2555
unsigned Reg = VA.getLocReg();
2558
case X86::EAX: case X86::EDX: case X86::ECX:
2559
if (++NumInRegs == 3)
2571
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
2572
return X86::createFastISel(funcInfo);
2576
//===----------------------------------------------------------------------===//
2577
// Other Lowering Hooks
2578
//===----------------------------------------------------------------------===//
2580
static bool MayFoldLoad(SDValue Op) {
2581
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
2584
static bool MayFoldIntoStore(SDValue Op) {
2585
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2588
static bool isTargetShuffle(unsigned Opcode) {
2590
default: return false;
2591
case X86ISD::PSHUFD:
2592
case X86ISD::PSHUFHW:
2593
case X86ISD::PSHUFLW:
2594
case X86ISD::SHUFPD:
2595
case X86ISD::SHUFPS:
2596
case X86ISD::MOVLHPS:
2597
case X86ISD::MOVLHPD:
2598
case X86ISD::MOVHLPS:
2599
case X86ISD::MOVLPS:
2600
case X86ISD::MOVLPD:
2601
case X86ISD::MOVSHDUP:
2602
case X86ISD::MOVSLDUP:
2605
case X86ISD::UNPCKLPS:
2606
case X86ISD::UNPCKLPD:
2607
case X86ISD::PUNPCKLWD:
2608
case X86ISD::PUNPCKLBW:
2609
case X86ISD::PUNPCKLDQ:
2610
case X86ISD::PUNPCKLQDQ:
2611
case X86ISD::UNPCKHPS:
2612
case X86ISD::UNPCKHPD:
2613
case X86ISD::PUNPCKHWD:
2614
case X86ISD::PUNPCKHBW:
2615
case X86ISD::PUNPCKHDQ:
2616
case X86ISD::PUNPCKHQDQ:
2622
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2623
SDValue V1, SelectionDAG &DAG) {
2625
default: llvm_unreachable("Unknown x86 shuffle node");
2626
case X86ISD::MOVSHDUP:
2627
case X86ISD::MOVSLDUP:
2628
return DAG.getNode(Opc, dl, VT, V1);
2634
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2635
SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
2637
default: llvm_unreachable("Unknown x86 shuffle node");
2638
case X86ISD::PSHUFD:
2639
case X86ISD::PSHUFHW:
2640
case X86ISD::PSHUFLW:
2641
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
2647
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2648
SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
2650
default: llvm_unreachable("Unknown x86 shuffle node");
2651
case X86ISD::SHUFPD:
2652
case X86ISD::SHUFPS:
2653
return DAG.getNode(Opc, dl, VT, V1, V2,
2654
DAG.getConstant(TargetMask, MVT::i8));
2659
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2660
SDValue V1, SDValue V2, SelectionDAG &DAG) {
2662
default: llvm_unreachable("Unknown x86 shuffle node");
2663
case X86ISD::MOVLHPS:
2664
case X86ISD::MOVLHPD:
2665
case X86ISD::MOVHLPS:
2666
case X86ISD::MOVLPS:
2667
case X86ISD::MOVLPD:
2670
case X86ISD::UNPCKLPS:
2671
case X86ISD::UNPCKLPD:
2672
case X86ISD::PUNPCKLWD:
2673
case X86ISD::PUNPCKLBW:
2674
case X86ISD::PUNPCKLDQ:
2675
case X86ISD::PUNPCKLQDQ:
2676
case X86ISD::UNPCKHPS:
2677
case X86ISD::UNPCKHPD:
2678
case X86ISD::PUNPCKHWD:
2679
case X86ISD::PUNPCKHBW:
2680
case X86ISD::PUNPCKHDQ:
2681
case X86ISD::PUNPCKHQDQ:
2682
return DAG.getNode(Opc, dl, VT, V1, V2);
2687
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2688
MachineFunction &MF = DAG.getMachineFunction();
2689
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2690
int ReturnAddrIndex = FuncInfo->getRAIndex();
2692
if (ReturnAddrIndex == 0) {
2693
// Set up a frame object for the return address.
2694
uint64_t SlotSize = TD->getPointerSize();
2695
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2697
FuncInfo->setRAIndex(ReturnAddrIndex);
2700
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2704
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2705
bool hasSymbolicDisplacement) {
2706
// Offset should fit into 32 bit immediate field.
2707
if (!isInt<32>(Offset))
2710
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
2712
if (!hasSymbolicDisplacement)
2715
// FIXME: Some tweaks might be needed for medium code model.
2716
if (M != CodeModel::Small && M != CodeModel::Kernel)
2719
// For the small code model we assume that the latest object is 16MB before
// the end of the 31-bit boundary. We may also accept pretty large negative
// constants knowing that all objects are in the positive half of the
// address space.
2722
if (M == CodeModel::Small && Offset < 16*1024*1024)
2725
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space. We may not accept negative offsets, since
// they may be just off, but we may accept pretty large positive ones.
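//
// Illustrative example (not part of the original source): with the small
// code model an offset of 12*1024*1024 is accepted (it stays below the 16MB
// guard zone before the 2^31 boundary), while with the kernel code model
// only positive offsets such as 4096 are accepted.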
2728
if (M == CodeModel::Kernel && Offset > 0)
2734
/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2735
/// specific condition code, returning the condition code and the LHS/RHS of the
2736
/// comparison to make.
2737
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2738
SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2740
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2741
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2742
// X > -1 -> X == 0, jump !sign.
2743
RHS = DAG.getConstant(0, RHS.getValueType());
2744
return X86::COND_NS;
2745
} else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2746
// X < 0 -> X == 0, jump on sign.
2748
} else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2750
RHS = DAG.getConstant(0, RHS.getValueType());
2751
return X86::COND_LE;
2755
switch (SetCCOpcode) {
2756
default: llvm_unreachable("Invalid integer condition!");
2757
case ISD::SETEQ: return X86::COND_E;
2758
case ISD::SETGT: return X86::COND_G;
2759
case ISD::SETGE: return X86::COND_GE;
2760
case ISD::SETLT: return X86::COND_L;
2761
case ISD::SETLE: return X86::COND_LE;
2762
case ISD::SETNE: return X86::COND_NE;
2763
case ISD::SETULT: return X86::COND_B;
2764
case ISD::SETUGT: return X86::COND_A;
2765
case ISD::SETULE: return X86::COND_BE;
2766
case ISD::SETUGE: return X86::COND_AE;
2770
// First determine if it is required or is profitable to flip the operands.
2772
// If LHS is a foldable load, but RHS is not, flip the condition.
2773
if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2774
!(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2775
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2776
std::swap(LHS, RHS);
2779
switch (SetCCOpcode) {
2785
std::swap(LHS, RHS);
2789
// On a floating point condition, the flags are set as follows:
//  ZF  PF  CF   op
//   0 | 0 | 0 | X > Y
//   0 | 0 | 1 | X < Y
//   1 | 0 | 0 | X == Y
//   1 | 1 | 1 | unordered
2795
switch (SetCCOpcode) {
2796
default: llvm_unreachable("Condcode should be pre-legalized away");
2798
case ISD::SETEQ: return X86::COND_E;
2799
case ISD::SETOLT: // flipped
2801
case ISD::SETGT: return X86::COND_A;
2802
case ISD::SETOLE: // flipped
2804
case ISD::SETGE: return X86::COND_AE;
2805
case ISD::SETUGT: // flipped
2807
case ISD::SETLT: return X86::COND_B;
2808
case ISD::SETUGE: // flipped
2810
case ISD::SETLE: return X86::COND_BE;
2812
case ISD::SETNE: return X86::COND_NE;
2813
case ISD::SETUO: return X86::COND_P;
2814
case ISD::SETO: return X86::COND_NP;
2816
case ISD::SETUNE: return X86::COND_INVALID;
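//
// Illustrative example (assumption, not from the original source): for
// "ucomiss %xmm1, %xmm0" with X in xmm0 and Y in xmm1, X > Y clears ZF, PF
// and CF, so SETGT maps to COND_A; SETOLT is handled by swapping LHS and RHS
// above and then reusing the same COND_A mapping.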
2820
/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
/// code? The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2823
static bool hasFPCMov(unsigned X86CC) {
2839
/// isFPImmLegal - Returns true if the target can instruction select the
2840
/// specified FP immediate natively. If false, the legalizer will
2841
/// materialize the FP immediate as a load from a constant pool.
2842
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2843
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2844
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2850
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2851
/// the specified range [Low, Hi).
2852
static bool isUndefOrInRange(int Val, int Low, int Hi) {
2853
return (Val < 0) || (Val >= Low && Val < Hi);
2856
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2857
/// specified value.
2858
static bool isUndefOrEqual(int Val, int CmpVal) {
2859
if (Val < 0 || Val == CmpVal)
2864
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2865
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
2866
/// the second operand.
2867
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2868
if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2869
return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2870
if (VT == MVT::v2f64 || VT == MVT::v2i64)
2871
return (Mask[0] < 2 && Mask[1] < 2);
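// Illustrative example (not part of the original source): for v4i32 the mask
// <2, 1, 0, 3> is a valid PSHUFD mask (every index references the first
// operand), whereas <0, 4, 1, 5> is not, because indices >= 4 reference the
// second operand.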
2875
bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2876
SmallVector<int, 8> M;
2878
return ::isPSHUFDMask(M, N->getValueType(0));
2881
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2882
/// is suitable for input to PSHUFHW.
2883
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2884
if (VT != MVT::v8i16)
2887
// Lower quadword copied in order or undef.
2888
for (int i = 0; i != 4; ++i)
2889
if (Mask[i] >= 0 && Mask[i] != i)
2892
// Upper quadword shuffled.
2893
for (int i = 4; i != 8; ++i)
2894
if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2900
bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2901
SmallVector<int, 8> M;
2903
return ::isPSHUFHWMask(M, N->getValueType(0));
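// Illustrative example (not part of the original source): for v8i16 the mask
// <0, 1, 2, 3, 7, 6, 5, 4> is a valid PSHUFHW mask (the low quadword is
// copied in order, only the high quadword is permuted); the analogous
// PSHUFLW form would be <3, 2, 1, 0, 4, 5, 6, 7>.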
2906
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2907
/// is suitable for input to PSHUFLW.
2908
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2909
if (VT != MVT::v8i16)
2912
// Upper quadword copied in order.
2913
for (int i = 4; i != 8; ++i)
2914
if (Mask[i] >= 0 && Mask[i] != i)
2917
// Lower quadword shuffled.
2918
for (int i = 0; i != 4; ++i)
2925
bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2926
SmallVector<int, 8> M;
2928
return ::isPSHUFLWMask(M, N->getValueType(0));
2931
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2932
/// is suitable for input to PALIGNR.
2933
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2935
int i, e = VT.getVectorNumElements();
2937
// Do not handle v2i64 / v2f64 shuffles with palignr.
2938
if (e < 4 || !hasSSSE3)
2941
for (i = 0; i != e; ++i)
2945
// All undef, not a palignr.
2949
// Determine if it's ok to perform a palignr with only the LHS, since we
2950
// don't have access to the actual shuffle elements to see if RHS is undef.
2951
bool Unary = Mask[i] < (int)e;
2952
bool NeedsUnary = false;
2954
int s = Mask[i] - i;
2956
// Check the rest of the elements to see if they are consecutive.
2957
for (++i; i != e; ++i) {
2962
Unary = Unary && (m < (int)e);
2963
NeedsUnary = NeedsUnary || (m < s);
2965
if (NeedsUnary && !Unary)
2967
if (Unary && m != ((s+i) & (e-1)))
2969
if (!Unary && m != (s+i))
2975
bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2976
SmallVector<int, 8> M;
2978
return ::isPALIGNRMask(M, N->getValueType(0), true);
2981
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2982
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2983
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2984
int NumElems = VT.getVectorNumElements();
2985
if (NumElems != 2 && NumElems != 4)
2988
int Half = NumElems / 2;
2989
for (int i = 0; i < Half; ++i)
2990
if (!isUndefOrInRange(Mask[i], 0, NumElems))
2992
for (int i = Half; i < NumElems; ++i)
2993
if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2999
bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
3000
SmallVector<int, 8> M;
3002
return ::isSHUFPMask(M, N->getValueType(0));
3005
/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
3006
/// the reverse of what x86 shuffles want. x86 shuffles require the lower
3007
/// half elements to come from vector 1 (which would equal the dest.) and
3008
/// the upper half to come from vector 2.
3009
static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3010
int NumElems = VT.getVectorNumElements();
3012
if (NumElems != 2 && NumElems != 4)
3015
int Half = NumElems / 2;
3016
for (int i = 0; i < Half; ++i)
3017
if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3019
for (int i = Half; i < NumElems; ++i)
3020
if (!isUndefOrInRange(Mask[i], 0, NumElems))
3025
static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
3026
SmallVector<int, 8> M;
3028
return isCommutedSHUFPMask(M, N->getValueType(0));
3031
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3032
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3033
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
3034
if (N->getValueType(0).getVectorNumElements() != 4)
3037
// Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
3038
return isUndefOrEqual(N->getMaskElt(0), 6) &&
3039
isUndefOrEqual(N->getMaskElt(1), 7) &&
3040
isUndefOrEqual(N->getMaskElt(2), 2) &&
3041
isUndefOrEqual(N->getMaskElt(3), 3);
3044
/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3045
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>.
3047
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
3048
unsigned NumElems = N->getValueType(0).getVectorNumElements();
3053
return isUndefOrEqual(N->getMaskElt(0), 2) &&
3054
isUndefOrEqual(N->getMaskElt(1), 3) &&
3055
isUndefOrEqual(N->getMaskElt(2), 2) &&
3056
isUndefOrEqual(N->getMaskElt(3), 3);
3059
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3060
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
3061
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
3062
unsigned NumElems = N->getValueType(0).getVectorNumElements();
3064
if (NumElems != 2 && NumElems != 4)
3067
for (unsigned i = 0; i < NumElems/2; ++i)
3068
if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
3071
for (unsigned i = NumElems/2; i < NumElems; ++i)
3072
if (!isUndefOrEqual(N->getMaskElt(i), i))
3078
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3079
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3080
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
3081
unsigned NumElems = N->getValueType(0).getVectorNumElements();
3083
if (NumElems != 2 && NumElems != 4)
3086
for (unsigned i = 0; i < NumElems/2; ++i)
3087
if (!isUndefOrEqual(N->getMaskElt(i), i))
3090
for (unsigned i = 0; i < NumElems/2; ++i)
3091
if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
3097
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}
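// For illustration (hypothetical masks): on v4i32, UNPCKL interleaves the low
// halves of the two inputs, so <0, 4, 1, 5> is accepted, and UNPCKH
// interleaves the high halves, so <2, 6, 3, 7> is accepted.  With V2IsSplat,
// the odd positions only need to select element NumElts (the first element of
// the splatted V2).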
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: the lowest element comes from the lowest element of
/// vector 2 and the remaining elements come from vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}
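// For illustration (hypothetical masks): on v4f32, <1, 1, 3, 3> matches
// MOVSHDUP (duplicate the odd elements) and <0, 0, 2, 2> matches MOVSLDUP
// (duplicate the even elements); on v2f64, <0, 0> matches MOVDDUP, which
// duplicates the low half of the vector.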
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  return (Val - i) * EltSize;
}
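// Worked example (hypothetical mask): for the v4i32 shuffle mask <1, 0, 3, 2>,
// getShuffleSHUFImmediate visits mask elements 3..0 (values 2, 3, 0, 1),
// shifting by 2 bits each step, and produces 1 | (0 << 2) | (3 << 4) | (2 << 6)
// == 0xB1, which is exactly the PSHUFD immediate that swaps the two dwords
// within each 64-bit half.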
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->isNullValue()) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = SVOp->getMaskElt(i);
    if (idx < 0)
      MaskVec.push_back(idx);
    else if (idx < (int)NumElems)
      MaskVec.push_back(idx + NumElems);
    else
      MaskVec.push_back(idx - NumElems);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}
/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // load folding shufps op.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}
/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an all-zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}
/// getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted
  // to their dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) { // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (VT.getSizeInBits() == 128) {
    if (HasSSE2) {  // SSE2
      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
    } else { // SSE1
      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
    }
  } else if (VT.getSizeInBits() == 256) { // AVX
    // 256-bit logic and arithmetic instructions in AVX are
    // all floating-point, no support for integer ops. Default
    // to emitting fp zeroed vectors then.
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
  // type. This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.getSizeInBits() == 64) // MMX
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  else // SSE
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
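// For illustration (hypothetical call): getZeroVector(MVT::v8i16, true, DAG, dl)
// builds (v8i16 (bitconvert (v4i32 build_vector 0, 0, 0, 0))).  Because every
// 128-bit zero is built as the same v4i32 node, requests for v16i8, v8i16 and
// v4i32 zeros all CSE to a single constant in the DAG, as the comments above
// intend.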
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}

/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
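// For illustration (hypothetical VT): with VT == MVT::v4f32, getMOVL builds the
// mask <4, 1, 2, 3>, getUnpackl builds <0, 4, 1, 5> and getUnpackh builds
// <2, 6, 3, 7>, matching the MOVSS, UNPCKLPS and UNPCKHPS element selections
// respectively.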
/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector and a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}
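// For illustration (hypothetical arguments): with a v4i32 V2 and Idx == 0,
// getShuffleVectorZeroOrUndef returns vector_shuffle(Zero, V2, <4, 1, 2, 3>),
// i.e. V2's low element placed into lane 0 of an otherwise zero vector, which
// later matches the MOVL/MOVSS-style patterns.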
/// getShuffleScalarElt - Returns the scalar element that will make up the ith
3690
/// element of the result of the vector shuffle.
3691
SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
3694
return SDValue(); // Limit search depth.
3696
SDValue V = SDValue(N, 0);
3697
EVT VT = V.getValueType();
3698
unsigned Opcode = V.getOpcode();
3700
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
3701
if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
3702
Index = SV->getMaskElt(Index);
3705
return DAG.getUNDEF(VT.getVectorElementType());
3707
int NumElems = VT.getVectorNumElements();
3708
SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
3709
return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
3712
// Recurse into target specific vector shuffles to find scalars.
3713
if (isTargetShuffle(Opcode)) {
3714
int NumElems = VT.getVectorNumElements();
3715
SmallVector<unsigned, 16> ShuffleMask;
3719
case X86ISD::SHUFPS:
3720
case X86ISD::SHUFPD:
3721
ImmN = N->getOperand(N->getNumOperands()-1);
3722
DecodeSHUFPSMask(NumElems,
3723
cast<ConstantSDNode>(ImmN)->getZExtValue(),
3726
case X86ISD::PUNPCKHBW:
3727
case X86ISD::PUNPCKHWD:
3728
case X86ISD::PUNPCKHDQ:
3729
case X86ISD::PUNPCKHQDQ:
3730
DecodePUNPCKHMask(NumElems, ShuffleMask);
3732
case X86ISD::UNPCKHPS:
3733
case X86ISD::UNPCKHPD:
3734
DecodeUNPCKHPMask(NumElems, ShuffleMask);
3736
case X86ISD::PUNPCKLBW:
3737
case X86ISD::PUNPCKLWD:
3738
case X86ISD::PUNPCKLDQ:
3739
case X86ISD::PUNPCKLQDQ:
3740
DecodePUNPCKLMask(NumElems, ShuffleMask);
3742
case X86ISD::UNPCKLPS:
3743
case X86ISD::UNPCKLPD:
3744
DecodeUNPCKLPMask(NumElems, ShuffleMask);
3746
case X86ISD::MOVHLPS:
3747
DecodeMOVHLPSMask(NumElems, ShuffleMask);
3749
case X86ISD::MOVLHPS:
3750
DecodeMOVLHPSMask(NumElems, ShuffleMask);
3752
case X86ISD::PSHUFD:
3753
ImmN = N->getOperand(N->getNumOperands()-1);
3754
DecodePSHUFMask(NumElems,
3755
cast<ConstantSDNode>(ImmN)->getZExtValue(),
3758
case X86ISD::PSHUFHW:
3759
ImmN = N->getOperand(N->getNumOperands()-1);
3760
DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
3763
case X86ISD::PSHUFLW:
3764
ImmN = N->getOperand(N->getNumOperands()-1);
3765
DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
3769
case X86ISD::MOVSD: {
3770
// The index 0 always comes from the first element of the second source,
3771
// this is why MOVSS and MOVSD are used in the first place. The other
3772
// elements come from the other positions of the first source vector.
3773
unsigned OpNum = (Index == 0) ? 1 : 0;
3774
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
3778
assert("not implemented for target shuffle node");
3782
Index = ShuffleMask[Index];
3784
return DAG.getUNDEF(VT.getVectorElementType());
3786
SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
3787
return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
3791
// Actual nodes that may contain scalar elements
3792
if (Opcode == ISD::BIT_CONVERT) {
3793
V = V.getOperand(0);
3794
EVT SrcVT = V.getValueType();
3795
unsigned NumElems = VT.getVectorNumElements();
3797
if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
3801
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
3802
return (Index == 0) ? V.getOperand(0)
3803
: DAG.getUNDEF(VT.getVectorElementType());
3805
if (V.getOpcode() == ISD::BUILD_VECTOR)
3806
return V.getOperand(Index);
3811
/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come consecutively from a zero. The
/// search can start in two different directions, from left or right.
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
3816
bool ZerosFromLeft, SelectionDAG &DAG) {
3819
while (i < NumElems) {
3820
unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
3821
SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
3822
if (!(Elt.getNode() &&
3823
(Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
3831
/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
3832
/// MaskE correspond consecutively to elements from one of the vector operands,
3833
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
3835
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
3836
int OpIdx, int NumElems, unsigned &OpNum) {
3837
bool SeenV1 = false;
3838
bool SeenV2 = false;
3840
for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
3841
int Idx = SVOp->getMaskElt(i);
3842
// Ignore undef indices
3851
// Only accept consecutive elements from the same vector
3852
if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
3856
OpNum = SeenV1 ? 0 : 1;
3860
/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
3862
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3863
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3864
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3865
unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3866
false /* check zeros from right */, DAG);
3872
// Considering the elements in the mask that are not consecutive zeros,
3873
// check if they consecutively come from only one of the source vectors.
3875
// V1 = {X, A, B, C} 0
3877
// vector_shuffle V1, V2 <1, 2, 3, X>
3879
if (!isShuffleMaskConsecutive(SVOp,
3880
0, // Mask Start Index
3881
NumElems-NumZeros-1, // Mask End Index
3882
NumZeros, // Where to start looking in the src vector
3883
NumElems, // Number of elements in vector
3884
OpSrc)) // Which source operand ?
3889
ShVal = SVOp->getOperand(OpSrc);
3893
/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
3894
/// logical left shift of a vector.
3895
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3896
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3897
unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3898
unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3899
true /* check zeros from left */, DAG);
3905
// Considering the elements in the mask that are not consecutive zeros,
3906
// check if they consecutively come from only one of the source vectors.
3908
// 0 { A, B, X, X } = V2
3910
// vector_shuffle V1, V2 <X, X, 4, 5>
3912
if (!isShuffleMaskConsecutive(SVOp,
3913
NumZeros, // Mask Start Index
3914
NumElems-1, // Mask End Index
3915
0, // Where to start looking in the src vector
3916
NumElems, // Number of elements in vector
3917
OpSrc)) // Which source operand ?
3922
ShVal = SVOp->getOperand(OpSrc);
3926
/// isVectorShift - Returns true if the shuffle can be implemented as a
3927
/// logical left or right shift of a vector.
3928
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3929
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3930
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
3931
isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
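// For illustration (hypothetical shuffle): a v4i32 vector_shuffle V1, Zero
// with mask <4, 0, 1, 2> has a single zero coming in at the low end followed
// by V1's elements in order, so isVectorShift reports a left shift of V1 by
// one element; that is later turned into a PSLLDQ-style whole-vector byte
// shift via getVShift.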
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3939
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3940
unsigned NumNonZero, unsigned NumZero,
3942
const TargetLowering &TLI) {
3946
DebugLoc dl = Op.getDebugLoc();
3949
for (unsigned i = 0; i < 16; ++i) {
3950
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3951
if (ThisIsNonZero && First) {
3953
V = getZeroVector(MVT::v8i16, true, DAG, dl);
3955
V = DAG.getUNDEF(MVT::v8i16);
3960
SDValue ThisElt(0, 0), LastElt(0, 0);
3961
bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3962
if (LastIsNonZero) {
3963
LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3964
MVT::i16, Op.getOperand(i-1));
3966
if (ThisIsNonZero) {
3967
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3968
ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3969
ThisElt, DAG.getConstant(8, MVT::i8));
3971
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3975
if (ThisElt.getNode())
3976
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3977
DAG.getIntPtrConstant(i/2));
3981
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3984
/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3986
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3987
unsigned NumNonZero, unsigned NumZero,
3989
const TargetLowering &TLI) {
3993
DebugLoc dl = Op.getDebugLoc();
3996
for (unsigned i = 0; i < 8; ++i) {
3997
bool isNonZero = (NonZeros & (1 << i)) != 0;
4001
V = getZeroVector(MVT::v8i16, true, DAG, dl);
4003
V = DAG.getUNDEF(MVT::v8i16);
4006
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
4007
MVT::v8i16, V, Op.getOperand(i),
4008
DAG.getIntPtrConstant(i));
4015
/// getVShift - Return a vector logical shift node.
4017
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
4018
unsigned NumBits, SelectionDAG &DAG,
4019
const TargetLowering &TLI, DebugLoc dl) {
4020
bool isMMX = VT.getSizeInBits() == 64;
4021
EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
4022
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
4023
SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
4024
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4025
DAG.getNode(Opc, dl, ShVT, SrcOp,
4026
DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
4030
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
4031
SelectionDAG &DAG) const {
4033
// Check if the scalar load can be widened into a vector load. And if
4034
// the address is "base + cst" see if the cst can be "absorbed" into
4035
// the shuffle mask.
4036
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
4037
SDValue Ptr = LD->getBasePtr();
4038
if (!ISD::isNormalLoad(LD) || LD->isVolatile())
4040
EVT PVT = LD->getValueType(0);
4041
if (PVT != MVT::i32 && PVT != MVT::f32)
4046
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
4047
FI = FINode->getIndex();
4049
} else if (Ptr.getOpcode() == ISD::ADD &&
4050
isa<ConstantSDNode>(Ptr.getOperand(1)) &&
4051
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
4052
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
4053
Offset = Ptr.getConstantOperandVal(1);
4054
Ptr = Ptr.getOperand(0);
4059
SDValue Chain = LD->getChain();
4060
// Make sure the stack object alignment is at least 16.
4061
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
4062
if (DAG.InferPtrAlignment(Ptr) < 16) {
4063
if (MFI->isFixedObjectIndex(FI)) {
4064
// Can't change the alignment. FIXME: It's possible to compute
4065
// the exact stack offset and reference FI + adjust offset instead.
4066
// If someone *really* cares about this. That's the way to implement it.
4069
MFI->setObjectAlignment(FI, 16);
4073
// (Offset % 16) must be a multiple of 4. The address is then
// Ptr + (Offset & ~15).
4077
if ((Offset % 16) & 3)
4079
int64_t StartOffset = Offset & ~15;
4081
Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4082
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
4084
int EltNo = (Offset - StartOffset) >> 2;
4085
int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
4086
EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
4087
SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
4089
// Canonicalize it to a v4i32 shuffle.
4090
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
4091
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4092
DAG.getVectorShuffle(MVT::v4i32, dl, V1,
4093
DAG.getUNDEF(MVT::v4i32), &Mask[0]));
4099
/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4100
/// vector of type 'VT', see if the elements can be replaced by a single large
4101
/// load which has the same value as a build_vector whose operands are 'elts'.
4103
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4105
/// FIXME: we'd also like to handle the case where the last elements are zero
4106
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4107
/// There's even a handy isZeroNode for that purpose.
4108
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4109
DebugLoc &dl, SelectionDAG &DAG) {
4110
EVT EltVT = VT.getVectorElementType();
4111
unsigned NumElems = Elts.size();
4113
LoadSDNode *LDBase = NULL;
4114
unsigned LastLoadedElt = -1U;
4116
// For each element in the initializer, see if we've found a load or an undef.
4117
// If we don't find an initial load element, or later load elements are
4118
// non-consecutive, bail out.
4119
for (unsigned i = 0; i < NumElems; ++i) {
4120
SDValue Elt = Elts[i];
4122
if (!Elt.getNode() ||
4123
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4126
if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4128
LDBase = cast<LoadSDNode>(Elt.getNode());
4132
if (Elt.getOpcode() == ISD::UNDEF)
4135
LoadSDNode *LD = cast<LoadSDNode>(Elt);
4136
if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4141
// If we have found an entire vector of loads and undefs, then return a large
4142
// load of the entire vector width starting at the base pointer. If we found
4143
// consecutive loads for the low half, generate a vzext_load node.
4144
if (LastLoadedElt == NumElems - 1) {
4145
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4146
return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
4147
LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
4148
LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
4149
return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
4150
LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
4151
LDBase->isVolatile(), LDBase->isNonTemporal(),
4152
LDBase->getAlignment());
4153
} else if (NumElems == 4 && LastLoadedElt == 1) {
4154
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4155
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4156
SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
4157
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
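// For illustration (hypothetical IR): a v4i32 build_vector whose operands are
// four loads from p, p+4, p+8 and p+12 is replaced by a single v4i32 load of
// p; if only the first two elements are consecutive loads and the remaining
// elements are undef, a VZEXT_LOAD of the low 8 bytes is emitted instead.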
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
4164
DebugLoc dl = Op.getDebugLoc();
4165
// All zero's are handled with pxor in SSE2 and above, xorps in SSE1.
4166
// All one's are handled with pcmpeqd. In AVX, zero's are handled with
4167
// vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
4168
// is present, so AllOnes is ignored.
4169
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
4170
(Op.getValueType().getSizeInBits() != 256 &&
4171
ISD::isBuildVectorAllOnes(Op.getNode()))) {
4172
// Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
4173
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
4174
// eliminated on x86-32 hosts.
4175
if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
4178
if (ISD::isBuildVectorAllOnes(Op.getNode()))
4179
return getOnesVector(Op.getValueType(), DAG, dl);
4180
return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
4183
EVT VT = Op.getValueType();
4184
EVT ExtVT = VT.getVectorElementType();
4185
unsigned EVTBits = ExtVT.getSizeInBits();
4187
unsigned NumElems = Op.getNumOperands();
4188
unsigned NumZero = 0;
4189
unsigned NumNonZero = 0;
4190
unsigned NonZeros = 0;
4191
bool IsAllConstants = true;
4192
SmallSet<SDValue, 8> Values;
4193
for (unsigned i = 0; i < NumElems; ++i) {
4194
SDValue Elt = Op.getOperand(i);
4195
if (Elt.getOpcode() == ISD::UNDEF)
4198
if (Elt.getOpcode() != ISD::Constant &&
4199
Elt.getOpcode() != ISD::ConstantFP)
4200
IsAllConstants = false;
4201
if (X86::isZeroNode(Elt))
4204
NonZeros |= (1 << i);
4209
// All undef vector. Return an UNDEF. All zero vectors were handled above.
4210
if (NumNonZero == 0)
4211
return DAG.getUNDEF(VT);
4213
// Special case for single non-zero, non-undef, element.
4214
if (NumNonZero == 1) {
4215
unsigned Idx = CountTrailingZeros_32(NonZeros);
4216
SDValue Item = Op.getOperand(Idx);
4218
// If this is an insertion of an i64 value on x86-32, and if the top bits of
4219
// the value are obviously zero, truncate the value to i32 and do the
4220
// insertion that way. Only do this if the value is non-constant or if the
4221
// value is a constant being inserted into element 0. It is cheaper to do
4222
// a constant pool load than it is to do a movd + shuffle.
4223
if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
4224
(!IsAllConstants || Idx == 0)) {
4225
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
4226
// Handle MMX and SSE both.
4227
EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
4228
unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
4230
// Truncate the value (which may itself be a constant) to i32, and
4231
// convert it to a vector with movd (S2V+shuffle to zero extend).
4232
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
4233
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
4234
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
4235
Subtarget->hasSSE2(), DAG);
4237
// Now we have our 32-bit value zero extended in the low element of
4238
// a vector. If Idx != 0, swizzle it into place.
4240
SmallVector<int, 4> Mask;
4241
Mask.push_back(Idx);
4242
for (unsigned i = 1; i != VecElts; ++i)
4244
Item = DAG.getVectorShuffle(VecVT, dl, Item,
4245
DAG.getUNDEF(Item.getValueType()),
4248
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
4252
// If we have a constant or non-constant insertion into the low element of
4253
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
4254
// the rest of the elements. This will be matched as movd/movq/movss/movsd
4255
// depending on what the source datatype is.
4258
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4259
} else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
4260
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
4261
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4262
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
4263
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
4265
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
4266
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
4267
EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
4268
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
4269
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
4270
Subtarget->hasSSE2(), DAG);
4271
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
4275
// Is it a vector logical left shift?
4276
if (NumElems == 2 && Idx == 1 &&
4277
X86::isZeroNode(Op.getOperand(0)) &&
4278
!X86::isZeroNode(Op.getOperand(1))) {
4279
unsigned NumBits = VT.getSizeInBits();
4280
return getVShift(true, VT,
4281
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4282
VT, Op.getOperand(1)),
4283
NumBits/2, DAG, *this, dl);
4286
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
4289
// Otherwise, if this is a vector with i32 or f32 elements, and the element
4290
// is a non-constant being inserted into an element other than the low one,
4291
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
4292
// movd/movss) to move this into the low element, then shuffle it into
4294
if (EVTBits == 32) {
4295
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
4297
// Turn it into a shuffle of zero and zero-extended scalar to vector.
4298
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
4299
Subtarget->hasSSE2(), DAG);
4300
SmallVector<int, 8> MaskVec;
4301
for (unsigned i = 0; i < NumElems; i++)
4302
MaskVec.push_back(i == Idx ? 0 : 1);
4303
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
4307
// Splat is obviously ok. Let legalizer expand it to a shuffle.
4308
if (Values.size() == 1) {
4309
if (EVTBits == 32) {
4310
// Instead of a shuffle like this:
4311
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
4312
// Check if it's possible to issue this instead.
4313
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
4314
unsigned Idx = CountTrailingZeros_32(NonZeros);
4315
SDValue Item = Op.getOperand(Idx);
4316
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
4317
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
4322
// A vector full of immediates; various special cases are already
4323
// handled, so this is best done with a single constant-pool load.
4327
// Let legalizer expand 2-wide build_vectors.
4328
if (EVTBits == 64) {
4329
if (NumNonZero == 1) {
4330
// One half is zero or undef.
4331
unsigned Idx = CountTrailingZeros_32(NonZeros);
4332
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
4333
Op.getOperand(Idx));
4334
return getShuffleVectorZeroOrUndef(V2, Idx, true,
4335
Subtarget->hasSSE2(), DAG);
4340
// If element VT is < 32 bits, convert it to inserts into a zero vector.
4341
if (EVTBits == 8 && NumElems == 16) {
4342
SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
4344
if (V.getNode()) return V;
4347
if (EVTBits == 16 && NumElems == 8) {
4348
SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
4350
if (V.getNode()) return V;
4353
// If element VT is == 32 bits, turn it into a number of shuffles.
4354
SmallVector<SDValue, 8> V;
4356
if (NumElems == 4 && NumZero > 0) {
4357
for (unsigned i = 0; i < 4; ++i) {
4358
bool isZero = !(NonZeros & (1 << i));
4360
V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4362
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4365
for (unsigned i = 0; i < 2; ++i) {
4366
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
4369
V[i] = V[i*2]; // Must be a zero vector.
4372
V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
4375
V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
4378
V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
4383
SmallVector<int, 8> MaskVec;
4384
bool Reverse = (NonZeros & 0x3) == 2;
4385
for (unsigned i = 0; i < 2; ++i)
4386
MaskVec.push_back(Reverse ? 1-i : i);
4387
Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
4388
for (unsigned i = 0; i < 2; ++i)
4389
MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
4390
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
4393
if (Values.size() > 1 && VT.getSizeInBits() == 128) {
4394
// Check for a build vector of consecutive loads.
4395
for (unsigned i = 0; i < NumElems; ++i)
4396
V[i] = Op.getOperand(i);
4398
// Check for elements which are consecutive loads.
4399
SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
4403
// For SSE 4.1, use insertps to put the high elements into the low element.
4404
if (getSubtarget()->hasSSE41()) {
4406
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
4407
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
4409
Result = DAG.getUNDEF(VT);
4411
for (unsigned i = 1; i < NumElems; ++i) {
4412
if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
4413
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
4414
Op.getOperand(i), DAG.getIntPtrConstant(i));
4419
// Otherwise, expand into a number of unpckl*, start by extending each of
4420
// our (non-undef) elements to the full vector width with the element in the
4421
// bottom slot of the vector (which generates no code for SSE).
4422
for (unsigned i = 0; i < NumElems; ++i) {
4423
if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
4424
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4426
V[i] = DAG.getUNDEF(VT);
4429
// Next, we iteratively mix elements, e.g. for v4f32:
4430
// Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
4431
// : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
4432
// Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
4433
unsigned EltStride = NumElems >> 1;
4434
while (EltStride != 0) {
4435
for (unsigned i = 0; i < EltStride; ++i) {
4436
// If V[i+EltStride] is undef and this is the first round of mixing,
4437
// then it is safe to just drop this shuffle: V[i] is already in the
4438
// right place, the one element (since it's the first round) being
4439
// inserted as undef can be dropped. This isn't safe for successive
4440
// rounds because they will permute elements within both vectors.
4441
if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
4442
EltStride == NumElems/2)
4445
V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
4455
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
4456
// We support concatenating two MMX registers and placing them in an MMX
// register. This is better than doing a stack convert.
4458
DebugLoc dl = Op.getDebugLoc();
4459
EVT ResVT = Op.getValueType();
4460
assert(Op.getNumOperands() == 2);
4461
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
4462
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
4464
SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
4465
SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4466
InVec = Op.getOperand(1);
4467
if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4468
unsigned NumElts = ResVT.getVectorNumElements();
4469
VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4470
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
4471
InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
4473
InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
4474
SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4475
Mask[0] = 0; Mask[1] = 2;
4476
VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4478
return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4481
// v8i16 shuffles - Prefer shuffles in the following order:
4482
// 1. [all] pshuflw, pshufhw, optional move
4483
// 2. [ssse3] 1 x pshufb
4484
// 3. [ssse3] 2 x pshufb + 1 x por
4485
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
4487
X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
4488
SelectionDAG &DAG) const {
4489
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4490
SDValue V1 = SVOp->getOperand(0);
4491
SDValue V2 = SVOp->getOperand(1);
4492
DebugLoc dl = SVOp->getDebugLoc();
4493
SmallVector<int, 8> MaskVals;
4495
// Determine if more than 1 of the words in each of the low and high quadwords
4496
// of the result come from the same quadword of one of the two inputs. Undef
4497
// mask values count as coming from any quadword, for better codegen.
4498
SmallVector<unsigned, 4> LoQuad(4);
4499
SmallVector<unsigned, 4> HiQuad(4);
4500
BitVector InputQuads(4);
4501
for (unsigned i = 0; i < 8; ++i) {
4502
SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4503
int EltIdx = SVOp->getMaskElt(i);
4504
MaskVals.push_back(EltIdx);
4513
InputQuads.set(EltIdx / 4);
4516
int BestLoQuad = -1;
4517
unsigned MaxQuad = 1;
4518
for (unsigned i = 0; i < 4; ++i) {
4519
if (LoQuad[i] > MaxQuad) {
4521
MaxQuad = LoQuad[i];
4525
int BestHiQuad = -1;
4527
for (unsigned i = 0; i < 4; ++i) {
4528
if (HiQuad[i] > MaxQuad) {
4530
MaxQuad = HiQuad[i];
4534
// For SSSE3, if all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
// single pshufb instruction is necessary. If there are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
4538
bool V1Used = InputQuads[0] || InputQuads[1];
4539
bool V2Used = InputQuads[2] || InputQuads[3];
4540
if (Subtarget->hasSSSE3()) {
4541
if (InputQuads.count() == 2 && V1Used && V2Used) {
4542
BestLoQuad = InputQuads.find_first();
4543
BestHiQuad = InputQuads.find_next(BestLoQuad);
4545
if (InputQuads.count() > 2) {
4551
// If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
4552
// the shuffle mask. If a quad is scored as -1, that means that it contains
4553
// words from all 4 input quadwords.
4555
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
4556
SmallVector<int, 8> MaskV;
4557
MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
4558
MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
4559
NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
4560
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
4561
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
4562
NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
4564
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
4565
// source words for the shuffle, to aid later transformations.
4566
bool AllWordsInNewV = true;
4567
bool InOrder[2] = { true, true };
4568
for (unsigned i = 0; i != 8; ++i) {
4569
int idx = MaskVals[i];
4571
InOrder[i/4] = false;
4572
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
4574
AllWordsInNewV = false;
4578
bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
4579
if (AllWordsInNewV) {
4580
for (int i = 0; i != 8; ++i) {
4581
int idx = MaskVals[i];
4584
idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
4585
if ((idx != i) && idx < 4)
4587
if ((idx != i) && idx > 3)
4596
// If we've eliminated the use of V2, and the new mask is a pshuflw or
4597
// pshufhw, that's as cheap as it gets. Return the new shuffle.
4598
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
4599
unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
4600
unsigned TargetMask = 0;
4601
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
4602
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
4603
TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
4604
X86::getShufflePSHUFLWImmediate(NewV.getNode());
4605
V1 = NewV.getOperand(0);
4606
return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
4610
// If we have SSSE3, and all words of the result are from 1 input vector,
4611
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
4612
// is present, fall back to case 4.
4613
if (Subtarget->hasSSSE3()) {
4614
SmallVector<SDValue,16> pshufbMask;
4616
// If we have elements from both input vectors, set the high bit of the
4617
// shuffle mask element to zero out elements that come from V2 in the V1
4618
// mask, and elements that come from V1 in the V2 mask, so that the two
4619
// results can be OR'd together.
4620
bool TwoInputs = V1Used && V2Used;
4621
for (unsigned i = 0; i != 8; ++i) {
4622
int EltIdx = MaskVals[i] * 2;
4623
if (TwoInputs && (EltIdx >= 16)) {
4624
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4625
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4628
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4629
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
4631
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
4632
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4633
DAG.getNode(ISD::BUILD_VECTOR, dl,
4634
MVT::v16i8, &pshufbMask[0], 16));
4636
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4638
// Calculate the shuffle mask for the second input, shuffle it, and
4639
// OR it with the first shuffled input.
4641
for (unsigned i = 0; i != 8; ++i) {
4642
int EltIdx = MaskVals[i] * 2;
4644
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4645
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4648
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4649
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
4651
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
4652
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4653
DAG.getNode(ISD::BUILD_VECTOR, dl,
4654
MVT::v16i8, &pshufbMask[0], 16));
4655
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4656
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4659
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
4660
// and update MaskVals with new element order.
4661
BitVector InOrder(8);
4662
if (BestLoQuad >= 0) {
4663
SmallVector<int, 8> MaskV;
4664
for (int i = 0; i != 4; ++i) {
4665
int idx = MaskVals[i];
4667
MaskV.push_back(-1);
4669
} else if ((idx / 4) == BestLoQuad) {
4670
MaskV.push_back(idx & 3);
4673
MaskV.push_back(-1);
4676
for (unsigned i = 4; i != 8; ++i)
4678
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4681
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
4682
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
4684
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
4688
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
4689
// and update MaskVals with the new element order.
4690
if (BestHiQuad >= 0) {
4691
SmallVector<int, 8> MaskV;
4692
for (unsigned i = 0; i != 4; ++i)
4694
for (unsigned i = 4; i != 8; ++i) {
4695
int idx = MaskVals[i];
4697
MaskV.push_back(-1);
4699
} else if ((idx / 4) == BestHiQuad) {
4700
MaskV.push_back((idx & 3) + 4);
4703
MaskV.push_back(-1);
4706
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4709
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
4710
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
4712
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
4716
// In case BestHi & BestLo were both -1, which means each quadword has a word
4717
// from each of the four input quadwords, calculate the InOrder bitvector now
4718
// before falling through to the insert/extract cleanup.
4719
if (BestLoQuad == -1 && BestHiQuad == -1) {
4721
for (int i = 0; i != 8; ++i)
4722
if (MaskVals[i] < 0 || MaskVals[i] == i)
4726
// The other elements are put in the right place using pextrw and pinsrw.
4727
for (unsigned i = 0; i != 8; ++i) {
4730
int EltIdx = MaskVals[i];
4733
SDValue ExtOp = (EltIdx < 8)
4734
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4735
DAG.getIntPtrConstant(EltIdx))
4736
: DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4737
DAG.getIntPtrConstant(EltIdx - 8));
4738
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4739
DAG.getIntPtrConstant(i));
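// For illustration (hypothetical mask): the v8i16 shuffle mask
// <0, 2, 1, 3, 4, 5, 6, 7> takes every word from quadword 0 of V1 and leaves
// the high quadword in place, so the lowering above emits a single pshuflw;
// masks that also permute the high words add a pshufhw, and only the
// remaining out-of-place words fall back to pextrw/pinsrw.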
// v16i8 shuffles - Prefer shuffles in the following order:
4745
// 1. [ssse3] 1 x pshufb
4746
// 2. [ssse3] 2 x pshufb + 1 x por
4747
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
4749
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4751
const X86TargetLowering &TLI) {
4752
SDValue V1 = SVOp->getOperand(0);
4753
SDValue V2 = SVOp->getOperand(1);
4754
DebugLoc dl = SVOp->getDebugLoc();
4755
SmallVector<int, 16> MaskVals;
4756
SVOp->getMask(MaskVals);
4758
// If we have SSSE3, case 1 is generated when all result bytes come from
4759
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
4760
// present, fall back to case 3.
4761
// FIXME: kill V2Only once shuffles are canonicalized by getNode.
4764
for (unsigned i = 0; i < 16; ++i) {
4765
int EltIdx = MaskVals[i];
4774
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
4775
if (TLI.getSubtarget()->hasSSSE3()) {
4776
SmallVector<SDValue,16> pshufbMask;
4778
// If all result elements are from one input vector, then only translate
4779
// undef mask values to 0x80 (zero out result) in the pshufb mask.
4781
// Otherwise, we have elements from both input vectors, and must zero out
4782
// elements that come from V2 in the first mask, and V1 in the second mask
4783
// so that we can OR them together.
4784
bool TwoInputs = !(V1Only || V2Only);
4785
for (unsigned i = 0; i != 16; ++i) {
4786
int EltIdx = MaskVals[i];
4787
if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4788
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4791
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4793
// If all the elements are from V2, assign it to V1 and return after
4794
// building the first pshufb.
4797
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4798
DAG.getNode(ISD::BUILD_VECTOR, dl,
4799
MVT::v16i8, &pshufbMask[0], 16));
4803
// Calculate the shuffle mask for the second input, shuffle it, and
4804
// OR it with the first shuffled input.
4806
for (unsigned i = 0; i != 16; ++i) {
4807
int EltIdx = MaskVals[i];
4809
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4812
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4814
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4815
DAG.getNode(ISD::BUILD_VECTOR, dl,
4816
MVT::v16i8, &pshufbMask[0], 16));
4817
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4820
// No SSSE3 - Calculate the in-place words first, then fix all out-of-place
// words with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
// the 16 different words that comprise the two doublequadword input vectors.
4823
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4824
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4825
SDValue NewV = V2Only ? V2 : V1;
4826
for (int i = 0; i != 8; ++i) {
4827
int Elt0 = MaskVals[i*2];
4828
int Elt1 = MaskVals[i*2+1];
4830
// This word of the result is all undef, skip it.
4831
if (Elt0 < 0 && Elt1 < 0)
4834
// This word of the result is already in the correct place, skip it.
4835
if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4837
if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4840
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4841
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4844
// If Elt0 and Elt1 are defined and consecutive, they can be fetched with a
// single word extract; extract the word once and insert it into the result.
4846
if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4847
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4848
DAG.getIntPtrConstant(Elt1 / 2));
4849
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4850
DAG.getIntPtrConstant(i));
4854
// If Elt1 is defined, extract it from the appropriate source. If the
4855
// source byte is not also odd, shift the extracted word left 8 bits
4856
// otherwise clear the bottom 8 bits if we need to do an or.
4858
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4859
DAG.getIntPtrConstant(Elt1 / 2));
4860
if ((Elt1 & 1) == 0)
4861
InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4862
DAG.getConstant(8, TLI.getShiftAmountTy()));
4864
InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4865
DAG.getConstant(0xFF00, MVT::i16));
4867
// If Elt0 is defined, extract it from the appropriate source. If the
4868
// source byte is not also even, shift the extracted word right 8 bits. If
4869
// Elt1 was also defined, OR the extracted values together before
4870
// inserting them in the result.
4872
SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4873
Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4874
if ((Elt0 & 1) != 0)
4875
InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4876
DAG.getConstant(8, TLI.getShiftAmountTy()));
4878
InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4879
DAG.getConstant(0x00FF, MVT::i16));
4880
InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4883
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4884
DAG.getIntPtrConstant(i));
4886
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4889
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4890
/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be
4891
/// done when every pair / quad of shuffle mask elements point to elements in
4892
/// the right sequence. e.g.
4893
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
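/// A concrete illustration (not from the original example): the v8i16 mask
/// < 2, 3, | 10, 11, | 0, 1, | 14, 15 > can be rewritten as the v4i32 mask
/// < 1, 5, 0, 7 > on bitcast operands, because every pair selects one whole
/// 32-bit element.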
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4897
const TargetLowering &TLI, DebugLoc dl) {
4898
EVT VT = SVOp->getValueType(0);
4899
SDValue V1 = SVOp->getOperand(0);
4900
SDValue V2 = SVOp->getOperand(1);
4901
unsigned NumElems = VT.getVectorNumElements();
4902
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4903
EVT MaskVT = (NewWidth == 4) ? MVT::v4i16 : MVT::v2i32;
4905
switch (VT.getSimpleVT().SimpleTy) {
4906
default: assert(false && "Unexpected!");
4907
case MVT::v4f32: NewVT = MVT::v2f64; break;
4908
case MVT::v4i32: NewVT = MVT::v2i64; break;
4909
case MVT::v8i16: NewVT = MVT::v4i32; break;
4910
case MVT::v16i8: NewVT = MVT::v4i32; break;
4913
if (NewWidth == 2) {
4919
int Scale = NumElems / NewWidth;
4920
SmallVector<int, 8> MaskVec;
4921
for (unsigned i = 0; i < NumElems; i += Scale) {
4923
for (int j = 0; j < Scale; ++j) {
4924
int EltIdx = SVOp->getMaskElt(i+j);
4928
StartIdx = EltIdx - (EltIdx % Scale);
4929
if (EltIdx != StartIdx + j)
4933
MaskVec.push_back(-1);
4935
MaskVec.push_back(StartIdx / Scale);
4938
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4939
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4940
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4943
/// getVZextMovL - Return a zero-extending vector move low node.
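/// For illustration (an assumption about the node's semantics): a VZEXT_MOVL
/// of a v4i32 value X produces < X[0], 0, 0, 0 >, i.e. the low element is
/// kept and the remaining elements are zeroed, which is what the movd / movq
/// style "move low and zero" instructions provide.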
static SDValue getVZextMovL(EVT VT, EVT OpVT,
4946
SDValue SrcOp, SelectionDAG &DAG,
4947
const X86Subtarget *Subtarget, DebugLoc dl) {
4948
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4949
LoadSDNode *LD = NULL;
4950
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4951
LD = dyn_cast<LoadSDNode>(SrcOp);
4953
// movssrr and movsdrr do not clear top bits. Try to use movd, movq
4955
MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4956
if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4957
SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4958
SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4959
SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4961
OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4962
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4963
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4964
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4972
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4973
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4974
DAG.getNode(ISD::BIT_CONVERT, dl,
4978
/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4981
LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4982
SDValue V1 = SVOp->getOperand(0);
4983
SDValue V2 = SVOp->getOperand(1);
4984
DebugLoc dl = SVOp->getDebugLoc();
4985
EVT VT = SVOp->getValueType(0);
4987
SmallVector<std::pair<int, int>, 8> Locs;
4989
SmallVector<int, 8> Mask1(4U, -1);
4990
SmallVector<int, 8> PermMask;
4991
SVOp->getMask(PermMask);
4995
for (unsigned i = 0; i != 4; ++i) {
4996
int Idx = PermMask[i];
4998
Locs[i] = std::make_pair(-1, -1);
5000
assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
5002
Locs[i] = std::make_pair(0, NumLo);
5006
Locs[i] = std::make_pair(1, NumHi);
5008
Mask1[2+NumHi] = Idx;
5014
if (NumLo <= 2 && NumHi <= 2) {
5015
// If no more than two elements come from either vector, this can be
// implemented with two shuffles. The first shuffle gathers the elements.
// The second shuffle, which takes the first shuffle as both of its
// vector operands, puts the elements into the right order.
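// Worked example (illustration only): for the 4-wide mask <0, 4, 1, 5> the
// first shuffle uses Mask1 = <0, 1, 4, 5> and produces
// <V1[0], V1[1], V2[0], V2[1]>; the second shuffle then reorders that single
// vector with Mask2 = <0, 2, 1, 7> to give the requested
// <V1[0], V2[0], V1[1], V2[1]>.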
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5021
SmallVector<int, 8> Mask2(4U, -1);
5023
for (unsigned i = 0; i != 4; ++i) {
5024
if (Locs[i].first == -1)
5027
unsigned Idx = (i < 2) ? 0 : 4;
5028
Idx += Locs[i].first * 2 + Locs[i].second;
5033
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
5034
} else if (NumLo == 3 || NumHi == 3) {
5035
// Otherwise, we must have three elements from one vector, call it X, and
5036
// one element from the other, call it Y. First, use a shufps to build an
5037
// intermediate vector with the one element from Y and the element from X
5038
// that will be in the same half in the final destination (the indexes don't
5039
// matter). Then, use a shufps to build the final vector, taking the half
5040
// containing the element from Y from the intermediate, and the other half
5043
// Normalize it so the 3 elements come from V1.
5044
CommuteVectorShuffleMask(PermMask, VT);
5048
// Find the element from V2.
5050
for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
5051
int Val = PermMask[HiIndex];
5058
Mask1[0] = PermMask[HiIndex];
5060
Mask1[2] = PermMask[HiIndex^1];
5062
V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5065
Mask1[0] = PermMask[0];
5066
Mask1[1] = PermMask[1];
5067
Mask1[2] = HiIndex & 1 ? 6 : 4;
5068
Mask1[3] = HiIndex & 1 ? 4 : 6;
5069
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
5071
Mask1[0] = HiIndex & 1 ? 2 : 0;
5072
Mask1[1] = HiIndex & 1 ? 0 : 2;
5073
Mask1[2] = PermMask[2];
5074
Mask1[3] = PermMask[3];
5079
return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
5083
// Break it into (shuffle shuffle_hi, shuffle_lo).
5085
SmallVector<int,8> LoMask(4U, -1);
5086
SmallVector<int,8> HiMask(4U, -1);
5088
SmallVector<int,8> *MaskPtr = &LoMask;
5089
unsigned MaskIdx = 0;
5092
for (unsigned i = 0; i != 4; ++i) {
5099
int Idx = PermMask[i];
5101
Locs[i] = std::make_pair(-1, -1);
5102
} else if (Idx < 4) {
5103
Locs[i] = std::make_pair(MaskIdx, LoIdx);
5104
(*MaskPtr)[LoIdx] = Idx;
5107
Locs[i] = std::make_pair(MaskIdx, HiIdx);
5108
(*MaskPtr)[HiIdx] = Idx;
5113
SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
5114
SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
5115
SmallVector<int, 8> MaskOps;
5116
for (unsigned i = 0; i != 4; ++i) {
5117
if (Locs[i].first == -1) {
5118
MaskOps.push_back(-1);
5120
unsigned Idx = Locs[i].first * 4 + Locs[i].second;
5121
MaskOps.push_back(Idx);
5124
return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
5127
static bool MayFoldVectorLoad(SDValue V) {
5128
if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
5129
V = V.getOperand(0);
5130
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5131
V = V.getOperand(0);
5138
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
5140
SDValue V1 = Op.getOperand(0);
5141
SDValue V2 = Op.getOperand(1);
5142
EVT VT = Op.getValueType();
5144
assert(VT != MVT::v2i64 && "unsupported shuffle type");
5146
if (HasSSE2 && VT == MVT::v2f64)
5147
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
5150
return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
5154
SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
5155
SDValue V1 = Op.getOperand(0);
5156
SDValue V2 = Op.getOperand(1);
5157
EVT VT = Op.getValueType();
5159
assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
5160
"unsupported shuffle type");
5162
if (V2.getOpcode() == ISD::UNDEF)
5166
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
5170
SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
5171
SDValue V1 = Op.getOperand(0);
5172
SDValue V2 = Op.getOperand(1);
5173
EVT VT = Op.getValueType();
5174
unsigned NumElems = VT.getVectorNumElements();
5176
// Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
// operand of these instructions is only memory, so check if there's a
// potential load folding here, otherwise use SHUFPS or MOVSD to match the
5180
bool CanFoldLoad = false;
5182
// Trivial case, when V2 comes from a load.
5183
if (MayFoldVectorLoad(V2))
5186
// When V1 is a load, it can be folded later into a store in isel, example:
5187
// (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
5189
// (MOVLPSmr addr:$src1, VR128:$src2)
5190
// So, recognize this potential and also use MOVLPS or MOVLPD
5191
if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
5195
if (HasSSE2 && NumElems == 2)
5196
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
5199
return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
5202
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5203
// movl and movlp will both match v2i64, but v2i64 is never matched by
// movl earlier because we make it strict to avoid messing with the movlp load
// folding logic (see the code above the getMOVLP call). Match it here instead;
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
5209
if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
5210
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5212
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5215
assert(VT != MVT::v4i32 && "unsupported shuffle type");
5217
// Invert the operand order and use SHUFPS to match it.
5218
return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
5219
X86::getShuffleSHUFImmediate(SVOp), DAG);
5222
static inline unsigned getUNPCKLOpcode(EVT VT) {
5223
switch(VT.getSimpleVT().SimpleTy) {
5224
case MVT::v4i32: return X86ISD::PUNPCKLDQ;
5225
case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
5226
case MVT::v4f32: return X86ISD::UNPCKLPS;
5227
case MVT::v2f64: return X86ISD::UNPCKLPD;
5228
case MVT::v16i8: return X86ISD::PUNPCKLBW;
5229
case MVT::v8i16: return X86ISD::PUNPCKLWD;
5231
llvm_unreachable("Unknow type for unpckl");
5236
static inline unsigned getUNPCKHOpcode(EVT VT) {
5237
switch(VT.getSimpleVT().SimpleTy) {
5238
case MVT::v4i32: return X86ISD::PUNPCKHDQ;
5239
case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
5240
case MVT::v4f32: return X86ISD::UNPCKHPS;
5241
case MVT::v2f64: return X86ISD::UNPCKHPD;
5242
case MVT::v16i8: return X86ISD::PUNPCKHBW;
5243
case MVT::v8i16: return X86ISD::PUNPCKHWD;
5245
llvm_unreachable("Unknow type for unpckh");
5251
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
5252
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5253
SDValue V1 = Op.getOperand(0);
5254
SDValue V2 = Op.getOperand(1);
5255
EVT VT = Op.getValueType();
5256
DebugLoc dl = Op.getDebugLoc();
5257
unsigned NumElems = VT.getVectorNumElements();
5258
bool isMMX = VT.getSizeInBits() == 64;
5259
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
5260
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
5261
bool V1IsSplat = false;
5262
bool V2IsSplat = false;
5263
bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
5264
bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
5265
MachineFunction &MF = DAG.getMachineFunction();
5266
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
5268
if (isZeroShuffle(SVOp))
5269
return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
5271
// Promote splats to v4f32.
5272
if (SVOp->isSplat()) {
5273
if (isMMX || NumElems < 4)
5275
return PromoteSplat(SVOp, DAG);
5278
// If the shuffle can be profitably rewritten as a narrower shuffle, then
5280
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
5281
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
5282
if (NewOp.getNode())
5283
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5284
LowerVECTOR_SHUFFLE(NewOp, DAG));
5285
} else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
5286
// FIXME: Figure out a cleaner way to do this.
5287
// Try to make use of movq to zero out the top part.
5288
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
5289
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
5290
if (NewOp.getNode()) {
5291
if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
5292
return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
5293
DAG, Subtarget, dl);
5295
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
5296
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
5297
if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
5298
return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
5299
DAG, Subtarget, dl);
5303
// NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
5304
// unpckh_undef). Only use pshufd if speed is more important than size.
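// (Size note, an observation rather than something stated here: unlike
// pshufd, the unpck forms do not need an immediate byte, so they are the
// smaller encoding when both can implement the mask.)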
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
5306
if (VT != MVT::v2i64 && VT != MVT::v2f64)
5307
return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
5308
if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
5309
if (VT != MVT::v2i64 && VT != MVT::v2f64)
5310
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
5312
if (X86::isPSHUFDMask(SVOp)) {
5313
// The actual implementation will match the mask in the if above and then
// during isel it can match several different instructions, not only pshufd
// as its name says. Sad but true; emulate the behavior for now...
5316
if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
5317
return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
5319
unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
5321
if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
5322
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
5324
if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5325
return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
5328
if (VT == MVT::v4f32)
5329
return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
5333
// Check if this can be converted into a logical shift.
5334
bool isLeft = false;
5337
bool isShift = getSubtarget()->hasSSE2() &&
5338
isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
5339
if (isShift && ShVal.hasOneUse()) {
5340
// If the shifted value has multiple uses, it may be cheaper to use
5341
// v_set0 + movlhps or movhlps, etc.
5342
EVT EltVT = VT.getVectorElementType();
5343
ShAmt *= EltVT.getSizeInBits();
5344
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
5347
if (X86::isMOVLMask(SVOp)) {
5350
if (ISD::isBuildVectorAllZeros(V1.getNode()))
5351
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
5352
if (!isMMX && !X86::isMOVLPMask(SVOp)) {
5353
if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5354
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5356
if (VT == MVT::v4i32 || VT == MVT::v4f32)
5357
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5361
// FIXME: fold these into legal mask.
5363
if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
5364
return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
5366
if (X86::isMOVHLPSMask(SVOp))
5367
return getMOVHighToLow(Op, dl, DAG);
5369
if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
5370
return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
5372
if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
5373
return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
5375
if (X86::isMOVLPMask(SVOp))
5376
return getMOVLP(Op, dl, DAG, HasSSE2);
5379
if (ShouldXformToMOVHLPS(SVOp) ||
5380
ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
5381
return CommuteVectorShuffle(SVOp, DAG);
5384
// No better options. Use a vshl / vsrl.
5385
EVT EltVT = VT.getVectorElementType();
5386
ShAmt *= EltVT.getSizeInBits();
5387
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
5390
bool Commuted = false;
5391
// FIXME: This should also accept a bitcast of a splat? Be careful, not
5392
// 1,1,1,1 -> v8i16 though.
5393
V1IsSplat = isSplatVector(V1.getNode());
5394
V2IsSplat = isSplatVector(V2.getNode());
5396
// Canonicalize the splat or undef, if present, to be on the RHS.
5397
if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
5398
Op = CommuteVectorShuffle(SVOp, DAG);
5399
SVOp = cast<ShuffleVectorSDNode>(Op);
5400
V1 = SVOp->getOperand(0);
5401
V2 = SVOp->getOperand(1);
5402
std::swap(V1IsSplat, V2IsSplat);
5403
std::swap(V1IsUndef, V2IsUndef);
5407
if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
5408
// Shuffling low element of v1 into undef, just return v1.
5411
// If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
5412
// the instruction selector will not match, so get a canonical MOVL with
5413
// swapped operands to undo the commute.
5414
return getMOVL(DAG, dl, VT, V2, V1);
5417
if (X86::isUNPCKL_v_undef_Mask(SVOp) || X86::isUNPCKLMask(SVOp))
5419
Op : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
5421
if (X86::isUNPCKH_v_undef_Mask(SVOp) || X86::isUNPCKHMask(SVOp))
5423
Op : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
5426
// Normalize the mask so all entries that point to V2 point to its first
// element, then try to match unpck{h|l} again. If a match is found, return a
// new vector_shuffle with the corrected mask.
5429
SDValue NewMask = NormalizeMask(SVOp, DAG);
5430
ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
5431
if (NSVOp != SVOp) {
5432
if (X86::isUNPCKLMask(NSVOp, true)) {
5434
} else if (X86::isUNPCKHMask(NSVOp, true)) {
5441
// Commute it back and try unpck* again.
5442
// FIXME: this seems wrong.
5443
SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
5444
ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
5446
if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || X86::isUNPCKLMask(NewSVOp))
5448
NewOp : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
5450
if (X86::isUNPCKH_v_undef_Mask(NewSVOp) || X86::isUNPCKHMask(NewSVOp))
5452
NewOp : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
5455
// FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
5457
// Normalize the node to match x86 shuffle ops if needed
5458
if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
5459
return CommuteVectorShuffle(SVOp, DAG);
5461
// The checks below are all present in isShuffleMaskLegal, but they are
5462
// inlined here right now to enable us to directly emit target specific
5463
// nodes, and remove one by one until they don't return Op anymore.
5464
SmallVector<int, 16> M;
5467
// Very little shuffling can be done for 64-bit vectors right now.
5468
if (VT.getSizeInBits() == 64)
5469
return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ? Op : SDValue();
5471
// FIXME: pshufb, blends, shifts.
5472
if (VT.getVectorNumElements() == 2 ||
5473
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
5474
isPALIGNRMask(M, VT, Subtarget->hasSSSE3()))
5477
if (isPSHUFHWMask(M, VT))
5478
return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
5479
X86::getShufflePSHUFHWImmediate(SVOp),
5482
if (isPSHUFLWMask(M, VT))
5483
return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
5484
X86::getShufflePSHUFLWImmediate(SVOp),
5487
if (isSHUFPMask(M, VT)) {
5488
unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
5489
if (VT == MVT::v4f32 || VT == MVT::v4i32)
5490
return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
5492
if (VT == MVT::v2f64 || VT == MVT::v2i64)
5493
return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
5497
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
5498
if (VT == MVT::v8i16) {
5499
SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
5500
if (NewOp.getNode())
5504
if (VT == MVT::v16i8) {
5505
SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
5506
if (NewOp.getNode())
5510
// Handle all 4 wide cases with a number of shuffles except for MMX.
5511
if (NumElems == 4 && !isMMX)
5512
return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
5518
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
5519
SelectionDAG &DAG) const {
5520
EVT VT = Op.getValueType();
5521
DebugLoc dl = Op.getDebugLoc();
5522
if (VT.getSizeInBits() == 8) {
5523
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
5524
Op.getOperand(0), Op.getOperand(1));
5525
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
5526
DAG.getValueType(VT));
5527
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5528
} else if (VT.getSizeInBits() == 16) {
5529
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5530
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
5532
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
5533
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5534
DAG.getNode(ISD::BIT_CONVERT, dl,
5538
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
5539
Op.getOperand(0), Op.getOperand(1));
5540
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
5541
DAG.getValueType(VT));
5542
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5543
} else if (VT == MVT::f32) {
5544
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
5545
// the result back to FR32 register. It's only worth matching if the
5546
// result has a single use which is a store or a bitcast to i32. And in
5547
// the case of a store, it's not worth it if the index is a constant 0,
5548
// because a MOVSSmr can be used instead, which is smaller and faster.
5549
if (!Op.hasOneUse())
5551
SDNode *User = *Op.getNode()->use_begin();
5552
if ((User->getOpcode() != ISD::STORE ||
5553
(isa<ConstantSDNode>(Op.getOperand(1)) &&
5554
cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
5555
(User->getOpcode() != ISD::BIT_CONVERT ||
5556
User->getValueType(0) != MVT::i32))
5558
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5559
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
5562
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
5563
} else if (VT == MVT::i32) {
5564
// ExtractPS works with constant index.
5565
if (isa<ConstantSDNode>(Op.getOperand(1)))
5573
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
5574
SelectionDAG &DAG) const {
5575
if (!isa<ConstantSDNode>(Op.getOperand(1)))
5578
if (Subtarget->hasSSE41()) {
5579
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
5584
EVT VT = Op.getValueType();
5585
DebugLoc dl = Op.getDebugLoc();
5586
// TODO: handle v16i8.
5587
if (VT.getSizeInBits() == 16) {
5588
SDValue Vec = Op.getOperand(0);
5589
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5591
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
5592
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5593
DAG.getNode(ISD::BIT_CONVERT, dl,
5596
// Transform it so it matches pextrw, which produces a 32-bit result.
5597
EVT EltVT = MVT::i32;
5598
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
5599
Op.getOperand(0), Op.getOperand(1));
5600
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
5601
DAG.getValueType(VT));
5602
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5603
} else if (VT.getSizeInBits() == 32) {
5604
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5608
// SHUFPS the element to the lowest double word, then movss.
5609
int Mask[4] = { Idx, -1, -1, -1 };
5610
EVT VVT = Op.getOperand(0).getValueType();
5611
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5612
DAG.getUNDEF(VVT), Mask);
5613
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5614
DAG.getIntPtrConstant(0));
5615
} else if (VT.getSizeInBits() == 64) {
5616
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
5617
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
5618
// to match extract_elt for f64.
5619
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5623
// UNPCKHPD the element to the lowest double word, then movsd.
5624
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
5625
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
5626
int Mask[2] = { 1, -1 };
5627
EVT VVT = Op.getOperand(0).getValueType();
5628
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5629
DAG.getUNDEF(VVT), Mask);
5630
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5631
DAG.getIntPtrConstant(0));
5638
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
5639
SelectionDAG &DAG) const {
5640
EVT VT = Op.getValueType();
5641
EVT EltVT = VT.getVectorElementType();
5642
DebugLoc dl = Op.getDebugLoc();
5644
SDValue N0 = Op.getOperand(0);
5645
SDValue N1 = Op.getOperand(1);
5646
SDValue N2 = Op.getOperand(2);
5648
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
5649
isa<ConstantSDNode>(N2)) {
5651
if (VT == MVT::v8i16)
5652
Opc = X86ISD::PINSRW;
5653
else if (VT == MVT::v4i16)
5654
Opc = X86ISD::MMX_PINSRW;
5655
else if (VT == MVT::v16i8)
5656
Opc = X86ISD::PINSRB;
5658
Opc = X86ISD::PINSRB;
5660
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument.
5662
if (N1.getValueType() != MVT::i32)
5663
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5664
if (N2.getValueType() != MVT::i32)
5665
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5666
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5667
} else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5668
// Bits [7:6] of the constant are the source select. This will always be
5669
// zero here. The DAG Combiner may combine an extract_elt index into these
5670
// bits. For example (insert (extract, 3), 2) could be matched by putting
5671
// the '3' into bits [7:6] of X86ISD::INSERTPS.
5672
// Bits [5:4] of the constant are the destination select. This is the
5673
// value of the incoming immediate.
5674
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
5675
// combine either bitwise AND or insert of float 0.0 to set these bits.
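// For example (illustration): inserting into destination element 2 with no
// zeroing wants bits [5:4] = 2, i.e. an immediate of 0x20, which is exactly
// what the (index << 4) computed below encodes.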
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5677
// Create this as a scalar to vector.
5678
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5679
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5680
} else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5681
// PINSR* works with constant index.
5688
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
5689
EVT VT = Op.getValueType();
5690
EVT EltVT = VT.getVectorElementType();
5692
if (Subtarget->hasSSE41())
5693
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
5695
if (EltVT == MVT::i8)
5698
DebugLoc dl = Op.getDebugLoc();
5699
SDValue N0 = Op.getOperand(0);
5700
SDValue N1 = Op.getOperand(1);
5701
SDValue N2 = Op.getOperand(2);
5703
if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5704
// Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
// as its second argument.
5706
if (N1.getValueType() != MVT::i32)
5707
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5708
if (N2.getValueType() != MVT::i32)
5709
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5710
return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5711
dl, VT, N0, N1, N2);
5717
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5718
DebugLoc dl = Op.getDebugLoc();
5720
if (Op.getValueType() == MVT::v1i64 &&
5721
Op.getOperand(0).getValueType() == MVT::i64)
5722
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5724
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5725
EVT VT = MVT::v2i32;
5726
switch (Op.getValueType().getSimpleVT().SimpleTy) {
5733
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5734
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5737
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
5744
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
5745
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
5747
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5749
unsigned char OpFlag = 0;
5750
unsigned WrapperKind = X86ISD::Wrapper;
5751
CodeModel::Model M = getTargetMachine().getCodeModel();
5753
if (Subtarget->isPICStyleRIPRel() &&
5754
(M == CodeModel::Small || M == CodeModel::Kernel))
5755
WrapperKind = X86ISD::WrapperRIP;
5756
else if (Subtarget->isPICStyleGOT())
5757
OpFlag = X86II::MO_GOTOFF;
5758
else if (Subtarget->isPICStyleStubPIC())
5759
OpFlag = X86II::MO_PIC_BASE_OFFSET;
5761
SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
5763
CP->getOffset(), OpFlag);
5764
DebugLoc DL = CP->getDebugLoc();
5765
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5766
// With PIC, the address is actually $g + Offset.
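// Illustrative shape of what gets built here (an assumption about the final
// lowering, not taken from this file): in 32-bit PIC-with-GOT mode the
// address ends up as roughly
//   Result = GlobalBaseReg + Wrapper(ConstantPool entry flagged MO_GOTOFF)
// i.e. the constant pool entry is addressed relative to the PIC base register.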
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5769
DAG.getNode(X86ISD::GlobalBaseReg,
5770
DebugLoc(), getPointerTy()),
5777
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
5778
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
5780
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5782
unsigned char OpFlag = 0;
5783
unsigned WrapperKind = X86ISD::Wrapper;
5784
CodeModel::Model M = getTargetMachine().getCodeModel();
5786
if (Subtarget->isPICStyleRIPRel() &&
5787
(M == CodeModel::Small || M == CodeModel::Kernel))
5788
WrapperKind = X86ISD::WrapperRIP;
5789
else if (Subtarget->isPICStyleGOT())
5790
OpFlag = X86II::MO_GOTOFF;
5791
else if (Subtarget->isPICStyleStubPIC())
5792
OpFlag = X86II::MO_PIC_BASE_OFFSET;
5794
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
5796
DebugLoc DL = JT->getDebugLoc();
5797
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5799
// With PIC, the address is actually $g + Offset.
5801
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5802
DAG.getNode(X86ISD::GlobalBaseReg,
5803
DebugLoc(), getPointerTy()),
5811
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
5812
const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
5814
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5816
unsigned char OpFlag = 0;
5817
unsigned WrapperKind = X86ISD::Wrapper;
5818
CodeModel::Model M = getTargetMachine().getCodeModel();
5820
if (Subtarget->isPICStyleRIPRel() &&
5821
(M == CodeModel::Small || M == CodeModel::Kernel))
5822
WrapperKind = X86ISD::WrapperRIP;
5823
else if (Subtarget->isPICStyleGOT())
5824
OpFlag = X86II::MO_GOTOFF;
5825
else if (Subtarget->isPICStyleStubPIC())
5826
OpFlag = X86II::MO_PIC_BASE_OFFSET;
5828
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
5830
DebugLoc DL = Op.getDebugLoc();
5831
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5834
// With PIC, the address is actually $g + Offset.
5835
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5836
!Subtarget->is64Bit()) {
5837
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5838
DAG.getNode(X86ISD::GlobalBaseReg,
5839
DebugLoc(), getPointerTy()),
5847
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
5848
// Create the TargetBlockAddressAddress node.
5849
unsigned char OpFlags =
5850
Subtarget->ClassifyBlockAddressReference();
5851
CodeModel::Model M = getTargetMachine().getCodeModel();
5852
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5853
DebugLoc dl = Op.getDebugLoc();
5854
SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5855
/*isTarget=*/true, OpFlags);
5857
if (Subtarget->isPICStyleRIPRel() &&
5858
(M == CodeModel::Small || M == CodeModel::Kernel))
5859
Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5861
Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5863
// With PIC, the address is actually $g + Offset.
5864
if (isGlobalRelativeToPICBase(OpFlags)) {
5865
Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5866
DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5874
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5876
SelectionDAG &DAG) const {
5877
// Create the TargetGlobalAddress node, folding in the constant
5878
// offset if it is legal.
5879
unsigned char OpFlags =
5880
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5881
CodeModel::Model M = getTargetMachine().getCodeModel();
5883
if (OpFlags == X86II::MO_NO_FLAG &&
5884
X86::isOffsetSuitableForCodeModel(Offset, M)) {
5885
// A direct static reference to a global.
5886
Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
5889
Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
5892
if (Subtarget->isPICStyleRIPRel() &&
5893
(M == CodeModel::Small || M == CodeModel::Kernel))
5894
Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5896
Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5898
// With PIC, the address is actually $g + Offset.
5899
if (isGlobalRelativeToPICBase(OpFlags)) {
5900
Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5901
DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5905
// For globals that require a load from a stub to get the address, emit the
5907
if (isGlobalStubReference(OpFlags))
5908
Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5909
PseudoSourceValue::getGOT(), 0, false, false, 0);
5911
// If there was a non-zero offset that we didn't fold, create an explicit
5914
Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5915
DAG.getConstant(Offset, getPointerTy()));
5921
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
5922
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5923
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5924
return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5928
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5929
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5930
unsigned char OperandFlags) {
5931
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5932
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5933
DebugLoc dl = GA->getDebugLoc();
5934
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
5935
GA->getValueType(0),
5939
SDValue Ops[] = { Chain, TGA, *InFlag };
5940
Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5942
SDValue Ops[] = { Chain, TGA };
5943
Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5946
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
5947
MFI->setAdjustsStack(true);
5949
SDValue Flag = Chain.getValue(1);
5950
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5953
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
5955
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5958
DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
5959
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5960
DAG.getNode(X86ISD::GlobalBaseReg,
5961
DebugLoc(), PtrVT), InFlag);
5962
InFlag = Chain.getValue(1);
5964
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5967
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5969
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5971
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5972
X86::RAX, X86II::MO_TLSGD);
5975
// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5976
// "local exec" model.
5977
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5978
const EVT PtrVT, TLSModel::Model model,
5980
DebugLoc dl = GA->getDebugLoc();
5981
// Get the Thread Pointer
5982
SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5984
DAG.getRegister(is64Bit? X86::FS : X86::GS,
5987
SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5988
NULL, 0, false, false, 0);
5990
unsigned char OperandFlags = 0;
5991
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
5993
unsigned WrapperKind = X86ISD::Wrapper;
5994
if (model == TLSModel::LocalExec) {
5995
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5996
} else if (is64Bit) {
5997
assert(model == TLSModel::InitialExec);
5998
OperandFlags = X86II::MO_GOTTPOFF;
5999
WrapperKind = X86ISD::WrapperRIP;
6001
assert(model == TLSModel::InitialExec);
6002
OperandFlags = X86II::MO_INDNTPOFF;
6005
// emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
6007
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
6008
GA->getValueType(0),
6009
GA->getOffset(), OperandFlags);
6010
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
6012
if (model == TLSModel::InitialExec)
6013
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
6014
PseudoSourceValue::getGOT(), 0, false, false, 0);
6016
// The address of the thread local variable is the add of the thread
6017
// pointer with the offset of the variable.
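// Sketch of the two cases (illustration): with the thread pointer TP loaded
// from %fs:0 / %gs:0 above,
//   local exec:   addr = TP + known-constant offset of the variable
//   initial exec: addr = TP + load(GOT entry holding the variable's offset)
// which is why the extra GOT load of Offset is emitted only for InitialExec.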
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
6022
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
6024
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6025
const GlobalValue *GV = GA->getGlobal();
6027
if (Subtarget->isTargetELF()) {
6028
// TODO: implement the "local dynamic" model
// TODO: implement the "initial exec" model for pic executables
6031
// If GV is an alias then use the aliasee for determining
6032
// thread-localness.
6033
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
6034
GV = GA->resolveAliasedGlobal(false);
6036
TLSModel::Model model
6037
= getTLSModel(GV, getTargetMachine().getRelocationModel());
6040
case TLSModel::GeneralDynamic:
6041
case TLSModel::LocalDynamic: // not implemented
6042
if (Subtarget->is64Bit())
6043
return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
6044
return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
6046
case TLSModel::InitialExec:
6047
case TLSModel::LocalExec:
6048
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
6049
Subtarget->is64Bit());
6051
} else if (Subtarget->isTargetDarwin()) {
6052
// Darwin only has one model of TLS. Lower to that.
6053
unsigned char OpFlag = 0;
6054
unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
6055
X86ISD::WrapperRIP : X86ISD::Wrapper;
6057
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
6059
bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
6060
!Subtarget->is64Bit();
6062
OpFlag = X86II::MO_TLVP_PIC_BASE;
6064
OpFlag = X86II::MO_TLVP;
6065
DebugLoc DL = Op.getDebugLoc();
6066
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
6068
GA->getOffset(), OpFlag);
6069
SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
6071
// With PIC32, the address is actually $g + Offset.
6073
Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
6074
DAG.getNode(X86ISD::GlobalBaseReg,
6075
DebugLoc(), getPointerTy()),
6078
// Lowering the machine isd will make sure everything is in the right
6080
SDValue Args[] = { Offset };
6081
SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1);
6083
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
6084
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6085
MFI->setAdjustsStack(true);
6087
// And our return value (tls address) is in the standard call return value
6089
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
6090
return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
6094
"TLS not implemented for this target.");
6096
llvm_unreachable("Unreachable");
6101
/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
6102
/// take a 2 x i32 value to shift plus a shift amount.
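/// Sketch of the expansion (illustration, for SHL_PARTS of a 64-bit value
/// {Lo, Hi} by n):
///   n < 32 :  Hi = shld(Hi, Lo, n),   Lo = Lo << n
///   n >= 32:  Hi = Lo << (n & 31),    Lo = 0
/// The CMOVs built below pick between the two forms by testing bit 5 of n.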
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
6104
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6105
EVT VT = Op.getValueType();
6106
unsigned VTBits = VT.getSizeInBits();
6107
DebugLoc dl = Op.getDebugLoc();
6108
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
6109
SDValue ShOpLo = Op.getOperand(0);
6110
SDValue ShOpHi = Op.getOperand(1);
6111
SDValue ShAmt = Op.getOperand(2);
6112
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
6113
DAG.getConstant(VTBits - 1, MVT::i8))
6114
: DAG.getConstant(0, VT);
6117
if (Op.getOpcode() == ISD::SHL_PARTS) {
6118
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
6119
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6121
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
6122
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
6125
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
6126
DAG.getConstant(VTBits, MVT::i8));
6127
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6128
AndNode, DAG.getConstant(0, MVT::i8));
6131
SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6132
SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
6133
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
6135
if (Op.getOpcode() == ISD::SHL_PARTS) {
6136
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
6137
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
6139
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
6140
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
6143
SDValue Ops[2] = { Lo, Hi };
6144
return DAG.getMergeValues(Ops, 2, dl);
6147
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
6148
SelectionDAG &DAG) const {
6149
EVT SrcVT = Op.getOperand(0).getValueType();
6151
if (SrcVT.isVector()) {
6152
if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
6158
assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
6159
"Unknown SINT_TO_FP to lower!");
6161
// These are really Legal; return the operand so the caller accepts it as
6163
if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
6165
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
6166
Subtarget->is64Bit()) {
6170
DebugLoc dl = Op.getDebugLoc();
6171
unsigned Size = SrcVT.getSizeInBits()/8;
6172
MachineFunction &MF = DAG.getMachineFunction();
6173
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
6174
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6175
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
6177
PseudoSourceValue::getFixedStack(SSFI), 0,
6179
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
6182
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
6184
SelectionDAG &DAG) const {
6186
DebugLoc dl = Op.getDebugLoc();
6188
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
6190
Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
6192
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
6193
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
6194
SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
6195
Tys, Ops, array_lengthof(Ops));
6198
Chain = Result.getValue(1);
6199
SDValue InFlag = Result.getValue(2);
6201
// FIXME: Currently the FST is flagged to the FILD_FLAG. This
6202
// shouldn't be necessary except that RFP cannot be live across
6203
// multiple blocks. When stackifier is fixed, they can be uncoupled.
6204
MachineFunction &MF = DAG.getMachineFunction();
6205
int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
6206
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6207
Tys = DAG.getVTList(MVT::Other);
6209
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
6211
Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
6212
Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
6213
PseudoSourceValue::getFixedStack(SSFI), 0,
6220
// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
6221
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
6222
SelectionDAG &DAG) const {
6223
// This algorithm is not obvious. Here it is in C code, more or less:
6225
double uint64_to_double( uint32_t hi, uint32_t lo ) {
6226
static const __m128i exp = { 0x4330000045300000ULL, 0 };
6227
static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
6229
// Copy ints to xmm registers.
6230
__m128i xh = _mm_cvtsi32_si128( hi );
6231
__m128i xl = _mm_cvtsi32_si128( lo );
6233
// Combine into low half of a single xmm register.
6234
__m128i x = _mm_unpacklo_epi32( xh, xl );
6238
// Merge in appropriate exponents to give the integer bits the right
6240
x = _mm_unpacklo_epi32( x, exp );
6242
// Subtract away the biases to deal with the IEEE-754 double precision
6244
d = _mm_sub_pd( (__m128d) x, bias );
6246
// All conversions up to here are exact. The correctly rounded result is
6247
// calculated using the current rounding mode using the following
6249
d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
6250
_mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
6251
// store doesn't really need to be here (except
6252
// maybe to zero the other double)
6257
DebugLoc dl = Op.getDebugLoc();
6258
LLVMContext *Context = DAG.getContext();
6260
// Build some magic constants.
6261
std::vector<Constant*> CV0;
6262
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
6263
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
6264
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
6265
CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
6266
Constant *C0 = ConstantVector::get(CV0);
6267
SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
6269
std::vector<Constant*> CV1;
6271
ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
6273
ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
6274
Constant *C1 = ConstantVector::get(CV1);
6275
SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
6277
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
6278
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6280
DAG.getIntPtrConstant(1)));
6281
SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
6282
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6284
DAG.getIntPtrConstant(0)));
6285
SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
6286
SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
6287
PseudoSourceValue::getConstantPool(), 0,
6289
SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
6290
SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
6291
SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
6292
PseudoSourceValue::getConstantPool(), 0,
6294
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
6296
// Add the halves; easiest way is to swap them into another reg first.
6297
int ShufMask[2] = { 1, -1 };
6298
SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
6299
DAG.getUNDEF(MVT::v2f64), ShufMask);
6300
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
6301
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
6302
DAG.getIntPtrConstant(0));
6305
// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
6306
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
6307
SelectionDAG &DAG) const {
6308
DebugLoc dl = Op.getDebugLoc();
6309
// FP constant to bias correct the final result.
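// Why the bias trick works (illustration): 0x4330000000000000 is the IEEE-754
// double encoding of 2^52. OR-ing a 32-bit unsigned value x into the low word
// of that double yields exactly 2^52 + x, so subtracting the bias afterwards
// recovers x as an exact double.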
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
6313
// Load the 32-bit value into an XMM register.
6314
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
6315
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
6317
DAG.getIntPtrConstant(0)));
6319
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
6320
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
6321
DAG.getIntPtrConstant(0));
6323
// Or the load with the bias.
6324
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
6325
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
6326
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6328
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
6329
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6330
MVT::v2f64, Bias)));
6331
Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
6332
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
6333
DAG.getIntPtrConstant(0));
6335
// Subtract the bias.
6336
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
6338
// Handle final rounding.
6339
EVT DestVT = Op.getValueType();
6341
if (DestVT.bitsLT(MVT::f64)) {
6342
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
6343
DAG.getIntPtrConstant(0));
6344
} else if (DestVT.bitsGT(MVT::f64)) {
6345
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
6348
// Handle final rounding.
6352
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
6353
SelectionDAG &DAG) const {
6354
SDValue N0 = Op.getOperand(0);
6355
DebugLoc dl = Op.getDebugLoc();
6357
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
6358
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
6359
// the optimization here.
6360
if (DAG.SignBitIsZero(N0))
6361
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
6363
EVT SrcVT = N0.getValueType();
6364
EVT DstVT = Op.getValueType();
6365
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
6366
return LowerUINT_TO_FP_i64(Op, DAG);
6367
else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
6368
return LowerUINT_TO_FP_i32(Op, DAG);
6370
// Make a 64-bit buffer, and use it to build an FILD.
6371
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
6372
if (SrcVT == MVT::i32) {
6373
SDValue WordOff = DAG.getConstant(4, getPointerTy());
6374
SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
6375
getPointerTy(), StackSlot, WordOff);
6376
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
6377
StackSlot, NULL, 0, false, false, 0);
6378
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
6379
OffsetSlot, NULL, 0, false, false, 0);
6380
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
6384
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
6385
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
6386
StackSlot, NULL, 0, false, false, 0);
6387
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
6392
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
6393
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
6394
SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
6396
APInt FF(32, 0x5F800000ULL);
6398
// Check whether the sign bit is set.
6399
SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
6400
Op.getOperand(0), DAG.getConstant(0, MVT::i64),
6403
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
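// (0x5F800000 is the f32 encoding of 2^64. Illustration: when the i64 input
// had its sign bit set, the signed FILD above produced value - 2^64, so
// adding this fudge factor restores the intended unsigned value.)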
SDValue FudgePtr = DAG.getConstantPool(
6405
ConstantInt::get(*DAG.getContext(), FF.zext(64)),
6408
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
6409
SDValue Zero = DAG.getIntPtrConstant(0);
6410
SDValue Four = DAG.getIntPtrConstant(4);
6411
SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
6413
FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
6415
// Load the value out, extending it from f32 to f80.
6416
// FIXME: Avoid the extend by constructing the right constant pool?
6417
SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(),
6418
FudgePtr, PseudoSourceValue::getConstantPool(),
6419
0, MVT::f32, false, false, 4);
6420
// Extend everything to 80 bits to force it to be done on x87.
6421
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
6422
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
6425
std::pair<SDValue,SDValue> X86TargetLowering::
6426
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
6427
DebugLoc dl = Op.getDebugLoc();
6429
EVT DstTy = Op.getValueType();
6432
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
6436
assert(DstTy.getSimpleVT() <= MVT::i64 &&
6437
DstTy.getSimpleVT() >= MVT::i16 &&
6438
"Unknown FP_TO_SINT to lower!");
6440
// These are really Legal.
6441
if (DstTy == MVT::i32 &&
6442
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
6443
return std::make_pair(SDValue(), SDValue());
6444
if (Subtarget->is64Bit() &&
6445
DstTy == MVT::i64 &&
6446
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
6447
return std::make_pair(SDValue(), SDValue());
6449
// We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
// stack slot.
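// Illustrative note (an assumption about later expansion, not stated here):
// the FP_TO_INT*_IN_MEM pseudo is eventually expanded with an fnstcw/fldcw
// pair that temporarily forces round-toward-zero around the fistp, because
// fistp honours the current x87 rounding mode.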
MachineFunction &MF = DAG.getMachineFunction();
6452
unsigned MemSize = DstTy.getSizeInBits()/8;
6453
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
6454
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6457
switch (DstTy.getSimpleVT().SimpleTy) {
6458
default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
6459
case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
6460
case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
6461
case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
6464
SDValue Chain = DAG.getEntryNode();
6465
SDValue Value = Op.getOperand(0);
6466
if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
6467
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
6468
Chain = DAG.getStore(Chain, dl, Value, StackSlot,
6469
PseudoSourceValue::getFixedStack(SSFI), 0,
6471
SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
6473
Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
6475
Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
6476
Chain = Value.getValue(1);
6477
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
6478
StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
6481
// Build the FP_TO_INT*_IN_MEM
6482
SDValue Ops[] = { Chain, Value, StackSlot };
6483
SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
6485
return std::make_pair(FIST, StackSlot);
6488
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
6489
SelectionDAG &DAG) const {
6490
if (Op.getValueType().isVector()) {
6491
if (Op.getValueType() == MVT::v2i32 &&
6492
Op.getOperand(0).getValueType() == MVT::v2f64) {
6498
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
6499
SDValue FIST = Vals.first, StackSlot = Vals.second;
6500
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
6501
if (FIST.getNode() == 0) return Op;
6504
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
6505
FIST, StackSlot, NULL, 0, false, false, 0);
6508
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
6509
SelectionDAG &DAG) const {
6510
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
6511
SDValue FIST = Vals.first, StackSlot = Vals.second;
6512
assert(FIST.getNode() && "Unexpected failure");
6515
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
6516
FIST, StackSlot, NULL, 0, false, false, 0);
6519
SDValue X86TargetLowering::LowerFABS(SDValue Op,
                                     SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
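/// LowerFNEG - Lower fneg by XORing in the sign bit: FXOR for scalars, or an
/// integer XOR through v2i64 bitcasts for vectors.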
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
  if (VT.isVector()) {
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
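/// LowerFCOPYSIGN - Lower copysign by extracting the sign bit of the second
/// operand with one constant-pool mask, clearing the sign bit of the first
/// operand with another, and ORing the two results together.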
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op1.getValueType();

  // If the second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of the second operand.
  std::vector<Constant*> CV;
  if (SrcVT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift the sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));

  // Clear the sign bit of the first operand.
  if (VT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
/// Emit nodes that will be selected as "test Op0,Op0", or something
6657
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
6658
SelectionDAG &DAG) const {
6659
DebugLoc dl = Op.getDebugLoc();
6661
// CF and OF aren't always set the way we want. Determine which
6662
// of these we need.
6663
bool NeedCF = false;
6664
bool NeedOF = false;
6667
case X86::COND_A: case X86::COND_AE:
6668
case X86::COND_B: case X86::COND_BE:
6671
case X86::COND_G: case X86::COND_GE:
6672
case X86::COND_L: case X86::COND_LE:
6673
case X86::COND_O: case X86::COND_NO:
6678
// See if we can use the EFLAGS value from the operand instead of
6679
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
6680
// we prove that the arithmetic won't overflow, we can't use OF or CF.
6681
if (Op.getResNo() != 0 || NeedOF || NeedCF)
6682
// Emit a CMP with 0, which is the TEST pattern.
6683
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6684
DAG.getConstant(0, Op.getValueType()));
6686
unsigned Opcode = 0;
6687
unsigned NumOperands = 0;
6688
switch (Op.getNode()->getOpcode()) {
6690
// Due to an isel shortcoming, be conservative if this add is likely to be
6691
// selected as part of a load-modify-store instruction. When the root node
6692
// in a match is a store, isel doesn't know how to remap non-chain non-flag
6693
// uses of other nodes in the match, such as the ADD in this case. This
6694
// leads to the ADD being left around and reselected, with the result being
6695
// two adds in the output. Alas, even if none our users are stores, that
6696
// doesn't prove we're O.K. Ergo, if we have any parents that aren't
6697
// CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
6698
// climbing the DAG back to the root, and it doesn't seem to be worth the
6700
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6701
UE = Op.getNode()->use_end(); UI != UE; ++UI)
6702
if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6705
if (ConstantSDNode *C =
6706
dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6707
// An add of one will be selected as an INC.
6708
if (C->getAPIntValue() == 1) {
6709
Opcode = X86ISD::INC;
6714
// An add of negative one (subtract of one) will be selected as a DEC.
6715
if (C->getAPIntValue().isAllOnesValue()) {
6716
Opcode = X86ISD::DEC;
6722
// Otherwise use a regular EFLAGS-setting add.
6723
Opcode = X86ISD::ADD;
6727
// If the primary and result isn't used, don't bother using X86ISD::AND,
6728
// because a TEST instruction will be better.
6729
bool NonFlagUse = false;
6730
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6731
UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6733
unsigned UOpNo = UI.getOperandNo();
6734
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6735
// Look pass truncate.
6736
UOpNo = User->use_begin().getOperandNo();
6737
User = *User->use_begin();
6740
if (User->getOpcode() != ISD::BRCOND &&
6741
User->getOpcode() != ISD::SETCC &&
6742
(User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6755
// Due to the ISEL shortcoming noted above, be conservative if this op is
6756
// likely to be selected as part of a load-modify-store instruction.
6757
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6758
UE = Op.getNode()->use_end(); UI != UE; ++UI)
6759
if (UI->getOpcode() == ISD::STORE)
6762
// Otherwise use a regular EFLAGS-setting instruction.
6763
switch (Op.getNode()->getOpcode()) {
6764
default: llvm_unreachable("unexpected operator!");
6765
case ISD::SUB: Opcode = X86ISD::SUB; break;
6766
case ISD::OR: Opcode = X86ISD::OR; break;
6767
case ISD::XOR: Opcode = X86ISD::XOR; break;
6768
case ISD::AND: Opcode = X86ISD::AND; break;
6780
return SDValue(Op.getNode(), 1);
6787
// Emit a CMP with 0, which is the TEST pattern.
6788
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6789
DAG.getConstant(0, Op.getValueType()));
6791
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
6792
SmallVector<SDValue, 4> Ops;
6793
for (unsigned i = 0; i != NumOperands; ++i)
6794
Ops.push_back(Op.getOperand(i));
6796
SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
6797
DAG.ReplaceAllUsesWith(Op, New);
6798
return SDValue(New.getNode(), 1);
6801
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
6803
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
6804
SelectionDAG &DAG) const {
6805
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
6806
if (C->getAPIntValue() == 0)
6807
return EmitTest(Op0, X86CC, DAG);
6809
DebugLoc dl = Op0.getDebugLoc();
6810
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
6813
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
/// node if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     DebugLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
        RHS = Op0.getOperand(1);
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    SDValue AndLHS = Op0;
    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
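/// LowerSETCC - Lower a scalar ISD::SETCC to X86ISD::SETCC over an
/// EFLAGS-producing compare, using a BT for bit-test patterns when possible.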
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())

  // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
  if (Op0.getOpcode() == X86ISD::SETCC &&
      Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
    bool Invert = (CC == ISD::SETNE) ^
      cast<ConstantSDNode>(Op1)->isNullValue();
      CCode = X86::GetOppositeBranchCondition(CCode);
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));

  bool isFP = Op1.getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)

  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);

  // Use sbb x, x to materialize carry bit into a GPR.
  if (X86CC == X86::COND_B)
    return DAG.getNode(ISD::AND, dl, MVT::i8,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
                                   DAG.getConstant(X86CC, MVT::i8), Cond),
                       DAG.getConstant(1, MVT::i8));

  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), Cond);
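/// LowerVSETCC - Lower a vector ISD::SETCC: FP compares become CMPPS/CMPPD
/// with an SSE condition immediate; integer compares are built from
/// PCMPEQ/PCMPGT with operand swaps, sign flips, and inversions as needed.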
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6932
SDValue Op0 = Op.getOperand(0);
6933
SDValue Op1 = Op.getOperand(1);
6934
SDValue CC = Op.getOperand(2);
6935
EVT VT = Op.getValueType();
6936
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6937
bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6938
DebugLoc dl = Op.getDebugLoc();
6942
EVT VT0 = Op0.getValueType();
6943
assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6944
unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
6947
switch (SetCCOpcode) {
6950
case ISD::SETEQ: SSECC = 0; break;
6952
case ISD::SETGT: Swap = true; // Fallthrough
6954
case ISD::SETOLT: SSECC = 1; break;
6956
case ISD::SETGE: Swap = true; // Fallthrough
6958
case ISD::SETOLE: SSECC = 2; break;
6959
case ISD::SETUO: SSECC = 3; break;
6961
case ISD::SETNE: SSECC = 4; break;
6962
case ISD::SETULE: Swap = true;
6963
case ISD::SETUGE: SSECC = 5; break;
6964
case ISD::SETULT: Swap = true;
6965
case ISD::SETUGT: SSECC = 6; break;
6966
case ISD::SETO: SSECC = 7; break;
6969
std::swap(Op0, Op1);
6971
// In the two special cases we can't handle, emit two comparisons.
6973
if (SetCCOpcode == ISD::SETUEQ) {
6975
UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
6976
EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
6977
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
6979
else if (SetCCOpcode == ISD::SETONE) {
6981
ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
6982
NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
6983
return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
6985
llvm_unreachable("Illegal FP comparison");
6987
// Handle all other FP comparisons here.
6988
return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
6991
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integers, swapping operands and multiple
// operations may be required for some comparisons.
6994
unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
6995
bool Swap = false, Invert = false, FlipSigns = false;
6997
switch (VT.getSimpleVT().SimpleTy) {
7000
case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
7002
case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
7004
case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
7005
case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
7008
switch (SetCCOpcode) {
7010
case ISD::SETNE: Invert = true;
7011
case ISD::SETEQ: Opc = EQOpc; break;
7012
case ISD::SETLT: Swap = true;
7013
case ISD::SETGT: Opc = GTOpc; break;
7014
case ISD::SETGE: Swap = true;
7015
case ISD::SETLE: Opc = GTOpc; Invert = true; break;
7016
case ISD::SETULT: Swap = true;
7017
case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
7018
case ISD::SETUGE: Swap = true;
7019
case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
7022
std::swap(Op0, Op1);
7024
// Since SSE has no unsigned integer comparisons, we need to flip the sign
7025
// bits of the inputs before performing those operations.
7027
EVT EltVT = VT.getVectorElementType();
7028
SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
7030
std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
7031
SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
7033
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
7034
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
7037
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
7039
// If the logical-not of the result is required, perform that now.
7041
Result = DAG.getNOT(dl, Result, VT);
7046
// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
7047
static bool isX86LogicalCmp(SDValue Op) {
7048
unsigned Opc = Op.getNode()->getOpcode();
7049
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
7051
if (Op.getResNo() == 1 &&
7052
(Opc == X86ISD::ADD ||
7053
Opc == X86ISD::SUB ||
7054
Opc == X86ISD::SMUL ||
7055
Opc == X86ISD::UMUL ||
7056
Opc == X86ISD::INC ||
7057
Opc == X86ISD::DEC ||
7058
Opc == X86ISD::OR ||
7059
Opc == X86ISD::XOR ||
7060
Opc == X86ISD::AND))
7066
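/// LowerSELECT - Lower ISD::SELECT to X86ISD::CMOV, reusing EFLAGS from an
/// existing compare when possible and emitting a TEST otherwise.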
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
7067
bool addTest = true;
7068
SDValue Cond = Op.getOperand(0);
7069
DebugLoc dl = Op.getDebugLoc();
7072
if (Cond.getOpcode() == ISD::SETCC) {
7073
SDValue NewCond = LowerSETCC(Cond, DAG);
7074
if (NewCond.getNode())
7078
// (select (x == 0), -1, 0) -> (sign_bit (x - 1))
7079
SDValue Op1 = Op.getOperand(1);
7080
SDValue Op2 = Op.getOperand(2);
7081
if (Cond.getOpcode() == X86ISD::SETCC &&
7082
cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
7083
SDValue Cmp = Cond.getOperand(1);
7084
if (Cmp.getOpcode() == X86ISD::CMP) {
7085
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
7086
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
7087
ConstantSDNode *RHSC =
7088
dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
7089
if (N1C && N1C->isAllOnesValue() &&
7090
N2C && N2C->isNullValue() &&
7091
RHSC && RHSC->isNullValue()) {
7092
SDValue CmpOp0 = Cmp.getOperand(0);
7093
Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
7094
CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
7095
return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
7096
DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
7101
// Look past (and (setcc_carry (cmp ...)), 1).
7102
if (Cond.getOpcode() == ISD::AND &&
7103
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
7104
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
7105
if (C && C->getAPIntValue() == 1)
7106
Cond = Cond.getOperand(0);
7109
// If the condition flag is set by an X86ISD::CMP, then use it as the
// condition-setting operand in place of the X86ISD::SETCC.
7111
if (Cond.getOpcode() == X86ISD::SETCC ||
7112
Cond.getOpcode() == X86ISD::SETCC_CARRY) {
7113
CC = Cond.getOperand(0);
7115
SDValue Cmp = Cond.getOperand(1);
7116
unsigned Opc = Cmp.getOpcode();
7117
EVT VT = Op.getValueType();
7119
bool IllegalFPCMov = false;
7120
if (VT.isFloatingPoint() && !VT.isVector() &&
7121
!isScalarFPTypeInSSEReg(VT)) // FPStack?
7122
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
7124
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
7125
Opc == X86ISD::BT) { // FIXME
7132
// Look past the truncate.
7133
if (Cond.getOpcode() == ISD::TRUNCATE)
7134
Cond = Cond.getOperand(0);
7136
// We know the result of AND is compared against zero. Try to match it to BT.
7138
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
7139
SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
7140
if (NewSetCC.getNode()) {
7141
CC = NewSetCC.getOperand(0);
7142
Cond = NewSetCC.getOperand(1);
7149
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7150
Cond = EmitTest(Cond, X86::COND_NE, DAG);
7153
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
7154
// condition is true.
7155
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
7156
SDValue Ops[] = { Op2, Op1, CC, Cond };
7157
return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
7160
// isAndOrOfSetCCs - Return true if node is an ISD::AND or
7161
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
7162
// from the AND / OR.
7163
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
7164
Opc = Op.getOpcode();
7165
if (Opc != ISD::OR && Opc != ISD::AND)
7167
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
7168
Op.getOperand(0).hasOneUse() &&
7169
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
7170
Op.getOperand(1).hasOneUse());
7173
// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
// 1, and that the SETCC node has a single use.
7175
static bool isXor1OfSetCC(SDValue Op) {
7176
if (Op.getOpcode() != ISD::XOR)
7178
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7179
if (N1C && N1C->getAPIntValue() == 1) {
7180
return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
7181
Op.getOperand(0).hasOneUse();
7186
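/// LowerBRCOND - Lower ISD::BRCOND to X86ISD::BRCOND, splitting an AND/OR of
/// two setccs (the FCMP_OEQ/FCMP_UNE patterns) into two branches so that no
/// separate test instruction is needed.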
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
7187
bool addTest = true;
7188
SDValue Chain = Op.getOperand(0);
7189
SDValue Cond = Op.getOperand(1);
7190
SDValue Dest = Op.getOperand(2);
7191
DebugLoc dl = Op.getDebugLoc();
7194
if (Cond.getOpcode() == ISD::SETCC) {
7195
SDValue NewCond = LowerSETCC(Cond, DAG);
7196
if (NewCond.getNode())
7200
// FIXME: LowerXALUO doesn't handle these!!
7201
else if (Cond.getOpcode() == X86ISD::ADD ||
7202
Cond.getOpcode() == X86ISD::SUB ||
7203
Cond.getOpcode() == X86ISD::SMUL ||
7204
Cond.getOpcode() == X86ISD::UMUL)
7205
Cond = LowerXALUO(Cond, DAG);
7208
// Look past (and (setcc_carry (cmp ...)), 1).
7209
if (Cond.getOpcode() == ISD::AND &&
7210
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
7211
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
7212
if (C && C->getAPIntValue() == 1)
7213
Cond = Cond.getOperand(0);
7216
// If the condition flag is set by an X86ISD::CMP, then use it as the
// condition-setting operand in place of the X86ISD::SETCC.
7218
if (Cond.getOpcode() == X86ISD::SETCC ||
7219
Cond.getOpcode() == X86ISD::SETCC_CARRY) {
7220
CC = Cond.getOperand(0);
7222
SDValue Cmp = Cond.getOperand(1);
7223
unsigned Opc = Cmp.getOpcode();
7224
// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
7225
if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
7229
switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
7233
// These can only come from an arithmetic instruction with overflow,
7234
// e.g. SADDO, UADDO.
7235
Cond = Cond.getNode()->getOperand(1);
7242
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
7243
SDValue Cmp = Cond.getOperand(0).getOperand(1);
7244
if (CondOpc == ISD::OR) {
7245
// Also, recognize the pattern generated by an FCMP_UNE. We can emit
// two branches instead of an explicit OR instruction with a separate test.
7248
if (Cmp == Cond.getOperand(1).getOperand(1) &&
7249
isX86LogicalCmp(Cmp)) {
7250
CC = Cond.getOperand(0).getOperand(0);
7251
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
7252
Chain, Dest, CC, Cmp);
7253
CC = Cond.getOperand(1).getOperand(0);
7257
} else { // ISD::AND
7258
// Also, recognize the pattern generated by an FCMP_OEQ. We can emit
7259
// two branches instead of an explicit AND instruction with a
7260
// separate test. However, we only do this if this block doesn't
7261
// have a fall-through edge, because this requires an explicit
7262
// jmp when the condition is false.
7263
if (Cmp == Cond.getOperand(1).getOperand(1) &&
7264
isX86LogicalCmp(Cmp) &&
7265
Op.getNode()->hasOneUse()) {
7266
X86::CondCode CCode =
7267
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
7268
CCode = X86::GetOppositeBranchCondition(CCode);
7269
CC = DAG.getConstant(CCode, MVT::i8);
7270
SDNode *User = *Op.getNode()->use_begin();
7271
// Look for an unconditional branch following this conditional branch.
7272
// We need this because we need to reverse the successors in order
7273
// to implement FCMP_OEQ.
7274
if (User->getOpcode() == ISD::BR) {
7275
SDValue FalseBB = User->getOperand(1);
7277
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
7278
assert(NewBR == User);
7282
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
7283
Chain, Dest, CC, Cmp);
7284
X86::CondCode CCode =
7285
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
7286
CCode = X86::GetOppositeBranchCondition(CCode);
7287
CC = DAG.getConstant(CCode, MVT::i8);
7293
} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
7294
// Recognize "xorb (setcc), 1" patterns. The xor inverts the condition.
// It should be transformed by the DAG combiner except when the condition
// is set by an arithmetic-with-overflow node.
7297
X86::CondCode CCode =
7298
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
7299
CCode = X86::GetOppositeBranchCondition(CCode);
7300
CC = DAG.getConstant(CCode, MVT::i8);
7301
Cond = Cond.getOperand(0).getOperand(1);
7307
// Look past the truncate.
7308
if (Cond.getOpcode() == ISD::TRUNCATE)
7309
Cond = Cond.getOperand(0);
7311
// We know the result of AND is compared against zero. Try to match it to BT.
7313
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
7314
SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
7315
if (NewSetCC.getNode()) {
7316
CC = NewSetCC.getOperand(0);
7317
Cond = NewSetCC.getOperand(1);
7324
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7325
Cond = EmitTest(Cond, X86::COND_NE, DAG);
7327
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
7328
Chain, Dest, CC, Cond);
7332
// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
7338
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
7339
SelectionDAG &DAG) const {
7340
assert(Subtarget->isTargetCygMing() &&
7341
"This should be used only on Cygwin/Mingw targets");
7342
DebugLoc dl = Op.getDebugLoc();
7345
SDValue Chain = Op.getOperand(0);
7346
SDValue Size = Op.getOperand(1);
7347
// FIXME: Ensure alignment here
7351
EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
7353
Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
7354
Flag = Chain.getValue(1);
7356
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
7358
Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
7359
Flag = Chain.getValue(1);
7361
Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
7363
SDValue Ops1[2] = { Chain.getValue(0), Chain };
7364
return DAG.getMergeValues(Ops1, 2, dl);
7367
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
7368
MachineFunction &MF = DAG.getMachineFunction();
7369
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
7371
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7372
DebugLoc dl = Op.getDebugLoc();
7374
if (!Subtarget->is64Bit()) {
7375
// vastart just stores the address of the VarArgsFrameIndex slot into the
7376
// memory location argument.
7377
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
7379
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
7384
// gp_offset (0 - 6 * 8)
7385
// fp_offset (48 - 48 + 8 * 16)
7386
// overflow_arg_area (point to parameters coming in memory).
7388
SmallVector<SDValue, 8> MemOps;
7389
SDValue FIN = Op.getOperand(1);
7391
SDValue Store = DAG.getStore(Op.getOperand(0), dl,
7392
DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
7394
FIN, SV, 0, false, false, 0);
7395
MemOps.push_back(Store);
7398
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7399
FIN, DAG.getIntPtrConstant(4));
7400
Store = DAG.getStore(Op.getOperand(0), dl,
7401
DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
7403
FIN, SV, 4, false, false, 0);
7404
MemOps.push_back(Store);
7406
// Store ptr to overflow_arg_area
7407
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7408
FIN, DAG.getIntPtrConstant(4));
7409
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
7411
Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8,
7413
MemOps.push_back(Store);
7415
// Store ptr to reg_save_area.
7416
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
7417
FIN, DAG.getIntPtrConstant(8));
7418
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
7420
Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16,
7422
MemOps.push_back(Store);
7423
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
7424
&MemOps[0], MemOps.size());
7427
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7428
// X86-64 va_list is a struct { i32, i32, i8*, i8* }.
7429
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
7431
report_fatal_error("VAArgInst is not yet implemented for x86-64!");
7435
SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
7436
// X86-64 va_list is a struct { i32, i32, i8*, i8* }.
7437
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
7438
SDValue Chain = Op.getOperand(0);
7439
SDValue DstPtr = Op.getOperand(1);
7440
SDValue SrcPtr = Op.getOperand(2);
7441
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7442
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7443
DebugLoc dl = Op.getDebugLoc();
7445
return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
7446
DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
7447
false, DstSV, 0, SrcSV, 0);
7451
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
7452
DebugLoc dl = Op.getDebugLoc();
7453
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7455
default: return SDValue(); // Don't custom lower most intrinsics.
7456
// Comparison intrinsics.
7457
case Intrinsic::x86_sse_comieq_ss:
7458
case Intrinsic::x86_sse_comilt_ss:
7459
case Intrinsic::x86_sse_comile_ss:
7460
case Intrinsic::x86_sse_comigt_ss:
7461
case Intrinsic::x86_sse_comige_ss:
7462
case Intrinsic::x86_sse_comineq_ss:
7463
case Intrinsic::x86_sse_ucomieq_ss:
7464
case Intrinsic::x86_sse_ucomilt_ss:
7465
case Intrinsic::x86_sse_ucomile_ss:
7466
case Intrinsic::x86_sse_ucomigt_ss:
7467
case Intrinsic::x86_sse_ucomige_ss:
7468
case Intrinsic::x86_sse_ucomineq_ss:
7469
case Intrinsic::x86_sse2_comieq_sd:
7470
case Intrinsic::x86_sse2_comilt_sd:
7471
case Intrinsic::x86_sse2_comile_sd:
7472
case Intrinsic::x86_sse2_comigt_sd:
7473
case Intrinsic::x86_sse2_comige_sd:
7474
case Intrinsic::x86_sse2_comineq_sd:
7475
case Intrinsic::x86_sse2_ucomieq_sd:
7476
case Intrinsic::x86_sse2_ucomilt_sd:
7477
case Intrinsic::x86_sse2_ucomile_sd:
7478
case Intrinsic::x86_sse2_ucomigt_sd:
7479
case Intrinsic::x86_sse2_ucomige_sd:
7480
case Intrinsic::x86_sse2_ucomineq_sd: {
7482
ISD::CondCode CC = ISD::SETCC_INVALID;
7485
case Intrinsic::x86_sse_comieq_ss:
7486
case Intrinsic::x86_sse2_comieq_sd:
7490
case Intrinsic::x86_sse_comilt_ss:
7491
case Intrinsic::x86_sse2_comilt_sd:
7495
case Intrinsic::x86_sse_comile_ss:
7496
case Intrinsic::x86_sse2_comile_sd:
7500
case Intrinsic::x86_sse_comigt_ss:
7501
case Intrinsic::x86_sse2_comigt_sd:
7505
case Intrinsic::x86_sse_comige_ss:
7506
case Intrinsic::x86_sse2_comige_sd:
7510
case Intrinsic::x86_sse_comineq_ss:
7511
case Intrinsic::x86_sse2_comineq_sd:
7515
case Intrinsic::x86_sse_ucomieq_ss:
7516
case Intrinsic::x86_sse2_ucomieq_sd:
7517
Opc = X86ISD::UCOMI;
7520
case Intrinsic::x86_sse_ucomilt_ss:
7521
case Intrinsic::x86_sse2_ucomilt_sd:
7522
Opc = X86ISD::UCOMI;
7525
case Intrinsic::x86_sse_ucomile_ss:
7526
case Intrinsic::x86_sse2_ucomile_sd:
7527
Opc = X86ISD::UCOMI;
7530
case Intrinsic::x86_sse_ucomigt_ss:
7531
case Intrinsic::x86_sse2_ucomigt_sd:
7532
Opc = X86ISD::UCOMI;
7535
case Intrinsic::x86_sse_ucomige_ss:
7536
case Intrinsic::x86_sse2_ucomige_sd:
7537
Opc = X86ISD::UCOMI;
7540
case Intrinsic::x86_sse_ucomineq_ss:
7541
case Intrinsic::x86_sse2_ucomineq_sd:
7542
Opc = X86ISD::UCOMI;
7547
SDValue LHS = Op.getOperand(1);
7548
SDValue RHS = Op.getOperand(2);
7549
unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
7550
assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
7551
SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
7552
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
7553
DAG.getConstant(X86CC, MVT::i8), Cond);
7554
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
7556
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower them to the
// ptest or testp pattern and a setcc for the result.
7559
case Intrinsic::x86_sse41_ptestz:
7560
case Intrinsic::x86_sse41_ptestc:
7561
case Intrinsic::x86_sse41_ptestnzc:
7562
case Intrinsic::x86_avx_ptestz_256:
7563
case Intrinsic::x86_avx_ptestc_256:
7564
case Intrinsic::x86_avx_ptestnzc_256:
7565
case Intrinsic::x86_avx_vtestz_ps:
7566
case Intrinsic::x86_avx_vtestc_ps:
7567
case Intrinsic::x86_avx_vtestnzc_ps:
7568
case Intrinsic::x86_avx_vtestz_pd:
7569
case Intrinsic::x86_avx_vtestc_pd:
7570
case Intrinsic::x86_avx_vtestnzc_pd:
7571
case Intrinsic::x86_avx_vtestz_ps_256:
7572
case Intrinsic::x86_avx_vtestc_ps_256:
7573
case Intrinsic::x86_avx_vtestnzc_ps_256:
7574
case Intrinsic::x86_avx_vtestz_pd_256:
7575
case Intrinsic::x86_avx_vtestc_pd_256:
7576
case Intrinsic::x86_avx_vtestnzc_pd_256: {
7577
bool IsTestPacked = false;
7580
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
7581
case Intrinsic::x86_avx_vtestz_ps:
7582
case Intrinsic::x86_avx_vtestz_pd:
7583
case Intrinsic::x86_avx_vtestz_ps_256:
7584
case Intrinsic::x86_avx_vtestz_pd_256:
7585
IsTestPacked = true; // Fallthrough
7586
case Intrinsic::x86_sse41_ptestz:
7587
case Intrinsic::x86_avx_ptestz_256:
7589
X86CC = X86::COND_E;
7591
case Intrinsic::x86_avx_vtestc_ps:
7592
case Intrinsic::x86_avx_vtestc_pd:
7593
case Intrinsic::x86_avx_vtestc_ps_256:
7594
case Intrinsic::x86_avx_vtestc_pd_256:
7595
IsTestPacked = true; // Fallthrough
7596
case Intrinsic::x86_sse41_ptestc:
7597
case Intrinsic::x86_avx_ptestc_256:
7599
X86CC = X86::COND_B;
7601
case Intrinsic::x86_avx_vtestnzc_ps:
7602
case Intrinsic::x86_avx_vtestnzc_pd:
7603
case Intrinsic::x86_avx_vtestnzc_ps_256:
7604
case Intrinsic::x86_avx_vtestnzc_pd_256:
7605
IsTestPacked = true; // Fallthrough
7606
case Intrinsic::x86_sse41_ptestnzc:
7607
case Intrinsic::x86_avx_ptestnzc_256:
7609
X86CC = X86::COND_A;
7613
SDValue LHS = Op.getOperand(1);
7614
SDValue RHS = Op.getOperand(2);
7615
unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
7616
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
7617
SDValue CC = DAG.getConstant(X86CC, MVT::i8);
7618
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
7619
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
7622
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
7624
case Intrinsic::x86_sse2_pslli_w:
7625
case Intrinsic::x86_sse2_pslli_d:
7626
case Intrinsic::x86_sse2_pslli_q:
7627
case Intrinsic::x86_sse2_psrli_w:
7628
case Intrinsic::x86_sse2_psrli_d:
7629
case Intrinsic::x86_sse2_psrli_q:
7630
case Intrinsic::x86_sse2_psrai_w:
7631
case Intrinsic::x86_sse2_psrai_d:
7632
case Intrinsic::x86_mmx_pslli_w:
7633
case Intrinsic::x86_mmx_pslli_d:
7634
case Intrinsic::x86_mmx_pslli_q:
7635
case Intrinsic::x86_mmx_psrli_w:
7636
case Intrinsic::x86_mmx_psrli_d:
7637
case Intrinsic::x86_mmx_psrli_q:
7638
case Intrinsic::x86_mmx_psrai_w:
7639
case Intrinsic::x86_mmx_psrai_d: {
7640
SDValue ShAmt = Op.getOperand(2);
7641
if (isa<ConstantSDNode>(ShAmt))
7644
unsigned NewIntNo = 0;
7645
EVT ShAmtVT = MVT::v4i32;
7647
case Intrinsic::x86_sse2_pslli_w:
7648
NewIntNo = Intrinsic::x86_sse2_psll_w;
7650
case Intrinsic::x86_sse2_pslli_d:
7651
NewIntNo = Intrinsic::x86_sse2_psll_d;
7653
case Intrinsic::x86_sse2_pslli_q:
7654
NewIntNo = Intrinsic::x86_sse2_psll_q;
7656
case Intrinsic::x86_sse2_psrli_w:
7657
NewIntNo = Intrinsic::x86_sse2_psrl_w;
7659
case Intrinsic::x86_sse2_psrli_d:
7660
NewIntNo = Intrinsic::x86_sse2_psrl_d;
7662
case Intrinsic::x86_sse2_psrli_q:
7663
NewIntNo = Intrinsic::x86_sse2_psrl_q;
7665
case Intrinsic::x86_sse2_psrai_w:
7666
NewIntNo = Intrinsic::x86_sse2_psra_w;
7668
case Intrinsic::x86_sse2_psrai_d:
7669
NewIntNo = Intrinsic::x86_sse2_psra_d;
7672
ShAmtVT = MVT::v2i32;
7674
case Intrinsic::x86_mmx_pslli_w:
7675
NewIntNo = Intrinsic::x86_mmx_psll_w;
7677
case Intrinsic::x86_mmx_pslli_d:
7678
NewIntNo = Intrinsic::x86_mmx_psll_d;
7680
case Intrinsic::x86_mmx_pslli_q:
7681
NewIntNo = Intrinsic::x86_mmx_psll_q;
7683
case Intrinsic::x86_mmx_psrli_w:
7684
NewIntNo = Intrinsic::x86_mmx_psrl_w;
7686
case Intrinsic::x86_mmx_psrli_d:
7687
NewIntNo = Intrinsic::x86_mmx_psrl_d;
7689
case Intrinsic::x86_mmx_psrli_q:
7690
NewIntNo = Intrinsic::x86_mmx_psrl_q;
7692
case Intrinsic::x86_mmx_psrai_w:
7693
NewIntNo = Intrinsic::x86_mmx_psra_w;
7695
case Intrinsic::x86_mmx_psrai_d:
7696
NewIntNo = Intrinsic::x86_mmx_psra_d;
7698
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
7704
// The vector shift intrinsics with scalar shift amounts use 32-bit values,
// but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
// to zero.
7709
ShOps[1] = DAG.getConstant(0, MVT::i32);
7710
if (ShAmtVT == MVT::v4i32) {
7711
ShOps[2] = DAG.getUNDEF(MVT::i32);
7712
ShOps[3] = DAG.getUNDEF(MVT::i32);
7713
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
7715
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
7718
EVT VT = Op.getValueType();
7719
ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
7720
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7721
DAG.getConstant(NewIntNo, MVT::i32),
7722
Op.getOperand(1), ShAmt);
7727
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
7728
SelectionDAG &DAG) const {
7729
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7730
MFI->setReturnAddressIsTaken(true);
7732
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7733
DebugLoc dl = Op.getDebugLoc();
7736
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7738
DAG.getConstant(TD->getPointerSize(),
7739
Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7740
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7741
DAG.getNode(ISD::ADD, dl, getPointerTy(),
7743
NULL, 0, false, false, 0);
7746
// Just load the return address.
7747
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7748
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7749
RetAddrFI, NULL, 0, false, false, 0);
7752
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
7753
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7754
MFI->setFrameAddressIsTaken(true);
7756
EVT VT = Op.getValueType();
7757
DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
7758
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7759
unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7760
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7762
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7767
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7768
SelectionDAG &DAG) const {
7769
return DAG.getIntPtrConstant(2*TD->getPointerSize());
7772
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
7773
MachineFunction &MF = DAG.getMachineFunction();
7774
SDValue Chain = Op.getOperand(0);
7775
SDValue Offset = Op.getOperand(1);
7776
SDValue Handler = Op.getOperand(2);
7777
DebugLoc dl = Op.getDebugLoc();
7779
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
7780
Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7782
unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
7784
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
7785
DAG.getIntPtrConstant(TD->getPointerSize()));
7786
StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
7787
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
7788
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
7789
MF.getRegInfo().addLiveOut(StoreAddrReg);
7791
return DAG.getNode(X86ISD::EH_RETURN, dl,
7793
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
7796
SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
7797
SelectionDAG &DAG) const {
7798
SDValue Root = Op.getOperand(0);
7799
SDValue Trmp = Op.getOperand(1); // trampoline
7800
SDValue FPtr = Op.getOperand(2); // nested function
7801
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7802
DebugLoc dl = Op.getDebugLoc();
7804
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7806
if (Subtarget->is64Bit()) {
7807
SDValue OutChains[6];
7809
// Large code-model.
7810
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
7811
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
7813
const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
7814
const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
7816
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
7818
// Load the pointer to the nested function into R11.
7819
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
7820
SDValue Addr = Trmp;
7821
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7822
Addr, TrmpAddr, 0, false, false, 0);
7824
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7825
DAG.getConstant(2, MVT::i64));
7826
OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
7829
// Load the 'nest' parameter value into R10.
7830
// R10 is specified in X86CallingConv.td
7831
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
7832
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7833
DAG.getConstant(10, MVT::i64));
7834
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7835
Addr, TrmpAddr, 10, false, false, 0);
7837
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7838
DAG.getConstant(12, MVT::i64));
7839
OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
7842
// Jump to the nested function.
7843
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
7844
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7845
DAG.getConstant(20, MVT::i64));
7846
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7847
Addr, TrmpAddr, 20, false, false, 0);
7849
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
7850
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7851
DAG.getConstant(22, MVT::i64));
7852
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
7853
TrmpAddr, 22, false, false, 0);
7856
{ Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
7857
return DAG.getMergeValues(Ops, 2, dl);
7859
const Function *Func =
7860
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7861
CallingConv::ID CC = Func->getCallingConv();
7866
llvm_unreachable("Unsupported calling convention");
7867
case CallingConv::C:
7868
case CallingConv::X86_StdCall: {
7869
// Pass 'nest' parameter in ECX.
7870
// Must be kept in sync with X86CallingConv.td
7873
// Check that ECX wasn't needed by an 'inreg' parameter.
7874
const FunctionType *FTy = Func->getFunctionType();
7875
const AttrListPtr &Attrs = Func->getAttributes();
7877
if (!Attrs.isEmpty() && !Func->isVarArg()) {
7878
unsigned InRegCount = 0;
7881
for (FunctionType::param_iterator I = FTy->param_begin(),
7882
E = FTy->param_end(); I != E; ++I, ++Idx)
7883
if (Attrs.paramHasAttr(Idx, Attribute::InReg))
7884
// FIXME: should only count parameters that are lowered to integers.
7885
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
7887
if (InRegCount > 2) {
7888
report_fatal_error("Nest register in use - reduce number of inreg"
7894
case CallingConv::X86_FastCall:
7895
case CallingConv::X86_ThisCall:
7896
case CallingConv::Fast:
7897
// Pass 'nest' parameter in EAX.
7898
// Must be kept in sync with X86CallingConv.td
7903
SDValue OutChains[4];
7906
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7907
DAG.getConstant(10, MVT::i32));
7908
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
7910
// This is storing the opcode for MOV32ri.
7911
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
7912
const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
7913
OutChains[0] = DAG.getStore(Root, dl,
7914
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
7915
Trmp, TrmpAddr, 0, false, false, 0);
7917
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7918
DAG.getConstant(1, MVT::i32));
7919
OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
7922
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
7923
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7924
DAG.getConstant(5, MVT::i32));
7925
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7926
TrmpAddr, 5, false, false, 1);
7928
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7929
DAG.getConstant(6, MVT::i32));
7930
OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7934
{ Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7935
return DAG.getMergeValues(Ops, 2, dl);
7939
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
7940
SelectionDAG &DAG) const {
7942
// The rounding mode is in bits 11:10 of FPSR, and has the following
// settings. FLT_ROUNDS, on the other hand, expects a different encoding.
// To perform the conversion, we do:
//   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
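// The mapping this expression produces (x87 RC bits -> FLT_ROUNDS value) is:
//   00 (round to nearest)       -> 1
//   01 (round toward -infinity) -> 3
//   10 (round toward +infinity) -> 2
//   11 (round toward zero)      -> 0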
7960
MachineFunction &MF = DAG.getMachineFunction();
7961
const TargetMachine &TM = MF.getTarget();
7962
const TargetFrameInfo &TFI = *TM.getFrameInfo();
7963
unsigned StackAlignment = TFI.getStackAlignment();
7964
EVT VT = Op.getValueType();
7965
DebugLoc dl = Op.getDebugLoc();
7967
// Save FP Control Word to stack slot
7968
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7969
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7971
SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7972
DAG.getEntryNode(), StackSlot);
7974
// Load FP Control Word from stack slot
7975
SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7978
// Transform as necessary
7980
DAG.getNode(ISD::SRL, dl, MVT::i16,
7981
DAG.getNode(ISD::AND, dl, MVT::i16,
7982
CWD, DAG.getConstant(0x800, MVT::i16)),
7983
DAG.getConstant(11, MVT::i8));
7985
DAG.getNode(ISD::SRL, dl, MVT::i16,
7986
DAG.getNode(ISD::AND, dl, MVT::i16,
7987
CWD, DAG.getConstant(0x400, MVT::i16)),
7988
DAG.getConstant(9, MVT::i8));
7991
DAG.getNode(ISD::AND, dl, MVT::i16,
7992
DAG.getNode(ISD::ADD, dl, MVT::i16,
7993
DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7994
DAG.getConstant(1, MVT::i16)),
7995
DAG.getConstant(3, MVT::i16));
7998
return DAG.getNode((VT.getSizeInBits() < 16 ?
7999
ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
8002
SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
8003
EVT VT = Op.getValueType();
8005
unsigned NumBits = VT.getSizeInBits();
8006
DebugLoc dl = Op.getDebugLoc();
8008
Op = Op.getOperand(0);
8009
if (VT == MVT::i8) {
8010
// Zero extend to i32 since there is not an i8 bsr.
8012
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
8015
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
8016
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
8017
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
8019
// If src is zero (i.e. bsr sets ZF), returns NumBits.
8022
DAG.getConstant(NumBits+NumBits-1, OpVT),
8023
DAG.getConstant(X86::COND_E, MVT::i8),
8026
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
8028
// Finally xor with NumBits-1.
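// For example, a 32-bit value whose highest set bit is bit 23 makes BSR
// produce 23, and 23 ^ 31 == 8, the number of leading zeros.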
8029
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
8032
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
8036
SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
8037
EVT VT = Op.getValueType();
8039
unsigned NumBits = VT.getSizeInBits();
8040
DebugLoc dl = Op.getDebugLoc();
8042
Op = Op.getOperand(0);
8043
if (VT == MVT::i8) {
8045
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
8048
// Issue a bsf (scan bits forward) which also sets EFLAGS.
8049
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
8050
Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
8052
// If src is zero (i.e. bsf sets ZF), returns NumBits.
8055
DAG.getConstant(NumBits, OpVT),
8056
DAG.getConstant(X86::COND_E, MVT::i8),
8059
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
8062
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
8066
SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
8067
EVT VT = Op.getValueType();
8068
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
8069
DebugLoc dl = Op.getDebugLoc();
8071
// ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
8072
// ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
8073
// ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
8074
// ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
8075
// ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
8077
// AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
8078
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
8079
// return AloBlo + AloBhi + AhiBlo;
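// Per 64-bit lane this computes (a*b) mod 2^64: with a = a_lo + 2^32*a_hi and
// b = b_lo + 2^32*b_hi, the product is a_lo*b_lo + 2^32*(a_lo*b_hi + a_hi*b_lo)
// (the 2^64 term vanishes), and PMULUDQ supplies each 32x32->64 partial product.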
8081
SDValue A = Op.getOperand(0);
8082
SDValue B = Op.getOperand(1);
8084
SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8085
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
8086
A, DAG.getConstant(32, MVT::i32));
8087
SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8088
DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
8089
B, DAG.getConstant(32, MVT::i32));
8090
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8091
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
8093
SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8094
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
8096
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8097
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
8099
AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8100
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
8101
AloBhi, DAG.getConstant(32, MVT::i32));
8102
AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8103
DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
8104
AhiBlo, DAG.getConstant(32, MVT::i32));
8105
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
8106
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
8110
SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
8111
EVT VT = Op.getValueType();
8112
DebugLoc dl = Op.getDebugLoc();
8113
SDValue R = Op.getOperand(0);
8115
LLVMContext *Context = DAG.getContext();
8117
assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
8119
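// For v4i32, build the power-of-two multiplier in floating point: shift each
// shift amount into the exponent field (<< 23), add the bit pattern of 1.0f
// (0x3f800000) so each lane holds the float 2^amt, convert back to integer,
// and multiply, since x << amt == x * 2^amt.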
if (VT == MVT::v4i32) {
8120
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8121
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
8122
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
8124
ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
8126
std::vector<Constant*> CV(4, CI);
8127
Constant *C = ConstantVector::get(CV);
8128
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8129
SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8130
PseudoSourceValue::getConstantPool(), 0,
8133
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
8134
Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
8135
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
8136
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
8138
if (VT == MVT::v16i8) {
8140
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8141
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
8142
Op.getOperand(1), DAG.getConstant(5, MVT::i32));
8144
ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
8145
ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
8147
std::vector<Constant*> CVM1(16, CM1);
8148
std::vector<Constant*> CVM2(16, CM2);
8149
Constant *C = ConstantVector::get(CVM1);
8150
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8151
SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8152
PseudoSourceValue::getConstantPool(), 0,
8155
// r = pblendv(r, psllw(r & (char16)15, 4), a);
8156
M = DAG.getNode(ISD::AND, dl, VT, R, M);
8157
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8158
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
8159
DAG.getConstant(4, MVT::i32));
8160
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8161
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
8164
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
8166
C = ConstantVector::get(CVM2);
8167
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
8168
M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
8169
PseudoSourceValue::getConstantPool(), 0, false, false, 16);
8171
// r = pblendv(r, psllw(r & (char16)63, 2), a);
8172
M = DAG.getNode(ISD::AND, dl, VT, R, M);
8173
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8174
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
8175
DAG.getConstant(2, MVT::i32));
8176
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8177
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
8180
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
8182
// return pblendv(r, r+r, a);
8183
R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
8184
DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
8185
R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
8191
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
8192
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
8193
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
8194
// looks for this combo and may remove the "setcc" instruction if the "setcc"
8195
// has only one use.
8196
SDNode *N = Op.getNode();
8197
SDValue LHS = N->getOperand(0);
8198
SDValue RHS = N->getOperand(1);
8199
unsigned BaseOp = 0;
8201
DebugLoc dl = Op.getDebugLoc();
8203
switch (Op.getOpcode()) {
8204
default: llvm_unreachable("Unknown ovf instruction!");
8206
// An add of one will be selected as an INC. Note that INC doesn't
8207
// set CF, so we can't do this for UADDO.
8208
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
8209
if (C->getAPIntValue() == 1) {
8210
BaseOp = X86ISD::INC;
8214
BaseOp = X86ISD::ADD;
8218
BaseOp = X86ISD::ADD;
8222
// A subtract of one will be selected as a DEC. Note that DEC doesn't
8223
// set CF, so we can't do this for USUBO.
8224
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
8225
if (C->getAPIntValue() == 1) {
8226
BaseOp = X86ISD::DEC;
8230
BaseOp = X86ISD::SUB;
8234
BaseOp = X86ISD::SUB;
8238
BaseOp = X86ISD::SMUL;
8242
BaseOp = X86ISD::UMUL;
8247
// Also sets EFLAGS.
8248
SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
8249
SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
8252
DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
8253
DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
8255
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
8259
SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->hasSSE2()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0,
                                   Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res =
      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
                         array_lengthof(Ops));
    return SDValue(Res, 0);
  }

  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  if (!isDev)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
  if (!Op1 && !Op2 && !Op3 && Op4)
    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
  if (Op1 && !Op2 && !Op3 && !Op4)
    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
  //           (MFENCE)>;
  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  EVT T = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
  return cpOut;
}
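
// Illustrative note (not part of the original source): for a 32-bit
//   %old = cmpxchg i32* %p, i32 %cmp, i32 %new
// the node built above corresponds roughly to
//   copy EAX <- %cmp
//   lock cmpxchgl %new, (%p)          ; X86ISD::LCMPXCHG_DAG
//   copy %old <- EAX
// with AL/AX/RAX standing in for EAX at the i8/i16/i64 widths.
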
SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}
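
// Illustrative note (not part of the original source): RDTSC delivers the
// counter split across EDX:EAX even in 64-bit mode, so the code above
// reassembles the i64 result as roughly (RDX << 32) | RAX.
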
SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
                                            SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  EVT DstVT = Op.getValueType();
  assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && !DisableMMX) &&
         "Unexpected custom BIT_CONVERT");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BIT_CONVERT");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SHL:                return LowerSHL(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
  }
}
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}
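
// Illustrative note (not part of the original source): an i64 atomic RMW on a
// 32-bit target is split here into two i32 halves, e.g.
//   In2L = EXTRACT_ELEMENT %val, 0      ; low 32 bits
//   In2H = EXTRACT_ELEMENT %val, 1      ; high 32 bits
// The ATOMxxx64_DAG node consumes the halves (it is later expanded into a
// CMPXCHG8B loop by the custom inserters below), and BUILD_PAIR glues the two
// i32 results back into an i64.
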
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}
8565
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
8567
default: return NULL;
8568
case X86ISD::BSF: return "X86ISD::BSF";
8569
case X86ISD::BSR: return "X86ISD::BSR";
8570
case X86ISD::SHLD: return "X86ISD::SHLD";
8571
case X86ISD::SHRD: return "X86ISD::SHRD";
8572
case X86ISD::FAND: return "X86ISD::FAND";
8573
case X86ISD::FOR: return "X86ISD::FOR";
8574
case X86ISD::FXOR: return "X86ISD::FXOR";
8575
case X86ISD::FSRL: return "X86ISD::FSRL";
8576
case X86ISD::FILD: return "X86ISD::FILD";
8577
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
8578
case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
8579
case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
8580
case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
8581
case X86ISD::FLD: return "X86ISD::FLD";
8582
case X86ISD::FST: return "X86ISD::FST";
8583
case X86ISD::CALL: return "X86ISD::CALL";
8584
case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
8585
case X86ISD::BT: return "X86ISD::BT";
8586
case X86ISD::CMP: return "X86ISD::CMP";
8587
case X86ISD::COMI: return "X86ISD::COMI";
8588
case X86ISD::UCOMI: return "X86ISD::UCOMI";
8589
case X86ISD::SETCC: return "X86ISD::SETCC";
8590
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
8591
case X86ISD::CMOV: return "X86ISD::CMOV";
8592
case X86ISD::BRCOND: return "X86ISD::BRCOND";
8593
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
8594
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
8595
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
8596
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
8597
case X86ISD::Wrapper: return "X86ISD::Wrapper";
8598
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
8599
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
8600
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
8601
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
8602
case X86ISD::PINSRB: return "X86ISD::PINSRB";
8603
case X86ISD::PINSRW: return "X86ISD::PINSRW";
8604
case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
8605
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
8606
case X86ISD::FMAX: return "X86ISD::FMAX";
8607
case X86ISD::FMIN: return "X86ISD::FMIN";
8608
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
8609
case X86ISD::FRCP: return "X86ISD::FRCP";
8610
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
8611
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
8612
case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
8613
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
8614
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
8615
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
8616
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
8617
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
8618
case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
8619
case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
8620
case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
8621
case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
8622
case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
8623
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
8624
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
8625
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
8626
case X86ISD::VSHL: return "X86ISD::VSHL";
8627
case X86ISD::VSRL: return "X86ISD::VSRL";
8628
case X86ISD::CMPPD: return "X86ISD::CMPPD";
8629
case X86ISD::CMPPS: return "X86ISD::CMPPS";
8630
case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
8631
case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
8632
case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
8633
case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
8634
case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
8635
case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
8636
case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
8637
case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
8638
case X86ISD::ADD: return "X86ISD::ADD";
8639
case X86ISD::SUB: return "X86ISD::SUB";
8640
case X86ISD::SMUL: return "X86ISD::SMUL";
8641
case X86ISD::UMUL: return "X86ISD::UMUL";
8642
case X86ISD::INC: return "X86ISD::INC";
8643
case X86ISD::DEC: return "X86ISD::DEC";
8644
case X86ISD::OR: return "X86ISD::OR";
8645
case X86ISD::XOR: return "X86ISD::XOR";
8646
case X86ISD::AND: return "X86ISD::AND";
8647
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
8648
case X86ISD::PTEST: return "X86ISD::PTEST";
8649
case X86ISD::TESTP: return "X86ISD::TESTP";
8650
case X86ISD::PALIGN: return "X86ISD::PALIGN";
8651
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
8652
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
8653
case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD";
8654
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
8655
case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD";
8656
case X86ISD::SHUFPS: return "X86ISD::SHUFPS";
8657
case X86ISD::SHUFPD: return "X86ISD::SHUFPD";
8658
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
8659
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
8660
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
8661
case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD";
8662
case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
8663
case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
8664
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
8665
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
8666
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
8667
case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD";
8668
case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
8669
case X86ISD::MOVSD: return "X86ISD::MOVSD";
8670
case X86ISD::MOVSS: return "X86ISD::MOVSS";
8671
case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS";
8672
case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD";
8673
case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS";
8674
case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD";
8675
case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW";
8676
case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD";
8677
case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ";
8678
case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ";
8679
case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW";
8680
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
8681
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
8682
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
8683
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
8684
case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}
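
// Illustrative note (not part of the original source): with the rules above an
// address such as
//   GV + BaseReg + 4*IndexReg + Disp     (e.g. movl GV+8(%ebx,%esi,4), %eax)
// is one legal addressing mode, scales of 3, 5 and 9 are accepted only when no
// base register is present (they are formed as basereg + scalereg), and any
// other scale is rejected.
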
bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT)  ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
8826
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
8827
MachineBasicBlock *MBB,
8834
TargetRegisterClass *RC,
8835
bool invSrc) const {
8836
// For the atomic bitwise operator, we generate
8839
// ld t1 = [bitinstr.addr]
8840
// op t2 = t1, [bitinstr.val]
8842
// lcs dest = [bitinstr.addr], t2 [EAX is implicit]
8844
// fallthrough -->nextMBB
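  //
  // Illustrative sketch (not part of the original source), for the ATOMAND32
  // case; the t* names are virtual registers:
  //   newMBB:
  //     t1  = MOV32rm [bitinstr.addr]
  //     t2  = AND32rr t1, bitinstr.val
  //     EAX = t1
  //     LCMPXCHG32 [bitinstr.addr], t2   ; swaps only if memory still == EAX
  //     JNE newMBB                       ; otherwise retry the whole update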
8845
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8846
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8847
MachineFunction::iterator MBBIter = MBB;
8850
/// First build the CFG
8851
MachineFunction *F = MBB->getParent();
8852
MachineBasicBlock *thisMBB = MBB;
8853
MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8854
MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8855
F->insert(MBBIter, newMBB);
8856
F->insert(MBBIter, nextMBB);
8858
// Transfer the remainder of thisMBB and its successor edges to nextMBB.
8859
nextMBB->splice(nextMBB->begin(), thisMBB,
8860
llvm::next(MachineBasicBlock::iterator(bInstr)),
8862
nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8864
// Update thisMBB to fall through to newMBB
8865
thisMBB->addSuccessor(newMBB);
8867
  // newMBB jumps to itself and falls through to nextMBB
8868
newMBB->addSuccessor(nextMBB);
8869
newMBB->addSuccessor(newMBB);
8871
// Insert instructions into newMBB based on incoming instruction
8872
assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8873
"unexpected number of operands");
8874
DebugLoc dl = bInstr->getDebugLoc();
8875
MachineOperand& destOper = bInstr->getOperand(0);
8876
MachineOperand* argOpers[2 + X86::AddrNumOperands];
8877
int numArgs = bInstr->getNumOperands() - 1;
8878
for (int i=0; i < numArgs; ++i)
8879
argOpers[i] = &bInstr->getOperand(i+1);
8881
// x86 address has 4 operands: base, index, scale, and displacement
8882
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
8883
int valArgIndx = lastAddrIndx + 1;
8885
unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8886
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8887
for (int i=0; i <= lastAddrIndx; ++i)
8888
(*MIB).addOperand(*argOpers[i]);
8890
unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8892
MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8897
unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8898
assert((argOpers[valArgIndx]->isReg() ||
8899
argOpers[valArgIndx]->isImm()) &&
8901
if (argOpers[valArgIndx]->isReg())
8902
MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8904
MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8906
(*MIB).addOperand(*argOpers[valArgIndx]);
8908
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
8911
MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8912
for (int i=0; i <= lastAddrIndx; ++i)
8913
(*MIB).addOperand(*argOpers[i]);
8915
assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8916
(*MIB).setMemRefs(bInstr->memoperands_begin(),
8917
bInstr->memoperands_end());
8919
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8923
BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8925
bInstr->eraseFromParent(); // The pseudo instruction is gone now.
8929
// private utility function: 64-bit atomics on a 32-bit host.
8931
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8932
MachineBasicBlock *MBB,
8937
bool invSrc) const {
8938
// For the atomic bitwise operator, we generate
8939
// thisMBB (instructions are in pairs, except cmpxchg8b)
8940
// ld t1,t2 = [bitinstr.addr]
8942
// out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8943
// op t5, t6 <- out1, out2, [bitinstr.val]
8944
// (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
8945
// mov ECX, EBX <- t5, t6
8946
// mov EAX, EDX <- t1, t2
8947
// cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
8948
// mov t3, t4 <- EAX, EDX
8950
// result in out1, out2
8951
// fallthrough -->nextMBB
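  //
  // Illustrative sketch (not part of the original source), for ATOMAND6432:
  //   EDX:EAX holds the value currently in memory (t1/t2, then the PHIs),
  //   ECX:EBX holds the desired new value (t5/t6), and LCMPXCHG8B [addr]
  //   swaps in ECX:EBX only if memory still equals EDX:EAX; the loop repeats
  //   via JNE until the exchange succeeds.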
8953
const TargetRegisterClass *RC = X86::GR32RegisterClass;
8954
const unsigned LoadOpc = X86::MOV32rm;
8955
const unsigned NotOpc = X86::NOT32r;
8956
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8957
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8958
MachineFunction::iterator MBBIter = MBB;
8961
/// First build the CFG
8962
MachineFunction *F = MBB->getParent();
8963
MachineBasicBlock *thisMBB = MBB;
8964
MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8965
MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8966
F->insert(MBBIter, newMBB);
8967
F->insert(MBBIter, nextMBB);
8969
// Transfer the remainder of thisMBB and its successor edges to nextMBB.
8970
nextMBB->splice(nextMBB->begin(), thisMBB,
8971
llvm::next(MachineBasicBlock::iterator(bInstr)),
8973
nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8975
// Update thisMBB to fall through to newMBB
8976
thisMBB->addSuccessor(newMBB);
8978
  // newMBB jumps to itself and falls through to nextMBB
8979
newMBB->addSuccessor(nextMBB);
8980
newMBB->addSuccessor(newMBB);
8982
DebugLoc dl = bInstr->getDebugLoc();
8983
// Insert instructions into newMBB based on incoming instruction
8984
// There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8985
assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
8986
"unexpected number of operands");
8987
MachineOperand& dest1Oper = bInstr->getOperand(0);
8988
MachineOperand& dest2Oper = bInstr->getOperand(1);
8989
MachineOperand* argOpers[2 + X86::AddrNumOperands];
8990
for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
8991
argOpers[i] = &bInstr->getOperand(i+2);
8993
// We use some of the operands multiple times, so conservatively just
8994
// clear any kill flags that might be present.
8995
if (argOpers[i]->isReg() && argOpers[i]->isUse())
8996
argOpers[i]->setIsKill(false);
8999
// x86 address has 5 operands: base, index, scale, displacement, and segment.
9000
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
9002
unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
9003
MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
9004
for (int i=0; i <= lastAddrIndx; ++i)
9005
(*MIB).addOperand(*argOpers[i]);
9006
unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
9007
MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
9008
// add 4 to displacement.
9009
for (int i=0; i <= lastAddrIndx-2; ++i)
9010
(*MIB).addOperand(*argOpers[i]);
9011
MachineOperand newOp3 = *(argOpers[3]);
9013
newOp3.setImm(newOp3.getImm()+4);
9015
newOp3.setOffset(newOp3.getOffset()+4);
9016
(*MIB).addOperand(newOp3);
9017
(*MIB).addOperand(*argOpers[lastAddrIndx]);
9019
// t3/4 are defined later, at the bottom of the loop
9020
unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
9021
unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
9022
BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
9023
.addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
9024
BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
9025
.addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
9027
  // The subsequent operations should use the destination registers of
  // the PHI instructions.
9030
t1 = F->getRegInfo().createVirtualRegister(RC);
9031
t2 = F->getRegInfo().createVirtualRegister(RC);
9032
MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
9033
MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
9035
t1 = dest1Oper.getReg();
9036
t2 = dest2Oper.getReg();
9039
int valArgIndx = lastAddrIndx + 1;
9040
assert((argOpers[valArgIndx]->isReg() ||
9041
argOpers[valArgIndx]->isImm()) &&
9043
unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
9044
unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
9045
if (argOpers[valArgIndx]->isReg())
9046
MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
9048
MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
9049
if (regOpcL != X86::MOV32rr)
9051
(*MIB).addOperand(*argOpers[valArgIndx]);
9052
assert(argOpers[valArgIndx + 1]->isReg() ==
9053
argOpers[valArgIndx]->isReg());
9054
assert(argOpers[valArgIndx + 1]->isImm() ==
9055
argOpers[valArgIndx]->isImm());
9056
if (argOpers[valArgIndx + 1]->isReg())
9057
MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
9059
MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
9060
if (regOpcH != X86::MOV32rr)
9062
(*MIB).addOperand(*argOpers[valArgIndx + 1]);
9064
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
9066
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
9069
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
9071
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
9074
MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
9075
for (int i=0; i <= lastAddrIndx; ++i)
9076
(*MIB).addOperand(*argOpers[i]);
9078
assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
9079
(*MIB).setMemRefs(bInstr->memoperands_begin(),
9080
bInstr->memoperands_end());
9082
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
9083
MIB.addReg(X86::EAX);
9084
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
9085
MIB.addReg(X86::EDX);
9088
BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
9090
bInstr->eraseFromParent(); // The pseudo instruction is gone now.
9094
// private utility function
9096
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
9097
MachineBasicBlock *MBB,
9098
unsigned cmovOpc) const {
9099
// For the atomic min/max operator, we generate
9102
// ld t1 = [min/max.addr]
9103
// mov t2 = [min/max.val]
9105
// cmov[cond] t2 = t1
9107
// lcs dest = [bitinstr.addr], t2 [EAX is implicit]
9109
// fallthrough -->nextMBB
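  //
  // Illustrative sketch (not part of the original source), for ATOMMIN32
  // (cmovOpc == X86::CMOVL32rr); the t* names are virtual registers:
  //   newMBB:
  //     t1  = MOV32rm [min/max.addr]
  //     t2  = min/max.val
  //     CMP32rr t1, t2
  //     t3  = CMOVL32rr t2, t1           ; t3 = min(t1, t2)
  //     EAX = t1
  //     LCMPXCHG32 [min/max.addr], t3
  //     JNE newMBB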
9111
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9112
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
9113
MachineFunction::iterator MBBIter = MBB;
9116
/// First build the CFG
9117
MachineFunction *F = MBB->getParent();
9118
MachineBasicBlock *thisMBB = MBB;
9119
MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
9120
MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
9121
F->insert(MBBIter, newMBB);
9122
F->insert(MBBIter, nextMBB);
9124
// Transfer the remainder of thisMBB and its successor edges to nextMBB.
9125
nextMBB->splice(nextMBB->begin(), thisMBB,
9126
llvm::next(MachineBasicBlock::iterator(mInstr)),
9128
nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
9130
// Update thisMBB to fall through to newMBB
9131
thisMBB->addSuccessor(newMBB);
9133
  // newMBB jumps to itself and falls through to nextMBB
9134
newMBB->addSuccessor(nextMBB);
9135
newMBB->addSuccessor(newMBB);
9137
DebugLoc dl = mInstr->getDebugLoc();
9138
// Insert instructions into newMBB based on incoming instruction
9139
assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
9140
"unexpected number of operands");
9141
MachineOperand& destOper = mInstr->getOperand(0);
9142
MachineOperand* argOpers[2 + X86::AddrNumOperands];
9143
int numArgs = mInstr->getNumOperands() - 1;
9144
for (int i=0; i < numArgs; ++i)
9145
argOpers[i] = &mInstr->getOperand(i+1);
9147
// x86 address has 4 operands: base, index, scale, and displacement
9148
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
9149
int valArgIndx = lastAddrIndx + 1;
9151
unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
9152
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
9153
for (int i=0; i <= lastAddrIndx; ++i)
9154
(*MIB).addOperand(*argOpers[i]);
9156
// We only support register and immediate values
9157
assert((argOpers[valArgIndx]->isReg() ||
9158
argOpers[valArgIndx]->isImm()) &&
9161
unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
9162
if (argOpers[valArgIndx]->isReg())
9163
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
9165
MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
9166
(*MIB).addOperand(*argOpers[valArgIndx]);
9168
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
9171
MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
9176
unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
9177
MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
9181
// Cmp and exchange if none has modified the memory location
9182
MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
9183
for (int i=0; i <= lastAddrIndx; ++i)
9184
(*MIB).addOperand(*argOpers[i]);
9186
assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
9187
(*MIB).setMemRefs(mInstr->memoperands_begin(),
9188
mInstr->memoperands_end());
9190
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
9191
MIB.addReg(X86::EAX);
9194
BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
9196
mInstr->eraseFromParent(); // The pseudo instruction is gone now.
9200
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
9201
// or XMM0_V32I8 in AVX all of this code can be replaced with that
9204
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
9205
unsigned numArgs, bool memArg) const {
9207
assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
9208
"Target must have SSE4.2 or AVX features enabled");
9210
DebugLoc dl = MI->getDebugLoc();
9211
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9215
if (!Subtarget->hasAVX()) {
9217
Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
9219
Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
9222
Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
9224
Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
9227
MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
9229
for (unsigned i = 0; i < numArgs; ++i) {
9230
MachineOperand &Op = MI->getOperand(i+1);
9232
if (!(Op.isReg() && Op.isImplicit()))
9236
BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
9239
MI->eraseFromParent();
9245
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
9247
MachineBasicBlock *MBB) const {
9248
// Emit code to save XMM registers to the stack. The ABI says that the
9249
// number of registers to save is given in %al, so it's theoretically
9250
// possible to do an indirect jump trick to avoid saving all of them,
9251
// however this code takes a simpler approach and just executes all
9252
// of the stores if %al is non-zero. It's less code, and it's probably
9253
// easier on the hardware branch predictor, and stores aren't all that
9254
// expensive anyway.
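  //
  // Rough sketch of the emitted code (not part of the original source; the
  // offsets depend on the frame layout):
  //     testb %al, %al
  //     je    .LEndMBB               ; no vector arguments were passed
  //     movaps %xmm0, off+0(%rsp)
  //     ...
  //     movaps %xmm7, off+112(%rsp)
  //   .LEndMBB: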
9256
// Create the new basic blocks. One block contains all the XMM stores,
9257
// and one block is the final destination regardless of whether any
9258
// stores were performed.
9259
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
9260
MachineFunction *F = MBB->getParent();
9261
MachineFunction::iterator MBBIter = MBB;
9263
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
9264
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
9265
F->insert(MBBIter, XMMSaveMBB);
9266
F->insert(MBBIter, EndMBB);
9268
// Transfer the remainder of MBB and its successor edges to EndMBB.
9269
EndMBB->splice(EndMBB->begin(), MBB,
9270
llvm::next(MachineBasicBlock::iterator(MI)),
9272
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
9274
// The original block will now fall through to the XMM save block.
9275
MBB->addSuccessor(XMMSaveMBB);
9276
// The XMMSaveMBB will fall through to the end block.
9277
XMMSaveMBB->addSuccessor(EndMBB);
9279
// Now add the instructions.
9280
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9281
DebugLoc DL = MI->getDebugLoc();
9283
unsigned CountReg = MI->getOperand(0).getReg();
9284
int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
9285
int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
9287
if (!Subtarget->isTargetWin64()) {
9288
// If %al is 0, branch around the XMM save block.
9289
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
9290
BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
9291
MBB->addSuccessor(EndMBB);
9294
// In the XMM save block, save all the XMM argument registers.
9295
for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
9296
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
9297
MachineMemOperand *MMO =
9298
F->getMachineMemOperand(
9299
PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
9300
MachineMemOperand::MOStore, Offset,
9301
/*Size=*/16, /*Align=*/16);
9302
BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
9303
.addFrameIndex(RegSaveFrameIndex)
9304
.addImm(/*Scale=*/1)
9305
.addReg(/*IndexReg=*/0)
9306
.addImm(/*Disp=*/Offset)
9307
.addReg(/*Segment=*/0)
9308
.addReg(MI->getOperand(i).getReg())
9309
.addMemOperand(MMO);
9312
MI->eraseFromParent(); // The pseudo instruction is gone now.
9318
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
9319
MachineBasicBlock *BB) const {
9320
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9321
DebugLoc DL = MI->getDebugLoc();
9323
// To "insert" a SELECT_CC instruction, we actually have to insert the
9324
// diamond control-flow pattern. The incoming instruction knows the
9325
// destination vreg to set, the condition code register to branch on, the
9326
// true/false values to select between, and a branch opcode to use.
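  //
  // Sketch of the resulting CFG (not part of the original source):
  //   thisMBB:
  //     ...
  //     jCC sinkMBB                 ; take the "true" value from thisMBB
  //   copy0MBB:                     ; "false" path, falls through
  //   sinkMBB:
  //     %dst = phi [ %false, copy0MBB ], [ %true, thisMBB ]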
9327
const BasicBlock *LLVM_BB = BB->getBasicBlock();
9328
MachineFunction::iterator It = BB;
9334
// cmpTY ccX, r1, r2
9336
// fallthrough --> copy0MBB
9337
MachineBasicBlock *thisMBB = BB;
9338
MachineFunction *F = BB->getParent();
9339
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
9340
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
9341
F->insert(It, copy0MBB);
9342
F->insert(It, sinkMBB);
9344
// If the EFLAGS register isn't dead in the terminator, then claim that it's
9345
// live into the sink and copy blocks.
9346
const MachineFunction *MF = BB->getParent();
9347
const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
9348
BitVector ReservedRegs = TRI->getReservedRegs(*MF);
9350
for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
9351
const MachineOperand &MO = MI->getOperand(I);
9352
if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
9353
unsigned Reg = MO.getReg();
9354
if (Reg != X86::EFLAGS) continue;
9355
copy0MBB->addLiveIn(Reg);
9356
sinkMBB->addLiveIn(Reg);
9359
// Transfer the remainder of BB and its successor edges to sinkMBB.
9360
sinkMBB->splice(sinkMBB->begin(), BB,
9361
llvm::next(MachineBasicBlock::iterator(MI)),
9363
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
9365
// Add the true and fallthrough blocks as its successors.
9366
BB->addSuccessor(copy0MBB);
9367
BB->addSuccessor(sinkMBB);
9369
// Create the conditional branch instruction.
9371
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
9372
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
9375
// %FalseValue = ...
9376
// # fallthrough to sinkMBB
9377
copy0MBB->addSuccessor(sinkMBB);
9380
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
9382
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
9383
TII->get(X86::PHI), MI->getOperand(0).getReg())
9384
.addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
9385
.addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
9387
MI->eraseFromParent(); // The pseudo instruction is gone now.
9392
X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
9393
MachineBasicBlock *BB) const {
9394
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9395
DebugLoc DL = MI->getDebugLoc();
9397
// The lowering is pretty easy: we're just emitting the call to _alloca. The
9398
// non-trivial part is impdef of ESP.
9399
// FIXME: The code should be tweaked as soon as we'll try to do codegen for
9402
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
9403
.addExternalSymbol("_alloca")
9404
.addReg(X86::EAX, RegState::Implicit)
9405
.addReg(X86::ESP, RegState::Implicit)
9406
.addReg(X86::EAX, RegState::Define | RegState::Implicit)
9407
.addReg(X86::ESP, RegState::Define | RegState::Implicit)
9408
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
9410
MI->eraseFromParent(); // The pseudo instruction is gone now.
9415
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
9416
MachineBasicBlock *BB) const {
9417
// This is pretty easy. We're taking the value that we received from
9418
// our load from the relocation, sticking it in either RDI (x86-64)
9419
// or EAX and doing an indirect call. The return value will then
9420
// be in the normal return register.
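  //
  // Illustrative note (not part of the original source): on x86-64 Darwin the
  // emitted sequence is roughly
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)                 ; TLV get-address stub, result in %rax
  // and the 32-bit paths do the same through %eax, with or without a PIC base.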
9421
const X86InstrInfo *TII
9422
= static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
9423
DebugLoc DL = MI->getDebugLoc();
9424
MachineFunction *F = BB->getParent();
9425
bool IsWin64 = Subtarget->isTargetWin64();
9427
assert(MI->getOperand(3).isGlobal() && "This should be a global");
9429
if (Subtarget->is64Bit()) {
9430
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
9431
TII->get(X86::MOV64rm), X86::RDI)
9433
.addImm(0).addReg(0)
9434
.addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
9435
MI->getOperand(3).getTargetFlags())
9437
MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? X86::WINCALL64m : X86::CALL64m));
9438
addDirectMem(MIB, X86::RDI);
9439
} else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
9440
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
9441
TII->get(X86::MOV32rm), X86::EAX)
9443
.addImm(0).addReg(0)
9444
.addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
9445
MI->getOperand(3).getTargetFlags())
9447
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
9448
addDirectMem(MIB, X86::EAX);
9450
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
9451
TII->get(X86::MOV32rm), X86::EAX)
9452
.addReg(TII->getGlobalBaseReg(F))
9453
.addImm(0).addReg(0)
9454
.addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
9455
MI->getOperand(3).getTargetFlags())
9457
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
9458
addDirectMem(MIB, X86::EAX);
9461
MI->eraseFromParent(); // The pseudo instruction is gone now.
9466
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
9467
MachineBasicBlock *BB) const {
9468
switch (MI->getOpcode()) {
9469
default: assert(false && "Unexpected instr type to insert");
9470
case X86::MINGW_ALLOCA:
9471
return EmitLoweredMingwAlloca(MI, BB);
9472
case X86::TLSCall_32:
9473
case X86::TLSCall_64:
9474
return EmitLoweredTLSCall(MI, BB);
9476
case X86::CMOV_V1I64:
9477
case X86::CMOV_FR32:
9478
case X86::CMOV_FR64:
9479
case X86::CMOV_V4F32:
9480
case X86::CMOV_V2F64:
9481
case X86::CMOV_V2I64:
9482
case X86::CMOV_GR16:
9483
case X86::CMOV_GR32:
9484
case X86::CMOV_RFP32:
9485
case X86::CMOV_RFP64:
9486
case X86::CMOV_RFP80:
9487
return EmitLoweredSelect(MI, BB);
9489
case X86::FP32_TO_INT16_IN_MEM:
9490
case X86::FP32_TO_INT32_IN_MEM:
9491
case X86::FP32_TO_INT64_IN_MEM:
9492
case X86::FP64_TO_INT16_IN_MEM:
9493
case X86::FP64_TO_INT32_IN_MEM:
9494
case X86::FP64_TO_INT64_IN_MEM:
9495
case X86::FP80_TO_INT16_IN_MEM:
9496
case X86::FP80_TO_INT32_IN_MEM:
9497
case X86::FP80_TO_INT64_IN_MEM: {
9498
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9499
DebugLoc DL = MI->getDebugLoc();
9501
// Change the floating point control register to use "round towards zero"
9502
// mode when truncating to an integer value.
9503
MachineFunction *F = BB->getParent();
9504
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
9505
addFrameReference(BuildMI(*BB, MI, DL,
9506
TII->get(X86::FNSTCW16m)), CWFrameIdx);
9508
// Load the old value of the high byte of the control word...
9510
F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
9511
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
9514
// Set the high part to be round to zero...
9515
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
9518
// Reload the modified control word now...
9519
addFrameReference(BuildMI(*BB, MI, DL,
9520
TII->get(X86::FLDCW16m)), CWFrameIdx);
9522
// Restore the memory image of control word to original value
9523
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
9526
// Get the X86 opcode to use.
9528
switch (MI->getOpcode()) {
9529
default: llvm_unreachable("illegal opcode!");
9530
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
9531
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
9532
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
9533
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
9534
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
9535
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
9536
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
9537
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
9538
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
9542
MachineOperand &Op = MI->getOperand(0);
9544
AM.BaseType = X86AddressMode::RegBase;
9545
AM.Base.Reg = Op.getReg();
9547
AM.BaseType = X86AddressMode::FrameIndexBase;
9548
AM.Base.FrameIndex = Op.getIndex();
9550
Op = MI->getOperand(1);
9552
AM.Scale = Op.getImm();
9553
Op = MI->getOperand(2);
9555
AM.IndexReg = Op.getImm();
9556
Op = MI->getOperand(3);
9557
if (Op.isGlobal()) {
9558
AM.GV = Op.getGlobal();
9560
AM.Disp = Op.getImm();
9562
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
9563
.addReg(MI->getOperand(X86::AddrNumOperands).getReg());
9565
// Reload the original control word now.
9566
addFrameReference(BuildMI(*BB, MI, DL,
9567
TII->get(X86::FLDCW16m)), CWFrameIdx);
9569
MI->eraseFromParent(); // The pseudo instruction is gone now.
9572
// String/text processing lowering.
9573
case X86::PCMPISTRM128REG:
9574
case X86::VPCMPISTRM128REG:
9575
return EmitPCMP(MI, BB, 3, false /* in-mem */);
9576
case X86::PCMPISTRM128MEM:
9577
case X86::VPCMPISTRM128MEM:
9578
return EmitPCMP(MI, BB, 3, true /* in-mem */);
9579
case X86::PCMPESTRM128REG:
9580
case X86::VPCMPESTRM128REG:
9581
return EmitPCMP(MI, BB, 5, false /* in mem */);
9582
case X86::PCMPESTRM128MEM:
9583
case X86::VPCMPESTRM128MEM:
9584
return EmitPCMP(MI, BB, 5, true /* in mem */);
9587
case X86::ATOMAND32:
9588
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
9589
X86::AND32ri, X86::MOV32rm,
9591
X86::NOT32r, X86::EAX,
9592
X86::GR32RegisterClass);
9594
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
9595
X86::OR32ri, X86::MOV32rm,
9597
X86::NOT32r, X86::EAX,
9598
X86::GR32RegisterClass);
9599
case X86::ATOMXOR32:
9600
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
9601
X86::XOR32ri, X86::MOV32rm,
9603
X86::NOT32r, X86::EAX,
9604
X86::GR32RegisterClass);
9605
case X86::ATOMNAND32:
9606
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
9607
X86::AND32ri, X86::MOV32rm,
9609
X86::NOT32r, X86::EAX,
9610
X86::GR32RegisterClass, true);
9611
case X86::ATOMMIN32:
9612
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
9613
case X86::ATOMMAX32:
9614
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
9615
case X86::ATOMUMIN32:
9616
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
9617
case X86::ATOMUMAX32:
9618
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
9620
case X86::ATOMAND16:
9621
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
9622
X86::AND16ri, X86::MOV16rm,
9624
X86::NOT16r, X86::AX,
9625
X86::GR16RegisterClass);
9627
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
9628
X86::OR16ri, X86::MOV16rm,
9630
X86::NOT16r, X86::AX,
9631
X86::GR16RegisterClass);
9632
case X86::ATOMXOR16:
9633
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
9634
X86::XOR16ri, X86::MOV16rm,
9636
X86::NOT16r, X86::AX,
9637
X86::GR16RegisterClass);
9638
case X86::ATOMNAND16:
9639
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
9640
X86::AND16ri, X86::MOV16rm,
9642
X86::NOT16r, X86::AX,
9643
X86::GR16RegisterClass, true);
9644
case X86::ATOMMIN16:
9645
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
9646
case X86::ATOMMAX16:
9647
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
9648
case X86::ATOMUMIN16:
9649
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
9650
case X86::ATOMUMAX16:
9651
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
9654
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
9655
X86::AND8ri, X86::MOV8rm,
9657
X86::NOT8r, X86::AL,
9658
X86::GR8RegisterClass);
9660
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
9661
X86::OR8ri, X86::MOV8rm,
9663
X86::NOT8r, X86::AL,
9664
X86::GR8RegisterClass);
9666
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
9667
X86::XOR8ri, X86::MOV8rm,
9669
X86::NOT8r, X86::AL,
9670
X86::GR8RegisterClass);
9671
case X86::ATOMNAND8:
9672
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
9673
X86::AND8ri, X86::MOV8rm,
9675
X86::NOT8r, X86::AL,
9676
X86::GR8RegisterClass, true);
9677
// FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
9678
// This group is for 64-bit host.
9679
case X86::ATOMAND64:
9680
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
9681
X86::AND64ri32, X86::MOV64rm,
9683
X86::NOT64r, X86::RAX,
9684
X86::GR64RegisterClass);
9686
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
9687
X86::OR64ri32, X86::MOV64rm,
9689
X86::NOT64r, X86::RAX,
9690
X86::GR64RegisterClass);
9691
case X86::ATOMXOR64:
9692
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
9693
X86::XOR64ri32, X86::MOV64rm,
9695
X86::NOT64r, X86::RAX,
9696
X86::GR64RegisterClass);
9697
case X86::ATOMNAND64:
9698
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
9699
X86::AND64ri32, X86::MOV64rm,
9701
X86::NOT64r, X86::RAX,
9702
X86::GR64RegisterClass, true);
9703
case X86::ATOMMIN64:
9704
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
9705
case X86::ATOMMAX64:
9706
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
9707
case X86::ATOMUMIN64:
9708
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
9709
case X86::ATOMUMAX64:
9710
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
9712
// This group does 64-bit operations on a 32-bit host.
9713
case X86::ATOMAND6432:
9714
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9715
X86::AND32rr, X86::AND32rr,
9716
X86::AND32ri, X86::AND32ri,
9718
case X86::ATOMOR6432:
9719
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9720
X86::OR32rr, X86::OR32rr,
9721
X86::OR32ri, X86::OR32ri,
9723
case X86::ATOMXOR6432:
9724
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9725
X86::XOR32rr, X86::XOR32rr,
9726
X86::XOR32ri, X86::XOR32ri,
9728
case X86::ATOMNAND6432:
9729
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9730
X86::AND32rr, X86::AND32rr,
9731
X86::AND32ri, X86::AND32ri,
9733
case X86::ATOMADD6432:
9734
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9735
X86::ADD32rr, X86::ADC32rr,
9736
X86::ADD32ri, X86::ADC32ri,
9738
case X86::ATOMSUB6432:
9739
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9740
X86::SUB32rr, X86::SBB32rr,
9741
X86::SUB32ri, X86::SBB32ri,
9743
case X86::ATOMSWAP6432:
9744
return EmitAtomicBit6432WithCustomInserter(MI, BB,
9745
X86::MOV32rr, X86::MOV32rr,
9746
X86::MOV32ri, X86::MOV32ri,
9748
case X86::VASTART_SAVE_XMM_REGS:
9749
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
9753
//===----------------------------------------------------------------------===//
9754
// X86 Optimization Hooks
9755
//===----------------------------------------------------------------------===//
9757
void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
9761
const SelectionDAG &DAG,
9762
unsigned Depth) const {
9763
unsigned Opc = Op.getOpcode();
9764
assert((Opc >= ISD::BUILTIN_OP_END ||
9765
Opc == ISD::INTRINSIC_WO_CHAIN ||
9766
Opc == ISD::INTRINSIC_W_CHAIN ||
9767
Opc == ISD::INTRINSIC_VOID) &&
9768
"Should use MaskedValueIsZero if you don't know whether Op"
9769
" is a target node!");
9771
KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
9783
// These nodes' second result is a boolean.
9784
if (Op.getResNo() == 0)
9788
KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
9789
Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);

  if (VT.getSizeInBits() != 128)
    return SDValue();

  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
9829
/// generation and convert it from being a bunch of shuffles and extracts
9830
/// to a simple store and scalar loads to extract the elements.
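/// For illustration (not part of the original source): a v4i32 value whose
/// four lanes all feed sign- or zero-extended extracts is spilled once to a
/// stack slot, and each extract is rewritten as an i32 load at slot + 4*index.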
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
                            0, false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
                                     OffsetVal, StackPtr);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, NULL, 0, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}
/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
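  // For example, with SSE2 an f32/f64 "x < y ? x : y" can be emitted as
  // X86ISD::FMIN (typically minss/minsd), and "x > y ? x : y" as FMAX; the
  // checks below guard the NaN and signed-zero cases where that would be
  // wrong.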
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      // Converting this to a min would handle NaNs incorrectly, and swapping
      // the operands would cause it to handle comparisons between positive
      // and negative zero incorrectly.
      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
          break;
        std::swap(LHS, RHS);
      }
      Opcode = X86ISD::FMIN;

      // Converting this to a min would handle comparisons between positive
      // and negative zero incorrectly.
      if (!UnsafeFPMath &&
          !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
        break;
      Opcode = X86ISD::FMIN;

      // Converting this to a min would handle both negative zeros and NaNs
      // incorrectly, but we can swap the operands to fix both.
      std::swap(LHS, RHS);
      Opcode = X86ISD::FMIN;

      // Converting this to a max would handle comparisons between positive
      // and negative zero incorrectly.
      if (!UnsafeFPMath &&
          !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
        break;
      Opcode = X86ISD::FMAX;

      // Converting this to a max would handle NaNs incorrectly, and swapping
      // the operands would cause it to handle comparisons between positive
      // and negative zero incorrectly.
      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
          break;
        std::swap(LHS, RHS);
      }
      Opcode = X86ISD::FMAX;

      // Converting this to a max would handle both negative zeros and NaNs
      // incorrectly, but we can swap the operands to fix both.
      std::swap(LHS, RHS);
      Opcode = X86ISD::FMAX;

    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      // Converting this to a min would handle comparisons between positive
      // and negative zero incorrectly, and swapping the operands would
      // cause it to handle NaNs incorrectly.
      if (!UnsafeFPMath &&
          !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        std::swap(LHS, RHS);
      }
      Opcode = X86ISD::FMIN;

      // Converting this to a min would handle NaNs incorrectly.
      if (!UnsafeFPMath &&
          (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
        break;
      Opcode = X86ISD::FMIN;

      // Converting this to a min would handle both negative zeros and NaNs
      // incorrectly, but we can swap the operands to fix both.
      std::swap(LHS, RHS);
      Opcode = X86ISD::FMIN;

      // Converting this to a max would handle NaNs incorrectly.
      if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
        break;
      Opcode = X86ISD::FMAX;

      // Converting this to a max would handle comparisons between positive
      // and negative zero incorrectly, and swapping the operands would
      // cause it to handle NaNs incorrectly.
      if (!UnsafeFPMath &&
          !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        std::swap(LHS, RHS);
      }
      Opcode = X86ISD::FMAX;

      // Converting this to a max would handle both negative zeros and NaNs
      // incorrectly, but we can swap the operands to fix both.
      std::swap(LHS, RHS);
      Opcode = X86ISD::FMAX;
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }
  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
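        // For example, "Cond ? 13 : 4" has Diff = 9, a fast multiplier, so it
        // can be computed as zext(Cond)*9 + 4 (an LEA of cond + cond*8 plus
        // the base constant).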
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base( , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base( , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base( , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base( , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base( , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base( , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          SDValue Cond = N->getOperand(3);
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  return SDValue();
}
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
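/// For example, a multiply by 45 can be split as (x * 9) * 5; both factors are
/// accepted below, so it can be emitted as two LEAs instead of an IMUL
/// (subject to the i64 type check in the body).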
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
      // is an add.
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}
static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
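  // For example, ((setcc_c & 1) << 3) is the same as (setcc_c & 8), because
  // setcc_c is known to be either all zeros or all ones.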
  if (N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }
  return SDValue();
}
/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector() && VT.isInteger() &&
      N->getOpcode() == ISD::SHL)
    return PerformSHLCombine(N, DAG);

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in legalize
  // because a constant vector is typically transformed to a constant pool
  // so we have no knowledge of the shift amount.
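  // For example, (shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>) has a uniform
  // shift amount and can be emitted as a single PSLLD via the
  // x86_sse2_pslli_d intrinsic matched below.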
  if (!Subtarget->hasSSE2())
    return SDValue();

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
        if (C->getZExtValue() == SplatIdx)
          BaseShAmt = InVec.getOperand(1);
      }
    }
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
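  // For example, with i32 operands, (x << 12) | (y >> (32 - 12)) matches this
  // pattern with Bits = 32 and becomes SHLD(x, y, 12); the mirrored shift pair
  // maps to SHRD below.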
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
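  // For example, a 64-bit (MMX vector or i64) load feeding a matching store is
  // rewritten below as a single i64/f64 load/store when that is legal, and
  // otherwise as two 32-bit load/store pairs at offsets 0 and 4.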
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}
/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}
static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
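  // For example, a 32-bit BT only uses the low 5 bits of the index, so an
  // index of the form (x & 31) can be simplified to just x by the
  // demanded-bits query below.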
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}
static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  case X86ISD::SHUFPS:      // Handle all target specific shuffles
  case X86ISD::SHUFPD:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHDQ:
  case X86ISD::PUNPCKHQDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::UNPCKHPD:
  case X86ISD::PUNPCKLBW:
  case X86ISD::PUNPCKLWD:
  case X86ISD::PUNPCKLDQ:
  case X86ISD::PUNPCKLQDQ:
  case X86ISD::UNPCKLPS:
  case X86ISD::UNPCKLPD:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  }

  return SDValue();
}
/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
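/// For example, 16-bit forms such as "addw" need a 0x66 operand-size prefix in
/// 32/64-bit code, so promoting i16 arithmetic to i32 usually gives shorter
/// and faster encodings.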
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.
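  // For example, inline asm along the lines of
  //   asm("bswap $0" : "=r"(x) : "0"(x))
  // is rewritten into a call to the llvm.bswap intrinsic on the operand type,
  // which is then lowered through normal instruction selection.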
  // Verify this is a simple bswap.
  if (CI->getNumArgOperands() != 1 ||
      CI->getType() != CI->getArgOperand(0)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getArgOperand(0);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &Constraints = IA->getConstraintString();
      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        return LowerToBSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
                                       X86::SI, X86::DI, X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP, X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8, X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }
  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;  // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
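  // For example, the constraint "{ax}" with an i32 operand first resolves to
  // AX in GR16; the remapping below rewrites it to EAX in GR32 so the register
  // matches the value type.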
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;