1
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X32
2
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X64
4
@g16 = external global i16
6
; pinsrd_1: inserts the scalar i32 %s into lane 1 of the <4 x i32> %tmp.
; The checks below expect this to be selected as pinsrd with immediate 1:
; the i686 run reads the scalar straight from its stack slot, 4(%esp), and
; the x86_64 run reads it from %edi.
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
7
%tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
10
; X32: pinsrd $1, 4(%esp), %xmm0
13
; X64: pinsrd $1, %edi, %xmm0
16
; pinsrb_1: inserts the scalar i8 %s into lane 1 of the <16 x i8> %tmp.
; The checks below expect pinsrb with immediate 1: the i686 run takes the
; byte from the stack slot 4(%esp), the x86_64 run takes it from %edi.
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
17
%tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
20
; X32: pinsrb $1, 4(%esp), %xmm0
23
; X64: pinsrb $1, %edi, %xmm0
27
; pmovsxbd_1: loads an i32 from %p, places it in lane 0 of a <4 x i32> whose
; other three lanes are set to 0, reinterprets that vector as <16 x i8>, and
; passes it to the llvm.x86.sse41.pmovsxbd intrinsic. The intrinsic result is
; bitcast to <2 x i64>.
; The checks below expect pmovsxbd to take its source directly from memory:
; the i686 run first loads the pointer argument from 4(%esp) into %eax, while
; the x86_64 run addresses through %rdi.
define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
29
%0 = load i32* %p, align 4
30
%1 = insertelement <4 x i32> undef, i32 %0, i32 0
31
%2 = insertelement <4 x i32> %1, i32 0, i32 1
32
%3 = insertelement <4 x i32> %2, i32 0, i32 2
33
%4 = insertelement <4 x i32> %3, i32 0, i32 3
34
%5 = bitcast <4 x i32> %4 to <16 x i8>
35
%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
36
%7 = bitcast <4 x i32> %6 to <2 x i64>
40
; X32: movl 4(%esp), %eax
41
; X32: pmovsxbd (%eax), %xmm0
44
; X64: pmovsxbd (%rdi), %xmm0
47
; pmovsxwd_1: loads an i64 from %p, inserts it into lane 0 of a zero-initialized
; <2 x i64>, reinterprets that vector as <8 x i16>, and passes it to the
; llvm.x86.sse41.pmovsxwd intrinsic. The intrinsic result is bitcast to
; <2 x i64>.
; The checks below expect pmovsxwd to take its source directly from memory:
; the i686 run first loads the pointer argument from 4(%esp) into %eax, while
; the x86_64 run addresses through %rdi.
define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
49
%0 = load i64* %p ; <i64> [#uses=1]
50
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1]
51
%1 = bitcast <2 x i64> %tmp2 to <8 x i16> ; <<8 x i16>> [#uses=1]
52
%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1]
53
%3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1]
57
; X32: movl 4(%esp), %eax
58
; X32: pmovsxwd (%eax), %xmm0
61
; X64: pmovsxwd (%rdi), %xmm0
67
; pmovzxbq_1: loads an i16 from the external global @g16, inserts it into
; lane 0 of an otherwise undefined <8 x i16>, reinterprets that vector as
; <16 x i8>, and passes it to the llvm.x86.sse41.pmovzxbq intrinsic.
; The checks below expect the address of g16 to be fetched indirectly (via
; L_g16$non_lazy_ptr on the i686 run, via the GOT entry on the x86_64 run)
; and pmovzxbq to read its source directly from that address.
define <2 x i64> @pmovzxbq_1() nounwind {
69
%0 = load i16* @g16, align 2 ; <i16> [#uses=1]
70
%1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
71
%2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
72
%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
76
; X32: movl L_g16$non_lazy_ptr, %eax
77
; X32: pmovzxbq (%eax), %xmm0
80
; X64: movq _g16@GOTPCREL(%rip), %rax
81
; X64: pmovzxbq (%rax), %xmm0
84
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
85
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
86
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
91
; extractps_1: extracts lane 3 of the <4 x float> %v as a float and bitcasts
; that scalar to i32.
; The checks below expect extractps with immediate 3 writing to %eax on both
; the i686 and the x86_64 run.
define i32 @extractps_1(<4 x float> %v) nounwind {
92
%s = extractelement <4 x float> %v, i32 3
93
%i = bitcast float %s to i32
97
; X32: extractps $3, %xmm0, %eax
100
; X64: extractps $3, %xmm0, %eax
102
; extractps_2: variant of the lane-3 extract where the whole vector is bitcast
; to <4 x i32> first and the i32 element is extracted afterwards.
; The checks below expect the same instruction as for the scalar-bitcast
; form: extractps with immediate 3 writing to %eax on both runs.
define i32 @extractps_2(<4 x float> %v) nounwind {
103
%t = bitcast <4 x float> %v to <4 x i32>
104
%s = extractelement <4 x i32> %t, i32 3
108
; X32: extractps $3, %xmm0, %eax
111
; X64: extractps $3, %xmm0, %eax
115
; The non-store form of extractps puts its result into a GPR.
116
; This makes it suitable for an extract from a <4 x float> that
117
; is bitcast to i32, but unsuitable for much of anything else.
119
; ext_1: extracts lane 3 of the <4 x float> %v and adds 1.0 to it, so the
; extracted value is consumed by a floating-point operation rather than being
; bitcast to an integer.
; The checks below expect pshufd with immediate 3 on %xmm0 followed by an
; addss from the constant-pool entry LCPI8_0 (addressed RIP-relative on the
; x86_64 run); no extractps is expected here.
define float @ext_1(<4 x float> %v) nounwind {
120
%s = extractelement <4 x float> %v, i32 3
121
%t = fadd float %s, 1.0
125
; X32: pshufd $3, %xmm0, %xmm0
126
; X32: addss LCPI8_0, %xmm0
129
; X64: pshufd $3, %xmm0, %xmm0
130
; X64: addss LCPI8_0(%rip), %xmm0
132
; ext_2: extracts lane 3 of the <4 x float> %v as a float, with no further
; arithmetic visible in this block.
; The checks below expect pshufd with immediate 3 on %xmm0 for both runs.
define float @ext_2(<4 x float> %v) nounwind {
133
%s = extractelement <4 x float> %v, i32 3
137
; X32: pshufd $3, %xmm0, %xmm0
140
; X64: pshufd $3, %xmm0, %xmm0
142
; ext_3: extracts lane 3 of the <4 x i32> %v as an i32.
; The checks below expect pextrd with immediate 3 writing to %eax on both
; the i686 and the x86_64 run.
define i32 @ext_3(<4 x i32> %v) nounwind {
143
%i = extractelement <4 x i32> %v, i32 3
147
; X32: pextrd $3, %xmm0, %eax
150
; X64: pextrd $3, %xmm0, %eax
153
; insertps_1: calls the llvm.x86.sse41.insertps intrinsic on %t1 and %t2 with
; the constant control value 1 and returns the intrinsic's result.
; The checks below expect insertps with immediate 1, source %xmm1 and
; destination %xmm0, on both runs.
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
154
%tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
155
ret <4 x float> %tmp1
157
; X32: insertps $1, %xmm1, %xmm0
160
; X64: insertps $1, %xmm1, %xmm0
163
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
165
; insertps_2: inserts the scalar float %t2 into lane 0 of %t1 using a plain
; insertelement (no intrinsic) and returns the resulting vector.
; The checks below expect insertps with immediate 0: the i686 run takes the
; scalar from the stack slot 4(%esp), the x86_64 run takes it from %xmm1.
define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
166
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
167
ret <4 x float> %tmp1
169
; X32: insertps $0, 4(%esp), %xmm0
172
; X64: insertps $0, %xmm1, %xmm0
175
; insertps_3: extracts lane 0 of %t2 and inserts that float into lane 0 of
; %t1, then returns the resulting vector.
; The checks below expect the extract/insert pair to become one insertps with
; immediate 0, source %xmm1 and destination %xmm0, on both runs.
define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
176
%tmp2 = extractelement <4 x float> %t2, i32 0
177
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
178
ret <4 x float> %tmp1
180
; X32: insertps $0, %xmm1, %xmm0
183
; X64: insertps $0, %xmm1, %xmm0
186
; ptestz_1: calls the llvm.x86.sse41.ptestz intrinsic on %t1 and %t2, which
; yields an i32.
; The checks visible here only require a ptest of %xmm1 against %xmm0 on both
; runs.
define i32 @ptestz_1(<4 x float> %t1, <4 x float> %t2) nounwind {
187
%tmp1 = call i32 @llvm.x86.sse41.ptestz(<4 x float> %t1, <4 x float> %t2) nounwind readnone
190
; X32: ptest %xmm1, %xmm0
194
; X64: ptest %xmm1, %xmm0
198
; ptestz_2: despite the "ptestz" in its name, this function calls the
; llvm.x86.sse41.ptestc intrinsic on %t1 and %t2, which yields an i32.
; The checks visible here only require a ptest of %xmm1 against %xmm0 on both
; runs.
define i32 @ptestz_2(<4 x float> %t1, <4 x float> %t2) nounwind {
199
%tmp1 = call i32 @llvm.x86.sse41.ptestc(<4 x float> %t1, <4 x float> %t2) nounwind readnone
202
; X32: ptest %xmm1, %xmm0
206
; X64: ptest %xmm1, %xmm0
210
; ptestz_3: despite the "ptestz" in its name, this function calls the
; llvm.x86.sse41.ptestnzc intrinsic on %t1 and %t2, which yields an i32.
; The checks visible here only require a ptest of %xmm1 against %xmm0 on both
; runs.
define i32 @ptestz_3(<4 x float> %t1, <4 x float> %t2) nounwind {
211
%tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<4 x float> %t1, <4 x float> %t2) nounwind readnone
214
; X32: ptest %xmm1, %xmm0
218
; X64: ptest %xmm1, %xmm0
223
declare i32 @llvm.x86.sse41.ptestz(<4 x float>, <4 x float>) nounwind readnone
224
declare i32 @llvm.x86.sse41.ptestc(<4 x float>, <4 x float>) nounwind readnone
225
declare i32 @llvm.x86.sse41.ptestnzc(<4 x float>, <4 x float>) nounwind readnone