; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
; Widening signed multiply: sext both i8 operands then mul must select SMULL.
define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
; Widening signed multiply: sext both i16 operands then mul must select SMULL.
define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
; Widening signed multiply: sext both i32 operands then mul must select SMULL.
define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
; Widening unsigned multiply: zext both i8 operands then mul must select UMULL.
define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
; Widening unsigned multiply: zext both i16 operands then mul must select UMULL.
define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
; Widening unsigned multiply: zext both i32 operands then mul must select UMULL.
define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}
; Widening signed multiply-accumulate: add of a sext-sext mul must select SMLAL.
define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}
; Widening signed multiply-accumulate: add of a sext-sext mul must select SMLAL.
define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}
; Widening signed multiply-accumulate: add of a sext-sext mul must select SMLAL.
define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}
; Widening unsigned multiply-accumulate: add of a zext-zext mul must select UMLAL.
define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}
; Widening unsigned multiply-accumulate: add of a zext-zext mul must select UMLAL.
define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}
; Widening unsigned multiply-accumulate: add of a zext-zext mul must select UMLAL.
define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}
; Widening signed multiply-subtract: sub of a sext-sext mul must select SMLSL.
define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}
; Widening signed multiply-subtract: sub of a sext-sext mul must select SMLSL.
define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}
; Widening signed multiply-subtract: sub of a sext-sext mul must select SMLSL.
define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}
; Widening unsigned multiply-subtract: sub of a zext-zext mul must select UMLSL.
define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}
; Widening unsigned multiply-subtract: sub of a zext-zext mul must select UMLSL.
define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}
; Widening unsigned multiply-subtract: sub of a zext-zext mul must select UMLSL.
define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
; Splat constant -12 fits in i8, so the BUILD_VECTOR operand counts as
; sign-extended and SMULL is still usable.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}
define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big
; (-999 does not fit in i8), so a full-width MUL is required.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}
; Splat constant -12 fits in i16, so SMULL is usable against the constant.
define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}
; Splat constant -1234 fits in i32, so SMULL is usable against the constant.
define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; Use CHECK-LABEL (with trailing colon) like the sibling tests: a bare
; "CHECK: smull_extvec_v2i32_v2i64" could match anywhere in the output.
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}
; Splat constant 12 fits in u8, so the BUILD_VECTOR operand counts as
; zero-extended and UMULL is still usable.
define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}
define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big
; (999 does not fit in u8), so a full-width MUL is required.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}
; Splat constant 1234 fits in u16, so UMULL is usable against the constant.
define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}
; Splat constant 1234 fits in u32, so UMULL is usable against the constant.
define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}
define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; If one operand has a zero-extend and the other a sign-extend, smull
; cannot be used; a full-width mul must be emitted instead.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}
; Check that (mul (add a, b), c) is distributed into UMULL + UMLAL so both
; halves use the widening multiply-with-accumulate form.
define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}
; Intrinsic declarations used by @distribute.
declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind