1
From 82fcc7f26f3ab8391c120ad9437a85b8fb919d2d Mon Sep 17 00:00:00 2001
2
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
3
Date: Fri, 31 Jan 2014 14:47:30 +0000
4
Subject: [PATCH 112/158] target-arm: A64: Add SIMD ld/st multiple
6
Content-Type: text/plain; charset=UTF-8
7
Content-Transfer-Encoding: 8bit
9
This adds support for the SIMD load/store
10
multiple category of instructions.
12
This also brings in a couple of helper functions for manipulating
13
sections of the SIMD registers:
15
* read_vec_element - fetch value from a slice of a vector register
16
* write_vec_element - set a slice of a vector register
18
which use vec_reg_offset for consistent processing of offsets in an
19
endian aware manner. There are also additional helpers:
21
* do_vec_ld - load value into SIMD
22
* do_vec_st - store value from SIMD
24
which load or store a slice of a vector register to memory.
25
Unlike the fp variants, these don't zero extend.
27
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
28
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
29
Reviewed-by: Richard Henderson <rth@twiddle.net>
31
target-arm/translate-a64.c | 250 ++++++++++++++++++++++++++++++++++++++++++++-
32
1 file changed, 248 insertions(+), 2 deletions(-)
34
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
35
index cf80c46..e4fdf00 100644
36
--- a/target-arm/translate-a64.c
37
+++ b/target-arm/translate-a64.c
38
@@ -308,6 +308,28 @@ static TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf)
42
+/* Return the offset into CPUARMState of an element of specified
43
+ * size, 'element' places in from the least significant end of
44
+ * the FP/vector register Qn.
46
+static inline int vec_reg_offset(int regno, int element, TCGMemOp size)
48
+ int offs = offsetof(CPUARMState, vfp.regs[regno * 2]);
49
+#ifdef HOST_WORDS_BIGENDIAN
50
+ /* This is complicated slightly because vfp.regs[2n] is
51
+ * still the low half and vfp.regs[2n+1] the high half
52
+ * of the 128 bit vector, even on big endian systems.
53
+ * Calculate the offset assuming a fully bigendian 128 bits,
54
+ * then XOR to account for the order of the two 64 bit halves.
56
+ offs += (16 - ((element + 1) * (1 << size)));
59
+ offs += element * (1 << size);
64
/* Return the offset into CPUARMState of a slice (from
65
* the least significant end) of FP register Qn (ie
67
@@ -661,6 +683,108 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
71
+ * Vector load/store helpers.
73
+ * The principal difference between this and a FP load is that we don't
74
+ * zero extend as we are filling a partial chunk of the vector register.
75
+ * These functions don't support 128 bit loads/stores, which would be
76
+ * normal load/store operations.
79
+/* Get value of an element within a vector register */
80
+static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx,
81
+ int element, TCGMemOp memop)
83
+ int vect_off = vec_reg_offset(srcidx, element, memop & MO_SIZE);
86
+ tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off);
89
+ tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off);
92
+ tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off);
95
+ tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off);
98
+ tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off);
100
+ case MO_32|MO_SIGN:
101
+ tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off);
104
+ case MO_64|MO_SIGN:
105
+ tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off);
108
+ g_assert_not_reached();
112
+/* Set value of an element within a vector register */
113
+static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx,
114
+ int element, TCGMemOp memop)
116
+ int vect_off = vec_reg_offset(destidx, element, memop & MO_SIZE);
119
+ tcg_gen_st8_i64(tcg_src, cpu_env, vect_off);
122
+ tcg_gen_st16_i64(tcg_src, cpu_env, vect_off);
125
+ tcg_gen_st32_i64(tcg_src, cpu_env, vect_off);
128
+ tcg_gen_st_i64(tcg_src, cpu_env, vect_off);
131
+ g_assert_not_reached();
135
+/* Clear the high 64 bits of a 128 bit vector (in general non-quad
136
+ * vector ops all need to do this).
138
+static void clear_vec_high(DisasContext *s, int rd)
140
+ TCGv_i64 tcg_zero = tcg_const_i64(0);
142
+ write_vec_element(s, tcg_zero, rd, 1, MO_64);
143
+ tcg_temp_free_i64(tcg_zero);
146
+/* Store from vector register to memory */
147
+static void do_vec_st(DisasContext *s, int srcidx, int element,
148
+ TCGv_i64 tcg_addr, int size)
150
+ TCGMemOp memop = MO_TE + size;
151
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
153
+ read_vec_element(s, tcg_tmp, srcidx, element, size);
154
+ tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
156
+ tcg_temp_free_i64(tcg_tmp);
159
+/* Load from memory to vector register */
160
+static void do_vec_ld(DisasContext *s, int destidx, int element,
161
+ TCGv_i64 tcg_addr, int size)
163
+ TCGMemOp memop = MO_TE + size;
164
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
166
+ tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
167
+ write_vec_element(s, tcg_tmp, destidx, element, size);
169
+ tcg_temp_free_i64(tcg_tmp);
173
* This utility function is for doing register extension with an
174
* optional shift. You will likely want to pass a temporary for the
175
* destination register. See DecodeRegExtend() in the ARM ARM.
176
@@ -1835,10 +1959,132 @@ static void disas_ldst_reg(DisasContext *s, uint32_t insn)
180
-/* AdvSIMD load/store multiple structures */
181
+/* C3.3.1 AdvSIMD load/store multiple structures
183
+ * 31 30 29 23 22 21 16 15 12 11 10 9 5 4 0
184
+ * +---+---+---------------+---+-------------+--------+------+------+------+
185
+ * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size | Rn | Rt |
186
+ * +---+---+---------------+---+-------------+--------+------+------+------+
188
+ * C3.3.2 AdvSIMD load/store multiple structures (post-indexed)
190
+ * 31 30 29 23 22 21 20 16 15 12 11 10 9 5 4 0
191
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
192
+ * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 | Rm | opcode | size | Rn | Rt |
193
+ * +---+---+---------------+---+---+---------+--------+------+------+------+
195
+ * Rt: first (or only) SIMD&FP register to be transferred
196
+ * Rn: base address or SP
197
+ * Rm (post-index only): post-index register (when !31) or size dependent #imm
199
static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
201
- unsupported_encoding(s, insn);
202
+ int rt = extract32(insn, 0, 5);
203
+ int rn = extract32(insn, 5, 5);
204
+ int size = extract32(insn, 10, 2);
205
+ int opcode = extract32(insn, 12, 4);
206
+ bool is_store = !extract32(insn, 22, 1);
207
+ bool is_postidx = extract32(insn, 23, 1);
208
+ bool is_q = extract32(insn, 30, 1);
209
+ TCGv_i64 tcg_addr, tcg_rn;
211
+ int ebytes = 1 << size;
212
+ int elements = (is_q ? 128 : 64) / (8 << size);
213
+ int rpt; /* num iterations */
214
+ int selem; /* structure elements */
217
+ if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) {
218
+ unallocated_encoding(s);
222
+ /* From the shared decode logic */
253
+ unallocated_encoding(s);
257
+ if (size == 3 && !is_q && selem != 1) {
259
+ unallocated_encoding(s);
264
+ gen_check_sp_alignment(s);
267
+ tcg_rn = cpu_reg_sp(s, rn);
268
+ tcg_addr = tcg_temp_new_i64();
269
+ tcg_gen_mov_i64(tcg_addr, tcg_rn);
271
+ for (r = 0; r < rpt; r++) {
273
+ for (e = 0; e < elements; e++) {
274
+ int tt = (rt + r) % 32;
276
+ for (xs = 0; xs < selem; xs++) {
278
+ do_vec_st(s, tt, e, tcg_addr, size);
280
+ do_vec_ld(s, tt, e, tcg_addr, size);
282
+ /* For non-quad operations, setting a slice of the low
283
+ * 64 bits of the register clears the high 64 bits (in
284
+ * the ARM ARM pseudocode this is implicit in the fact
285
+ * that 'rval' is a 64 bit wide variable). We optimize
286
+ * by noticing that we only need to do this the first
287
+ * time we touch a register.
289
+ if (!is_q && e == 0 && (r == 0 || xs == selem - 1)) {
290
+ clear_vec_high(s, tt);
293
+ tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
294
+ tt = (tt + 1) % 32;
300
+ int rm = extract32(insn, 16, 5);
302
+ tcg_gen_mov_i64(tcg_rn, tcg_addr);
304
+ tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
307
+ tcg_temp_free_i64(tcg_addr);
310
/* AdvSIMD load/store single structure */