/* Copyright (c) 2013, Linaro Limited
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
.macro def_fn f p2align=0
/* Parameters and result. */
b.hs memcpy /* No overlap. */
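/* The branch above is taken only when the destination starts at or
* beyond the end of the source, so a plain forward memcpy is safe. */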
/* Upwards move with potential overlap.
* Need to move from the tail backwards. SRC and DST point one
* byte beyond the remaining data to move. */
b.ge .Lmov_not_short_up
/* Deal with small moves quickly by dropping straight into the
* exit block. */
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
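/* tmp1 is now 0, 16, 32 or 48: the whole 16-byte chunks remaining in
* the 0-63 byte tail. */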
ldp A_l, A_h, [src, #32]
stp A_l, A_h, [dst, #32]
ldp A_l, A_h, [src, #16]
stp A_l, A_h, [dst, #16]
/* Move up to 15 bytes of data. Does not assume additional data
* being moved. */
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
ldrb tmp1w, [src, #-1]
strb tmp1w, [dst, #-1]
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
ands tmp2, src, #15 /* Bytes to reach alignment. */
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
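/* Up to 15 bytes are moved here a power-of-two piece at a time,
* leaving SRC 16-byte aligned for the bulk copy that follows. */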
ldr tmp1, [src, #-8]!
str tmp1, [dst, #-8]!
ldr tmp1w, [src, #-4]!
str tmp1w, [dst, #-4]!
ldrh tmp1w, [src, #-2]!
strh tmp1w, [dst, #-2]!
ldrb tmp1w, [src, #-1]!
strb tmp1w, [dst, #-1]!
/* There may be less than 63 bytes to go now. */
subs count, count, #128
b.ge .Lmov_body_large_up
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp A_l, A_h, [src, #-64]!
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp A_l, A_h, [dst, #-64]!
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
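/* Any remaining tail of up to 63 bytes is finished by the small-move
* code above. */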
/* Critical loop. Start at a new Icache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #-16]
ldp B_l, B_h, [src, #-32]
ldp C_l, C_h, [src, #-48]
ldp D_l, D_h, [src, #-64]!
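/* Main copy loop: each pass stores the 64 bytes loaded on the previous
* pass while loading the next 64, with src and dst walking downwards. */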
stp A_l, A_h, [dst, #-16]
ldp A_l, A_h, [src, #-16]
stp B_l, B_h, [dst, #-32]
ldp B_l, B_h, [src, #-32]
stp C_l, C_h, [dst, #-48]
ldp C_l, C_h, [src, #-48]
stp D_l, D_h, [dst, #-64]!
ldp D_l, D_h, [src, #-64]!
subs count, count, #64
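/* Once count goes negative the loop exits; the last 64 bytes loaded
* are still in A..D and are stored out below. */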
stp A_l, A_h, [dst, #-16]
stp B_l, B_h, [dst, #-32]
stp C_l, C_h, [dst, #-48]
stp D_l, D_h, [dst, #-64]!
/* For a downwards move we can safely use memcpy provided that
* DST is more than 16 bytes away from SRC. */
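/* The 16-byte margin keeps memcpy's forward copy from reading data
* that one of its earlier stores has already overwritten. */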
b.ls memcpy /* May overlap, but not critically. */
mov dst, dstin /* Preserve DSTIN for return value. */
b.ge .Lmov_not_short_down
/* Deal with small moves quickly by dropping straight into the
* exit block. */
/* Move up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate. */
ands tmp1, count, #0x30
ldp A_l, A_h, [src, #-48]
stp A_l, A_h, [dst, #-48]
ldp A_l, A_h, [src, #-32]
stp A_l, A_h, [dst, #-32]
ldp A_l, A_h, [src, #-16]
stp A_l, A_h, [dst, #-16]
/* Move up to 15 bytes of data. Does not assume additional data
* being moved. */
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
.Lmov_not_short_down:
/* We don't much care about the alignment of DST, but we want SRC
* to be 128-bit (16 byte) aligned so that we don't cross cache line
* boundaries on both loads and stores. */
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
sub count, count, tmp2
/* Move enough data to reach alignment; unlike memcpy, we have to
* be aware of the overlap, which means we can't move data twice. */
ldrh tmp1w, [src], #2
strh tmp1w, [dst], #2
ldrb tmp1w, [src], #1
strb tmp1w, [dst], #1
/* There may be less than 63 bytes to go now. */
subs count, count, #128
b.ge .Lmov_body_large_down
/* Less than 128 bytes to move, so handle 64 here and then jump
* to the tail. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]
stp B_l, B_h, [dst, #16]
stp C_l, C_h, [dst, #32]
stp D_l, D_h, [dst, #48]
/* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line. */
.Lmov_body_large_down:
/* There are at least 128 bytes to move. */
ldp A_l, A_h, [src, #0]
sub dst, dst, #16 /* Pre-bias. */
ldp B_l, B_h, [src, #16]
ldp C_l, C_h, [src, #32]
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
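/* As in the upwards loop, each pass stores the 64 bytes loaded on the
* previous pass while loading the next 64, walking upwards this time. */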
stp A_l, A_h, [dst, #16]
ldp A_l, A_h, [src, #16]
stp B_l, B_h, [dst, #32]
ldp B_l, B_h, [src, #32]
stp C_l, C_h, [dst, #48]
ldp C_l, C_h, [src, #48]
stp D_l, D_h, [dst, #64]!
ldp D_l, D_h, [src, #64]!
subs count, count, #64
stp A_l, A_h, [dst, #16]
stp B_l, B_h, [dst, #32]
stp C_l, C_h, [dst, #48]
stp D_l, D_h, [dst, #64]
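/* Advance dst past the 64 bytes just stored, undoing the 16-byte
* pre-bias. */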
add dst, dst, #64 + 16
.size memmove, . - memmove