1
/* Copyright (c) 2012, Linaro Limited
4
Redistribution and use in source and binary forms, with or without
5
modification, are permitted provided that the following conditions are met:
6
* Redistributions of source code must retain the above copyright
7
notice, this list of conditions and the following disclaimer.
8
* Redistributions in binary form must reproduce the above copyright
9
notice, this list of conditions and the following disclaimer in the
10
documentation and/or other materials provided with the distribution.
11
* Neither the name of the Linaro nor the
12
names of its contributors may be used to endorse or promote products
13
derived from this software without specific prior written permission.
15
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
54
.macro def_fn f p2align=0
62
def_fn memcpy p2align=6
70
/* Deal with small copies quickly by dropping straight into the tail-copy code below. */
73
/* Copy up to 48 bytes of data. At this point we only need the
74
* bottom 6 bits of count to be accurate. */
75
ands tmp1, count, #0x30
82
ldp A_l, A_h, [src, #-48]
83
stp A_l, A_h, [dst, #-48]
85
ldp A_l, A_h, [src, #-32]
86
stp A_l, A_h, [dst, #-32]
88
ldp A_l, A_h, [src, #-16]
89
stp A_l, A_h, [dst, #-16]
92
ands count, count, #15
95
ldp A_l, A_h, [src, #-16]
97
stp A_l, A_h, [dst, #-16]
102
/* Copy up to 15 bytes of data. Does not assume additional data has already been copied. */
113
ldrh tmp1w, [src], #2
114
strh tmp1w, [dst], #2
123
/* We don't much care about the alignment of DST, but we want SRC
124
* to be 128-bit (16 byte) aligned so that we don't cross cache line
125
* boundaries on both loads and stores. */
127
ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
129
sub count, count, tmp2
130
/* Copy more data than needed; it's faster than jumping
131
* around copying sub-Quadword quantities. We know that
132
* it can't overrun. */
137
/* There may be less than 63 bytes to go now. */
141
subs count, count, #128
142
b.ge .Lcpy_body_large
143
/* Less than 128 bytes to copy, so handle 64 here and then jump to the tail. */
146
ldp B_l, B_h, [src, #16]
147
ldp C_l, C_h, [src, #32]
148
ldp D_l, D_h, [src, #48]
150
stp B_l, B_h, [dst, #16]
151
stp C_l, C_h, [dst, #32]
152
stp D_l, D_h, [dst, #48]
159
/* Critical loop. Start at a new cache line boundary. Assuming
160
* 64 bytes per line this ensures the entire loop is in one line. */
163
/* There are at least 128 bytes to copy. */
164
ldp A_l, A_h, [src, #0]
165
sub dst, dst, #16 /* Pre-bias. */
166
ldp B_l, B_h, [src, #16]
167
ldp C_l, C_h, [src, #32]
168
ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
170
stp A_l, A_h, [dst, #16]
171
ldp A_l, A_h, [src, #16]
172
stp B_l, B_h, [dst, #32]
173
ldp B_l, B_h, [src, #32]
174
stp C_l, C_h, [dst, #48]
175
ldp C_l, C_h, [src, #48]
176
stp D_l, D_h, [dst, #64]!
177
ldp D_l, D_h, [src, #64]!
178
subs count, count, #64
180
stp A_l, A_h, [dst, #16]
181
stp B_l, B_h, [dst, #32]
182
stp C_l, C_h, [dst, #48]
183
stp D_l, D_h, [dst, #64]
185
add dst, dst, #64 + 16