/* Copyright (c) 2009 CodeSourcery, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of CodeSourcery nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <string.h>
#include <stdint.h>
/* Standard operations for word-sized values.  */
#define WORD_REF(ADDRESS, OFFSET) \
  *((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
#include <arm_neon.h>
#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define WORD_DUPLICATE(VALUE) \
  vdupq_n_u8(VALUE)
/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to be
   slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
      || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
#define WORD_TYPE uint64_t
#define WORD_SIZE 8

/* ARM stores 64-bit values in two 32-bit registers and does not
   have 64-bit multiply or bitwise-or instructions, so this union
   operation results in optimal code.  */
static inline uint64_t splat8(int value) {
  union { uint32_t ints[2]; uint64_t result; } quad;
  quad.ints[0] = (unsigned char)(value) * 0x01010101;
  quad.ints[1] = quad.ints[0];
  return quad.result;
}
#define WORD_DUPLICATE(VALUE) \
  splat8(VALUE)

/* On everything else, we use 32-bit loads and stores.  */
#else
#define WORD_TYPE uint32_t
#define WORD_SIZE 4
#define WORD_DUPLICATE(VALUE) \
  (unsigned char)(VALUE) * 0x01010101
#endif
/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  *((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
#define SHORTWORD_DUPLICATE(VALUE) \
  (uint32_t)(unsigned char)(VALUE) * 0x01010101
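
/* For illustration: multiplying an 8-bit value by 0x01010101 replicates it
   into every byte of a 32-bit word, e.g. 0xA7 * 0x01010101 == 0xA7A7A7A7,
   so SHORTWORD_DUPLICATE(0xA7) yields a SHORTWORD filled with the byte 0xA7,
   and splat8() above extends the same pattern to both halves of a 64-bit
   WORD.  */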

void *memset(void *DST, int C, size_t LENGTH)
{
  void* DST0 = DST;  /* Remember the original DST for the return value.  */
  unsigned char C_BYTE = C;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  const char* DST_end = (char*)DST + LENGTH;
  while ((char*)DST < DST_end) {
    *((char*)DST) = C_BYTE;
    DST++;
  }
#else /* not PREFER_SIZE_OVER_SPEED */
  /* Handle short strings and immediately return.  */
  if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) {
    size_t i;
    for (i = 0; i < LENGTH; i++)
      ((char*)DST)[i] = C_BYTE;
    return DST0;
  }

  const char* DST_end = (char*)DST + LENGTH;

  /* Align DST to SHORTWORD_SIZE.  */
  while ((uintptr_t)DST % SHORTWORD_SIZE != 0) {
    *(char*) (DST++) = C_BYTE;
  }

#if WORD_SIZE > SHORTWORD_SIZE
  SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE);

  /* Align DST to WORD_SIZE in steps of SHORTWORD_SIZE.  */
  if (__builtin_expect(DST_end - (char*)DST >= WORD_SIZE, 0)) {
  while ((uintptr_t)DST % WORD_SIZE != 0) {
    SHORTWORD_REF(DST, 0) = C_SHORTWORD;
    DST += SHORTWORD_SIZE;
  }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

  WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE);
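
  /* For illustration: with a NEON build (WORD_SIZE 16), for a call such as
     memset((void*)0x1003, c, 100) the alignment loops above perform one byte
     store (DST becomes 0x1004, SHORTWORD-aligned), then three SHORTWORD
     stores (DST becomes 0x1010, WORD-aligned), and only then do the unrolled
     word loops below run.  */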

#if defined(__ARM_NEON__)
  /* Testing on Cortex-A8 indicates that the following idiom
     produces faster assembly code when doing vector copies,
     but not when doing regular copies.  */
  size_t i = 0;
  LENGTH = DST_end - (char*)DST;
  while (i + WORD_SIZE * 16 <= LENGTH) {
    WORD_REF(DST, i) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 4) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 5) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 6) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 7) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 8) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 9) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 10) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 11) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 12) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 13) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 14) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 15) = C_WORD;
    i += WORD_SIZE * 16;
  }
  while (i + WORD_SIZE * 4 <= LENGTH) {
    WORD_REF(DST, i) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
    WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
    i += WORD_SIZE * 4;
  }
  while (i + WORD_SIZE <= LENGTH) {
    WORD_REF(DST, i) = C_WORD;
    i += WORD_SIZE;
  }
  DST += i;
#else /* not defined(__ARM_NEON__) */
  /* Note: 16-times unrolling is about 50% faster than 4-times
     unrolling on both ARM Cortex-A8 and Cortex-M3.  */
  while (DST_end - (char*) DST >= WORD_SIZE * 16) {
    WORD_REF(DST, 0) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 4) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 5) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 6) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 7) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 8) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 9) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 10) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 11) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 12) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 13) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 14) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 15) = C_WORD;
    DST += WORD_SIZE * 16;
  }
  while (WORD_SIZE * 4 <= DST_end - (char*) DST) {
    WORD_REF(DST, 0) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
    WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
    DST += WORD_SIZE * 4;
  }
  while (WORD_SIZE <= DST_end - (char*) DST) {
    WORD_REF(DST, 0) = C_WORD;
    DST += WORD_SIZE;
  }
#endif /* not defined(__ARM_NEON__) */

#if WORD_SIZE > SHORTWORD_SIZE
  } /* end if LENGTH >= WORD_SIZE */

  while (SHORTWORD_SIZE <= DST_end - (char*)DST) {
    SHORTWORD_REF(DST, 0) = C_SHORTWORD;
    DST += SHORTWORD_SIZE;
  }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

  while ((char*)DST < DST_end) {
    *((char*)DST) = C_BYTE;
    DST++;
  }
#endif /* not PREFER_SIZE_OVER_SPEED */

  return DST0;
}
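
/* Illustrative self-test sketch, not part of the routine itself: building
   this file with -DMEMSET_SELFTEST (a macro name chosen for this example
   only) exercises the word, shortword, and byte tail paths with unaligned
   starts and odd lengths, comparing the result against a reference
   pattern.  */
#ifdef MEMSET_SELFTEST
#include <stdio.h>

int main(void)
{
  unsigned char buf[256];
  size_t offset, len, i;

  for (offset = 0; offset < 16; offset++) {
    for (len = 0; len + offset <= sizeof(buf); len++) {
      /* Fill with a sentinel, then memset a window inside it.  */
      for (i = 0; i < sizeof(buf); i++)
        buf[i] = 0x55;
      if (memset(buf + offset, 0xA7, len) != buf + offset) {
        printf("bad return value (offset %u, len %u)\n",
               (unsigned)offset, (unsigned)len);
        return 1;
      }
      /* Bytes inside the window must be 0xA7; everything else untouched.  */
      for (i = 0; i < sizeof(buf); i++) {
        unsigned char want =
          (i >= offset && i < offset + len) ? 0xA7 : 0x55;
        if (buf[i] != want) {
          printf("mismatch at byte %u (offset %u, len %u)\n",
                 (unsigned)i, (unsigned)offset, (unsigned)len);
          return 1;
        }
      }
    }
  }
  printf("memset self-test passed\n");
  return 0;
}
#endif /* MEMSET_SELFTEST */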