~linaro-toolchain-dev/cortex-strings/trunk

4 by Michael Hope
Modified the imported versions to build locally. Added the CSL routines.
1
/* Copyright (c) 2009 CodeSourcery, Inc.
2
 * All rights reserved.
3
 * 
4
 * Redistribution and use in source and binary forms, with or without
5
 * modification, are permitted provided that the following conditions are met:
6
 *     * Redistributions of source code must retain the above copyright
7
 *       notice, this list of conditions and the following disclaimer.
8
 *     * Redistributions in binary form must reproduce the above copyright
9
 *       notice, this list of conditions and the following disclaimer in the
10
 *       documentation and/or other materials provided with the distribution.
11
 *     * Neither the name of CodeSourcery nor the
12
 *       names of its contributors may be used to endorse or promote products
13
 *       derived from this software without specific prior written permission.
14
 * 
15
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
16
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
19
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 */
26
27
#include "arm_asm.h"
28
#include <string.h>
29
#include <stdint.h>
30
31
/* Standard operations for word-sized values.

   WORD_REF yields an lvalue of type WORD_TYPE located OFFSET bytes past
   ADDRESS.  The expansion is fully parenthesized so that it composes
   safely with surrounding operators.  */
#define WORD_REF(ADDRESS, OFFSET) \
	(*((WORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
  #include <arm_neon.h>

  #define WORD_TYPE uint8x16_t
  #define WORD_SIZE 16

  /* Replicate the byte VALUE into every lane of a 128-bit vector.  */
  #define WORD_DUPLICATE(VALUE) \
	(vdupq_n_u8(VALUE))

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
  #define WORD_TYPE uint64_t
  #define WORD_SIZE 8

  /* ARM stores 64-bit values in two 32-bit registers and does not
     have 64-bit multiply or bitwise-or instructions, so this union
     operation results in optimal code.

     Returns a 64-bit value whose eight bytes all equal the low byte
     of VALUE.  The multiplication is carried out in uint32_t: in plain
     'int' it would overflow (undefined behaviour) for bytes >= 0x80.  */
  static inline uint64_t splat8(int value) {
	union { uint32_t ints[2]; uint64_t result; } quad;
	quad.ints[0] = (uint32_t)(unsigned char)value * UINT32_C(0x01010101);
	quad.ints[1] = quad.ints[0];
	return quad.result;
  }
  #define WORD_DUPLICATE(VALUE) \
	(splat8(VALUE))

/* On everything else, we use 32-bit loads and stores.  */
#else
  #define WORD_TYPE uint32_t
  #define WORD_SIZE 4
  /* Replicate the low byte of VALUE into all four bytes of a uint32_t.
     The cast to uint32_t keeps the multiplication unsigned, and the
     outer parentheses make the result safe to use in any expression.  */
  #define WORD_DUPLICATE(VALUE) \
	((uint32_t)(unsigned char)(VALUE) * UINT32_C(0x01010101))
#endif
73
74
/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.

   SHORTWORD_REF yields an lvalue of type SHORTWORD_TYPE located OFFSET
   bytes past ADDRESS.  SHORTWORD_DUPLICATE replicates the low byte of
   VALUE into all four bytes of a uint32_t.  Both expansions are fully
   parenthesized so that operators applied to the macro (for example a
   leading '~') act on the whole result rather than on its first operand.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
	(*((SHORTWORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))
#define SHORTWORD_DUPLICATE(VALUE) \
	((uint32_t)(unsigned char)(VALUE) * UINT32_C(0x01010101))
81
82
void *memset(void *DST, int C, size_t LENGTH)
83
{
84
  void* DST0 = DST;
85
  unsigned char C_BYTE = C;
86
87
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
88
  const char* DST_end = (char*)DST + LENGTH;
89
  while ((char*)DST < DST_end) {
90
    *((char*)DST) = C_BYTE;
91
    DST++;
92
  }
93
94
  return DST0;
95
#else /* not PREFER_SIZE_OVER_SPEED */
96
  /* Handle short strings and immediately return.  */
97
  if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) {
98
    size_t i = 0;
99
    while (i < LENGTH) {
100
      ((char*)DST)[i] = C_BYTE;
101
      i++;
102
    }
103
    return DST;
104
  }
105
106
  const char* DST_end = (char*)DST + LENGTH;
107
108
  /* Align DST to SHORTWORD_SIZE.  */
109
  while ((uintptr_t)DST % SHORTWORD_SIZE != 0) {
110
    *(char*) (DST++) = C_BYTE;
111
  }
112
113
#if WORD_SIZE > SHORTWORD_SIZE
114
  SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE);
115
116
  /* Align DST to WORD_SIZE in steps of SHORTWORD_SIZE.  */
117
  if (__builtin_expect(DST_end - (char*)DST >= WORD_SIZE, 0)) {
118
    while ((uintptr_t)DST % WORD_SIZE != 0) {
119
      SHORTWORD_REF(DST, 0) = C_SHORTWORD;
120
      DST += SHORTWORD_SIZE;
121
    }
122
#endif /* WORD_SIZE > SHORTWORD_SIZE */
123
124
    WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE);
125
126
#if defined(__ARM_NEON__)
127
    /* Testing on Cortex-A8 indicates that the following idiom
128
       produces faster assembly code when doing vector copies,
129
       but not when doing regular copies.  */
130
    size_t i = 0;
131
    LENGTH = DST_end - (char*)DST;
132
    while (i + WORD_SIZE * 16 <= LENGTH) {
133
      WORD_REF(DST, i) = C_WORD;
134
      WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
135
      WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
136
      WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
137
      WORD_REF(DST, i + WORD_SIZE * 4) = C_WORD;
138
      WORD_REF(DST, i + WORD_SIZE * 5) = C_WORD;
139
      WORD_REF(DST, i + WORD_SIZE * 6) = C_WORD;
140
      WORD_REF(DST, i + WORD_SIZE * 7) = C_WORD;
141
      WORD_REF(DST, i + WORD_SIZE * 8) = C_WORD;
142
      WORD_REF(DST, i + WORD_SIZE * 9) = C_WORD;
143
      WORD_REF(DST, i + WORD_SIZE * 10) = C_WORD;
144
      WORD_REF(DST, i + WORD_SIZE * 11) = C_WORD;
145
      WORD_REF(DST, i + WORD_SIZE * 12) = C_WORD;
146
      WORD_REF(DST, i + WORD_SIZE * 13) = C_WORD;
147
      WORD_REF(DST, i + WORD_SIZE * 14) = C_WORD;
148
      WORD_REF(DST, i + WORD_SIZE * 15) = C_WORD;
149
      i += WORD_SIZE * 16;
150
    }
151
    while (i + WORD_SIZE * 4 <= LENGTH) {
152
      WORD_REF(DST, i) = C_WORD;
153
      WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
154
      WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
155
      WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
156
      i += WORD_SIZE * 4;
157
    }
158
    while (i + WORD_SIZE <= LENGTH) {
159
      WORD_REF(DST, i) = C_WORD;
160
      i += WORD_SIZE;
161
    }
162
    DST += i;
163
#else /* not defined(__ARM_NEON__) */
164
    /* Note: 16-times unrolling is about 50% faster than 4-times
165
       unrolling on both ARM Cortex-A8 and Cortex-M3.  */
166
    while (DST_end - (char*) DST >= WORD_SIZE * 16) {
167
      WORD_REF(DST, 0) = C_WORD;
168
      WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
169
      WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
170
      WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
171
      WORD_REF(DST, WORD_SIZE * 4) = C_WORD;
172
      WORD_REF(DST, WORD_SIZE * 5) = C_WORD;
173
      WORD_REF(DST, WORD_SIZE * 6) = C_WORD;
174
      WORD_REF(DST, WORD_SIZE * 7) = C_WORD;
175
      WORD_REF(DST, WORD_SIZE * 8) = C_WORD;
176
      WORD_REF(DST, WORD_SIZE * 9) = C_WORD;
177
      WORD_REF(DST, WORD_SIZE * 10) = C_WORD;
178
      WORD_REF(DST, WORD_SIZE * 11) = C_WORD;
179
      WORD_REF(DST, WORD_SIZE * 12) = C_WORD;
180
      WORD_REF(DST, WORD_SIZE * 13) = C_WORD;
181
      WORD_REF(DST, WORD_SIZE * 14) = C_WORD;
182
      WORD_REF(DST, WORD_SIZE * 15) = C_WORD;
183
      DST += WORD_SIZE * 16;
184
    }
185
    while (WORD_SIZE * 4 <= DST_end - (char*) DST) {
186
      WORD_REF(DST, 0) = C_WORD;
187
      WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
188
      WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
189
      WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
190
      DST += WORD_SIZE * 4;
191
    }
192
    while (WORD_SIZE <= DST_end - (char*) DST) {
193
      WORD_REF(DST, 0) = C_WORD;
194
      DST += WORD_SIZE;
195
    }
196
#endif /* not defined(__ARM_NEON__) */
197
198
#if WORD_SIZE > SHORTWORD_SIZE
199
  } /* end if N >= WORD_SIZE */
200
201
  while (SHORTWORD_SIZE <= DST_end - (char*)DST) {
202
    SHORTWORD_REF(DST, 0) = C_SHORTWORD;
203
    DST += SHORTWORD_SIZE;
204
  }
205
#endif /* WORD_SIZE > SHORTWORD_SIZE */
206
207
  while ((char*)DST < DST_end) {
208
    *((char*)DST) = C_BYTE;
209
    DST++;
210
  }
211
212
  return DST0;
213
#endif /* not PREFER_SIZE_OVER_SPEED */
214
}