/* ~linaro-toolchain-dev/cortex-strings/trunk
 *
 * 4 by Michael Hope
 * Modified the imported versions to build locally. Added the CSL routines.
 */
/* Copyright (c) 2009 CodeSourcery, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of CodeSourcery nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
26
27
#include "arm_asm.h"
28
#include <string.h>
29
#include <stdint.h>
30
#include <stddef.h>
31
/* Standard operations for word-sized values.

   WORD_REF(ADDRESS, OFFSET) is an lvalue of type WORD_TYPE located OFFSET
   bytes past ADDRESS.  WORD_COPY(OUT, IN, OFFSET) copies one WORD_TYPE from
   IN + OFFSET to OUT + OFFSET.  Both expansions are wrapped in parentheses
   so that they behave as a single expression wherever they are used.
   WORD_TYPE must be defined before either macro is expanded.  */
#define WORD_REF(ADDRESS, OFFSET) \
	(*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define WORD_COPY(OUT, IN, OFFSET) \
	(WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET))

/* Select the copy unit (WORD_TYPE, WORD_SIZE bytes wide) and whether
   MAYBE_PREFETCH issues a prefetch, based on the target's predefined
   macros.  */

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
  #include <arm_neon.h>

  #define WORD_TYPE uint8x16_t
  #define WORD_SIZE 16
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
  #define WORD_TYPE uint64_t
  #define WORD_SIZE 8
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching.  The no-op form still expands to a complete
   expression (its argument is not evaluated) so that
   "MAYBE_PREFETCH(p);" is an ordinary expression statement.  */
#else
  #define WORD_TYPE uint32_t
  #define WORD_SIZE 4
  #define MAYBE_PREFETCH(IN) ((void)0)
#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.

   SHORTWORD_REF(ADDRESS, OFFSET) is an lvalue of type SHORTWORD_TYPE
   located OFFSET bytes past ADDRESS; SHORTWORD_COPY copies one such value
   from IN + OFFSET to OUT + OFFSET.  The expansions are parenthesized so
   that each one behaves as a single expression.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
	(*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
	(SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET))

/* SHORTWORD_SHIFT(IN0, IN1, OFFSET) combines two consecutive shortwords
   into the shortword that starts OFFSET bytes into IN0.  OFFSET must be
   strictly between 0 and SHORTWORD_SIZE: with OFFSET == 0 one of the
   shifts would be by the full width of the type.

   Shifting directionality depends on endianness.  The whole expansion is
   parenthesized so that an operator applied to the result (for example
   '&') applies to the combined value rather than to its second half.  */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	(((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	(((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif

81
void *memcpy(void *OUT, const void *IN, size_t N)
82
{
83
  void* OUT0 = OUT;
84
85
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
86
  const char* OUT_end = (char*)OUT + N;
87
  while ((char*)OUT < OUT_end) {
88
    *((char*)OUT) = *((char*)IN);
89
    OUT++;
90
    IN++;
91
  }
92
93
  return OUT0;
94
#else
95
  /* Handle short strings and immediately return.  */
96
  if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
97
    size_t i = 0;
98
    while (i < N) {
99
      ((char*)OUT)[i] = ((char*)IN)[i];
100
      i++;
101
    }
102
    return OUT;
103
  }
104
105
  const char* OUT_end = (char*)OUT + N;
106
107
  /* Align OUT to SHORTWORD_SIZE.  */
108
  while ((uintptr_t)OUT % SHORTWORD_SIZE != 0) {
109
    *(char*) (OUT++) = *(char*) (IN++);
110
  }
111
112
  if ((uintptr_t) IN % SHORTWORD_SIZE == 0) {
113
114
#if WORD_SIZE > SHORTWORD_SIZE
115
    /* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE.  */
116
    if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0)) {
117
      while ((uintptr_t)OUT % WORD_SIZE != 0) {
118
        SHORTWORD_COPY(OUT, IN, 0);
119
        OUT += SHORTWORD_SIZE;
120
        IN += SHORTWORD_SIZE;
121
      }
122
123
      if ((uintptr_t) IN % WORD_SIZE == 0) {
124
#endif /* WORD_SIZE > SHORTWORD_SIZE */
125
126
#if defined(__ARM_NEON__)
127
        /* Testing on Cortex-A8 indicates that the following idiom
128
           produces faster assembly code when doing vector copies,
129
           but not when doing regular copies.  */
130
        size_t i = 0;
131
        N = OUT_end - (char*)OUT;
132
        MAYBE_PREFETCH(IN + 64);
133
        MAYBE_PREFETCH(IN + 128);
134
        MAYBE_PREFETCH(IN + 192);
135
        if (N >= 640) {
136
          MAYBE_PREFETCH(IN + 256);
137
          MAYBE_PREFETCH(IN + 320);
138
          MAYBE_PREFETCH(IN + 384);
139
          MAYBE_PREFETCH(IN + 448);
140
          MAYBE_PREFETCH(IN + 512);
141
          MAYBE_PREFETCH(IN + 576);
142
          MAYBE_PREFETCH(IN + 640);
143
          MAYBE_PREFETCH(IN + 704);
144
	  /* We phrase the loop condition in this way so that the
145
             i + WORD_SIZE * 16 value can be reused to increment i.  */
146
          while (i + WORD_SIZE * 16 <= N - 640) {
147
            MAYBE_PREFETCH(IN + 768);
148
            MAYBE_PREFETCH(IN + 832);
149
            MAYBE_PREFETCH(IN + 896);
150
            MAYBE_PREFETCH(IN + 960);
151
            WORD_COPY(OUT, IN, i);
152
            WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
153
            WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
154
            WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
155
            WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
156
            WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
157
            WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
158
            WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
159
            WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
160
            WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
161
            WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
162
            WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
163
            WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
164
            WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
165
            WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
166
            WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
167
            i += WORD_SIZE * 16;
168
          }
169
        }
170
        while (i + WORD_SIZE * 16 <= N) {
171
          WORD_COPY(OUT, IN, i);
172
          WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
173
          WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
174
          WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
175
          WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
176
          WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
177
          WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
178
          WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
179
          WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
180
          WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
181
          WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
182
          WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
183
          WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
184
          WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
185
          WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
186
          WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
187
          i += WORD_SIZE * 16;
188
        }
189
        while (i + WORD_SIZE * 4 <= N) {
190
          WORD_COPY(OUT, IN, i);
191
          WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
192
          WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
193
          WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
194
          i += WORD_SIZE * 4;
195
        }
196
        while (i + WORD_SIZE <= N) {
197
          WORD_COPY(OUT, IN, i);
198
          i += WORD_SIZE;
199
        }
200
        OUT += i;
201
        IN += i;
202
#else /* not defined(__ARM_NEON__) */
203
        /* Note: 16-times unrolling is about 20% faster than 4-times
204
           unrolling on both ARM Cortex-A8 and Cortex-M3.  */
205
        MAYBE_PREFETCH(IN + 64);
206
        MAYBE_PREFETCH(IN + 128);
207
        MAYBE_PREFETCH(IN + 192);
208
        while (OUT_end - (char*)OUT >= WORD_SIZE * 16) {
209
          MAYBE_PREFETCH(IN + 256);
210
          MAYBE_PREFETCH(IN + 320);
211
          WORD_COPY(OUT, IN, 0);
212
          WORD_COPY(OUT, IN, WORD_SIZE * 1);
213
          WORD_COPY(OUT, IN, WORD_SIZE * 2);
214
          WORD_COPY(OUT, IN, WORD_SIZE * 3);
215
          WORD_COPY(OUT, IN, WORD_SIZE * 4);
216
          WORD_COPY(OUT, IN, WORD_SIZE * 5);
217
          WORD_COPY(OUT, IN, WORD_SIZE * 6);
218
          WORD_COPY(OUT, IN, WORD_SIZE * 7);
219
          WORD_COPY(OUT, IN, WORD_SIZE * 8);
220
          WORD_COPY(OUT, IN, WORD_SIZE * 9);
221
          WORD_COPY(OUT, IN, WORD_SIZE * 10);
222
          WORD_COPY(OUT, IN, WORD_SIZE * 11);
223
          WORD_COPY(OUT, IN, WORD_SIZE * 12);
224
          WORD_COPY(OUT, IN, WORD_SIZE * 13);
225
          WORD_COPY(OUT, IN, WORD_SIZE * 14);
226
          WORD_COPY(OUT, IN, WORD_SIZE * 15);
227
          OUT += WORD_SIZE * 16;
228
          IN += WORD_SIZE * 16;
229
        }
230
        while (WORD_SIZE * 4 <= OUT_end - (char*)OUT) {
231
          WORD_COPY(OUT, IN, 0);
232
          WORD_COPY(OUT, IN, WORD_SIZE * 1);
233
          WORD_COPY(OUT, IN, WORD_SIZE * 2);
234
          WORD_COPY(OUT, IN, WORD_SIZE * 3);
235
          OUT += WORD_SIZE * 4;
236
          IN += WORD_SIZE * 4;
237
        }
238
        while (WORD_SIZE <= OUT_end - (char*)OUT) {
239
          WORD_COPY(OUT, IN, 0);
240
          OUT += WORD_SIZE;
241
          IN += WORD_SIZE;
242
        }
243
#endif /* not defined(__ARM_NEON__) */
244
245
#if WORD_SIZE > SHORTWORD_SIZE
246
      } else { /* if IN is not WORD_SIZE aligned */
247
        while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT) {
248
          SHORTWORD_COPY(OUT, IN, 0);
249
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
250
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
251
          SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
252
          OUT += SHORTWORD_SIZE * 4;
253
          IN += SHORTWORD_SIZE * 4;
254
        }
255
      } /* end if IN is not WORD_SIZE aligned */
256
    } /* end if N >= WORD_SIZE */
257
258
    while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
259
      SHORTWORD_COPY(OUT, IN, 0);
260
      OUT += SHORTWORD_SIZE;
261
      IN += SHORTWORD_SIZE;
262
    }
263
#endif /* WORD_SIZE > SHORTWORD_SIZE */
264
265
  } else { /* if IN is not SHORTWORD_SIZE aligned */
266
    ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
267
268
    SHORTWORD_TYPE temp1, temp2;
269
    temp1 = SHORTWORD_REF(IN, -misalign);
270
271
    /* Benchmarking indicates that unrolling this loop doesn't
272
       produce a measurable performance improvement on ARM.  */
273
    while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
274
      IN += SHORTWORD_SIZE;
275
      temp2 = SHORTWORD_REF(IN, -misalign);
276
      SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
277
      temp1 = temp2;
278
      OUT += SHORTWORD_SIZE;
279
    }
280
281
  } /* end if IN is not SHORTWORD_SIZE aligned */
282
283
  while ((char*)OUT < OUT_end) {
284
    *((char*)OUT) = *((char*)IN);
285
    OUT++;
286
    IN++;
287
  }
288
289
  return OUT0;
290
#endif
291
}