~linaro-toolchain-dev/cortex-strings/trunk : contents of reference/csl/memset.c at revision 107

~linaro-toolchain-dev/cortex-strings/trunk : (revision 107)

/* Copyright (c) 2009 CodeSourcery, Inc.
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of CodeSourcery nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"
#include <string.h>
#include <stdint.h>

/* Standard operations for word-sized values.

   WORD_REF(ADDRESS, OFFSET) is an lvalue of type WORD_TYPE located
   OFFSET bytes past ADDRESS.  The whole expansion is parenthesized so
   that operators applied to the macro bind to the dereferenced object
   rather than to the pointer cast inside it.  WORD_TYPE is expanded at
   the point of use, so it only has to be defined before the first use
   of WORD_REF, not before this definition.  */
#define WORD_REF(ADDRESS, OFFSET) \
	(*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))

/* Pick the widest store unit ("word") for the target and define:
     WORD_TYPE              the C type written by one word store,
     WORD_SIZE              its size in bytes,
     WORD_DUPLICATE(VALUE)  a WORD_TYPE value whose every byte is
                            (unsigned char)(VALUE).  */

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
  #include <arm_neon.h>

  #define WORD_TYPE uint8x16_t
  #define WORD_SIZE 16

  #define WORD_DUPLICATE(VALUE) \
	vdupq_n_u8(VALUE)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
  #define WORD_TYPE uint64_t
  #define WORD_SIZE 8

  /* ARM stores 64-bit values in two 32-bit registers and does not
     have 64-bit multiply or bitwise-or instructions, so this union
     operation results in optimal code.

     Returns a 64-bit value with the low 8 bits of 'value' copied into
     each of its eight bytes.  The multiply is done in uint32_t: an
     unsigned char operand would be promoted to (signed) int and the
     product would overflow for byte values of 0x80 and above.  */
  static inline uint64_t splat8(int value) {
	union { uint32_t ints[2]; uint64_t result; } quad;
	quad.ints[0] = (uint32_t)(unsigned char)value * 0x01010101u;
	quad.ints[1] = quad.ints[0];
	return quad.result;
  }
  #define WORD_DUPLICATE(VALUE) \
	splat8(VALUE)

/* On everything else, we use 32-bit loads and stores.  */
#else
  #define WORD_TYPE uint32_t
  #define WORD_SIZE 4
  /* Unsigned arithmetic avoids signed overflow for bytes >= 0x80; the
     outer parentheses keep the product intact inside larger
     expressions.  */
  #define WORD_DUPLICATE(VALUE) \
	((uint32_t)(unsigned char)(VALUE) * 0x01010101u)
#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.

   SHORTWORD_REF(ADDRESS, OFFSET) is an lvalue of type SHORTWORD_TYPE
   located OFFSET bytes past ADDRESS.
   SHORTWORD_DUPLICATE(VALUE) is a 32-bit value whose four bytes all
   equal (unsigned char)(VALUE).

   Both expansions are fully parenthesized so they behave as a single
   operand when used inside a larger expression.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
	(*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define SHORTWORD_DUPLICATE(VALUE) \
	((uint32_t)(unsigned char)(VALUE) * 0x01010101u)

/* Fill LENGTH bytes starting at DST with the byte (unsigned char) C.

   DST     start of the region to fill.
   C       fill value; only its conversion to unsigned char is used.
   LENGTH  number of bytes to write.

   Returns the original DST pointer on every path.

   When PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ is defined the whole
   fill is a single byte loop.  Otherwise the fill proceeds in stages:
     1. fills shorter than SHORTWORD_SIZE are done byte by byte;
     2. bytes are written until the cursor is SHORTWORD_SIZE-aligned;
     3. when WORD_SIZE > SHORTWORD_SIZE and at least WORD_SIZE bytes
        remain, shortwords are written until the cursor is
        WORD_SIZE-aligned;
     4. whole words are written in 16x, 4x and 1x unrolled loops;
     5. remaining shortwords (wide-word builds only) and finally
        remaining bytes complete the region.

   All stepping is done on an 'unsigned char *' cursor; arithmetic on
   'void *' is a GNU extension, not ISO C.

   NOTE(review): this defines the library memset itself.  Confirm the
   build prevents the compiler from turning the byte loops below back
   into memset calls (e.g. -ffreestanding / -fno-builtin).  */
void *memset(void *DST, int C, size_t LENGTH)
{
  unsigned char *cursor = DST;
  const unsigned char C_BYTE = (unsigned char)C;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  unsigned char *const DST_end = cursor + LENGTH;
  while (cursor < DST_end) {
    *cursor++ = C_BYTE;
  }

  return DST;
#else /* not PREFER_SIZE_OVER_SPEED */
  /* Handle short fills and immediately return.  */
  if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) {
    for (size_t n = 0; n < LENGTH; n++) {
      cursor[n] = C_BYTE;
    }
    return DST;
  }

  unsigned char *const DST_end = cursor + LENGTH;

  /* Align the cursor to SHORTWORD_SIZE.  LENGTH >= SHORTWORD_SIZE here,
     so the at most SHORTWORD_SIZE - 1 bytes written cannot pass
     DST_end.  */
  while ((uintptr_t)cursor % SHORTWORD_SIZE != 0) {
    *cursor++ = C_BYTE;
  }

#if WORD_SIZE > SHORTWORD_SIZE
  const SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE);

  /* Align the cursor to WORD_SIZE in steps of SHORTWORD_SIZE.  Only
     done when a full word still fits, which also bounds the alignment
     stores to the region.  */
  if (__builtin_expect(DST_end - cursor >= WORD_SIZE, 0)) {
    while ((uintptr_t)cursor % WORD_SIZE != 0) {
      SHORTWORD_REF(cursor, 0) = C_SHORTWORD;
      cursor += SHORTWORD_SIZE;
    }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

    const WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE);

#if defined(__ARM_NEON__)
    /* Testing on Cortex-A8 indicates that the following idiom
       (fixed base pointer plus a running byte offset) produces faster
       assembly code when doing vector copies, but not when doing
       regular copies.  */
    size_t i = 0;
    LENGTH = (size_t)(DST_end - cursor);
    while (i + WORD_SIZE * 16 <= LENGTH) {
      WORD_REF(cursor, i) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 1) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 2) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 3) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 4) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 5) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 6) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 7) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 8) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 9) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 10) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 11) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 12) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 13) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 14) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 15) = C_WORD;
      i += WORD_SIZE * 16;
    }
    while (i + WORD_SIZE * 4 <= LENGTH) {
      WORD_REF(cursor, i) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 1) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 2) = C_WORD;
      WORD_REF(cursor, i + WORD_SIZE * 3) = C_WORD;
      i += WORD_SIZE * 4;
    }
    while (i + WORD_SIZE <= LENGTH) {
      WORD_REF(cursor, i) = C_WORD;
      i += WORD_SIZE;
    }
    cursor += i;
#else /* not defined(__ARM_NEON__) */
    /* Note: 16-times unrolling is about 50% faster than 4-times
       unrolling on both ARM Cortex-A8 and Cortex-M3.  */
    while (DST_end - cursor >= WORD_SIZE * 16) {
      WORD_REF(cursor, 0) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 1) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 2) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 3) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 4) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 5) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 6) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 7) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 8) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 9) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 10) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 11) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 12) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 13) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 14) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 15) = C_WORD;
      cursor += WORD_SIZE * 16;
    }
    while (WORD_SIZE * 4 <= DST_end - cursor) {
      WORD_REF(cursor, 0) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 1) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 2) = C_WORD;
      WORD_REF(cursor, WORD_SIZE * 3) = C_WORD;
      cursor += WORD_SIZE * 4;
    }
    while (WORD_SIZE <= DST_end - cursor) {
      WORD_REF(cursor, 0) = C_WORD;
      cursor += WORD_SIZE;
    }
#endif /* not defined(__ARM_NEON__) */

#if WORD_SIZE > SHORTWORD_SIZE
  } /* end if N >= WORD_SIZE */

  /* Fewer than WORD_SIZE bytes remain (or the word path was skipped):
     finish with shortword stores while a whole shortword fits.  */
  while (SHORTWORD_SIZE <= DST_end - cursor) {
    SHORTWORD_REF(cursor, 0) = C_SHORTWORD;
    cursor += SHORTWORD_SIZE;
  }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

  /* Trailing bytes that do not make up a whole (short)word.  */
  while (cursor < DST_end) {
    *cursor++ = C_BYTE;
  }

  return DST;
#endif /* not PREFER_SIZE_OVER_SPEED */
}

4 by Michael Hope Modified the imported versions to build locally. Added the CSL routines.	1	/* Copyright (c) 2009 CodeSourcery, Inc.
	2	* All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions are met:
	6	* * Redistributions of source code must retain the above copyright
	7	* notice, this list of conditions and the following disclaimer.
	8	* * Redistributions in binary form must reproduce the above copyright
	9	* notice, this list of conditions and the following disclaimer in the
	10	* documentation and/or other materials provided with the distribution.
	11	* * Neither the name of CodeSourcery nor the
	12	* names of its contributors may be used to endorse or promote products
	13	* derived from this software without specific prior written permission.
	14	*
	15	* THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
	16	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	17	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	18	* DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
	19	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	20	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	21	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	22	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	24	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	25	*/
	26
	27	#include "arm_asm.h"
	28	#include <string.h>
	29	#include <stdint.h>
	30
	31	/* Standard operations for word-sized values. */
	32	#define WORD_REF(ADDRESS, OFFSET) \
	33	*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
	34
	35	/* On processors with NEON, we use 128-bit vectors. Also,
	36	we need to include arm_neon.h to use these. */
	37	#if defined(__ARM_NEON__)
	38	#include <arm_neon.h>
	39
	40	#define WORD_TYPE uint8x16_t
	41	#define WORD_SIZE 16
	42
	43	#define WORD_DUPLICATE(VALUE) \
	44	vdupq_n_u8(VALUE)
	45
	46	/* On ARM processors with 64-bit ldrd instructions, we use those,
	47	except on Cortex-M* where benchmarking has shown them to
	48	be slower. */
	49	#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	50	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
	51	#define WORD_TYPE uint64_t
	52	#define WORD_SIZE 8
	53
	54	/* ARM stores 64-bit values in two 32-bit registers and does not
	55	have 64-bit multiply or bitwise-or instructions, so this union
	56	operation results in optimal code. */
	57	static inline uint64_t splat8(value) {
	58	union { uint32_t ints[2]; uint64_t result; } quad;
	59	quad.ints[0] = (unsigned char)(value) * 0x01010101;
	60	quad.ints[1] = quad.ints[0];
	61	return quad.result;
	62	}
	63	#define WORD_DUPLICATE(VALUE) \
	64	splat8(VALUE)
65
66	/* On everything else, we use 32-bit loads and stores. */
67	#else
68	#define WORD_TYPE uint32_t
69	#define WORD_SIZE 4
70	#define WORD_DUPLICATE(VALUE) \
71	(unsigned char)(VALUE) * 0x01010101
72	#endif
73
74	/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
75	#define SHORTWORD_TYPE uint32_t
76	#define SHORTWORD_SIZE 4
77	#define SHORTWORD_REF(ADDRESS, OFFSET) \
	78	*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
79	#define SHORTWORD_DUPLICATE(VALUE) \
80	(uint32_t)(unsigned char)(VALUE) * 0x01010101
81
	82	void *memset(void *DST, int C, size_t LENGTH)
83	{
84	void* DST0 = DST;
85	unsigned char C_BYTE = C;
86
	87	#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
88	const char* DST_end = (char*)DST + LENGTH;
89	while ((char*)DST < DST_end) {
	90	*((char*)DST) = C_BYTE;
91	DST++;
92	}
93
94	return DST0;
95	#else /* not PREFER_SIZE_OVER_SPEED */
96	/* Handle short strings and immediately return. */
97	if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) {
98	size_t i = 0;
99	while (i < LENGTH) {
100	((char*)DST)[i] = C_BYTE;
101	i++;
102	}
103	return DST;
104	}
105
106	const char* DST_end = (char*)DST + LENGTH;
107
108	/* Align DST to SHORTWORD_SIZE. */
109	while ((uintptr_t)DST % SHORTWORD_SIZE != 0) {
	110	*(char*) (DST++) = C_BYTE;
111	}
112
113	#if WORD_SIZE > SHORTWORD_SIZE
114	SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE);
115
116	/* Align DST to WORD_SIZE in steps of SHORTWORD_SIZE. */
117	if (__builtin_expect(DST_end - (char*)DST >= WORD_SIZE, 0)) {
118	while ((uintptr_t)DST % WORD_SIZE != 0) {
119	SHORTWORD_REF(DST, 0) = C_SHORTWORD;
120	DST += SHORTWORD_SIZE;
121	}
122	#endif /* WORD_SIZE > SHORTWORD_SIZE */
123
124	WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE);
125
126	#if defined(__ARM_NEON__)
127	/* Testing on Cortex-A8 indicates that the following idiom
128	produces faster assembly code when doing vector copies,
129	but not when doing regular copies. */
130	size_t i = 0;
131	LENGTH = DST_end - (char*)DST;
132	while (i + WORD_SIZE * 16 <= LENGTH) {
133	WORD_REF(DST, i) = C_WORD;
134	WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
135	WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
136	WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
137	WORD_REF(DST, i + WORD_SIZE * 4) = C_WORD;
138	WORD_REF(DST, i + WORD_SIZE * 5) = C_WORD;
139	WORD_REF(DST, i + WORD_SIZE * 6) = C_WORD;
140	WORD_REF(DST, i + WORD_SIZE * 7) = C_WORD;
141	WORD_REF(DST, i + WORD_SIZE * 8) = C_WORD;
142	WORD_REF(DST, i + WORD_SIZE * 9) = C_WORD;
143	WORD_REF(DST, i + WORD_SIZE * 10) = C_WORD;
144	WORD_REF(DST, i + WORD_SIZE * 11) = C_WORD;
145	WORD_REF(DST, i + WORD_SIZE * 12) = C_WORD;
146	WORD_REF(DST, i + WORD_SIZE * 13) = C_WORD;
147	WORD_REF(DST, i + WORD_SIZE * 14) = C_WORD;
148	WORD_REF(DST, i + WORD_SIZE * 15) = C_WORD;
149	i += WORD_SIZE * 16;
150	}
151	while (i + WORD_SIZE * 4 <= LENGTH) {
152	WORD_REF(DST, i) = C_WORD;
153	WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;
154	WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;
155	WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;
156	i += WORD_SIZE * 4;
157	}
158	while (i + WORD_SIZE <= LENGTH) {
159	WORD_REF(DST, i) = C_WORD;
160	i += WORD_SIZE;
161	}
162	DST += i;
163	#else /* not defined(__ARM_NEON__) */
164	/* Note: 16-times unrolling is about 50% faster than 4-times
165	unrolling on both ARM Cortex-A8 and Cortex-M3. */
	166	while (DST_end - (char*) DST >= WORD_SIZE * 16) {
167	WORD_REF(DST, 0) = C_WORD;
168	WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
169	WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
170	WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
171	WORD_REF(DST, WORD_SIZE * 4) = C_WORD;
172	WORD_REF(DST, WORD_SIZE * 5) = C_WORD;
173	WORD_REF(DST, WORD_SIZE * 6) = C_WORD;
174	WORD_REF(DST, WORD_SIZE * 7) = C_WORD;
175	WORD_REF(DST, WORD_SIZE * 8) = C_WORD;
176	WORD_REF(DST, WORD_SIZE * 9) = C_WORD;
177	WORD_REF(DST, WORD_SIZE * 10) = C_WORD;
178	WORD_REF(DST, WORD_SIZE * 11) = C_WORD;
179	WORD_REF(DST, WORD_SIZE * 12) = C_WORD;
180	WORD_REF(DST, WORD_SIZE * 13) = C_WORD;
181	WORD_REF(DST, WORD_SIZE * 14) = C_WORD;
182	WORD_REF(DST, WORD_SIZE * 15) = C_WORD;
183	DST += WORD_SIZE * 16;
184	}
185	while (WORD_SIZE * 4 <= DST_end - (char*) DST) {
186	WORD_REF(DST, 0) = C_WORD;
187	WORD_REF(DST, WORD_SIZE * 1) = C_WORD;
188	WORD_REF(DST, WORD_SIZE * 2) = C_WORD;
189	WORD_REF(DST, WORD_SIZE * 3) = C_WORD;
190	DST += WORD_SIZE * 4;
191	}
192	while (WORD_SIZE <= DST_end - (char*) DST) {
193	WORD_REF(DST, 0) = C_WORD;
194	DST += WORD_SIZE;
195	}
196	#endif /* not defined(__ARM_NEON__) */
197
198	#if WORD_SIZE > SHORTWORD_SIZE
199	} /* end if N >= WORD_SIZE */
200
201	while (SHORTWORD_SIZE <= DST_end - (char*)DST) {
202	SHORTWORD_REF(DST, 0) = C_SHORTWORD;
203	DST += SHORTWORD_SIZE;
204	}
205	#endif /* WORD_SIZE > SHORTWORD_SIZE */
206
207	while ((char*)DST < DST_end) {
	208	*((char*)DST) = C_BYTE;
209	DST++;
210	}
211
212	return DST0;
213	#endif /* not PREFER_SIZE_OVER_SPEED */
214	}