~linaro-toolchain-dev/cortex-strings/trunk : contents of reference/csl/memcpy.c at revision 107

~linaro-toolchain-dev/cortex-strings/trunk : (revision 107)

/* Copyright (c) 2009 CodeSourcery, Inc.
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of CodeSourcery nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"
#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Standard operations for word-sized values.  WORD_TYPE and WORD_SIZE are
   chosen further down; that is fine because a macro body is only expanded
   where the macro is used.

   WORD_REF yields an lvalue for the WORD_TYPE object located OFFSET bytes
   past ADDRESS.  WORD_COPY copies one such object from IN to OUT.  Every
   replacement list is fully parenthesised so the macros keep their meaning
   when they appear inside a larger expression.  */
#define WORD_REF(ADDRESS, OFFSET) \
	(*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define WORD_COPY(OUT, IN, OFFSET) \
	(WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET))

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
  #include <arm_neon.h>

  #define WORD_TYPE uint8x16_t
  #define WORD_SIZE 16
  /* Read prefetch (second argument 0) with the lowest locality hint
     (third argument 0).  */
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
  #define WORD_TYPE uint64_t
  #define WORD_SIZE 8
  #define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching (MAYBE_PREFETCH expands to nothing).  */
#else
  #define WORD_TYPE uint32_t
  #define WORD_SIZE 4
  #define MAYBE_PREFETCH(IN)
#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.
   SHORTWORD_REF and SHORTWORD_COPY mirror WORD_REF and WORD_COPY for
   that narrower type.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
	(*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
	(SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET))

/* SHORTWORD_SHIFT combines two consecutive shortwords IN0 and IN1 (in
   memory order) into the shortword that starts OFFSET bytes into IN0.
   OFFSET must be strictly between 0 and SHORTWORD_SIZE, otherwise one of
   the shift counts reaches the full width of the type.
   Shifting directionality depends on endianness.  */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	(((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	(((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif

/* Copy N bytes from IN to OUT and return OUT.

   The copy always runs from low to high addresses and makes no provision
   for overlapping regions.  When PREFER_SIZE_OVER_SPEED or
   __OPTIMIZE_SIZE__ is defined, a plain byte loop is used.  Otherwise:

     1. Fewer than SHORTWORD_SIZE bytes: byte loop, then return.
     2. Copy single bytes until the destination is SHORTWORD_SIZE aligned.
     3. If the source is now SHORTWORD_SIZE aligned as well, copy in
        WORD_SIZE units where both pointers can be WORD_SIZE aligned,
        and in SHORTWORD_SIZE units otherwise.
     4. If it is not, build every destination shortword out of two
        neighbouring aligned source shortwords with SHORTWORD_SHIFT.
     5. Copy whatever is left one byte at a time.

   Step 4 loads whole aligned shortwords.  It therefore reads up to
   SHORTWORD_SIZE - 1 bytes in front of IN, and may read bytes beyond
   IN + N that share an aligned shortword with the last byte it consumes.
   All multi-byte accesses go through the WORD_REF / SHORTWORD_REF pointer
   casts.  */
void *memcpy(void *OUT, const void *IN, size_t N)
{
  /* ISO C defines no arithmetic on void pointers (it only works as a GNU
     extension), so all stepping is done on character cursors and the
     parameters themselves are left untouched.  */
  char *dst = (char *)OUT;
  const char *src = (const char *)IN;
  const char *const dst_end = dst + N;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Smallest code: one byte per iteration.  */
  while (dst < dst_end) {
    *dst++ = *src++;
  }

  return OUT;
#else
  /* Handle short copies and immediately return.  */
  if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
    while (dst < dst_end) {
      *dst++ = *src++;
    }
    return OUT;
  }

  /* Align dst to SHORTWORD_SIZE.  N >= SHORTWORD_SIZE here, so the at
     most SHORTWORD_SIZE - 1 bytes copied cannot exceed the request.  */
  while ((uintptr_t)dst % SHORTWORD_SIZE != 0) {
    *dst++ = *src++;
  }

  if ((uintptr_t)src % SHORTWORD_SIZE == 0) {

#if WORD_SIZE > SHORTWORD_SIZE
    /* Align dst to WORD_SIZE in steps of SHORTWORD_SIZE.  Only attempted
       when at least one full word remains, which also bounds the bytes
       spent on alignment.  */
    if (__builtin_expect(dst_end - dst >= WORD_SIZE, 0)) {
      while ((uintptr_t)dst % WORD_SIZE != 0) {
        SHORTWORD_COPY(dst, src, 0);
        dst += SHORTWORD_SIZE;
        src += SHORTWORD_SIZE;
      }

      if ((uintptr_t)src % WORD_SIZE == 0) {
#endif /* WORD_SIZE > SHORTWORD_SIZE */

#if defined(__ARM_NEON__)
        /* Testing on Cortex-A8 indicates that the following idiom
           (fixed base pointers plus a running byte index) produces
           faster assembly code when doing vector copies, but not when
           doing regular copies.  */
        size_t i = 0;
        N = (size_t)(dst_end - dst);  /* bytes still to copy */
        MAYBE_PREFETCH(src + 64);
        MAYBE_PREFETCH(src + 128);
        MAYBE_PREFETCH(src + 192);
        if (N >= 640) {
          MAYBE_PREFETCH(src + 256);
          MAYBE_PREFETCH(src + 320);
          MAYBE_PREFETCH(src + 384);
          MAYBE_PREFETCH(src + 448);
          MAYBE_PREFETCH(src + 512);
          MAYBE_PREFETCH(src + 576);
          MAYBE_PREFETCH(src + 640);
          MAYBE_PREFETCH(src + 704);
          /* We phrase the loop condition in this way so that the
             i + WORD_SIZE * 16 value can be reused to increment i.
             N >= 640 was checked above, so N - 640 cannot wrap.  */
          while (i + WORD_SIZE * 16 <= N - 640) {
            MAYBE_PREFETCH(src + 768);
            MAYBE_PREFETCH(src + 832);
            MAYBE_PREFETCH(src + 896);
            MAYBE_PREFETCH(src + 960);
            WORD_COPY(dst, src, i);
            WORD_COPY(dst, src, i + WORD_SIZE * 1);
            WORD_COPY(dst, src, i + WORD_SIZE * 2);
            WORD_COPY(dst, src, i + WORD_SIZE * 3);
            WORD_COPY(dst, src, i + WORD_SIZE * 4);
            WORD_COPY(dst, src, i + WORD_SIZE * 5);
            WORD_COPY(dst, src, i + WORD_SIZE * 6);
            WORD_COPY(dst, src, i + WORD_SIZE * 7);
            WORD_COPY(dst, src, i + WORD_SIZE * 8);
            WORD_COPY(dst, src, i + WORD_SIZE * 9);
            WORD_COPY(dst, src, i + WORD_SIZE * 10);
            WORD_COPY(dst, src, i + WORD_SIZE * 11);
            WORD_COPY(dst, src, i + WORD_SIZE * 12);
            WORD_COPY(dst, src, i + WORD_SIZE * 13);
            WORD_COPY(dst, src, i + WORD_SIZE * 14);
            WORD_COPY(dst, src, i + WORD_SIZE * 15);
            i += WORD_SIZE * 16;
          }
        }
        /* Same 16-word unrolling, without further prefetching.  */
        while (i + WORD_SIZE * 16 <= N) {
          WORD_COPY(dst, src, i);
          WORD_COPY(dst, src, i + WORD_SIZE * 1);
          WORD_COPY(dst, src, i + WORD_SIZE * 2);
          WORD_COPY(dst, src, i + WORD_SIZE * 3);
          WORD_COPY(dst, src, i + WORD_SIZE * 4);
          WORD_COPY(dst, src, i + WORD_SIZE * 5);
          WORD_COPY(dst, src, i + WORD_SIZE * 6);
          WORD_COPY(dst, src, i + WORD_SIZE * 7);
          WORD_COPY(dst, src, i + WORD_SIZE * 8);
          WORD_COPY(dst, src, i + WORD_SIZE * 9);
          WORD_COPY(dst, src, i + WORD_SIZE * 10);
          WORD_COPY(dst, src, i + WORD_SIZE * 11);
          WORD_COPY(dst, src, i + WORD_SIZE * 12);
          WORD_COPY(dst, src, i + WORD_SIZE * 13);
          WORD_COPY(dst, src, i + WORD_SIZE * 14);
          WORD_COPY(dst, src, i + WORD_SIZE * 15);
          i += WORD_SIZE * 16;
        }
        while (i + WORD_SIZE * 4 <= N) {
          WORD_COPY(dst, src, i);
          WORD_COPY(dst, src, i + WORD_SIZE * 1);
          WORD_COPY(dst, src, i + WORD_SIZE * 2);
          WORD_COPY(dst, src, i + WORD_SIZE * 3);
          i += WORD_SIZE * 4;
        }
        while (i + WORD_SIZE <= N) {
          WORD_COPY(dst, src, i);
          i += WORD_SIZE;
        }
        /* Bring the cursors up to date with the indexed copies.  */
        dst += i;
        src += i;
#else /* not defined(__ARM_NEON__) */
        /* Note: 16-times unrolling is about 20% faster than 4-times
           unrolling on both ARM Cortex-A8 and Cortex-M3.  */
        MAYBE_PREFETCH(src + 64);
        MAYBE_PREFETCH(src + 128);
        MAYBE_PREFETCH(src + 192);
        while (dst_end - dst >= WORD_SIZE * 16) {
          MAYBE_PREFETCH(src + 256);
          MAYBE_PREFETCH(src + 320);
          WORD_COPY(dst, src, 0);
          WORD_COPY(dst, src, WORD_SIZE * 1);
          WORD_COPY(dst, src, WORD_SIZE * 2);
          WORD_COPY(dst, src, WORD_SIZE * 3);
          WORD_COPY(dst, src, WORD_SIZE * 4);
          WORD_COPY(dst, src, WORD_SIZE * 5);
          WORD_COPY(dst, src, WORD_SIZE * 6);
          WORD_COPY(dst, src, WORD_SIZE * 7);
          WORD_COPY(dst, src, WORD_SIZE * 8);
          WORD_COPY(dst, src, WORD_SIZE * 9);
          WORD_COPY(dst, src, WORD_SIZE * 10);
          WORD_COPY(dst, src, WORD_SIZE * 11);
          WORD_COPY(dst, src, WORD_SIZE * 12);
          WORD_COPY(dst, src, WORD_SIZE * 13);
          WORD_COPY(dst, src, WORD_SIZE * 14);
          WORD_COPY(dst, src, WORD_SIZE * 15);
          dst += WORD_SIZE * 16;
          src += WORD_SIZE * 16;
        }
        while (WORD_SIZE * 4 <= dst_end - dst) {
          WORD_COPY(dst, src, 0);
          WORD_COPY(dst, src, WORD_SIZE * 1);
          WORD_COPY(dst, src, WORD_SIZE * 2);
          WORD_COPY(dst, src, WORD_SIZE * 3);
          dst += WORD_SIZE * 4;
          src += WORD_SIZE * 4;
        }
        while (WORD_SIZE <= dst_end - dst) {
          WORD_COPY(dst, src, 0);
          dst += WORD_SIZE;
          src += WORD_SIZE;
        }
#endif /* not defined(__ARM_NEON__) */

#if WORD_SIZE > SHORTWORD_SIZE
      } else { /* if src is not WORD_SIZE aligned */
        while (SHORTWORD_SIZE * 4 <= dst_end - dst) {
          SHORTWORD_COPY(dst, src, 0);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 1);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 2);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 3);
          dst += SHORTWORD_SIZE * 4;
          src += SHORTWORD_SIZE * 4;
        }
      } /* end if src is not WORD_SIZE aligned */
    } /* end if at least WORD_SIZE bytes remained */

    /* Mop up the shortwords the wider loops could not take.  */
    while (SHORTWORD_SIZE <= dst_end - dst) {
      SHORTWORD_COPY(dst, src, 0);
      dst += SHORTWORD_SIZE;
      src += SHORTWORD_SIZE;
    }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

  } else { /* if src is not SHORTWORD_SIZE aligned */
    /* Distance from the preceding aligned shortword, 1 .. 3.  */
    ptrdiff_t misalign = (uintptr_t)src % SHORTWORD_SIZE;

    /* prev holds the aligned shortword that contains *src; next is the
       one following it.  Each output shortword is stitched from both.  */
    SHORTWORD_TYPE prev, next;
    prev = SHORTWORD_REF(src, -misalign);

    /* Benchmarking indicates that unrolling this loop doesn't
       produce a measurable performance improvement on ARM.  */
    while (SHORTWORD_SIZE <= dst_end - dst) {
      src += SHORTWORD_SIZE;
      next = SHORTWORD_REF(src, -misalign);
      SHORTWORD_REF(dst, 0) = SHORTWORD_SHIFT(prev, next, misalign);
      prev = next;
      dst += SHORTWORD_SIZE;
    }

  } /* end if src is not SHORTWORD_SIZE aligned */

  /* Trailing bytes that did not fill a whole shortword.  */
  while (dst < dst_end) {
    *dst++ = *src++;
  }

  return OUT;
#endif
}

4 by Michael Hope Modified the imported versions to build locally. Added the CSL routines.	1	/* Copyright (c) 2009 CodeSourcery, Inc.
	2	* All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions are met:
	6	* * Redistributions of source code must retain the above copyright
	7	* notice, this list of conditions and the following disclaimer.
	8	* * Redistributions in binary form must reproduce the above copyright
	9	* notice, this list of conditions and the following disclaimer in the
	10	* documentation and/or other materials provided with the distribution.
	11	* * Neither the name of CodeSourcery nor the
	12	* names of its contributors may be used to endorse or promote products
	13	* derived from this software without specific prior written permission.
	14	*
	15	* THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
	16	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	17	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	18	* DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
	19	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	20	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	21	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	22	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	24	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	25	*/
	26
	27	#include "arm_asm.h"
	28	#include <string.h>
	29	#include <stdint.h>
	30	#include <stddef.h>
	31
	32	/* Standard operations for word-sized values. */
	33	#define WORD_REF(ADDRESS, OFFSET) \
	34	*((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
	35	#define WORD_COPY(OUT, IN, OFFSET) \
	36	WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET)
	37
	38	/* On processors with NEON, we use 128-bit vectors. Also,
	39	we need to include arm_neon.h to use these. */
	40	#if defined(__ARM_NEON__)
	41	#include <arm_neon.h>
	42
	43	#define WORD_TYPE uint8x16_t
	44	#define WORD_SIZE 16
	45	#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)
	46
	47	/* On ARM processors with 64-bit ldrd instructions, we use those,
	48	except on Cortex-M* where benchmarking has shown them to
	49	be slower. */
	50	#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
	51	|| defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
	52	#define WORD_TYPE uint64_t
	53	#define WORD_SIZE 8
	54	#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)
	55
	56	/* On everything else, we use 32-bit loads and stores, and
	57	do not use prefetching. */
	58	#else
	59	#define WORD_TYPE uint32_t
	60	#define WORD_SIZE 4
	61	#define MAYBE_PREFETCH(IN)
	62	#endif
	63
	64	/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
65	#define SHORTWORD_TYPE uint32_t
66	#define SHORTWORD_SIZE 4
67	#define SHORTWORD_REF(ADDRESS, OFFSET) \
68	*((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
69	#define SHORTWORD_COPY(OUT, IN, OFFSET) \
70	SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET)
71
72	/* Shifting directionality depends on endianness. */
73	#ifdef __ARMEB__
74	#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	75	((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8))
76	#else
77	#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
	78	((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8))
79	#endif
80
	81	void *memcpy(void *OUT, const void *IN, size_t N)
82	{
83	void* OUT0 = OUT;
84
	85	#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
86	const char* OUT_end = (char*)OUT + N;
87	while ((char*)OUT < OUT_end) {
	88	*((char*)OUT) = *((char*)IN);
89	OUT++;
90	IN++;
91	}
92
93	return OUT0;
94	#else
95	/* Handle short strings and immediately return. */
96	if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
97	size_t i = 0;
98	while (i < N) {
	99	((char*)OUT)[i] = ((char*)IN)[i];
100	i++;
101	}
102	return OUT;
103	}
104
105	const char* OUT_end = (char*)OUT + N;
106
107	/* Align OUT to SHORTWORD_SIZE. */
108	while ((uintptr_t)OUT % SHORTWORD_SIZE != 0) {
	109	*(char*) (OUT++) = *(char*) (IN++);
110	}
111
112	if ((uintptr_t) IN % SHORTWORD_SIZE == 0) {
113
114	#if WORD_SIZE > SHORTWORD_SIZE
115	/* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE. */
116	if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0)) {
117	while ((uintptr_t)OUT % WORD_SIZE != 0) {
118	SHORTWORD_COPY(OUT, IN, 0);
119	OUT += SHORTWORD_SIZE;
120	IN += SHORTWORD_SIZE;
121	}
122
123	if ((uintptr_t) IN % WORD_SIZE == 0) {
124	#endif /* WORD_SIZE > SHORTWORD_SIZE */
125
126	#if defined(__ARM_NEON__)
127	/* Testing on Cortex-A8 indicates that the following idiom
128	produces faster assembly code when doing vector copies,
129	but not when doing regular copies. */
130	size_t i = 0;
131	N = OUT_end - (char*)OUT;
132	MAYBE_PREFETCH(IN + 64);
133	MAYBE_PREFETCH(IN + 128);
134	MAYBE_PREFETCH(IN + 192);
135	if (N >= 640) {
136	MAYBE_PREFETCH(IN + 256);
137	MAYBE_PREFETCH(IN + 320);
138	MAYBE_PREFETCH(IN + 384);
139	MAYBE_PREFETCH(IN + 448);
140	MAYBE_PREFETCH(IN + 512);
141	MAYBE_PREFETCH(IN + 576);
142	MAYBE_PREFETCH(IN + 640);
143	MAYBE_PREFETCH(IN + 704);
144	/* We phrase the loop condition in this way so that the
145	i + WORD_SIZE * 16 value can be reused to increment i. */
146	while (i + WORD_SIZE * 16 <= N - 640) {
147	MAYBE_PREFETCH(IN + 768);
148	MAYBE_PREFETCH(IN + 832);
149	MAYBE_PREFETCH(IN + 896);
150	MAYBE_PREFETCH(IN + 960);
151	WORD_COPY(OUT, IN, i);
152	WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
153	WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
154	WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
155	WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
156	WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
157	WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
158	WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
159	WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
160	WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
161	WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
162	WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
163	WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
164	WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
165	WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
166	WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
167	i += WORD_SIZE * 16;
168	}
169	}
170	while (i + WORD_SIZE * 16 <= N) {
171	WORD_COPY(OUT, IN, i);
172	WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
173	WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
174	WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
175	WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
176	WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
177	WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
178	WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
179	WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
180	WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
181	WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
182	WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
183	WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
184	WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
185	WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
186	WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
187	i += WORD_SIZE * 16;
188	}
189	while (i + WORD_SIZE * 4 <= N) {
190	WORD_COPY(OUT, IN, i);
191	WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
192	WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
193	WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
194	i += WORD_SIZE * 4;
195	}
196	while (i + WORD_SIZE <= N) {
197	WORD_COPY(OUT, IN, i);
198	i += WORD_SIZE;
199	}
200	OUT += i;
201	IN += i;
202	#else /* not defined(__ARM_NEON__) */
203	/* Note: 16-times unrolling is about 20% faster than 4-times
204	unrolling on both ARM Cortex-A8 and Cortex-M3. */
205	MAYBE_PREFETCH(IN + 64);
206	MAYBE_PREFETCH(IN + 128);
207	MAYBE_PREFETCH(IN + 192);
	208	while (OUT_end - (char*)OUT >= WORD_SIZE * 16) {
209	MAYBE_PREFETCH(IN + 256);
210	MAYBE_PREFETCH(IN + 320);
211	WORD_COPY(OUT, IN, 0);
212	WORD_COPY(OUT, IN, WORD_SIZE * 1);
213	WORD_COPY(OUT, IN, WORD_SIZE * 2);
214	WORD_COPY(OUT, IN, WORD_SIZE * 3);
215	WORD_COPY(OUT, IN, WORD_SIZE * 4);
216	WORD_COPY(OUT, IN, WORD_SIZE * 5);
217	WORD_COPY(OUT, IN, WORD_SIZE * 6);
218	WORD_COPY(OUT, IN, WORD_SIZE * 7);
219	WORD_COPY(OUT, IN, WORD_SIZE * 8);
220	WORD_COPY(OUT, IN, WORD_SIZE * 9);
221	WORD_COPY(OUT, IN, WORD_SIZE * 10);
222	WORD_COPY(OUT, IN, WORD_SIZE * 11);
223	WORD_COPY(OUT, IN, WORD_SIZE * 12);
224	WORD_COPY(OUT, IN, WORD_SIZE * 13);
225	WORD_COPY(OUT, IN, WORD_SIZE * 14);
226	WORD_COPY(OUT, IN, WORD_SIZE * 15);
227	OUT += WORD_SIZE * 16;
228	IN += WORD_SIZE * 16;
229	}
230	while (WORD_SIZE * 4 <= OUT_end - (char*)OUT) {
231	WORD_COPY(OUT, IN, 0);
232	WORD_COPY(OUT, IN, WORD_SIZE * 1);
233	WORD_COPY(OUT, IN, WORD_SIZE * 2);
234	WORD_COPY(OUT, IN, WORD_SIZE * 3);
235	OUT += WORD_SIZE * 4;
236	IN += WORD_SIZE * 4;
237	}
238	while (WORD_SIZE <= OUT_end - (char*)OUT) {
239	WORD_COPY(OUT, IN, 0);
240	OUT += WORD_SIZE;
241	IN += WORD_SIZE;
242	}
243	#endif /* not defined(__ARM_NEON__) */
244
245	#if WORD_SIZE > SHORTWORD_SIZE
246	} else { /* if IN is not WORD_SIZE aligned */
247	while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT) {
248	SHORTWORD_COPY(OUT, IN, 0);
249	SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
250	SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
251	SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
252	OUT += SHORTWORD_SIZE * 4;
253	IN += SHORTWORD_SIZE * 4;
254	}
255	} /* end if IN is not WORD_SIZE aligned */
256	} /* end if N >= WORD_SIZE */
257
258	while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
259	SHORTWORD_COPY(OUT, IN, 0);
260	OUT += SHORTWORD_SIZE;
261	IN += SHORTWORD_SIZE;
262	}
263	#endif /* WORD_SIZE > SHORTWORD_SIZE */
264
265	} else { /* if IN is not SHORTWORD_SIZE aligned */
266	ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
267
268	SHORTWORD_TYPE temp1, temp2;
269	temp1 = SHORTWORD_REF(IN, -misalign);
270
271	/* Benchmarking indicates that unrolling this loop doesn't
272	produce a measurable performance improvement on ARM. */
273	while (SHORTWORD_SIZE <= OUT_end - (char*)OUT) {
274	IN += SHORTWORD_SIZE;
275	temp2 = SHORTWORD_REF(IN, -misalign);
276	SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
277	temp1 = temp2;
278	OUT += SHORTWORD_SIZE;
279	}
280
281	} /* end if IN is not SHORTWORD_SIZE aligned */
282
283	while ((char*)OUT < OUT_end) {
	284	*((char*)OUT) = *((char*)IN);
285	OUT++;
286	IN++;
287	}
288
289	return OUT0;
290	#endif
291	}