/* Copyright (c) 2009 CodeSourcery, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of CodeSourcery nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* arm_asm.h is an ARM-only project header (it supplies target-detection
   macros such as _ISA_ARM_6 used below).  Guarding it keeps the file
   compilable on non-ARM hosts for unit testing, where the generic 32-bit
   path is selected anyway; on ARM builds the guard is always true.  */
#ifdef __arm__
#include "arm_asm.h"
#endif
#include <string.h>  /* memcpy prototype, size_t */
#include <stdint.h>  /* uint32_t, uint64_t, uintptr_t */
#include <stddef.h>  /* ptrdiff_t */

/* Standard operations for word-sized values.  A "word" is the widest
   unit the target can load and store efficiently (WORD_TYPE below).  */
#define WORD_REF(ADDRESS, OFFSET) \
  *((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
#define WORD_COPY(OUT, IN, OFFSET) \
  WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET)

/* On processors with NEON, we use 128-bit vectors.  Also, we need to
   include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)
#include <arm_neon.h>
#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
      || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)
#define WORD_TYPE uint64_t
#define WORD_SIZE 8
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and do not use
   prefetching.  */
#else
#define WORD_TYPE uint32_t
#define WORD_SIZE 4
#define MAYBE_PREFETCH(IN)
#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  *((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET)))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
  SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET)

/* Shifting directionality depends on endianness.  Given two consecutive
   aligned words IN0 and IN1 and a byte misalignment OFFSET (1..3), this
   reassembles the unaligned 32-bit value that starts OFFSET bytes into
   IN0.  The expansion is fully parenthesized so it is safe inside larger
   expressions.  */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) << ((OFFSET)*8)) | ((IN1) >> (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) >> ((OFFSET)*8)) | ((IN1) << (SHORTWORD_SIZE*8 - (OFFSET)*8)))
#endif

/* Copy N bytes from IN to OUT; the regions must not overlap (callers
   needing overlap use memmove).  Returns the original value of OUT.

   Strategy (speed-optimized build): byte-copy until OUT is 4-byte
   aligned, then
     - if IN is then also 4-byte aligned, bulk-copy in WORD_SIZE units
       with 16x unrolling plus software prefetch where it helps;
     - otherwise load aligned 32-bit words from IN and reassemble the
       unaligned data with SHORTWORD_SHIFT;
   finally byte-copy any sub-word tail.

   NOTE(review): arithmetic on void* and word accesses through casted
   pointers rely on GCC extensions and memcpy-special aliasing treatment,
   which is the norm for a libc memcpy implementation — do not compile
   this file with strict ISO pedantic errors.  */
void *memcpy(void *OUT, const void *IN, size_t N)
{
  void* OUT0 = OUT;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  /* Size-optimized build: the plain byte loop is the smallest code.  */
  const char* OUT_end = (char*)OUT + N;
  while ((char*)OUT < OUT_end)
    {
      *((char*)OUT) = *((const char*)IN);
      OUT++;
      IN++;
    }

  return OUT0;
#else
  /* Handle short strings and immediately return.  */
  if (__builtin_expect(N < SHORTWORD_SIZE, 1))
    {
      size_t i = 0;
      while (i < N)
        {
          ((char*)OUT)[i] = ((const char*)IN)[i];
          i++;
        }
      return OUT;
    }

  const char* OUT_end = (char*)OUT + N;

  /* Align OUT to SHORTWORD_SIZE.  */
  while ((uintptr_t)OUT % SHORTWORD_SIZE != 0)
    {
      *(char*) (OUT++) = *(const char*) (IN++);
    }

  if ((uintptr_t) IN % SHORTWORD_SIZE == 0)
    {
#if WORD_SIZE > SHORTWORD_SIZE
      /* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE.  */
      if (__builtin_expect(OUT_end - (char*)OUT >= WORD_SIZE, 0))
        {
          while ((uintptr_t)OUT % WORD_SIZE != 0)
            {
              SHORTWORD_COPY(OUT, IN, 0);
              OUT += SHORTWORD_SIZE;
              IN += SHORTWORD_SIZE;
            }

          if ((uintptr_t) IN % WORD_SIZE == 0)
            {
#endif /* WORD_SIZE > SHORTWORD_SIZE */
#if defined(__ARM_NEON__)
              /* Testing on Cortex-A8 indicates that the following idiom
                 (indexed WORD_COPY with a running offset i) produces
                 faster assembly code when doing vector copies, but not
                 when doing regular copies.  */
              size_t i = 0;
              N = OUT_end - (char*)OUT;
              MAYBE_PREFETCH(IN + 64);
              MAYBE_PREFETCH(IN + 128);
              MAYBE_PREFETCH(IN + 192);
              if (N >= 640)
                {
                  MAYBE_PREFETCH(IN + 256);
                  MAYBE_PREFETCH(IN + 320);
                  MAYBE_PREFETCH(IN + 384);
                  MAYBE_PREFETCH(IN + 448);
                  MAYBE_PREFETCH(IN + 512);
                  MAYBE_PREFETCH(IN + 576);
                  MAYBE_PREFETCH(IN + 640);
                  MAYBE_PREFETCH(IN + 704);

                  /* We phrase the loop condition in this way so that the
                     i + WORD_SIZE * 16 value can be reused to increment
                     i.  */
                  while (i + WORD_SIZE * 16 <= N - 640)
                    {
                      MAYBE_PREFETCH(IN + 768);
                      MAYBE_PREFETCH(IN + 832);
                      MAYBE_PREFETCH(IN + 896);
                      MAYBE_PREFETCH(IN + 960);
                      WORD_COPY(OUT, IN, i);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
                      WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
                      i += WORD_SIZE * 16;
                    }
                }

              while (i + WORD_SIZE * 16 <= N)
                {
                  WORD_COPY(OUT, IN, i);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 4);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 5);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 6);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 7);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 8);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 9);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 10);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 11);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 12);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 13);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 14);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 15);
                  i += WORD_SIZE * 16;
                }

              while (i + WORD_SIZE * 4 <= N)
                {
                  WORD_COPY(OUT, IN, i);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 1);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 2);
                  WORD_COPY(OUT, IN, i + WORD_SIZE * 3);
                  i += WORD_SIZE * 4;
                }

              while (i + WORD_SIZE <= N)
                {
                  WORD_COPY(OUT, IN, i);
                  i += WORD_SIZE;
                }

              OUT += i;
              IN += i;
#else /* not defined(__ARM_NEON__) */
              /* Note: 16-times unrolling is about 20% faster than 4-times
                 unrolling on both ARM Cortex-A8 and Cortex-M3.  */
              MAYBE_PREFETCH(IN + 64);
              MAYBE_PREFETCH(IN + 128);
              MAYBE_PREFETCH(IN + 192);
              while (OUT_end - (char*)OUT >= WORD_SIZE * 16)
                {
                  MAYBE_PREFETCH(IN + 256);
                  MAYBE_PREFETCH(IN + 320);
                  WORD_COPY(OUT, IN, 0);
                  WORD_COPY(OUT, IN, WORD_SIZE * 1);
                  WORD_COPY(OUT, IN, WORD_SIZE * 2);
                  WORD_COPY(OUT, IN, WORD_SIZE * 3);
                  WORD_COPY(OUT, IN, WORD_SIZE * 4);
                  WORD_COPY(OUT, IN, WORD_SIZE * 5);
                  WORD_COPY(OUT, IN, WORD_SIZE * 6);
                  WORD_COPY(OUT, IN, WORD_SIZE * 7);
                  WORD_COPY(OUT, IN, WORD_SIZE * 8);
                  WORD_COPY(OUT, IN, WORD_SIZE * 9);
                  WORD_COPY(OUT, IN, WORD_SIZE * 10);
                  WORD_COPY(OUT, IN, WORD_SIZE * 11);
                  WORD_COPY(OUT, IN, WORD_SIZE * 12);
                  WORD_COPY(OUT, IN, WORD_SIZE * 13);
                  WORD_COPY(OUT, IN, WORD_SIZE * 14);
                  WORD_COPY(OUT, IN, WORD_SIZE * 15);
                  OUT += WORD_SIZE * 16;
                  IN += WORD_SIZE * 16;
                }

              while (WORD_SIZE * 4 <= OUT_end - (char*)OUT)
                {
                  WORD_COPY(OUT, IN, 0);
                  WORD_COPY(OUT, IN, WORD_SIZE * 1);
                  WORD_COPY(OUT, IN, WORD_SIZE * 2);
                  WORD_COPY(OUT, IN, WORD_SIZE * 3);
                  OUT += WORD_SIZE * 4;
                  IN += WORD_SIZE * 4;
                }

              while (WORD_SIZE <= OUT_end - (char*)OUT)
                {
                  WORD_COPY(OUT, IN, 0);
                  OUT += WORD_SIZE;
                  IN += WORD_SIZE;
                }
#endif /* not defined(__ARM_NEON__) */
#if WORD_SIZE > SHORTWORD_SIZE
            }
          else /* if IN is not WORD_SIZE aligned */
            {
              while (SHORTWORD_SIZE * 4 <= OUT_end - (char*)OUT)
                {
                  SHORTWORD_COPY(OUT, IN, 0);
                  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 1);
                  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 2);
                  SHORTWORD_COPY(OUT, IN, SHORTWORD_SIZE * 3);
                  OUT += SHORTWORD_SIZE * 4;
                  IN += SHORTWORD_SIZE * 4;
                }
            } /* end if IN is not WORD_SIZE aligned */
        } /* end if N >= WORD_SIZE */

      /* Mop up remaining SHORTWORD-sized chunks.  */
      while (SHORTWORD_SIZE <= OUT_end - (char*)OUT)
        {
          SHORTWORD_COPY(OUT, IN, 0);
          OUT += SHORTWORD_SIZE;
          IN += SHORTWORD_SIZE;
        }
#endif /* WORD_SIZE > SHORTWORD_SIZE */
    }
  else /* if IN is not SHORTWORD_SIZE aligned */
    {
      ptrdiff_t misalign = (uintptr_t)IN % SHORTWORD_SIZE;
      SHORTWORD_TYPE temp1, temp2;
      /* temp1 holds the aligned word containing the first source byte.
         NOTE(review): this reads up to 'misalign' bytes before IN; an
         aligned 32-bit load cannot cross a page boundary, which is the
         usual justification in libc copy routines.  */
      temp1 = SHORTWORD_REF(IN, -misalign);

      /* Benchmarking indicates that unrolling this loop doesn't
         produce a measurable performance improvement on ARM.  */
      while (SHORTWORD_SIZE <= OUT_end - (char*)OUT)
        {
          IN += SHORTWORD_SIZE;
          temp2 = SHORTWORD_REF(IN, -misalign);
          SHORTWORD_REF(OUT, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
          temp1 = temp2;
          OUT += SHORTWORD_SIZE;
        }
    } /* end if IN is not SHORTWORD_SIZE aligned */

  /* Byte-copy whatever tail (fewer than SHORTWORD_SIZE bytes) remains.  */
  while ((char*)OUT < OUT_end)
    {
      *((char*)OUT) = *((const char*)IN);
      OUT++;
      IN++;
    }

  return OUT0;
#endif /* PREFER_SIZE_OVER_SPEED / __OPTIMIZE_SIZE__ */
}