/* Copyright (c) 2009 CodeSourcery, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of CodeSourcery nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "arm_asm.h" #include #include /* Standard operations for word-sized values. */ #define WORD_REF(ADDRESS, OFFSET) \ *((WORD_TYPE*)((char*)(ADDRESS) + (OFFSET))) /* On processors with NEON, we use 128-bit vectors. Also, we need to include arm_neon.h to use these. */ #if defined(__ARM_NEON__) #include #define WORD_TYPE uint8x16_t #define WORD_SIZE 16 #define WORD_DUPLICATE(VALUE) \ vdupq_n_u8(VALUE) /* On ARM processors with 64-bit ldrd instructions, we use those, except on Cortex-M* where benchmarking has shown them to be slower. */ #elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \ || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6) #define WORD_TYPE uint64_t #define WORD_SIZE 8 /* ARM stores 64-bit values in two 32-bit registers and does not have 64-bit multiply or bitwise-or instructions, so this union operation results in optimal code. */ static inline uint64_t splat8(value) { union { uint32_t ints[2]; uint64_t result; } quad; quad.ints[0] = (unsigned char)(value) * 0x01010101; quad.ints[1] = quad.ints[0]; return quad.result; } #define WORD_DUPLICATE(VALUE) \ splat8(VALUE) /* On everything else, we use 32-bit loads and stores. */ #else #define WORD_TYPE uint32_t #define WORD_SIZE 4 #define WORD_DUPLICATE(VALUE) \ (unsigned char)(VALUE) * 0x01010101 #endif /* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */ #define SHORTWORD_TYPE uint32_t #define SHORTWORD_SIZE 4 #define SHORTWORD_REF(ADDRESS, OFFSET) \ *((SHORTWORD_TYPE*)((char*)(ADDRESS) + (OFFSET))) #define SHORTWORD_DUPLICATE(VALUE) \ (uint32_t)(unsigned char)(VALUE) * 0x01010101 void *memset(void *DST, int C, size_t LENGTH) { void* DST0 = DST; unsigned char C_BYTE = C; #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) const char* DST_end = (char*)DST + LENGTH; while ((char*)DST < DST_end) { *((char*)DST) = C_BYTE; DST++; } return DST0; #else /* not PREFER_SIZE_OVER_SPEED */ /* Handle short strings and immediately return. */ if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) { size_t i = 0; while (i < LENGTH) { ((char*)DST)[i] = C_BYTE; i++; } return DST; } const char* DST_end = (char*)DST + LENGTH; /* Align DST to SHORTWORD_SIZE. */ while ((uintptr_t)DST % SHORTWORD_SIZE != 0) { *(char*) (DST++) = C_BYTE; } #if WORD_SIZE > SHORTWORD_SIZE SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE); /* Align DST to WORD_SIZE in steps of SHORTWORD_SIZE. */ if (__builtin_expect(DST_end - (char*)DST >= WORD_SIZE, 0)) { while ((uintptr_t)DST % WORD_SIZE != 0) { SHORTWORD_REF(DST, 0) = C_SHORTWORD; DST += SHORTWORD_SIZE; } #endif /* WORD_SIZE > SHORTWORD_SIZE */ WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE); #if defined(__ARM_NEON__) /* Testing on Cortex-A8 indicates that the following idiom produces faster assembly code when doing vector copies, but not when doing regular copies. */ size_t i = 0; LENGTH = DST_end - (char*)DST; while (i + WORD_SIZE * 16 <= LENGTH) { WORD_REF(DST, i) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 4) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 5) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 6) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 7) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 8) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 9) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 10) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 11) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 12) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 13) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 14) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 15) = C_WORD; i += WORD_SIZE * 16; } while (i + WORD_SIZE * 4 <= LENGTH) { WORD_REF(DST, i) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD; WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD; i += WORD_SIZE * 4; } while (i + WORD_SIZE <= LENGTH) { WORD_REF(DST, i) = C_WORD; i += WORD_SIZE; } DST += i; #else /* not defined(__ARM_NEON__) */ /* Note: 16-times unrolling is about 50% faster than 4-times unrolling on both ARM Cortex-A8 and Cortex-M3. */ while (DST_end - (char*) DST >= WORD_SIZE * 16) { WORD_REF(DST, 0) = C_WORD; WORD_REF(DST, WORD_SIZE * 1) = C_WORD; WORD_REF(DST, WORD_SIZE * 2) = C_WORD; WORD_REF(DST, WORD_SIZE * 3) = C_WORD; WORD_REF(DST, WORD_SIZE * 4) = C_WORD; WORD_REF(DST, WORD_SIZE * 5) = C_WORD; WORD_REF(DST, WORD_SIZE * 6) = C_WORD; WORD_REF(DST, WORD_SIZE * 7) = C_WORD; WORD_REF(DST, WORD_SIZE * 8) = C_WORD; WORD_REF(DST, WORD_SIZE * 9) = C_WORD; WORD_REF(DST, WORD_SIZE * 10) = C_WORD; WORD_REF(DST, WORD_SIZE * 11) = C_WORD; WORD_REF(DST, WORD_SIZE * 12) = C_WORD; WORD_REF(DST, WORD_SIZE * 13) = C_WORD; WORD_REF(DST, WORD_SIZE * 14) = C_WORD; WORD_REF(DST, WORD_SIZE * 15) = C_WORD; DST += WORD_SIZE * 16; } while (WORD_SIZE * 4 <= DST_end - (char*) DST) { WORD_REF(DST, 0) = C_WORD; WORD_REF(DST, WORD_SIZE * 1) = C_WORD; WORD_REF(DST, WORD_SIZE * 2) = C_WORD; WORD_REF(DST, WORD_SIZE * 3) = C_WORD; DST += WORD_SIZE * 4; } while (WORD_SIZE <= DST_end - (char*) DST) { WORD_REF(DST, 0) = C_WORD; DST += WORD_SIZE; } #endif /* not defined(__ARM_NEON__) */ #if WORD_SIZE > SHORTWORD_SIZE } /* end if N >= WORD_SIZE */ while (SHORTWORD_SIZE <= DST_end - (char*)DST) { SHORTWORD_REF(DST, 0) = C_SHORTWORD; DST += SHORTWORD_SIZE; } #endif /* WORD_SIZE > SHORTWORD_SIZE */ while ((char*)DST < DST_end) { *((char*)DST) = C_BYTE; DST++; } return DST0; #endif /* not PREFER_SIZE_OVER_SPEED */ }