~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/reference/csl/memset.c

Committer: Michael Hope
Date: 2010-08-30 23:30:25 UTC
Revision ID: michael.hope@linaro.org-20100830233025-2f14wknqev6ryj01

Modified the imported versions to build locally. Added the CSL routines.

files added:
src/reference/csl

src/reference/csl/arm_asm.h

src/reference/csl/memcpy.c

src/reference/csl/memset.c

src/reference/plain/memset.c

src/reference/plain/strcmp.c

src/reference/plain/strcpy.c

files modified:
src/reference/Makefile.am

src/reference/bionic/memcmp.S

src/reference/bionic/memcmp16.S

src/reference/bionic/memcpy.S

src/reference/glibc/memmove.S

src/reference/glibc/memset.S

Show diffs side-by-side

added added

removed removed

src/reference/csl/memset.c

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions are met:

* * Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* * Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in the

* documentation and/or other materials provided with the distribution.

* * Neither the name of CodeSourcery nor the

* names of its contributors may be used to endorse or promote products

* derived from this software without specific prior written permission.

* THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY

* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

* DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY

* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "arm_asm.h"

#include <string.h>

#include <stdint.h>

/* Standard operations for word-sized values. */

/* WORD_REF(ADDRESS, OFFSET) yields an lvalue of type WORD_TYPE located
   OFFSET bytes past ADDRESS.  WORD_TYPE must be defined before the macro
   is used.  The expansion is fully parenthesised so it can be combined
   with any surrounding operator, and the whole definition sits on
   directly adjacent lines so the line continuation actually joins it. */
#define WORD_REF(ADDRESS, OFFSET) \
  (*((WORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))

/* Selection of the widest store unit used by the fill loops.

   Each branch defines:
     WORD_TYPE             the type stored by WORD_REF,
     WORD_SIZE             its size in bytes,
     WORD_DUPLICATE(VALUE) a WORD_TYPE whose every byte equals the low
                           8 bits of VALUE.  */

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these. */
#if defined(__ARM_NEON__)

#include <arm_neon.h>

#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define WORD_DUPLICATE(VALUE) \
  vdupq_n_u8(VALUE)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower. */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
  || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)

#define WORD_TYPE uint64_t
#define WORD_SIZE 8

/* ARM stores 64-bit values in two 32-bit registers and does not
   have 64-bit multiply or bitwise-or instructions, so this union
   operation results in optimal code.

   Returns a 64-bit value whose eight bytes all equal the low 8 bits of
   'value'.  The parameter is declared 'int' explicitly (implicit int is
   not valid since C99) and the multiplication is done in uint32_t so
   that byte values >= 0x80 do not overflow a signed int. */
static inline uint64_t splat8(int value) {
  union { uint32_t ints[2]; uint64_t result; } quad;
  quad.ints[0] = (uint32_t)(unsigned char)(value) * 0x01010101u;
  quad.ints[1] = quad.ints[0];
  return quad.result;
}

#define WORD_DUPLICATE(VALUE) \
  splat8(VALUE)

/* On everything else, we use 32-bit loads and stores. */
#else

#define WORD_TYPE uint32_t
#define WORD_SIZE 4
/* Unsigned arithmetic avoids signed overflow for bytes >= 0x80; the
   outer parentheses keep the expansion a single operand. */
#define WORD_DUPLICATE(VALUE) \
  ((uint32_t)(unsigned char)(VALUE) * 0x01010101u)

#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value. */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4

/* Lvalue of type SHORTWORD_TYPE located OFFSET bytes past ADDRESS. */
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  (*((SHORTWORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))

/* 32-bit value whose four bytes all equal the low 8 bits of VALUE.
   Parenthesised so that unary or higher-precedence operators applied to
   the macro act on the whole product rather than on the cast alone. */
#define SHORTWORD_DUPLICATE(VALUE) \
  ((uint32_t)(unsigned char)(VALUE) * 0x01010101u)

void *memset(void *DST, int C, size_t LENGTH)

{

void* DST0 = DST;

unsigned char C_BYTE = C;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)

const char* DST_end = (char*)DST + LENGTH;

while ((char*)DST < DST_end) {

*((char*)DST) = C_BYTE;

DST++;

}

return DST0;

#else /* not PREFER_SIZE_OVER_SPEED */

/* Handle short strings and immediately return. */

if (__builtin_expect(LENGTH < SHORTWORD_SIZE, 1)) {

size_t i = 0;

while (i < LENGTH) {

100

((char*)DST)[i] = C_BYTE;

101

i++;

102

}

103

return DST;

104

}

105

106

const char* DST_end = (char*)DST + LENGTH;

107

108

/* Align DST to SHORTWORD_SIZE. */

109

while ((uintptr_t)DST % SHORTWORD_SIZE != 0) {

110

*(char*) (DST++) = C_BYTE;

111

}

112

113

#if WORD_SIZE > SHORTWORD_SIZE

114

SHORTWORD_TYPE C_SHORTWORD = SHORTWORD_DUPLICATE(C_BYTE);

115

116

/* Align DST to WORD_SIZE in steps of SHORTWORD_SIZE. */

117

if (__builtin_expect(DST_end - (char*)DST >= WORD_SIZE, 0)) {

118

while ((uintptr_t)DST % WORD_SIZE != 0) {

119

SHORTWORD_REF(DST, 0) = C_SHORTWORD;

120

DST += SHORTWORD_SIZE;

121

}

122

#endif /* WORD_SIZE > SHORTWORD_SIZE */

123

124

WORD_TYPE C_WORD = WORD_DUPLICATE(C_BYTE);

125

126

#if defined(__ARM_NEON__)

127

/* Testing on Cortex-A8 indicates that the following idiom

128

produces faster assembly code when doing vector copies,

129

but not when doing regular copies. */

130

size_t i = 0;

131

LENGTH = DST_end - (char*)DST;

132

while (i + WORD_SIZE * 16 <= LENGTH) {

133

WORD_REF(DST, i) = C_WORD;

134

WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;

135

WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;

136

WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;

137

WORD_REF(DST, i + WORD_SIZE * 4) = C_WORD;

138

WORD_REF(DST, i + WORD_SIZE * 5) = C_WORD;

139

WORD_REF(DST, i + WORD_SIZE * 6) = C_WORD;

140

WORD_REF(DST, i + WORD_SIZE * 7) = C_WORD;

141

WORD_REF(DST, i + WORD_SIZE * 8) = C_WORD;

142

WORD_REF(DST, i + WORD_SIZE * 9) = C_WORD;

143

WORD_REF(DST, i + WORD_SIZE * 10) = C_WORD;

144

WORD_REF(DST, i + WORD_SIZE * 11) = C_WORD;

145

WORD_REF(DST, i + WORD_SIZE * 12) = C_WORD;

146

WORD_REF(DST, i + WORD_SIZE * 13) = C_WORD;

147

WORD_REF(DST, i + WORD_SIZE * 14) = C_WORD;

148

WORD_REF(DST, i + WORD_SIZE * 15) = C_WORD;

149

i += WORD_SIZE * 16;

150

}

151

while (i + WORD_SIZE * 4 <= LENGTH) {

152

WORD_REF(DST, i) = C_WORD;

153

WORD_REF(DST, i + WORD_SIZE * 1) = C_WORD;

154

WORD_REF(DST, i + WORD_SIZE * 2) = C_WORD;

155

WORD_REF(DST, i + WORD_SIZE * 3) = C_WORD;

156

i += WORD_SIZE * 4;

157

}

158

while (i + WORD_SIZE <= LENGTH) {

159

WORD_REF(DST, i) = C_WORD;

160

i += WORD_SIZE;

161

}

162

DST += i;

163

#else /* not defined(__ARM_NEON__) */

164

/* Note: 16-times unrolling is about 50% faster than 4-times

165

unrolling on both ARM Cortex-A8 and Cortex-M3. */

166

while (DST_end - (char*) DST >= WORD_SIZE * 16) {

167

WORD_REF(DST, 0) = C_WORD;

168

WORD_REF(DST, WORD_SIZE * 1) = C_WORD;

169

WORD_REF(DST, WORD_SIZE * 2) = C_WORD;

170

WORD_REF(DST, WORD_SIZE * 3) = C_WORD;

171

WORD_REF(DST, WORD_SIZE * 4) = C_WORD;

172

WORD_REF(DST, WORD_SIZE * 5) = C_WORD;

173

WORD_REF(DST, WORD_SIZE * 6) = C_WORD;

174

WORD_REF(DST, WORD_SIZE * 7) = C_WORD;

175

WORD_REF(DST, WORD_SIZE * 8) = C_WORD;

176

WORD_REF(DST, WORD_SIZE * 9) = C_WORD;

177

WORD_REF(DST, WORD_SIZE * 10) = C_WORD;

178

WORD_REF(DST, WORD_SIZE * 11) = C_WORD;

179

WORD_REF(DST, WORD_SIZE * 12) = C_WORD;

180

WORD_REF(DST, WORD_SIZE * 13) = C_WORD;

181

WORD_REF(DST, WORD_SIZE * 14) = C_WORD;

182

WORD_REF(DST, WORD_SIZE * 15) = C_WORD;

183

DST += WORD_SIZE * 16;

184

}

185

while (WORD_SIZE * 4 <= DST_end - (char*) DST) {

186

WORD_REF(DST, 0) = C_WORD;

187

WORD_REF(DST, WORD_SIZE * 1) = C_WORD;

188

WORD_REF(DST, WORD_SIZE * 2) = C_WORD;

189

WORD_REF(DST, WORD_SIZE * 3) = C_WORD;

190

DST += WORD_SIZE * 4;

191

}

192

while (WORD_SIZE <= DST_end - (char*) DST) {

193

WORD_REF(DST, 0) = C_WORD;

194

DST += WORD_SIZE;

195

}

196

#endif /* not defined(__ARM_NEON__) */

197

198

#if WORD_SIZE > SHORTWORD_SIZE

199

} /* end if N >= WORD_SIZE */

200

201

while (SHORTWORD_SIZE <= DST_end - (char*)DST) {

202

SHORTWORD_REF(DST, 0) = C_SHORTWORD;

203

DST += SHORTWORD_SIZE;

204

}

205

#endif /* WORD_SIZE > SHORTWORD_SIZE */

206

207

while ((char*)DST < DST_end) {

208

*((char*)DST) = C_BYTE;

209

DST++;

210

}

211

212

return DST0;

213

#endif /* not PREFER_SIZE_OVER_SPEED */

214

}

Older »