~linaro-toolchain-dev/cortex-strings/trunk

« back to all changes in this revision

Viewing changes to src/reference/csl/memcpy.c

Committer: Michael Hope
Date: 2010-08-30 23:30:25 UTC
Revision ID: michael.hope@linaro.org-20100830233025-2f14wknqev6ryj01

Modified the imported versions to build locally. Added the CSL routines.

files added:
src/reference/csl

src/reference/csl/arm_asm.h

src/reference/csl/memcpy.c

src/reference/csl/memset.c

src/reference/plain/memset.c

src/reference/plain/strcmp.c

src/reference/plain/strcpy.c

files modified:
src/reference/Makefile.am

src/reference/bionic/memcmp.S

src/reference/bionic/memcmp16.S

src/reference/bionic/memcpy.S

src/reference/glibc/memmove.S

src/reference/glibc/memset.S

Show diffs side-by-side

added added

removed removed

src/reference/csl/memcpy.c

* Redistribution and use in source and binary forms, with or without

* modification, are permitted provided that the following conditions are met:

* * Redistributions of source code must retain the above copyright

* notice, this list of conditions and the following disclaimer.

* * Redistributions in binary form must reproduce the above copyright

* notice, this list of conditions and the following disclaimer in the

* documentation and/or other materials provided with the distribution.

* * Neither the name of CodeSourcery nor the

* names of its contributors may be used to endorse or promote products

* derived from this software without specific prior written permission.

* THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY

* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

* DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY

* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "arm_asm.h"

#include <string.h>

#include <stdint.h>

#include <stddef.h>

/* Standard operations for word-sized values.

   WORD_REF yields an lvalue of type WORD_TYPE located OFFSET bytes past
   ADDRESS; WORD_COPY copies one such word from IN to OUT at the same
   byte offset.  WORD_TYPE itself is selected further down, which is fine
   because macros are only expanded at their point of use.  */
#define WORD_REF(ADDRESS, OFFSET) \
  (*((WORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))
#define WORD_COPY(OUT, IN, OFFSET) \
  (WORD_REF(OUT, OFFSET) = WORD_REF(IN, OFFSET))

/* On processors with NEON, we use 128-bit vectors.  Also,
   we need to include arm_neon.h to use these.  */
#if defined(__ARM_NEON__)

#include <arm_neon.h>

#define WORD_TYPE uint8x16_t
#define WORD_SIZE 16
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On ARM processors with 64-bit ldrd instructions, we use those,
   except on Cortex-M* where benchmarking has shown them to
   be slower.  */
#elif defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
  || defined(__ARM_ARCH_5TEJ__) || defined(_ISA_ARM_6)

#define WORD_TYPE uint64_t
#define WORD_SIZE 8
#define MAYBE_PREFETCH(IN) __builtin_prefetch((IN), 0, 0)

/* On everything else, we use 32-bit loads and stores, and
   do not use prefetching.  */
#else

#define WORD_TYPE uint32_t
#define WORD_SIZE 4
#define MAYBE_PREFETCH(IN)

#endif

/* On all ARM platforms, 'SHORTWORD' is a 32-bit value.  */
#define SHORTWORD_TYPE uint32_t
#define SHORTWORD_SIZE 4
#define SHORTWORD_REF(ADDRESS, OFFSET) \
  (*((SHORTWORD_TYPE *)((char *)(ADDRESS) + (OFFSET))))
#define SHORTWORD_COPY(OUT, IN, OFFSET) \
  (SHORTWORD_REF(OUT, OFFSET) = SHORTWORD_REF(IN, OFFSET))

/* Combine two consecutive aligned shortwords IN0 (lower address) and IN1
   (higher address) into the shortword that starts OFFSET bytes into IN0.
   OFFSET must be in the range 1..SHORTWORD_SIZE-1; 0 would shift by the
   full width.  Shifting directionality depends on endianness.  The whole
   expansion is parenthesised so it can be used inside larger
   expressions.  */
#ifdef __ARMEB__
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) << ((OFFSET) * 8)) | ((IN1) >> (SHORTWORD_SIZE * 8 - (OFFSET) * 8)))
#else
#define SHORTWORD_SHIFT(IN0, IN1, OFFSET) \
  (((IN0) >> ((OFFSET) * 8)) | ((IN1) << (SHORTWORD_SIZE * 8 - (OFFSET) * 8)))
#endif

/* Copy N bytes from IN to OUT and return OUT.

   When PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ is defined this is a
   plain byte loop.  Otherwise:
     - copies shorter than SHORTWORD_SIZE are done bytewise;
     - the destination is byte-copied up to a SHORTWORD_SIZE boundary;
     - if the source is then shortword-aligned too, the bulk is moved with
       WORD_TYPE / SHORTWORD_TYPE loads and stores (unrolled);
     - if it is not, each destination shortword is assembled from two
       aligned source shortwords with SHORTWORD_SHIFT;
     - whatever remains is copied bytewise.

   The mis-aligned path loads whole aligned shortwords that contain at
   least one byte of the source, so it may read up to SHORTWORD_SIZE - 1
   bytes before the start and after the end of the source region.

   NOTE(review): the word accesses go through casted pointers; confirm the
   build flags make that type-punning acceptable for this file.  */
void *memcpy(void *OUT, const void *IN, size_t N)
{
  /* Typed cursors: arithmetic on 'void *' is a GNU extension, and the
     source never needs its const qualifier removed for byte reads.  */
  char *dst = OUT;
  const char *src = IN;

#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  const char *dst_end = dst + N;

  while (dst < dst_end) {
    *dst++ = *src++;
  }

  return OUT;
#else
  /* Handle short strings and immediately return.  */
  if (__builtin_expect(N < SHORTWORD_SIZE, 1)) {
    size_t i = 0;
    while (i < N) {
      dst[i] = src[i];
      i++;
    }
    return OUT;
  }

  const char *dst_end = dst + N;

  /* Align OUT to SHORTWORD_SIZE.  N >= SHORTWORD_SIZE here, so the at most
     SHORTWORD_SIZE - 1 bytes copied by this loop stay inside the region.  */
  while ((uintptr_t)dst % SHORTWORD_SIZE != 0) {
    *dst++ = *src++;
  }

  if ((uintptr_t)src % SHORTWORD_SIZE == 0) {

#if WORD_SIZE > SHORTWORD_SIZE
    /* Align OUT to WORD_SIZE in steps of SHORTWORD_SIZE.  */
    if (__builtin_expect(dst_end - dst >= WORD_SIZE, 0)) {
      while ((uintptr_t)dst % WORD_SIZE != 0) {
        SHORTWORD_COPY(dst, src, 0);
        dst += SHORTWORD_SIZE;
        src += SHORTWORD_SIZE;
      }

      if ((uintptr_t)src % WORD_SIZE == 0) {
#endif /* WORD_SIZE > SHORTWORD_SIZE */

#if defined(__ARM_NEON__)
        /* Testing on Cortex-A8 indicates that the following idiom
           produces faster assembly code when doing vector copies,
           but not when doing regular copies.  */
        size_t i = 0;
        N = dst_end - dst;
        MAYBE_PREFETCH(src + 64);
        MAYBE_PREFETCH(src + 128);
        MAYBE_PREFETCH(src + 192);
        if (N >= 640) {
          MAYBE_PREFETCH(src + 256);
          MAYBE_PREFETCH(src + 320);
          MAYBE_PREFETCH(src + 384);
          MAYBE_PREFETCH(src + 448);
          MAYBE_PREFETCH(src + 512);
          MAYBE_PREFETCH(src + 576);
          MAYBE_PREFETCH(src + 640);
          MAYBE_PREFETCH(src + 704);
          /* We phrase the loop condition in this way so that the
             i + WORD_SIZE * 16 value can be reused to increment i.  */
          while (i + WORD_SIZE * 16 <= N - 640) {
            MAYBE_PREFETCH(src + 768);
            MAYBE_PREFETCH(src + 832);
            MAYBE_PREFETCH(src + 896);
            MAYBE_PREFETCH(src + 960);
            WORD_COPY(dst, src, i);
            WORD_COPY(dst, src, i + WORD_SIZE * 1);
            WORD_COPY(dst, src, i + WORD_SIZE * 2);
            WORD_COPY(dst, src, i + WORD_SIZE * 3);
            WORD_COPY(dst, src, i + WORD_SIZE * 4);
            WORD_COPY(dst, src, i + WORD_SIZE * 5);
            WORD_COPY(dst, src, i + WORD_SIZE * 6);
            WORD_COPY(dst, src, i + WORD_SIZE * 7);
            WORD_COPY(dst, src, i + WORD_SIZE * 8);
            WORD_COPY(dst, src, i + WORD_SIZE * 9);
            WORD_COPY(dst, src, i + WORD_SIZE * 10);
            WORD_COPY(dst, src, i + WORD_SIZE * 11);
            WORD_COPY(dst, src, i + WORD_SIZE * 12);
            WORD_COPY(dst, src, i + WORD_SIZE * 13);
            WORD_COPY(dst, src, i + WORD_SIZE * 14);
            WORD_COPY(dst, src, i + WORD_SIZE * 15);
            i += WORD_SIZE * 16;
          }
        }
        while (i + WORD_SIZE * 16 <= N) {
          WORD_COPY(dst, src, i);
          WORD_COPY(dst, src, i + WORD_SIZE * 1);
          WORD_COPY(dst, src, i + WORD_SIZE * 2);
          WORD_COPY(dst, src, i + WORD_SIZE * 3);
          WORD_COPY(dst, src, i + WORD_SIZE * 4);
          WORD_COPY(dst, src, i + WORD_SIZE * 5);
          WORD_COPY(dst, src, i + WORD_SIZE * 6);
          WORD_COPY(dst, src, i + WORD_SIZE * 7);
          WORD_COPY(dst, src, i + WORD_SIZE * 8);
          WORD_COPY(dst, src, i + WORD_SIZE * 9);
          WORD_COPY(dst, src, i + WORD_SIZE * 10);
          WORD_COPY(dst, src, i + WORD_SIZE * 11);
          WORD_COPY(dst, src, i + WORD_SIZE * 12);
          WORD_COPY(dst, src, i + WORD_SIZE * 13);
          WORD_COPY(dst, src, i + WORD_SIZE * 14);
          WORD_COPY(dst, src, i + WORD_SIZE * 15);
          i += WORD_SIZE * 16;
        }
        while (i + WORD_SIZE * 4 <= N) {
          WORD_COPY(dst, src, i);
          WORD_COPY(dst, src, i + WORD_SIZE * 1);
          WORD_COPY(dst, src, i + WORD_SIZE * 2);
          WORD_COPY(dst, src, i + WORD_SIZE * 3);
          i += WORD_SIZE * 4;
        }
        while (i + WORD_SIZE <= N) {
          WORD_COPY(dst, src, i);
          i += WORD_SIZE;
        }
        dst += i;
        src += i;
#else /* not defined(__ARM_NEON__) */
        /* Note: 16-times unrolling is about 20% faster than 4-times
           unrolling on both ARM Cortex-A8 and Cortex-M3.  */
        MAYBE_PREFETCH(src + 64);
        MAYBE_PREFETCH(src + 128);
        MAYBE_PREFETCH(src + 192);
        while (dst_end - dst >= WORD_SIZE * 16) {
          MAYBE_PREFETCH(src + 256);
          MAYBE_PREFETCH(src + 320);
          WORD_COPY(dst, src, 0);
          WORD_COPY(dst, src, WORD_SIZE * 1);
          WORD_COPY(dst, src, WORD_SIZE * 2);
          WORD_COPY(dst, src, WORD_SIZE * 3);
          WORD_COPY(dst, src, WORD_SIZE * 4);
          WORD_COPY(dst, src, WORD_SIZE * 5);
          WORD_COPY(dst, src, WORD_SIZE * 6);
          WORD_COPY(dst, src, WORD_SIZE * 7);
          WORD_COPY(dst, src, WORD_SIZE * 8);
          WORD_COPY(dst, src, WORD_SIZE * 9);
          WORD_COPY(dst, src, WORD_SIZE * 10);
          WORD_COPY(dst, src, WORD_SIZE * 11);
          WORD_COPY(dst, src, WORD_SIZE * 12);
          WORD_COPY(dst, src, WORD_SIZE * 13);
          WORD_COPY(dst, src, WORD_SIZE * 14);
          WORD_COPY(dst, src, WORD_SIZE * 15);
          dst += WORD_SIZE * 16;
          src += WORD_SIZE * 16;
        }
        while (WORD_SIZE * 4 <= dst_end - dst) {
          WORD_COPY(dst, src, 0);
          WORD_COPY(dst, src, WORD_SIZE * 1);
          WORD_COPY(dst, src, WORD_SIZE * 2);
          WORD_COPY(dst, src, WORD_SIZE * 3);
          dst += WORD_SIZE * 4;
          src += WORD_SIZE * 4;
        }
        while (WORD_SIZE <= dst_end - dst) {
          WORD_COPY(dst, src, 0);
          dst += WORD_SIZE;
          src += WORD_SIZE;
        }
#endif /* not defined(__ARM_NEON__) */

#if WORD_SIZE > SHORTWORD_SIZE
      } else { /* if IN is not WORD_SIZE aligned */
        while (SHORTWORD_SIZE * 4 <= dst_end - dst) {
          SHORTWORD_COPY(dst, src, 0);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 1);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 2);
          SHORTWORD_COPY(dst, src, SHORTWORD_SIZE * 3);
          dst += SHORTWORD_SIZE * 4;
          src += SHORTWORD_SIZE * 4;
        }
      } /* end if IN is not WORD_SIZE aligned */
    } /* end if N >= WORD_SIZE */

    /* Mop up whole shortwords left over by the wider loops above.  */
    while (SHORTWORD_SIZE <= dst_end - dst) {
      SHORTWORD_COPY(dst, src, 0);
      dst += SHORTWORD_SIZE;
      src += SHORTWORD_SIZE;
    }
#endif /* WORD_SIZE > SHORTWORD_SIZE */

  } else { /* if IN is not SHORTWORD_SIZE aligned */
    /* misalign is 1..SHORTWORD_SIZE-1 in this branch, so the shift counts
       used by SHORTWORD_SHIFT are always smaller than the type width.  */
    ptrdiff_t misalign = (uintptr_t)src % SHORTWORD_SIZE;

    SHORTWORD_TYPE temp1, temp2;
    /* Aligned shortword that contains the first pending source byte.  */
    temp1 = SHORTWORD_REF(src, -misalign);

    /* Benchmarking indicates that unrolling this loop doesn't
       produce a measurable performance improvement on ARM.  */
    while (SHORTWORD_SIZE <= dst_end - dst) {
      src += SHORTWORD_SIZE;
      temp2 = SHORTWORD_REF(src, -misalign);
      SHORTWORD_REF(dst, 0) = SHORTWORD_SHIFT(temp1, temp2, misalign);
      temp1 = temp2;
      dst += SHORTWORD_SIZE;
    }

  } /* end if IN is not SHORTWORD_SIZE aligned */

  /* Copy the trailing bytes that do not fill a shortword.  */
  while (dst < dst_end) {
    *dst++ = *src++;
  }

  return OUT;
#endif
}

Older »