/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are used to get the best performance under various
 * conditions.  In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs.  This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or
 * in a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal.  Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands.  It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already.  This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster.  This lends
 * more credibility to the idea that gcc can generate very good code as long
 * as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
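/*
 * For illustration only (never compiled): a minimal plain-C sketch of the
 * shift-and-merge strategy described above, assuming 32-bit big-endian
 * words and a source pointer that is *not* word-aligned (so sh_1 is 8, 16
 * or 24).  The helper name sketch_shift_merge() is hypothetical and not
 * part of this file's API; the real routines below do the same work with
 * explicit asm so that faults can be fixed up via the exception table.
 */
#if 0
static void sketch_shift_merge(unsigned int *dst, const unsigned char *src,
			       unsigned long nwords)
{
	const unsigned int *ws = (const unsigned int *)((unsigned long)src & ~3UL);
	unsigned int sh_1 = 8 * ((unsigned long)src & 3);	/* 8, 16 or 24 */
	unsigned int sh_2 = 32 - sh_1;
	unsigned int w0 = *ws++;			/* first aligned word */

	while (nwords--) {
		unsigned int w1 = *ws++;
		/* big-endian merge of two adjacent source words */
		*dst++ = (w0 << sh_1) | (w1 >> sh_2);
		w0 = w1;
	}
}
#endif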
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif
DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)
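/*
 * Space-register convention used below: the source is always accessed
 * through %sr1 and the destination through %sr2 (see s_space/d_space).
 * get_kernel_space() yields space id 0 (the kernel), while
 * get_user_space() yields the current user space id from %sr3 unless the
 * fs segment says we are really addressing kernel memory.
 */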
#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	__asm__ __volatile__ (						\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
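/*
 * MERGE() is the PA-RISC "shift right pair" idiom: with %sar loaded with
 * sh_2, shrpw concatenates w0:w1 and extracts the 32-bit word spanning
 * them, i.e. roughly ((w0 << sh_1) | (w1 >> sh_2)) for this big-endian
 * copy.
 */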
#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__); printk(KERN_DEBUG fmt, ##args); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)		\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e)
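/*
 * Each accessor above expands to a single load or store carrying an
 * exception-table entry, so a fault inside the copy jumps to the local
 * handler label passed in (_e) instead of killing the copy outright.
 * The ",ma" (modify after) completer post-increments the address operand,
 * which is why the pointer is a "+r" operand in those variants.
 */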
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
/* Copy from a not-aligned src to an aligned dst, using shifts.  Handles 4
 * words per loop.  This code is derived from glibc.
 */
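/* The structure mirrors glibc's _wordcopy_fwd_dest_aligned(): a0-a3 hold
 * four consecutive source words and rotate through the unrolled loop, so
 * each store merges the previously loaded word with the one just loaded.
 */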
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	   aligned src to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -=-1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -=-2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;	/* No-op. */
	}
	do {
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	} while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);
	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len * 4 - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len * 4 - d->fault_addr + o_dst;
}
/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;
	/* prefetch_src((const void *)srcp); */

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;
	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}
	pds = (double *)pcs;
	pdd = (double *)pcd;

/* Copy 8 doubles at a time */
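	/* The four fldd loads are issued ahead of the four dependent fstd
	 * stores, which helps hide the load latency; a fault anywhere in
	 * the block is unwound through pmc_load_exc/pmc_store_exc. */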
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);

		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}
	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}
	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2 && len) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}
	/* Align the destination. */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
	}

	dst = (unsigned long)pcd;
	src = (unsigned long)pcs;
	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
		o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);
	goto byte_copy;

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);
	return 0;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}
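/*
 * The wrappers below only differ in which space ids they load into %sr1
 * (source space) and %sr2 (destination space) via mtsp() before handing
 * off to pa_memcpy(); the asm accessors above then pick the right space
 * register through s_space/d_space.
 */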
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}
EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}
unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}
void * memcpy(void * dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);