2
* Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3
* Copyright (C) 2008-2009 PetaLogix
4
* Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
6
* This file is subject to the terms and conditions of the GNU General
7
* Public License. See the file COPYING in the main directory of this
8
* archive for more details.
10
* Written by Jim Law <jlaw@irispower.com>
12
* intended to replace:
13
* memcpy in memcpy.c and
14
* memmove in memmove.c
15
* ... in arch/microblaze/lib
20
* Attempt at quicker memcpy and memmove for MicroBlaze
21
* Input : Operand1 in Reg r5 - destination address
22
* Operand2 in Reg r6 - source address
23
* Operand3 in Reg r7 - number of bytes to transfer
24
* Output: Result in Reg r3 - starting destination address
28
* Perform (possibly unaligned) copy of a block of memory
29
* between mem locations with size of xfer spec'd in bytes
32
#ifdef __MICROBLAZEEL__
33
#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
36
#include <linux/linkage.h>
39
.type memcpy, @function
43
fast_memcpy_ascending:
44
/* move d to return register as value of function */
47
addi r4, r0, 4 /* n = 4 */
48
cmpu r4, r4, r7 /* n = c - n (unsigned) */
49
blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
51
/* transfer first 0~3 bytes to get aligned dest address */
52
andi r4, r5, 3 /* n = d & 3 */
53
/* if zero, destination already aligned */
54
beqi r4, a_dalign_done
55
/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
57
rsub r7, r4, r7 /* c = c - n adjust c */
60
/* if no bytes left to transfer, transfer the bulk */
61
beqi r4, a_dalign_done
62
lbui r11, r6, 0 /* h = *s */
63
sbi r11, r5, 0 /* *d = h */
64
addi r6, r6, 1 /* s++ */
65
addi r5, r5, 1 /* d++ */
66
brid a_xfer_first_loop /* loop */
67
addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
70
addi r4, r0, 32 /* n = 32 */
71
cmpu r4, r4, r7 /* n = c - n (unsigned) */
72
/* if n < 0, less than one block to transfer */
76
andi r4, r7, 0xffffffe0 /* n = c & ~31 */
77
rsub r7, r4, r7 /* c = c - n */
79
andi r9, r6, 3 /* t1 = s & 3 */
80
/* if temp != 0, unaligned transfers needed */
81
bnei r9, a_block_unaligned
84
lwi r9, r6, 0 /* t1 = *(s + 0) */
85
lwi r10, r6, 4 /* t2 = *(s + 4) */
86
lwi r11, r6, 8 /* t3 = *(s + 8) */
87
lwi r12, r6, 12 /* t4 = *(s + 12) */
88
swi r9, r5, 0 /* *(d + 0) = t1 */
89
swi r10, r5, 4 /* *(d + 4) = t2 */
90
swi r11, r5, 8 /* *(d + 8) = t3 */
91
swi r12, r5, 12 /* *(d + 12) = t4 */
92
lwi r9, r6, 16 /* t1 = *(s + 16) */
93
lwi r10, r6, 20 /* t2 = *(s + 20) */
94
lwi r11, r6, 24 /* t3 = *(s + 24) */
95
lwi r12, r6, 28 /* t4 = *(s + 28) */
96
swi r9, r5, 16 /* *(d + 16) = t1 */
97
swi r10, r5, 20 /* *(d + 20) = t2 */
98
swi r11, r5, 24 /* *(d + 24) = t3 */
99
swi r12, r5, 28 /* *(d + 28) = t4 */
100
addi r6, r6, 32 /* s = s + 32 */
101
addi r4, r4, -32 /* n = n - 32 */
102
bneid r4, a_block_aligned /* while (n) loop */
103
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
107
andi r8, r6, 0xfffffffc /* as = s & ~3 */
108
add r6, r6, r4 /* s = s + n */
109
lwi r11, r8, 0 /* h = *(as + 0) */
112
beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
114
beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
117
bslli r11, r11, 24 /* h = h << 24 */
119
lwi r12, r8, 4 /* v = *(as + 4) */
120
bsrli r9, r12, 8 /* t1 = v >> 8 */
121
or r9, r11, r9 /* t1 = h | t1 */
122
swi r9, r5, 0 /* *(d + 0) = t1 */
123
bslli r11, r12, 24 /* h = v << 24 */
124
lwi r12, r8, 8 /* v = *(as + 8) */
125
bsrli r9, r12, 8 /* t1 = v >> 8 */
126
or r9, r11, r9 /* t1 = h | t1 */
127
swi r9, r5, 4 /* *(d + 4) = t1 */
128
bslli r11, r12, 24 /* h = v << 24 */
129
lwi r12, r8, 12 /* v = *(as + 12) */
130
bsrli r9, r12, 8 /* t1 = v >> 8 */
131
or r9, r11, r9 /* t1 = h | t1 */
132
swi r9, r5, 8 /* *(d + 8) = t1 */
133
bslli r11, r12, 24 /* h = v << 24 */
134
lwi r12, r8, 16 /* v = *(as + 16) */
135
bsrli r9, r12, 8 /* t1 = v >> 8 */
136
or r9, r11, r9 /* t1 = h | t1 */
137
swi r9, r5, 12 /* *(d + 12) = t1 */
138
bslli r11, r12, 24 /* h = v << 24 */
139
lwi r12, r8, 20 /* v = *(as + 20) */
140
bsrli r9, r12, 8 /* t1 = v >> 8 */
141
or r9, r11, r9 /* t1 = h | t1 */
142
swi r9, r5, 16 /* *(d + 16) = t1 */
143
bslli r11, r12, 24 /* h = v << 24 */
144
lwi r12, r8, 24 /* v = *(as + 24) */
145
bsrli r9, r12, 8 /* t1 = v >> 8 */
146
or r9, r11, r9 /* t1 = h | t1 */
147
swi r9, r5, 20 /* *(d + 20) = t1 */
148
bslli r11, r12, 24 /* h = v << 24 */
149
lwi r12, r8, 28 /* v = *(as + 28) */
150
bsrli r9, r12, 8 /* t1 = v >> 8 */
151
or r9, r11, r9 /* t1 = h | t1 */
152
swi r9, r5, 24 /* *(d + 24) = t1 */
153
bslli r11, r12, 24 /* h = v << 24 */
154
lwi r12, r8, 32 /* v = *(as + 32) */
155
bsrli r9, r12, 8 /* t1 = v >> 8 */
156
or r9, r11, r9 /* t1 = h | t1 */
157
swi r9, r5, 28 /* *(d + 28) = t1 */
158
bslli r11, r12, 24 /* h = v << 24 */
159
addi r8, r8, 32 /* as = as + 32 */
160
addi r4, r4, -32 /* n = n - 32 */
161
bneid r4, a_bu3_loop /* while (n) loop */
162
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
166
bslli r11, r11, 8 /* h = h << 8 */
168
lwi r12, r8, 4 /* v = *(as + 4) */
169
bsrli r9, r12, 24 /* t1 = v >> 24 */
170
or r9, r11, r9 /* t1 = h | t1 */
171
swi r9, r5, 0 /* *(d + 0) = t1 */
172
bslli r11, r12, 8 /* h = v << 8 */
173
lwi r12, r8, 8 /* v = *(as + 8) */
174
bsrli r9, r12, 24 /* t1 = v >> 24 */
175
or r9, r11, r9 /* t1 = h | t1 */
176
swi r9, r5, 4 /* *(d + 4) = t1 */
177
bslli r11, r12, 8 /* h = v << 8 */
178
lwi r12, r8, 12 /* v = *(as + 12) */
179
bsrli r9, r12, 24 /* t1 = v >> 24 */
180
or r9, r11, r9 /* t1 = h | t1 */
181
swi r9, r5, 8 /* *(d + 8) = t1 */
182
bslli r11, r12, 8 /* h = v << 8 */
183
lwi r12, r8, 16 /* v = *(as + 16) */
184
bsrli r9, r12, 24 /* t1 = v >> 24 */
185
or r9, r11, r9 /* t1 = h | t1 */
186
swi r9, r5, 12 /* *(d + 12) = t1 */
187
bslli r11, r12, 8 /* h = v << 8 */
188
lwi r12, r8, 20 /* v = *(as + 20) */
189
bsrli r9, r12, 24 /* t1 = v >> 24 */
190
or r9, r11, r9 /* t1 = h | t1 */
191
swi r9, r5, 16 /* *(d + 16) = t1 */
192
bslli r11, r12, 8 /* h = v << 8 */
193
lwi r12, r8, 24 /* v = *(as + 24) */
194
bsrli r9, r12, 24 /* t1 = v >> 24 */
195
or r9, r11, r9 /* t1 = h | t1 */
196
swi r9, r5, 20 /* *(d + 20) = t1 */
197
bslli r11, r12, 8 /* h = v << 8 */
198
lwi r12, r8, 28 /* v = *(as + 28) */
199
bsrli r9, r12, 24 /* t1 = v >> 24 */
200
or r9, r11, r9 /* t1 = h | t1 */
201
swi r9, r5, 24 /* *(d + 24) = t1 */
202
bslli r11, r12, 8 /* h = v << 8 */
203
lwi r12, r8, 32 /* v = *(as + 32) */
204
bsrli r9, r12, 24 /* t1 = v >> 24 */
205
or r9, r11, r9 /* t1 = h | t1 */
206
swi r9, r5, 28 /* *(d + 28) = t1 */
207
bslli r11, r12, 8 /* h = v << 8 */
208
addi r8, r8, 32 /* as = as + 32 */
209
addi r4, r4, -32 /* n = n - 32 */
210
bneid r4, a_bu1_loop /* while (n) loop */
211
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
215
bslli r11, r11, 16 /* h = h << 16 */
217
lwi r12, r8, 4 /* v = *(as + 4) */
218
bsrli r9, r12, 16 /* t1 = v >> 16 */
219
or r9, r11, r9 /* t1 = h | t1 */
220
swi r9, r5, 0 /* *(d + 0) = t1 */
221
bslli r11, r12, 16 /* h = v << 16 */
222
lwi r12, r8, 8 /* v = *(as + 8) */
223
bsrli r9, r12, 16 /* t1 = v >> 16 */
224
or r9, r11, r9 /* t1 = h | t1 */
225
swi r9, r5, 4 /* *(d + 4) = t1 */
226
bslli r11, r12, 16 /* h = v << 16 */
227
lwi r12, r8, 12 /* v = *(as + 12) */
228
bsrli r9, r12, 16 /* t1 = v >> 16 */
229
or r9, r11, r9 /* t1 = h | t1 */
230
swi r9, r5, 8 /* *(d + 8) = t1 */
231
bslli r11, r12, 16 /* h = v << 16 */
232
lwi r12, r8, 16 /* v = *(as + 16) */
233
bsrli r9, r12, 16 /* t1 = v >> 16 */
234
or r9, r11, r9 /* t1 = h | t1 */
235
swi r9, r5, 12 /* *(d + 12) = t1 */
236
bslli r11, r12, 16 /* h = v << 16 */
237
lwi r12, r8, 20 /* v = *(as + 20) */
238
bsrli r9, r12, 16 /* t1 = v >> 16 */
239
or r9, r11, r9 /* t1 = h | t1 */
240
swi r9, r5, 16 /* *(d + 16) = t1 */
241
bslli r11, r12, 16 /* h = v << 16 */
242
lwi r12, r8, 24 /* v = *(as + 24) */
243
bsrli r9, r12, 16 /* t1 = v >> 16 */
244
or r9, r11, r9 /* t1 = h | t1 */
245
swi r9, r5, 20 /* *(d + 20) = t1 */
246
bslli r11, r12, 16 /* h = v << 16 */
247
lwi r12, r8, 28 /* v = *(as + 28) */
248
bsrli r9, r12, 16 /* t1 = v >> 16 */
249
or r9, r11, r9 /* t1 = h | t1 */
250
swi r9, r5, 24 /* *(d + 24) = t1 */
251
bslli r11, r12, 16 /* h = v << 16 */
252
lwi r12, r8, 32 /* v = *(as + 32) */
253
bsrli r9, r12, 16 /* t1 = v >> 16 */
254
or r9, r11, r9 /* t1 = h | t1 */
255
swi r9, r5, 28 /* *(d + 28) = t1 */
256
bslli r11, r12, 16 /* h = v << 16 */
257
addi r8, r8, 32 /* as = as + 32 */
258
addi r4, r4, -32 /* n = n - 32 */
259
bneid r4, a_bu2_loop /* while (n) loop */
260
addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
263
addi r4, r0, 4 /* n = 4 */
264
cmpu r4, r4, r7 /* n = c - n (unsigned) */
265
blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
268
andi r4, r7, 0xfffffffc /* n = c & ~3 */
269
addi r10, r0, 0 /* offset = 0 */
271
andi r9, r6, 3 /* t1 = s & 3 */
272
/* if temp != 0, unaligned transfers needed */
273
bnei r9, a_word_unaligned
276
lw r9, r6, r10 /* t1 = *(s+offset) */
277
sw r9, r5, r10 /* *(d+offset) = t1 */
278
addi r4, r4,-4 /* n-- */
279
bneid r4, a_word_aligned /* loop */
280
addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
285
andi r8, r6, 0xfffffffc /* as = s & ~3 */
286
lwi r11, r8, 0 /* h = *(as + 0) */
287
addi r8, r8, 4 /* as = as + 4 */
290
beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
292
beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
295
bslli r11, r11, 24 /* h = h << 24 */
297
lw r12, r8, r10 /* v = *(as + offset) */
298
bsrli r9, r12, 8 /* t1 = v >> 8 */
299
or r9, r11, r9 /* t1 = h | t1 */
300
sw r9, r5, r10 /* *(d + offset) = t1 */
301
bslli r11, r12, 24 /* h = v << 24 */
302
addi r4, r4,-4 /* n = n - 4 */
303
bneid r4, a_wu3_loop /* while (n) loop */
304
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
309
bslli r11, r11, 8 /* h = h << 8 */
311
lw r12, r8, r10 /* v = *(as + offset) */
312
bsrli r9, r12, 24 /* t1 = v >> 24 */
313
or r9, r11, r9 /* t1 = h | t1 */
314
sw r9, r5, r10 /* *(d + offset) = t1 */
315
bslli r11, r12, 8 /* h = v << 8 */
316
addi r4, r4,-4 /* n = n - 4 */
317
bneid r4, a_wu1_loop /* while (n) loop */
318
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
323
bslli r11, r11, 16 /* h = h << 16 */
325
lw r12, r8, r10 /* v = *(as + offset) */
326
bsrli r9, r12, 16 /* t1 = v >> 16 */
327
or r9, r11, r9 /* t1 = h | t1 */
328
sw r9, r5, r10 /* *(d + offset) = t1 */
329
bslli r11, r12, 16 /* h = v << 16 */
330
addi r4, r4,-4 /* n = n - 4 */
331
bneid r4, a_wu2_loop /* while (n) loop */
332
addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
335
add r5, r5, r10 /* d = d + offset */
336
add r6, r6, r10 /* s = s + offset */
337
rsub r7, r10, r7 /* c = c - offset */
341
beqi r7, a_done /* while (c) */
342
lbui r9, r6, 0 /* t1 = *s */
343
addi r6, r6, 1 /* s++ */
344
sbi r9, r5, 0 /* *d = t1 */
345
addi r7, r7, -1 /* c-- */
346
brid a_xfer_end_loop /* loop */
347
addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
353
.size memcpy, . - memcpy
355
/*----------------------------------------------------------------------------*/
357
.type memmove, @function
361
cmpu r4, r5, r6 /* n = s - d */
362
bgei r4,fast_memcpy_ascending
364
fast_memcpy_descending:
365
/* move d to return register as value of function */
368
add r5, r5, r7 /* d = d + c */
369
add r6, r6, r7 /* s = s + c */
371
addi r4, r0, 4 /* n = 4 */
372
cmpu r4, r4, r7 /* n = c - n (unsigned) */
373
blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
375
/* transfer first 0~3 bytes to get aligned dest address */
376
andi r4, r5, 3 /* n = d & 3 */
377
/* if zero, destination already aligned */
378
beqi r4,d_dalign_done
379
rsub r7, r4, r7 /* c = c - n adjust c */
382
/* if no bytes left to transfer, transfer the bulk */
383
beqi r4,d_dalign_done
384
addi r6, r6, -1 /* s-- */
385
addi r5, r5, -1 /* d-- */
386
lbui r11, r6, 0 /* h = *s */
387
sbi r11, r5, 0 /* *d = h */
388
brid d_xfer_first_loop /* loop */
389
addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
392
addi r4, r0, 32 /* n = 32 */
393
cmpu r4, r4, r7 /* n = c - n (unsigned) */
394
/* if n < 0, less than one block to transfer */
395
blti r4, d_block_done
398
andi r4, r7, 0xffffffe0 /* n = c & ~31 */
399
rsub r7, r4, r7 /* c = c - n */
401
andi r9, r6, 3 /* t1 = s & 3 */
402
/* if temp != 0, unaligned transfers needed */
403
bnei r9, d_block_unaligned
406
addi r6, r6, -32 /* s = s - 32 */
407
addi r5, r5, -32 /* d = d - 32 */
408
lwi r9, r6, 28 /* t1 = *(s + 28) */
409
lwi r10, r6, 24 /* t2 = *(s + 24) */
410
lwi r11, r6, 20 /* t3 = *(s + 20) */
411
lwi r12, r6, 16 /* t4 = *(s + 16) */
412
swi r9, r5, 28 /* *(d + 28) = t1 */
413
swi r10, r5, 24 /* *(d + 24) = t2 */
414
swi r11, r5, 20 /* *(d + 20) = t3 */
415
swi r12, r5, 16 /* *(d + 16) = t4 */
416
lwi r9, r6, 12 /* t1 = *(s + 12) */
417
lwi r10, r6, 8 /* t2 = *(s + 8) */
418
lwi r11, r6, 4 /* t3 = *(s + 4) */
419
lwi r12, r6, 0 /* t4 = *(s + 0) */
420
swi r9, r5, 12 /* *(d + 12) = t1 */
421
swi r10, r5, 8 /* *(d + 8) = t2 */
422
swi r11, r5, 4 /* *(d + 4) = t3 */
423
addi r4, r4, -32 /* n = n - 32 */
424
bneid r4, d_block_aligned /* while (n) loop */
425
swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
429
andi r8, r6, 0xfffffffc /* as = s & ~3 */
430
rsub r6, r4, r6 /* s = s - n */
431
lwi r11, r8, 0 /* h = *(as + 0) */
434
beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
436
beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
439
bsrli r11, r11, 8 /* h = h >> 8 */
441
addi r8, r8, -32 /* as = as - 32 */
442
addi r5, r5, -32 /* d = d - 32 */
443
lwi r12, r8, 28 /* v = *(as + 28) */
444
bslli r9, r12, 24 /* t1 = v << 24 */
445
or r9, r11, r9 /* t1 = h | t1 */
446
swi r9, r5, 28 /* *(d + 28) = t1 */
447
bsrli r11, r12, 8 /* h = v >> 8 */
448
lwi r12, r8, 24 /* v = *(as + 24) */
449
bslli r9, r12, 24 /* t1 = v << 24 */
450
or r9, r11, r9 /* t1 = h | t1 */
451
swi r9, r5, 24 /* *(d + 24) = t1 */
452
bsrli r11, r12, 8 /* h = v >> 8 */
453
lwi r12, r8, 20 /* v = *(as + 20) */
454
bslli r9, r12, 24 /* t1 = v << 24 */
455
or r9, r11, r9 /* t1 = h | t1 */
456
swi r9, r5, 20 /* *(d + 20) = t1 */
457
bsrli r11, r12, 8 /* h = v >> 8 */
458
lwi r12, r8, 16 /* v = *(as + 16) */
459
bslli r9, r12, 24 /* t1 = v << 24 */
460
or r9, r11, r9 /* t1 = h | t1 */
461
swi r9, r5, 16 /* *(d + 16) = t1 */
462
bsrli r11, r12, 8 /* h = v >> 8 */
463
lwi r12, r8, 12 /* v = *(as + 12) */
464
bslli r9, r12, 24 /* t1 = v << 24 */
465
or r9, r11, r9 /* t1 = h | t1 */
466
swi r9, r5, 12 /* *(d + 112) = t1 */
467
bsrli r11, r12, 8 /* h = v >> 8 */
468
lwi r12, r8, 8 /* v = *(as + 8) */
469
bslli r9, r12, 24 /* t1 = v << 24 */
470
or r9, r11, r9 /* t1 = h | t1 */
471
swi r9, r5, 8 /* *(d + 8) = t1 */
472
bsrli r11, r12, 8 /* h = v >> 8 */
473
lwi r12, r8, 4 /* v = *(as + 4) */
474
bslli r9, r12, 24 /* t1 = v << 24 */
475
or r9, r11, r9 /* t1 = h | t1 */
476
swi r9, r5, 4 /* *(d + 4) = t1 */
477
bsrli r11, r12, 8 /* h = v >> 8 */
478
lwi r12, r8, 0 /* v = *(as + 0) */
479
bslli r9, r12, 24 /* t1 = v << 24 */
480
or r9, r11, r9 /* t1 = h | t1 */
481
swi r9, r5, 0 /* *(d + 0) = t1 */
482
addi r4, r4, -32 /* n = n - 32 */
483
bneid r4, d_bu3_loop /* while (n) loop */
484
bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
488
bsrli r11, r11, 24 /* h = h >> 24 */
490
addi r8, r8, -32 /* as = as - 32 */
491
addi r5, r5, -32 /* d = d - 32 */
492
lwi r12, r8, 28 /* v = *(as + 28) */
493
bslli r9, r12, 8 /* t1 = v << 8 */
494
or r9, r11, r9 /* t1 = h | t1 */
495
swi r9, r5, 28 /* *(d + 28) = t1 */
496
bsrli r11, r12, 24 /* h = v >> 24 */
497
lwi r12, r8, 24 /* v = *(as + 24) */
498
bslli r9, r12, 8 /* t1 = v << 8 */
499
or r9, r11, r9 /* t1 = h | t1 */
500
swi r9, r5, 24 /* *(d + 24) = t1 */
501
bsrli r11, r12, 24 /* h = v >> 24 */
502
lwi r12, r8, 20 /* v = *(as + 20) */
503
bslli r9, r12, 8 /* t1 = v << 8 */
504
or r9, r11, r9 /* t1 = h | t1 */
505
swi r9, r5, 20 /* *(d + 20) = t1 */
506
bsrli r11, r12, 24 /* h = v >> 24 */
507
lwi r12, r8, 16 /* v = *(as + 16) */
508
bslli r9, r12, 8 /* t1 = v << 8 */
509
or r9, r11, r9 /* t1 = h | t1 */
510
swi r9, r5, 16 /* *(d + 16) = t1 */
511
bsrli r11, r12, 24 /* h = v >> 24 */
512
lwi r12, r8, 12 /* v = *(as + 12) */
513
bslli r9, r12, 8 /* t1 = v << 8 */
514
or r9, r11, r9 /* t1 = h | t1 */
515
swi r9, r5, 12 /* *(d + 112) = t1 */
516
bsrli r11, r12, 24 /* h = v >> 24 */
517
lwi r12, r8, 8 /* v = *(as + 8) */
518
bslli r9, r12, 8 /* t1 = v << 8 */
519
or r9, r11, r9 /* t1 = h | t1 */
520
swi r9, r5, 8 /* *(d + 8) = t1 */
521
bsrli r11, r12, 24 /* h = v >> 24 */
522
lwi r12, r8, 4 /* v = *(as + 4) */
523
bslli r9, r12, 8 /* t1 = v << 8 */
524
or r9, r11, r9 /* t1 = h | t1 */
525
swi r9, r5, 4 /* *(d + 4) = t1 */
526
bsrli r11, r12, 24 /* h = v >> 24 */
527
lwi r12, r8, 0 /* v = *(as + 0) */
528
bslli r9, r12, 8 /* t1 = v << 8 */
529
or r9, r11, r9 /* t1 = h | t1 */
530
swi r9, r5, 0 /* *(d + 0) = t1 */
531
addi r4, r4, -32 /* n = n - 32 */
532
bneid r4, d_bu1_loop /* while (n) loop */
533
bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
537
bsrli r11, r11, 16 /* h = h >> 16 */
539
addi r8, r8, -32 /* as = as - 32 */
540
addi r5, r5, -32 /* d = d - 32 */
541
lwi r12, r8, 28 /* v = *(as + 28) */
542
bslli r9, r12, 16 /* t1 = v << 16 */
543
or r9, r11, r9 /* t1 = h | t1 */
544
swi r9, r5, 28 /* *(d + 28) = t1 */
545
bsrli r11, r12, 16 /* h = v >> 16 */
546
lwi r12, r8, 24 /* v = *(as + 24) */
547
bslli r9, r12, 16 /* t1 = v << 16 */
548
or r9, r11, r9 /* t1 = h | t1 */
549
swi r9, r5, 24 /* *(d + 24) = t1 */
550
bsrli r11, r12, 16 /* h = v >> 16 */
551
lwi r12, r8, 20 /* v = *(as + 20) */
552
bslli r9, r12, 16 /* t1 = v << 16 */
553
or r9, r11, r9 /* t1 = h | t1 */
554
swi r9, r5, 20 /* *(d + 20) = t1 */
555
bsrli r11, r12, 16 /* h = v >> 16 */
556
lwi r12, r8, 16 /* v = *(as + 16) */
557
bslli r9, r12, 16 /* t1 = v << 16 */
558
or r9, r11, r9 /* t1 = h | t1 */
559
swi r9, r5, 16 /* *(d + 16) = t1 */
560
bsrli r11, r12, 16 /* h = v >> 16 */
561
lwi r12, r8, 12 /* v = *(as + 12) */
562
bslli r9, r12, 16 /* t1 = v << 16 */
563
or r9, r11, r9 /* t1 = h | t1 */
564
swi r9, r5, 12 /* *(d + 112) = t1 */
565
bsrli r11, r12, 16 /* h = v >> 16 */
566
lwi r12, r8, 8 /* v = *(as + 8) */
567
bslli r9, r12, 16 /* t1 = v << 16 */
568
or r9, r11, r9 /* t1 = h | t1 */
569
swi r9, r5, 8 /* *(d + 8) = t1 */
570
bsrli r11, r12, 16 /* h = v >> 16 */
571
lwi r12, r8, 4 /* v = *(as + 4) */
572
bslli r9, r12, 16 /* t1 = v << 16 */
573
or r9, r11, r9 /* t1 = h | t1 */
574
swi r9, r5, 4 /* *(d + 4) = t1 */
575
bsrli r11, r12, 16 /* h = v >> 16 */
576
lwi r12, r8, 0 /* v = *(as + 0) */
577
bslli r9, r12, 16 /* t1 = v << 16 */
578
or r9, r11, r9 /* t1 = h | t1 */
579
swi r9, r5, 0 /* *(d + 0) = t1 */
580
addi r4, r4, -32 /* n = n - 32 */
581
bneid r4, d_bu2_loop /* while (n) loop */
582
bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
585
addi r4, r0, 4 /* n = 4 */
586
cmpu r4, r4, r7 /* n = c - n (unsigned) */
587
blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
590
andi r4, r7, 0xfffffffc /* n = c & ~3 */
591
rsub r5, r4, r5 /* d = d - n */
592
rsub r6, r4, r6 /* s = s - n */
593
rsub r7, r4, r7 /* c = c - n */
595
andi r9, r6, 3 /* t1 = s & 3 */
596
/* if temp != 0, unaligned transfers needed */
597
bnei r9, d_word_unaligned
600
addi r4, r4,-4 /* n-- */
601
lw r9, r6, r4 /* t1 = *(s+n) */
602
bneid r4, d_word_aligned /* loop */
603
sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
608
andi r8, r6, 0xfffffffc /* as = s & ~3 */
609
lw r11, r8, r4 /* h = *(as + n) */
612
beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
614
beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
617
bsrli r11, r11, 8 /* h = h >> 8 */
619
addi r4, r4,-4 /* n = n - 4 */
620
lw r12, r8, r4 /* v = *(as + n) */
621
bslli r9, r12, 24 /* t1 = v << 24 */
622
or r9, r11, r9 /* t1 = h | t1 */
623
sw r9, r5, r4 /* *(d + n) = t1 */
624
bneid r4, d_wu3_loop /* while (n) loop */
625
bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
630
bsrli r11, r11, 24 /* h = h >> 24 */
632
addi r4, r4,-4 /* n = n - 4 */
633
lw r12, r8, r4 /* v = *(as + n) */
634
bslli r9, r12, 8 /* t1 = v << 8 */
635
or r9, r11, r9 /* t1 = h | t1 */
636
sw r9, r5, r4 /* *(d + n) = t1 */
637
bneid r4, d_wu1_loop /* while (n) loop */
638
bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
643
bsrli r11, r11, 16 /* h = h >> 16 */
645
addi r4, r4,-4 /* n = n - 4 */
646
lw r12, r8, r4 /* v = *(as + n) */
647
bslli r9, r12, 16 /* t1 = v << 16 */
648
or r9, r11, r9 /* t1 = h | t1 */
649
sw r9, r5, r4 /* *(d + n) = t1 */
650
bneid r4, d_wu2_loop /* while (n) loop */
651
bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
657
beqi r7, a_done /* while (c) */
658
addi r6, r6, -1 /* s-- */
659
lbui r9, r6, 0 /* t1 = *s */
660
addi r5, r5, -1 /* d-- */
661
sbi r9, r5, 0 /* *d = t1 */
662
brid d_xfer_end_loop /* loop */
663
addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
669
.size memmove, . - memmove