~ubuntu-branches/ubuntu/precise/xf86-video-msm-lts-quantal/precise-proposed

Viewing changes to src/neon_memmove.S

  • Committer: Package Import Robot
  • Author(s): Maarten Lankhorst
  • Date: 2012-11-30 20:59:12 UTC
  • Revision ID: package-import@ubuntu.com-20121130205912-cgks3325tnmsq53c
Tags: upstream-1.0.1+git20100122.5f7df591
Import upstream version 1.0.1+git20100122.5f7df591

/***************************************************************************
 Copyright (c) 2009, Code Aurora Forum. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Code Aurora nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
  ***************************************************************************/

/***************************************************************************
 *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
 *     Inputs:
 *        dest: The destination buffer
 *        src: The source buffer
 *        n: The size of the buffer to transfer
 *     Outputs:
 *
 ***************************************************************************/
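/*
 * Note: under the AAPCS calling convention the arguments above arrive as
 * r0 = dest, r1 = src, r2 = n, so the routine behaves roughly like the
 * C prototype (sketch):
 *
 *     void *neon_memmove(void *dest, const void *src, size_t n);
 *
 * r0 is saved on entry and restored before the final "bx lr", so the
 * destination pointer is also the return value, as memmove requires.
 */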
 
/*
 * General note:
 * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP.
 * However, it looks like the 2006 CodeSourcery Assembler has issues generating
 * the correct object code for VPOP, resulting in horrific stack crashes.
 * As a result, I've temporarily moved PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
 * and VPOP->VLDMIA.  We can revert this back once we update our toolchain.
 *
 * Also, VSHL swaps the source register and the shift-amount register
 * around in 2006-q3.  I've coded this incorrectly so it turns out correct
 * in the object code, but we'll need to undo that later...
 */
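/*
 * For reference, the substitutions above have the same stack behaviour:
 *   push {rlist}  == stmdb sp!, {rlist}    pop  {rlist} == ldmia sp!, {rlist}
 *   vpush {qlist} == vstmdb sp!, {qlist}   vpop {qlist} == vldmia sp!, {qlist}
 * The VSHL workaround shows up in the __GNUC__ 4.1.1 blocks below: the
 * intended operation is "vshl.u64 Qd, Qvalue, Qshift", but that build is
 * written as "vshl.u64 Qd, Qshift, Qvalue" so the broken assembler still
 * emits the right encoding.
 */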
 
        .code 32
        .align 4
        .globl neon_memmove
        .func

neon_memmove:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        stmdb           sp!, {r0}
#else
        push            {r0}
#endif

        /*
         * The requirements for memmove state that the function should
         * operate as if data were being copied from the source to a
         * buffer, then to the destination.  This is to allow a user
         * to copy data from a source and target that overlap.
         *
         * We can't just do byte copies front-to-back automatically, since
         * there's a good chance we may have an overlap (why else would someone
         * intentionally use memmove then?).
         *
         * We'll break this into two parts.  Front-to-back, or back-to-front
         * copies.
         */
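        /*
         * In C terms the dispatch below is simply (sketch):
         *
         *     if (dest < src)       copy front-to-back;
         *     else if (dest > src)  copy back-to-front;
         *     else                  nothing to do;
         *
         * Choosing the direction this way ensures each source byte is read
         * before the copy can overwrite it, however the buffers overlap.
         */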
 
neon_memmove_cmf:
        cmp             r0, r1
        blt             neon_front_to_back_copy
        bgt             neon_back_to_front_copy
        b               neon_memmove_done

        /* #############################################################
         * Front to Back copy
         */
neon_front_to_back_copy:
        /*
         * For small copies, just do a quick memcpy.  We can do this for
         * front-to-back copies, aligned or unaligned, since we're only
         * doing 1 byte at a time...
         */
        cmp             r2, #4
        bgt             neon_f2b_gt4
        cmp             r2, #0
neon_f2b_smallcopy_loop:
        beq             neon_memmove_done
        ldrb            r12, [r1], #1
        subs            r2, r2, #1
        strb            r12, [r0], #1
        b               neon_f2b_smallcopy_loop
neon_f2b_gt4:
        /* Preload what we can...*/
        pld             [r0,#0]
        pld             [r1,#0]
        /* The window size is in r3. */
        sub             r3, r1, r0
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        stmdb           sp!, {r4-r6}
#else
        push            {r4-r6}
#endif

neon_f2b_check_align:
        /* Check alignment. */
        ands            r12, r0, #0x3
        beq             neon_f2b_source_align_check
        cmp             r12, #2
        ldrb            r4, [r1], #1
        ldrleb          r5, [r1], #1
        ldrltb          r6, [r1], #1
        rsb             r12, r12, #4
        sub             r2, r2, r12
        strb            r4, [r0], #1
        strleb          r5, [r0], #1
        strltb          r6, [r0], #1
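        /*
         * The block above word-aligns the destination without a loop: it
         * copies head = 4 - (dest & 3) bytes, using the flags from
         * "cmp r12, #2" to predicate the optional second and third byte
         * (LE: at least 2 bytes, LT: 3 bytes).  Roughly, in C (sketch):
         *
         *     size_t head = 4 - ((uintptr_t)dest & 3);   // 1..3
         *     n -= head;
         *     while (head--) *d++ = *s++;
         */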
 
neon_f2b_source_align_check:
        ands            r12, r1, #0x3
        bne             neon_f2b_nonaligned

neon_f2b_try_16_align:
        /* If we're >64, attempt to align on 16-bytes.  Smaller amounts
         * don't seem to be worth handling. */
        cmp             r2, #64
        blt             neon_f2b_align_route
        /* This is where we try 16-byte alignment. */
        ands            r12, r0, #0xf
        beq             neon_f2b_align_route
        rsb             r12, r12, #16
neon_f2b_16_start:
        sub             r2, r2, r12
        lsrs            r5, r12, #2
neon_f2b_align_16_4:
        ldr             r4, [r1], #4
        subs            r5, r5, #1
        str             r4, [r0], #4
        bne             neon_f2b_align_16_4
neon_f2b_align_route:
        /* #############################################################
         * Front to Back copy - aligned
         */
        /*
         * Note that we can't just route based on the size in r2.  If that's
         * larger than the overlap window in r3, we could potentially
         * (and likely!) destroy data we're copying.
         */
        cmp             r2, r3
        movle           r12, r2
        movgt           r12, r3
        cmp             r12, #256
        bge             neon_f2b_copy_128_a
        cmp             r12, #64
        bge             neon_f2b_copy_32_a
        cmp             r12, #16
        bge             neon_f2b_copy_16_a
        cmp             r12, #8
        bge             neon_f2b_copy_8_a
        cmp             r12, #4
        bge             neon_f2b_copy_4_a
        b               neon_f2b_copy_1_a
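        /*
         * Routing sketch in C: r12 = min(n, window), and the ladder above
         * picks the largest block size that fits:
         *
         *     step = (n <= window) ? n : window;
         *     if      (step >= 256) copy in 128-byte NEON blocks;
         *     else if (step >=  64) copy in 32-byte NEON blocks;
         *     else if (step >=  16) copy in 16-byte NEON blocks;
         *     else if (step >=   8) copy 8 bytes at a time;
         *     else if (step >=   4) copy 4 bytes at a time;
         *     else                  copy byte by byte;
         */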
 
neon_f2b_copy_128_a:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4-q7}
#else
        vpush           {q4-q7}
#endif
        mov             r12, r2, lsr #7
neon_f2b_copy_128_a_loop:
        vld1.32         {q0,q1}, [r1]!
        vld1.32         {q2,q3}, [r1]!
        vld1.32         {q4,q5}, [r1]!
        vld1.32         {q6,q7}, [r1]!
        pld             [r1, #0]
        pld             [r1, #128]
        vst1.32         {q0,q1}, [r0]!
        vst1.32         {q2,q3}, [r0]!
        vst1.32         {q4,q5}, [r0]!
        vst1.32         {q6,q7}, [r0]!
        subs            r12, r12, #1
        pld             [r0, #0]
        pld             [r0, #128]
        bne             neon_f2b_copy_128_a_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q4-q7}
#else
        vpop            {q4-q7}
#endif
        ands            r2, r2, #0x7f
        beq             neon_f2b_finish
        cmp             r2, #32
        bge             neon_f2b_copy_32_a
        b               neon_f2b_copy_finish_a
neon_f2b_copy_32_a:
        mov             r12, r2, lsr #5
neon_f2b_copy_32_a_loop:
        vld1.32         {q0,q1}, [r1]!
        subs            r12, r12, #1
        pld             [r1, #0]
        vst1.32         {q0,q1}, [r0]!
        bne             neon_f2b_copy_32_a_loop
        ands            r2, r2, #0x1f
        beq             neon_f2b_finish
neon_f2b_copy_finish_a:
neon_f2b_copy_16_a:
        movs            r12, r2, lsr #4
        beq             neon_f2b_copy_8_a
neon_f2b_copy_16_a_loop:
        vld1.32         {q0}, [r1]!
        subs            r12, r12, #1
        vst1.32         {q0}, [r0]!
        bne             neon_f2b_copy_16_a_loop
        ands            r2, r2, #0xf
        beq             neon_f2b_finish
neon_f2b_copy_8_a:
        cmp             r2, #8
        blt             neon_f2b_copy_4_a
        ldm             r1!, {r4-r5}
        subs            r2, r2, #8
        stm             r0!, {r4-r5}
neon_f2b_copy_4_a:
        cmp             r2, #4
        blt             neon_f2b_copy_1_a
        ldr             r4, [r1], #4
        subs            r2, r2, #4
        str             r4, [r0], #4
neon_f2b_copy_1_a:
        cmp             r2, #0
        beq             neon_f2b_finish
neon_f2b_copy_1_a_loop:
        ldrb            r12, [r1], #1
        subs            r2, r2, #1
        strb            r12, [r0], #1
        bne             neon_f2b_copy_1_a_loop
        b               neon_f2b_finish

        /* #############################################################
         * Front to Back copy - unaligned
         */
neon_f2b_nonaligned:
        /*
         * For sizes < 8, does it really make sense to do the whole shift
         * party?  Note that we DON'T want to call neon_f2b_copy_1_u,
         * since we'll end up trying to pop r8-r11, and we DON'T want
         * to do that...
         */
        cmp             r2, #8
        ble             neon_f2b_copy_1_a

#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        stmdb           sp!, {r7-r9}
#else
        push            {r7-r9}
#endif
        cmp             r12, #2
        ldrb            r4, [r1], #1
        ldrleb          r5, [r1], #1
        ldrltb          r6, [r1], #1
        rsb             r8, r12, #4
        sub             r2, r2, r8
        lsl             r8, r8, #3
        orrle           r4, r4, r5, lsl #8
        orrlt           r4, r4, r6, lsl #16
        rsb             r9, r8, #32
        /*
         * r4  = overflow bits
         * r8 = # of bits we copied into the r4 register to align source.
         * r9 = 32 - r8
         * r12 = Index counter for each size, so we determine how many times
         *       the given size will go into r2, then count down that # of
         *       times in r12.
         */
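        /*
         * The word loops that follow implement the classic little-endian
         * shift-and-merge copy.  Per 32-bit word, in C (sketch):
         *
         *     uint32_t w = *src_word++;
         *     *dst_word++ = carry | (w << r8);   // r8 = 8 * bytes held over
         *     carry       = w >> r9;             // r9 = 32 - r8
         */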
 
        cmp             r2, #64
        blt             neon_f2b_unaligned_route
        ands            r12, r0, #0xf
        beq             neon_f2b_unaligned_route
        cmp             r3, #4
        blt             neon_f2b_unaligned_route
        rsb             r12, r12, #16
neon_f2b_16_start_u:
        sub             r2, r2, r12
        lsrs            r6, r12, #2
neon_f2b_align_16_4_u:
        ldr             r5, [r1], #4
        subs            r6, r6, #1
        orr             r4, r4, r5, lsl r8
        str             r4, [r0], #4
        mov             r4, r5, lsr r9
        bne             neon_f2b_align_16_4_u
neon_f2b_unaligned_route:
        cmp             r2, r3
        movle           r12, r2
        movgt           r12, r3
        cmp             r12, #256
        bge             neon_f2b_copy_64_u
        cmp             r12, #64
        bge             neon_f2b_copy_32_u
        cmp             r12, #16
        bge             neon_f2b_copy_16_u
        cmp             r12, #8
        bge             neon_f2b_copy_8_u
        cmp             r12, #4
        bge             neon_f2b_copy_4_u
        b               neon_f2b_last_bits_u
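        /*
         * The NEON unaligned blocks below extend the same shift-and-merge
         * idea to 64-bit lanes: each lane is shifted left by r8 bits
         * (VSHL), the bits that fall off the previous lane are recovered
         * with VSHR by 64 - r8 (the #56/#48/#40 variants selected via the
         * "lsls r5, r8, #28" flag trick for r8 = 8, 16, 24), and VORR
         * stitches the lanes and the scalar carry in r4 back together so
         * the wide stores land on the aligned destination.
         */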
 
neon_f2b_copy_64_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4}
        vstmdb          sp!, {q5-q8}
#else
        vpush           {q4}
        vpush           {q5-q8}
#endif
        vdup.u32        q8, r8
        mov             r12, r2, lsr #6
        and             r2, r2, #0x3f
neon_f2b_copy_64_u_loop:
        vld1.32         {q4, q5}, [r1]!
        vld1.32         {q6, q7}, [r1]!
        lsls            r5, r8, #28
        bcc             neon_f2b_copy_64_u_b8
        bpl             neon_f2b_copy_64_u_b16
        vshr.u64        q0, q4, #40
        vshr.u64        q1, q5, #40
        vshr.u64        q2, q6, #40
        vshr.u64        q3, q7, #40
        b               neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b8:
        vshr.u64        q0, q4, #56
        vshr.u64        q1, q5, #56
        vshr.u64        q2, q6, #56
        vshr.u64        q3, q7, #56
        b               neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b16:
        vshr.u64        q0, q4, #48
        vshr.u64        q1, q5, #48
        vshr.u64        q2, q6, #48
        vshr.u64        q3, q7, #48
neon_f2b_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vshl.u64        q4, q8, q4
        vshl.u64        q5, q8, q5
        vshl.u64        q6, q8, q6
        vshl.u64        q7, q8, q7
#else
        vshl.u64        q4, q4, q8
        vshl.u64        q5, q5, q8
        vshl.u64        q6, q6, q8
        vshl.u64        q7, q7, q8
#endif
        vmov            r5, s14
        vorr            d9, d9, d0
        vmov            s14, r4
        vorr            d10, d10, d1
        vorr            d11, d11, d2
        vorr            d12, d12, d3
        vorr            d13, d13, d4
        vorr            d14, d14, d5
        vorr            d15, d15, d6
        vorr            d8, d8, d7
        subs            r12, r12, #1
        pld             [r1, #0]
        pld             [r1, #128]
        mov             r4, r5
        vst1.32         {q4, q5}, [r0]!
        vst1.32         {q6, q7}, [r0]!
        pld             [r0, #0]
        pld             [r0, #128]
        bne             neon_f2b_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q5-q8}
        vldmia          sp!, {q4}
#else
        vpop            {q5-q8}
        vpop            {q4}
#endif
        cmp             r2, #32
        bge             neon_f2b_copy_32_u
        b               neon_f2b_copy_finish_u
neon_f2b_copy_32_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4}
#else
        vpush           {q4}
#endif
        vdup.u32        q4, r8
        mov             r12, r2, lsr #5
        and             r2, r2, #0x1f
neon_f2b_copy_32_u_loop:
        vld1.32         {q0, q1}, [r1]!
        lsls            r5, r8, #28
        bcc             neon_f2b_copy_32_u_b8
        bpl             neon_f2b_copy_32_u_b16
        vshr.u64        q2, q0, #40
        vshr.u64        q3, q1, #40
        b               neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b8:
        vshr.u64        q2, q0, #56
        vshr.u64        q3, q1, #56
        b               neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b16:
        vshr.u64        q2, q0, #48
        vshr.u64        q3, q1, #48
neon_f2b_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vshl.u64        q0, q4, q0
        vshl.u64        q1, q4, q1
#else
        vshl.u64        q0, q0, q4
        vshl.u64        q1, q1, q4
#endif
        vmov            r5, s14
        vorr            d1, d1, d4
        vmov            s14, r4
        vorr            d2, d2, d5
        vorr            d3, d3, d6
        vorr            d0, d0, d7
        subs            r12, r12, #1
        pld             [r1, #0]
        mov             r4, r5
        vst1.32         {q0, q1}, [r0]!
        bne             neon_f2b_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q4}
#else
        vpop            {q4}
#endif
neon_f2b_copy_finish_u:
neon_f2b_copy_16_u:
        movs            r12, r2, lsr #4
        beq             neon_f2b_copy_8_u
        vdup.u32        q2, r8
        and             r2, r2, #0xf
neon_f2b_copy_16_u_loop:
        vld1.32         {q0}, [r1]!
        lsls            r5, r8, #28
        bcc             neon_f2b_copy_16_u_b8
        bpl             neon_f2b_copy_16_u_b16
        vshr.u64        q1, q0, #40
        b               neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b8:
        vshr.u64        q1, q0, #56
        b               neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b16:
        vshr.u64        q1, q0, #48
neon_f2b_copy_16_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vshl.u64        q0, q2, q0
#else
        vshl.u64        q0, q0, q2
#endif
        vmov            r5, s6
        vorr            d1, d1, d2
        vmov            s6, r4
        vorr            d0, d0, d3
        subs            r12, r12, #1
        mov             r4, r5
        vst1.32         {q0}, [r0]!
        bne             neon_f2b_copy_16_u_loop
neon_f2b_copy_8_u:
        cmp             r2, #8
        blt             neon_f2b_copy_4_u
        ldm             r1!, {r6-r7}
        subs            r2, r2, #8
        orr             r4, r4, r6, lsl r8
        mov             r5, r6, lsr r9
        orr             r5, r5, r7, lsl r8
        stm             r0!, {r4-r5}
        mov             r4, r7, lsr r9
neon_f2b_copy_4_u:
        cmp             r2, #4
        blt             neon_f2b_last_bits_u
        ldr             r5, [r1], #4
        subs            r2, r2, #4
        orr             r4, r4, r5, lsl r8
        str             r4, [r0], #4
        mov             r4, r5, lsr r9
neon_f2b_last_bits_u:
        lsr             r8, r8, #0x3
neon_f2b_last_bits_u_loop:
        strb            r4, [r0], #1
        subs            r8, r8, #1
        lsr             r4, r4, #8
        bne             neon_f2b_last_bits_u_loop
neon_f2b_copy_1_u:
        cmp             r2, #0
        beq             neon_f2b_finish_u
neon_f2b_copy_1_u_loop:
        ldrb            r12, [r1], #1
        subs            r2, r2, #1
        strb            r12, [r0], #1
        bne             neon_f2b_copy_1_u_loop
neon_f2b_finish_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        ldmia           sp!, {r7-r9}
#else
        pop             {r7-r9}
#endif
        /* #############################################################
         * Front to Back copy - finish
         */
neon_f2b_finish:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        ldmia           sp!, {r4-r6}
#else
        pop             {r4-r6}
#endif
        b               neon_memmove_done

        /* #############################################################
         * Back to Front copy
         */
neon_back_to_front_copy:
        /*
         * Here, we'll want to shift to the end of the buffers.  This
         * actually points us one past where we need to go, but since
         * we'll pre-decrement throughout, this will be fine.
         */
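        /*
         * Equivalent C sketch of the setup below: point just past the end
         * of both buffers and copy with pre-decrement, so the highest
         * bytes move first:
         *
         *     d = dest + n;  s = src + n;
         *     while (n--) *--d = *--s;
         */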
 
        add             r0, r0, r2
        add             r1, r1, r2
        cmp             r2, #4
        bgt             neon_b2f_gt4
        cmp             r2, #0
neon_b2f_smallcopy_loop:
        beq             neon_memmove_done
        ldrb            r12, [r1, #-1]!
        subs            r2, r2, #1
        strb            r12, [r0, #-1]!
        b               neon_b2f_smallcopy_loop
neon_b2f_gt4:
        pld             [r0, #0]
        pld             [r1, #0]
        /*
         * The minimum of the overlap window size and the copy size
         * is in r3.
         */
        sub             r3, r0, r1
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        stmdb           sp!, {r4-r5}
#else
        push            {r4-r5}
#endif

        /*
         * Check alignment.  Since we'll pre-decrement as we step thru, we'll
         * need to make sure we're on word-alignment.
         */
neon_b2f_check_align:
        ands            r12, r0, #0x3
        beq             neon_b2f_source_align_check
        sub             r2, r2, r12
neon_b2f_shift_align:
        ldrb            r4, [r1, #-1]!
        subs            r12, r12, #1
        strb            r4, [r0, #-1]!
        bne             neon_b2f_shift_align
neon_b2f_source_align_check:
        ands            r4, r1, #0x3
        bne             neon_b2f_nonaligned

neon_b2f_try_16_align:
        /* If we're >64, attempt to align on 16-bytes.  Smaller amounts
         * don't seem to be worth handling. */
        cmp             r2, #64
        blt             neon_b2f_align_route
        ands            r12, r0, #0xf
        beq             neon_b2f_align_route
        /* In this case, r12 has the number of bytes to roll backward. */
neon_b2f_16_start:
        sub             r2, r2, r12
        lsrs            r5, r12, #2
neon_b2f_align_16_4:
        ldr             r4, [r1, #-4]!
        subs            r5, r5, #1
        str             r4, [r0, #-4]!
        bne             neon_b2f_align_16_4
neon_b2f_align_route:
        /*
         * #############################################################
         * Back to Front copy - aligned
         */
        cmp             r2, r3
        movle           r12, r2
        movgt           r12, r3
        cmp             r12, #256
        bge             neon_b2f_copy_128_a
        cmp             r12, #64
        bge             neon_b2f_copy_32_a
        cmp             r12, #8
        bge             neon_b2f_copy_8_a
        cmp             r12, #4
        bge             neon_b2f_copy_4_a
        b               neon_b2f_copy_1_a
neon_b2f_copy_128_a:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4-q7}
#else
        vpush           {q4-q7}
#endif
        movs            r12, r2, lsr #7
        /*
         * This irks me.  There MUST be a better way to read these in and
         * scan the register backward instead of making it go forward.  Then
         * we need to do two subtractions...
         */
neon_b2f_copy_128_a_loop:
        sub             r1, r1, #128
        sub             r0, r0, #128
        vld1.32         {q0, q1}, [r1]!
        vld1.32         {q2, q3}, [r1]!
        vld1.32         {q4, q5}, [r1]!
        vld1.32         {q6, q7}, [r1]!
        pld             [r1, #-128]
        pld             [r1, #-256]
        vst1.32         {q0, q1}, [r0]!
        vst1.32         {q2, q3}, [r0]!
        vst1.32         {q4, q5}, [r0]!
        vst1.32         {q6, q7}, [r0]!
        subs            r12, r12, #1
        pld             [r0, #-128]
        pld             [r0, #-256]
        sub             r1, r1, #128
        sub             r0, r0, #128
        bne             neon_b2f_copy_128_a_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q4-q7}
#else
        vpop            {q4-q7}
#endif
        ands            r2, r2, #0x7f
        beq             neon_b2f_finish
        cmp             r2, #32
        bge             neon_b2f_copy_32_a
        b               neon_b2f_copy_finish_a
neon_b2f_copy_32_a:
        mov             r12, r2, lsr #5
neon_b2f_copy_32_a_loop:
        sub             r1, r1, #32
        sub             r0, r0, #32
        vld1.32         {q0,q1}, [r1]
        subs            r12, r12, #1
        vst1.32         {q0,q1}, [r0]
        pld             [r1, #0]
        bne             neon_b2f_copy_32_a_loop
        ands            r2, r2, #0x1f
        beq             neon_b2f_finish
neon_b2f_copy_finish_a:
neon_b2f_copy_8_a:
        movs            r12, r2, lsr #0x3
        beq             neon_b2f_copy_4_a
neon_b2f_copy_8_a_loop:
        ldmdb           r1!, {r4-r5}
        subs            r12, r12, #1
        stmdb           r0!, {r4-r5}
        bne             neon_b2f_copy_8_a_loop
        and             r2, r2, #0x7
neon_b2f_copy_4_a:
        movs            r12, r2, lsr #0x2
        beq             neon_b2f_copy_1_a
        and             r2, r2, #0x3
neon_b2f_copy_4_a_loop:
        ldr             r4, [r1, #-4]!
        subs            r12, r12, #1
        str             r4, [r0, #-4]!
        bne             neon_b2f_copy_4_a_loop
neon_b2f_copy_1_a:
        cmp             r2, #0
        beq             neon_b2f_finish
neon_b2f_copy_1_a_loop:
        ldrb            r12, [r1, #-1]!
        subs            r2, r2, #1
        strb            r12, [r0, #-1]!
        bne             neon_b2f_copy_1_a_loop

        /* #############################################################
         * Back to Front copy - unaligned
         */
neon_b2f_nonaligned:
        /*
         * For sizes < 8, does it really make sense to do the whole shift
         * party?
         */
        cmp             r2, #8
        ble             neon_b2f_copy_1_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        stmdb           sp!, {r6-r11}
#else
        push            {r6-r11}
#endif
        /*
         * r3 = max window size
         * r4 = overflow bytes
         * r5 = bytes we're reading into
         * r6 = # bytes we're off.
         * r10 = copy of r6
         */
        and             r6, r1, #0x3
        eor             r4, r4, r4
        mov             r10, r6
neon_b2f_realign:
        ldrb            r5, [r1, #-1]!
        subs            r6, r6, #1
        orr             r4, r5, r4, lsl #8
        bne             neon_b2f_realign
        /*
         * r10 = # of bits we copied into the r4 register to align source.
         * r11 = 32 - r10
         * r12 = Index counter for each size, so we determine how many times
         *       the given size will go into r2, then count down that # of
         *       times in r12.
         */
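        /*
         * The backward word loops below mirror the front-to-back
         * shift-and-merge, working down through memory.  Per 32-bit word,
         * in C (sketch):
         *
         *     uint32_t w = *--src_word;
         *     *--dst_word = (carry << r11) | (w >> r10);  // r10 = 8 * bytes held over
         *     carry       = w & ((1u << r10) - 1);        // r11 = 32 - r10
         */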
 
        sub             r2, r2, r10
        lsl             r10, r10, #0x3
        rsb             r11, r10, #32

        cmp             r2, r3
        movle           r12, r2
        movgt           r12, r3
        cmp             r12, #256
        bge             neon_b2f_copy_64_u
        cmp             r12, #64
        bge             neon_b2f_copy_32_u
        cmp             r12, #8
        bge             neon_b2f_copy_8_u
        cmp             r12, #4
        bge             neon_b2f_copy_4_u
        b               neon_b2f_last_bits_u
neon_b2f_copy_64_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4,q5}
        vstmdb          sp!, {q6-q8}
#else
        vpush           {q4,q5}
        vpush           {q6-q8}
#endif
        add             r7, r11, #32
        movs            r12, r2, lsr #6
        vdup.u32        q8, r7
neon_b2f_copy_64_u_loop:
        sub             r1, r1, #64
        sub             r0, r0, #64
        vld1.32         {q0, q1}, [r1]!
        vld1.32         {q2, q3}, [r1]
        sub             r1, r1, #32
        vmov            q4, q0
        vmov            q5, q1
        vmov            q6, q2
        vmov            q7, q3
        vmov            r5, s0
        mov             r4, r4, lsl r11
        lsls            r6, r10, #28
        bcc             neon_b2f_copy_64_u_b8
        bpl             neon_b2f_copy_64_u_b16
        vshr.u64        q0, q0, #24
        vshr.u64        q1, q1, #24
        vshr.u64        q2, q2, #24
        vshr.u64        q3, q3, #24
        b               neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b8:
        vshr.u64        q0, q0, #8
        vshr.u64        q1, q1, #8
        vshr.u64        q2, q2, #8
        vshr.u64        q3, q3, #8
        b               neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b16:
        vshr.u64        q0, q0, #16
        vshr.u64        q1, q1, #16
        vshr.u64        q2, q2, #16
        vshr.u64        q3, q3, #16
neon_b2f_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vshl.u64        q4, q8, q4
        vshl.u64        q5, q8, q5
        vshl.u64        q6, q8, q6
        vshl.u64        q7, q8, q7
#else
        vshl.u64        q4, q4, q8
        vshl.u64        q5, q5, q8
        vshl.u64        q6, q6, q8
        vshl.u64        q7, q7, q8
#endif
        vmov            s17, r4
        vorr            d7, d7, d8
        vorr            d6, d6, d15
        vorr            d5, d5, d14
        vorr            d4, d4, d13
        vorr            d3, d3, d12
        vorr            d2, d2, d11
        vorr            d1, d1, d10
        vorr            d0, d0, d9
        mov             r4, r5, lsl r11
        subs            r12, r12, #1
        lsr             r4, r4, r11
        vst1.32         {q0, q1}, [r0]!
        vst1.32         {q2, q3}, [r0]
        pld             [r1, #0]
        sub             r0, r0, #32
        bne             neon_b2f_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q6-q8}
        vldmia          sp!, {q4,q5}
#else
        vpop            {q6-q8}
        vpop            {q4,q5}
#endif
        ands            r2, r2, #0x3f
        cmp             r2, #32
        bge             neon_b2f_copy_32_u
        b               neon_b2f_copy_finish_u
neon_b2f_copy_32_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vstmdb          sp!, {q4}
#else
        vpush           {q4}
#endif
        add             r7, r11, #32
        movs            r12, r2, lsr #5
        vdup.u32        q4, r7
        and             r2, r2, #0x1f
neon_b2f_copy_32_u_loop:
        sub             r1, r1, #32
        sub             r0, r0, #32
        vld1.32         {q0, q1}, [r1]
        vmov            q2, q0
        vmov            q3, q1
        vmov            r5, s0
        mov             r4, r4, lsl r11
        lsls            r6, r10, #28
        bcc             neon_b2f_copy_32_u_b8
        bpl             neon_b2f_copy_32_u_b16
        vshr.u64        q0, q0, #24
        vshr.u64        q1, q1, #24
        b               neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b8:
        vshr.u64        q0, q0, #8
        vshr.u64        q1, q1, #8
        b               neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b16:
        vshr.u64        q0, q0, #16
        vshr.u64        q1, q1, #16
neon_b2f_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vshl.u64        q2, q4, q2
        vshl.u64        q3, q4, q3
#else
        vshl.u64        q2, q2, q4
        vshl.u64        q3, q3, q4
#endif
        vmov            s9, r4
        vorr            d3, d3, d4
        vorr            d2, d2, d7
        vorr            d1, d1, d6
        vorr            d0, d0, d5
        mov             r4, r5, lsl r11
        subs            r12, r12, #1
        lsr             r4, r4, r11
        vst1.32         {q0, q1}, [r0]
        pld             [r1, #0]
        bne             neon_b2f_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        vldmia          sp!, {q4}
#else
        vpop            {q4}
#endif
neon_b2f_copy_finish_u:
neon_b2f_copy_8_u:
        movs            r12, r2, lsr #0x3
        beq             neon_b2f_copy_4_u
        mov             r5, r4, lsl r11
neon_b2f_copy_8_u_loop:
        ldmdb           r1!, {r6-r7}
        subs            r12, r12, #1
        orr             r5, r5, r7, lsr r10
        mov             r4, r7, lsl r11
        orr             r4, r4, r6, lsr r10
        stmdb           r0!, {r4-r5}
        mov             r4, r6, lsl r11
        lsr             r4, r4, r11
        mov             r5, r4, lsl r11
        bne             neon_b2f_copy_8_u_loop
        ands            r2, r2, #0x7
neon_b2f_copy_4_u:
        movs            r12, r2, lsr #0x2
        beq             neon_b2f_last_bits_u
        mov             r5, r4, lsl r11
neon_b2f_copy_4_u_loop:
        ldr             r6, [r1, #-4]!
        subs            r12, r12, #1
        orr             r5, r5, r6, lsr r10
        str             r5, [r0, #-4]!
        mov             r4, r6, lsl r11
        lsr             r4, r4, r11
        mov             r5, r4, lsl r11
        bne             neon_b2f_copy_4_u_loop
        and             r2, r2, #0x3
neon_b2f_last_bits_u:
neon_b2f_last_bits_u_loop:
        subs            r10, r10, #8
        mov             r5, r4, lsr r10
        strb            r5, [r0, #-1]!
        bne             neon_b2f_last_bits_u_loop
neon_b2f_copy_1_u:
        cmp             r2, #0
        beq             neon_b2f_finish_u
neon_b2f_copy_1_u_loop:
        ldrb            r12, [r1, #-1]!
        subs            r2, r2, #1
        strb            r12, [r0, #-1]!
        bne             neon_b2f_copy_1_u_loop
neon_b2f_finish_u:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        ldmia           sp!, {r6-r11}
#else
        pop             {r6-r11}
#endif

neon_b2f_finish:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        ldmia           sp!, {r4-r5}
#else
        pop             {r4-r5}
#endif

neon_memmove_done:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
        ldmia           sp!, {r0}
#else
        pop             {r0}
#endif
        bx              lr

        .endfunc
        .end