/***************************************************************************
 Copyright (c) 2009, Code Aurora Forum. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Code Aurora nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/
/***************************************************************************
 * Neon memmove: Attempts to do a memmove with Neon registers if possible.
 *
 *   dest: The destination buffer
 *   src:  The source buffer
 *   n:    The number of bytes to transfer
 ***************************************************************************/
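/* For reference, a minimal sketch of the C-visible prototype this routine
 * implements (the exact exported symbol name is an assumption here):
 *
 *   void *neon_memmove(void *dest, const void *src, size_t n);
 */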
/* The original code, built with RVCT, used PUSH/POP and VPUSH/VPOP.
 * However, it looks like the 2006 CodeSourcery assembler has issues
 * generating the correct object code for VPOP, resulting in horrific
 * stack crashes. As a result, I've temporarily mapped PUSH->STMDB,
 * POP->LDMIA, VPUSH->VSTMDB, and VPOP->VLDMIA. We can revert this
 * once we update our toolchain.
 *
 * Also, VSHL swaps the source register and the shift-amount register
 * around in 2006-q3. I've written those operands in the "wrong" order
 * here so the object code comes out correct, but we'll need to undo
 * that later...
 */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* The requirements for memmove state that the function should
	 * operate as if data were first copied from the source into a
	 * temporary buffer, and then from that buffer into the
	 * destination. This allows a caller to copy between a source
	 * and a destination that overlap.
	 *
	 * We can't just do byte copies front-to-back automatically, since
	 * there's a good chance we have an overlap (why else would someone
	 * intentionally use memmove?).
	 *
	 * We'll break this into two parts: front-to-back, or back-to-front
	 * copies.
	 */
	blt	neon_front_to_back_copy
	bgt	neon_back_to_front_copy
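	/* A minimal C sketch of the dispatch above, assuming dest arrives
	 * in r0, src in r1, and n in r2 per the AAPCS (the compare that
	 * sets the flags is not shown in this listing; the helper names
	 * are illustrative, not real symbols):
	 *
	 *   void *memmove(void *dest, const void *src, size_t n)
	 *   {
	 *       if (dest < src)
	 *           copy_front_to_back(dest, src, n);
	 *       else if (dest > src)
	 *           copy_back_to_front(dest, src, n);
	 *       return dest;   // dest == src: nothing to do
	 *   }
	 */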
	/* #############################################################
	 * Front to Back copy
	 */
neon_front_to_back_copy:
	/* For small copies, just do a quick byte-by-byte copy. We can
	 * do this for front-to-back copies, aligned or unaligned, since
	 * we're only moving one byte at a time... */
neon_f2b_smallcopy_loop:
	b	neon_f2b_smallcopy_loop
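	/* The body of the loop above is elided in this listing; it
	 * presumably amounts to the classic byte copy:
	 *
	 *   while (n--) *dst++ = *src++;
	 */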
	/* Preload what we can... */
	/* The overlap window size is in r3. */
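	/* Sketch of the routing constraint, assuming r3 holds the
	 * distance between source and destination: the chunk size chosen
	 * per iteration must not exceed min(r3, n), or an iteration's
	 * stores could land on source bytes a later iteration still
	 * needs to read.
	 *
	 *   size_t window = (size_t)(src - dst);     // dst < src here
	 *   size_t route  = window < n ? window : n; // route picks the
	 *                                            // chunk size below
	 */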
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_check_align:
	/* Check alignment. */
	beq	neon_f2b_source_align_check
neon_f2b_source_align_check:
	bne	neon_f2b_nonaligned

neon_f2b_try_16_align:
	/* If we're copying more than 64 bytes, attempt to align on a
	 * 16-byte boundary. Smaller amounts don't seem to be worth
	 * handling. */
	blt	neon_f2b_align_route
	/* This is where we try for 16-byte alignment. */
	beq	neon_f2b_align_route
	bne	neon_f2b_align_16_4
neon_f2b_align_route:
	/* #############################################################
	 * Front to Back copy - aligned
	 *
	 * Note that we can't just route based on the size in r2. If that's
	 * larger than the overlap window in r3, we could potentially
	 * (and likely!) destroy data we're copying.
	 */
	bge	neon_f2b_copy_128_a
	bge	neon_f2b_copy_32_a
	bge	neon_f2b_copy_16_a
	bge	neon_f2b_copy_8_a
	bge	neon_f2b_copy_4_a
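	/* The compare instructions feeding the branches above are not
	 * shown in this listing; judging by the label names, the routing
	 * amounts to this sketch (route = min(window, size), as noted):
	 *
	 *   if      (route >= 128) goto copy_128_a;
	 *   else if (route >= 32)  goto copy_32_a;
	 *   else if (route >= 16)  goto copy_16_a;
	 *   else if (route >= 8)   goto copy_8_a;
	 *   else if (route >= 4)   goto copy_4_a;
	 *   else                   goto copy_1_a;
	 */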
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_128_a_loop:
	/* 128 bytes per iteration: load q0-q7 from the source, then
	 * store all eight registers to the destination. */
	vld1.32	{q0,q1}, [r1]!
	vld1.32	{q2,q3}, [r1]!
	vld1.32	{q4,q5}, [r1]!
	vld1.32	{q6,q7}, [r1]!
	vst1.32	{q0,q1}, [r0]!
	vst1.32	{q2,q3}, [r0]!
	vst1.32	{q4,q5}, [r0]!
	vst1.32	{q6,q7}, [r0]!
	bne	neon_f2b_copy_128_a_loop
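	/* Note that q4-q7 alias d8-d15, which are callee-saved under the
	 * AAPCS; saving and restoring them is what the VSTMDB/VLDMIA
	 * (VPUSH/VPOP) workaround described at the top of the file is
	 * about. The prologue itself is not shown in this listing. */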
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_f2b_copy_32_a
	b	neon_f2b_copy_finish_a
neon_f2b_copy_32_a_loop:
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!
	bne	neon_f2b_copy_32_a_loop
neon_f2b_copy_finish_a:
	beq	neon_f2b_copy_8_a
neon_f2b_copy_16_a_loop:
	bne	neon_f2b_copy_16_a_loop
	blt	neon_f2b_copy_4_a
	blt	neon_f2b_copy_1_a
neon_f2b_copy_1_a_loop:
	bne	neon_f2b_copy_1_a_loop
	/* #############################################################
	 * Front to Back copy - unaligned
	 *
	 * For sizes < 8, does it really make sense to do the whole shift
	 * party? Note that we DON'T want to call neon_f2b_copy_1_u,
	 * since we'll end up trying to pop r8-r11, and we DON'T want
	 * to do that.
	 */
	ble	neon_f2b_copy_1_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	orrle	r4, r4, r5, lsl #8
	orrlt	r4, r4, r6, lsl #16
	/* r8  = # of bits we copied into the r4 register to align source.
	 * r12 = index counter for each size: we determine how many times
	 *       the given size will go into r2, then count down that
	 *       number of times in r12. */
	blt	neon_f2b_unaligned_route
	beq	neon_f2b_unaligned_route
	blt	neon_f2b_unaligned_route

neon_f2b_align_16_4_u:
	orr	r4, r4, r5, lsl r8
	bne	neon_f2b_align_16_4_u
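	/* A sketch of the shift-merge trick used throughout the unaligned
	 * path, with r8 bits of pending data held in r4 (all names below
	 * are illustrative, not the registers used by the code):
	 *
	 *   uint32_t next = *aligned_src++;          // aligned word load
	 *   uint32_t out  = partial | (next << k);   // k == r8
	 *   partial = next >> (32 - k);              // carry the leftover
	 *   *aligned_dst++ = out;                    // bits to next round
	 */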
neon_f2b_unaligned_route:
	bge	neon_f2b_copy_64_u
	bge	neon_f2b_copy_32_u
	bge	neon_f2b_copy_16_u
	bge	neon_f2b_copy_8_u
	bge	neon_f2b_copy_4_u
	b	neon_f2b_last_bits_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_64_u_loop:
	vld1.32	{q4, q5}, [r1]!
	vld1.32	{q6, q7}, [r1]!
	/* The condition flags (set from the misalignment, not shown in
	 * this listing) select which byte-shift variant realigns the
	 * loaded data before it is stored. */
	bcc	neon_f2b_copy_64_u_b8
	bpl	neon_f2b_copy_64_u_b16
	b	neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b8:
	b	neon_f2b_copy_64_unify
neon_f2b_copy_64_u_b16:
neon_f2b_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q4, q5}, [r0]!
	vst1.32	{q6, q7}, [r0]!
	bne	neon_f2b_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_f2b_copy_32_u
	b	neon_f2b_copy_finish_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_32_u_loop:
	vld1.32	{q0, q1}, [r1]!
	bcc	neon_f2b_copy_32_u_b8
	bpl	neon_f2b_copy_32_u_b16
	b	neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b8:
	b	neon_f2b_copy_32_unify
neon_f2b_copy_32_u_b16:
neon_f2b_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]!
	bne	neon_f2b_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_f2b_copy_finish_u:
	beq	neon_f2b_copy_8_u
neon_f2b_copy_16_u_loop:
	bcc	neon_f2b_copy_16_u_b8
	bpl	neon_f2b_copy_16_u_b16
	b	neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b8:
	b	neon_f2b_copy_16_unify
neon_f2b_copy_16_u_b16:
neon_f2b_copy_16_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bne	neon_f2b_copy_16_u_loop
	blt	neon_f2b_copy_4_u
	orr	r4, r4, r6, lsl r8
	orr	r5, r5, r7, lsl r8
	blt	neon_f2b_last_bits_u
	orr	r4, r4, r5, lsl r8
neon_f2b_last_bits_u:
neon_f2b_last_bits_u_loop:
	bne	neon_f2b_last_bits_u_loop
	beq	neon_f2b_finish_u
neon_f2b_copy_1_u_loop:
	bne	neon_f2b_copy_1_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* #############################################################
	 * Front to Back copy - finish
	 */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* #############################################################
	 * Back to Front copy
	 */
neon_back_to_front_copy:
	/* Here, we'll want to shift to the end of the buffers. This
	 * actually points us one past where we need to go, but since
	 * we'll pre-decrement throughout, this will be fine. */
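	/* Sketch of the setup just described (the pointer adjustments
	 * themselves are not shown in this listing):
	 *
	 *   dst += n;          // one past the last byte to write
	 *   src += n;          // one past the last byte to read
	 *   ...
	 *   *--dst = *--src;   // pre-decrement walks backward
	 */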
neon_b2f_smallcopy_loop:
	beq	neon_memmove_done
	b	neon_b2f_smallcopy_loop

	/* The minimum of the overlap window size and the copy size
	 * goes into r3, as in the front-to-back path. */
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* Check alignment. Since we'll pre-decrement as we step through,
	 * we'll need to make sure we're on word alignment. */
neon_b2f_check_align:
	beq	neon_b2f_source_align_check
neon_b2f_shift_align:
	bne	neon_b2f_shift_align
neon_b2f_source_align_check:
	bne	neon_b2f_nonaligned
neon_b2f_try_16_align:
	/* If we're copying more than 64 bytes, attempt to align on a
	 * 16-byte boundary. Smaller amounts don't seem to be worth
	 * handling. */
	blt	neon_b2f_align_route
	beq	neon_b2f_align_route
	/* In this case, r12 has the number of bytes to roll backward. */
	bne	neon_b2f_align_16_4
neon_b2f_align_route:
	/* #############################################################
	 * Back to Front copy - aligned
	 */
	bge	neon_b2f_copy_128_a
	bge	neon_b2f_copy_32_a
	bge	neon_b2f_copy_8_a
	bge	neon_b2f_copy_4_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* This irks me. There MUST be a better way to read these in and
	 * scan the register backward instead of making it go forward.
	 * Then we need to do two subtractions... */
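	/* What the grumble above refers to, as a sketch: the pointers are
	 * presumably rolled back by 128 before the loads (not shown), the
	 * post-incremented loads and stores below then walk FORWARD
	 * through the 128-byte block, and two subtractions roll both
	 * pointers back again for the next, earlier block. */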
neon_b2f_copy_128_a_loop:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	vld1.32	{q4, q5}, [r1]!
	vld1.32	{q6, q7}, [r1]!
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	vst1.32	{q4, q5}, [r0]!
	vst1.32	{q6, q7}, [r0]!
	bne	neon_b2f_copy_128_a_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_b2f_copy_32_a
	b	neon_b2f_copy_finish_a
neon_b2f_copy_32_a_loop:
	vld1.32	{q0,q1}, [r1]
	vst1.32	{q0,q1}, [r0]
	bne	neon_b2f_copy_32_a_loop
neon_b2f_copy_finish_a:
	/* r12 = number of remaining 8-byte chunks (r2 / 8). */
	movs	r12, r2, lsr #0x3
	beq	neon_b2f_copy_4_a
neon_b2f_copy_8_a_loop:
	bne	neon_b2f_copy_8_a_loop
	/* r12 = number of remaining 4-byte chunks (r2 / 4). */
	movs	r12, r2, lsr #0x2
	beq	neon_b2f_copy_1_a
neon_b2f_copy_4_a_loop:
	bne	neon_b2f_copy_4_a_loop
neon_b2f_copy_1_a_loop:
	bne	neon_b2f_copy_1_a_loop
	/* #############################################################
	 * Back to Front copy - unaligned
	 *
	 * For sizes < 8, does it really make sense to do the whole shift
	 * party?
	 */
	ble	neon_b2f_copy_1_a
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	/* r3 = max window size
	 * r4 = overflow bytes
	 * r5 = bytes we're reading into
	 * r6 = # of bytes we're off
	 */
	orr	r4, r5, r4, lsl #8
	/* r10 = # of bits we copied into the r4 register to align source.
	 * r12 = index counter for each size: we determine how many times
	 *       the given size will go into r2, then count down that
	 *       number of times in r12. */
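	/* Mirror of the front-to-back shift-merge, walking backward; the
	 * merges below use LSR because the pending bits now sit at the
	 * TOP of the partial word (names illustrative, k == r10):
	 *
	 *   uint32_t next = *--aligned_src;
	 *   uint32_t out  = partial | (next >> k);
	 *   partial = next << (32 - k);
	 *   *--aligned_dst = out;
	 */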
	bge	neon_b2f_copy_64_u
	bge	neon_b2f_copy_32_u
	bge	neon_b2f_copy_8_u
	bge	neon_b2f_copy_4_u
	b	neon_b2f_last_bits_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_64_u_loop:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]
	bcc	neon_b2f_copy_64_u_b8
	bpl	neon_b2f_copy_64_u_b16
	b	neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b8:
	b	neon_b2f_copy_64_unify
neon_b2f_copy_64_u_b16:
neon_b2f_copy_64_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]
	bne	neon_b2f_copy_64_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	bge	neon_b2f_copy_32_u
	b	neon_b2f_copy_finish_u
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_32_u_loop:
	vld1.32	{q0, q1}, [r1]
	bcc	neon_b2f_copy_32_u_b8
	bpl	neon_b2f_copy_32_u_b16
	b	neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b8:
	b	neon_b2f_copy_32_unify
neon_b2f_copy_32_u_b16:
neon_b2f_copy_32_unify:
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
	vst1.32	{q0, q1}, [r0]
	bne	neon_b2f_copy_32_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
neon_b2f_copy_finish_u:
	/* r12 = number of remaining 8-byte chunks (r2 / 8). */
	movs	r12, r2, lsr #0x3
	beq	neon_b2f_copy_4_u
neon_b2f_copy_8_u_loop:
	orr	r5, r5, r7, lsr r10
	orr	r4, r4, r6, lsr r10
	bne	neon_b2f_copy_8_u_loop
	/* r12 = number of remaining 4-byte chunks (r2 / 4). */
	movs	r12, r2, lsr #0x2
	beq	neon_b2f_last_bits_u
neon_b2f_copy_4_u_loop:
	orr	r5, r5, r6, lsr r10
	bne	neon_b2f_copy_4_u_loop
neon_b2f_last_bits_u:
neon_b2f_last_bits_u_loop:
	bne	neon_b2f_last_bits_u_loop
	beq	neon_b2f_finish_u
neon_b2f_copy_1_u_loop:
	bne	neon_b2f_copy_1_u_loop
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)