103
by Will Newton
Split bionic reference code into A15 and A9 versions. |
1 |
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Optimized memset() for ARM.
 *
 * memset() returns its first argument.
 */

// Assembler setup: the short-fill path uses NEON (vdup/vst1), and the
// IT blocks and lsls/subs forms below rely on unified syntax.
        .fpu    neon
        .syntax unified

/*
 * void *memset(void *dst, int value, size_t count)
 *
 * On entry:
 *   r0 = dst    destination pointer (any alignment)
 *   r1 = value  only the low byte is used
 *   r2 = count  number of bytes to set (treated as unsigned)
 * On exit:
 *   r0 = dst    (the original pointer, reloaded from the stack)
 *
 * Clobbers: r1, r2, r3, ip, d0, condition flags.
 * Stack:    one word (the saved r0), popped before returning.
 *
 * Strategy:
 *   count < 16   NEON/byte stores straight through r0.
 *   count >= 16  align r3 to 8 bytes, then strd r0/r1 in 64-byte chunks,
 *                then finish the remaining 0..63 bytes by testing the
 *                individual bits of the count (32, 16, 8, 4, 2, 1).
 *
 * All internal labels are assembler-local (.L prefix) so that only
 * "memset" appears in the symbol table.
 */
        .globl  memset
        .type   memset,%function
memset:
        .fnstart
        .save   {r0}
        stmfd   sp!, {r0}               // keep dst for the return value

        // The strd algorithm is slower for fills < 16 bytes, so use the
        // NEON code in that case.
        cmp     r2, #16
        blo     .L_set_less_than_16_unknown_align

        // strd requires an even/odd register pair, so move the values
        // so that:
        //   r0 and r1 contain the memset value
        //   r2 is the number of bytes to set
        //   r3 is the destination pointer
        mov     r3, r0

        // Replicate the low byte of r1 into every byte of r1.
        mov     r1, r1, lsl #24
        orr     r1, r1, r1, lsr #8
        orr     r1, r1, r1, lsr #16

.L_check_alignment:
        // Align destination to a double word to avoid the strd crossing
        // a cache line boundary.
        ands    ip, r3, #7              // ip = dst & 7
        bne     .L_do_double_word_align

.L_double_word_aligned:
        mov     r0, r1                  // r0:r1 = fill pattern pair for strd

        subs    r2, #64
        blo     .L_set_less_than_64     // fewer than 64 bytes left

.L_set_64_loop:
        // Main loop sets 64 bytes at a time.
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        strd    r0, r1, [r3, \offset]
        .endr

        add     r3, #64
        subs    r2, #64
        // r2 is an unsigned byte count: loop while the subtraction did
        // not borrow.  (A signed "bge" here would leave the loop early
        // for counts with bit 31 set and truncate the fill.)
        bhs     .L_set_64_loop

.L_set_less_than_64:
        // Restore r2 to the count of bytes left to set (0..63).
        add     r2, #64
        lsls    ip, r2, #27             // C = bit 5 (32), N = bit 4 (16)
        bcc     .L_set_less_than_32
        // Set 32 bytes.
        .irp    offset, #0, #8, #16, #24
        strd    r0, r1, [r3, \offset]
        .endr
        add     r3, #32                 // no S suffix: flags stay valid

.L_set_less_than_32:
        bpl     .L_set_less_than_16     // N still holds bit 4 of the count
        // Set 16 bytes.
        .irp    offset, #0, #8
        strd    r0, r1, [r3, \offset]
        .endr
        add     r3, #16

.L_set_less_than_16:
        // Less than 16 bytes to set.
        lsls    ip, r2, #29             // C = bit 3 (8), N = bit 2 (4)
        bcc     .L_set_less_than_8

        // Set 8 bytes.
        strd    r0, r1, [r3], #8

.L_set_less_than_8:
        bpl     .L_set_less_than_4
        // Set 4 bytes.
        str     r1, [r3], #4

.L_set_less_than_4:
        lsls    ip, r2, #31             // C = bit 1 (2), Z clear if bit 0 set
        it      ne
        strbne  r1, [r3], #1
        itt     cs
        strbcs  r1, [r3], #1
        strbcs  r1, [r3]

        ldmfd   sp!, {r0}               // return the original dst
        bx      lr

.L_do_double_word_align:
        // ip = dst & 7 (non-zero).  Store 8 - ip bytes (1..7) so that r3
        // becomes 8-byte aligned.  count >= 16 here, so this cannot
        // overrun the buffer.
        rsb     ip, ip, #8
        sub     r2, r2, ip
        movs    r0, ip, lsl #31         // N = bit 0, C = bit 1 of ip; r0 is scratch
        it      mi
        strbmi  r1, [r3], #1
        itt     cs
        strbcs  r1, [r3], #1
        strbcs  r1, [r3], #1

        // Dst is at least word aligned by this point.
        cmp     ip, #4
        blo     .L_double_word_aligned
        str     r1, [r3], #4
        b       .L_double_word_aligned

.L_set_less_than_16_unknown_align:
        // Set up to 15 bytes, writing through r0 (dst is reloaded from
        // the stack before returning).
        vdup.8  d0, r1                  // d0 = 8 copies of the fill byte
        movs    ip, r2, lsl #29         // C = bit 3 (8), N = bit 2 (4)
        bcc     .L_short_less_than_8
        vst1.8  {d0}, [r0]!
.L_short_less_than_8:
        // Test N directly instead of relying on V left over from the cmp.
        bpl     .L_short_less_than_4
        vst1.32 {d0[0]}, [r0]!
.L_short_less_than_4:
        movs    ip, r2, lsl #31         // N = bit 0 (1), C = bit 1 (2)
        it      mi
        strbmi  r1, [r0], #1
        itt     cs
        strbcs  r1, [r0], #1
        strbcs  r1, [r0], #1
        ldmfd   sp!, {r0}               // return the original dst
        bx      lr
        .fnend
        .size   memset, .-memset