/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

	.global memcmp
	.type memcmp, %function
	.text

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled in a way that they won't stall
 */
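
/*
 * Register usage in the code below:
 *   r0 - first source pointer on entry, then scratch; holds the return value
 *   r1 - second source pointer
 *   r2 - number of bytes left to compare
 *   r4 - working copy of the first pointer (r0 is needed for the result)
 *   r3, ip, lr (plus r5-r7 in the misaligned path) - scratch
 */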

memcmp:
        .fnstart
        PLD         [r0, #0]
        PLD         [r1, #0]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         [r0, #32]
        PLD         [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

         mov        r4, r0

         /* make sure we have at least 8+4 bytes, this simplifies things below
          * and avoids some overhead for small blocks
          */
         cmp        r2, #(8+4)
         bmi        8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
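        /* e.g. src = 0x1003 gives offset 1, src = 0x1001 gives offset 3 */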
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f
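
        /* in the unrolled loop below, ip and lr take turns holding the word
         * from r1 that was loaded one step earlier, so each eors compares
         * against data whose load was issued well before it is needed
         * (optimization (2) in the header comment)
         */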

0:      PLD         [r4, #64]
        PLD         [r1, #64]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
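        /* r2 now holds the number of bytes left minus 4 (the word read
         * ahead); the loop below keeps that bias and label 4 restores it
         */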
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr
        .fnend





5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
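        /* each word of the second buffer is rebuilt as (lr >> 16) | (next << 16),
         * i.e. the top half of the previous aligned word followed by the bottom
         * half of the next one (little-endian byte order assumed)
         */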

6:      PLD         [r1, #64]
        PLD         [r4, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
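        /* r1 was word-aligned and read one word ahead; step it back 2 bytes
         * so it points at the next unread byte of the second buffer
         */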
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount (offset * 8)
        // r6 = left shift amount (32 - r5)
        // r7 = scratch (read-ahead word from r1)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
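        /* e.g. with offset 1: r5 = 8, r6 = 24, and each compared word is
         * (previous >> 8) | (next << 24), i.e. four consecutive bytes of
         * the misaligned buffer (little-endian byte order assumed)
         */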

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
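        /* r6 >> 3 is (4 - offset) bytes: this rewinds r1 from its read-ahead
         * position back to the next unread byte of the second buffer
         */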
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b