/*
 * ~linaro-toolchain-dev/cortex-strings/trunk
 * 103 by Will Newton
 * Split bionic reference code into A15 and A9 versions.
 */
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         */

        .fpu        neon
        .syntax     unified

        /*
         * void *memset(void *dst, int c, size_t n)
         *
         * Stores the low byte of c into the n bytes starting at dst and
         * returns dst.
         *
         * On entry:  r0 = dst, r1 = c, r2 = n
         * On return: r0 = dst (pushed at entry, popped before each bx lr)
         * Scratch:   r1, r2, r3, ip, d0 and the condition flags
         *
         * n < 16:   the bytes are written through r0 with one optional
         *           8-byte NEON store, one optional 4-byte NEON store and
         *           up to three byte stores, whatever the alignment of dst.
         * n >= 16:  r3 becomes the write pointer, the fill byte is
         *           replicated into all four bytes of r0 and r1, r3 is
         *           advanced to an 8-byte boundary, and the rest is written
         *           with strd: 64 bytes per loop iteration, then a
         *           32/16/8/4/2/1 byte tail selected by the low bits of the
         *           remaining count.
         *
         * Branch targets inside the function use the .L prefix so that they
         * are assembler-local and memset is the only symbol defined here.
         */
        .globl      memset
        .type       memset, %function
memset:
        .fnstart
        .save       {r0}
        // Keep dst on the stack: r0 is reused below and must be returned.
        stmfd       sp!, {r0}

        // The new algorithm is slower for copies < 16 so use the old
        // neon code in that case.
        cmp         r2, #16
        blo         .L_set_less_than_16_unknown_align

        // Use strd which requires an even and odd register so move the
        // values so that:
        //   r0 and r1 contain the memset value
        //   r2 is the number of bytes to set
        //   r3 is the destination pointer
        mov         r3, r0

        // Copy the byte value in every byte of r1.
        mov         r1, r1, lsl #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

.L_check_alignment:
        // Align destination to a double word to avoid the strd crossing
        // a cache line boundary.
        // ip = low three bits of the write pointer; zero means aligned.
        ands        ip, r3, #7
        bne         .L_do_double_word_align

.L_double_word_aligned:
        // r0 and r1 now both hold the replicated fill word for strd.
        mov         r0, r1

        // From here until .L_set_less_than_64, r2 = bytes left minus 64.
        subs        r2, #64
        blo         .L_set_less_than_64

1:      // Main loop sets 64 bytes at a time.
        .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
        strd        r0, r1, [r3, \offset]
        .endr

        add         r3, #64
        subs        r2, #64
        // The count is unsigned, so loop while the subtraction did not
        // borrow. This is the counterpart of the blo test before the loop;
        // a signed test would leave the loop early for counts of 2 GiB or
        // more.
        bhs         1b

.L_set_less_than_64:
        // Restore r2 to the count of bytes left to set.
        add         r2, #64
        // Shift so that C = bit 5 of the count (32) and N = bit 4 (16).
        lsls        ip, r2, #27
        bcc         .L_set_less_than_32
        // Set 32 bytes.
        .irp        offset, #0, #8, #16, #24
        strd        r0, r1, [r3, \offset]
        .endr
        add         r3, #32

.L_set_less_than_32:
        // N is still the one produced by the lsls above: the strd and the
        // non-flag-setting add in between leave the flags alone.
        bpl         .L_set_less_than_16
        // Set 16 bytes.
        .irp        offset, #0, #8
        strd        r0, r1, [r3, \offset]
        .endr
        add         r3, #16

.L_set_less_than_16:
        // Less than 16 bytes to set.
        // Shift so that C = bit 3 of the count (8) and N = bit 2 (4).
        lsls        ip, r2, #29
        bcc         .L_set_less_than_8

        // Set 8 bytes.
        strd        r0, r1, [r3], #8

.L_set_less_than_8:
        bpl         .L_set_less_than_4
        // Set 4 bytes
        str         r1, [r3], #4

.L_set_less_than_4:
        // Shift so that C = bit 1 of the count (2); the result is non-zero
        // exactly when bit 0 (1) is set.
        lsls        ip, r2, #31
        it          ne
        strbne      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]

        ldmfd       sp!, {r0}
        bx          lr

.L_do_double_word_align:
        // ip = bytes (1 to 7) needed to reach the next 8-byte boundary;
        // take them off the count before storing them.
        rsb         ip, ip, #8
        sub         r2, r2, ip
        // N = bit 0 of ip (1 byte), C = bit 1 of ip (2 bytes). r0 is only a
        // throwaway destination here; it is set again from r1 at
        // .L_double_word_aligned.
        movs        r0, ip, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3], #1

        // Dst is at least word aligned by this point.
        cmp         ip, #4
        blo         .L_double_word_aligned
        str         r1, [r3], #4
        b           .L_double_word_aligned

.L_set_less_than_16_unknown_align:
        // Set up to 15 bytes.
        vdup.8      d0, r1
        // Shift so that C = bit 3 of the count (8) and N = bit 2 (4).
        movs        ip, r2, lsl #29
        bcc         1f
        vst1.8      {d0}, [r0]!
        // Test N alone, so the branch does not depend on whatever V was
        // left by the cmp at function entry.
1:      bpl         2f
        vst1.32     {d0[0]}, [r0]!
        // N = bit 0 of the count (1 byte), C = bit 1 (2 bytes).
2:      movs        ip, r2, lsl #31
        it          mi
        strbmi      r1, [r0], #1
        itt         cs
        strbcs      r1, [r0], #1
        strbcs      r1, [r0], #1
        ldmfd       sp!, {r0}
        bx          lr
        .fnend
        .size       memset, .-memset