1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
/*
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
.text
@ .syntax unified
@ .thumb
.fpu neon
.global memset
.type memset, %function
.align 4
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
@ memset(dest, c, n)
memset:
.fnstart
.save {r0}
stmfd sp!, {r0}
vdup.8 d0, r1
vdup.8 d1, r1
vdup.8 d2, r1
vdup.8 d3, r1
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
/* align destination to half cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
beq 0f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
movs ip, r3, lsl #31
strmib r1, [r0], #1
strcsb r1, [r0], #1
strcsb r1, [r0], #1
movs ip, r3, lsl #29
bge 1f
// copies 4 bytes, destination 32-bits aligned
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
1: bcc 2f
// copies 8 bytes, destination 64-bits aligned
vst1.8 {d0}, [r0, :64]!
2:
0:
/* make sure we have at least 64 bytes to copy */
subs r2, r2, #64
blo 2f
1: /* The main loop copies 64 bytes at a time */
subs r2, r2, #64
vst1.8 {d0 - d3}, [r0, :128]!
vst1.8 {d0 - d3}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
add r2, r2, #64
subs r2, r2, #32
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
subs r2, r2, #32
vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
4: /* less than 32 left */
add r2, r2, #32
tst r2, #0x10
beq 5f
// copies 16 bytes, 128-bits aligned
vst1.8 {d0, d1}, [r0, :128]!
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
vst1.8 {d0}, [r0]!
1: bge 2f
vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2: movs ip, r2, lsl #31
strmib r1, [r0], #1
strcsb r1, [r0], #1
strcsb r1, [r0], #1
ldmfd sp!, {r0}
bx lr
.fnend
|