1 /* Copyright (c) 2013-2015, Linaro Limited
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the name of the Linaro nor the
12 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
32 /* To test the page crossing code path more thoroughly, compile with
33 -DTEST_PAGE_CROSS - this will force all calls through the slower
34 entry path. This option is not intended for production use. */
36 /* Arguments and results. */
40 /* Locals and temporaries. */
54 .macro def_fn f p2align=0
62 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
63 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
64 can be done in parallel across the entire word. A faster check
65 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
66 false hits for characters 129..255. */
68 #define REP8_01 0x0101010101010101
69 #define REP8_7f 0x7f7f7f7f7f7f7f7f
70 #define REP8_80 0x8080808080808080
72 #ifdef TEST_PAGE_CROSS
73 # define MIN_PAGE_SIZE 15
75 # define MIN_PAGE_SIZE 4096
78 /* Since strings are short on average, we check the first 16 bytes
79 of the string for a NUL character. In order to do an unaligned ldp
80 safely we have to do a page cross check first. If there is a NUL
81 byte we calculate the length from the 2 8-byte words using
82 conditional select to reduce branch mispredictions (it is unlikely
83 strlen will be repeatedly called on strings with the same length).
85 If the string is longer than 16 bytes, we align src so don't need
86 further page cross checks, and process 32 bytes per iteration
87 using the fast NUL check. If we encounter non-ASCII characters,
88 fallback to a second loop using the full NUL check.
90 If the page cross check fails, we read 16 bytes from an aligned
91 address, remove any characters before the string, and continue
92 in the main loop using aligned loads. Since strings crossing a
93 page in the first 16 bytes are rare (probability of
94 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
96 AArch64 systems have a minimum page size of 4k. We don't bother
97 checking for larger page sizes - the cost of setting up the correct
98 page size is just not worth the extra gain from a small reduction in
99 the cases taking the slow path. Note that we only care about
100 whether the first fetch, which may be misaligned, crosses a page
103 def_fn strlen p2align=6
104 and tmp1, srcin, MIN_PAGE_SIZE - 1
105 mov zeroones, REP8_01
106 cmp tmp1, MIN_PAGE_SIZE - 16
108 ldp data1, data2, [srcin]
110 /* For big-endian, carry propagation (if the final byte in the
111 string is 0x01) means we cannot use has_nul1/2 directly.
112 Since we expect strings to be small and early-exit,
113 byte-swap the data now so has_null1/2 will be correct. */
117 sub tmp1, data1, zeroones
118 orr tmp2, data1, REP8_7f
119 sub tmp3, data2, zeroones
120 orr tmp4, data2, REP8_7f
121 bics has_nul1, tmp1, tmp2
122 bic has_nul2, tmp3, tmp4
123 ccmp has_nul2, 0, 0, eq
124 beq L(main_loop_entry)
126 /* Enter with C = has_nul1 == 0. */
127 csel has_nul1, has_nul1, has_nul2, cc
129 rev has_nul1, has_nul1
131 csel len, xzr, len, cc
132 add len, len, tmp1, lsr 3
135 /* The inner loop processes 32 bytes per iteration and uses the fast
136 NUL check. If we encounter non-ASCII characters, use a second
137 loop with the accurate NUL check. */
143 ldp data1, data2, [src, 32]!
145 sub tmp1, data1, zeroones
146 sub tmp3, data2, zeroones
148 tst tmp2, zeroones, lsl 7
150 ldp data1, data2, [src, 16]
151 sub tmp1, data1, zeroones
152 sub tmp3, data2, zeroones
154 tst tmp2, zeroones, lsl 7
158 /* The fast check failed, so do the slower, accurate NUL check. */
159 orr tmp2, data1, REP8_7f
160 orr tmp4, data2, REP8_7f
161 bics has_nul1, tmp1, tmp2
162 bic has_nul2, tmp3, tmp4
163 ccmp has_nul2, 0, 0, eq
166 /* Enter with C = has_nul1 == 0. */
169 /* For big-endian, carry propagation (if the final byte in the
170 string is 0x01) means we cannot use has_nul1/2 directly. The
171 easiest way to get the correct byte is to byte-swap the data
172 and calculate the syndrome a second time. */
173 csel data1, data1, data2, cc
175 sub tmp1, data1, zeroones
176 orr tmp2, data1, REP8_7f
177 bic has_nul1, tmp1, tmp2
179 csel has_nul1, has_nul1, has_nul2, cc
182 rev has_nul1, has_nul1
185 csel len, len, tmp2, cc
186 add len, len, tmp1, lsr 3
190 ldp data1, data2, [src, 16]!
191 sub tmp1, data1, zeroones
192 orr tmp2, data1, REP8_7f
193 sub tmp3, data2, zeroones
194 orr tmp4, data2, REP8_7f
195 bics has_nul1, tmp1, tmp2
196 bic has_nul2, tmp3, tmp4
197 ccmp has_nul2, 0, 0, eq
199 ldp data1, data2, [src, 16]!
200 sub tmp1, data1, zeroones
201 orr tmp2, data1, REP8_7f
202 sub tmp3, data2, zeroones
203 orr tmp4, data2, REP8_7f
204 bics has_nul1, tmp1, tmp2
205 bic has_nul2, tmp3, tmp4
206 ccmp has_nul2, 0, 0, eq
210 /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
211 srcin to 0x7f, so we ignore any NUL bytes before the string.
212 Then continue in the aligned loop. */
215 ldp data1, data2, [src]
219 /* Big-endian. Early bytes are at MSB. */
220 lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
222 /* Little-endian. Early bytes are at LSB. */
223 lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
225 orr tmp1, tmp1, REP8_80
226 orn data1, data1, tmp1
227 orn tmp2, data2, tmp1
229 csel data1, data1, tmp4, eq
230 csel data2, data2, tmp2, eq
231 b L(page_cross_entry)
233 .size strlen, . - strlen