2 * Copyright (c) 2012-2014 ARM Ltd
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the company may not be used to endorse or promote
14 * products derived from this software without specific prior written
17 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 /* Implementation of strcmp for ARMv7 when DSP instructions are
30 available. Use ldrd to support wider loads, provided the data
31 is sufficiently aligned. Use saturating arithmetic to optimize
35 STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
36 byte in the string. If comparing completely random strings
37 the pre-check will save time, since there is a very high
38 probability of a mismatch in the first character: we save
39 significant overhead if this is the common case. However,
40 if strings are likely to be identical (eg because we're
41 verifying a hit in a hash table), then this check is largely
44 #define STRCMP_NO_PRECHECK 0
46 /* This version uses Thumb-2 code. */
50 #ifdef __ARM_BIG_ENDIAN
/* NOTE(review): this chunk is an elided extraction -- interior lines
   are missing (e.g. the S2LO/S2HI shift-direction macros used by the
   code below are not visible here).  Confirm against the full file.
   The defines give, per endianness, how to pick string bytes out of a
   32-bit word loaded from memory: BYTEn_OFFSET is the rotate needed
   to bring the n-th byte (in string order) into the low 8 bits; LSB
   masks the first string byte of the word and MSB the last.  */
54 #define MSB 0x000000ff
55 #define LSB 0xff000000
56 #define BYTE0_OFFSET 24
57 #define BYTE1_OFFSET 16
58 #define BYTE2_OFFSET 8
59 #define BYTE3_OFFSET 0
60 #else /* not __ARM_BIG_ENDIAN */
64 #define BYTE0_OFFSET 0
65 #define BYTE1_OFFSET 8
66 #define BYTE2_OFFSET 16
67 #define BYTE3_OFFSET 24
68 #define MSB 0xff000000
69 #define LSB 0x000000ff
70 #endif /* not __ARM_BIG_ENDIAN */
72 .macro def_fn f p2align=0
/* NOTE(review): the body of def_fn (and its .endm) is not visible in
   this chunk; it presumably emits the .global/.type/label boilerplate
   for function \f -- confirm against the full file.  */
80 /* Parameters and result. */
83 #define result r0 /* Overlaps src1. */
85 /* Internal variables. */
/* NOTE(review): the register aliases for src1/src2, tmp1/tmp2,
   data1/data2, const_m1 and the data1a/b, data2a/b pairs referenced
   throughout the code below are elided from this chunk.  */
90 /* Additional internal variables for 64-bit aligned data. */
95 #define syndrome_a tmp1
96 #define syndrome_b tmp2
98 /* Additional internal variables for 32-bit aligned data. */
101 #define syndrome tmp2
104 /* Macro to compute and return the result value for word-aligned
106 .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
107 #ifdef __ARM_BIG_ENDIAN
108 /* If data1 contains a zero byte, then syndrome will contain a 1 in
109 bit 7 of that byte. Otherwise, the highest set bit in the
110 syndrome will highlight the first different bit. It is therefore
111 sufficient to extract the eight bits starting with the syndrome
/* NOTE(review): the clz/shift sequence that moves the first
   differing/NUL byte of \d1 and \d2 into the top bits of r0/r1 is
   elided here; only the register restores and the final subtract
   survive in this chunk.  */
116 ldrd r6, r7, [sp, #8]
123 ldrd r4, r5, [sp], #16
/* Result = (first differing byte of \d1) - (same byte of \d2), both
   presumed shifted to bits 24-31 by the elided code above.  */
126 sub result, result, r1, lsr #24
129 /* To use the big-endian trick we'd have to reverse all three words.
130 that's slower than this approach. */
/* Little-endian path: elided code rotates the first differing/NUL
   byte of \d1 and \d2 into the low 8 bits before the subtract.  */
137 ldrd r6, r7, [sp, #8]
142 and result, \d1, #255
144 ldrd r4, r5, [sp], #16
147 sub result, result, r1
/* NOTE(review): the matching .endm is not visible in this chunk.  */
156 #if STRCMP_NO_PRECHECK == 0
/* (elided) quick pre-check of the first byte pair -- see the
   rationale in the header comment.  */
163 #if STRCMP_NO_PRECHECK == 0
/* Save the callee-saved registers this routine uses; the CFI note
   records the 16-byte frame for unwinders.  */
172 strd r4, r5, [sp, #-16]!
173 .cfi_def_cfa_offset 16
177 strd r6, r7, [sp, #8]
/* r2 appears to hold the (elided) combined alignment test of
   src1|src2: zero means both pointers are mutually 8-byte aligned and
   the ldrd loop can be entered directly.  NOTE(review): confirm
   against the elided setup instructions.  */
182 cbz r2, .Lloop_aligned8
189 /* Deal with mutual misalignment by aligning downwards and then
190 masking off the unwanted loaded data to prevent a difference. */
195 lsl tmp2, tmp2, #3 /* Bytes -> bits. */
196 ldrd data1a, data1b, [src1], #16
198 ldrd data2a, data2b, [src2], #16
199 /* In thumb code we can't use MVN with a register shift, but
201 S2HI tmp1, const_m1, tmp2
/* ORN forces the bytes that precede the true string start (junk
   loaded by the rounded-down ldrd) to all-ones in BOTH words, so they
   can neither miscompare nor look like a NUL terminator.  */
202 orn data1a, data1a, tmp1
203 orn data2a, data2a, tmp1
204 beq .Lstart_realigned8
205 orn data1b, data1b, tmp1
207 orn data2b, data2b, tmp1
211 /* Unwind the inner loop by a factor of 2, giving 16 bytes per
213 .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
214 .p2align 2 /* Always word aligned. */
216 ldrd data1a, data1b, [src1], #16
217 ldrd data2a, data2b, [src2], #16
/* Syndrome technique: UADD8 of a byte with 0xff carries out (GE=1)
   iff that byte is non-zero, so APSR.GE marks the non-NUL bytes of
   data1.  SEL then keeps the EOR difference where GE=1 and inserts
   0xff where GE=0 -- the syndrome is non-zero iff the words differ or
   data1 contains a NUL.  The UADD8 destination is a don't-care; only
   the GE flags are consumed.  */
219 uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
220 eor syndrome_a, data1a, data2a
221 sel syndrome_a, syndrome_a, const_m1
222 cbnz syndrome_a, .Ldiff_in_a
223 uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
224 eor syndrome_b, data1b, data2b
225 sel syndrome_b, syndrome_b, const_m1
226 cbnz syndrome_b, .Ldiff_in_b
/* Second copy of the unrolled body: the pointers were advanced by 16
   above, so re-read the second pair of dwords at offset -8.  */
228 ldrd data1a, data1b, [src1, #-8]
229 ldrd data2a, data2b, [src2, #-8]
230 uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
231 eor syndrome_a, data1a, data2a
232 sel syndrome_a, syndrome_a, const_m1
233 uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
234 eor syndrome_b, data1b, data2b
235 sel syndrome_b, syndrome_b, const_m1
236 /* Can't use CBZ for backwards branch. */
237 orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
/* (elided) conditional branch back to .Lloop_aligned8 when both
   syndromes are zero.  */
241 cbnz syndrome_a, .Ldiff_in_a
244 strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
248 strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
257 /* Unrolled by a factor of 2, to reduce the number of post-increment
/* Word-at-a-time loop for mutually 4-byte (but not 8-byte) aligned
   input; same UADD8/SEL syndrome trick as the 8-byte loop.  */
260 ldr data1, [src1], #8
261 ldr data2, [src2], #8
263 uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
264 eor syndrome, data1, data2
265 sel syndrome, syndrome, const_m1
266 cbnz syndrome, .Laligned4_done
/* Second half of the 2x unroll: pointers already advanced by 8, so
   re-read the second word pair at offset -4.  */
267 ldr data1, [src1, #-4]
268 ldr data2, [src2, #-4]
269 uadd8 syndrome, data1, const_m1
270 eor syndrome, data1, data2
271 sel syndrome, syndrome, const_m1
/* (elided) loop-back branch when the syndrome is zero.  */
276 strcmp_epilogue_aligned syndrome, data1, data2, 0
280 /* Deal with mutual misalignment by aligning downwards and then
281 masking off the unwanted loaded data to prevent a difference. */
282 lsl tmp1, tmp1, #3 /* Bytes -> bits. */
284 ldr data1, [src1], #8
286 ldr data2, [src2], #8
288 /* In thumb code we can't use MVN with a register shift, but
290 S2HI tmp1, const_m1, tmp1
/* Force the bytes preceding the true string start to all-ones in both
   words so they can neither miscompare nor look like a terminator.  */
291 orn data1, data1, tmp1
292 orn data2, data2, tmp1
/* src2 is not word aligned: step byte-by-byte (bytes 1..3 of the
   first src1 word) until src2 reaches a word boundary.  uxtb with
   ror #BYTEn_OFFSET extracts string byte n from the src1 word; each
   step exits on either a difference (bne) or a NUL in src2 (cbz,
   taken when the bytes were equal, i.e. both are NUL).  */
301 ldr data1, [src1], #4
305 #if STRCMP_NO_PRECHECK == 1
306 ldrb data2, [src2, #1]
307 uxtb tmp1, data1, ror #BYTE1_OFFSET
308 subs tmp1, tmp1, data2
309 bne .Lmisaligned_exit
310 cbz data2, .Lmisaligned_exit
313 ldrb data2, [src2, #2]
314 uxtb tmp1, data1, ror #BYTE2_OFFSET
315 subs tmp1, tmp1, data2
316 bne .Lmisaligned_exit
317 cbz data2, .Lmisaligned_exit
320 ldrb data2, [src2, #3]
321 uxtb tmp1, data1, ror #BYTE3_OFFSET
322 subs tmp1, tmp1, data2
323 bne .Lmisaligned_exit
325 cbnz data2, .Lsrc1_aligned
326 #else /* STRCMP_NO_PRECHECK */
327 /* If we've done the pre-check, then we don't need to check the
328 first byte again here. */
329 ldrb data2, [src2, #2]
330 uxtb tmp1, data1, ror #BYTE2_OFFSET
331 subs tmp1, tmp1, data2
332 bne .Lmisaligned_exit
333 cbz data2, .Lmisaligned_exit
336 ldrb data2, [src2, #3]
337 uxtb tmp1, data1, ror #BYTE3_OFFSET
338 subs tmp1, tmp1, data2
339 bne .Lmisaligned_exit
340 cbnz data2, .Laligned_m1
/* NOTE(review): interior label/branch lines (e.g. for the byte-1 case
   of this branch, and .Lmisaligned_exit itself) are elided from this
   chunk -- confirm against the full file.  */
350 #if STRCMP_NO_PRECHECK == 0
/* (elided) exit handling for a NUL/difference found by the
   pre-check.  */
356 /* src1 is word aligned, but src2 has no common alignment
/* Dispatch on src2's offset within its word (1, 2 or 3 bytes past a
   boundary).  lsls #31 moves src2 bit 1 into C and tests bit 0 for Z,
   so one flag-setting shift classifies all three cases.  */
358 ldr data1, [src1], #4
359 lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
362 ldr data2, [src2], #4
363 bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
364 bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
366 /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
/* Overlap3: src2 is 1 byte past a word boundary.  Compare the first 3
   string bytes of the src1 word with the 3 valid bytes of the src2
   word (shifted into place by S2LO #8), then the remaining byte
   against the next src2 word.  */
368 bic tmp1, data1, #MSB
369 uadd8 syndrome, data1, const_m1
370 eors syndrome, tmp1, data2, S2LO #8
371 sel syndrome, syndrome, const_m1
/* (elided) syndrome test / branch out of the loop.  */
374 ldr data2, [src2], #4
375 eor tmp1, tmp1, data1
376 cmp tmp1, data2, S2HI #24
378 ldr data1, [src1], #4
/* (elided) loop-back branch to the overlap3 loop head.  */
381 S2LO data2, data2, #8
/* A NUL/difference was found; decide which word it is in.  Clearing
   MSB discards a mismatch that is only in the last (not-yet-compared)
   byte of data1.  */
385 bics syndrome, syndrome, #MSB
386 bne .Lstrcmp_done_equal
388 /* We can only get here if the MSB of data1 contains 0, so
389 fast-path the exit. */
392 ldrd r4, r5, [sp], #16
395 /* R6/7 Not used in this sequence. */
/* (elided) .cfi restores and return.  */
403 S2LO data1, data1, #24
404 and data2, data2, #LSB
/* (elided) branch to the final byte-difference computation.  */
407 .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
/* Overlap2: src2 is 2 bytes past a word boundary; same loop structure
   as overlap3 but with half-word (16-bit) shifts and masks.  */
409 and tmp1, data1, const_m1, S2LO #16
410 uadd8 syndrome, data1, const_m1
411 eors syndrome, tmp1, data2, S2LO #16
412 sel syndrome, syndrome, const_m1
415 ldr data2, [src2], #4
416 eor tmp1, tmp1, data1
417 cmp tmp1, data2, S2HI #16
419 ldr data1, [src1], #4
422 S2LO data2, data2, #16
425 ands syndrome, syndrome, const_m1, S2LO #16
426 bne .Lstrcmp_done_equal
429 S2LO data1, data1, #16
430 #ifdef __ARM_BIG_ENDIAN
431 lsl data2, data2, #16
436 S2LO data1, data1, #16
437 and data2, data2, const_m1, S2LO #16
440 .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
/* Overlap1: src2 is 3 bytes past a word boundary; one valid byte in
   the current src2 word, three in the next.  */
442 and tmp1, data1, #LSB
443 uadd8 syndrome, data1, const_m1
444 eors syndrome, tmp1, data2, S2LO #24
445 sel syndrome, syndrome, const_m1
448 ldr data2, [src2], #4
449 eor tmp1, tmp1, data1
450 cmp tmp1, data2, S2HI #8
452 ldr data1, [src1], #4
455 S2LO data2, data2, #24
459 bne .Lstrcmp_done_equal
462 S2LO data1, data1, #8
463 bic data2, data2, #MSB
/* Equal-strings exit: restore saved registers; the (elided)
   instructions around here set result to 0.  NOTE(review): confirm
   against the full file.  */
469 ldrd r4, r5, [sp], #16
472 /* R6/7 not used in this sequence. */
479 #ifndef __ARM_BIG_ENDIAN
/* (elided) byte-reversal of data1/data2 so the tail below can treat
   both words as big-endian -- inferred from the comment on the next
   line; confirm.  */
482 /* Now everything looks big-endian... */
/* Tail: rebuild the syndrome for the final word pair, locate the
   first differing/NUL byte (locating clz elided; tmp1 then holds its
   bit position), shift both words so that byte occupies bits 24-31,
   and return the unsigned byte difference.  */
484 uadd8 tmp1, data1, const_m1
485 eor tmp1, data1, data2
486 sel syndrome, tmp1, const_m1
488 lsl data1, data1, tmp1
489 lsl data2, data2, tmp1
490 lsr result, data1, #24
491 ldrd r4, r5, [sp], #16
494 /* R6/7 not used in this sequence. */
497 sub result, result, data2, lsr #24
500 .size strcmp, . - .Lstrcmp_start_addr