/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif
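/* the pld prefetch distances below are expressed as multiples of the
 * cache-line size selected above */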
/*
 * Optimized memcmp() for Cortex-A9.
 */

        .type memcmp, %function
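/* int memcmp(const void *s1, const void *s2, size_t n);
 * AAPCS calling convention: s1 arrives in r0, s2 in r1, n in r2,
 * and the signed comparison result is returned in r0.
 */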
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]
        /* take care of the case where length is 0 or the buffers are the same */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]
        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        /*
         * Comparing 32 bytes at a time
         */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
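        /* this block is only assembled when NEON is available and the target
         * tolerates unaligned NEON loads (NEON_UNALIGNED_ACCESS) */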
        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
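        /* d0-d3 and d4-d7 each receive 32 bytes, and the interleaved pld keep
         * the post-incremented pointers two cache lines ahead of the loads */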
        /* Start subtracting the values and merge results */

        /* Check if there are any differences among the 32 bytes */

        /* Check if the difference was in the first or last 16 bytes */

        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */

3:      /* fix-up the remaining count */
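        /* once the mismatching 32-byte block has been located and the pointers
         * rewound, r2 is adjusted so the generic code below can find the exact
         * differing byte and produce the signed result */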
        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
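        /* (r0 is both the first argument and the AAPCS return register) */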
        /* align first pointer to word boundary */
        /* align first pointer */
0:      /* here the first pointer is aligned, and we have at least 4 bytes */
        /* see if the pointers are congruent */
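        /* "congruent" = both pointers share the same offset within a word, so
         * after the alignment fix-up above both sides can be read word-aligned;
         * otherwise the non-congruent paths at 5: and 4: below rebuild each
         * word from two shifted reads */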
        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
        subs        r2, r2, #(32 + 4)
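        /* 32 + 4 = one full loop iteration plus the one word of read-ahead
         * described above */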
0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)

        /* finish off 4 bytes at a time */

        /* finish off the remaining bytes */

2:      /* the last 4 bytes are different, restart them */

        /* process the last few bytes */

9:      /* restore registers and return */

10:     /* process less than 12 bytes */
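        /* reached when fewer than the 8+4 = 12 bytes required up front are
         * available */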
5:      /*************** non-congruent case ***************/

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */

        /* align the unaligned pointer */
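        /* each aligned word from one buffer is compared against a word rebuilt
         * from two halfword-shifted reads of the other buffer (the lsr #16 /
         * lsl #16 pairs below) */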
6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        orr         ip, ip, lr, lsl #16
        moveq       ip, lr, lsr #16
        orreq       ip, ip, lr, lsl #16
        moveq       ip, lr, lsr #16
        orreq       ip, ip, lr, lsl #16
        moveq       ip, lr, lsr #16
        orreq       ip, ip, lr, lsl #16
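        /* the eq-conditional mov/orr pairs above only keep rebuilding words
         * while every earlier word compared equal, so they become no-ops once
         * a difference has been found */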
        /* finish off the remaining bytes */

7:      /* fix up the 2 pointers and fallthrough... */
4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
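        /* r5 and r6 are the complementary shift amounts (in bits) used to
         * reassemble each unaligned word as (low >> r5) | (high << r6) in the
         * loop at 6: below */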
        /* align the unaligned pointer */

6:      mov         ip, r7, lsr r5

        orr         ip, ip, r7, lsl r6

        orreq       ip, ip, r7, lsl r6

        sub         r1, r1, r6, lsr #3
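        /* r6 is a shift amount in bits, so r6 >> 3 converts it to the number
         * of bytes by which r1 has run ahead and must be stepped back */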
        ldmfd       sp!, {r5, r6, r7}

        /* finish off the remaining bytes */

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, r6, lsr #3

        ldmfd       sp!, {r5, r6, r7}

        .size memcmp, .-memcmp