reference/bionic-a9/memcmp.S

   1 /*
   2  * Copyright (C) 2008 The Android Open Source Project
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *  * Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  *  * Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in
  12  *    the documentation and/or other materials provided with the
  13  *    distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 #ifdef HAVE_32_BYTE_CACHE_LINE
  30 #define CACHE_LINE_SIZE     32      /* stride for the pld prefetch offsets in memcmp */
  31 #else
  32 #define CACHE_LINE_SIZE     64      /* used when HAVE_32_BYTE_CACHE_LINE is not defined */
  33 #endif /* HAVE_32_BYTE_CACHE_LINE */
  34
  35 /*
  36  * Optimized memcmp() for Cortex-A9.
  37  * In: r0 = first buffer, r1 = second buffer, r2 = byte count.  Out: r0 = 0 if equal, otherwise (first-buffer byte - second-buffer byte) at the first mismatch. */
  38         .text
  39         .globl memcmp
  40         .type memcmp,%function
  41 memcmp:                                             /* scratch: r1-r3, ip (plus q0-q3 in the NEON path); r4-r7 and lr are saved and restored */
  42         .fnstart
  43         pld         [r0, #(CACHE_LINE_SIZE * 0)]
  44         pld         [r0, #(CACHE_LINE_SIZE * 1)]
  45
  46         /* identical pointers compare equal whatever the length; a zero length is handled at label 10 */
  47         cmp         r0, r1
  48         moveq       r0, #0
  49         bxeq        lr
  50
  51         pld         [r1, #(CACHE_LINE_SIZE * 0)]
  52         pld         [r1, #(CACHE_LINE_SIZE * 1)]
  53
  54         /* make sure we have at least 8+4 bytes, this simplifies things below
  55          * and avoids some overhead for small blocks
  56          */
  57         cmp        r2, #(8+4)
  58         bmi        10f                              /* fewer than 12 bytes: plain byte loop */
  59 /*
  60  * Neon optimization
  61  * Comparing 32 bytes at a time
  62  */
  63 #if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
  64         subs        r2, r2, #32                     /* r2 is kept biased by -32 while in the NEON loop */
  65         blo         3f                              /* fewer than 32 bytes: skip the NEON loop */
  66
  67         /* preload all the cache lines we need. */
  68         pld         [r0, #(CACHE_LINE_SIZE * 2)]
  69         pld         [r1, #(CACHE_LINE_SIZE * 2)]
  70
  71 1:      /* The main loop compares 32 bytes at a time */
  72         vld1.8      {d0 - d3}, [r0]!
  73         pld         [r0, #(CACHE_LINE_SIZE * 2)]
  74         vld1.8      {d4 - d7}, [r1]!
  75         pld         [r1, #(CACHE_LINE_SIZE * 2)]
  76
  77         /* Start subtracting the values and merge results */
  78         vsub.i8     q0, q2                          /* q0 = per-byte difference of bytes 0-15 (zero where equal) */
  79         vsub.i8     q1, q3                          /* q1 = per-byte difference of bytes 16-31 */
  80         vorr        q2, q0, q1
  81         vorr        d4, d5
  82         vmov        r3, ip, d4
  83         /* Check if there are any differences among the 32 bytes */
  84         orrs        r3, ip
  85         bne         2f
  86         subs        r2, r2, #32
  87         bhs         1b
  88         b           3f
  89 2:
  90         /* Check if the difference was in the first or last 16 bytes */
  91         sub         r0, #32
  92         vorr        d0, d1                          /* q0 still holds the differences of the first 16 bytes */
  93         sub         r1, #32
  94         vmov        r3, ip, d0
  95         orrs        r3, ip
  96         /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
  97         ittt        eq
  98         subeq       r2, #16
  99         addeq       r0, #16
 100         addeq       r1, #16
 101
 102 3:      /* fix-up the remaining count */
 103         add         r2, r2, #32                     /* undo the -32 bias: r2 = bytes still to compare */
 104
 105         cmp        r2, #(8+4)
 106         bmi        10f
 107 #endif
 108
 109         .save {r4, lr}
 110         /* save registers */
 111         stmfd       sp!, {r4, lr}
 112
 113         /* since r0 holds the result, move the first source
 114          * pointer somewhere else
 115          */
 116          mov        r4, r0
 117
 118         /* align first pointer to word boundary
 119          * offset = -src & 3
 120          */
 121         rsb         r3, r4, #0
 122         ands        r3, r3, #3
 123         beq         0f
 124
 125         /* align first pointer  */
 126         sub         r2, r2, r3                      /* r2 >= 12 here and r3 <= 3, so at least 9 bytes remain afterwards */
 127 1:      ldrb        r0, [r4], #1
 128         ldrb        ip, [r1], #1
 129         subs        r0, r0, ip
 130         bne         9f                              /* mismatch: r0 already holds the byte difference */
 131         subs        r3, r3, #1
 132         bne         1b
 133
 134
 135 0:      /* here the first pointer is aligned, and we have at least 4 bytes
 136          * to process.
 137          */
 138
 139         /* see if the pointers are congruent */
 140         eor         r0, r4, r1
 141         ands        r0, r0, #3
 142         bne         5f
 143
 144         /* congruent case, 32 bytes per iteration
 145          * We need to make sure there are at least 32+4 bytes left
 146          * because we effectively read ahead one word, and we could
 147          * read past the buffer (and segfault) if we're not careful.
 148          */
 149
 150         ldr         ip, [r1]                        /* ip = look-ahead word from the second buffer */
 151         subs        r2, r2, #(32 + 4)
 152         bmi         1f
 153
 154 0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
 155         pld         [r1, #(CACHE_LINE_SIZE * 2)]
 156         ldr         r0, [r4], #4
 157         ldr         lr, [r1, #4]!                   /* second-buffer words alternate between ip and lr, loaded one word ahead */
 158         eors        r0, r0, ip
 159         ldreq       r0, [r4], #4                    /* every later step runs only while all earlier words were equal */
 160         ldreq       ip, [r1, #4]!
 161         eoreqs      r0, r0, lr
 162         ldreq       r0, [r4], #4
 163         ldreq       lr, [r1, #4]!
 164         eoreqs      r0, r0, ip
 165         ldreq       r0, [r4], #4
 166         ldreq       ip, [r1, #4]!
 167         eoreqs      r0, r0, lr
 168         ldreq       r0, [r4], #4
 169         ldreq       lr, [r1, #4]!
 170         eoreqs      r0, r0, ip
 171         ldreq       r0, [r4], #4
 172         ldreq       ip, [r1, #4]!
 173         eoreqs      r0, r0, lr
 174         ldreq       r0, [r4], #4
 175         ldreq       lr, [r1, #4]!
 176         eoreqs      r0, r0, ip
 177         ldreq       r0, [r4], #4
 178         ldreq       ip, [r1, #4]!
 179         eoreqs      r0, r0, lr
 180         bne         2f                              /* r4 and r1 both sit 4 bytes past the word that differed */
 181         subs        r2, r2, #32
 182         bhs         0b
 183
 184         /* do we have at least 4 bytes left? */
 185 1:      adds        r2, r2, #(32 - 4 + 4)           /* r2 was (remaining - 36); now r2 = remaining - 4 */
 186         bmi         4f
 187
 188         /* finish off 4 bytes at a time */
 189 3:      ldr         r0, [r4], #4
 190         ldr         ip, [r1], #4
 191         eors        r0, r0, ip
 192         bne         2f
 193         subs        r2, r2, #4
 194         bhs         3b
 195
 196         /* are we done? */
 197 4:      adds        r2, r2, #4                      /* undo the -4 bias: r2 = remaining bytes */
 198         moveq       r0, #0
 199         beq         9f
 200
 201         /* finish off the remaining bytes */
 202         b           8f
 203
 204 2:      /* the last word compared differs: back up 4 bytes and let the byte loop find the differing byte */
 205         sub         r4, r4, #4
 206         sub         r1, r1, #4
 207         mov         r2, #4                          /* the byte loop is certain to hit the mismatch within these 4 bytes */
 208
 209         /* process the last few bytes (shared tail; entered with r2 > 0) */
 210 8:      ldrb        r0, [r4], #1
 211         ldrb        ip, [r1], #1
 212         // stall
 213         subs        r0, r0, ip
 214         bne         9f
 215         subs        r2, r2, #1
 216         bne         8b                              /* on fall-through r0 is 0 from the last subs into r0 */
 217
 218 9:      /* restore registers and return */
 219         ldmfd       sp!, {r4, lr}
 220         bx          lr
 221
 222 10:     /* process less than 12 bytes (reached before r4/lr are pushed, so it returns with a plain bx lr) */
 223         cmp         r2, #0
 224         moveq       r0, #0
 225         bxeq        lr
 226         mov         r3, r0                          /* r3 walks the first buffer; r0 receives the result */
 227 11:
 228         ldrb        r0, [r3], #1
 229         ldrb        ip, [r1], #1
 230         subs        r0, ip
 231         bxne        lr
 232         subs        r2, r2, #1
 233         bne         11b
 234         bx          lr
 235
 236 5:      /*************** non-congruent case ***************/
 237         and         r0, r1, #3                      /* r0 = misalignment of r1: 1, 2 or 3 (r4 is word aligned here) */
 238         cmp         r0, #2
 239         bne         4f
 240
 241         /* here, offset is 2 (16-bits aligned, special cased) */
 242
 243         /* make sure we have at least 16 bytes to process */
 244         subs        r2, r2, #16
 245         addmi       r2, r2, #16
 246         bmi         8b                              /* too short: use the shared byte loop at label 8 above */
 247
 248         /* align the unaligned pointer */
 249         bic         r1, r1, #3
 250         ldr         lr, [r1], #4                    /* lr = aligned word that contains the first two bytes of the r1 data */
 251
 252 6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
 253         pld         [r4, #(CACHE_LINE_SIZE * 2)]
 254         mov         ip, lr, lsr #16                 /* splice two aligned words into the unaligned word; NOTE(review): shift directions look little-endian-only - confirm */
 255         ldr         lr, [r1], #4
 256         ldr         r0, [r4], #4
 257         orr         ip, ip, lr, lsl #16
 258         eors        r0, r0, ip
 259         moveq       ip, lr, lsr #16
 260         ldreq       lr, [r1], #4
 261         ldreq       r0, [r4], #4
 262         orreq       ip, ip, lr, lsl #16
 263         eoreqs      r0, r0, ip
 264         moveq       ip, lr, lsr #16
 265         ldreq       lr, [r1], #4
 266         ldreq       r0, [r4], #4
 267         orreq       ip, ip, lr, lsl #16
 268         eoreqs      r0, r0, ip
 269         moveq       ip, lr, lsr #16
 270         ldreq       lr, [r1], #4
 271         ldreq       r0, [r4], #4
 272         orreq       ip, ip, lr, lsl #16
 273         eoreqs      r0, r0, ip
 274         bne         7f
 275         subs        r2, r2, #16
 276         bhs         6b
 277         sub         r1, r1, #2                      /* the aligned load pointer runs 2 bytes ahead of the next uncompared byte */
 278         /* are we done? */
 279         adds        r2, r2, #16
 280         moveq       r0, #0
 281         beq         9b
 282         /* finish off the remaining bytes */
 283         b           8b
 284
 285 7:      /* fix up the 2 pointers and re-compare the last word in the byte loop */
 286         sub         r1, r1, #(4+2)                  /* 4 for the word just consumed, 2 for the load pointer's lead */
 287         sub         r4, r4, #4
 288         mov         r2, #4
 289         b           8b
 290
 291
 292 4:      /*************** offset is 1 or 3 (less optimized) ***************/
 293
 294                 stmfd           sp!, {r5, r6, r7}       /* pushed on top of {r4, lr}; both exits below pop them before branching to 8b/9b. NOTE(review): no .save directive covers this push */
 295
 296         // r5 = right-shift amount in bits (8 * offset)
 297         // r6 = left-shift amount in bits (32 - r5)
 298         // r7 = most recently loaded aligned word from r1
 299
 300         mov         r5, r0, lsl #3              /* r5 = right shift */
 301         rsb         r6, r5, #32         /* r6 = left shift */
 302
 303         /* align the unaligned pointer */
 304         bic         r1, r1, #3
 305         ldr         r7, [r1], #4
 306         sub         r2, r2, #8                      /* bias by -8; r2 >= 9 here (>= 12 on entry minus up to 3 alignment bytes), so one 8-byte pass is always in range */
 307
 308 6:      mov         ip, r7, lsr r5
 309         ldr         r7, [r1], #4
 310         ldr         r0, [r4], #4
 311         orr         ip, ip, r7, lsl r6
 312         eors        r0, r0, ip
 313         moveq       ip, r7, lsr r5
 314         ldreq       r7, [r1], #4
 315         ldreq       r0, [r4], #4
 316         orreq       ip, ip, r7, lsl r6
 317         eoreqs      r0, r0, ip
 318         bne         7f
 319         subs        r2, r2, #8
 320         bhs         6b
 321
 322         sub         r1, r1, r6, lsr #3              /* r6/8 = 4 - offset: turn the aligned load pointer back into the true byte pointer */
 323                 ldmfd       sp!, {r5, r6, r7}
 324
 325         /* are we done? */
 326         adds        r2, r2, #8
 327         moveq       r0, #0
 328         beq         9b
 329
 330         /* finish off the remaining bytes */
 331         b           8b
 332
 333 7:      /* fix up the 2 pointers and re-compare the last word in the byte loop */
 334         sub         r1, r1, #4
 335         sub         r1, r1, r6, lsr #3
 336         sub         r4, r4, #4
 337         mov         r2, #4
 338                 ldmfd           sp!, {r5, r6, r7}
 339         b           8b
 340         .fnend
 341         .size memcmp, .-memcmp