contrib/cortex-strings/src/aarch64/strncmp.S

   1 /* Copyright (c) 2013, Linaro Limited
   2    All rights reserved.
   3
   4    Redistribution and use in source and binary forms, with or without
   5    modification, are permitted provided that the following conditions are met:
   6        * Redistributions of source code must retain the above copyright
   7          notice, this list of conditions and the following disclaimer.
   8        * Redistributions in binary form must reproduce the above copyright
   9          notice, this list of conditions and the following disclaimer in the
  10          documentation and/or other materials provided with the distribution.
  11        * Neither the name of the Linaro nor the
  12          names of its contributors may be used to endorse or promote products
  13          derived from this software without specific prior written permission.
  14
  15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
  26
  27 /* Assumptions:
  28  *
  29  * ARMv8-a, AArch64
  30  */
  31
  32         .macro def_fn f p2align=0
  33         .text
  34         .p2align \p2align
  35         .global \f
  36         .type \f, %function
  37 \f:
  38         .endm
  39
  40 #define REP8_01 0x0101010101010101
  41 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  42 #define REP8_80 0x8080808080808080
  43
  44 /* Parameters and result.  */
  45 #define src1            x0
  46 #define src2            x1
  47 #define limit           x2
  48 #define result          x0
  49
  50 /* Internal variables.  */
  51 #define data1           x3
  52 #define data1w          w3
  53 #define data2           x4
  54 #define data2w          w4
  55 #define has_nul         x5
  56 #define diff            x6
  57 #define syndrome        x7
  58 #define tmp1            x8
  59 #define tmp2            x9
  60 #define tmp3            x10
  61 #define zeroones        x11
  62 #define pos             x12
  63 #define limit_wd        x13
  64 #define mask            x14
  65 #define endloop         x15
  66
  67         .text
  68         .p2align 6
  69         .rep 7
  70         nop     /* Pad so that the loop below fits a cache line.  */
  71         .endr
  72 def_fn strncmp
  73         cbz     limit, .Lret0
  74         eor     tmp1, src1, src2
  75         mov     zeroones, #REP8_01
  76         tst     tmp1, #7
  77         b.ne    .Lmisaligned8
  78         ands    tmp1, src1, #7
  79         b.ne    .Lmutual_align
  80         /* Calculate the number of full and partial words -1.  */
  81         sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
  82         lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
  83
  84         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  85            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  86            can be done in parallel across the entire word.  */
  87         /* Start of performance-critical section  -- one 64B cache line.  */
  88 .Lloop_aligned:
  89         ldr     data1, [src1], #8
  90         ldr     data2, [src2], #8
  91 .Lstart_realigned:
  92         subs    limit_wd, limit_wd, #1
  93         sub     tmp1, data1, zeroones
  94         orr     tmp2, data1, #REP8_7f
  95         eor     diff, data1, data2      /* Non-zero if differences found.  */
  96         csinv   endloop, diff, xzr, pl  /* Last Dword or differences.  */
  97         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  98         ccmp    endloop, #0, #0, eq
  99         b.eq    .Lloop_aligned
 100         /* End of performance-critical section  -- one 64B cache line.  */
 101
 102         /* Not reached the limit, must have found the end or a diff.  */
 103         tbz     limit_wd, #63, .Lnot_limit
 104
 105         /* Limit % 8 == 0 => all bytes significant.  */
 106         ands    limit, limit, #7
 107         b.eq    .Lnot_limit
 108
 109         lsl     limit, limit, #3        /* Bits -> bytes.  */
 110         mov     mask, #~0
 111 #ifdef __AARCH64EB__
 112         lsr     mask, mask, limit
 113 #else
 114         lsl     mask, mask, limit
 115 #endif
 116         bic     data1, data1, mask
 117         bic     data2, data2, mask
 118
 119         /* Make sure that the NUL byte is marked in the syndrome.  */
 120         orr     has_nul, has_nul, mask
 121
 122 .Lnot_limit:
 123         orr     syndrome, diff, has_nul
 124
 125 #ifndef __AARCH64EB__
 126         rev     syndrome, syndrome
 127         rev     data1, data1
 128         /* The MS-non-zero bit of the syndrome marks either the first bit
 129            that is different, or the top bit of the first zero byte.
 130            Shifting left now will bring the critical information into the
 131            top bits.  */
 132         clz     pos, syndrome
 133         rev     data2, data2
 134         lsl     data1, data1, pos
 135         lsl     data2, data2, pos
 136         /* But we need to zero-extend (char is unsigned) the value and then
 137            perform a signed 32-bit subtraction.  */
 138         lsr     data1, data1, #56
 139         sub     result, data1, data2, lsr #56
 140         ret
 141 #else
 142         /* For big-endian we cannot use the trick with the syndrome value
 143            as carry-propagation can corrupt the upper bits if the trailing
 144            bytes in the string contain 0x01.  */
 145         /* However, if there is no NUL byte in the dword, we can generate
 146            the result directly.  We can't just subtract the bytes as the
 147            MSB might be significant.  */
 148         cbnz    has_nul, 1f
 149         cmp     data1, data2
 150         cset    result, ne
 151         cneg    result, result, lo
 152         ret
 153 1:
 154         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 155         rev     tmp3, data1
 156         sub     tmp1, tmp3, zeroones
 157         orr     tmp2, tmp3, #REP8_7f
 158         bic     has_nul, tmp1, tmp2
 159         rev     has_nul, has_nul
 160         orr     syndrome, diff, has_nul
 161         clz     pos, syndrome
 162         /* The MS-non-zero bit of the syndrome marks either the first bit
 163            that is different, or the top bit of the first zero byte.
 164            Shifting left now will bring the critical information into the
 165            top bits.  */
 166         lsl     data1, data1, pos
 167         lsl     data2, data2, pos
 168         /* But we need to zero-extend (char is unsigned) the value and then
 169            perform a signed 32-bit subtraction.  */
 170         lsr     data1, data1, #56
 171         sub     result, data1, data2, lsr #56
 172         ret
 173 #endif
 174
 175 .Lmutual_align:
 176         /* Sources are mutually aligned, but are not currently at an
 177            alignment boundary.  Round down the addresses and then mask off
 178            the bytes that precede the start point.
 179            We also need to adjust the limit calculations, but without
 180            overflowing if the limit is near ULONG_MAX.  */
 181         bic     src1, src1, #7
 182         bic     src2, src2, #7
 183         ldr     data1, [src1], #8
 184         neg     tmp3, tmp1, lsl #3      /* 64 - bits(bytes beyond align). */
 185         ldr     data2, [src2], #8
 186         mov     tmp2, #~0
 187         sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
 188 #ifdef __AARCH64EB__
 189         /* Big-endian.  Early bytes are at MSB.  */
 190         lsl     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63).  */
 191 #else
 192         /* Little-endian.  Early bytes are at LSB.  */
 193         lsr     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63).  */
 194 #endif
 195         and     tmp3, limit_wd, #7
 196         lsr     limit_wd, limit_wd, #3
 197         /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
 198         add     limit, limit, tmp1
 199         add     tmp3, tmp3, tmp1
 200         orr     data1, data1, tmp2
 201         orr     data2, data2, tmp2
 202         add     limit_wd, limit_wd, tmp3, lsr #3
 203         b       .Lstart_realigned
 204
 205 .Lret0:
 206         mov     result, #0
 207         ret
 208
 209         .p2align 6
 210 .Lmisaligned8:
 211         sub     limit, limit, #1
 212 1:
 213         /* Perhaps we can do better than this.  */
 214         ldrb    data1w, [src1], #1
 215         ldrb    data2w, [src2], #1
 216         subs    limit, limit, #1
 217         ccmp    data1w, #1, #0, cs      /* NZCV = 0b0000.  */
 218         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 219         b.eq    1b
 220         sub     result, data1, data2
 221         ret
 222         .size strncmp, . - strncmp