sys/arm64/arm64/strncmp.S

   1 /*
   2  * strncmp - compare two strings
   3  *
   4  * Copyright (c) 2013-2022, Arm Limited.
   5  * SPDX-License-Identifier: MIT
   6  */
   7
   8 /* Assumptions:
   9  *
  10  * ARMv8-a, AArch64.
  11  * MTE compatible.
  12  */
  13
  14 #include <machine/asm.h>
  15
  16 #define L(l) .L ## l
  17
  18 #define REP8_01 0x0101010101010101
  19 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  20
  21 /* Parameters and result.  */
  22 #define src1            x0
  23 #define src2            x1
  24 #define limit           x2
  25 #define result          x0
  26
  27 /* Internal variables.  */
  28 #define data1           x3
  29 #define data1w          w3
  30 #define data2           x4
  31 #define data2w          w4
  32 #define has_nul         x5
  33 #define diff            x6
  34 #define syndrome        x7
  35 #define tmp1            x8
  36 #define tmp2            x9
  37 #define tmp3            x10
  38 #define zeroones        x11
  39 #define pos             x12
  40 #define mask            x13
  41 #define endloop         x14
  42 #define count           mask
  43 #define offset          pos
  44 #define neg_offset      x15
  45
  46 /* Define endian dependent shift operations.
  47    On big-endian early bytes are at MSB and on little-endian LSB.
  48    LS_FW means shifting towards early bytes.
  49    LS_BK means shifting towards later bytes.
  50    */
  51 #ifdef __AARCH64EB__
  52 #define LS_FW lsl
  53 #define LS_BK lsr
  54 #else
  55 #define LS_FW lsr
  56 #define LS_BK lsl
  57 #endif
  58
  59 ENTRY (strncmp)
  60         cbz     limit, L(ret0)
  61         eor     tmp1, src1, src2
  62         mov     zeroones, #REP8_01
  63         tst     tmp1, #7
  64         and     count, src1, #7
  65         b.ne    L(misaligned8)
  66         cbnz    count, L(mutual_align)
  67
  68         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  69            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  70            can be done in parallel across the entire word.  */
  71         .p2align 4
  72 L(loop_aligned):
  73         ldr     data1, [src1], #8
  74         ldr     data2, [src2], #8
  75 L(start_realigned):
  76         subs    limit, limit, #8
  77         sub     tmp1, data1, zeroones
  78         orr     tmp2, data1, #REP8_7f
  79         eor     diff, data1, data2      /* Non-zero if differences found.  */
  80         csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
  81         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  82         ccmp    endloop, #0, #0, eq
  83         b.eq    L(loop_aligned)
  84         /* End of main loop */
  85
  86 L(full_check):
  87 #ifndef __AARCH64EB__
  88         orr     syndrome, diff, has_nul
  89         add     limit, limit, 8 /* Rewind limit to before last subs. */
  90 L(syndrome_check):
  91         /* Limit was reached. Check if the NUL byte or the difference
  92            is before the limit. */
  93         rev     syndrome, syndrome
  94         rev     data1, data1
  95         clz     pos, syndrome
  96         rev     data2, data2
  97         lsl     data1, data1, pos
  98         cmp     limit, pos, lsr #3
  99         lsl     data2, data2, pos
 100         /* But we need to zero-extend (char is unsigned) the value and then
 101            perform a signed 32-bit subtraction.  */
 102         lsr     data1, data1, #56
 103         sub     result, data1, data2, lsr #56
 104         csel result, result, xzr, hi
 105         ret
 106 #else
 107         /* Not reached the limit, must have found the end or a diff.  */
 108         tbz     limit, #63, L(not_limit)
 109         add     tmp1, limit, 8
 110         cbz     limit, L(not_limit)
 111
 112         lsl     limit, tmp1, #3 /* Bits -> bytes.  */
 113         mov     mask, #~0
 114         lsr     mask, mask, limit
 115         bic     data1, data1, mask
 116         bic     data2, data2, mask
 117
 118         /* Make sure that the NUL byte is marked in the syndrome.  */
 119         orr     has_nul, has_nul, mask
 120
 121 L(not_limit):
 122         /* For big-endian we cannot use the trick with the syndrome value
 123            as carry-propagation can corrupt the upper bits if the trailing
 124            bytes in the string contain 0x01.  */
 125         /* However, if there is no NUL byte in the dword, we can generate
 126            the result directly.  We can't just subtract the bytes as the
 127            MSB might be significant.  */
 128         cbnz    has_nul, 1f
 129         cmp     data1, data2
 130         cset    result, ne
 131         cneg    result, result, lo
 132         ret
 133 1:
 134         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 135         rev     tmp3, data1
 136         sub     tmp1, tmp3, zeroones
 137         orr     tmp2, tmp3, #REP8_7f
 138         bic     has_nul, tmp1, tmp2
 139         rev     has_nul, has_nul
 140         orr     syndrome, diff, has_nul
 141         clz     pos, syndrome
 142         /* The most-significant-non-zero bit of the syndrome marks either the
 143            first bit that is different, or the top bit of the first zero byte.
 144            Shifting left now will bring the critical information into the
 145            top bits.  */
 146 L(end_quick):
 147         lsl     data1, data1, pos
 148         lsl     data2, data2, pos
 149         /* But we need to zero-extend (char is unsigned) the value and then
 150            perform a signed 32-bit subtraction.  */
 151         lsr     data1, data1, #56
 152         sub     result, data1, data2, lsr #56
 153         ret
 154 #endif
 155
 156 L(mutual_align):
 157         /* Sources are mutually aligned, but are not currently at an
 158            alignment boundary.  Round down the addresses and then mask off
 159            the bytes that precede the start point.
 160            We also need to adjust the limit calculations, but without
 161            overflowing if the limit is near ULONG_MAX.  */
 162         bic     src1, src1, #7
 163         bic     src2, src2, #7
 164         ldr     data1, [src1], #8
 165         neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 166         ldr     data2, [src2], #8
 167         mov     tmp2, #~0
 168         LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 169         /* Adjust the limit and ensure it doesn't overflow.  */
 170         adds    limit, limit, count
 171         csinv   limit, limit, xzr, lo
 172         orr     data1, data1, tmp2
 173         orr     data2, data2, tmp2
 174         b       L(start_realigned)
 175
 176         .p2align 4
 177         /* Don't bother with dwords for up to 16 bytes.  */
 178 L(misaligned8):
 179         cmp     limit, #16
 180         b.hs    L(try_misaligned_words)
 181
 182 L(byte_loop):
 183         /* Perhaps we can do better than this.  */
 184         ldrb    data1w, [src1], #1
 185         ldrb    data2w, [src2], #1
 186         subs    limit, limit, #1
 187         ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 188         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 189         b.eq    L(byte_loop)
 190 L(done):
 191         sub     result, data1, data2
 192         ret
 193         /* Align the SRC1 to a dword by doing a bytewise compare and then do
 194            the dword loop.  */
 195 L(try_misaligned_words):
 196         cbz     count, L(src1_aligned)
 197
 198         neg     count, count
 199         and     count, count, #7
 200         sub     limit, limit, count
 201
 202 L(page_end_loop):
 203         ldrb    data1w, [src1], #1
 204         ldrb    data2w, [src2], #1
 205         cmp     data1w, #1
 206         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 207         b.ne    L(done)
 208         subs    count, count, #1
 209         b.hi    L(page_end_loop)
 210
 211         /* The following diagram explains the comparison of misaligned strings.
 212            The bytes are shown in natural order. For little-endian, it is
 213            reversed in the registers. The "x" bytes are before the string.
 214            The "|" separates data that is loaded at one time.
 215            src1     | a a a a a a a a | b b b c c c c c | . . .
 216            src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
 217
 218            After shifting in each step, the data looks like this:
 219                         STEP_A              STEP_B              STEP_C
 220            data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
 221            data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
 222
 223            The bytes with "0" are eliminated from the syndrome via mask.
 224
 225            Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
 226            time from SRC2. The comparison happens in 3 steps. After each step
 227            the loop can exit, or read from SRC1 or SRC2. */
 228 L(src1_aligned):
 229         /* Calculate offset from 8 byte alignment to string start in bits. No
 230            need to mask offset since shifts are ignoring upper bits. */
 231         lsl     offset, src2, #3
 232         bic     src2, src2, #0xf
 233         mov     mask, -1
 234         neg     neg_offset, offset
 235         ldr     data1, [src1], #8
 236         ldp     tmp1, tmp2, [src2], #16
 237         LS_BK   mask, mask, neg_offset
 238         and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
 239         /* Skip the first compare if data in tmp1 is irrelevant. */
 240         tbnz    offset, 6, L(misaligned_mid_loop)
 241
 242 L(loop_misaligned):
 243         /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
 244         LS_FW   data2, tmp1, offset
 245         LS_BK   tmp1, tmp2, neg_offset
 246         subs    limit, limit, #8
 247         orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs.*/
 248         sub     has_nul, data1, zeroones
 249         eor     diff, data1, data2      /* Non-zero if differences found.  */
 250         orr     tmp3, data1, #REP8_7f
 251         csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
 252         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
 253         orr     tmp3, endloop, has_nul
 254         cbnz    tmp3, L(full_check)
 255
 256         ldr     data1, [src1], #8
 257 L(misaligned_mid_loop):
 258         /* STEP_B: Compare first part of data1 to second part of tmp2. */
 259         LS_FW   data2, tmp2, offset
 260 #ifdef __AARCH64EB__
 261         /* For big-endian we do a byte reverse to avoid carry-propagation
 262         problem described above. This way we can reuse the has_nul in the
 263         next step and also use syndrome value trick at the end. */
 264         rev     tmp3, data1
 265         #define data1_fixed tmp3
 266 #else
 267         #define data1_fixed data1
 268 #endif
 269         sub     has_nul, data1_fixed, zeroones
 270         orr     tmp3, data1_fixed, #REP8_7f
 271         eor     diff, data2, data1      /* Non-zero if differences found.  */
 272         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
 273 #ifdef __AARCH64EB__
 274         rev     has_nul, has_nul
 275 #endif
 276         cmp     limit, neg_offset, lsr #3
 277         orr     syndrome, diff, has_nul
 278         bic     syndrome, syndrome, mask        /* Ignore later bytes. */
 279         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 280         cbnz    tmp3, L(syndrome_check)
 281
 282         /* STEP_C: Compare second part of data1 to first part of tmp1. */
 283         ldp     tmp1, tmp2, [src2], #16
 284         cmp     limit, #8
 285         LS_BK   data2, tmp1, neg_offset
 286         eor     diff, data2, data1      /* Non-zero if differences found.  */
 287         orr     syndrome, diff, has_nul
 288         and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
 289         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 290         cbnz    tmp3, L(syndrome_check)
 291
 292         ldr     data1, [src1], #8
 293         sub     limit, limit, #8
 294         b       L(loop_misaligned)
 295
 296 #ifdef  __AARCH64EB__
 297 L(syndrome_check):
 298         clz     pos, syndrome
 299         cmp     pos, limit, lsl #3
 300         b.lo    L(end_quick)
 301 #endif
 302
 303 L(ret0):
 304         mov     result, #0
 305         ret
 306 END(strncmp)
 307