contrib/cortex-strings/src/aarch64/strnlen.S

   1 /* strnlen - calculate the length of a string with limit.
   2
   3    Copyright (c) 2013, Linaro Limited
   4    All rights reserved.
   5
   6    Redistribution and use in source and binary forms, with or without
   7    modification, are permitted provided that the following conditions are met:
   8        * Redistributions of source code must retain the above copyright
   9          notice, this list of conditions and the following disclaimer.
  10        * Redistributions in binary form must reproduce the above copyright
  11          notice, this list of conditions and the following disclaimer in the
  12          documentation and/or other materials provided with the distribution.
  13        * Neither the name of the Linaro nor the
  14          names of its contributors may be used to endorse or promote products
  15          derived from this software without specific prior written permission.
  16
  17    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
  28
  29 /* Assumptions:
  30  *
  31  * ARMv8-a, AArch64
  32  */
  33
  34 /* Arguments and results.  */
  35 #define srcin           x0
  36 #define len             x0
  37 #define limit           x1
  38
  39 /* Locals and temporaries.  */
  40 #define src             x2
  41 #define data1           x3
  42 #define data2           x4
  43 #define data2a          x5
  44 #define has_nul1        x6
  45 #define has_nul2        x7
  46 #define tmp1            x8
  47 #define tmp2            x9
  48 #define tmp3            x10
  49 #define tmp4            x11
  50 #define zeroones        x12
  51 #define pos             x13
  52 #define limit_wd        x14
  53
  54         .macro def_fn f p2align=0
  55         .text
  56         .p2align \p2align
  57         .global \f
  58         .type \f, %function
  59 \f:
  60         .endm
  61
  62 #define REP8_01 0x0101010101010101
  63 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  64 #define REP8_80 0x8080808080808080
  65
  66         .text
  67         .p2align        6
  68 .Lstart:
  69         /* Pre-pad to ensure critical loop begins an icache line.  */
  70         .rep 7
  71         nop
  72         .endr
  73         /* Put this code here to avoid wasting more space with pre-padding.  */
  74 .Lhit_limit:
  75         mov     len, limit
  76         ret
  77
  78 def_fn strnlen
  79         cbz     limit, .Lhit_limit
  80         mov     zeroones, #REP8_01
  81         bic     src, srcin, #15
  82         ands    tmp1, srcin, #15
  83         b.ne    .Lmisaligned
  84         /* Calculate the number of full and partial words -1.  */
  85         sub     limit_wd, limit, #1     /* Limit != 0, so no underflow.  */
  86         lsr     limit_wd, limit_wd, #4  /* Convert to Qwords.  */
  87
  88         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  89            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  90            can be done in parallel across the entire word.  */
  91         /* The inner loop deals with two Dwords at a time.  This has a
  92            slightly higher start-up cost, but we should win quite quickly,
  93            especially on cores with a high number of issue slots per
  94            cycle, as we get much better parallelism out of the operations.  */
  95
  96         /* Start of critial section -- keep to one 64Byte cache line.  */
  97 .Lloop:
  98         ldp     data1, data2, [src], #16
  99 .Lrealigned:
 100         sub     tmp1, data1, zeroones
 101         orr     tmp2, data1, #REP8_7f
 102         sub     tmp3, data2, zeroones
 103         orr     tmp4, data2, #REP8_7f
 104         bic     has_nul1, tmp1, tmp2
 105         bic     has_nul2, tmp3, tmp4
 106         subs    limit_wd, limit_wd, #1
 107         orr     tmp1, has_nul1, has_nul2
 108         ccmp    tmp1, #0, #0, pl        /* NZCV = 0000  */
 109         b.eq    .Lloop
 110         /* End of critical section -- keep to one 64Byte cache line.  */
 111
 112         orr     tmp1, has_nul1, has_nul2
 113         cbz     tmp1, .Lhit_limit       /* No null in final Qword.  */
 114
 115         /* We know there's a null in the final Qword.  The easiest thing
 116            to do now is work out the length of the string and return
 117            MIN (len, limit).  */
 118
 119         sub     len, src, srcin
 120         cbz     has_nul1, .Lnul_in_data2
 121 #ifdef __AARCH64EB__
 122         mov     data2, data1
 123 #endif
 124         sub     len, len, #8
 125         mov     has_nul2, has_nul1
 126 .Lnul_in_data2:
 127 #ifdef __AARCH64EB__
 128         /* For big-endian, carry propagation (if the final byte in the
 129            string is 0x01) means we cannot use has_nul directly.  The
 130            easiest way to get the correct byte is to byte-swap the data
 131            and calculate the syndrome a second time.  */
 132         rev     data2, data2
 133         sub     tmp1, data2, zeroones
 134         orr     tmp2, data2, #REP8_7f
 135         bic     has_nul2, tmp1, tmp2
 136 #endif
 137         sub     len, len, #8
 138         rev     has_nul2, has_nul2
 139         clz     pos, has_nul2
 140         add     len, len, pos, lsr #3           /* Bits to bytes.  */
 141         cmp     len, limit
 142         csel    len, len, limit, ls             /* Return the lower value.  */
 143         ret
 144
 145 .Lmisaligned:
 146         /* Deal with a partial first word.
 147            We're doing two things in parallel here;
 148            1) Calculate the number of words (but avoiding overflow if
 149               limit is near ULONG_MAX) - to do this we need to work out
 150               limit + tmp1 - 1 as a 65-bit value before shifting it;
 151            2) Load and mask the initial data words - we force the bytes
 152               before the ones we are interested in to 0xff - this ensures
 153               early bytes will not hit any zero detection.  */
 154         sub     limit_wd, limit, #1
 155         neg     tmp4, tmp1
 156         cmp     tmp1, #8
 157
 158         and     tmp3, limit_wd, #15
 159         lsr     limit_wd, limit_wd, #4
 160         mov     tmp2, #~0
 161
 162         ldp     data1, data2, [src], #16
 163         lsl     tmp4, tmp4, #3          /* Bytes beyond alignment -> bits.  */
 164         add     tmp3, tmp3, tmp1
 165
 166 #ifdef __AARCH64EB__
 167         /* Big-endian.  Early bytes are at MSB.  */
 168         lsl     tmp2, tmp2, tmp4        /* Shift (tmp1 & 63).  */
 169 #else
 170         /* Little-endian.  Early bytes are at LSB.  */
 171         lsr     tmp2, tmp2, tmp4        /* Shift (tmp1 & 63).  */
 172 #endif
 173         add     limit_wd, limit_wd, tmp3, lsr #4
 174
 175         orr     data1, data1, tmp2
 176         orr     data2a, data2, tmp2
 177
 178         csinv   data1, data1, xzr, le
 179         csel    data2, data2, data2a, le
 180         b       .Lrealigned
 181         .size   strnlen, . - .Lstart    /* Include pre-padding in size.  */