contrib/cortex-strings/src/aarch64/strchrnul.S

   1 /*
   2    strchrnul - find a character or nul in a string
   3
   4    Copyright (c) 2014, ARM Limited
   5    All rights Reserved.
   6
   7    Redistribution and use in source and binary forms, with or without
   8    modification, are permitted provided that the following conditions are met:
   9        * Redistributions of source code must retain the above copyright
  10          notice, this list of conditions and the following disclaimer.
  11        * Redistributions in binary form must reproduce the above copyright
  12          notice, this list of conditions and the following disclaimer in the
  13          documentation and/or other materials provided with the distribution.
  14        * Neither the name of the company nor the names of its contributors
  15          may be used to endorse or promote products derived from this
  16          software without specific prior written permission.
  17
  18    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
  29
  30 /* Assumptions:
  31  *
  32  * ARMv8-a, AArch64
  33  * Neon Available.
  34  */
  35
  36 /* Arguments and results.  */
  37 #define srcin           x0      /* In: start of the string.  */
  38 #define chrin           w1      /* In: character to search for.  */
  39
  40 #define result          x0      /* Out: address of the match or nul.  */
  41
     /* Scratch general-purpose registers.  */
  42 #define src             x2      /* 32-byte aligned read pointer.  */
  43 #define tmp1            x3      /* Misalignment, then syndrome.  */
  44 #define wtmp2           w4      /* Magic constant 0x40100401.  */
  45 #define tmp3            x5      /* All-ones mask / raw syndrome.  */
  46
     /* Vector registers.  */
  47 #define vrepchr         v0      /* chrin replicated to 16 bytes.  */
  48 #define vdata1          v1      /* Bytes 0-15 of the hunk.  */
  49 #define vdata2          v2      /* Bytes 16-31 of the hunk.  */
  50 #define vhas_nul1       v3      /* Lanes of vdata1 equal to 0.  */
  51 #define vhas_nul2       v4      /* Lanes of vdata2 equal to 0.  */
  52 #define vhas_chr1       v5      /* vdata1 == chr, later chr-or-nul.  */
  53 #define vhas_chr2       v6      /* vdata2 == chr, later chr-or-nul.  */
  54 #define vrepmask        v7      /* Magic constant in each .4s lane.  */
  55 #define vend1           v16     /* Reduced termination syndrome.  */
  56
  57 /* Core algorithm.
  58
  59    For each 32-byte hunk we calculate a 64-bit syndrome value, with
  60    two bits per byte (LSB is always in bits 0 and 1, for both big
  61    and little-endian systems).  For each tuple, bit 0 is set iff
  62    the relevant byte matched the requested character or nul.  Since the
  63    bits in the syndrome reflect exactly the order in which things occur
  64    in the original string a count_trailing_zeros() operation will
  65    identify exactly which byte is causing the termination.  */
  66
  67 /* def_fn NAME [p2align=N]

        Begin the definition of a global function: switch to the .text
        section, align to 2^N bytes (N defaults to 0), export NAME,
        give it the ELF type %function and emit its label.  The matching
        .size directive is left to the caller.  */
  68
  69         .macro def_fn f p2align=0
  70         .text
  71         .p2align \p2align
  72         .global \f
  73         .type \f, %function
  74 \f:
  75         .endm
  76
     /* char *strchrnul (const char *s, int c)

        In:   srcin (x0) = pointer to a nul-terminated string.
              chrin (w1) = character to find; only its low 8 bits are
                           used (DUP replicates them into byte lanes).
        Out:  result (x0) = address of the first byte equal to the
              character or, failing that, of the terminating nul.
        Uses: x2-x5, v0-v7, v16 and the condition flags; no stack.

        The string is read in 32-byte aligned hunks, so bytes that lie
        before srcin or after the terminator inside the same hunk are
        loaded, but they never influence the result.  */
  77 def_fn strchrnul
  78         /* Magic constant 0x40100401 to allow us to identify which lane
  79            matches the termination condition.  */
             /* Per 32-bit lane the mask bytes are 0x01, 0x04, 0x10 and 0x40,
                so after the two ADDP reductions used below every source
                byte owns the even bit of a 2-bit field in the syndrome.  */
  80         mov     wtmp2, #0x0401
  81         movk    wtmp2, #0x4010, lsl #16
  82         dup     vrepchr.16b, chrin
  83         bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
  84         dup     vrepmask.4s, wtmp2
  85         ands    tmp1, srcin, #31        /* tmp1 = misalignment (0..31).  */
  86         b.eq    .Lloop                  /* Aligned: nothing to mask.  */
  87
  88         /* Input string is not 32-byte aligned.  Rather than forcing
  89            the padding bytes to a safe value, we calculate the syndrome
  90            for all the bytes, but then mask off those bits of the
  91            syndrome that are related to the padding.  */
  92         ld1     {vdata1.16b, vdata2.16b}, [src], #32
  93         neg     tmp1, tmp1              /* tmp1 = -misalignment.  */
  94         cmeq    vhas_nul1.16b, vdata1.16b, #0
  95         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
  96         cmeq    vhas_nul2.16b, vdata2.16b, #0
  97         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
             /* A lane terminates the search if it is the character or nul.  */
  98         orr     vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
  99         orr     vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
             /* Keep one distinct bit per matching lane.  */
 100         and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
 101         and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
 102         lsl     tmp1, tmp1, #1          /* tmp1 = -2 * misalignment.  */
 103         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
 104         mov     tmp3, #~0
 105         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
             /* The register shift count is taken modulo 64, so this yields
                ones in the low 2 * misalignment bits: the syndrome bits
                that belong to the padding bytes in front of srcin.  */
 106         lsr     tmp1, tmp3, tmp1
 107
 108         mov     tmp3, vend1.d[0]        /* tmp3 = 64-bit syndrome.  */
 109         bic     tmp1, tmp3, tmp1        // Mask padding bits.
 110         cbnz    tmp1, .Ltail            /* Match inside the first hunk.  */
 111
 112 .Lloop:
             /* Main loop: one aligned 32-byte hunk per iteration; src is
                post-incremented past the hunk by the load.  */
 113         ld1     {vdata1.16b, vdata2.16b}, [src], #32
 114         cmeq    vhas_nul1.16b, vdata1.16b, #0
 115         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
 116         cmeq    vhas_nul2.16b, vdata2.16b, #0
 117         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
 118         /* Use a fast check for the termination condition.  */
             /* Every lane is 0x00 or 0xff here, so the sum of the two
                64-bit halves is zero only when no lane matched.  */
 119         orr     vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
 120         orr     vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
 121         orr     vend1.16b, vhas_chr1.16b, vhas_chr2.16b
 122         addp    vend1.2d, vend1.2d, vend1.2d
 123         mov     tmp1, vend1.d[0]
 124         cbz     tmp1, .Lloop            /* Nothing found: next hunk.  */
 125
 126         /* Termination condition found.  Now need to establish exactly why
 127            we terminated.  */
             /* Build the full syndrome the same way as for the first hunk:
                vhas_chr1/2 still hold the chr-or-nul lane masks.  */
 128         and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
 129         and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
 130         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b         // 256->128
 131         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
 132
 133         mov     tmp1, vend1.d[0]
 134 .Ltail:
             /* On entry tmp1 is a non-zero syndrome and src points just
                past the hunk it was computed from.  */
 135         /* Count the trailing zeros, by bit reversing...  */
 136         rbit    tmp1, tmp1
 137         /* Re-bias source.  */
 138         sub     src, src, #32
 139         clz     tmp1, tmp1      /* ... and counting the leading zeros.  */
 140         /* tmp1 is twice the offset into the fragment.  */
 141         add     result, src, tmp1, lsr #1
 142         ret
 143
             /* Record the symbol size for the ELF symbol table.  */
 144         .size   strchrnul, . - strchrnul