contrib/cortex-strings/src/aarch64/strcpy.S

   1 /*
   2    strcpy/stpcpy - copy a string returning pointer to start/end.
   3
   4    Copyright (c) 2013, 2014, 2015 ARM Ltd.
   5    All Rights Reserved.
   6
   7    Redistribution and use in source and binary forms, with or without
   8    modification, are permitted provided that the following conditions are met:
   9        * Redistributions of source code must retain the above copyright
  10          notice, this list of conditions and the following disclaimer.
  11        * Redistributions in binary form must reproduce the above copyright
  12          notice, this list of conditions and the following disclaimer in the
  13          documentation and/or other materials provided with the distribution.
  14        * Neither the name of the company nor the names of its contributors
  15          may be used to endorse or promote products derived from this
  16          software without specific prior written permission.
  17
  18    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
  29
  30 /* Assumptions:
  31  *
  32  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  33  */
  34
  35 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.
  36
  37    To test the page crossing code path more thoroughly, compile with
  38    -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
  39    entry path.  This option is not intended for production use.  */
  40
  41 /* Arguments and results.  */
  42 #define dstin           x0
  43 #define srcin           x1
  44
  45 /* Locals and temporaries.  */
  46 #define src             x2
  47 #define dst             x3
  48 #define data1           x4
  49 #define data1w          w4
  50 #define data2           x5
  51 #define data2w          w5
  52 #define has_nul1        x6
  53 #define has_nul2        x7
  54 #define tmp1            x8
  55 #define tmp2            x9
  56 #define tmp3            x10
  57 #define tmp4            x11
  58 #define zeroones        x12
  59 #define data1a          x13
  60 #define data2a          x14
  61 #define pos             x15
  62 #define len             x16
  63 #define to_align        x17
  64
  65 #ifdef BUILD_STPCPY
  66 #define STRCPY stpcpy
  67 #else
  68 #define STRCPY strcpy
  69 #endif
  70
  71         .macro def_fn f p2align=0
  72         .text
  73         .p2align \p2align
  74         .global \f
  75         .type \f, %function
  76 \f:
  77         .endm
  78
  79         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  80            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  81            can be done in parallel across the entire word.  */
  82
  83 #define REP8_01 0x0101010101010101
  84 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  85 #define REP8_80 0x8080808080808080
  86
  87         /* AArch64 systems have a minimum page size of 4k.  We can do a quick
  88            page size check for crossing this boundary on entry and if we
  89            do not, then we can short-circuit much of the entry code.  We
  90            expect early page-crossing strings to be rare (probability of
  91            16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
  92            predictable, even with random strings.
  93
  94            We don't bother checking for larger page sizes, the cost of setting
  95            up the correct page size is just not worth the extra gain from
  96            a small reduction in the cases taking the slow path.  Note that
  97            we only care about whether the first fetch, which may be
  98            misaligned, crosses a page boundary - after that we move to aligned
  99            fetches for the remainder of the string.  */
 100
 101 #ifdef STRCPY_TEST_PAGE_CROSS
 102         /* Make everything that isn't Qword aligned look like a page cross.  */
 103 #define MIN_PAGE_P2 4
 104 #else
 105 #define MIN_PAGE_P2 12
 106 #endif
 107
 108 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
 109
 110 def_fn STRCPY p2align=6
 111         /* For moderately short strings, the fastest way to do the copy is to
 112            calculate the length of the string in the same way as strlen, then
 113            essentially do a memcpy of the result.  This avoids the need for
 114            multiple byte copies and further means that by the time we
 115            reach the bulk copy loop we know we can always use DWord
 116            accesses.  We expect strcpy to rarely be called repeatedly
 117            with the same source string, so branch prediction is likely to
 118            always be difficult - we mitigate against this by preferring
 119            conditional select operations over branches whenever this is
 120            feasible.  */
 121         and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
 122         mov     zeroones, #REP8_01
 123         and     to_align, srcin, #15
 124         cmp     tmp2, #(MIN_PAGE_SIZE - 16)
 125         neg     tmp1, to_align
 126         /* The first fetch will straddle a (possible) page boundary iff
 127            srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
 128            aligned string will never fail the page align check, so will
 129            always take the fast path.  */
 130         b.gt    .Lpage_cross
 131
 132 .Lpage_cross_ok:
 133         ldp     data1, data2, [srcin]
 134 #ifdef __AARCH64EB__
 135         /* Because we expect the end to be found within 16 characters
 136            (profiling shows this is the most common case), it's worth
 137            swapping the bytes now to save having to recalculate the
 138            termination syndrome later.  We preserve data1 and data2
 139            so that we can re-use the values later on.  */
 140         rev     tmp2, data1
 141         sub     tmp1, tmp2, zeroones
 142         orr     tmp2, tmp2, #REP8_7f
 143         bics    has_nul1, tmp1, tmp2
 144         b.ne    .Lfp_le8
 145         rev     tmp4, data2
 146         sub     tmp3, tmp4, zeroones
 147         orr     tmp4, tmp4, #REP8_7f
 148 #else
 149         sub     tmp1, data1, zeroones
 150         orr     tmp2, data1, #REP8_7f
 151         bics    has_nul1, tmp1, tmp2
 152         b.ne    .Lfp_le8
 153         sub     tmp3, data2, zeroones
 154         orr     tmp4, data2, #REP8_7f
 155 #endif
 156         bics    has_nul2, tmp3, tmp4
 157         b.eq    .Lbulk_entry
 158
 159         /* The string is short (<=16 bytes).  We don't know exactly how
 160            short though, yet.  Work out the exact length so that we can
 161            quickly select the optimal copy strategy.  */
 162 .Lfp_gt8:
 163         rev     has_nul2, has_nul2
 164         clz     pos, has_nul2
 165         mov     tmp2, #56
 166         add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
 167         sub     pos, tmp2, pos
 168 #ifdef __AARCH64EB__
 169         lsr     data2, data2, pos
 170 #else
 171         lsl     data2, data2, pos
 172 #endif
 173         str     data2, [dst, #1]
 174         str     data1, [dstin]
 175 #ifdef BUILD_STPCPY
 176         add     dstin, dst, #8
 177 #endif
 178         ret
 179
 180 .Lfp_le8:
 181         rev     has_nul1, has_nul1
 182         clz     pos, has_nul1
 183         add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
 184         subs    tmp2, pos, #24                  /* Pos in bits. */
 185         b.lt    .Lfp_lt4
 186 #ifdef __AARCH64EB__
 187         mov     tmp2, #56
 188         sub     pos, tmp2, pos
 189         lsr     data2, data1, pos
 190         lsr     data1, data1, #32
 191 #else
 192         lsr     data2, data1, tmp2
 193 #endif
 194         /* 4->7 bytes to copy.  */
 195         str     data2w, [dst, #-3]
 196         str     data1w, [dstin]
 197 #ifdef BUILD_STPCPY
 198         mov     dstin, dst
 199 #endif
 200         ret
 201 .Lfp_lt4:
 202         cbz     pos, .Lfp_lt2
 203         /* 2->3 bytes to copy.  */
 204 #ifdef __AARCH64EB__
 205         lsr     data1, data1, #48
 206 #endif
 207         strh    data1w, [dstin]
 208         /* Fall-through, one byte (max) to go.  */
 209 .Lfp_lt2:
 210         /* Null-terminated string.  Last character must be zero!  */
 211         strb    wzr, [dst]
 212 #ifdef BUILD_STPCPY
 213         mov     dstin, dst
 214 #endif
 215         ret
 216
 217         .p2align 6
 218         /* Aligning here ensures that the entry code and main loop all lies
 219            within one 64-byte cache line.  */
 220 .Lbulk_entry:
 221         sub     to_align, to_align, #16
 222         stp     data1, data2, [dstin]
 223         sub     src, srcin, to_align
 224         sub     dst, dstin, to_align
 225         b       .Lentry_no_page_cross
 226
 227         /* The inner loop deals with two Dwords at a time.  This has a
 228            slightly higher start-up cost, but we should win quite quickly,
 229            especially on cores with a high number of issue slots per
 230            cycle, as we get much better parallelism out of the operations.  */
 231 .Lmain_loop:
 232         stp     data1, data2, [dst], #16
 233 .Lentry_no_page_cross:
 234         ldp     data1, data2, [src], #16
 235         sub     tmp1, data1, zeroones
 236         orr     tmp2, data1, #REP8_7f
 237         sub     tmp3, data2, zeroones
 238         orr     tmp4, data2, #REP8_7f
 239         bic     has_nul1, tmp1, tmp2
 240         bics    has_nul2, tmp3, tmp4
 241         ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
 242         b.eq    .Lmain_loop
 243
 244         /* Since we know we are copying at least 16 bytes, the fastest way
 245            to deal with the tail is to determine the location of the
 246            trailing NUL, then (re)copy the 16 bytes leading up to that.  */
 247         cmp     has_nul1, #0
 248 #ifdef __AARCH64EB__
 249         /* For big-endian, carry propagation (if the final byte in the
 250            string is 0x01) means we cannot use has_nul directly.  The
 251            easiest way to get the correct byte is to byte-swap the data
 252            and calculate the syndrome a second time.  */
 253         csel    data1, data1, data2, ne
 254         rev     data1, data1
 255         sub     tmp1, data1, zeroones
 256         orr     tmp2, data1, #REP8_7f
 257         bic     has_nul1, tmp1, tmp2
 258 #else
 259         csel    has_nul1, has_nul1, has_nul2, ne
 260 #endif
 261         rev     has_nul1, has_nul1
 262         clz     pos, has_nul1
 263         add     tmp1, pos, #72
 264         add     pos, pos, #8
 265         csel    pos, pos, tmp1, ne
 266         add     src, src, pos, lsr #3
 267         add     dst, dst, pos, lsr #3
 268         ldp     data1, data2, [src, #-32]
 269         stp     data1, data2, [dst, #-16]
 270 #ifdef BUILD_STPCPY
 271         sub     dstin, dst, #1
 272 #endif
 273         ret
 274
 275 .Lpage_cross:
 276         bic     src, srcin, #15
 277         /* Start by loading two words at [srcin & ~15], then forcing the
 278            bytes that precede srcin to 0xff.  This means they never look
 279            like termination bytes.  */
 280         ldp     data1, data2, [src]
 281         lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
 282         tst     to_align, #7
 283         csetm   tmp2, ne
 284 #ifdef __AARCH64EB__
 285         lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 286 #else
 287         lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 288 #endif
 289         orr     data1, data1, tmp2
 290         orr     data2a, data2, tmp2
 291         cmp     to_align, #8
 292         csinv   data1, data1, xzr, lt
 293         csel    data2, data2, data2a, lt
 294         sub     tmp1, data1, zeroones
 295         orr     tmp2, data1, #REP8_7f
 296         sub     tmp3, data2, zeroones
 297         orr     tmp4, data2, #REP8_7f
 298         bic     has_nul1, tmp1, tmp2
 299         bics    has_nul2, tmp3, tmp4
 300         ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
 301         b.eq    .Lpage_cross_ok
 302         /* We now need to make data1 and data2 look like they've been
 303            loaded directly from srcin.  Do a rotate on the 128-bit value.  */
 304         lsl     tmp1, to_align, #3      /* Bytes->bits.  */
 305         neg     tmp2, to_align, lsl #3
 306 #ifdef __AARCH64EB__
 307         lsl     data1a, data1, tmp1
 308         lsr     tmp4, data2, tmp2
 309         lsl     data2, data2, tmp1
 310         orr     tmp4, tmp4, data1a
 311         cmp     to_align, #8
 312         csel    data1, tmp4, data2, lt
 313         rev     tmp2, data1
 314         rev     tmp4, data2
 315         sub     tmp1, tmp2, zeroones
 316         orr     tmp2, tmp2, #REP8_7f
 317         sub     tmp3, tmp4, zeroones
 318         orr     tmp4, tmp4, #REP8_7f
 319 #else
 320         lsr     data1a, data1, tmp1
 321         lsl     tmp4, data2, tmp2
 322         lsr     data2, data2, tmp1
 323         orr     tmp4, tmp4, data1a
 324         cmp     to_align, #8
 325         csel    data1, tmp4, data2, lt
 326         sub     tmp1, data1, zeroones
 327         orr     tmp2, data1, #REP8_7f
 328         sub     tmp3, data2, zeroones
 329         orr     tmp4, data2, #REP8_7f
 330 #endif
 331         bic     has_nul1, tmp1, tmp2
 332         cbnz    has_nul1, .Lfp_le8
 333         bic     has_nul2, tmp3, tmp4
 334         b       .Lfp_gt8
 335
 336         .size   STRCPY, . - STRCPY