sys/arm64/arm64/memset.S

   1 /* Copyright (c) 2012, Linaro Limited
   2    All rights reserved.
   3
   4    Redistribution and use in source and binary forms, with or without
   5    modification, are permitted provided that the following conditions are met:
   6        * Redistributions of source code must retain the above copyright
   7          notice, this list of conditions and the following disclaimer.
   8        * Redistributions in binary form must reproduce the above copyright
   9          notice, this list of conditions and the following disclaimer in the
  10          documentation and/or other materials provided with the distribution.
  11        * Neither the name of the Linaro nor the
  12          names of its contributors may be used to endorse or promote products
  13          derived from this software without specific prior written permission.
  14
  15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
  26
  27 /* Assumptions:
  28  *
  29  * ARMv8-a, AArch64
  30  * Unaligned accesses
  31  *
  32  */
  33
  34 #include <machine/asm.h>
  35
  36 #define dstin           x0      /* Arg 1: destination; never written, so it is also the return value.  */
  37 #define val             w1      /* Arg 2: fill value; only bits 7:0 are used.  */
  38 #define count           x2      /* Arg 3: byte count; decremented as bytes are set.  */
  39 #define tmp1            x3      /* Scratch.  */
  40 #define tmp1w           w3      /* 32-bit view of tmp1.  */
  41 #define tmp2            x4      /* Scratch: alignment distances and the dczva_line_size address.  */
  42 #define tmp2w           w4      /* 32-bit view of tmp2; not referenced by the memset code.  */
  43 #define zva_len_x       x5      /* Value loaded from dczva_line_size, used as a 64-bit length.  */
  44 #define zva_len         w5      /* 32-bit view of zva_len_x; target of the load.  */
  45 #define zva_bits_x      x6      /* zva_len_x - 1, used as an alignment mask.  */
  46
  47 #define A_l             x7      /* Fill byte replicated into all eight bytes.  */
  48 #define A_lw            w7      /* 32-bit view of A_l.  */
  49 #define dst             x8      /* Working copy of the destination pointer.  */
  50 #define tmp3w           w9      /* Not referenced by the memset code.  */
  51
  52 ENTRY(memset)
  53         /*
              * memset: store the low byte of val into count bytes starting at
              * dstin.
              *
              * In:    x0 = dstin, w1 = val (only bits 7:0 are used), x2 = count
              * Out:   x0 is never written, so it still holds dstin on return
              * Uses:  x2-x8 and the condition flags; no stack accesses, no calls
              * Reads: dczva_line_size (external symbol) on the zero-fill path
              *
              * Paths, selected by count and by whether the fill byte is zero:
              *   count <= 15   .Ltail15tiny  one store per set bit of count
              *   16..63        .Ltail63      16-byte stores plus one overlapping
              *                               final 16-byte store
              *   >= 64         .Lnot_short   align dst to 16 bytes, then set 64
              *                               bytes per loop iteration
              *   zero fill     .Lzero_mem    as above, but large sizes may be
              *                               cleared a whole block at a time
              *                               with DC ZVA
              *
              * NOTE(review): every comparison on count uses a signed condition
              * (b.ge/b.le/b.lt), so a count with bit 63 set is handled as if it
              * were negative - confirm callers never pass such a length.
              */
  54         mov     dst, dstin              /* Preserve return value.  */
  55         ands    A_lw, val, #255         /* A_lw = fill byte; Z is set when it is 0.  */
  56 #ifndef DONT_USE_DC
  57         b.eq    .Lzero_mem              /* Zero fill: take the DC ZVA-capable path.  */
  58 #endif
             /* Replicate the fill byte into 16, then 32, then all 64 bits.  */
  59         orr     A_lw, A_lw, A_lw, lsl #8
  60         orr     A_lw, A_lw, A_lw, lsl #16
  61         orr     A_l, A_l, A_l, lsl #32
  62 .Ltail_maybe_long:
             /* Also the re-entry point after the DC ZVA loop, with count holding
              * the number of bytes that are still unset.  */
  63         cmp     count, #64
  64         b.ge    .Lnot_short
  65 .Ltail_maybe_tiny:
  66         cmp     count, #15
  67         b.le    .Ltail15tiny
  68 .Ltail63:
             /* Set up to 63 bytes.  Only bits 5:0 of count are examined from
              * here on: the 64-byte loop arrives with count equal to the bytes
              * still unset minus 64, which leaves those bits unchanged.  */
  69         ands    tmp1, count, #0x30      /* tmp1 = 0, 16, 32 or 48 bytes.  */
  70         b.eq    .Ltail15
  71         add     dst, dst, tmp1          /* Point just past the 16-byte chunks.  */
  72         cmp     tmp1w, #0x20            /* One compare feeds both branches.  */
  73         b.eq    1f                      /* 32 bytes: two stores.  */
  74         b.lt    2f                      /* 16 bytes: one store.  */
  75         stp     A_l, A_l, [dst, #-48]   /* 48 bytes: three stores.  */
  76 1:
  77         stp     A_l, A_l, [dst, #-32]
  78 2:
  79         stp     A_l, A_l, [dst, #-16]
  80
  81 .Ltail15:
             /* Set the final count % 16 bytes with a single 16-byte store that
              * ends exactly where the buffer ends; it rewrites the bytes just
              * before them, which were set earlier.  */
  82         and     count, count, #15
  83         add     dst, dst, count
  84         stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
  85         ret
  86
  87 .Ltail15tiny:
  88         /* Set up to 15 bytes.  Does not assume earlier memory
  89            being set.  */
  90         tbz     count, #3, 1f
  91         str     A_l, [dst], #8          /* Bit 3: 8 bytes, post-increment dst.  */
  92 1:
  93         tbz     count, #2, 1f
  94         str     A_lw, [dst], #4         /* Bit 2: 4 bytes.  */
  95 1:
  96         tbz     count, #1, 1f
  97         strh    A_lw, [dst], #2         /* Bit 1: 2 bytes.  */
  98 1:
  99         tbz     count, #0, 1f
 100         strb    A_lw, [dst]             /* Bit 0: the last byte.  */
 101 1:
 102         ret
 103
 104         /* Critical loop.  Start at a new cache line boundary.  Assuming
 105          * 64 bytes per line, this ensures the entire loop is in one line.  */
 106         .p2align 6
 107 .Lnot_short:
 108         neg     tmp2, dst
 109         ands    tmp2, tmp2, #15         /* tmp2 = bytes up to 16-byte alignment.  */
 110         b.eq    2f                      /* Already aligned.  */
 111         /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
 112          * more than that to set, so we simply store 16 bytes and advance by
 113          * the amount required to reach alignment.  */
 114         sub     count, count, tmp2
 115         stp     A_l, A_l, [dst]
 116         add     dst, dst, tmp2
 117         /* There may be 63 or fewer bytes to go now.  */
 118         cmp     count, #63
 119         b.le    .Ltail63
 120 2:
 121         sub     dst, dst, #16           /* Pre-bias.  */
 122         sub     count, count, #64       /* Invariant: count = bytes unset - 64.  */
 123 1:
 124         stp     A_l, A_l, [dst, #16]
 125         stp     A_l, A_l, [dst, #32]
 126         stp     A_l, A_l, [dst, #48]
 127         stp     A_l, A_l, [dst, #64]!   /* Writeback: dst += 64.  */
 128         subs    count, count, #64
 129         b.ge    1b                      /* At least 64 more bytes are unset.  */
 130         tst     count, #0x3f            /* Bits 5:0 = bytes still unset.  */
 131         add     dst, dst, #16           /* Undo the pre-bias; flags are kept.  */
 132         b.ne    .Ltail63
 133         ret
 134
 135         /* For zeroing memory, check to see if we can use the ZVA feature to
 136          * zero entire 'cache' lines.  */
 137 .Lzero_mem:
 138         mov     A_l, #0
 139         cmp     count, #63
 140         b.le    .Ltail_maybe_tiny
             /* Same 16-byte alignment step as at .Lnot_short: one unaligned
              * 16-byte store, then advance dst to the boundary.  */
 141         neg     tmp2, dst
 142         ands    tmp2, tmp2, #15
 143         b.eq    1f
 144         sub     count, count, tmp2
 145         stp     A_l, A_l, [dst]
 146         add     dst, dst, tmp2
 147         cmp     count, #63
 148         b.le    .Ltail63
 149 1:
 150         /* For zeroing small amounts of memory, it's not worth setting up
 151          * the line-clear code.  */
 152         cmp     count, #128
 153         b.lt    .Lnot_short
 154
             /* dczva_line_size is defined outside this file.  The code below
              * uses it as the number of bytes zeroed by one DC ZVA, builds an
              * alignment mask from it (so it must be a power of two), and
              * treats 0 as "do not use DC ZVA".
              * NOTE(review): confirm against the symbol's definition.  */
 155         adrp    tmp2, dczva_line_size
 156         add     tmp2, tmp2, :lo12:dczva_line_size
 157         ldr     zva_len, [tmp2]         /* 32-bit load into w5.  */
 158         cbz     zva_len, .Lnot_short    /* 0: use the plain store loop.  */
 159
 160 .Lzero_by_line:
 161         /* Compute how far we need to go to become suitably aligned.  We're
 162          * already at quad-word alignment.  */
 163         cmp     count, zva_len_x
 164         b.lt    .Lnot_short             /* Not enough to reach alignment.  */
 165         sub     zva_bits_x, zva_len_x, #1       /* Offset-within-block mask.  */
 166         neg     tmp2, dst
 167         ands    tmp2, tmp2, zva_bits_x  /* tmp2 = bytes to the next block start.  */
 168         b.eq    1f                      /* Already aligned.  */
 169         /* Not aligned, check that there's enough to copy after alignment.  */
 170         sub     tmp1, count, tmp2       /* tmp1 = bytes left once aligned.  */
             /* Fall back unless tmp1 >= 64 and tmp1 >= zva_len.  When the
              * first compare fails, ccmp forces N=1, V=0 so b.lt is taken.  */
 171         cmp     tmp1, #64
 172         ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
 173         b.lt    .Lnot_short
 174         /* We know that there's at least 64 bytes to zero and that it's safe
 175          * to overrun by 64 bytes.  */
 176         mov     count, tmp1
 177 2:
 178         stp     A_l, A_l, [dst]
 179         stp     A_l, A_l, [dst, #16]
 180         stp     A_l, A_l, [dst, #32]
 181         subs    tmp2, tmp2, #64         /* Sets the flags tested by b.ge below.  */
 182         stp     A_l, A_l, [dst, #48]
 183         add     dst, dst, #64
 184         b.ge    2b
 185         /* We've overrun a bit, so adjust dst downwards.  */
 186         add     dst, dst, tmp2          /* tmp2 is negative here.  */
 187 1:
 188         sub     count, count, zva_len_x /* Invariant: count = bytes unset - zva_len.  */
 189 3:
 190         dc      zva, dst                /* Zero one block; dst is block-aligned.  */
 191         add     dst, dst, zva_len_x
 192         subs    count, count, zva_len_x
 193         b.ge    3b                      /* At least one more whole block.  */
 194         ands    count, count, zva_bits_x        /* Bytes still unset, < zva_len.  */
 195         b.ne    .Ltail_maybe_long       /* Finish with ordinary stores (A_l = 0).  */
 196         ret
 197 END(memset)