lib/libc/arm/string/memcpy_arm.S

   1 /*      $NetBSD: memcpy_arm.S,v 1.1 2003/10/14 07:51:45 scw Exp $       */
   2
   3 /*-
   4  * Copyright (c) 1997 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Neil A. Carson and Mark Brinicombe
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <machine/asm.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 .syntax unified
  36
  37 /*
  38  * This is one fun bit of code ...
  39  * Some easy listening music is suggested while trying to understand this
  40  * code e.g. Iron Maiden
  41  *
  42  * For anyone attempting to understand it :
  43  *
  44  * The core code is implemented here with simple stubs for memcpy().
  45  *
  46  * All local labels are prefixed with Lmemcpy_
  47  * Following the prefix a label starting f is used in the forward copy code
  48  * while a label using b is used in the backwards copy code
  49  * The source and destination addresses determine whether a forward or
  50  * backward copy is performed.
  51  * Separate bits of code are used to deal with the following situations
  52  * for both the forward and backwards copy.
  53  * unaligned source address
  54  * unaligned destination address
  55  * Separate copy routines are used to produce an optimised result for each
  56  * of these cases.
  57  * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  58  * a time where possible.
  59  *
  60  * Note: r12 (aka ip) can be trashed during the function along with
  61  * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
  62  * Additional registers are preserved prior to use i.e. r4, r5 & lr
  63  *
  64  * Apologies for the state of the comments ;-)
  65  */
  66 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
  67 ENTRY(memcpy)
             /*
              * void *memcpy(void *dst, const void *src, size_t len)
              *
              * In:   r0 = dst, r1 = src, r2 = len
              * Out:  r0 = the original dst, pushed at entry and popped again
              *       by the return instructions in .Lmemcpy_l4
              * Uses: r3, r12 and lr as scratch.  r4 (aligned path) and r4/r5
              *       (unaligned-source paths) are pushed before use and
              *       popped again before leaving that path.
              *
              * r0 and r1 advance as data is copied.  r2 is kept biased below
              * the number of bytes still to copy (the bias is noted at each
              * step) so that the flags set by subs/adds can drive the
              * following branch or conditional instructions directly.
              *
              * NOTE(review): every length test is a signed one (blt/bge and
              * the ge/lt/gt condition suffixes), so a len with bit 31 set is
              * treated as "less than 4" -- presumably such lengths never
              * occur; confirm.
              */
  68         /* push dst and lr: dst is the return value, lr becomes scratch */
  69         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
  70
  71         subs    r2, r2, #4              /* r2 = len - 4 */
  72         blt     .Lmemcpy_l4             /* less than 4 bytes */
  73         ands    r12, r0, #3             /* r12 = dst & 3 */
  74         bne     .Lmemcpy_destul         /* dst is not word aligned */
  75         ands    r12, r1, #3             /* r12 = src & 3 */
  76         bne     .Lmemcpy_srcul          /* src is not word aligned */
  77
  78 .Lmemcpy_t8:
  79         /* We have aligned source and destination; r2 = remaining - 4 */
  80         subs    r2, r2, #8              /* r2 = remaining - 12 */
  81         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
  82         subs    r2, r2, #0x14           /* r2 = remaining - 32 */
  83         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
  84         stmdb   sp!, {r4}               /* borrow r4 */
  85
  86         /* blat 32 bytes at a time (two 16-byte load/store pairs) */
  87         /* XXX for really big copies perhaps we should use more registers */
  88 .Lmemcpy_loop32:
  89         ldmia   r1!, {r3, r4, r12, lr}
  90         stmia   r0!, {r3, r4, r12, lr}
  91         ldmia   r1!, {r3, r4, r12, lr}
  92         stmia   r0!, {r3, r4, r12, lr}
  93         subs    r2, r2, #0x20           /* r2 = remaining - 32 */
  94         bge     .Lmemcpy_loop32         /* at least 32 more bytes */
  95
  96         cmn     r2, #0x10               /* flags of r2 + 16: ge if >= 16 remain */
  97         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
  98         stmiage r0!, {r3, r4, r12, lr}
  99         subge   r2, r2, #0x10           /* keep r2 = remaining - 32 */
 100         ldmia   sp!, {r4}               /* return r4 */
 101
 102 .Lmemcpy_l32:
 103         adds    r2, r2, #0x14           /* r2 = remaining - 12; ge if >= 12 remain */
 104
 105         /* blat 12 bytes at a time; ge comes from the adds above or the subsge */
 106 .Lmemcpy_loop12:
 107         ldmiage r1!, {r3, r12, lr}
 108         stmiage r0!, {r3, r12, lr}
 109         subsge  r2, r2, #0x0c           /* r2 = remaining - 12 */
 110         bge     .Lmemcpy_loop12
 111
 112 .Lmemcpy_l12:
 113         adds    r2, r2, #8              /* r2 = remaining - 4 */
 114         blt     .Lmemcpy_l4             /* fewer than 4 bytes remain */
 115
 116         subs    r2, r2, #4              /* r2 = remaining - 8 */
 117         ldrlt   r3, [r1], #4            /* 4..7 remain: copy one word ... */
 118         strlt   r3, [r0], #4            /* ... after which r2 = remaining - 4 */
 119         ldmiage r1!, {r3, r12}          /* 8..11 remain: copy two words */
 120         stmiage r0!, {r3, r12}
 121         subge   r2, r2, #4              /* back to r2 = remaining - 4 */
 122
 123 .Lmemcpy_l4:
 124         /* less than 4 bytes to go; every path arrives with r2 = remaining - 4 */
 125         adds    r2, r2, #4              /* r2 = bytes left; Z set if none */
             /*
              * NOTE(review): the macro tested below ends in a single
              * underscore; if the compiler-defined name is __APCS_26__ the
              * first branch is never selected -- confirm.  The final return
              * of this block has no APCS-26 variant either.
              */
 126 #ifdef __APCS_26_
 127         ldmiaeq sp!, {r0, pc}^          /* done */
 128 #else
 129         ldmiaeq sp!, {r0, pc}           /* done: r0 = saved dst, return */
 130 #endif
 131         /* copy the crud byte at a time: 1, 2 or 3 bytes chosen by the cmp */
 132         cmp     r2, #2
 133         ldrb    r3, [r1], #1            /* first byte: always */
 134         strb    r3, [r0], #1
 135         ldrbge  r3, [r1], #1            /* second byte: if r2 >= 2 */
 136         strbge  r3, [r0], #1
 137         ldrbgt  r3, [r1], #1            /* third byte: if r2 > 2 */
 138         strbgt  r3, [r0], #1
 139         ldmia   sp!, {r0, pc}           /* r0 = saved dst, return */
 140
 141         /* erg - unaligned destination; r12 = dst & 3 (1..3), r2 = len - 4 */
 142 .Lmemcpy_destul:
 143         rsb     r12, r12, #4            /* r12 = bytes needed to align dst (1..3) */
 144         cmp     r12, #2                 /* flags pick 1, 2 or 3 byte copies below */
 145
 146         /* align destination with byte copies */
 147         ldrb    r3, [r1], #1            /* first byte: always */
 148         strb    r3, [r0], #1
 149         ldrbge  r3, [r1], #1            /* second byte: if r12 >= 2 */
 150         strbge  r3, [r0], #1
 151         ldrbgt  r3, [r1], #1            /* third byte: if r12 > 2 */
 152         strbgt  r3, [r0], #1
 153         subs    r2, r2, r12             /* r2 = remaining - 4 */
 154         blt     .Lmemcpy_l4             /* less than 4 bytes */
 155
 156         ands    r12, r1, #3             /* r12 = src & 3, now that dst is aligned */
 157         beq     .Lmemcpy_t8             /* we have an aligned source */
 158
 159         /* erg - unaligned source */
 160         /* This is where it gets nasty ... */
             /*
              * Here dst is word aligned, r12 = src & 3 (1..3) and
              * r2 = remaining - 4 (>= 0).  The source is read with aligned
              * word loads only: lr always holds the most recently loaded
              * word, and each output word is assembled from the bytes of lr
              * not yet copied plus the leading bytes of the next word.  The
              * shift directions are swapped under __ARMEB__.
              */
 161 .Lmemcpy_srcul:
 162         bic     r1, r1, #3              /* round src down to a word boundary */
 163         ldr     lr, [r1], #4            /* lr = word holding the next source bytes */
 164         cmp     r12, #2
 165         bgt     .Lmemcpy_srcul3         /* src & 3 == 3 */
 166         beq     .Lmemcpy_srcul2         /* src & 3 == 2 */
             /* src & 3 == 1: 3 bytes come from lr, 1 from the next word */
 167         cmp     r2, #0x0c               /* at least 16 bytes remaining? */
 168         blt     .Lmemcpy_srcul1loop4
 169         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
 170         stmdb   sp!, {r4, r5}           /* borrow r4, r5 */
 171
 172 .Lmemcpy_srcul1loop16:
 173 #ifdef __ARMEB__
 174         mov     r3, lr, lsl #8
 175 #else
 176         mov     r3, lr, lsr #8          /* r3 = 3 bytes carried over in lr */
 177 #endif
 178         ldmia   r1!, {r4, r5, r12, lr}  /* next 4 words; lr carries over */
 179 #ifdef __ARMEB__
 180         orr     r3, r3, r4, lsr #24
 181         mov     r4, r4, lsl #8
 182         orr     r4, r4, r5, lsr #24
 183         mov     r5, r5, lsl #8
 184         orr     r5, r5, r12, lsr #24
 185         mov     r12, r12, lsl #8
 186         orr     r12, r12, lr, lsr #24
 187 #else
 188         orr     r3, r3, r4, lsl #24
 189         mov     r4, r4, lsr #8
 190         orr     r4, r4, r5, lsl #24
 191         mov     r5, r5, lsr #8
 192         orr     r5, r5, r12, lsl #24
 193         mov     r12, r12, lsr #8
 194         orr     r12, r12, lr, lsl #24
 195 #endif
 196         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
 197         subs    r2, r2, #0x10           /* r2 = remaining - 16 */
 198         bge     .Lmemcpy_srcul1loop16
 199         ldmia   sp!, {r4, r5}           /* return r4, r5 */
 200         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
 201         blt     .Lmemcpy_srcul1l4       /* fewer than 4 bytes remain */
 202
 203 .Lmemcpy_srcul1loop4:
 204 #ifdef __ARMEB__
 205         mov     r12, lr, lsl #8
 206 #else
 207         mov     r12, lr, lsr #8         /* r12 = 3 bytes carried over in lr */
 208 #endif
 209         ldr     lr, [r1], #4            /* next source word */
 210 #ifdef __ARMEB__
 211         orr     r12, r12, lr, lsr #24
 212 #else
 213         orr     r12, r12, lr, lsl #24   /* add its first byte */
 214 #endif
 215         str     r12, [r0], #4
 216         subs    r2, r2, #4              /* r2 = remaining - 4 */
 217         bge     .Lmemcpy_srcul1loop4
 218
 219 .Lmemcpy_srcul1l4:
 220         sub     r1, r1, #3              /* r1 is past lr's word; 3 of its bytes are uncopied */
 221         b       .Lmemcpy_l4
 222
             /* src & 3 == 2: 2 bytes come from lr, 2 from the next word */
 223 .Lmemcpy_srcul2:
 224         cmp     r2, #0x0c               /* at least 16 bytes remaining? */
 225         blt     .Lmemcpy_srcul2loop4
 226         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
 227         stmdb   sp!, {r4, r5}           /* borrow r4, r5 */
 228
 229 .Lmemcpy_srcul2loop16:
 230 #ifdef __ARMEB__
 231         mov     r3, lr, lsl #16
 232 #else
 233         mov     r3, lr, lsr #16         /* r3 = 2 bytes carried over in lr */
 234 #endif
 235         ldmia   r1!, {r4, r5, r12, lr}  /* next 4 words; lr carries over */
 236 #ifdef __ARMEB__
 237         orr     r3, r3, r4, lsr #16
 238         mov     r4, r4, lsl #16
 239         orr     r4, r4, r5, lsr #16
 240         mov     r5, r5, lsl #16
 241         orr     r5, r5, r12, lsr #16
 242         mov     r12, r12, lsl #16
 243         orr     r12, r12, lr, lsr #16
 244 #else
 245         orr     r3, r3, r4, lsl #16
 246         mov     r4, r4, lsr #16
 247         orr     r4, r4, r5, lsl #16
 248         mov     r5, r5, lsr #16
 249         orr     r5, r5, r12, lsl #16
 250         mov     r12, r12, lsr #16
 251         orr     r12, r12, lr, lsl #16
 252 #endif
 253         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
 254         subs    r2, r2, #0x10           /* r2 = remaining - 16 */
 255         bge     .Lmemcpy_srcul2loop16
 256         ldmia   sp!, {r4, r5}           /* return r4, r5 */
 257         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
 258         blt     .Lmemcpy_srcul2l4       /* fewer than 4 bytes remain */
 259
 260 .Lmemcpy_srcul2loop4:
 261 #ifdef __ARMEB__
 262         mov     r12, lr, lsl #16
 263 #else
 264         mov     r12, lr, lsr #16        /* r12 = 2 bytes carried over in lr */
 265 #endif
 266         ldr     lr, [r1], #4            /* next source word */
 267 #ifdef __ARMEB__
 268         orr     r12, r12, lr, lsr #16
 269 #else
 270         orr     r12, r12, lr, lsl #16   /* add its first 2 bytes */
 271 #endif
 272         str     r12, [r0], #4
 273         subs    r2, r2, #4              /* r2 = remaining - 4 */
 274         bge     .Lmemcpy_srcul2loop4
 275
 276 .Lmemcpy_srcul2l4:
 277         sub     r1, r1, #2              /* r1 is past lr's word; 2 of its bytes are uncopied */
 278         b       .Lmemcpy_l4
 279
             /* src & 3 == 3: 1 byte comes from lr, 3 from the next word */
 280 .Lmemcpy_srcul3:
 281         cmp     r2, #0x0c               /* at least 16 bytes remaining? */
 282         blt     .Lmemcpy_srcul3loop4
 283         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
 284         stmdb   sp!, {r4, r5}           /* borrow r4, r5 */
 285
 286 .Lmemcpy_srcul3loop16:
 287 #ifdef __ARMEB__
 288         mov     r3, lr, lsl #24
 289 #else
 290         mov     r3, lr, lsr #24         /* r3 = 1 byte carried over in lr */
 291 #endif
 292         ldmia   r1!, {r4, r5, r12, lr}  /* next 4 words; lr carries over */
 293 #ifdef __ARMEB__
 294         orr     r3, r3, r4, lsr #8
 295         mov     r4, r4, lsl #24
 296         orr     r4, r4, r5, lsr #8
 297         mov     r5, r5, lsl #24
 298         orr     r5, r5, r12, lsr #8
 299         mov     r12, r12, lsl #24
 300         orr     r12, r12, lr, lsr #8
 301 #else
 302         orr     r3, r3, r4, lsl #8
 303         mov     r4, r4, lsr #24
 304         orr     r4, r4, r5, lsl #8
 305         mov     r5, r5, lsr #24
 306         orr     r5, r5, r12, lsl #8
 307         mov     r12, r12, lsr #24
 308         orr     r12, r12, lr, lsl #8
 309 #endif
 310         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
 311         subs    r2, r2, #0x10           /* r2 = remaining - 16 */
 312         bge     .Lmemcpy_srcul3loop16
 313         ldmia   sp!, {r4, r5}           /* return r4, r5 */
 314         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
 315         blt     .Lmemcpy_srcul3l4       /* fewer than 4 bytes remain */
 316
 317 .Lmemcpy_srcul3loop4:
 318 #ifdef __ARMEB__
 319         mov     r12, lr, lsl #24
 320 #else
 321         mov     r12, lr, lsr #24        /* r12 = 1 byte carried over in lr */
 322 #endif
 323         ldr     lr, [r1], #4            /* next source word */
 324 #ifdef __ARMEB__
 325         orr     r12, r12, lr, lsr #8
 326 #else
 327         orr     r12, r12, lr, lsl #8    /* add its first 3 bytes */
 328 #endif
 329         str     r12, [r0], #4
 330         subs    r2, r2, #4              /* r2 = remaining - 4 */
 331         bge     .Lmemcpy_srcul3loop4
 332
 333 .Lmemcpy_srcul3l4:
 334         sub     r1, r1, #1              /* r1 is past lr's word; 1 of its bytes is uncopied */
 335         b       .Lmemcpy_l4
 336 END(memcpy)