2 * Copyright (c) 2004 Olivier Houchard
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed for the NetBSD Project by
43 * Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 * or promote products derived from this software without specific prior
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution.
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
89 #include <machine/asm.h>
90 __FBSDID("$FreeBSD$");
97 .word _C_LABEL(_arm_memcpy)
99 .word _C_LABEL(_arm_bzero)
101 .word _C_LABEL(_min_memcpy_size)
103 .word _C_LABEL(_min_bzero_size)
105 * memset: Sets a block of memory to the specified value
110 * r2 - number of bytes to write
115 /* LINTSTUB: Func: void bzero(void *, size_t) */
121 ldr r2, .L_min_bzero_size
125 stmfd sp!, {r0, r1, lr}
130 ldmfd sp!, {r0, r1, lr}
136 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
138 and r3, r1, #0xff /* We deal with bytes */
141 cmp r1, #0x04 /* Do we have less than 4 bytes */
143 blt .Lmemset_lessthanfour
145 /* Ok first we will word align the address */
146 ands r2, ip, #0x03 /* Get the bottom two bits */
147 bne .Lmemset_wordunaligned /* The address is not word aligned */
149 /* We are now word aligned */
150 .Lmemset_wordaligned:
151 orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
152 tst ip, #0x04 /* Quad-align for armv5e */
153 orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
154 subne r1, r1, #0x04 /* Quad-align if necessary */
155 strne r3, [ip], #0x04
157 blt .Lmemset_loop4 /* If less than 16 then use words */
158 mov r2, r3 /* Duplicate data */
159 cmp r1, #0x80 /* If < 128 then skip the big loop */
162 /* Do 128 bytes at a time */
165 strdge r2, [ip], #0x08
166 strdge r2, [ip], #0x08
167 strdge r2, [ip], #0x08
168 strdge r2, [ip], #0x08
169 strdge r2, [ip], #0x08
170 strdge r2, [ip], #0x08
171 strdge r2, [ip], #0x08
172 strdge r2, [ip], #0x08
173 strdge r2, [ip], #0x08
174 strdge r2, [ip], #0x08
175 strdge r2, [ip], #0x08
176 strdge r2, [ip], #0x08
177 strdge r2, [ip], #0x08
178 strdge r2, [ip], #0x08
179 strdge r2, [ip], #0x08
180 strdge r2, [ip], #0x08
182 RETeq /* Zero length so just exit */
184 add r1, r1, #0x80 /* Adjust for extra sub */
186 /* Do 32 bytes at a time */
189 strdge r2, [ip], #0x08
190 strdge r2, [ip], #0x08
191 strdge r2, [ip], #0x08
192 strdge r2, [ip], #0x08
194 RETeq /* Zero length so just exit */
196 adds r1, r1, #0x10 /* Partially adjust for extra sub */
198 /* Deal with 16 bytes or more */
199 strdge r2, [ip], #0x08
200 strdge r2, [ip], #0x08
201 RETeq /* Zero length so just exit */
203 addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
205 /* We have at least 4 bytes so copy as words */
208 strge r3, [ip], #0x04
210 RETeq /* Zero length so just exit */
212 /* Compensate for 64-bit alignment check */
217 strb r3, [ip], #0x01 /* Set 1 byte */
218 strbge r3, [ip], #0x01 /* Set another byte */
219 strbgt r3, [ip] /* and a third */
222 .Lmemset_wordunaligned:
224 strb r3, [ip], #0x01 /* Set 1 byte */
226 strbge r3, [ip], #0x01 /* Set another byte */
228 strbgt r3, [ip], #0x01 /* and a third */
229 cmp r1, #0x04 /* More than 4 bytes left? */
230 bge .Lmemset_wordaligned /* Yup */
232 .Lmemset_lessthanfour:
234 RETeq /* Zero length so exit */
235 strb r3, [ip], #0x01 /* Set 1 byte */
237 strbge r3, [ip], #0x01 /* Set another byte */
238 strbgt r3, [ip] /* and a third */
249 /* Are both addresses aligned the same way? */
252 RETeq /* len == 0, or same addresses! */
255 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
257 /* Word-align the addresses, if necessary */
260 add r3, r3, r3, lsl #1
261 addne pc, pc, r3, lsl #3
264 /* Compare up to 3 bytes */
272 /* Compare up to 2 bytes */
288 /* Compare 4 bytes at a time, if possible */
290 bcc .Lmemcmp_bytewise
291 .Lmemcmp_word_aligned:
296 beq .Lmemcmp_word_aligned
299 /* Correct for extra subtraction, and check if done */
301 cmpeq r0, #0x00 /* If done, did all bytes match? */
302 RETeq /* Yup. Just return */
304 /* Re-do the final word byte-wise */
315 beq .Lmemcmp_bytewise2
320 * 6 byte compares are very common, thanks to the network stack.
321 * This code is hand-scheduled to reduce the number of stalls for
322 * load results. Everything else being equal, this will be ~32%
323 * faster than a byte-wise memcmp.
327 ldrb r3, [r1, #0x00] /* r3 = b2#0 */
328 ldrb r0, [ip, #0x00] /* r0 = b1#0 */
329 ldrb r2, [r1, #0x01] /* r2 = b2#1 */
330 subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
331 ldrbeq r3, [ip, #0x01] /* r3 = b1#1 */
332 RETne /* Return if mismatch on #0 */
333 subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
334 ldrbeq r3, [r1, #0x02] /* r3 = b2#2 */
335 ldrbeq r0, [ip, #0x02] /* r0 = b1#2 */
336 RETne /* Return if mismatch on #1 */
337 ldrb r2, [r1, #0x03] /* r2 = b2#3 */
338 subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
339 ldrbeq r3, [ip, #0x03] /* r3 = b1#3 */
340 RETne /* Return if mismatch on #2 */
341 subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
342 ldrbeq r3, [r1, #0x04] /* r3 = b2#4 */
343 ldrbeq r0, [ip, #0x04] /* r0 = b1#4 */
344 RETne /* Return if mismatch on #3 */
345 ldrb r2, [r1, #0x05] /* r2 = b2#5 */
346 subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
347 ldrbeq r3, [ip, #0x05] /* r3 = b1#5 */
348 RETne /* Return if mismatch on #4 */
349 sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
354 /* switch the source and destination registers */
359 /* Do the buffers overlap? */
361 RETeq /* Bail now if src/dst are the same */
362 subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
363 subcs r3, r1, r0 /* if (src > dsr) r3 = src - dst */
364 cmp r3, r2 /* if (r3 < len) we have an overlap */
365 bcc PIC_SYM(_C_LABEL(memcpy), PLT)
367 /* Determine copy direction */
369 bcc .Lmemmove_backwards
371 moveq r0, #0 /* Quick abort for len=0 */
374 stmdb sp!, {r0, lr} /* memmove() returns dest addr */
376 blt .Lmemmove_fl4 /* less than 4 bytes */
378 bne .Lmemmove_fdestul /* oh unaligned destination addr */
380 bne .Lmemmove_fsrcul /* oh unaligned source addr */
383 /* We have aligned source and destination */
385 blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
387 blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
388 stmdb sp!, {r4} /* borrow r4 */
390 /* blat 32 bytes at a time */
391 /* XXX for really big copies perhaps we should use more registers */
393 ldmia r1!, {r3, r4, r12, lr}
394 stmia r0!, {r3, r4, r12, lr}
395 ldmia r1!, {r3, r4, r12, lr}
396 stmia r0!, {r3, r4, r12, lr}
398 bge .Lmemmove_floop32
401 ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
402 stmiage r0!, {r3, r4, r12, lr}
404 ldmia sp!, {r4} /* return r4 */
409 /* blat 12 bytes at a time */
411 ldmiage r1!, {r3, r12, lr}
412 stmiage r0!, {r3, r12, lr}
414 bge .Lmemmove_floop12
423 ldmiage r1!, {r3, r12}
424 stmiage r0!, {r3, r12}
428 /* less than 4 bytes to go */
430 ldmiaeq sp!, {r0, pc} /* done */
432 /* copy the crud byte at a time */
442 /* erg - unaligned destination */
447 /* align destination with byte copies */
455 blt .Lmemmove_fl4 /* less the 4 bytes */
458 beq .Lmemmove_ft8 /* we have an aligned source */
460 /* erg - unaligned source */
461 /* This is where it gets nasty ... */
466 bgt .Lmemmove_fsrcul3
467 beq .Lmemmove_fsrcul2
469 blt .Lmemmove_fsrcul1loop4
473 .Lmemmove_fsrcul1loop16:
475 ldmia r1!, {r4, r5, r12, lr}
476 orr r3, r3, r4, lsl #24
478 orr r4, r4, r5, lsl #24
480 orr r5, r5, r12, lsl #24
482 orr r12, r12, lr, lsl #24
483 stmia r0!, {r3-r5, r12}
485 bge .Lmemmove_fsrcul1loop16
488 blt .Lmemmove_fsrcul1l4
490 .Lmemmove_fsrcul1loop4:
493 orr r12, r12, lr, lsl #24
496 bge .Lmemmove_fsrcul1loop4
504 blt .Lmemmove_fsrcul2loop4
508 .Lmemmove_fsrcul2loop16:
510 ldmia r1!, {r4, r5, r12, lr}
511 orr r3, r3, r4, lsl #16
513 orr r4, r4, r5, lsl #16
515 orr r5, r5, r12, lsl #16
516 mov r12, r12, lsr #16
517 orr r12, r12, lr, lsl #16
518 stmia r0!, {r3-r5, r12}
520 bge .Lmemmove_fsrcul2loop16
523 blt .Lmemmove_fsrcul2l4
525 .Lmemmove_fsrcul2loop4:
528 orr r12, r12, lr, lsl #16
531 bge .Lmemmove_fsrcul2loop4
539 blt .Lmemmove_fsrcul3loop4
543 .Lmemmove_fsrcul3loop16:
545 ldmia r1!, {r4, r5, r12, lr}
546 orr r3, r3, r4, lsl #8
548 orr r4, r4, r5, lsl #8
550 orr r5, r5, r12, lsl #8
551 mov r12, r12, lsr #24
552 orr r12, r12, lr, lsl #8
553 stmia r0!, {r3-r5, r12}
555 bge .Lmemmove_fsrcul3loop16
558 blt .Lmemmove_fsrcul3l4
560 .Lmemmove_fsrcul3loop4:
563 orr r12, r12, lr, lsl #8
566 bge .Lmemmove_fsrcul3loop4
576 blt .Lmemmove_bl4 /* less than 4 bytes */
578 bne .Lmemmove_bdestul /* oh unaligned destination addr */
580 bne .Lmemmove_bsrcul /* oh unaligned source addr */
583 /* We have aligned source and destination */
585 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
587 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
590 /* blat 32 bytes at a time */
591 /* XXX for really big copies perhaps we should use more registers */
593 ldmdb r1!, {r3, r4, r12, lr}
594 stmdb r0!, {r3, r4, r12, lr}
595 ldmdb r1!, {r3, r4, r12, lr}
596 stmdb r0!, {r3, r4, r12, lr}
598 bge .Lmemmove_bloop32
602 ldmdbge r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
603 stmdbge r0!, {r3, r4, r12, lr}
606 ldmdbge r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
607 stmdbge r0!, {r3, r12, lr}
617 ldmdbge r1!, {r3, r12}
618 stmdbge r0!, {r3, r12}
622 /* less than 4 bytes to go */
626 /* copy the crud byte at a time */
630 ldrbge r3, [r1, #-1]!
631 strbge r3, [r0, #-1]!
632 ldrbgt r3, [r1, #-1]!
633 strbgt r3, [r0, #-1]!
636 /* erg - unaligned destination */
640 /* align destination with byte copies */
643 ldrbge r3, [r1, #-1]!
644 strbge r3, [r0, #-1]!
645 ldrbgt r3, [r1, #-1]!
646 strbgt r3, [r0, #-1]!
648 blt .Lmemmove_bl4 /* less than 4 bytes to go */
650 beq .Lmemmove_bt8 /* we have an aligned source */
652 /* erg - unaligned source */
653 /* This is where it gets nasty ... */
658 blt .Lmemmove_bsrcul1
659 beq .Lmemmove_bsrcul2
661 blt .Lmemmove_bsrcul3loop4
663 stmdb sp!, {r4, r5, lr}
665 .Lmemmove_bsrcul3loop16:
667 ldmdb r1!, {r3-r5, r12}
668 orr lr, lr, r12, lsr #24
670 orr r12, r12, r5, lsr #24
672 orr r5, r5, r4, lsr #24
674 orr r4, r4, r3, lsr #24
675 stmdb r0!, {r4, r5, r12, lr}
677 bge .Lmemmove_bsrcul3loop16
678 ldmia sp!, {r4, r5, lr}
680 blt .Lmemmove_bsrcul3l4
682 .Lmemmove_bsrcul3loop4:
685 orr r12, r12, r3, lsr #24
688 bge .Lmemmove_bsrcul3loop4
696 blt .Lmemmove_bsrcul2loop4
698 stmdb sp!, {r4, r5, lr}
700 .Lmemmove_bsrcul2loop16:
702 ldmdb r1!, {r3-r5, r12}
703 orr lr, lr, r12, lsr #16
704 mov r12, r12, lsl #16
705 orr r12, r12, r5, lsr #16
707 orr r5, r5, r4, lsr #16
709 orr r4, r4, r3, lsr #16
710 stmdb r0!, {r4, r5, r12, lr}
712 bge .Lmemmove_bsrcul2loop16
713 ldmia sp!, {r4, r5, lr}
715 blt .Lmemmove_bsrcul2l4
717 .Lmemmove_bsrcul2loop4:
720 orr r12, r12, r3, lsr #16
723 bge .Lmemmove_bsrcul2loop4
731 blt .Lmemmove_bsrcul1loop4
733 stmdb sp!, {r4, r5, lr}
735 .Lmemmove_bsrcul1loop32:
737 ldmdb r1!, {r3-r5, r12}
738 orr lr, lr, r12, lsr #8
739 mov r12, r12, lsl #24
740 orr r12, r12, r5, lsr #8
742 orr r5, r5, r4, lsr #8
744 orr r4, r4, r3, lsr #8
745 stmdb r0!, {r4, r5, r12, lr}
747 bge .Lmemmove_bsrcul1loop32
748 ldmia sp!, {r4, r5, lr}
750 blt .Lmemmove_bsrcul1l4
752 .Lmemmove_bsrcul1loop4:
755 orr r12, r12, r3, lsr #8
758 bge .Lmemmove_bsrcul1loop4
766 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
770 ble .Lmemcpy_short /* <= 12 bytes */
772 #if FLASHADDR > PHYSADDR
782 ldr r3, .L_arm_memcpy
786 ldr r3, .L_min_memcpy_size
790 stmfd sp!, {r0-r2, r4, lr}
792 ldr r4, .L_arm_memcpy
796 ldmfd sp!, {r0-r2, r4, lr}
799 mov r3, r0 /* We must not clobber r0 */
801 /* Word-align the destination buffer */
802 ands ip, r3, #0x03 /* Already word aligned? */
803 beq .Lmemcpy_wordaligned /* Yup */
808 ldrble ip, [r1], #0x01
810 strble ip, [r3], #0x01
811 ldrblt ip, [r1], #0x01
813 strblt ip, [r3], #0x01
815 /* Destination buffer is now word aligned */
816 .Lmemcpy_wordaligned:
817 ands ip, r1, #0x03 /* Is src also word-aligned? */
818 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
820 /* Quad-align the destination buffer */
821 tst r3, #0x07 /* Already quad aligned? */
822 ldrne ip, [r1], #0x04
823 stmfd sp!, {r4-r9} /* Free up some registers */
825 strne ip, [r3], #0x04
827 /* Destination buffer quad aligned, source is at least word aligned */
829 blt .Lmemcpy_w_lessthan128
831 /* Copy 128 bytes at a time */
833 ldr r4, [r1], #0x04 /* LD:00-03 */
834 ldr r5, [r1], #0x04 /* LD:04-07 */
835 pld [r1, #0x18] /* Prefetch 0x20 */
836 ldr r6, [r1], #0x04 /* LD:08-0b */
837 ldr r7, [r1], #0x04 /* LD:0c-0f */
838 ldr r8, [r1], #0x04 /* LD:10-13 */
839 ldr r9, [r1], #0x04 /* LD:14-17 */
840 strd r4, [r3], #0x08 /* ST:00-07 */
841 ldr r4, [r1], #0x04 /* LD:18-1b */
842 ldr r5, [r1], #0x04 /* LD:1c-1f */
843 strd r6, [r3], #0x08 /* ST:08-0f */
844 ldr r6, [r1], #0x04 /* LD:20-23 */
845 ldr r7, [r1], #0x04 /* LD:24-27 */
846 pld [r1, #0x18] /* Prefetch 0x40 */
847 strd r8, [r3], #0x08 /* ST:10-17 */
848 ldr r8, [r1], #0x04 /* LD:28-2b */
849 ldr r9, [r1], #0x04 /* LD:2c-2f */
850 strd r4, [r3], #0x08 /* ST:18-1f */
851 ldr r4, [r1], #0x04 /* LD:30-33 */
852 ldr r5, [r1], #0x04 /* LD:34-37 */
853 strd r6, [r3], #0x08 /* ST:20-27 */
854 ldr r6, [r1], #0x04 /* LD:38-3b */
855 ldr r7, [r1], #0x04 /* LD:3c-3f */
856 strd r8, [r3], #0x08 /* ST:28-2f */
857 ldr r8, [r1], #0x04 /* LD:40-43 */
858 ldr r9, [r1], #0x04 /* LD:44-47 */
859 pld [r1, #0x18] /* Prefetch 0x60 */
860 strd r4, [r3], #0x08 /* ST:30-37 */
861 ldr r4, [r1], #0x04 /* LD:48-4b */
862 ldr r5, [r1], #0x04 /* LD:4c-4f */
863 strd r6, [r3], #0x08 /* ST:38-3f */
864 ldr r6, [r1], #0x04 /* LD:50-53 */
865 ldr r7, [r1], #0x04 /* LD:54-57 */
866 strd r8, [r3], #0x08 /* ST:40-47 */
867 ldr r8, [r1], #0x04 /* LD:58-5b */
868 ldr r9, [r1], #0x04 /* LD:5c-5f */
869 strd r4, [r3], #0x08 /* ST:48-4f */
870 ldr r4, [r1], #0x04 /* LD:60-63 */
871 ldr r5, [r1], #0x04 /* LD:64-67 */
872 pld [r1, #0x18] /* Prefetch 0x80 */
873 strd r6, [r3], #0x08 /* ST:50-57 */
874 ldr r6, [r1], #0x04 /* LD:68-6b */
875 ldr r7, [r1], #0x04 /* LD:6c-6f */
876 strd r8, [r3], #0x08 /* ST:58-5f */
877 ldr r8, [r1], #0x04 /* LD:70-73 */
878 ldr r9, [r1], #0x04 /* LD:74-77 */
879 strd r4, [r3], #0x08 /* ST:60-67 */
880 ldr r4, [r1], #0x04 /* LD:78-7b */
881 ldr r5, [r1], #0x04 /* LD:7c-7f */
882 strd r6, [r3], #0x08 /* ST:68-6f */
883 strd r8, [r3], #0x08 /* ST:70-77 */
885 strd r4, [r3], #0x08 /* ST:78-7f */
886 bge .Lmemcpy_w_loop128
888 .Lmemcpy_w_lessthan128:
889 adds r2, r2, #0x80 /* Adjust for extra sub */
891 RETeq /* Return now if done */
893 blt .Lmemcpy_w_lessthan32
895 /* Copy 32 bytes at a time */
911 bge .Lmemcpy_w_loop32
913 .Lmemcpy_w_lessthan32:
914 adds r2, r2, #0x20 /* Adjust for extra sub */
916 RETeq /* Return now if done */
920 addne pc, pc, r4, lsl #1
923 /* At least 24 bytes remaining */
929 /* At least 16 bytes remaining */
935 /* At least 8 bytes remaining */
941 /* Less than 8 bytes remaining */
943 RETeq /* Return now if done */
945 ldrge ip, [r1], #0x04
946 strge ip, [r3], #0x04
947 RETeq /* Return now if done */
951 ldrbge r2, [r1], #0x01
954 strbge r2, [r3], #0x01
957 /* Place a literal pool here for the above ldr instructions to use */
962 * At this point, it has not been possible to word align both buffers.
963 * The destination buffer is word aligned, but the source buffer is not.
974 .Lmemcpy_bad1_loop16:
981 orr r4, r4, r5, lsl #24
983 orr r5, r5, r6, lsl #24
985 orr r6, r6, r7, lsl #24
987 orr r7, r7, ip, lsl #24
994 bge .Lmemcpy_bad1_loop16
998 RETeq /* Return now if done */
1001 blt .Lmemcpy_bad_done
1003 .Lmemcpy_bad1_loop4:
1007 orr r4, r4, ip, lsl #24
1009 bge .Lmemcpy_bad1_loop4
1013 .Lmemcpy_bad2_loop16:
1020 orr r4, r4, r5, lsl #16
1022 orr r5, r5, r6, lsl #16
1024 orr r6, r6, r7, lsl #16
1026 orr r7, r7, ip, lsl #16
1033 bge .Lmemcpy_bad2_loop16
1036 ldmfdeq sp!, {r4-r7}
1037 RETeq /* Return now if done */
1040 blt .Lmemcpy_bad_done
1042 .Lmemcpy_bad2_loop4:
1046 orr r4, r4, ip, lsl #16
1048 bge .Lmemcpy_bad2_loop4
1052 .Lmemcpy_bad3_loop16:
1059 orr r4, r4, r5, lsl #8
1061 orr r5, r5, r6, lsl #8
1063 orr r6, r6, r7, lsl #8
1065 orr r7, r7, ip, lsl #8
1072 bge .Lmemcpy_bad3_loop16
1075 ldmfdeq sp!, {r4-r7}
1076 RETeq /* Return now if done */
1079 blt .Lmemcpy_bad_done
1081 .Lmemcpy_bad3_loop4:
1085 orr r4, r4, ip, lsl #8
1087 bge .Lmemcpy_bad3_loop4
1094 ldrb ip, [r1], #0x01
1096 ldrbge r2, [r1], #0x01
1097 strb ip, [r3], #0x01
1099 strbge r2, [r3], #0x01
1105 * Handle short copies (less than 16 bytes), possibly misaligned.
1106 * Some of these are *very* common, thanks to the network stack,
1107 * and so are handled specially.
1110 add pc, pc, r2, lsl #2
1113 b .Lmemcpy_bytewise /* 0x01 */
1114 b .Lmemcpy_bytewise /* 0x02 */
1115 b .Lmemcpy_bytewise /* 0x03 */
1116 b .Lmemcpy_4 /* 0x04 */
1117 b .Lmemcpy_bytewise /* 0x05 */
1118 b .Lmemcpy_6 /* 0x06 */
1119 b .Lmemcpy_bytewise /* 0x07 */
1120 b .Lmemcpy_8 /* 0x08 */
1121 b .Lmemcpy_bytewise /* 0x09 */
1122 b .Lmemcpy_bytewise /* 0x0a */
1123 b .Lmemcpy_bytewise /* 0x0b */
1124 b .Lmemcpy_c /* 0x0c */
1126 mov r3, r0 /* We must not clobber r0 */
1127 ldrb ip, [r1], #0x01
1128 1: subs r2, r2, #0x01
1129 strb ip, [r3], #0x01
1130 ldrbne ip, [r1], #0x01
1134 /******************************************************************************
1135 * Special case for 4 byte copies
1137 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
1138 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
1142 orr r2, r2, r0, lsl #2
1145 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
1148 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1156 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1158 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1159 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
1160 mov r3, r3, lsr #8 /* r3 = .210 */
1161 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1167 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1169 ldrh r3, [r1, #0x02]
1171 orr r3, r2, r3, lsl #16
1177 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1179 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
1180 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
1181 mov r3, r3, lsr #24 /* r3 = ...0 */
1182 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1188 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1194 strb r1, [r0, #0x03]
1195 strh r3, [r0, #0x01]
1200 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1203 ldrh r3, [r1, #0x01]
1204 ldrb r1, [r1, #0x03]
1206 strh r3, [r0, #0x01]
1207 strb r1, [r0, #0x03]
1212 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1214 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1215 ldrh r3, [r1, #0x02] /* LE:r3 = ..23 LE:r3 = ..32 */
1217 mov r2, r2, lsr #8 /* r2 = ...1 */
1218 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1219 mov r3, r3, lsr #8 /* r3 = ...3 */
1220 strh r2, [r0, #0x01]
1221 strb r3, [r0, #0x03]
1226 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1229 ldrh r3, [r1, #0x01]
1230 ldrb r1, [r1, #0x03]
1232 strh r3, [r0, #0x01]
1233 strb r1, [r0, #0x03]
1238 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1243 strh r3, [r0, #0x02]
1248 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1250 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1251 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
1252 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1254 mov r2, r2, lsr #24 /* r2 = ...2 */
1255 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
1256 strh r2, [r0, #0x02]
1261 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1264 ldrh r3, [r1, #0x02]
1266 strh r3, [r0, #0x02]
1271 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1273 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
1274 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1275 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
1276 strh r1, [r0, #0x02]
1277 mov r3, r3, lsl #8 /* r3 = 321. */
1278 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
1284 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1286 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1290 strh r3, [r0, #0x01]
1291 strb r1, [r0, #0x03]
1296 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1299 ldrh r3, [r1, #0x01]
1300 ldrb r1, [r1, #0x03]
1302 strh r3, [r0, #0x01]
1303 strb r1, [r0, #0x03]
1308 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1310 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1311 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1313 mov r2, r2, lsr #8 /* r2 = ...1 */
1314 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1315 strh r2, [r0, #0x01]
1316 mov r3, r3, lsr #8 /* r3 = ...3 */
1317 strb r3, [r0, #0x03]
1322 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1325 ldrh r3, [r1, #0x01]
1326 ldrb r1, [r1, #0x03]
1328 strh r3, [r0, #0x01]
1329 strb r1, [r0, #0x03]
1334 /******************************************************************************
1335 * Special case for 6 byte copies
1337 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
1338 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
1342 orr r2, r2, r0, lsl #2
1345 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
1348 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1351 ldrh r3, [r1, #0x04]
1353 strh r3, [r0, #0x04]
1358 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1360 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1361 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
1362 mov r2, r2, lsr #8 /* r2 = .210 */
1363 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
1364 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
1366 strh r3, [r0, #0x04]
1371 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1373 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1374 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1375 mov r1, r3, lsr #16 /* r1 = ..54 */
1376 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1378 strh r1, [r0, #0x04]
1383 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1385 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1386 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
1387 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r3 = xxx5 */
1388 mov r2, r2, lsr #24 /* r2 = ...0 */
1389 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1390 mov r1, r1, lsl #8 /* r1 = xx5. */
1391 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
1393 strh r1, [r0, #0x04]
1398 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1400 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1401 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
1402 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1403 strh r1, [r0, #0x01]
1405 mov r3, r3, lsr #24 /* r3 = ...3 */
1406 orr r3, r3, r2, lsl #8 /* r3 = .543 */
1407 mov r2, r2, lsr #8 /* r2 = ...5 */
1408 strh r3, [r0, #0x03]
1409 strb r2, [r0, #0x05]
1414 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1417 ldrh r3, [r1, #0x01]
1418 ldrh ip, [r1, #0x03]
1419 ldrb r1, [r1, #0x05]
1421 strh r3, [r0, #0x01]
1422 strh ip, [r0, #0x03]
1423 strb r1, [r0, #0x05]
1428 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1430 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1431 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1434 strb r3, [r0, #0x05]
1435 mov r3, r1, lsr #8 /* r3 = .543 */
1436 strh r3, [r0, #0x03]
1437 mov r3, r2, lsr #8 /* r3 = ...1 */
1438 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
1439 strh r3, [r0, #0x01]
1444 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1447 ldrh r3, [r1, #0x01]
1448 ldrh ip, [r1, #0x03]
1449 ldrb r1, [r1, #0x05]
1451 strh r3, [r0, #0x01]
1452 strh ip, [r0, #0x03]
1453 strb r1, [r0, #0x05]
1458 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1460 ldrh r2, [r1, #0x04] /* r2 = ..54 */
1461 ldr r3, [r1] /* r3 = 3210 */
1462 mov r2, r2, lsl #16 /* r2 = 54.. */
1463 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
1470 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1472 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1473 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
1474 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1475 mov r2, r2, lsl #8 /* r2 = 543. */
1476 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
1483 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1493 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1495 ldrb r3, [r1] /* r3 = ...0 */
1496 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1497 ldrb r1, [r1, #0x05] /* r1 = ...5 */
1498 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1499 mov r1, r1, lsl #24 /* r1 = 5... */
1500 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
1507 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1509 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1510 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
1512 mov r2, r2, lsr #8 /* r2 = .321 */
1513 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
1514 mov r1, r1, lsr #8 /* r1 = ...5 */
1516 strb r1, [r0, #0x05]
1521 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1524 ldrh r3, [r1, #0x01]
1525 ldrh ip, [r1, #0x03]
1526 ldrb r1, [r1, #0x05]
1528 strh r3, [r0, #0x01]
1529 strh ip, [r0, #0x03]
1530 strb r1, [r0, #0x05]
1535 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1537 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1538 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1540 mov r2, r2, lsr #8 /* r2 = ...1 */
1541 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
1542 mov r1, r1, lsr #24 /* r1 = ...5 */
1544 strb r1, [r0, #0x05]
1549 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1553 ldrb r1, [r1, #0x05]
1556 strb r1, [r0, #0x05]
1561 /******************************************************************************
1562 * Special case for 8 byte copies
1564 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
1565 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
1569 orr r2, r2, r0, lsl #2
1572 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
1575 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1585 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1587 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1588 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
1589 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1590 mov r3, r3, lsr #8 /* r3 = .210 */
1591 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1592 mov r1, r1, lsl #24 /* r1 = 7... */
1593 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
1600 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1602 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1603 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1604 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1605 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1606 mov r3, r3, lsr #16 /* r3 = ..54 */
1607 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
1614 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1616 ldrb r3, [r1] /* r3 = ...0 */
1617 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1618 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
1619 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1620 mov r2, r2, lsr #24 /* r2 = ...4 */
1621 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
1628 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1630 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1631 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
1633 mov r1, r2, lsr #24 /* r1 = ...7 */
1634 strb r1, [r0, #0x07]
1635 mov r1, r3, lsr #8 /* r1 = .321 */
1636 mov r3, r3, lsr #24 /* r3 = ...3 */
1637 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
1638 strh r1, [r0, #0x01]
1644 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1647 ldrh r3, [r1, #0x01]
1649 ldrb r1, [r1, #0x07]
1651 strh r3, [r0, #0x01]
1653 strb r1, [r0, #0x07]
1658 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1660 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1661 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1662 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1663 strb r2, [r0] /* 0 */
1664 mov ip, r1, lsr #8 /* ip = ...7 */
1665 strb ip, [r0, #0x07] /* 7 */
1666 mov ip, r2, lsr #8 /* ip = ...1 */
1667 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1668 mov r3, r3, lsr #8 /* r3 = .543 */
1669 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
1670 strh ip, [r0, #0x01] /* store bytes 1-2 */
1676 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1678 ldrb r3, [r1] /* r3 = ...0 */
1679 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1680 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
1681 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1683 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
1684 strh ip, [r0, #0x01] /* store bytes 1-2 */
1685 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
1687 strb r1, [r0, #0x07] /* store byte 7 */
1692 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1694 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1695 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1696 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1698 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
1699 mov r3, r3, lsr #16 /* r3 = ..76 */
1701 strh r3, [r0, #0x06] /* store bytes 6-7 */
1706 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1708 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1709 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1710 ldrb ip, [r1, #0x07] /* ip = ...7 */
1711 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */ /* NOTE(review): a store of r1 appears elided between these lines — r1 is overwritten next; confirm against full source */
1713 mov r1, r2, lsr #24 /* r1 = ...2 */
1714 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
1715 mov r3, r3, lsr #24 /* r3 = ...6 */
1716 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
1718 strh r3, [r0, #0x06] /* store bytes 6-7 */
1723 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1727 ldrh r3, [r1, #0x06] /* r3 = ..76 */
1730 strh r3, [r0, #0x06] /* store bytes 6-7 */
1735 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1737 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
1738 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1739 ldrb ip, [r1] /* ip = ...0 */
1740 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
1741 strh r1, [r0, #0x06] /* store bytes 6-7 */
1742 mov r3, r3, lsl #24 /* r3 = 5... */
1743 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
1744 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
1751 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1753 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1754 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1755 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
1756 strh r1, [r0, #0x05] /* store bytes 5-6 */
1758 mov r1, r3, lsr #24 /* r1 = ...7 */
1759 strb r1, [r0, #0x07] /* store byte 7 */
1760 mov r2, r2, lsr #8 /* r2 = .321 */
1761 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
1767 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1769 ldrb r3, [r1] /* r3 = ...0 */
1770 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
1771 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
1772 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1774 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
1775 strh r3, [r0, #0x05] /* store bytes 5-6 */
1776 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
1778 strb r1, [r0, #0x07] /* store byte 7 */
1783 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1785 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1786 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1787 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1789 mov ip, r2, lsr #8 /* ip = ...1 */
1790 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1791 mov r2, r1, lsr #8 /* r2 = ...7 */
1792 strb r2, [r0, #0x07] /* store byte 7 */
1793 mov r1, r1, lsl #8 /* r1 = .76. */
1794 orr r1, r1, r3, lsr #24 /* r1 = .765 */
1796 strh r1, [r0, #0x05] /* store bytes 5-6 */
1801 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1805 ldrh r3, [r1, #0x05] /* r3 = ..65 */
1806 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1809 strh r3, [r0, #0x05] /* store bytes 5-6 */
1810 strb r1, [r0, #0x07] /* store byte 7 */
1814 /******************************************************************************
1815 * Special case for 12 byte copies
1817 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
1818 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2 /* pad each case block out to 128 bytes */
1822 orr r2, r2, r0, lsl #2 /* NOTE(review): presumably builds the 4-bit case index from dst (high 2 bits) and src (low 2 bits) alignment — confirm against full source */
1825 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* jump into 128-byte-spaced case table; case 0 (all aligned) falls through */
1828 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1840 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1842 ldrb r2, [r1, #0xb] /* r2 = ...B */
1843 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1844 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1845 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */ /* src-1 is word-aligned: src&3==1; x = don't-care byte */
1846 mov r2, r2, lsl #24 /* r2 = B... */
1847 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
1849 mov r2, ip, lsl #24 /* r2 = 7... */ /* NOTE(review): a store of r2 (=BA98) appears elided before this line — r2 is overwritten here; confirm against full source */
1850 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
1851 mov r1, r1, lsr #8 /* r1 = .210 */
1852 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
1859 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1861 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1862 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1863 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1864 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1865 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1867 mov r3, r3, lsr #16 /* r3 = ..54 */
1868 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
1869 mov r1, r1, lsl #16 /* r1 = BA.. */
1870 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
1877 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1879 ldrb r2, [r1] /* r2 = ...0 */
1880 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */ /* src+1 is word-aligned: src&3==3 in this case */
1881 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1882 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */ /* x = don't-care byte beyond the 12-byte copy */
1883 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1885 mov r3, r3, lsr #24 /* r3 = ...4 */
1886 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
1887 mov r1, r1, lsl #8 /* r1 = BA9. */
1888 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
1895 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1897 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1898 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1899 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
1900 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1901 strh r1, [r0, #0x01] /* store bytes 1-2 */
1903 mov r1, r2, lsr #24 /* r1 = ...3 */
1904 orr r2, r1, r3, lsl #8 /* r2 = 6543 */
1905 mov r1, r3, lsr #24 /* r1 = ...7 */
1906 orr r1, r1, ip, lsl #8 /* r1 = A987 */
1907 mov ip, ip, lsr #24 /* ip = ...B */
1910 strb ip, [r0, #0x0b] /* store byte 11 */
1915 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1918 ldrh r3, [r1, #0x01] /* r3 = ..21 (src+1 is 16-bit aligned here) */
1922 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1923 strh r3, [r0, #0x01] /* store bytes 1-2 */
1926 strb r1, [r0, #0x0b] /* store byte 11 */
1931 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1933 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1934 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1935 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1936 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1938 mov r2, r2, lsr #8 /* r2 = ...1 */
1939 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
1940 strh r2, [r0, #0x01] /* store bytes 1-2 */
1941 mov r2, r3, lsr #8 /* r2 = .543 */
1942 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
1943 mov r2, ip, lsr #8 /* r2 = .987 */
1944 orr r2, r2, r1, lsl #24 /* r2 = A987 */
1945 mov r1, r1, lsr #8 /* r1 = ...B */
1948 strb r1, [r0, #0x0b] /* store byte 11 */
1953 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1956 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1957 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1958 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1960 strh r3, [r0, #0x01] /* store bytes 1-2 */
1961 mov r3, r3, lsr #16 /* r3 = ..43 */
1962 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
1963 mov ip, ip, lsr #16 /* ip = ..87 */
1964 orr ip, ip, r1, lsl #16 /* ip = A987 */
1965 mov r1, r1, lsr #16 /* r1 = ..xB */
1968 strb r1, [r0, #0x0b] /* store byte 11 */
1973 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1975 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
1976 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1977 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
1978 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1980 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
1981 mov r3, r3, lsr #16 /* r3 = ..76 */
1982 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
1983 mov r2, r2, lsr #16 /* r2 = ..BA */
1986 strh r2, [r0, #0x0a] /* store bytes 10-11 */
1991 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1993 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1994 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1995 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */ /* NOTE(review): a store of ip appears elided before the reload below — ip is overwritten next; confirm against full source */
1997 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1998 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1999 mov r2, r2, lsr #24 /* r2 = ...2 */
2000 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
2001 mov r3, r3, lsr #24 /* r3 = ...6 */
2002 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
2003 mov r1, r1, lsl #8 /* r1 = ..B. */
2004 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
2007 strh r1, [r0, #0x0a] /* store bytes 10-11 */
2012 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2017 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2021 strh r1, [r0, #0x0a] /* store bytes 10-11 */
2026 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2028 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
2029 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
2030 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
2031 strh ip, [r0, #0x0a] /* store bytes 10-11 */
2032 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
2033 ldrb r1, [r1] /* r1 = ...0 */
2034 mov r2, r2, lsl #24 /* r2 = 9... */
2035 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
2036 mov r3, r3, lsl #24 /* r3 = 5... */
2037 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
2038 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
2046 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2048 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2049 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
2050 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
2052 mov r3, r2, lsr #8 /* r3 = .321 */
2053 orr r3, r3, ip, lsl #24 /* r3 = 4321 */ /* NOTE(review): a store of r3 appears elided before the next line — r3 is overwritten; confirm against full source */
2055 mov r3, ip, lsr #8 /* r3 = .765 */
2056 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
2058 mov r1, r1, lsr #8 /* r1 = .BA9 */
2059 strh r1, [r0, #0x09] /* store bytes 9-10 */
2060 mov r1, r1, lsr #16 /* r1 = ...B */
2061 strb r1, [r0, #0x0b] /* store byte 11 */
2066 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2068 ldrb r2, [r1, #0x0b] /* r2 = ...B */
2069 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
2070 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2071 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2072 strb r2, [r0, #0x0b] /* store byte 11 */
2073 mov r2, r3, lsr #16 /* r2 = ..A9 */
2074 strh r2, [r0, #0x09] /* store bytes 9-10 */
2075 mov r3, r3, lsl #16 /* r3 = 87.. */
2076 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
2077 mov ip, ip, lsl #16 /* ip = 43.. */
2078 orr ip, ip, r1, lsr #16 /* ip = 4321 */
2079 mov r1, r1, lsr #8 /* r1 = .210 */
2087 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2089 ldrh r2, [r1] /* r2 = ..10 */
2090 ldr r3, [r1, #0x02] /* r3 = 5432 */
2091 ldr ip, [r1, #0x06] /* ip = 9876 */
2092 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2094 mov r2, r2, lsr #8 /* r2 = ...1 */
2095 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2096 mov r3, r3, lsr #24 /* r3 = ...5 */
2097 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
2098 mov ip, ip, lsr #24 /* ip = ...9 */
2099 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
2100 mov r1, r1, lsr #8 /* r1 = ...B */
2103 strh ip, [r0, #0x09] /* store bytes 9-10 */
2104 strb r1, [r0, #0x0b] /* store byte 11 */
2109 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2115 ldrh r2, [r1, #0x09] /* r2 = ..A9 */
2116 ldrb r1, [r1, #0x0b] /* r1 = ...B */
2119 strh r2, [r0, #0x09] /* store bytes 9-10 */
2120 strb r1, [r0, #0x0b] /* store byte 11 */