2 * Copyright (c) 2004 Olivier Houchard
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed for the NetBSD Project by
43 * Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 * or promote products derived from this software without specific prior
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution.
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
89 #include <machine/asm.h>
90 #include <machine/asmacros.h>
91 __FBSDID("$FreeBSD$");
96 .word _C_LABEL(_arm_memcpy)
98 .word _C_LABEL(_arm_bzero)
100 .word _C_LABEL(_min_memcpy_size)
102 .word _C_LABEL(_min_bzero_size)
104 * memset: Sets a block of memory to the specified value
109 * r2 - number of bytes to write
114 /* LINTSTUB: Func: void bzero(void *, size_t) */
120 ldr r2, .L_min_bzero_size
124 stmfd sp!, {r0, r1, lr}
129 ldmfd sp!, {r0, r1, lr}
135 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
137 and r3, r1, #0xff /* We deal with bytes */
140 cmp r1, #0x04 /* Do we have less than 4 bytes */
142 blt .Lmemset_lessthanfour
144 /* Ok first we will word align the address */
145 ands r2, ip, #0x03 /* Get the bottom two bits */
146 bne .Lmemset_wordunaligned /* The address is not word aligned */
148 /* We are now word aligned */
149 .Lmemset_wordaligned:
150 orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
152 tst ip, #0x04 /* Quad-align for armv5e */
156 orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
158 subne r1, r1, #0x04 /* Quad-align if necessary */
159 strne r3, [ip], #0x04
162 blt .Lmemset_loop4 /* If less than 16 then use words */
163 mov r2, r3 /* Duplicate data */
164 cmp r1, #0x80 /* If < 128 then skip the big loop */
167 /* Do 128 bytes at a time */
171 strged r2, [ip], #0x08
172 strged r2, [ip], #0x08
173 strged r2, [ip], #0x08
174 strged r2, [ip], #0x08
175 strged r2, [ip], #0x08
176 strged r2, [ip], #0x08
177 strged r2, [ip], #0x08
178 strged r2, [ip], #0x08
179 strged r2, [ip], #0x08
180 strged r2, [ip], #0x08
181 strged r2, [ip], #0x08
182 strged r2, [ip], #0x08
183 strged r2, [ip], #0x08
184 strged r2, [ip], #0x08
185 strged r2, [ip], #0x08
186 strged r2, [ip], #0x08
206 RETeq /* Zero length so just exit */
208 add r1, r1, #0x80 /* Adjust for extra sub */
210 /* Do 32 bytes at a time */
214 strged r2, [ip], #0x08
215 strged r2, [ip], #0x08
216 strged r2, [ip], #0x08
217 strged r2, [ip], #0x08
225 RETeq /* Zero length so just exit */
227 adds r1, r1, #0x10 /* Partially adjust for extra sub */
229 /* Deal with 16 bytes or more */
231 strged r2, [ip], #0x08
232 strged r2, [ip], #0x08
237 RETeq /* Zero length so just exit */
239 addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
241 /* We have at least 4 bytes so copy as words */
244 strge r3, [ip], #0x04
246 RETeq /* Zero length so just exit */
249 /* Compensate for 64-bit alignment check */
257 strb r3, [ip], #0x01 /* Set 1 byte */
258 strgeb r3, [ip], #0x01 /* Set another byte */
259 strgtb r3, [ip] /* and a third */
262 .Lmemset_wordunaligned:
264 strb r3, [ip], #0x01 /* Set 1 byte */
266 strgeb r3, [ip], #0x01 /* Set another byte */
268 strgtb r3, [ip], #0x01 /* and a third */
269 cmp r1, #0x04 /* More than 4 bytes left? */
270 bge .Lmemset_wordaligned /* Yup */
272 .Lmemset_lessthanfour:
274 RETeq /* Zero length so exit */
275 strb r3, [ip], #0x01 /* Set 1 byte */
277 strgeb r3, [ip], #0x01 /* Set another byte */
278 strgtb r3, [ip] /* and a third */
287 /* Are both addresses aligned the same way? */
290 RETeq /* len == 0, or same addresses! */
293 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
295 /* Word-align the addresses, if necessary */
298 add r3, r3, r3, lsl #1
299 addne pc, pc, r3, lsl #3
302 /* Compare up to 3 bytes */
310 /* Compare up to 2 bytes */
326 /* Compare 4 bytes at a time, if possible */
328 bcc .Lmemcmp_bytewise
329 .Lmemcmp_word_aligned:
334 beq .Lmemcmp_word_aligned
337 /* Correct for extra subtraction, and check if done */
339 cmpeq r0, #0x00 /* If done, did all bytes match? */
340 RETeq /* Yup. Just return */
342 /* Re-do the final word byte-wise */
353 beq .Lmemcmp_bytewise2
358 * 6 byte compares are very common, thanks to the network stack.
359 * This code is hand-scheduled to reduce the number of stalls for
360 * load results. Everything else being equal, this will be ~32%
361 * faster than a byte-wise memcmp.
365 ldrb r3, [r1, #0x00] /* r3 = b2#0 */
366 ldrb r0, [ip, #0x00] /* r0 = b1#0 */
367 ldrb r2, [r1, #0x01] /* r2 = b2#1 */
368 subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
369 ldreqb r3, [ip, #0x01] /* r3 = b1#1 */
370 RETne /* Return if mismatch on #0 */
371 subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
372 ldreqb r3, [r1, #0x02] /* r3 = b2#2 */
373 ldreqb r0, [ip, #0x02] /* r0 = b1#2 */
374 RETne /* Return if mismatch on #1 */
375 ldrb r2, [r1, #0x03] /* r2 = b2#3 */
376 subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
377 ldreqb r3, [ip, #0x03] /* r3 = b1#3 */
378 RETne /* Return if mismatch on #2 */
379 subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
380 ldreqb r3, [r1, #0x04] /* r3 = b2#4 */
381 ldreqb r0, [ip, #0x04] /* r0 = b1#4 */
382 RETne /* Return if mismatch on #3 */
383 ldrb r2, [r1, #0x05] /* r2 = b2#5 */
384 subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
385 ldreqb r3, [ip, #0x05] /* r3 = b1#5 */
386 RETne /* Return if mismatch on #4 */
387 sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
391 /* switch the source and destination registers */
396 /* Do the buffers overlap? */
398 RETeq /* Bail now if src/dst are the same */
399 subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
400 subcs r3, r1, r0 /* if (src > dst) r3 = src - dst */
401 cmp r3, r2 /* if (r3 < len) we have an overlap */
402 bcc PIC_SYM(_C_LABEL(memcpy), PLT)
404 /* Determine copy direction */
406 bcc .Lmemmove_backwards
408 moveq r0, #0 /* Quick abort for len=0 */
411 stmdb sp!, {r0, lr} /* memmove() returns dest addr */
413 blt .Lmemmove_fl4 /* less than 4 bytes */
415 bne .Lmemmove_fdestul /* oh unaligned destination addr */
417 bne .Lmemmove_fsrcul /* oh unaligned source addr */
420 /* We have aligned source and destination */
422 blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
424 blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
425 stmdb sp!, {r4} /* borrow r4 */
427 /* blat 32 bytes at a time */
428 /* XXX for really big copies perhaps we should use more registers */
430 ldmia r1!, {r3, r4, r12, lr}
431 stmia r0!, {r3, r4, r12, lr}
432 ldmia r1!, {r3, r4, r12, lr}
433 stmia r0!, {r3, r4, r12, lr}
435 bge .Lmemmove_floop32
438 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
439 stmgeia r0!, {r3, r4, r12, lr}
441 ldmia sp!, {r4} /* return r4 */
446 /* blat 12 bytes at a time */
448 ldmgeia r1!, {r3, r12, lr}
449 stmgeia r0!, {r3, r12, lr}
451 bge .Lmemmove_floop12
460 ldmgeia r1!, {r3, r12}
461 stmgeia r0!, {r3, r12}
465 /* less than 4 bytes to go */
467 ldmeqia sp!, {r0, pc} /* done */
469 /* copy the crud byte at a time */
479 /* erg - unaligned destination */
484 /* align destination with byte copies */
492 blt .Lmemmove_fl4 /* less than 4 bytes */
495 beq .Lmemmove_ft8 /* we have an aligned source */
497 /* erg - unaligned source */
498 /* This is where it gets nasty ... */
503 bgt .Lmemmove_fsrcul3
504 beq .Lmemmove_fsrcul2
506 blt .Lmemmove_fsrcul1loop4
510 .Lmemmove_fsrcul1loop16:
516 ldmia r1!, {r4, r5, r12, lr}
518 orr r3, r3, r4, lsr #24
520 orr r4, r4, r5, lsr #24
522 orr r5, r5, r12, lsr #24
524 orr r12, r12, lr, lsr #24
526 orr r3, r3, r4, lsl #24
528 orr r4, r4, r5, lsl #24
530 orr r5, r5, r12, lsl #24
532 orr r12, r12, lr, lsl #24
534 stmia r0!, {r3-r5, r12}
536 bge .Lmemmove_fsrcul1loop16
539 blt .Lmemmove_fsrcul1l4
541 .Lmemmove_fsrcul1loop4:
549 orr r12, r12, lr, lsr #24
551 orr r12, r12, lr, lsl #24
555 bge .Lmemmove_fsrcul1loop4
563 blt .Lmemmove_fsrcul2loop4
567 .Lmemmove_fsrcul2loop16:
573 ldmia r1!, {r4, r5, r12, lr}
575 orr r3, r3, r4, lsr #16
577 orr r4, r4, r5, lsr #16
579 orr r5, r5, r12, lsr #16
580 mov r12, r12, lsl #16
581 orr r12, r12, lr, lsr #16
583 orr r3, r3, r4, lsl #16
585 orr r4, r4, r5, lsl #16
587 orr r5, r5, r12, lsl #16
588 mov r12, r12, lsr #16
589 orr r12, r12, lr, lsl #16
591 stmia r0!, {r3-r5, r12}
593 bge .Lmemmove_fsrcul2loop16
596 blt .Lmemmove_fsrcul2l4
598 .Lmemmove_fsrcul2loop4:
606 orr r12, r12, lr, lsr #16
608 orr r12, r12, lr, lsl #16
612 bge .Lmemmove_fsrcul2loop4
620 blt .Lmemmove_fsrcul3loop4
624 .Lmemmove_fsrcul3loop16:
630 ldmia r1!, {r4, r5, r12, lr}
632 orr r3, r3, r4, lsr #8
634 orr r4, r4, r5, lsr #8
636 orr r5, r5, r12, lsr #8
637 mov r12, r12, lsl #24
638 orr r12, r12, lr, lsr #8
640 orr r3, r3, r4, lsl #8
642 orr r4, r4, r5, lsl #8
644 orr r5, r5, r12, lsl #8
645 mov r12, r12, lsr #24
646 orr r12, r12, lr, lsl #8
648 stmia r0!, {r3-r5, r12}
650 bge .Lmemmove_fsrcul3loop16
653 blt .Lmemmove_fsrcul3l4
655 .Lmemmove_fsrcul3loop4:
663 orr r12, r12, lr, lsr #8
665 orr r12, r12, lr, lsl #8
669 bge .Lmemmove_fsrcul3loop4
679 blt .Lmemmove_bl4 /* less than 4 bytes */
681 bne .Lmemmove_bdestul /* oh unaligned destination addr */
683 bne .Lmemmove_bsrcul /* oh unaligned source addr */
686 /* We have aligned source and destination */
688 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
690 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
693 /* blat 32 bytes at a time */
694 /* XXX for really big copies perhaps we should use more registers */
696 ldmdb r1!, {r3, r4, r12, lr}
697 stmdb r0!, {r3, r4, r12, lr}
698 ldmdb r1!, {r3, r4, r12, lr}
699 stmdb r0!, {r3, r4, r12, lr}
701 bge .Lmemmove_bloop32
705 ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
706 stmgedb r0!, {r3, r4, r12, lr}
709 ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
710 stmgedb r0!, {r3, r12, lr}
720 ldmgedb r1!, {r3, r12}
721 stmgedb r0!, {r3, r12}
725 /* less than 4 bytes to go */
729 /* copy the crud byte at a time */
733 ldrgeb r3, [r1, #-1]!
734 strgeb r3, [r0, #-1]!
735 ldrgtb r3, [r1, #-1]!
736 strgtb r3, [r0, #-1]!
739 /* erg - unaligned destination */
743 /* align destination with byte copies */
746 ldrgeb r3, [r1, #-1]!
747 strgeb r3, [r0, #-1]!
748 ldrgtb r3, [r1, #-1]!
749 strgtb r3, [r0, #-1]!
751 blt .Lmemmove_bl4 /* less than 4 bytes to go */
753 beq .Lmemmove_bt8 /* we have an aligned source */
755 /* erg - unaligned source */
756 /* This is where it gets nasty ... */
761 blt .Lmemmove_bsrcul1
762 beq .Lmemmove_bsrcul2
764 blt .Lmemmove_bsrcul3loop4
766 stmdb sp!, {r4, r5, lr}
768 .Lmemmove_bsrcul3loop16:
774 ldmdb r1!, {r3-r5, r12}
776 orr lr, lr, r12, lsl #24
778 orr r12, r12, r5, lsl #24
780 orr r5, r5, r4, lsl #24
782 orr r4, r4, r3, lsl #24
784 orr lr, lr, r12, lsr #24
786 orr r12, r12, r5, lsr #24
788 orr r5, r5, r4, lsr #24
790 orr r4, r4, r3, lsr #24
792 stmdb r0!, {r4, r5, r12, lr}
794 bge .Lmemmove_bsrcul3loop16
795 ldmia sp!, {r4, r5, lr}
797 blt .Lmemmove_bsrcul3l4
799 .Lmemmove_bsrcul3loop4:
807 orr r12, r12, r3, lsl #24
809 orr r12, r12, r3, lsr #24
813 bge .Lmemmove_bsrcul3loop4
821 blt .Lmemmove_bsrcul2loop4
823 stmdb sp!, {r4, r5, lr}
825 .Lmemmove_bsrcul2loop16:
831 ldmdb r1!, {r3-r5, r12}
833 orr lr, lr, r12, lsl #16
834 mov r12, r12, lsr #16
835 orr r12, r12, r5, lsl #16
837 orr r5, r5, r4, lsl #16
839 orr r4, r4, r3, lsl #16
841 orr lr, lr, r12, lsr #16
842 mov r12, r12, lsl #16
843 orr r12, r12, r5, lsr #16
845 orr r5, r5, r4, lsr #16
847 orr r4, r4, r3, lsr #16
849 stmdb r0!, {r4, r5, r12, lr}
851 bge .Lmemmove_bsrcul2loop16
852 ldmia sp!, {r4, r5, lr}
854 blt .Lmemmove_bsrcul2l4
856 .Lmemmove_bsrcul2loop4:
864 orr r12, r12, r3, lsl #16
866 orr r12, r12, r3, lsr #16
870 bge .Lmemmove_bsrcul2loop4
878 blt .Lmemmove_bsrcul1loop4
880 stmdb sp!, {r4, r5, lr}
882 .Lmemmove_bsrcul1loop32:
888 ldmdb r1!, {r3-r5, r12}
890 orr lr, lr, r12, lsl #8
891 mov r12, r12, lsr #24
892 orr r12, r12, r5, lsl #8
894 orr r5, r5, r4, lsl #8
896 orr r4, r4, r3, lsl #8
898 orr lr, lr, r12, lsr #8
899 mov r12, r12, lsl #24
900 orr r12, r12, r5, lsr #8
902 orr r5, r5, r4, lsr #8
904 orr r4, r4, r3, lsr #8
906 stmdb r0!, {r4, r5, r12, lr}
908 bge .Lmemmove_bsrcul1loop32
909 ldmia sp!, {r4, r5, lr}
911 blt .Lmemmove_bsrcul1l4
913 .Lmemmove_bsrcul1loop4:
921 orr r12, r12, r3, lsl #8
923 orr r12, r12, r3, lsr #8
927 bge .Lmemmove_bsrcul1loop4
933 #if !defined(_ARM_ARCH_5E)
935 /* save leaf functions having to store this away */
936 /* Do not check arm_memcpy if we're running from flash */
938 #if FLASHADDR > PHYSADDR
948 ldr r3, .L_arm_memcpy
952 ldr r3, .L_min_memcpy_size
956 stmfd sp!, {r0-r2, r4, lr}
958 ldr r4, .L_arm_memcpy
962 ldmfd sp!, {r0-r2, r4, lr}
966 stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
969 blt .Lmemcpy_l4 /* less than 4 bytes */
971 bne .Lmemcpy_destul /* oh unaligned destination addr */
973 bne .Lmemcpy_srcul /* oh unaligned source addr */
976 /* We have aligned source and destination */
978 blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */
980 blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */
981 stmdb sp!, {r4} /* borrow r4 */
983 /* blat 32 bytes at a time */
984 /* XXX for really big copies perhaps we should use more registers */
986 ldmia r1!, {r3, r4, r12, lr}
987 stmia r0!, {r3, r4, r12, lr}
988 ldmia r1!, {r3, r4, r12, lr}
989 stmia r0!, {r3, r4, r12, lr}
994 ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
995 stmgeia r0!, {r3, r4, r12, lr}
997 ldmia sp!, {r4} /* return r4 */
1002 /* blat 12 bytes at a time */
1004 ldmgeia r1!, {r3, r12, lr}
1005 stmgeia r0!, {r3, r12, lr}
1006 subges r2, r2, #0x0c
1016 ldmgeia r1!, {r3, r12}
1017 stmgeia r0!, {r3, r12}
1021 /* less than 4 bytes to go */
1024 ldmeqia sp!, {r0, pc}^ /* done */
1026 ldmeqia sp!, {r0, pc} /* done */
1028 /* copy the crud byte at a time */
1038 /* erg - unaligned destination */
1043 /* align destination with byte copies */
1051 blt .Lmemcpy_l4 /* less than 4 bytes */
1054 beq .Lmemcpy_t8 /* we have an aligned source */
1056 /* erg - unaligned source */
1057 /* This is where it gets nasty ... */
1065 blt .Lmemcpy_srcul1loop4
1069 .Lmemcpy_srcul1loop16:
1071 ldmia r1!, {r4, r5, r12, lr}
1072 orr r3, r3, r4, lsl #24
1074 orr r4, r4, r5, lsl #24
1076 orr r5, r5, r12, lsl #24
1077 mov r12, r12, lsr #8
1078 orr r12, r12, lr, lsl #24
1079 stmia r0!, {r3-r5, r12}
1081 bge .Lmemcpy_srcul1loop16
1084 blt .Lmemcpy_srcul1l4
1086 .Lmemcpy_srcul1loop4:
1089 orr r12, r12, lr, lsl #24
1092 bge .Lmemcpy_srcul1loop4
1100 blt .Lmemcpy_srcul2loop4
1104 .Lmemcpy_srcul2loop16:
1106 ldmia r1!, {r4, r5, r12, lr}
1107 orr r3, r3, r4, lsl #16
1109 orr r4, r4, r5, lsl #16
1111 orr r5, r5, r12, lsl #16
1112 mov r12, r12, lsr #16
1113 orr r12, r12, lr, lsl #16
1114 stmia r0!, {r3-r5, r12}
1116 bge .Lmemcpy_srcul2loop16
1119 blt .Lmemcpy_srcul2l4
1121 .Lmemcpy_srcul2loop4:
1122 mov r12, lr, lsr #16
1124 orr r12, r12, lr, lsl #16
1127 bge .Lmemcpy_srcul2loop4
1135 blt .Lmemcpy_srcul3loop4
1139 .Lmemcpy_srcul3loop16:
1141 ldmia r1!, {r4, r5, r12, lr}
1142 orr r3, r3, r4, lsl #8
1144 orr r4, r4, r5, lsl #8
1146 orr r5, r5, r12, lsl #8
1147 mov r12, r12, lsr #24
1148 orr r12, r12, lr, lsl #8
1149 stmia r0!, {r3-r5, r12}
1151 bge .Lmemcpy_srcul3loop16
1154 blt .Lmemcpy_srcul3l4
1156 .Lmemcpy_srcul3loop4:
1157 mov r12, lr, lsr #24
1159 orr r12, r12, lr, lsl #8
1162 bge .Lmemcpy_srcul3loop4
1168 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1172 ble .Lmemcpy_short /* <= 12 bytes */
1174 #if FLASHADDR > PHYSADDR
1184 ldr r3, .L_arm_memcpy
1188 ldr r3, .L_min_memcpy_size
1192 stmfd sp!, {r0-r2, r4, lr}
1194 ldr r4, .L_arm_memcpy
1198 ldmfd sp!, {r0-r2, r4, lr}
1201 mov r3, r0 /* We must not clobber r0 */
1203 /* Word-align the destination buffer */
1204 ands ip, r3, #0x03 /* Already word aligned? */
1205 beq .Lmemcpy_wordaligned /* Yup */
1207 ldrb ip, [r1], #0x01
1209 strb ip, [r3], #0x01
1210 ldrleb ip, [r1], #0x01
1212 strleb ip, [r3], #0x01
1213 ldrltb ip, [r1], #0x01
1215 strltb ip, [r3], #0x01
1217 /* Destination buffer is now word aligned */
1218 .Lmemcpy_wordaligned:
1219 ands ip, r1, #0x03 /* Is src also word-aligned? */
1220 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
1222 /* Quad-align the destination buffer */
1223 tst r3, #0x07 /* Already quad aligned? */
1224 ldrne ip, [r1], #0x04
1225 stmfd sp!, {r4-r9} /* Free up some registers */
1227 strne ip, [r3], #0x04
1229 /* Destination buffer quad aligned, source is at least word aligned */
1231 blt .Lmemcpy_w_lessthan128
1233 /* Copy 128 bytes at a time */
1235 ldr r4, [r1], #0x04 /* LD:00-03 */
1236 ldr r5, [r1], #0x04 /* LD:04-07 */
1237 pld [r1, #0x18] /* Prefetch 0x20 */
1238 ldr r6, [r1], #0x04 /* LD:08-0b */
1239 ldr r7, [r1], #0x04 /* LD:0c-0f */
1240 ldr r8, [r1], #0x04 /* LD:10-13 */
1241 ldr r9, [r1], #0x04 /* LD:14-17 */
1242 strd r4, [r3], #0x08 /* ST:00-07 */
1243 ldr r4, [r1], #0x04 /* LD:18-1b */
1244 ldr r5, [r1], #0x04 /* LD:1c-1f */
1245 strd r6, [r3], #0x08 /* ST:08-0f */
1246 ldr r6, [r1], #0x04 /* LD:20-23 */
1247 ldr r7, [r1], #0x04 /* LD:24-27 */
1248 pld [r1, #0x18] /* Prefetch 0x40 */
1249 strd r8, [r3], #0x08 /* ST:10-17 */
1250 ldr r8, [r1], #0x04 /* LD:28-2b */
1251 ldr r9, [r1], #0x04 /* LD:2c-2f */
1252 strd r4, [r3], #0x08 /* ST:18-1f */
1253 ldr r4, [r1], #0x04 /* LD:30-33 */
1254 ldr r5, [r1], #0x04 /* LD:34-37 */
1255 strd r6, [r3], #0x08 /* ST:20-27 */
1256 ldr r6, [r1], #0x04 /* LD:38-3b */
1257 ldr r7, [r1], #0x04 /* LD:3c-3f */
1258 strd r8, [r3], #0x08 /* ST:28-2f */
1259 ldr r8, [r1], #0x04 /* LD:40-43 */
1260 ldr r9, [r1], #0x04 /* LD:44-47 */
1261 pld [r1, #0x18] /* Prefetch 0x60 */
1262 strd r4, [r3], #0x08 /* ST:30-37 */
1263 ldr r4, [r1], #0x04 /* LD:48-4b */
1264 ldr r5, [r1], #0x04 /* LD:4c-4f */
1265 strd r6, [r3], #0x08 /* ST:38-3f */
1266 ldr r6, [r1], #0x04 /* LD:50-53 */
1267 ldr r7, [r1], #0x04 /* LD:54-57 */
1268 strd r8, [r3], #0x08 /* ST:40-47 */
1269 ldr r8, [r1], #0x04 /* LD:58-5b */
1270 ldr r9, [r1], #0x04 /* LD:5c-5f */
1271 strd r4, [r3], #0x08 /* ST:48-4f */
1272 ldr r4, [r1], #0x04 /* LD:60-63 */
1273 ldr r5, [r1], #0x04 /* LD:64-67 */
1274 pld [r1, #0x18] /* Prefetch 0x80 */
1275 strd r6, [r3], #0x08 /* ST:50-57 */
1276 ldr r6, [r1], #0x04 /* LD:68-6b */
1277 ldr r7, [r1], #0x04 /* LD:6c-6f */
1278 strd r8, [r3], #0x08 /* ST:58-5f */
1279 ldr r8, [r1], #0x04 /* LD:70-73 */
1280 ldr r9, [r1], #0x04 /* LD:74-77 */
1281 strd r4, [r3], #0x08 /* ST:60-67 */
1282 ldr r4, [r1], #0x04 /* LD:78-7b */
1283 ldr r5, [r1], #0x04 /* LD:7c-7f */
1284 strd r6, [r3], #0x08 /* ST:68-6f */
1285 strd r8, [r3], #0x08 /* ST:70-77 */
1287 strd r4, [r3], #0x08 /* ST:78-7f */
1288 bge .Lmemcpy_w_loop128
1290 .Lmemcpy_w_lessthan128:
1291 adds r2, r2, #0x80 /* Adjust for extra sub */
1292 ldmeqfd sp!, {r4-r9}
1293 RETeq /* Return now if done */
1295 blt .Lmemcpy_w_lessthan32
1297 /* Copy 32 bytes at a time */
1306 strd r4, [r3], #0x08
1309 strd r6, [r3], #0x08
1310 strd r8, [r3], #0x08
1312 strd r4, [r3], #0x08
1313 bge .Lmemcpy_w_loop32
1315 .Lmemcpy_w_lessthan32:
1316 adds r2, r2, #0x20 /* Adjust for extra sub */
1317 ldmeqfd sp!, {r4-r9}
1318 RETeq /* Return now if done */
1322 addne pc, pc, r4, lsl #1
1325 /* At least 24 bytes remaining */
1329 strd r4, [r3], #0x08
1331 /* At least 16 bytes remaining */
1335 strd r4, [r3], #0x08
1337 /* At least 8 bytes remaining */
1341 strd r4, [r3], #0x08
1343 /* Less than 8 bytes remaining */
1345 RETeq /* Return now if done */
1347 ldrge ip, [r1], #0x04
1348 strge ip, [r3], #0x04
1349 RETeq /* Return now if done */
1351 ldrb ip, [r1], #0x01
1353 ldrgeb r2, [r1], #0x01
1354 strb ip, [r3], #0x01
1356 strgeb r2, [r3], #0x01
1362 * At this point, it has not been possible to word align both buffers.
1363 * The destination buffer is word aligned, but the source buffer is not.
1374 .Lmemcpy_bad1_loop16:
1386 orr r4, r4, r5, lsr #24
1388 orr r5, r5, r6, lsr #24
1390 orr r6, r6, r7, lsr #24
1392 orr r7, r7, ip, lsr #24
1394 orr r4, r4, r5, lsl #24
1396 orr r5, r5, r6, lsl #24
1398 orr r6, r6, r7, lsl #24
1400 orr r7, r7, ip, lsl #24
1408 bge .Lmemcpy_bad1_loop16
1411 ldmeqfd sp!, {r4-r7}
1412 RETeq /* Return now if done */
1415 blt .Lmemcpy_bad_done
1417 .Lmemcpy_bad1_loop4:
1426 orr r4, r4, ip, lsr #24
1428 orr r4, r4, ip, lsl #24
1431 bge .Lmemcpy_bad1_loop4
1435 .Lmemcpy_bad2_loop16:
1447 orr r4, r4, r5, lsr #16
1449 orr r5, r5, r6, lsr #16
1451 orr r6, r6, r7, lsr #16
1453 orr r7, r7, ip, lsr #16
1455 orr r4, r4, r5, lsl #16
1457 orr r5, r5, r6, lsl #16
1459 orr r6, r6, r7, lsl #16
1461 orr r7, r7, ip, lsl #16
1469 bge .Lmemcpy_bad2_loop16
1472 ldmeqfd sp!, {r4-r7}
1473 RETeq /* Return now if done */
1476 blt .Lmemcpy_bad_done
1478 .Lmemcpy_bad2_loop4:
1487 orr r4, r4, ip, lsr #16
1489 orr r4, r4, ip, lsl #16
1492 bge .Lmemcpy_bad2_loop4
1496 .Lmemcpy_bad3_loop16:
1508 orr r4, r4, r5, lsr #8
1510 orr r5, r5, r6, lsr #8
1512 orr r6, r6, r7, lsr #8
1514 orr r7, r7, ip, lsr #8
1516 orr r4, r4, r5, lsl #8
1518 orr r5, r5, r6, lsl #8
1520 orr r6, r6, r7, lsl #8
1522 orr r7, r7, ip, lsl #8
1530 bge .Lmemcpy_bad3_loop16
1533 ldmeqfd sp!, {r4-r7}
1534 RETeq /* Return now if done */
1537 blt .Lmemcpy_bad_done
1539 .Lmemcpy_bad3_loop4:
1548 orr r4, r4, ip, lsr #8
1550 orr r4, r4, ip, lsl #8
1553 bge .Lmemcpy_bad3_loop4
1560 ldrb ip, [r1], #0x01
1562 ldrgeb r2, [r1], #0x01
1563 strb ip, [r3], #0x01
1565 strgeb r2, [r3], #0x01
1571 * Handle short copies (less than 16 bytes), possibly misaligned.
1572 * Some of these are *very* common, thanks to the network stack,
1573 * and so are handled specially.
1576 add pc, pc, r2, lsl #2
1579 b .Lmemcpy_bytewise /* 0x01 */
1580 b .Lmemcpy_bytewise /* 0x02 */
1581 b .Lmemcpy_bytewise /* 0x03 */
1582 b .Lmemcpy_4 /* 0x04 */
1583 b .Lmemcpy_bytewise /* 0x05 */
1584 b .Lmemcpy_6 /* 0x06 */
1585 b .Lmemcpy_bytewise /* 0x07 */
1586 b .Lmemcpy_8 /* 0x08 */
1587 b .Lmemcpy_bytewise /* 0x09 */
1588 b .Lmemcpy_bytewise /* 0x0a */
1589 b .Lmemcpy_bytewise /* 0x0b */
1590 b .Lmemcpy_c /* 0x0c */
1592 mov r3, r0 /* We must not clobber r0 */
1593 ldrb ip, [r1], #0x01
1594 1: subs r2, r2, #0x01
1595 strb ip, [r3], #0x01
1596 ldrneb ip, [r1], #0x01
1600 /******************************************************************************
1601 * Special case for 4 byte copies
1603 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
1604 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
1608 orr r2, r2, r0, lsl #2
1611 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
1614 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1622 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1624 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1625 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
1627 mov r3, r3, lsl #8 /* r3 = 012. */
1628 orr r3, r3, r2, lsr #24 /* r3 = 0123 */
1630 mov r3, r3, lsr #8 /* r3 = .210 */
1631 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1638 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1642 ldrh r2, [r1, #0x02]
1644 ldrh r3, [r1, #0x02]
1647 orr r3, r2, r3, lsl #16
1653 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1655 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
1656 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
1658 mov r3, r3, lsl #24 /* r3 = 0... */
1659 orr r3, r3, r2, lsr #8 /* r3 = 0123 */
1661 mov r3, r3, lsr #24 /* r3 = ...0 */
1662 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1669 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1673 strb r2, [r0, #0x03]
1681 strb r1, [r0, #0x03]
1683 strh r3, [r0, #0x01]
1688 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1691 ldrh r3, [r1, #0x01]
1692 ldrb r1, [r1, #0x03]
1694 strh r3, [r0, #0x01]
1695 strb r1, [r0, #0x03]
1700 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1702 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1703 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1705 mov r1, r2, lsr #8 /* r1 = ...0 */
1707 mov r2, r2, lsl #8 /* r2 = .01. */
1708 orr r2, r2, r3, lsr #8 /* r2 = .012 */
1711 mov r2, r2, lsr #8 /* r2 = ...1 */
1712 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1713 mov r3, r3, lsr #8 /* r3 = ...3 */
1715 strh r2, [r0, #0x01]
1716 strb r3, [r0, #0x03]
1721 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1724 ldrh r3, [r1, #0x01]
1725 ldrb r1, [r1, #0x03]
1727 strh r3, [r0, #0x01]
1728 strb r1, [r0, #0x03]
1733 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1737 strh r2, [r0, #0x02]
1743 strh r3, [r0, #0x02]
1749 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1751 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1752 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
1753 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1756 mov r2, r2, lsl #8 /* r2 = 012. */
1757 orr r2, r2, r3, lsr #24 /* r2 = 0123 */
1759 mov r2, r2, lsr #24 /* r2 = ...2 */
1760 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
1762 strh r2, [r0, #0x02]
1767 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1770 ldrh r3, [r1, #0x02]
1772 strh r3, [r0, #0x02]
1777 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1779 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
1780 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1781 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
1782 strh r1, [r0, #0x02]
1784 mov r3, r3, lsr #24 /* r3 = ...1 */
1785 orr r3, r3, r2, lsl #8 /* r3 = xx01 */
1787 mov r3, r3, lsl #8 /* r3 = 321. */
1788 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
1795 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1797 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1799 strb r2, [r0, #0x03]
1802 strh r3, [r0, #0x01]
1808 strh r3, [r0, #0x01]
1809 strb r1, [r0, #0x03]
1815 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1818 ldrh r3, [r1, #0x01]
1819 ldrb r1, [r1, #0x03]
1821 strh r3, [r0, #0x01]
1822 strb r1, [r0, #0x03]
1827 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1830 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1831 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1832 strb r3, [r0, #0x03]
1833 mov r3, r3, lsr #8 /* r3 = ...2 */
1834 orr r3, r3, r2, lsl #8 /* r3 = ..12 */
1835 strh r3, [r0, #0x01]
1836 mov r2, r2, lsr #8 /* r2 = ...0 */
1839 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1840 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1842 mov r2, r2, lsr #8 /* r2 = ...1 */
1843 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1844 strh r2, [r0, #0x01]
1845 mov r3, r3, lsr #8 /* r3 = ...3 */
1846 strb r3, [r0, #0x03]
1852 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1855 ldrh r3, [r1, #0x01]
1856 ldrb r1, [r1, #0x03]
1858 strh r3, [r0, #0x01]
1859 strb r1, [r0, #0x03]
1864 /******************************************************************************
1865 * Special case for 6 byte copies
1867 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
1868 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
1872 orr r2, r2, r0, lsl #2
1875 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
1878 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1881 ldrh r3, [r1, #0x04]
1883 strh r3, [r0, #0x04]
1888 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1890 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1891 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
1893 mov r2, r2, lsl #8 /* r2 = 012. */
1894 orr r2, r2, r3, lsr #24 /* r2 = 0123 */
1896 mov r2, r2, lsr #8 /* r2 = .210 */
1897 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
1899 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
1901 strh r3, [r0, #0x04]
1906 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1908 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1909 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1911 mov r1, r3, lsr #16 /* r1 = ..23 */
1912 orr r1, r1, r2, lsl #16 /* r1 = 0123 */
1914 strh r3, [r0, #0x04]
1916 mov r1, r3, lsr #16 /* r1 = ..54 */
1917 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1919 strh r1, [r0, #0x04]
1925 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1927 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1928 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
1929 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r1 = xxx5 */
1931 mov r2, r2, lsl #24 /* r2 = 0... */
1932 orr r2, r2, r3, lsr #8 /* r2 = 0123 */
1933 mov r3, r3, lsl #8 /* r3 = 234. */
1934 orr r1, r3, r1, lsr #24 /* r1 = 2345 */
1936 mov r2, r2, lsr #24 /* r2 = ...0 */
1937 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1938 mov r1, r1, lsl #8 /* r1 = xx5. */
1939 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
1942 strh r1, [r0, #0x04]
1947 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1949 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1950 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
1951 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1952 strh r1, [r0, #0x01]
1954 mov r1, r3, lsr #24 /* r1 = ...0 */
1956 mov r3, r3, lsl #8 /* r3 = 123. */
1957 orr r3, r3, r2, lsr #8 /* r3 = 1234 */
1960 mov r3, r3, lsr #24 /* r3 = ...3 */
1961 orr r3, r3, r2, lsl #8 /* r3 = .543 */
1962 mov r2, r2, lsr #8 /* r2 = ...5 */
1964 strh r3, [r0, #0x03]
1965 strb r2, [r0, #0x05]
1970 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1973 ldrh r3, [r1, #0x01]
1974 ldrh ip, [r1, #0x03]
1975 ldrb r1, [r1, #0x05]
1977 strh r3, [r0, #0x01]
1978 strh ip, [r0, #0x03]
1979 strb r1, [r0, #0x05]
1984 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1986 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1987 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1989 mov r3, r2, lsr #8 /* r3 = ...0 */
1991 strb r1, [r0, #0x05]
1992 mov r3, r1, lsr #8 /* r3 = .234 */
1993 strh r3, [r0, #0x03]
1994 mov r3, r2, lsl #8 /* r3 = .01. */
1995 orr r3, r3, r1, lsr #24 /* r3 = .012 */
1996 strh r3, [r0, #0x01]
2000 strb r3, [r0, #0x05]
2001 mov r3, r1, lsr #8 /* r3 = .543 */
2002 strh r3, [r0, #0x03]
2003 mov r3, r2, lsr #8 /* r3 = ...1 */
2004 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
2005 strh r3, [r0, #0x01]
2011 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2014 ldrh r3, [r1, #0x01]
2015 ldrh ip, [r1, #0x03]
2016 ldrb r1, [r1, #0x05]
2018 strh r3, [r0, #0x01]
2019 strh ip, [r0, #0x03]
2020 strb r1, [r0, #0x05]
2025 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2028 ldr r2, [r1] /* r2 = 0123 */
2029 ldrh r3, [r1, #0x04] /* r3 = ..45 */
2030 mov r1, r2, lsr #16 /* r1 = ..01 */
2031 orr r3, r3, r2, lsl#16 /* r3 = 2345 */
2035 ldrh r2, [r1, #0x04] /* r2 = ..54 */
2036 ldr r3, [r1] /* r3 = 3210 */
2037 mov r2, r2, lsl #16 /* r2 = 54.. */
2038 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
2046 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2048 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
2049 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
2050 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
2052 mov r2, r2, lsr #8 /* r2 = .345 */
2053 orr r2, r2, r3, lsl #24 /* r2 = 2345 */
2055 mov r2, r2, lsl #8 /* r2 = 543. */
2056 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
2064 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2074 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2076 ldrb r3, [r1] /* r3 = ...0 */
2077 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
2078 ldrb r1, [r1, #0x05] /* r1 = ...5 */
2080 mov r3, r3, lsl #8 /* r3 = ..0. */
2081 orr r3, r3, r2, lsr #24 /* r3 = ..01 */
2082 orr r1, r1, r2, lsl #8 /* r1 = 2345 */
2084 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
2085 mov r1, r1, lsl #24 /* r1 = 5... */
2086 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
2094 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2096 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2097 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
2099 mov r3, r2, lsr #24 /* r3 = ...0 */
2101 mov r2, r2, lsl #8 /* r2 = 123. */
2102 orr r2, r2, r1, lsr #8 /* r2 = 1234 */
2105 mov r2, r2, lsr #8 /* r2 = .321 */
2106 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
2107 mov r1, r1, lsr #8 /* r1 = ...5 */
2110 strb r1, [r0, #0x05]
2115 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2118 ldrh r3, [r1, #0x01]
2119 ldrh ip, [r1, #0x03]
2120 ldrb r1, [r1, #0x05]
2122 strh r3, [r0, #0x01]
2123 strh ip, [r0, #0x03]
2124 strb r1, [r0, #0x05]
2129 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2131 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2132 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
2134 mov r3, r2, lsr #8 /* r3 = ...0 */
2136 mov r2, r2, lsl #24 /* r2 = 1... */
2137 orr r2, r2, r1, lsr #8 /* r2 = 1234 */
2140 mov r2, r2, lsr #8 /* r2 = ...1 */
2141 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
2142 mov r1, r1, lsr #24 /* r1 = ...5 */
2145 strb r1, [r0, #0x05]
2150 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2154 ldrb r1, [r1, #0x05]
2157 strb r1, [r0, #0x05]
2162 /******************************************************************************
2163 * Special case for 8 byte copies
2165 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
2166 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
2170 orr r2, r2, r0, lsl #2
2173 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
2176 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2186 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2188 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
2189 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
2190 ldrb r1, [r1, #0x07] /* r1 = ...7 */
2192 mov r3, r3, lsl #8 /* r3 = 012. */
2193 orr r3, r3, r2, lsr #24 /* r3 = 0123 */
2194 orr r2, r1, r2, lsl #8 /* r2 = 4567 */
2196 mov r3, r3, lsr #8 /* r3 = .210 */
2197 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
2198 mov r1, r1, lsl #24 /* r1 = 7... */
2199 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
2207 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2209 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2210 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2211 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
2213 mov r2, r2, lsl #16 /* r2 = 01.. */
2214 orr r2, r2, r3, lsr #16 /* r2 = 0123 */
2215 orr r3, r1, r3, lsl #16 /* r3 = 4567 */
2217 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
2218 mov r3, r3, lsr #16 /* r3 = ..54 */
2219 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
2227 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2229 ldrb r3, [r1] /* r3 = ...0 */
2230 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
2231 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
2233 mov r3, r3, lsl #24 /* r3 = 0... */
2234 orr r3, r3, r2, lsr #8 /* r3 = 0123 */
2235 mov r2, r2, lsl #24 /* r2 = 4... */
2236 orr r2, r2, r1, lsr #8 /* r2 = 4567 */
2238 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
2239 mov r2, r2, lsr #24 /* r2 = ...4 */
2240 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
2248 * 0100: dst is 8-bit aligned, src is 32-bit aligned
2250 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
2251 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
2253 mov r1, r3, lsr #24 /* r1 = ...0 */
2255 mov r1, r3, lsr #8 /* r1 = .012 */
2256 strb r2, [r0, #0x07]
2257 mov r3, r3, lsl #24 /* r3 = 3... */
2258 orr r3, r3, r2, lsr #8 /* r3 = 3456 */
2261 mov r1, r2, lsr #24 /* r1 = ...7 */
2262 strb r1, [r0, #0x07]
2263 mov r1, r3, lsr #8 /* r1 = .321 */
2264 mov r3, r3, lsr #24 /* r3 = ...3 */
2265 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
2267 strh r1, [r0, #0x01]
2273 * 0101: dst is 8-bit aligned, src is 8-bit aligned
2276 ldrh r3, [r1, #0x01]
2278 ldrb r1, [r1, #0x07]
2280 strh r3, [r0, #0x01]
2282 strb r1, [r0, #0x07]
2287 * 0110: dst is 8-bit aligned, src is 16-bit aligned
2289 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2290 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2291 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
2293 mov ip, r2, lsr #8 /* ip = ...0 */
2295 mov ip, r2, lsl #8 /* ip = .01. */
2296 orr ip, ip, r3, lsr #24 /* ip = .012 */
2297 strb r1, [r0, #0x07]
2298 mov r3, r3, lsl #8 /* r3 = 345. */
2299 orr r3, r3, r1, lsr #8 /* r3 = 3456 */
2301 strb r2, [r0] /* 0 */
2302 mov ip, r1, lsr #8 /* ip = ...7 */
2303 strb ip, [r0, #0x07] /* 7 */
2304 mov ip, r2, lsr #8 /* ip = ...1 */
2305 orr ip, ip, r3, lsl #8 /* ip = 4321 */
2306 mov r3, r3, lsr #8 /* r3 = .543 */
2307 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
2309 strh ip, [r0, #0x01]
2315 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2317 ldrb r3, [r1] /* r3 = ...0 */
2318 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
2319 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
2320 ldrb r1, [r1, #0x07] /* r1 = ...7 */
2322 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
2324 strh r3, [r0, #0x01]
2325 orr r2, r2, ip, lsl #16 /* r2 = 3456 */
2327 strh ip, [r0, #0x01]
2328 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
2331 strb r1, [r0, #0x07]
2336 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2338 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2339 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2340 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
2343 mov r1, r3, lsr #16 /* r1 = ..45 */
2344 orr r2, r1 ,r2, lsl #16 /* r2 = 2345 */
2347 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
2348 mov r3, r3, lsr #16 /* r3 = ..76 */
2351 strh r3, [r0, #0x06]
2356 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2358 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
2359 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
2360 ldrb ip, [r1, #0x07] /* ip = ...7 */
2361 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
2364 mov r1, r2, lsl #24 /* r1 = 2... */
2365 orr r1, r1, r3, lsr #8 /* r1 = 2345 */
2366 orr r3, ip, r3, lsl #8 /* r3 = 4567 */
2368 mov r1, r2, lsr #24 /* r1 = ...2 */
2369 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
2370 mov r3, r3, lsr #24 /* r3 = ...6 */
2371 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
2374 strh r3, [r0, #0x06]
2379 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2383 ldrh r3, [r1, #0x06]
2386 strh r3, [r0, #0x06]
2391 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2393 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
2394 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
2395 ldrb ip, [r1] /* ip = ...0 */
2396 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
2397 strh r1, [r0, #0x06]
2399 mov r3, r3, lsr #24 /* r3 = ...5 */
2400 orr r3, r3, r2, lsl #8 /* r3 = 2345 */
2401 mov r2, r2, lsr #24 /* r2 = ...1 */
2402 orr r2, r2, ip, lsl #8 /* r2 = ..01 */
2404 mov r3, r3, lsl #24 /* r3 = 5... */
2405 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
2406 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
2414 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2416 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2417 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2418 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
2419 strh r1, [r0, #0x05]
2421 strb r3, [r0, #0x07]
2422 mov r1, r2, lsr #24 /* r1 = ...0 */
2424 mov r2, r2, lsl #8 /* r2 = 123. */
2425 orr r2, r2, r3, lsr #24 /* r2 = 1234 */
2429 mov r1, r3, lsr #24 /* r1 = ...7 */
2430 strb r1, [r0, #0x07]
2431 mov r2, r2, lsr #8 /* r2 = .321 */
2432 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
2439 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2441 ldrb r3, [r1] /* r3 = ...0 */
2442 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
2443 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2444 ldrb r1, [r1, #0x07] /* r1 = ...7 */
2446 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
2448 strh ip, [r0, #0x05]
2449 orr r2, r3, r2, lsl #16 /* r2 = 1234 */
2451 strh r3, [r0, #0x05]
2452 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
2455 strb r1, [r0, #0x07]
2460 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2462 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2463 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2464 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
2466 mov ip, r2, lsr #8 /* ip = ...0 */
2468 mov ip, r2, lsl #24 /* ip = 1... */
2469 orr ip, ip, r3, lsr #8 /* ip = 1234 */
2470 strb r1, [r0, #0x07]
2471 mov r1, r1, lsr #8 /* r1 = ...6 */
2472 orr r1, r1, r3, lsl #8 /* r1 = 3456 */
2475 mov ip, r2, lsr #8 /* ip = ...1 */
2476 orr ip, ip, r3, lsl #8 /* ip = 4321 */
2477 mov r2, r1, lsr #8 /* r2 = ...7 */
2478 strb r2, [r0, #0x07]
2479 mov r1, r1, lsl #8 /* r1 = .76. */
2480 orr r1, r1, r3, lsr #24 /* r1 = .765 */
2483 strh r1, [r0, #0x05]
2488 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2492 ldrh r3, [r1, #0x05]
2493 ldrb r1, [r1, #0x07]
2496 strh r3, [r0, #0x05]
2497 strb r1, [r0, #0x07]
2501 /******************************************************************************
2502 * Special case for 12 byte copies
2504 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
2505 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2
2509 orr r2, r2, r0, lsl #2
2512 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2
2515 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2527 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2529 ldrb r2, [r1, #0xb] /* r2 = ...B */
2530 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
2531 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
2532 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2534 orr r2, r2, ip, lsl #8 /* r2 = 89AB */
2536 mov r2, ip, lsr #24 /* r2 = ...7 */
2537 orr r2, r2, r3, lsl #8 /* r2 = 4567 */
2538 mov r1, r1, lsl #8 /* r1 = 012. */
2539 orr r1, r1, r3, lsr #24 /* r1 = 0123 */
2541 mov r2, r2, lsl #24 /* r2 = B... */
2542 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
2544 mov r2, ip, lsl #24 /* r2 = 7... */
2545 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
2546 mov r1, r1, lsr #8 /* r1 = .210 */
2547 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
2555 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2557 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2558 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2559 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
2560 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
2562 mov r2, r2, lsl #16 /* r2 = 01.. */
2563 orr r2, r2, r3, lsr #16 /* r2 = 0123 */
2565 mov r3, r3, lsl #16 /* r3 = 45.. */
2566 orr r3, r3, ip, lsr #16 /* r3 = 4567 */
2567 orr r1, r1, ip, lsl #16 /* r1 = 89AB */
2569 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
2571 mov r3, r3, lsr #16 /* r3 = ..54 */
2572 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
2573 mov r1, r1, lsl #16 /* r1 = BA.. */
2574 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
2582 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2584 ldrb r2, [r1] /* r2 = ...0 */
2585 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
2586 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
2587 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
2589 mov r2, r2, lsl #24 /* r2 = 0... */
2590 orr r2, r2, r3, lsr #8 /* r2 = 0123 */
2592 mov r3, r3, lsl #24 /* r3 = 4... */
2593 orr r3, r3, ip, lsr #8 /* r3 = 4567 */
2594 mov r1, r1, lsr #8 /* r1 = .9AB */
2595 orr r1, r1, ip, lsl #24 /* r1 = 89AB */
2597 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
2599 mov r3, r3, lsr #24 /* r3 = ...4 */
2600 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
2601 mov r1, r1, lsl #8 /* r1 = BA9. */
2602 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
2610 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2612 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2613 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2614 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
2615 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
2616 strh r1, [r0, #0x01]
2618 mov r1, r2, lsr #24 /* r1 = ...0 */
2620 mov r1, r2, lsl #24 /* r1 = 3... */
2621 orr r2, r1, r3, lsr #8 /* r2 = 3456 */
2622 mov r1, r3, lsl #24 /* r1 = 7... */
2623 orr r1, r1, ip, lsr #8 /* r1 = 789A */
2626 mov r1, r2, lsr #24 /* r1 = ...3 */
2627 orr r2, r1, r3, lsl #8 /* r2 = 6543 */
2628 mov r1, r3, lsr #24 /* r1 = ...7 */
2629 orr r1, r1, ip, lsl #8 /* r1 = A987 */
2630 mov ip, ip, lsr #24 /* ip = ...B */
2634 strb ip, [r0, #0x0b]
2639 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2642 ldrh r3, [r1, #0x01]
2646 ldrb r1, [r1, #0x0b]
2647 strh r3, [r0, #0x01]
2650 strb r1, [r0, #0x0b]
2655 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2657 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2658 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2659 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
2660 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
2662 mov r2, r2, ror #8 /* r2 = 1..0 */
2664 mov r2, r2, lsr #16 /* r2 = ..1. */
2665 orr r2, r2, r3, lsr #24 /* r2 = ..12 */
2666 strh r2, [r0, #0x01]
2667 mov r2, r3, lsl #8 /* r2 = 345. */
2668 orr r3, r2, ip, lsr #24 /* r3 = 3456 */
2669 mov r2, ip, lsl #8 /* r2 = 789. */
2670 orr r2, r2, r1, lsr #8 /* r2 = 789A */
2673 mov r2, r2, lsr #8 /* r2 = ...1 */
2674 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2675 strh r2, [r0, #0x01]
2676 mov r2, r3, lsr #8 /* r2 = .543 */
2677 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
2678 mov r2, ip, lsr #8 /* r2 = .987 */
2679 orr r2, r2, r1, lsl #24 /* r2 = A987 */
2680 mov r1, r1, lsr #8 /* r1 = ...B */
2684 strb r1, [r0, #0x0b]
2689 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2692 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
2693 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
2694 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
2697 mov r2, r3, lsr #16 /* r2 = ..12 */
2698 strh r2, [r0, #0x01]
2699 mov r3, r3, lsl #16 /* r3 = 34.. */
2700 orr r3, r3, ip, lsr #16 /* r3 = 3456 */
2701 mov ip, ip, lsl #16 /* ip = 78.. */
2702 orr ip, ip, r1, lsr #16 /* ip = 789A */
2703 mov r1, r1, lsr #8 /* r1 = .9AB */
2705 strh r3, [r0, #0x01]
2706 mov r3, r3, lsr #16 /* r3 = ..43 */
2707 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
2708 mov ip, ip, lsr #16 /* ip = ..87 */
2709 orr ip, ip, r1, lsl #16 /* ip = A987 */
2710 mov r1, r1, lsr #16 /* r1 = ..xB */
2714 strb r1, [r0, #0x0b]
2719 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2721 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
2722 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2723 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
2724 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
2727 mov r1, ip, lsl #16 /* r1 = 23.. */
2728 orr r1, r1, r3, lsr #16 /* r1 = 2345 */
2729 mov r3, r3, lsl #16 /* r3 = 67.. */
2730 orr r3, r3, r2, lsr #16 /* r3 = 6789 */
2733 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
2734 mov r3, r3, lsr #16 /* r3 = ..76 */
2735 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
2736 mov r2, r2, lsr #16 /* r2 = ..BA */
2740 strh r2, [r0, #0x0a]
2745 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2747 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
2748 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
2749 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */
2751 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
2752 ldrb r1, [r1, #0x0b] /* r1 = ...B */
2754 mov r2, r2, lsl #24 /* r2 = 2... */
2755 orr r2, r2, r3, lsr #8 /* r2 = 2345 */
2756 mov r3, r3, lsl #24 /* r3 = 6... */
2757 orr r3, r3, ip, lsr #8 /* r3 = 6789 */
2758 orr r1, r1, ip, lsl #8 /* r1 = 89AB */
2760 mov r2, r2, lsr #24 /* r2 = ...2 */
2761 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
2762 mov r3, r3, lsr #24 /* r3 = ...6 */
2763 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
2764 mov r1, r1, lsl #8 /* r1 = ..B. */
2765 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
2769 strh r1, [r0, #0x0a]
2774 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2779 ldrh r1, [r1, #0x0a]
2783 strh r1, [r0, #0x0a]
2788 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2790 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
2791 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
2792 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
2793 strh ip, [r0, #0x0a]
2794 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
2795 ldrb r1, [r1] /* r1 = ...0 */
2797 mov r2, r2, lsr #24 /* r2 = ...9 */
2798 orr r2, r2, r3, lsl #8 /* r2 = 6789 */
2799 mov r3, r3, lsr #24 /* r3 = ...5 */
2800 orr r3, r3, ip, lsl #8 /* r3 = 2345 */
2801 mov r1, r1, lsl #8 /* r1 = ..0. */
2802 orr r1, r1, ip, lsr #24 /* r1 = ..01 */
2804 mov r2, r2, lsl #24 /* r2 = 9... */
2805 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
2806 mov r3, r3, lsl #24 /* r3 = 5... */
2807 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
2808 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
2817 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2819 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2820 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
2821 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
2823 mov r3, r2, lsr #24 /* r3 = ...0 */
2825 mov r2, r2, lsl #8 /* r2 = 123. */
2826 orr r2, r2, ip, lsr #24 /* r2 = 1234 */
2828 mov r2, ip, lsl #8 /* r2 = 567. */
2829 orr r2, r2, r1, lsr #24 /* r2 = 5678 */
2831 mov r2, r1, lsr #8 /* r2 = ..9A */
2832 strh r2, [r0, #0x09]
2833 strb r1, [r0, #0x0b]
2836 mov r3, r2, lsr #8 /* r3 = .321 */
2837 orr r3, r3, ip, lsl #24 /* r3 = 4321 */
2839 mov r3, ip, lsr #8 /* r3 = .765 */
2840 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
2842 mov r1, r1, lsr #8 /* r1 = .BA9 */
2843 strh r1, [r0, #0x09]
2844 mov r1, r1, lsr #16 /* r1 = ...B */
2845 strb r1, [r0, #0x0b]
2851 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2853 ldrb r2, [r1, #0x0b] /* r2 = ...B */
2854 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
2855 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2856 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2857 strb r2, [r0, #0x0b]
2859 strh r3, [r0, #0x09]
2860 mov r3, r3, lsr #16 /* r3 = ..78 */
2861 orr r3, r3, ip, lsl #16 /* r3 = 5678 */
2862 mov ip, ip, lsr #16 /* ip = ..34 */
2863 orr ip, ip, r1, lsl #16 /* ip = 1234 */
2864 mov r1, r1, lsr #16 /* r1 = ..x0 */
2866 mov r2, r3, lsr #16 /* r2 = ..A9 */
2867 strh r2, [r0, #0x09]
2868 mov r3, r3, lsl #16 /* r3 = 87.. */
2869 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
2870 mov ip, ip, lsl #16 /* ip = 43.. */
2871 orr ip, ip, r1, lsr #16 /* ip = 4321 */
2872 mov r1, r1, lsr #8 /* r1 = .210 */
2881 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2884 ldrh r2, [r1, #0x0a] /* r2 = ..AB */
2885 ldr ip, [r1, #0x06] /* ip = 6789 */
2886 ldr r3, [r1, #0x02] /* r3 = 2345 */
2887 ldrh r1, [r1] /* r1 = ..01 */
2888 strb r2, [r0, #0x0b]
2889 mov r2, r2, lsr #8 /* r2 = ...A */
2890 orr r2, r2, ip, lsl #8 /* r2 = 789A */
2891 mov ip, ip, lsr #8 /* ip = .678 */
2892 orr ip, ip, r3, lsl #24 /* ip = 5678 */
2893 mov r3, r3, lsr #8 /* r3 = .234 */
2894 orr r3, r3, r1, lsl #24 /* r3 = 1234 */
2895 mov r1, r1, lsr #8 /* r1 = ...0 */
2899 strh r2, [r0, #0x09]
2901 ldrh r2, [r1] /* r2 = ..10 */
2902 ldr r3, [r1, #0x02] /* r3 = 5432 */
2903 ldr ip, [r1, #0x06] /* ip = 9876 */
2904 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2906 mov r2, r2, lsr #8 /* r2 = ...1 */
2907 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2908 mov r3, r3, lsr #24 /* r3 = ...5 */
2909 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
2910 mov ip, ip, lsr #24 /* ip = ...9 */
2911 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
2912 mov r1, r1, lsr #8 /* r1 = ...B */
2915 strh ip, [r0, #0x09]
2916 strb r1, [r0, #0x0b]
2922 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2928 ldrh r2, [r1, #0x09]
2929 ldrb r1, [r1, #0x0b]
2932 strh r2, [r0, #0x09]
2933 strb r1, [r0, #0x0b]
2935 #endif /* _ARM_ARCH_5E */