sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  * 3. All advertising materials mentioning features or use of this software
  76  *    must display the following acknowledgement:
  77  *      This product includes software developed by the NetBSD
  78  *      Foundation, Inc. and its contributors.
  79  * 4. Neither the name of The NetBSD Foundation nor the names of its
  80  *    contributors may be used to endorse or promote products derived
  81  *    from this software without specific prior written permission.
  82  *
  83  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  84  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  85  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  86  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  87  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  88  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  89  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  90  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  91  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  92  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  93  * POSSIBILITY OF SUCH DAMAGE.
  94  */
  95
  96 #include <machine/asm.h>
  97 #include <machine/asmacros.h>
  98 __FBSDID("$FreeBSD$");
  99
 100 #include "assym.s"
 101
 102 .L_arm_memcpy:                 /* literal: address of the _arm_memcpy variable */
 103         .word   _C_LABEL(_arm_memcpy)      /* not referenced in this part of the file; presumably used by memcpy - confirm */
 104 .L_arm_bzero:                  /* literal: address of the _arm_bzero variable */
 105         .word   _C_LABEL(_arm_bzero)       /* bzero loads this word, then the pointer stored at that address */
 106 .L_min_memcpy_size:            /* literal: address of the _min_memcpy_size variable */
 107         .word   _C_LABEL(_min_memcpy_size) /* not referenced in this part of the file; presumably used by memcpy - confirm */
 108 .L_min_bzero_size:             /* literal: address of the _min_bzero_size variable */
 109         .word   _C_LABEL(_min_bzero_size)  /* bzero compares its length argument against the value stored there */
 110 /*
 111  * memset: Sets a block of memory to the specified value
 112  *
 113  * On entry:
 114  *   r0 - dest address
 115  *   r1 - byte to write
 116  *   r2 - number of bytes to write
 117  *
 118  * On exit:
 119  *   r0 - dest address
      *
      * bzero: zero-fills a block of memory.  Note that its register
      * usage differs from memset's:
      *
      * On entry:
      *   r0 - dest address
      *   r1 - number of bytes to clear
      *
      * bzero first tries the optional hook whose address is stored in
      * _arm_bzero.  The hook is only called when the pointer is non-NULL
      * and the length is not below _min_bzero_size.  If the hook returns
      * 0, bzero returns straight away; for any other return value (or
      * when the hook is not used) it branches to do_memset with a fill
      * value of 0.  r0 is preserved in either case.
 120  */
 121 /* LINTSTUB: Func: void bzero(void *, size_t) */
 122 ENTRY(bzero)
 123         ldr     r3, .L_arm_bzero        /* r3 = &_arm_bzero */
 124         ldr     r3, [r3]                /* r3 = _arm_bzero (hook pointer) */
 125         cmp     r3, #0
 126         beq     .Lnormal0               /* No hook installed */
 127         ldr     r2, .L_min_bzero_size   /* r2 = &_min_bzero_size */
 128         ldr     r2, [r2]                /* r2 = _min_bzero_size */
 129         cmp     r1, r2
 130         blt     .Lnormal0               /* Length below the threshold (signed compare) */
 131         stmfd   sp!, {r0, r1, lr}       /* Keep dest, length and return address across the call */
 132         mov     r2, #0                  /* Third argument for the hook; its meaning is defined by the hook, not visible here */
 133         mov     lr, pc                  /* pc reads as this instruction + 8, i.e. the cmp below */
 134         mov     pc, r3                  /* Call the hook; must directly follow the mov lr, pc */
 135         cmp     r0, #0                  /* Hook result; ldmfd below leaves the flags alone */
 136         ldmfd   sp!, {r0, r1, lr}
 137         RETeq                           /* Hook returned 0: nothing more to do */
 138 .Lnormal0:
 139         mov     r3, #0x00               /* Fill value is zero */
 140         b       do_memset               /* Enter memset with r0 = dest, r1 = length, r3 = fill */
 141
 142 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
     /*
      * memset: fill r2 bytes at r0 with the low byte of r1 and return r0.
      *
      * do_memset is a second entry point (bzero branches to it) that
      * expects r0 = dest, r1 = byte count, r3 = fill byte.
      *
      * Register use inside the routine:
      *   r0 - never written, so it is still the dest address on return
      *   ip - write cursor
      *   r1 - bytes remaining (goes negative after an "extra" subtract)
      *   r3 - fill pattern, widened from 8 to 32 bits once word aligned
      *   r2 - alignment scratch, then a copy of r3 for two-word stores
      *
      * Many conditional instructions below consume flags that were set
      * several instructions earlier; the stores in between do not
      * change the flags.  Keep the instruction order intact.
      */
 143 ENTRY(memset)
 144         and     r3, r1, #0xff           /* We deal with bytes */
 145         mov     r1, r2                  /* r1 = byte count, as do_memset expects */
 146 do_memset:
 147         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 148         mov     ip, r0                  /* ip = write cursor; flags from the cmp are kept */
 149         blt     .Lmemset_lessthanfour
 150
 151         /* Ok first we will word align the address */
 152         ands    r2, ip, #0x03           /* Get the bottom two bits */
 153         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 154
 155         /* We are now word aligned */
 156 .Lmemset_wordaligned:
 157         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 158 #ifdef _ARM_ARCH_5E
 159         tst     ip, #0x04               /* Quad-align for armv5e */
 160 #else
 161         cmp     r1, #0x10               /* Flags are used by the blt below */
 162 #endif
 163         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 164 #ifdef _ARM_ARCH_5E
 165         subne   r1, r1, #0x04           /* Quad-align if necessary */
 166         strne   r3, [ip], #0x04
 167         cmp     r1, #0x10
 168 #endif
 169         blt     .Lmemset_loop4          /* If less than 16 then use words */
 170         mov     r2, r3                  /* Duplicate data */
 171         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 172         blt     .Lmemset_loop32
 173
 174         /* Do 128 bytes at a time */
 175 .Lmemset_loop128:
 176         subs    r1, r1, #0x80           /* Stores below only happen if >= 128 bytes were left */
 177 #ifdef _ARM_ARCH_5E
 178         strged  r2, [ip], #0x08         /* Each strd writes the r2/r3 pair: 8 bytes */
 179         strged  r2, [ip], #0x08
 180         strged  r2, [ip], #0x08
 181         strged  r2, [ip], #0x08
 182         strged  r2, [ip], #0x08
 183         strged  r2, [ip], #0x08
 184         strged  r2, [ip], #0x08
 185         strged  r2, [ip], #0x08
 186         strged  r2, [ip], #0x08
 187         strged  r2, [ip], #0x08
 188         strged  r2, [ip], #0x08
 189         strged  r2, [ip], #0x08
 190         strged  r2, [ip], #0x08
 191         strged  r2, [ip], #0x08
 192         strged  r2, [ip], #0x08
 193         strged  r2, [ip], #0x08
 194 #else
 195         stmgeia ip!, {r2-r3}            /* Each stm writes r2 and r3: 8 bytes */
 196         stmgeia ip!, {r2-r3}
 197         stmgeia ip!, {r2-r3}
 198         stmgeia ip!, {r2-r3}
 199         stmgeia ip!, {r2-r3}
 200         stmgeia ip!, {r2-r3}
 201         stmgeia ip!, {r2-r3}
 202         stmgeia ip!, {r2-r3}
 203         stmgeia ip!, {r2-r3}
 204         stmgeia ip!, {r2-r3}
 205         stmgeia ip!, {r2-r3}
 206         stmgeia ip!, {r2-r3}
 207         stmgeia ip!, {r2-r3}
 208         stmgeia ip!, {r2-r3}
 209         stmgeia ip!, {r2-r3}
 210         stmgeia ip!, {r2-r3}
 211 #endif
 212         bgt     .Lmemset_loop128        /* Bytes still remain; a short final pass stores nothing */
 213         RETeq                   /* Zero length so just exit */
 214
 215         add     r1, r1, #0x80           /* Adjust for extra sub */
 216
 217         /* Do 32 bytes at a time */
 218 .Lmemset_loop32:
 219         subs    r1, r1, #0x20           /* Stores below only happen if >= 32 bytes were left */
 220 #ifdef _ARM_ARCH_5E
 221         strged  r2, [ip], #0x08
 222         strged  r2, [ip], #0x08
 223         strged  r2, [ip], #0x08
 224         strged  r2, [ip], #0x08
 225 #else
 226         stmgeia ip!, {r2-r3}
 227         stmgeia ip!, {r2-r3}
 228         stmgeia ip!, {r2-r3}
 229         stmgeia ip!, {r2-r3}
 230 #endif
 231         bgt     .Lmemset_loop32
 232         RETeq                   /* Zero length so just exit */
 233
 234         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 235
 236         /* Deal with 16 bytes or more */
 237 #ifdef _ARM_ARCH_5E
 238         strged  r2, [ip], #0x08
 239         strged  r2, [ip], #0x08
 240 #else
 241         stmgeia ip!, {r2-r3}
 242         stmgeia ip!, {r2-r3}
 243 #endif
 244         RETeq                   /* Zero length so just exit */
 245
 246         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 247
 248         /* We have at least 4 bytes so copy as words */
     /* (The strge guard also copes with fewer than 4 bytes left.) */
 249 .Lmemset_loop4:
 250         subs    r1, r1, #0x04
 251         strge   r3, [ip], #0x04
 252         bgt     .Lmemset_loop4
 253         RETeq                   /* Zero length so just exit */
 254
 255 #ifdef _ARM_ARCH_5E
 256         /* Compensate for 64-bit alignment check */
 257         adds    r1, r1, #0x04
 258         RETeq
 259         cmp     r1, #2
 260 #else
 261         cmp     r1, #-2                 /* r1 is (bytes left - 4) here, so -2 means 2 bytes left */
 262 #endif
 263
 264         strb    r3, [ip], #0x01         /* Set 1 byte */
 265         strgeb  r3, [ip], #0x01         /* Set another byte */
 266         strgtb  r3, [ip]                /* and a third */
 267         RET                     /* Exit */
 268
 269 .Lmemset_wordunaligned:
 270         rsb     r2, r2, #0x004          /* r2 = 4 - (ip & 3): bytes up to the next word boundary */
 271         strb    r3, [ip], #0x01         /* Set 1 byte */
 272         cmp     r2, #0x02
 273         strgeb  r3, [ip], #0x01         /* Set another byte */
 274         sub     r1, r1, r2              /* Count the alignment bytes; flags are left alone */
 275         strgtb  r3, [ip], #0x01         /* and a third */
 276         cmp     r1, #0x04               /* More than 4 bytes left? */
 277         bge     .Lmemset_wordaligned    /* Yup */
     /* Otherwise fall through and finish byte by byte */
 278
 279 .Lmemset_lessthanfour:
 280         cmp     r1, #0x00
 281         RETeq                   /* Zero length so exit */
 282         strb    r3, [ip], #0x01         /* Set 1 byte */
 283         cmp     r1, #0x02
 284         strgeb  r3, [ip], #0x01         /* Set another byte */
 285         strgtb  r3, [ip]                /* and a third */
 286         RET                     /* Exit */
 287
 288 ENTRY(bcmp)
     /*
      * bcmp: compare two byte ranges.
      *
      * On entry:
      *   r0 - first buffer (b1); copied to ip so r0 can hold the result
      *   r1 - second buffer (b2)
      *   r2 - length in bytes
      *
      * On exit:
      *   r0 - 0 when the length is 0, both pointers are equal, or all
      *        bytes match; otherwise b1[i] - b2[i] for the first
      *        mismatching byte
      *
      * A length of exactly 6 takes the hand-scheduled path at
      * .Lmemcmp_6bytes.  Otherwise, when both pointers share the same
      * alignment within a word, up to 3 leading bytes are compared and
      * then whole words; a word that differs (or the final word) is
      * compared again byte by byte to produce the result.
      */
 289         mov     ip, r0
 290         cmp     r2, #0x06
 291         beq     .Lmemcmp_6bytes
 292         mov     r0, #0x00
 293
 294         /* Are both addresses aligned the same way? */
 295         cmp     r2, #0x00
 296         eornes  r3, ip, r1              /* Only when len != 0; sets Z if the pointers are equal */
 297         RETeq                   /* len == 0, or same addresses! */
 298         tst     r3, #0x03
 299         subne   r2, r2, #0x01           /* The byte loop expects r2 = count - 1 */
 300         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
 301
 302         /* Word-align the addresses, if necessary */
     /*
      * r3 = (r1 - 5) & 3, then times 3, then shifted left by 3, which
      * is a multiple of 24 bytes.  Each "Compare ... byte(s)" group
      * below is exactly 6 instructions (24 bytes), and pc reads as the
      * first instruction after the nop, so the jump skips 1, 2 or 3
      * groups.  When r3 is 0 the jump is not taken and all three groups
      * run.  Do not add or remove instructions between the addne and
      * the word compare.
      */
 303         sub     r3, r1, #0x05
 304         ands    r3, r3, #0x03
 305         add     r3, r3, r3, lsl #1      /* r3 *= 3; flags from the ands are kept */
 306         addne   pc, pc, r3, lsl #3
 307         nop
 308
 309         /* Compare up to 3 bytes */
 310         ldrb    r0, [ip], #0x01
 311         ldrb    r3, [r1], #0x01
 312         subs    r0, r0, r3
 313         RETne
 314         subs    r2, r2, #0x01
 315         RETeq
 316
 317         /* Compare up to 2 bytes */
 318         ldrb    r0, [ip], #0x01
 319         ldrb    r3, [r1], #0x01
 320         subs    r0, r0, r3
 321         RETne
 322         subs    r2, r2, #0x01
 323         RETeq
 324
 325         /* Compare 1 byte */
 326         ldrb    r0, [ip], #0x01
 327         ldrb    r3, [r1], #0x01
 328         subs    r0, r0, r3
 329         RETne
 330         subs    r2, r2, #0x01
 331         RETeq
 332
 333         /* Compare 4 bytes at a time, if possible */
 334         subs    r2, r2, #0x04
 335         bcc     .Lmemcmp_bytewise       /* Fewer than 4 bytes left */
 336 .Lmemcmp_word_aligned:
 337         ldr     r0, [ip], #0x04
 338         ldr     r3, [r1], #0x04
 339         subs    r2, r2, #0x04           /* Carry clear: this was the last whole word */
 340         cmpcs   r0, r3                  /* Only compared here when another word follows */
 341         beq     .Lmemcmp_word_aligned
 342         sub     r0, r0, r3
 343
 344         /* Correct for extra subtraction, and check if done */
 345         adds    r2, r2, #0x04
 346         cmpeq   r0, #0x00               /* If done, did all bytes match? */
 347         RETeq                   /* Yup. Just return */
 348
 349         /* Re-do the final word byte-wise */
 350         sub     ip, ip, #0x04
 351         sub     r1, r1, #0x04
 352
 353 .Lmemcmp_bytewise:
 354         add     r2, r2, #0x03           /* r2 = bytes to compare - 1 */
 355 .Lmemcmp_bytewise2:
 356         ldrb    r0, [ip], #0x01
 357         ldrb    r3, [r1], #0x01
 358         subs    r2, r2, #0x01           /* Carry clear: this was the last byte */
 359         cmpcs   r0, r3
 360         beq     .Lmemcmp_bytewise2
 361         sub     r0, r0, r3              /* Difference of the last pair loaded */
 362         RET
 363
 364         /*
 365          * 6 byte compares are very common, thanks to the network stack.
 366          * This code is hand-scheduled to reduce the number of stalls for
 367          * load results. Everything else being equal, this will be ~32%
 368          * faster than a byte-wise memcmp.
 369          */
 370         .align  5
 371 .Lmemcmp_6bytes:
 372         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
 373         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
 374         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
 375         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
 376         ldreqb  r3, [ip, #0x01]         /* r3 = b1#1 */
 377         RETne                   /* Return if mismatch on #0 */
 378         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
 379         ldreqb  r3, [r1, #0x02]         /* r3 = b2#2 */
 380         ldreqb  r0, [ip, #0x02]         /* r0 = b1#2 */
 381         RETne                   /* Return if mismatch on #1 */
 382         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
 383         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
 384         ldreqb  r3, [ip, #0x03]         /* r3 = b1#3 */
 385         RETne                   /* Return if mismatch on #2 */
 386         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
 387         ldreqb  r3, [r1, #0x04]         /* r3 = b2#4 */
 388         ldreqb  r0, [ip, #0x04]         /* r0 = b1#4 */
 389         RETne                   /* Return if mismatch on #3 */
 390         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
 391         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
 392         ldreqb  r3, [ip, #0x05]         /* r3 = b1#5 */
 393         RETne                   /* Return if mismatch on #4 */
 394         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
 395         RET
 396
 397 ENTRY(bcopy)
     /*
      * bcopy: exchanges r0 and r1 and then falls straight through into
      * memmove, whose entry point follows immediately.  r2 is passed on
      * untouched.  There is deliberately no return or branch here.
      */
 398         /* switch the source and destination registers */
 399         eor     r0, r1, r0              /* r0 = r0 ^ r1 */
 400         eor     r1, r0, r1              /* r1 = original r0 */
 401         eor     r0, r1, r0              /* r0 = original r1; no scratch register needed */
 402 ENTRY(memmove)
 403         /* Do the buffers overlap? */
 404         cmp     r0, r1
 405         RETeq           /* Bail now if src/dst are the same */
 406         subcc   r3, r0, r1      /* if (dst > src) r3 = dst - src */
 407         subcs   r3, r1, r0      /* if (src > dsr) r3 = src - dst */
 408         cmp     r3, r2          /* if (r3 < len) we have an overlap */
 409         bcc     PIC_SYM(_C_LABEL(memcpy), PLT)
 410
 411         /* Determine copy direction */
 412         cmp     r1, r0
 413         bcc     .Lmemmove_backwards
 414
 415         moveq   r0, #0                  /* Quick abort for len=0 */
 416         RETeq
 417
 418         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
 419         subs    r2, r2, #4
 420         blt     .Lmemmove_fl4           /* less than 4 bytes */
 421         ands    r12, r0, #3
 422         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
 423         ands    r12, r1, #3
 424         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
 425
 426 .Lmemmove_ft8:
 427         /* We have aligned source and destination */
 428         subs    r2, r2, #8
 429         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
 430         subs    r2, r2, #0x14
 431         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
 432         stmdb   sp!, {r4}               /* borrow r4 */
 433
 434         /* blat 32 bytes at a time */
 435         /* XXX for really big copies perhaps we should use more registers */
 436 .Lmemmove_floop32:
 437         ldmia   r1!, {r3, r4, r12, lr}
 438         stmia   r0!, {r3, r4, r12, lr}
 439         ldmia   r1!, {r3, r4, r12, lr}
 440         stmia   r0!, {r3, r4, r12, lr}
 441         subs    r2, r2, #0x20
 442         bge     .Lmemmove_floop32
 443
 444         cmn     r2, #0x10
 445         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 446         stmgeia r0!, {r3, r4, r12, lr}
 447         subge   r2, r2, #0x10
 448         ldmia   sp!, {r4}               /* return r4 */
 449
 450 .Lmemmove_fl32:
 451         adds    r2, r2, #0x14
 452
 453         /* blat 12 bytes at a time */
 454 .Lmemmove_floop12:
 455         ldmgeia r1!, {r3, r12, lr}
 456         stmgeia r0!, {r3, r12, lr}
 457         subges  r2, r2, #0x0c
 458         bge     .Lmemmove_floop12
 459
 460 .Lmemmove_fl12:
 461         adds    r2, r2, #8
 462         blt     .Lmemmove_fl4
 463
 464         subs    r2, r2, #4
 465         ldrlt   r3, [r1], #4
 466         strlt   r3, [r0], #4
 467         ldmgeia r1!, {r3, r12}
 468         stmgeia r0!, {r3, r12}
 469         subge   r2, r2, #4
 470
 471 .Lmemmove_fl4:
 472         /* less than 4 bytes to go */
 473         adds    r2, r2, #4
 474         ldmeqia sp!, {r0, pc}           /* done */
 475
 476         /* copy the crud byte at a time */
 477         cmp     r2, #2
 478         ldrb    r3, [r1], #1
 479         strb    r3, [r0], #1
 480         ldrgeb  r3, [r1], #1
 481         strgeb  r3, [r0], #1
 482         ldrgtb  r3, [r1], #1
 483         strgtb  r3, [r0], #1
 484         ldmia   sp!, {r0, pc}
 485
 486         /* erg - unaligned destination */
 487 .Lmemmove_fdestul:
 488         rsb     r12, r12, #4
 489         cmp     r12, #2
 490
 491         /* align destination with byte copies */
 492         ldrb    r3, [r1], #1
 493         strb    r3, [r0], #1
 494         ldrgeb  r3, [r1], #1
 495         strgeb  r3, [r0], #1
 496         ldrgtb  r3, [r1], #1
 497         strgtb  r3, [r0], #1
 498         subs    r2, r2, r12
 499         blt     .Lmemmove_fl4           /* less the 4 bytes */
 500
 501         ands    r12, r1, #3
 502         beq     .Lmemmove_ft8           /* we have an aligned source */
 503
 504         /* erg - unaligned source */
 505         /* This is where it gets nasty ... */
 506 .Lmemmove_fsrcul:
 507         bic     r1, r1, #3
 508         ldr     lr, [r1], #4
 509         cmp     r12, #2
 510         bgt     .Lmemmove_fsrcul3
 511         beq     .Lmemmove_fsrcul2
 512         cmp     r2, #0x0c
 513         blt     .Lmemmove_fsrcul1loop4
 514         sub     r2, r2, #0x0c
 515         stmdb   sp!, {r4, r5}
 516
 517 .Lmemmove_fsrcul1loop16:
 518 #ifdef __ARMEB__
 519         mov     r3, lr, lsl #8
 520 #else
 521         mov     r3, lr, lsr #8
 522 #endif
 523         ldmia   r1!, {r4, r5, r12, lr}
 524 #ifdef __ARMEB__
 525         orr     r3, r3, r4, lsr #24
 526         mov     r4, r4, lsl #8
 527         orr     r4, r4, r5, lsr #24
 528         mov     r5, r5, lsl #8
 529         orr     r5, r5, r12, lsr #24
 530         mov     r12, r12, lsl #8
 531         orr     r12, r12, lr, lsr #24
 532 #else
 533         orr     r3, r3, r4, lsl #24
 534         mov     r4, r4, lsr #8
 535         orr     r4, r4, r5, lsl #24
 536         mov     r5, r5, lsr #8
 537         orr     r5, r5, r12, lsl #24
 538         mov     r12, r12, lsr #8
 539         orr     r12, r12, lr, lsl #24
 540 #endif
 541         stmia   r0!, {r3-r5, r12}
 542         subs    r2, r2, #0x10
 543         bge     .Lmemmove_fsrcul1loop16
 544         ldmia   sp!, {r4, r5}
 545         adds    r2, r2, #0x0c
 546         blt     .Lmemmove_fsrcul1l4
 547
 548 .Lmemmove_fsrcul1loop4:
 549 #ifdef __ARMEB__
 550         mov     r12, lr, lsl #8
 551 #else
 552         mov     r12, lr, lsr #8
 553 #endif
 554         ldr     lr, [r1], #4
 555 #ifdef __ARMEB__
 556         orr     r12, r12, lr, lsr #24
 557 #else
 558         orr     r12, r12, lr, lsl #24
 559 #endif
 560         str     r12, [r0], #4
 561         subs    r2, r2, #4
 562         bge     .Lmemmove_fsrcul1loop4
 563
 564 .Lmemmove_fsrcul1l4:
 565         sub     r1, r1, #3
 566         b       .Lmemmove_fl4
 567
 568 .Lmemmove_fsrcul2:
 569         cmp     r2, #0x0c
 570         blt     .Lmemmove_fsrcul2loop4
 571         sub     r2, r2, #0x0c
 572         stmdb   sp!, {r4, r5}
 573
 574 .Lmemmove_fsrcul2loop16:
 575 #ifdef __ARMEB__
 576         mov     r3, lr, lsl #16
 577 #else
 578         mov     r3, lr, lsr #16
 579 #endif
 580         ldmia   r1!, {r4, r5, r12, lr}
 581 #ifdef __ARMEB__
 582         orr     r3, r3, r4, lsr #16
 583         mov     r4, r4, lsl #16
 584         orr     r4, r4, r5, lsr #16
 585         mov     r5, r5, lsl #16
 586         orr     r5, r5, r12, lsr #16
 587         mov     r12, r12, lsl #16
 588         orr     r12, r12, lr, lsr #16
 589 #else
 590         orr     r3, r3, r4, lsl #16
 591         mov     r4, r4, lsr #16
 592         orr     r4, r4, r5, lsl #16
 593         mov     r5, r5, lsr #16
 594         orr     r5, r5, r12, lsl #16
 595         mov     r12, r12, lsr #16
 596         orr     r12, r12, lr, lsl #16
 597 #endif
 598         stmia   r0!, {r3-r5, r12}
 599         subs    r2, r2, #0x10
 600         bge     .Lmemmove_fsrcul2loop16
 601         ldmia   sp!, {r4, r5}
 602         adds    r2, r2, #0x0c
 603         blt     .Lmemmove_fsrcul2l4
 604
 605 .Lmemmove_fsrcul2loop4:
 606 #ifdef __ARMEB__
 607         mov     r12, lr, lsl #16
 608 #else
 609         mov     r12, lr, lsr #16
 610 #endif
 611         ldr     lr, [r1], #4
 612 #ifdef __ARMEB__
 613         orr     r12, r12, lr, lsr #16
 614 #else
 615         orr     r12, r12, lr, lsl #16
 616 #endif
 617         str     r12, [r0], #4
 618         subs    r2, r2, #4
 619         bge     .Lmemmove_fsrcul2loop4
 620
 621 .Lmemmove_fsrcul2l4:
 622         sub     r1, r1, #2
 623         b       .Lmemmove_fl4
 624
 625 .Lmemmove_fsrcul3:
 626         cmp     r2, #0x0c
 627         blt     .Lmemmove_fsrcul3loop4
 628         sub     r2, r2, #0x0c
 629         stmdb   sp!, {r4, r5}
 630
 631 .Lmemmove_fsrcul3loop16:
 632 #ifdef __ARMEB__
 633         mov     r3, lr, lsl #24
 634 #else
 635         mov     r3, lr, lsr #24
 636 #endif
 637         ldmia   r1!, {r4, r5, r12, lr}
 638 #ifdef __ARMEB__
 639         orr     r3, r3, r4, lsr #8
 640         mov     r4, r4, lsl #24
 641         orr     r4, r4, r5, lsr #8
 642         mov     r5, r5, lsl #24
 643         orr     r5, r5, r12, lsr #8
 644         mov     r12, r12, lsl #24
 645         orr     r12, r12, lr, lsr #8
 646 #else
 647         orr     r3, r3, r4, lsl #8
 648         mov     r4, r4, lsr #24
 649         orr     r4, r4, r5, lsl #8
 650         mov     r5, r5, lsr #24
 651         orr     r5, r5, r12, lsl #8
 652         mov     r12, r12, lsr #24
 653         orr     r12, r12, lr, lsl #8
 654 #endif
 655         stmia   r0!, {r3-r5, r12}
 656         subs    r2, r2, #0x10
 657         bge     .Lmemmove_fsrcul3loop16
 658         ldmia   sp!, {r4, r5}
 659         adds    r2, r2, #0x0c
 660         blt     .Lmemmove_fsrcul3l4
 661
 662 .Lmemmove_fsrcul3loop4:
 663 #ifdef __ARMEB__
 664         mov     r12, lr, lsl #24
 665 #else
 666         mov     r12, lr, lsr #24
 667 #endif
 668         ldr     lr, [r1], #4
 669 #ifdef __ARMEB__
 670         orr     r12, r12, lr, lsr #8
 671 #else
 672         orr     r12, r12, lr, lsl #8
 673 #endif
 674         str     r12, [r0], #4
 675         subs    r2, r2, #4
 676         bge     .Lmemmove_fsrcul3loop4
 677
 678 .Lmemmove_fsrcul3l4:
 679         sub     r1, r1, #1
 680         b       .Lmemmove_fl4
 681
 682 .Lmemmove_backwards:
 683         add     r1, r1, r2
 684         add     r0, r0, r2
 685         subs    r2, r2, #4
 686         blt     .Lmemmove_bl4           /* less than 4 bytes */
 687         ands    r12, r0, #3
 688         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
 689         ands    r12, r1, #3
 690         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
 691
 692 .Lmemmove_bt8:
 693         /* We have aligned source and destination */
 694         subs    r2, r2, #8
 695         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
 696         stmdb   sp!, {r4, lr}
 697         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
 698         blt     .Lmemmove_bl32
 699
 700         /* blat 32 bytes at a time */
 701         /* XXX for really big copies perhaps we should use more registers */
 702 .Lmemmove_bloop32:
 703         ldmdb   r1!, {r3, r4, r12, lr}
 704         stmdb   r0!, {r3, r4, r12, lr}
 705         ldmdb   r1!, {r3, r4, r12, lr}
 706         stmdb   r0!, {r3, r4, r12, lr}
 707         subs    r2, r2, #0x20
 708         bge     .Lmemmove_bloop32
 709
 710 .Lmemmove_bl32:
 711         cmn     r2, #0x10
 712         ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 713         stmgedb r0!, {r3, r4, r12, lr}
 714         subge   r2, r2, #0x10
 715         adds    r2, r2, #0x14
 716         ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
 717         stmgedb r0!, {r3, r12, lr}
 718         subge   r2, r2, #0x0c
 719         ldmia   sp!, {r4, lr}
 720
 721 .Lmemmove_bl12:
 722         adds    r2, r2, #8
 723         blt     .Lmemmove_bl4
 724         subs    r2, r2, #4
 725         ldrlt   r3, [r1, #-4]!
 726         strlt   r3, [r0, #-4]!
 727         ldmgedb r1!, {r3, r12}
 728         stmgedb r0!, {r3, r12}
 729         subge   r2, r2, #4
 730
 731 .Lmemmove_bl4:
 732         /* less than 4 bytes to go */
 733         adds    r2, r2, #4
 734         RETeq                   /* done */
 735
 736         /* copy the crud byte at a time */
 737         cmp     r2, #2
 738         ldrb    r3, [r1, #-1]!
 739         strb    r3, [r0, #-1]!
 740         ldrgeb  r3, [r1, #-1]!
 741         strgeb  r3, [r0, #-1]!
 742         ldrgtb  r3, [r1, #-1]!
 743         strgtb  r3, [r0, #-1]!
 744         RET
 745
 746         /* erg - unaligned destination */
 747 .Lmemmove_bdestul:
 748         cmp     r12, #2
 749
 750         /* align destination with byte copies */
 751         ldrb    r3, [r1, #-1]!
 752         strb    r3, [r0, #-1]!
 753         ldrgeb  r3, [r1, #-1]!
 754         strgeb  r3, [r0, #-1]!
 755         ldrgtb  r3, [r1, #-1]!
 756         strgtb  r3, [r0, #-1]!
 757         subs    r2, r2, r12
 758         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
 759         ands    r12, r1, #3
 760         beq     .Lmemmove_bt8           /* we have an aligned source */
 761
 762         /* erg - unaligned source */
 763         /* This is where it gets nasty ... */
 764 .Lmemmove_bsrcul:
 765         bic     r1, r1, #3
 766         ldr     r3, [r1, #0]
 767         cmp     r12, #2
 768         blt     .Lmemmove_bsrcul1
 769         beq     .Lmemmove_bsrcul2
 770         cmp     r2, #0x0c
 771         blt     .Lmemmove_bsrcul3loop4
 772         sub     r2, r2, #0x0c
 773         stmdb   sp!, {r4, r5, lr}
 774
 775 .Lmemmove_bsrcul3loop16:
 776 #ifdef __ARMEB__
 777         mov     lr, r3, lsr #8
 778 #else
 779         mov     lr, r3, lsl #8
 780 #endif
 781         ldmdb   r1!, {r3-r5, r12}
 782 #ifdef __ARMEB__
 783         orr     lr, lr, r12, lsl #24
 784         mov     r12, r12, lsr #8
 785         orr     r12, r12, r5, lsl #24
 786         mov     r5, r5, lsr #8
 787         orr     r5, r5, r4, lsl #24
 788         mov     r4, r4, lsr #8
 789         orr     r4, r4, r3, lsl #24
 790 #else
 791         orr     lr, lr, r12, lsr #24
 792         mov     r12, r12, lsl #8
 793         orr     r12, r12, r5, lsr #24
 794         mov     r5, r5, lsl #8
 795         orr     r5, r5, r4, lsr #24
 796         mov     r4, r4, lsl #8
 797         orr     r4, r4, r3, lsr #24
 798 #endif
 799         stmdb   r0!, {r4, r5, r12, lr}
 800         subs    r2, r2, #0x10
 801         bge     .Lmemmove_bsrcul3loop16
 802         ldmia   sp!, {r4, r5, lr}
 803         adds    r2, r2, #0x0c
 804         blt     .Lmemmove_bsrcul3l4
 805
 806 .Lmemmove_bsrcul3loop4:
 807 #ifdef __ARMEB__
 808         mov     r12, r3, lsr #8
 809 #else
 810         mov     r12, r3, lsl #8
 811 #endif
 812         ldr     r3, [r1, #-4]!
 813 #ifdef __ARMEB__
 814         orr     r12, r12, r3, lsl #24
 815 #else
 816         orr     r12, r12, r3, lsr #24
 817 #endif
 818         str     r12, [r0, #-4]!
 819         subs    r2, r2, #4
 820         bge     .Lmemmove_bsrcul3loop4
 821
 822 .Lmemmove_bsrcul3l4:
 823         add     r1, r1, #3
 824         b       .Lmemmove_bl4
 825
 826 .Lmemmove_bsrcul2:
 827         cmp     r2, #0x0c
 828         blt     .Lmemmove_bsrcul2loop4
 829         sub     r2, r2, #0x0c
 830         stmdb   sp!, {r4, r5, lr}
 831
 832 .Lmemmove_bsrcul2loop16:
 833 #ifdef __ARMEB__
 834         mov     lr, r3, lsr #16
 835 #else
 836         mov     lr, r3, lsl #16
 837 #endif
 838         ldmdb   r1!, {r3-r5, r12}
 839 #ifdef __ARMEB__
 840         orr     lr, lr, r12, lsl #16
 841         mov     r12, r12, lsr #16
 842         orr     r12, r12, r5, lsl #16
 843         mov     r5, r5, lsr #16
 844         orr     r5, r5, r4, lsl #16
 845         mov     r4, r4, lsr #16
 846         orr     r4, r4, r3, lsl #16
 847 #else
 848         orr     lr, lr, r12, lsr #16
 849         mov     r12, r12, lsl #16
 850         orr     r12, r12, r5, lsr #16
 851         mov     r5, r5, lsl #16
 852         orr     r5, r5, r4, lsr #16
 853         mov     r4, r4, lsl #16
 854         orr     r4, r4, r3, lsr #16
 855 #endif
 856         stmdb   r0!, {r4, r5, r12, lr}
 857         subs    r2, r2, #0x10
 858         bge     .Lmemmove_bsrcul2loop16
 859         ldmia   sp!, {r4, r5, lr}
 860         adds    r2, r2, #0x0c
 861         blt     .Lmemmove_bsrcul2l4
 862
 863 .Lmemmove_bsrcul2loop4:
 864 #ifdef __ARMEB__
 865         mov     r12, r3, lsr #16
 866 #else
 867         mov     r12, r3, lsl #16
 868 #endif
 869         ldr     r3, [r1, #-4]!
 870 #ifdef __ARMEB__
 871         orr     r12, r12, r3, lsl #16
 872 #else
 873         orr     r12, r12, r3, lsr #16
 874 #endif
 875         str     r12, [r0, #-4]!
 876         subs    r2, r2, #4
 877         bge     .Lmemmove_bsrcul2loop4
 878
 879 .Lmemmove_bsrcul2l4:
 880         add     r1, r1, #2
 881         b       .Lmemmove_bl4
 882
 883 .Lmemmove_bsrcul1:
 884         cmp     r2, #0x0c
 885         blt     .Lmemmove_bsrcul1loop4
 886         sub     r2, r2, #0x0c
 887         stmdb   sp!, {r4, r5, lr}
 888
 889 .Lmemmove_bsrcul1loop32:
 890 #ifdef __ARMEB__
 891         mov     lr, r3, lsr #24
 892 #else
 893         mov     lr, r3, lsl #24
 894 #endif
 895         ldmdb   r1!, {r3-r5, r12}
 896 #ifdef __ARMEB__
 897         orr     lr, lr, r12, lsl #8
 898         mov     r12, r12, lsr #24
 899         orr     r12, r12, r5, lsl #8
 900         mov     r5, r5, lsr #24
 901         orr     r5, r5, r4, lsl #8
 902         mov     r4, r4, lsr #24
 903         orr     r4, r4, r3, lsl #8
 904 #else
 905         orr     lr, lr, r12, lsr #8
 906         mov     r12, r12, lsl #24
 907         orr     r12, r12, r5, lsr #8
 908         mov     r5, r5, lsl #24
 909         orr     r5, r5, r4, lsr #8
 910         mov     r4, r4, lsl #24
 911         orr     r4, r4, r3, lsr #8
 912 #endif
 913         stmdb   r0!, {r4, r5, r12, lr}
 914         subs    r2, r2, #0x10
 915         bge     .Lmemmove_bsrcul1loop32
 916         ldmia   sp!, {r4, r5, lr}
 917         adds    r2, r2, #0x0c
 918         blt     .Lmemmove_bsrcul1l4
 919
 920 .Lmemmove_bsrcul1loop4:
 921 #ifdef __ARMEB__
 922         mov     r12, r3, lsr #24
 923 #else
 924         mov     r12, r3, lsl #24
 925 #endif
 926         ldr     r3, [r1, #-4]!
 927 #ifdef __ARMEB__
 928         orr     r12, r12, r3, lsl #8
 929 #else
 930         orr     r12, r12, r3, lsr #8
 931 #endif
 932         str     r12, [r0, #-4]!
 933         subs    r2, r2, #4
 934         bge     .Lmemmove_bsrcul1loop4
 935
 936 .Lmemmove_bsrcul1l4:
 937         add     r1, r1, #1
 938         b       .Lmemmove_bl4
 939
 940 #if !defined(_ARM_ARCH_5E)
 941 ENTRY(memcpy)
     /*
      * void *memcpy(void *dst, const void *src, size_t len)
      *
      * Variant assembled when _ARM_ARCH_5E is not defined.
      *
      * In:    r0 = dst, r1 = src, r2 = len
      * Out:   r0 = dst (the entry value is stacked at .Lnormal and popped
      *        together with the return address on every exit path)
      * Uses:  r3, r12 and lr as scratch; r4 and r5 are stacked around the
      *        loops that borrow them.
      *
      * The byte count is tested with signed branches (blt/bge) throughout.
      * From .Lnormal onwards r2 holds "bytes remaining - 4" whenever control
      * reaches .Lmemcpy_l4, which adds the 4 back before the byte tail.
      */
 942         /* Offer the copy to the routine published through .L_arm_memcpy */
 943         /* Do not check arm_memcpy if we're running from flash */
 944 #ifdef FLASHADDR
 945 #if FLASHADDR > PHYSADDR
 946         ldr     r3, =FLASHADDR
 947         cmp     r3, pc
 948         bls     .Lnormal
 949 #else
 950         ldr     r3, =FLASHADDR
 951         cmp     r3, pc
 952         bhi     .Lnormal
 953 #endif
 954 #endif
 955         ldr     r3, .L_arm_memcpy
 956         ldr     r3, [r3]
 957         cmp     r3, #0
 958         beq     .Lnormal                /* no routine registered */
 959         ldr     r3, .L_min_memcpy_size
 960         ldr     r3, [r3]
 961         cmp     r2, r3
 962         blt     .Lnormal                /* len below the stored minimum */
 963         stmfd   sp!, {r0-r2, r4, lr}
 964         mov     r3, #0                  /* fourth argument is always 0 */
 965         ldr     r4, .L_arm_memcpy
 966         mov     lr, pc                  /* return lands on the cmp below */
 967         ldr     pc, [r4]
 968         cmp     r0, #0
 969         ldmfd   sp!, {r0-r2, r4, lr}    /* restores args, keeps the flags */
 970         RETeq                           /* routine returned 0: all done */
 971
 972 .Lnormal:
 973         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
 974
 975         subs    r2, r2, #4
 976         blt     .Lmemcpy_l4             /* less than 4 bytes */
 977         ands    r12, r0, #3
 978         bne     .Lmemcpy_destul         /* oh unaligned destination addr */
 979         ands    r12, r1, #3
 980         bne     .Lmemcpy_srcul          /* oh unaligned source addr */
 981
 982 .Lmemcpy_t8:
 983         /* We have aligned source and destination */
 984         subs    r2, r2, #8
 985         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
 986         subs    r2, r2, #0x14
 987         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
 988         stmdb   sp!, {r4}               /* borrow r4 */
 989
 990         /* blat 32 bytes at a time */
 991         /* XXX for really big copies perhaps we should use more registers */
 992 .Lmemcpy_loop32:
 993         ldmia   r1!, {r3, r4, r12, lr}
 994         stmia   r0!, {r3, r4, r12, lr}
 995         ldmia   r1!, {r3, r4, r12, lr}
 996         stmia   r0!, {r3, r4, r12, lr}
 997         subs    r2, r2, #0x20
 998         bge     .Lmemcpy_loop32
 999
1000         cmn     r2, #0x10               /* ge <=> at least 16 bytes left */
1001         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
1002         stmgeia r0!, {r3, r4, r12, lr}
1003         subge   r2, r2, #0x10
1004         ldmia   sp!, {r4}               /* return r4 */
1005
1006 .Lmemcpy_l32:
1007         adds    r2, r2, #0x14           /* r2 = remaining - 12 */
1008
1009         /* blat 12 bytes at a time */
1010 .Lmemcpy_loop12:
1011         ldmgeia r1!, {r3, r12, lr}
1012         stmgeia r0!, {r3, r12, lr}
1013         subges  r2, r2, #0x0c
1014         bge     .Lmemcpy_loop12
1015
1016 .Lmemcpy_l12:
1017         adds    r2, r2, #8              /* r2 = remaining - 4 */
1018         blt     .Lmemcpy_l4
1019
             /* 4..11 bytes left: copy one word (lt) or two words (ge) */
1020         subs    r2, r2, #4
1021         ldrlt   r3, [r1], #4
1022         strlt   r3, [r0], #4
1023         ldmgeia r1!, {r3, r12}
1024         stmgeia r0!, {r3, r12}
1025         subge   r2, r2, #4
1026
1027 .Lmemcpy_l4:
1028         /* less than 4 bytes to go */
1029         adds    r2, r2, #4
             /*
              * NOTE(review): the guard below is spelled __APCS_26_ (a single
              * trailing underscore); confirm whether __APCS_26__ was intended.
              */
1030 #ifdef __APCS_26_
1031         ldmeqia sp!, {r0, pc}^          /* done */
1032 #else
1033         ldmeqia sp!, {r0, pc}           /* done */
1034 #endif
1035         /* copy the crud byte at a time */
1036         cmp     r2, #2                  /* 1 byte always, 2nd if >= 2, 3rd if > 2 */
1037         ldrb    r3, [r1], #1
1038         strb    r3, [r0], #1
1039         ldrgeb  r3, [r1], #1
1040         strgeb  r3, [r0], #1
1041         ldrgtb  r3, [r1], #1
1042         strgtb  r3, [r0], #1
1043         ldmia   sp!, {r0, pc}
1044
1045         /* erg - unaligned destination */
1046 .Lmemcpy_destul:
1047         rsb     r12, r12, #4            /* r12 = bytes needed to align dst */
1048         cmp     r12, #2
1049
1050         /* align destination with byte copies */
1051         ldrb    r3, [r1], #1
1052         strb    r3, [r0], #1
1053         ldrgeb  r3, [r1], #1
1054         strgeb  r3, [r0], #1
1055         ldrgtb  r3, [r1], #1
1056         strgtb  r3, [r0], #1
1057         subs    r2, r2, r12
1058         blt     .Lmemcpy_l4             /* less than 4 bytes */
1059
1060         ands    r12, r1, #3
1061         beq     .Lmemcpy_t8             /* we have an aligned source */
1062
1063         /* erg - unaligned source */
1064         /* This is where it gets nasty ... */
             /*
              * Here r12 = src & 3 (1, 2 or 3) and dst is word aligned.  The
              * source is read as aligned words; lr always holds the most
              * recently loaded word, and every stored word is built from the
              * unused bytes of the previous word plus the leading bytes of the
              * next.  Which shift extracts which part depends on byte order,
              * hence the __ARMEB__ arms, which mirror the shift directions of
              * the little-endian code.
              */
1065 .Lmemcpy_srcul:
1066         bic     r1, r1, #3
1067         ldr     lr, [r1], #4
1068         cmp     r12, #2
1069         bgt     .Lmemcpy_srcul3
1070         beq     .Lmemcpy_srcul2
1071         cmp     r2, #0x0c
1072         blt     .Lmemcpy_srcul1loop4
1073         sub     r2, r2, #0x0c
1074         stmdb   sp!, {r4, r5}
1075
1076 .Lmemcpy_srcul1loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #8
     #else
1077         mov     r3, lr, lsr #8
     #endif
1078         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #24
             mov     r4, r4, lsl #8
             orr     r4, r4, r5, lsr #24
             mov     r5, r5, lsl #8
             orr     r5, r5, r12, lsr #24
             mov     r12, r12, lsl #8
             orr     r12, r12, lr, lsr #24
     #else
1079         orr     r3, r3, r4, lsl #24
1080         mov     r4, r4, lsr #8
1081         orr     r4, r4, r5, lsl #24
1082         mov     r5, r5, lsr #8
1083         orr     r5, r5, r12, lsl #24
1084         mov     r12, r12, lsr #8
1085         orr     r12, r12, lr, lsl #24
     #endif
1086         stmia   r0!, {r3-r5, r12}
1087         subs    r2, r2, #0x10
1088         bge     .Lmemcpy_srcul1loop16
1089         ldmia   sp!, {r4, r5}
1090         adds    r2, r2, #0x0c
1091         blt     .Lmemcpy_srcul1l4
1092
1093 .Lmemcpy_srcul1loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #8
     #else
1094         mov     r12, lr, lsr #8
     #endif
1095         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #24
     #else
1096         orr     r12, r12, lr, lsl #24
     #endif
1097         str     r12, [r0], #4
1098         subs    r2, r2, #4
1099         bge     .Lmemcpy_srcul1loop4
1100
1101 .Lmemcpy_srcul1l4:
1102         sub     r1, r1, #3              /* back to the first uncopied byte */
1103         b       .Lmemcpy_l4
1104
1105 .Lmemcpy_srcul2:
1106         cmp     r2, #0x0c
1107         blt     .Lmemcpy_srcul2loop4
1108         sub     r2, r2, #0x0c
1109         stmdb   sp!, {r4, r5}
1110
1111 .Lmemcpy_srcul2loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #16
     #else
1112         mov     r3, lr, lsr #16
     #endif
1113         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #16
             mov     r4, r4, lsl #16
             orr     r4, r4, r5, lsr #16
             mov     r5, r5, lsl #16
             orr     r5, r5, r12, lsr #16
             mov     r12, r12, lsl #16
             orr     r12, r12, lr, lsr #16
     #else
1114         orr     r3, r3, r4, lsl #16
1115         mov     r4, r4, lsr #16
1116         orr     r4, r4, r5, lsl #16
1117         mov     r5, r5, lsr #16
1118         orr     r5, r5, r12, lsl #16
1119         mov     r12, r12, lsr #16
1120         orr     r12, r12, lr, lsl #16
     #endif
1121         stmia   r0!, {r3-r5, r12}
1122         subs    r2, r2, #0x10
1123         bge     .Lmemcpy_srcul2loop16
1124         ldmia   sp!, {r4, r5}
1125         adds    r2, r2, #0x0c
1126         blt     .Lmemcpy_srcul2l4
1127
1128 .Lmemcpy_srcul2loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #16
     #else
1129         mov     r12, lr, lsr #16
     #endif
1130         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #16
     #else
1131         orr     r12, r12, lr, lsl #16
     #endif
1132         str     r12, [r0], #4
1133         subs    r2, r2, #4
1134         bge     .Lmemcpy_srcul2loop4
1135
1136 .Lmemcpy_srcul2l4:
1137         sub     r1, r1, #2              /* back to the first uncopied byte */
1138         b       .Lmemcpy_l4
1139
1140 .Lmemcpy_srcul3:
1141         cmp     r2, #0x0c
1142         blt     .Lmemcpy_srcul3loop4
1143         sub     r2, r2, #0x0c
1144         stmdb   sp!, {r4, r5}
1145
1146 .Lmemcpy_srcul3loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #24
     #else
1147         mov     r3, lr, lsr #24
     #endif
1148         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #8
             mov     r4, r4, lsl #24
             orr     r4, r4, r5, lsr #8
             mov     r5, r5, lsl #24
             orr     r5, r5, r12, lsr #8
             mov     r12, r12, lsl #24
             orr     r12, r12, lr, lsr #8
     #else
1149         orr     r3, r3, r4, lsl #8
1150         mov     r4, r4, lsr #24
1151         orr     r4, r4, r5, lsl #8
1152         mov     r5, r5, lsr #24
1153         orr     r5, r5, r12, lsl #8
1154         mov     r12, r12, lsr #24
1155         orr     r12, r12, lr, lsl #8
     #endif
1156         stmia   r0!, {r3-r5, r12}
1157         subs    r2, r2, #0x10
1158         bge     .Lmemcpy_srcul3loop16
1159         ldmia   sp!, {r4, r5}
1160         adds    r2, r2, #0x0c
1161         blt     .Lmemcpy_srcul3l4
1162
1163 .Lmemcpy_srcul3loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #24
     #else
1164         mov     r12, lr, lsr #24
     #endif
1165         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #8
     #else
1166         orr     r12, r12, lr, lsl #8
     #endif
1167         str     r12, [r0], #4
1168         subs    r2, r2, #4
1169         bge     .Lmemcpy_srcul3loop4
1170
1171 .Lmemcpy_srcul3l4:
1172         sub     r1, r1, #1              /* back to the first uncopied byte */
1173         b       .Lmemcpy_l4
1174 #else
1175 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1176 ENTRY(memcpy)
1177         pld     [r1]
1178         cmp     r2, #0x0c
1179         ble     .Lmemcpy_short          /* <= 12 bytes */
1180 #ifdef FLASHADDR
1181 #if FLASHADDR > PHYSADDR
1182         ldr     r3, =FLASHADDR
1183         cmp     r3, pc
1184         bls     .Lnormal
1185 #else
1186         ldr     r3, =FLASHADDR
1187         cmp     r3, pc
1188         bhi     .Lnormal
1189 #endif
1190 #endif
1191         ldr     r3, .L_arm_memcpy
1192         ldr     r3, [r3]
1193         cmp     r3, #0
1194         beq     .Lnormal
1195         ldr     r3, .L_min_memcpy_size
1196         ldr     r3, [r3]
1197         cmp     r2, r3
1198         blt     .Lnormal
1199         stmfd   sp!, {r0-r2, r4, lr}
1200         mov     r3, #0
1201         ldr     r4, .L_arm_memcpy
1202         mov     lr, pc
1203         ldr     pc, [r4]
1204         cmp     r0, #0
1205         ldmfd   sp!, {r0-r2, r4, lr}
1206         RETeq
1207 .Lnormal:
1208         mov     r3, r0                  /* We must not clobber r0 */
1209
1210         /* Word-align the destination buffer */
1211         ands    ip, r3, #0x03           /* Already word aligned? */
1212         beq     .Lmemcpy_wordaligned    /* Yup */
1213         cmp     ip, #0x02
1214         ldrb    ip, [r1], #0x01
1215         sub     r2, r2, #0x01
1216         strb    ip, [r3], #0x01
1217         ldrleb  ip, [r1], #0x01
1218         suble   r2, r2, #0x01
1219         strleb  ip, [r3], #0x01
1220         ldrltb  ip, [r1], #0x01
1221         sublt   r2, r2, #0x01
1222         strltb  ip, [r3], #0x01
1223
1224         /* Destination buffer is now word aligned */
1225 .Lmemcpy_wordaligned:
1226         ands    ip, r1, #0x03           /* Is src also word-aligned? */
1227         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
1228
1229         /* Quad-align the destination buffer */
1230         tst     r3, #0x07               /* Already quad aligned? */
1231         ldrne   ip, [r1], #0x04
1232         stmfd   sp!, {r4-r9}            /* Free up some registers */
1233         subne   r2, r2, #0x04
1234         strne   ip, [r3], #0x04
1235
1236         /* Destination buffer quad aligned, source is at least word aligned */
1237         subs    r2, r2, #0x80
1238         blt     .Lmemcpy_w_lessthan128
1239
1240         /* Copy 128 bytes at a time */
1241 .Lmemcpy_w_loop128:
1242         ldr     r4, [r1], #0x04         /* LD:00-03 */
1243         ldr     r5, [r1], #0x04         /* LD:04-07 */
1244         pld     [r1, #0x18]             /* Prefetch 0x20 */
1245         ldr     r6, [r1], #0x04         /* LD:08-0b */
1246         ldr     r7, [r1], #0x04         /* LD:0c-0f */
1247         ldr     r8, [r1], #0x04         /* LD:10-13 */
1248         ldr     r9, [r1], #0x04         /* LD:14-17 */
1249         strd    r4, [r3], #0x08         /* ST:00-07 */
1250         ldr     r4, [r1], #0x04         /* LD:18-1b */
1251         ldr     r5, [r1], #0x04         /* LD:1c-1f */
1252         strd    r6, [r3], #0x08         /* ST:08-0f */
1253         ldr     r6, [r1], #0x04         /* LD:20-23 */
1254         ldr     r7, [r1], #0x04         /* LD:24-27 */
1255         pld     [r1, #0x18]             /* Prefetch 0x40 */
1256         strd    r8, [r3], #0x08         /* ST:10-17 */
1257         ldr     r8, [r1], #0x04         /* LD:28-2b */
1258         ldr     r9, [r1], #0x04         /* LD:2c-2f */
1259         strd    r4, [r3], #0x08         /* ST:18-1f */
1260         ldr     r4, [r1], #0x04         /* LD:30-33 */
1261         ldr     r5, [r1], #0x04         /* LD:34-37 */
1262         strd    r6, [r3], #0x08         /* ST:20-27 */
1263         ldr     r6, [r1], #0x04         /* LD:38-3b */
1264         ldr     r7, [r1], #0x04         /* LD:3c-3f */
1265         strd    r8, [r3], #0x08         /* ST:28-2f */
1266         ldr     r8, [r1], #0x04         /* LD:40-43 */
1267         ldr     r9, [r1], #0x04         /* LD:44-47 */
1268         pld     [r1, #0x18]             /* Prefetch 0x60 */
1269         strd    r4, [r3], #0x08         /* ST:30-37 */
1270         ldr     r4, [r1], #0x04         /* LD:48-4b */
1271         ldr     r5, [r1], #0x04         /* LD:4c-4f */
1272         strd    r6, [r3], #0x08         /* ST:38-3f */
1273         ldr     r6, [r1], #0x04         /* LD:50-53 */
1274         ldr     r7, [r1], #0x04         /* LD:54-57 */
1275         strd    r8, [r3], #0x08         /* ST:40-47 */
1276         ldr     r8, [r1], #0x04         /* LD:58-5b */
1277         ldr     r9, [r1], #0x04         /* LD:5c-5f */
1278         strd    r4, [r3], #0x08         /* ST:48-4f */
1279         ldr     r4, [r1], #0x04         /* LD:60-63 */
1280         ldr     r5, [r1], #0x04         /* LD:64-67 */
1281         pld     [r1, #0x18]             /* Prefetch 0x80 */
1282         strd    r6, [r3], #0x08         /* ST:50-57 */
1283         ldr     r6, [r1], #0x04         /* LD:68-6b */
1284         ldr     r7, [r1], #0x04         /* LD:6c-6f */
1285         strd    r8, [r3], #0x08         /* ST:58-5f */
1286         ldr     r8, [r1], #0x04         /* LD:70-73 */
1287         ldr     r9, [r1], #0x04         /* LD:74-77 */
1288         strd    r4, [r3], #0x08         /* ST:60-67 */
1289         ldr     r4, [r1], #0x04         /* LD:78-7b */
1290         ldr     r5, [r1], #0x04         /* LD:7c-7f */
1291         strd    r6, [r3], #0x08         /* ST:68-6f */
1292         strd    r8, [r3], #0x08         /* ST:70-77 */
1293         subs    r2, r2, #0x80
1294         strd    r4, [r3], #0x08         /* ST:78-7f */
1295         bge     .Lmemcpy_w_loop128
1296
1297 .Lmemcpy_w_lessthan128:
1298         adds    r2, r2, #0x80           /* Adjust for extra sub */
1299         ldmeqfd sp!, {r4-r9}
1300         RETeq                   /* Return now if done */
1301         subs    r2, r2, #0x20
1302         blt     .Lmemcpy_w_lessthan32
1303
1304         /* Copy 32 bytes at a time */
1305 .Lmemcpy_w_loop32:
1306         ldr     r4, [r1], #0x04
1307         ldr     r5, [r1], #0x04
1308         pld     [r1, #0x18]
1309         ldr     r6, [r1], #0x04
1310         ldr     r7, [r1], #0x04
1311         ldr     r8, [r1], #0x04
1312         ldr     r9, [r1], #0x04
1313         strd    r4, [r3], #0x08
1314         ldr     r4, [r1], #0x04
1315         ldr     r5, [r1], #0x04
1316         strd    r6, [r3], #0x08
1317         strd    r8, [r3], #0x08
1318         subs    r2, r2, #0x20
1319         strd    r4, [r3], #0x08
1320         bge     .Lmemcpy_w_loop32
1321
1322 .Lmemcpy_w_lessthan32:
1323         adds    r2, r2, #0x20           /* Adjust for extra sub */
1324         ldmeqfd sp!, {r4-r9}
1325         RETeq                   /* Return now if done */
1326
1327         and     r4, r2, #0x18
1328         rsbs    r4, r4, #0x18
1329         addne   pc, pc, r4, lsl #1
1330         nop
1331
1332         /* At least 24 bytes remaining */
1333         ldr     r4, [r1], #0x04
1334         ldr     r5, [r1], #0x04
1335         sub     r2, r2, #0x08
1336         strd    r4, [r3], #0x08
1337
1338         /* At least 16 bytes remaining */
1339         ldr     r4, [r1], #0x04
1340         ldr     r5, [r1], #0x04
1341         sub     r2, r2, #0x08
1342         strd    r4, [r3], #0x08
1343
1344         /* At least 8 bytes remaining */
1345         ldr     r4, [r1], #0x04
1346         ldr     r5, [r1], #0x04
1347         subs    r2, r2, #0x08
1348         strd    r4, [r3], #0x08
1349
1350         /* Less than 8 bytes remaining */
1351         ldmfd   sp!, {r4-r9}
1352         RETeq                   /* Return now if done */
1353         subs    r2, r2, #0x04
1354         ldrge   ip, [r1], #0x04
1355         strge   ip, [r3], #0x04
1356         RETeq                   /* Return now if done */
1357         addlt   r2, r2, #0x04
1358         ldrb    ip, [r1], #0x01
1359         cmp     r2, #0x02
1360         ldrgeb  r2, [r1], #0x01
1361         strb    ip, [r3], #0x01
1362         ldrgtb  ip, [r1]
1363         strgeb  r2, [r3], #0x01
1364         strgtb  ip, [r3]
1365         RET
1366
1367
1368 /*
1369  * At this point, it has not been possible to word align both buffers.
1370  * The destination buffer is word aligned, but the source buffer is not.
1371  */
1372 .Lmemcpy_bad_align:
1373         stmfd   sp!, {r4-r7}
1374         bic     r1, r1, #0x03
1375         cmp     ip, #2
1376         ldr     ip, [r1], #0x04
1377         bgt     .Lmemcpy_bad3
1378         beq     .Lmemcpy_bad2
1379         b       .Lmemcpy_bad1
1380
1381 .Lmemcpy_bad1_loop16:
1382 #ifdef __ARMEB__
1383         mov     r4, ip, lsl #8
1384 #else
1385         mov     r4, ip, lsr #8
1386 #endif
1387         ldr     r5, [r1], #0x04
1388         pld     [r1, #0x018]
1389         ldr     r6, [r1], #0x04
1390         ldr     r7, [r1], #0x04
1391         ldr     ip, [r1], #0x04
1392 #ifdef __ARMEB__
1393         orr     r4, r4, r5, lsr #24
1394         mov     r5, r5, lsl #8
1395         orr     r5, r5, r6, lsr #24
1396         mov     r6, r6, lsl #8
1397         orr     r6, r6, r7, lsr #24
1398         mov     r7, r7, lsl #8
1399         orr     r7, r7, ip, lsr #24
1400 #else
1401         orr     r4, r4, r5, lsl #24
1402         mov     r5, r5, lsr #8
1403         orr     r5, r5, r6, lsl #24
1404         mov     r6, r6, lsr #8
1405         orr     r6, r6, r7, lsl #24
1406         mov     r7, r7, lsr #8
1407         orr     r7, r7, ip, lsl #24
1408 #endif
1409         str     r4, [r3], #0x04
1410         str     r5, [r3], #0x04
1411         str     r6, [r3], #0x04
1412         str     r7, [r3], #0x04
1413 .Lmemcpy_bad1:
1414         subs    r2, r2, #0x10
1415         bge     .Lmemcpy_bad1_loop16
1416
1417         adds    r2, r2, #0x10
1418         ldmeqfd sp!, {r4-r7}
1419         RETeq                   /* Return now if done */
1420         subs    r2, r2, #0x04
1421         sublt   r1, r1, #0x03
1422         blt     .Lmemcpy_bad_done
1423
1424 .Lmemcpy_bad1_loop4:
1425 #ifdef __ARMEB__
1426         mov     r4, ip, lsl #8
1427 #else
1428         mov     r4, ip, lsr #8
1429 #endif
1430         ldr     ip, [r1], #0x04
1431         subs    r2, r2, #0x04
1432 #ifdef __ARMEB__
1433         orr     r4, r4, ip, lsr #24
1434 #else
1435         orr     r4, r4, ip, lsl #24
1436 #endif
1437         str     r4, [r3], #0x04
1438         bge     .Lmemcpy_bad1_loop4
1439         sub     r1, r1, #0x03
1440         b       .Lmemcpy_bad_done
1441
1442 .Lmemcpy_bad2_loop16:
1443 #ifdef __ARMEB__
1444         mov     r4, ip, lsl #16
1445 #else
1446         mov     r4, ip, lsr #16
1447 #endif
1448         ldr     r5, [r1], #0x04
1449         pld     [r1, #0x018]
1450         ldr     r6, [r1], #0x04
1451         ldr     r7, [r1], #0x04
1452         ldr     ip, [r1], #0x04
1453 #ifdef __ARMEB__
1454         orr     r4, r4, r5, lsr #16
1455         mov     r5, r5, lsl #16
1456         orr     r5, r5, r6, lsr #16
1457         mov     r6, r6, lsl #16
1458         orr     r6, r6, r7, lsr #16
1459         mov     r7, r7, lsl #16
1460         orr     r7, r7, ip, lsr #16
1461 #else
1462         orr     r4, r4, r5, lsl #16
1463         mov     r5, r5, lsr #16
1464         orr     r5, r5, r6, lsl #16
1465         mov     r6, r6, lsr #16
1466         orr     r6, r6, r7, lsl #16
1467         mov     r7, r7, lsr #16
1468         orr     r7, r7, ip, lsl #16
1469 #endif
1470         str     r4, [r3], #0x04
1471         str     r5, [r3], #0x04
1472         str     r6, [r3], #0x04
1473         str     r7, [r3], #0x04
1474 .Lmemcpy_bad2:
1475         subs    r2, r2, #0x10
1476         bge     .Lmemcpy_bad2_loop16
1477
1478         adds    r2, r2, #0x10
1479         ldmeqfd sp!, {r4-r7}
1480         RETeq                   /* Return now if done */
1481         subs    r2, r2, #0x04
1482         sublt   r1, r1, #0x02
1483         blt     .Lmemcpy_bad_done
1484
1485 .Lmemcpy_bad2_loop4:
1486 #ifdef __ARMEB__
1487         mov     r4, ip, lsl #16
1488 #else
1489         mov     r4, ip, lsr #16
1490 #endif
1491         ldr     ip, [r1], #0x04
1492         subs    r2, r2, #0x04
1493 #ifdef __ARMEB__
1494         orr     r4, r4, ip, lsr #16
1495 #else
1496         orr     r4, r4, ip, lsl #16
1497 #endif
1498         str     r4, [r3], #0x04
1499         bge     .Lmemcpy_bad2_loop4
1500         sub     r1, r1, #0x02
1501         b       .Lmemcpy_bad_done
1502
1503 .Lmemcpy_bad3_loop16:
1504 #ifdef __ARMEB__
1505         mov     r4, ip, lsl #24
1506 #else
1507         mov     r4, ip, lsr #24
1508 #endif
1509         ldr     r5, [r1], #0x04
1510         pld     [r1, #0x018]
1511         ldr     r6, [r1], #0x04
1512         ldr     r7, [r1], #0x04
1513         ldr     ip, [r1], #0x04
1514 #ifdef __ARMEB__
1515         orr     r4, r4, r5, lsr #8
1516         mov     r5, r5, lsl #24
1517         orr     r5, r5, r6, lsr #8
1518         mov     r6, r6, lsl #24
1519         orr     r6, r6, r7, lsr #8
1520         mov     r7, r7, lsl #24
1521         orr     r7, r7, ip, lsr #8
1522 #else
1523         orr     r4, r4, r5, lsl #8
1524         mov     r5, r5, lsr #24
1525         orr     r5, r5, r6, lsl #8
1526         mov     r6, r6, lsr #24
1527         orr     r6, r6, r7, lsl #8
1528         mov     r7, r7, lsr #24
1529         orr     r7, r7, ip, lsl #8
1530 #endif
1531         str     r4, [r3], #0x04
1532         str     r5, [r3], #0x04
1533         str     r6, [r3], #0x04
1534         str     r7, [r3], #0x04
1535 .Lmemcpy_bad3:
1536         subs    r2, r2, #0x10
1537         bge     .Lmemcpy_bad3_loop16
1538
1539         adds    r2, r2, #0x10
1540         ldmeqfd sp!, {r4-r7}
1541         RETeq                   /* Return now if done */
1542         subs    r2, r2, #0x04
1543         sublt   r1, r1, #0x01
1544         blt     .Lmemcpy_bad_done
1545
1546 .Lmemcpy_bad3_loop4:
1547 #ifdef __ARMEB__
1548         mov     r4, ip, lsl #24
1549 #else
1550         mov     r4, ip, lsr #24
1551 #endif
1552         ldr     ip, [r1], #0x04
1553         subs    r2, r2, #0x04
1554 #ifdef __ARMEB__
1555         orr     r4, r4, ip, lsr #8
1556 #else
1557         orr     r4, r4, ip, lsl #8
1558 #endif
1559         str     r4, [r3], #0x04
1560         bge     .Lmemcpy_bad3_loop4
1561         sub     r1, r1, #0x01
1562
1563 .Lmemcpy_bad_done:
1564         ldmfd   sp!, {r4-r7}
1565         adds    r2, r2, #0x04
1566         RETeq
1567         ldrb    ip, [r1], #0x01
1568         cmp     r2, #0x02
1569         ldrgeb  r2, [r1], #0x01
1570         strb    ip, [r3], #0x01
1571         ldrgtb  ip, [r1]
1572         strgeb  r2, [r3], #0x01
1573         strgtb  ip, [r3]
1574         RET
1575
1576
1577 /*
1578  * Handle short copies (12 bytes or fewer), possibly misaligned.
1579  * Some of these are *very* common, thanks to the network stack,
1580  * and so are handled specially.
1581  */
1582 .Lmemcpy_short:
1583         add     pc, pc, r2, lsl #2
1584         nop
1585         RET                     /* 0x00 */
1586         b       .Lmemcpy_bytewise       /* 0x01 */
1587         b       .Lmemcpy_bytewise       /* 0x02 */
1588         b       .Lmemcpy_bytewise       /* 0x03 */
1589         b       .Lmemcpy_4              /* 0x04 */
1590         b       .Lmemcpy_bytewise       /* 0x05 */
1591         b       .Lmemcpy_6              /* 0x06 */
1592         b       .Lmemcpy_bytewise       /* 0x07 */
1593         b       .Lmemcpy_8              /* 0x08 */
1594         b       .Lmemcpy_bytewise       /* 0x09 */
1595         b       .Lmemcpy_bytewise       /* 0x0a */
1596         b       .Lmemcpy_bytewise       /* 0x0b */
1597         b       .Lmemcpy_c              /* 0x0c */
1598 .Lmemcpy_bytewise:
1599         mov     r3, r0                  /* We must not clobber r0 */
1600         ldrb    ip, [r1], #0x01
1601 1:      subs    r2, r2, #0x01
1602         strb    ip, [r3], #0x01
1603         ldrneb  ip, [r1], #0x01
1604         bne     1b
1605         RET
1606
1607 /******************************************************************************
1608  * Special case for 4 byte copies
1609  */
1610 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1611 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1612         LMEMCPY_4_PAD
1613 .Lmemcpy_4:
1614         and     r2, r1, #0x03
1615         orr     r2, r2, r0, lsl #2
1616         ands    r2, r2, #0x0f
1617         sub     r3, pc, #0x14
1618         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1619
1620 /*
1621  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1622  */
1623         ldr     r2, [r1]
1624         str     r2, [r0]
1625         RET
1626         LMEMCPY_4_PAD
1627
1628 /*
1629  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1630  */
1631         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1632         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1633 #ifdef __ARMEB__
1634         mov     r3, r3, lsl #8          /* r3 = 012. */
1635         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
1636 #else
1637         mov     r3, r3, lsr #8          /* r3 = .210 */
1638         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1639 #endif
1640         str     r3, [r0]
1641         RET
1642         LMEMCPY_4_PAD
1643
1644 /*
1645  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1646  */
1647 #ifdef __ARMEB__
1648         ldrh    r3, [r1]
1649         ldrh    r2, [r1, #0x02]
1650 #else
1651         ldrh    r3, [r1, #0x02]
1652         ldrh    r2, [r1]
1653 #endif
1654         orr     r3, r2, r3, lsl #16
1655         str     r3, [r0]
1656         RET
1657         LMEMCPY_4_PAD
1658
1659 /*
1660  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1661  */
1662         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1663         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1664 #ifdef __ARMEB__
1665         mov     r3, r3, lsl #24         /* r3 = 0... */
1666         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
1667 #else
1668         mov     r3, r3, lsr #24         /* r3 = ...0 */
1669         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1670 #endif
1671         str     r3, [r0]
1672         RET
1673         LMEMCPY_4_PAD
1674
1675 /*
1676  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1677  */
1678         ldr     r2, [r1]
1679 #ifdef __ARMEB__
1680         strb    r2, [r0, #0x03]
1681         mov     r3, r2, lsr #8
1682         mov     r1, r2, lsr #24
1683         strb    r1, [r0]
1684 #else
1685         strb    r2, [r0]
1686         mov     r3, r2, lsr #8
1687         mov     r1, r2, lsr #24
1688         strb    r1, [r0, #0x03]
1689 #endif
1690         strh    r3, [r0, #0x01]
1691         RET
1692         LMEMCPY_4_PAD
1693
1694 /*
1695  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1696  */
1697         ldrb    r2, [r1]
1698         ldrh    r3, [r1, #0x01]
1699         ldrb    r1, [r1, #0x03]
1700         strb    r2, [r0]
1701         strh    r3, [r0, #0x01]
1702         strb    r1, [r0, #0x03]
1703         RET
1704         LMEMCPY_4_PAD
1705
1706 /*
1707  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1708  */
1709         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1710         ldrh    r3, [r1, #0x02]         /* LE:r3 = ..23  LE:r3 = ..32 */
1711 #ifdef __ARMEB__
1712         mov     r1, r2, lsr #8          /* r1 = ...0 */
1713         strb    r1, [r0]
1714         mov     r2, r2, lsl #8          /* r2 = .01. */
1715         orr     r2, r2, r3, lsr #8      /* r2 = .012 */
1716 #else
1717         strb    r2, [r0]
1718         mov     r2, r2, lsr #8          /* r2 = ...1 */
1719         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1720         mov     r3, r3, lsr #8          /* r3 = ...3 */
1721 #endif
1722         strh    r2, [r0, #0x01]
1723         strb    r3, [r0, #0x03]
1724         RET
1725         LMEMCPY_4_PAD
1726
1727 /*
1728  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1729  */
1730         ldrb    r2, [r1]
1731         ldrh    r3, [r1, #0x01]
1732         ldrb    r1, [r1, #0x03]
1733         strb    r2, [r0]
1734         strh    r3, [r0, #0x01]
1735         strb    r1, [r0, #0x03]
1736         RET
1737         LMEMCPY_4_PAD
1738
1739 /*
1740  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1741  */
1742         ldr     r2, [r1]
1743 #ifdef __ARMEB__
1744         strh    r2, [r0, #0x02]
1745         mov     r3, r2, lsr #16
1746         strh    r3, [r0]
1747 #else
1748         strh    r2, [r0]
1749         mov     r3, r2, lsr #16
1750         strh    r3, [r0, #0x02]
1751 #endif
1752         RET
1753         LMEMCPY_4_PAD
1754
1755 /*
1756  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1757  */
1758         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1759         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1760         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1761         strh    r1, [r0]
1762 #ifdef __ARMEB__
1763         mov     r2, r2, lsl #8          /* r2 = 012. */
1764         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1765 #else
1766         mov     r2, r2, lsr #24         /* r2 = ...2 */
1767         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1768 #endif
1769         strh    r2, [r0, #0x02]
1770         RET
1771         LMEMCPY_4_PAD
1772
1773 /*
1774  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1775  */
1776         ldrh    r2, [r1]
1777         ldrh    r3, [r1, #0x02]
1778         strh    r2, [r0]
1779         strh    r3, [r0, #0x02]
1780         RET
1781         LMEMCPY_4_PAD
1782
1783 /*
1784  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1785  */
1786         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1787         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1788         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1789         strh    r1, [r0, #0x02]
1790 #ifdef __ARMEB__
1791         mov     r3, r3, lsr #24         /* r3 = ...1 */
1792         orr     r3, r3, r2, lsl #8      /* r3 = xx01 */
1793 #else
1794         mov     r3, r3, lsl #8          /* r3 = 321. */
1795         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1796 #endif
1797         strh    r3, [r0]
1798         RET
1799         LMEMCPY_4_PAD
1800
1801 /*
1802  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1803  */
1804         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1805 #ifdef __ARMEB__
1806         strb    r2, [r0, #0x03]
1807         mov     r3, r2, lsr #8
1808         mov     r1, r2, lsr #24
1809         strh    r3, [r0, #0x01]
1810         strb    r1, [r0]
1811 #else
1812         strb    r2, [r0]
1813         mov     r3, r2, lsr #8
1814         mov     r1, r2, lsr #24
1815         strh    r3, [r0, #0x01]
1816         strb    r1, [r0, #0x03]
1817 #endif
1818         RET
1819         LMEMCPY_4_PAD
1820
1821 /*
1822  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1823  */
1824         ldrb    r2, [r1]
1825         ldrh    r3, [r1, #0x01]
1826         ldrb    r1, [r1, #0x03]
1827         strb    r2, [r0]
1828         strh    r3, [r0, #0x01]
1829         strb    r1, [r0, #0x03]
1830         RET
1831         LMEMCPY_4_PAD
1832
1833 /*
1834  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1835  */
1836 #ifdef __ARMEB__
1837         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1838         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1839         strb    r3, [r0, #0x03]
1840         mov     r3, r3, lsr #8          /* r3 = ...2 */
1841         orr     r3, r3, r2, lsl #8      /* r3 = ..12 */
1842         strh    r3, [r0, #0x01]
1843         mov     r2, r2, lsr #8          /* r2 = ...0 */
1844         strb    r2, [r0]
1845 #else
1846         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1847         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1848         strb    r2, [r0]
1849         mov     r2, r2, lsr #8          /* r2 = ...1 */
1850         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1851         strh    r2, [r0, #0x01]
1852         mov     r3, r3, lsr #8          /* r3 = ...3 */
1853         strb    r3, [r0, #0x03]
1854 #endif
1855         RET
1856         LMEMCPY_4_PAD
1857
1858 /*
1859  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1860  */
1861         ldrb    r2, [r1]
1862         ldrh    r3, [r1, #0x01]
1863         ldrb    r1, [r1, #0x03]
1864         strb    r2, [r0]
1865         strh    r3, [r0, #0x01]
1866         strb    r1, [r0, #0x03]
1867         RET
1868         LMEMCPY_4_PAD
1869
1870
1871 /******************************************************************************
1872  * Special case for 6 byte copies
      *
      * r0 = dst and r1 = src (every case below loads through r1 and stores
      * through r0).  The dispatch code builds a 4-bit index,
      * ((dst & 3) << 2) | (src & 3), and for a non-zero index jumps to
      * .Lmemcpy_6 + (index << LMEMCPY_6_LOG2); index 0 falls through into the
      * 0000 case.  Each case is therefore padded to its own 64-byte slot with
      * LMEMCPY_6_PAD and must fit within 64 bytes.
      *
      * Every access is naturally aligned for its size; misaligned data is
      * assembled with shifts and ORs instead.  Some word loads start before
      * src or end after src + 5, but always inside an aligned word that holds
      * at least one source byte; the extra bytes are discarded.
      *
      * In the register comments a digit is the index of a source byte, '.' is
      * a zero byte and 'x' is a byte whose value is not used.  BE and LE refer
      * to builds with and without __ARMEB__ respectively.
1873  */
1874 #define LMEMCPY_6_LOG2  6       /* log2 of the per-case slot size: 64 bytes */
1875 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2
1876         LMEMCPY_6_PAD
1877 .Lmemcpy_6:
1878         and     r2, r1, #0x03           /* r2 = src & 3 */
1879         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
1880         ands    r2, r2, #0x0f           /* r2 = case index, Z set if index 0 */
1881         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_6 (pc reads as . + 8) */
1882         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2 /* index != 0: jump to its slot */
1883
1884 /*
1885  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1886  */
1887         ldr     r2, [r1]                /* r2 = src bytes 0-3 */
1888         ldrh    r3, [r1, #0x04]         /* r3 = src bytes 4-5 */
1889         str     r2, [r0]
1890         strh    r3, [r0, #0x04]
1891         RET
1892         LMEMCPY_6_PAD
1893
1894 /*
1895  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
      *
      * The word loads are at src - 1 and src + 3, both 32-bit aligned.
1896  */
1897         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1898         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1899 #ifdef __ARMEB__
1900         mov     r2, r2, lsl #8          /* r2 = 012. */
1901         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1902 #else
1903         mov     r2, r2, lsr #8          /* r2 = .210 */
1904         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1905 #endif
1906         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1907         str     r2, [r0]
1908         strh    r3, [r0, #0x04]         /* only the low halfword is stored */
1909         RET
1910         LMEMCPY_6_PAD
1911
1912 /*
1913  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1914  */
1915         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1916         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1917 #ifdef __ARMEB__
1918         mov     r1, r3, lsr #16         /* r1 = ..23 */
1919         orr     r1, r1, r2, lsl #16     /* r1 = 0123 */
1920         str     r1, [r0]
1921         strh    r3, [r0, #0x04]
1922 #else
1923         mov     r1, r3, lsr #16         /* r1 = ..54 */
1924         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1925         str     r2, [r0]
1926         strh    r1, [r0, #0x04]
1927 #endif
1928         RET
1929         LMEMCPY_6_PAD
1930
1931 /*
1932  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
      *
      * The word loads are at src - 3, src + 1 and src + 5, all 32-bit aligned.
1933  */
1934         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1935         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1936         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r1 = xxx5 */
1937 #ifdef __ARMEB__
1938         mov     r2, r2, lsl #24         /* r2 = 0... */
1939         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
1940         mov     r3, r3, lsl #8          /* r3 = 234. */
1941         orr     r1, r3, r1, lsr #24     /* r1 = 2345 */
1942 #else
1943         mov     r2, r2, lsr #24         /* r2 = ...0 */
1944         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1945         mov     r1, r1, lsl #8          /* r1 = xx5. */
1946         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1947 #endif
1948         str     r2, [r0]
1949         strh    r1, [r0, #0x04]         /* only the low halfword is stored */
1950         RET
1951         LMEMCPY_6_PAD
1952
1953 /*
1954  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
      *
      * dst + 1 and dst + 3 are 16-bit aligned, so the middle four bytes are
      * written as two halfwords.
1955  */
1956         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1957         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1958         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1959         strh    r1, [r0, #0x01]
1960 #ifdef __ARMEB__
1961         mov     r1, r3, lsr #24         /* r1 = ...0 */
1962         strb    r1, [r0]
1963         mov     r3, r3, lsl #8          /* r3 = 123. */
1964         orr     r3, r3, r2, lsr #8      /* r3 = 1234 */
1965 #else
1966         strb    r3, [r0]
1967         mov     r3, r3, lsr #24         /* r3 = ...3 */
1968         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1969         mov     r2, r2, lsr #8          /* r2 = ...5 */
1970 #endif
1971         strh    r3, [r0, #0x03]
1972         strb    r2, [r0, #0x05]
1973         RET
1974         LMEMCPY_6_PAD
1975
1976 /*
1977  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
      *
      * src and dst share the same misalignment, so no shifting is needed:
      * byte, halfword, halfword, byte, each at a suitably aligned address.
1978  */
1979         ldrb    r2, [r1]
1980         ldrh    r3, [r1, #0x01]
1981         ldrh    ip, [r1, #0x03]
1982         ldrb    r1, [r1, #0x05]         /* last use of src; r1 is reused for data */
1983         strb    r2, [r0]
1984         strh    r3, [r0, #0x01]
1985         strh    ip, [r0, #0x03]
1986         strb    r1, [r0, #0x05]
1987         RET
1988         LMEMCPY_6_PAD
1989
1990 /*
1991  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1992  */
1993         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1994         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1995 #ifdef __ARMEB__
1996         mov     r3, r2, lsr #8          /* r3 = ...0 */
1997         strb    r3, [r0]
1998         strb    r1, [r0, #0x05]
1999         mov     r3, r1, lsr #8          /* r3 = .234 */
2000         strh    r3, [r0, #0x03]
2001         mov     r3, r2, lsl #8          /* r3 = .01. */
2002         orr     r3, r3, r1, lsr #24     /* r3 = .012 */
2003         strh    r3, [r0, #0x01]
2004 #else
2005         strb    r2, [r0]
2006         mov     r3, r1, lsr #24         /* r3 = ...5 */
2007         strb    r3, [r0, #0x05]
2008         mov     r3, r1, lsr #8          /* r3 = .543 */
2009         strh    r3, [r0, #0x03]
2010         mov     r3, r2, lsr #8          /* r3 = ...1 */
2011         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
2012         strh    r3, [r0, #0x01]
2013 #endif
2014         RET
2015         LMEMCPY_6_PAD
2016
2017 /*
2018  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
      *
      * src + 1, src + 3, dst + 1 and dst + 3 are all 16-bit aligned, so the
      * middle four bytes move as two halfwords without shifting.
2019  */
2020         ldrb    r2, [r1]
2021         ldrh    r3, [r1, #0x01]
2022         ldrh    ip, [r1, #0x03]
2023         ldrb    r1, [r1, #0x05]         /* last use of src; r1 is reused for data */
2024         strb    r2, [r0]
2025         strh    r3, [r0, #0x01]
2026         strh    ip, [r0, #0x03]
2027         strb    r1, [r0, #0x05]
2028         RET
2029         LMEMCPY_6_PAD
2030
2031 /*
2032  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2033  */
2034 #ifdef __ARMEB__
2035         ldr     r2, [r1]                /* r2 = 0123 */
2036         ldrh    r3, [r1, #0x04]         /* r3 = ..45 */
2037         mov     r1, r2, lsr #16         /* r1 = ..01 */
2038         orr     r3, r3, r2, lsl#16      /* r3 = 2345 */
2039         strh    r1, [r0]
2040         str     r3, [r0, #0x02]
2041 #else
2042         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
2043         ldr     r3, [r1]                /* r3 = 3210 */
2044         mov     r2, r2, lsl #16         /* r2 = 54.. */
2045         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
2046         strh    r3, [r0]
2047         str     r2, [r0, #0x02]
2048 #endif
2049         RET
2050         LMEMCPY_6_PAD
2051
2052 /*
2053  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
      *
      * The word loads are at src - 1 and src + 3, both 32-bit aligned.
2054  */
2055         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2056         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
2057         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2058 #ifdef __ARMEB__
2059         mov     r2, r2, lsr #8          /* r2 = .345 */
2060         orr     r2, r2, r3, lsl #24     /* r2 = 2345 */
2061 #else
2062         mov     r2, r2, lsl #8          /* r2 = 543. */
2063         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
2064 #endif
2065         strh    r1, [r0]
2066         str     r2, [r0, #0x02]
2067         RET
2068         LMEMCPY_6_PAD
2069
2070 /*
2071  * 1010: dst is 16-bit aligned, src is 16-bit aligned
      *
      * src + 2 and dst + 2 are 32-bit aligned, so bytes 2-5 move as one word.
2072  */
2073         ldrh    r2, [r1]
2074         ldr     r3, [r1, #0x02]
2075         strh    r2, [r0]
2076         str     r3, [r0, #0x02]
2077         RET
2078         LMEMCPY_6_PAD
2079
2080 /*
2081  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2082  */
2083         ldrb    r3, [r1]                /* r3 = ...0 */
2084         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2085         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2086 #ifdef __ARMEB__
2087         mov     r3, r3, lsl #8          /* r3 = ..0. */
2088         orr     r3, r3, r2, lsr #24     /* r3 = ..01 */
2089         orr     r1, r1, r2, lsl #8      /* r1 = 2345 */
2090 #else
2091         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2092         mov     r1, r1, lsl #24         /* r1 = 5... */
2093         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
2094 #endif
2095         strh    r3, [r0]
2096         str     r1, [r0, #0x02]
2097         RET
2098         LMEMCPY_6_PAD
2099
2100 /*
2101  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
      *
      * dst + 1 is 32-bit aligned, so bytes 1-4 are written as one word.
2102  */
2103         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2104         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
2105 #ifdef __ARMEB__
2106         mov     r3, r2, lsr #24         /* r3 = ...0 */
2107         strb    r3, [r0]
2108         mov     r2, r2, lsl #8          /* r2 = 123. */
2109         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2110 #else
2111         strb    r2, [r0]
2112         mov     r2, r2, lsr #8          /* r2 = .321 */
2113         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
2114         mov     r1, r1, lsr #8          /* r1 = ...5 */
2115 #endif
2116         str     r2, [r0, #0x01]
2117         strb    r1, [r0, #0x05]
2118         RET
2119         LMEMCPY_6_PAD
2120
2121 /*
2122  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
      *
      * src + 1, src + 3, dst + 1 and dst + 3 are all 16-bit aligned, so the
      * middle four bytes move as two halfwords without shifting.
2123  */
2124         ldrb    r2, [r1]
2125         ldrh    r3, [r1, #0x01]
2126         ldrh    ip, [r1, #0x03]
2127         ldrb    r1, [r1, #0x05]         /* last use of src; r1 is reused for data */
2128         strb    r2, [r0]
2129         strh    r3, [r0, #0x01]
2130         strh    ip, [r0, #0x03]
2131         strb    r1, [r0, #0x05]
2132         RET
2133         LMEMCPY_6_PAD
2134
2135 /*
2136  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2137  */
2138         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2139         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
2140 #ifdef __ARMEB__
2141         mov     r3, r2, lsr #8          /* r3 = ...0 */
2142         strb    r3, [r0]
2143         mov     r2, r2, lsl #24         /* r2 = 1... */
2144         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2145 #else
2146         strb    r2, [r0]
2147         mov     r2, r2, lsr #8          /* r2 = ...1 */
2148         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
2149         mov     r1, r1, lsr #24         /* r1 = ...5 */
2150 #endif
2151         str     r2, [r0, #0x01]
2152         strb    r1, [r0, #0x05]
2153         RET
2154         LMEMCPY_6_PAD
2155
2156 /*
2157  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
      *
      * src + 1 and dst + 1 are 32-bit aligned, so bytes 1-4 move as one word.
2158  */
2159         ldrb    r2, [r1]
2160         ldr     r3, [r1, #0x01]
2161         ldrb    r1, [r1, #0x05]         /* last use of src; r1 is reused for data */
2162         strb    r2, [r0]
2163         str     r3, [r0, #0x01]
2164         strb    r1, [r0, #0x05]
2165         RET
2166         LMEMCPY_6_PAD
2167
2168
2169 /******************************************************************************
2170  * Special case for 8 byte copies
      *
      * r0 = dst and r1 = src (every case below loads through r1 and stores
      * through r0).  The dispatch code builds a 4-bit index,
      * ((dst & 3) << 2) | (src & 3), and for a non-zero index jumps to
      * .Lmemcpy_8 + (index << LMEMCPY_8_LOG2); index 0 falls through into the
      * 0000 case.  Each case is therefore padded to its own 64-byte slot with
      * LMEMCPY_8_PAD and must fit within 64 bytes.
      *
      * Every access is naturally aligned for its size; misaligned data is
      * assembled with shifts and ORs instead.  Some word loads start before
      * src or end after src + 7, but always inside an aligned word that holds
      * at least one source byte; the extra bytes are discarded.
      *
      * In the register comments a digit is the index of a source byte, '.' is
      * a zero byte and 'x' is a byte whose value is not used.  BE and LE refer
      * to builds with and without __ARMEB__ respectively.
2171  */
2172 #define LMEMCPY_8_LOG2  6       /* log2 of the per-case slot size: 64 bytes */
2173 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2
2174         LMEMCPY_8_PAD
2175 .Lmemcpy_8:
2176         and     r2, r1, #0x03           /* r2 = src & 3 */
2177         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
2178         ands    r2, r2, #0x0f           /* r2 = case index, Z set if index 0 */
2179         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_8 (pc reads as . + 8) */
2180         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* index != 0: jump to its slot */
2181
2182 /*
2183  * 0000: dst is 32-bit aligned, src is 32-bit aligned
2184  */
2185         ldr     r2, [r1]                /* r2 = src bytes 0-3 */
2186         ldr     r3, [r1, #0x04]         /* r3 = src bytes 4-7 */
2187         str     r2, [r0]
2188         str     r3, [r0, #0x04]
2189         RET
2190         LMEMCPY_8_PAD
2191
2192 /*
2193  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
      *
      * The word loads are at src - 1 and src + 3, both 32-bit aligned.
2194  */
2195         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2196         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
2197         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2198 #ifdef __ARMEB__
2199         mov     r3, r3, lsl #8          /* r3 = 012. */
2200         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
2201         orr     r2, r1, r2, lsl #8      /* r2 = 4567 */
2202 #else
2203         mov     r3, r3, lsr #8          /* r3 = .210 */
2204         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
2205         mov     r1, r1, lsl #24         /* r1 = 7... */
2206         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
2207 #endif
2208         str     r3, [r0]
2209         str     r2, [r0, #0x04]
2210         RET
2211         LMEMCPY_8_PAD
2212
2213 /*
2214  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2215  */
2216         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2217         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2218         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2219 #ifdef __ARMEB__
2220         mov     r2, r2, lsl #16         /* r2 = 01.. */
2221         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2222         orr     r3, r1, r3, lsl #16     /* r3 = 4567 */
2223 #else
2224         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2225         mov     r3, r3, lsr #16         /* r3 = ..54 */
2226         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
2227 #endif
2228         str     r2, [r0]
2229         str     r3, [r0, #0x04]
2230         RET
2231         LMEMCPY_8_PAD
2232
2233 /*
2234  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
      *
      * The word loads are at src + 1 and src + 5, both 32-bit aligned.
2235  */
2236         ldrb    r3, [r1]                /* r3 = ...0 */
2237         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2238         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
2239 #ifdef __ARMEB__
2240         mov     r3, r3, lsl #24         /* r3 = 0... */
2241         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
2242         mov     r2, r2, lsl #24         /* r2 = 4... */
2243         orr     r2, r2, r1, lsr #8      /* r2 = 4567 */
2244 #else
2245         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2246         mov     r2, r2, lsr #24         /* r2 = ...4 */
2247         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
2248 #endif
2249         str     r3, [r0]
2250         str     r2, [r0, #0x04]
2251         RET
2252         LMEMCPY_8_PAD
2253
2254 /*
2255  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
      *
      * dst + 1 is 16-bit aligned and dst + 3 is 32-bit aligned, so the output
      * is written as byte, halfword, word, byte.
2256  */
2257         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
2258         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
2259 #ifdef __ARMEB__
2260         mov     r1, r3, lsr #24         /* r1 = ...0 */
2261         strb    r1, [r0]
2262         mov     r1, r3, lsr #8          /* r1 = .012 */
2263         strb    r2, [r0, #0x07]
2264         mov     r3, r3, lsl #24         /* r3 = 3... */
2265         orr     r3, r3, r2, lsr #8      /* r3 = 3456 */
2266 #else
2267         strb    r3, [r0]
2268         mov     r1, r2, lsr #24         /* r1 = ...7 */
2269         strb    r1, [r0, #0x07]
2270         mov     r1, r3, lsr #8          /* r1 = .321 */
2271         mov     r3, r3, lsr #24         /* r3 = ...3 */
2272         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
2273 #endif
2274         strh    r1, [r0, #0x01]         /* only the low halfword is stored */
2275         str     r3, [r0, #0x03]
2276         RET
2277         LMEMCPY_8_PAD
2278
2279 /*
2280  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
      *
      * src and dst share the same misalignment, so no shifting is needed:
      * byte, halfword, word, byte, each at a suitably aligned address.
2281  */
2282         ldrb    r2, [r1]
2283         ldrh    r3, [r1, #0x01]
2284         ldr     ip, [r1, #0x03]
2285         ldrb    r1, [r1, #0x07]         /* last use of src; r1 is reused for data */
2286         strb    r2, [r0]
2287         strh    r3, [r0, #0x01]
2288         str     ip, [r0, #0x03]
2289         strb    r1, [r0, #0x07]
2290         RET
2291         LMEMCPY_8_PAD
2292
2293 /*
2294  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2295  */
2296         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2297         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2298         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2299 #ifdef __ARMEB__
2300         mov     ip, r2, lsr #8          /* ip = ...0 */
2301         strb    ip, [r0]
2302         mov     ip, r2, lsl #8          /* ip = .01. */
2303         orr     ip, ip, r3, lsr #24     /* ip = .012 */
2304         strb    r1, [r0, #0x07]
2305         mov     r3, r3, lsl #8          /* r3 = 345. */
2306         orr     r3, r3, r1, lsr #8      /* r3 = 3456 */
2307 #else
2308         strb    r2, [r0]                /* 0 */
2309         mov     ip, r1, lsr #8          /* ip = ...7 */
2310         strb    ip, [r0, #0x07]         /* 7 */
2311         mov     ip, r2, lsr #8          /* ip = ...1 */
2312         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2313         mov     r3, r3, lsr #8          /* r3 = .543 */
2314         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
2315 #endif
2316         strh    ip, [r0, #0x01]         /* only the low halfword is stored */
2317         str     r3, [r0, #0x03]
2318         RET
2319         LMEMCPY_8_PAD
2320
2321 /*
2322  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
      *
      * src + 1 is 32-bit aligned but dst + 1 is only 16-bit aligned, so the
      * loaded word is split across a halfword store and a word store.
2323  */
2324         ldrb    r3, [r1]                /* r3 = ...0 */
2325         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2326         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
2327         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2328         strb    r3, [r0]
2329         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
2330 #ifdef __ARMEB__
2331         strh    r3, [r0, #0x01]
2332         orr     r2, r2, ip, lsl #16     /* r2 = 3456 */
2333 #else
2334         strh    ip, [r0, #0x01]
2335         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
2336 #endif
2337         str     r2, [r0, #0x03]
2338         strb    r1, [r0, #0x07]
2339         RET
2340         LMEMCPY_8_PAD
2341
2342 /*
2343  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2344  */
2345         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2346         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2347         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2348 #ifdef __ARMEB__
2349         strh    r1, [r0]
2350         mov     r1, r3, lsr #16         /* r1 = ..45 */
2351         orr     r2, r1 ,r2, lsl #16     /* r2 = 2345 */
2352 #else
2353         strh    r2, [r0]
2354         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
2355         mov     r3, r3, lsr #16         /* r3 = ..76 */
2356 #endif
2357         str     r2, [r0, #0x02]
2358         strh    r3, [r0, #0x06]
2359         RET
2360         LMEMCPY_8_PAD
2361
2362 /*
2363  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
      *
      * The word loads are at src - 1 and src + 3, both 32-bit aligned.
2364  */
2365         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2366         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2367         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
2368         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2369         strh    r1, [r0]
2370 #ifdef __ARMEB__
2371         mov     r1, r2, lsl #24         /* r1 = 2... */
2372         orr     r1, r1, r3, lsr #8      /* r1 = 2345 */
2373         orr     r3, ip, r3, lsl #8      /* r3 = 4567 */
2374 #else
2375         mov     r1, r2, lsr #24         /* r1 = ...2 */
2376         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
2377         mov     r3, r3, lsr #24         /* r3 = ...6 */
2378         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
2379 #endif
2380         str     r1, [r0, #0x02]
2381         strh    r3, [r0, #0x06]         /* only the low halfword is stored */
2382         RET
2383         LMEMCPY_8_PAD
2384
2385 /*
2386  * 1010: dst is 16-bit aligned, src is 16-bit aligned
      *
      * src + 2 and dst + 2 are 32-bit aligned, so bytes 2-5 move as one word
      * between two halfword transfers.
2387  */
2388         ldrh    r2, [r1]
2389         ldr     ip, [r1, #0x02]
2390         ldrh    r3, [r1, #0x06]
2391         strh    r2, [r0]
2392         str     ip, [r0, #0x02]
2393         strh    r3, [r0, #0x06]
2394         RET
2395         LMEMCPY_8_PAD
2396
2397 /*
2398  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
      *
      * The word loads are at src + 1 and src + 5, both 32-bit aligned.
2399  */
2400         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 */
2401         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2402         ldrb    ip, [r1]                /* ip = ...0 */
2403         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
2404         strh    r1, [r0, #0x06]
2405 #ifdef __ARMEB__
2406         mov     r3, r3, lsr #24         /* r3 = ...5 */
2407         orr     r3, r3, r2, lsl #8      /* r3 = 2345 */
2408         mov     r2, r2, lsr #24         /* r2 = ...1 */
2409         orr     r2, r2, ip, lsl #8      /* r2 = ..01 */
2410 #else
2411         mov     r3, r3, lsl #24         /* r3 = 5... */
2412         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
2413         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
2414 #endif
2415         str     r3, [r0, #0x02]
2416         strh    r2, [r0]                /* only the low halfword is stored */
2417         RET
2418         LMEMCPY_8_PAD
2419
2420 /*
2421  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
      *
      * dst + 1 is 32-bit aligned and dst + 5 is 16-bit aligned, so the output
      * is written as byte, word, halfword, byte.
2422  */
2423         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2424         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2425         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
2426         strh    r1, [r0, #0x05]
2427 #ifdef __ARMEB__
2428         strb    r3, [r0, #0x07]
2429         mov     r1, r2, lsr #24         /* r1 = ...0 */
2430         strb    r1, [r0]
2431         mov     r2, r2, lsl #8          /* r2 = 123. */
2432         orr     r2, r2, r3, lsr #24     /* r2 = 1234 */
2433         str     r2, [r0, #0x01]
2434 #else
2435         strb    r2, [r0]
2436         mov     r1, r3, lsr #24         /* r1 = ...7 */
2437         strb    r1, [r0, #0x07]
2438         mov     r2, r2, lsr #8          /* r2 = .321 */
2439         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
2440         str     r2, [r0, #0x01]
2441 #endif
2442         RET
2443         LMEMCPY_8_PAD
2444
2445 /*
2446  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
      *
      * src + 3 is 32-bit aligned but dst + 3 is only 16-bit aligned, so the
      * loaded word is split across a word store and a halfword store.
2447  */
2448         ldrb    r3, [r1]                /* r3 = ...0 */
2449         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
2450         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2451         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2452         strb    r3, [r0]
2453         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
2454 #ifdef __ARMEB__
2455         strh    ip, [r0, #0x05]
2456         orr     r2, r3, r2, lsl #16     /* r2 = 1234 */
2457 #else
2458         strh    r3, [r0, #0x05]
2459         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
2460 #endif
2461         str     r2, [r0, #0x01]
2462         strb    r1, [r0, #0x07]
2463         RET
2464         LMEMCPY_8_PAD
2465
2466 /*
2467  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2468  */
2469         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2470         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2471         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2472 #ifdef __ARMEB__
2473         mov     ip, r2, lsr #8          /* ip = ...0 */
2474         strb    ip, [r0]
2475         mov     ip, r2, lsl #24         /* ip = 1... */
2476         orr     ip, ip, r3, lsr #8      /* ip = 1234 */
2477         strb    r1, [r0, #0x07]
2478         mov     r1, r1, lsr #8          /* r1 = ...6 */
2479         orr     r1, r1, r3, lsl #8      /* r1 = 3456 */
2480 #else
2481         strb    r2, [r0]
2482         mov     ip, r2, lsr #8          /* ip = ...1 */
2483         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2484         mov     r2, r1, lsr #8          /* r2 = ...7 */
2485         strb    r2, [r0, #0x07]
2486         mov     r1, r1, lsl #8          /* r1 = .76. */
2487         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
2488 #endif
2489         str     ip, [r0, #0x01]
2490         strh    r1, [r0, #0x05]         /* only the low halfword is stored */
2491         RET
2492         LMEMCPY_8_PAD
2493
2494 /*
2495  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
      *
      * src and dst share the same misalignment, so no shifting is needed:
      * byte, word, halfword, byte, each at a suitably aligned address.
2496  */
2497         ldrb    r2, [r1]
2498         ldr     ip, [r1, #0x01]
2499         ldrh    r3, [r1, #0x05]
2500         ldrb    r1, [r1, #0x07]         /* last use of src; r1 is reused for data */
2501         strb    r2, [r0]
2502         str     ip, [r0, #0x01]
2503         strh    r3, [r0, #0x05]
2504         strb    r1, [r0, #0x07]
2505         RET
2506         LMEMCPY_8_PAD
2507
2508 /******************************************************************************
2509  * Special case for 12 byte copies
2510  */
2511 #define LMEMCPY_C_LOG2  7       /* 128 bytes */
2512 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2
2513         LMEMCPY_C_PAD
2514 .Lmemcpy_c:
2515         and     r2, r1, #0x03
2516         orr     r2, r2, r0, lsl #2
2517         ands    r2, r2, #0x0f
2518         sub     r3, pc, #0x14
2519         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2
2520
2521 /*
2522  * 0000: dst is 32-bit aligned, src is 32-bit aligned
2523  */
2524         ldr     r2, [r1]
2525         ldr     r3, [r1, #0x04]
2526         ldr     r1, [r1, #0x08]
2527         str     r2, [r0]
2528         str     r3, [r0, #0x04]
2529         str     r1, [r0, #0x08]
2530         RET
2531         LMEMCPY_C_PAD
2532
2533 /*
2534  * 0001: dst is 32-bit aligned, src is 8-bit aligned
2535  */
2536         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
2537         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2538         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2539         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2540 #ifdef __ARMEB__
2541         orr     r2, r2, ip, lsl #8      /* r2 = 89AB */
2542         str     r2, [r0, #0x08]
2543         mov     r2, ip, lsr #24         /* r2 = ...7 */
2544         orr     r2, r2, r3, lsl #8      /* r2 = 4567 */
2545         mov     r1, r1, lsl #8          /* r1 = 012. */
2546         orr     r1, r1, r3, lsr #24     /* r1 = 0123 */
2547 #else
2548         mov     r2, r2, lsl #24         /* r2 = B... */
2549         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
2550         str     r2, [r0, #0x08]
2551         mov     r2, ip, lsl #24         /* r2 = 7... */
2552         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
2553         mov     r1, r1, lsr #8          /* r1 = .210 */
2554         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
2555 #endif
2556         str     r2, [r0, #0x04]
2557         str     r1, [r0]
2558         RET
2559         LMEMCPY_C_PAD
2560
2561 /*
2562  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2563  */
2564         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2565         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2566         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2567         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2568 #ifdef __ARMEB__
2569         mov     r2, r2, lsl #16         /* r2 = 01.. */
2570         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2571         str     r2, [r0]
2572         mov     r3, r3, lsl #16         /* r3 = 45.. */
2573         orr     r3, r3, ip, lsr #16     /* r3 = 4567 */
2574         orr     r1, r1, ip, lsl #16     /* r1 = 89AB */
2575 #else
2576         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2577         str     r2, [r0]
2578         mov     r3, r3, lsr #16         /* r3 = ..54 */
2579         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
2580         mov     r1, r1, lsl #16         /* r1 = BA.. */
2581         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
2582 #endif
2583         str     r3, [r0, #0x04]
2584         str     r1, [r0, #0x08]
2585         RET
2586         LMEMCPY_C_PAD
2587
2588 /*
2589  * 0011: dst is 32-bit aligned, src is 8-bit aligned
2590  */
2591         ldrb    r2, [r1]                /* r2 = ...0 */
2592         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2593         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2594         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2595 #ifdef __ARMEB__
2596         mov     r2, r2, lsl #24         /* r2 = 0... */
2597         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
2598         str     r2, [r0]
2599         mov     r3, r3, lsl #24         /* r3 = 4... */
2600         orr     r3, r3, ip, lsr #8      /* r3 = 4567 */
2601         mov     r1, r1, lsr #8          /* r1 = .9AB */
2602         orr     r1, r1, ip, lsl #24     /* r1 = 89AB */
2603 #else
2604         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
2605         str     r2, [r0]
2606         mov     r3, r3, lsr #24         /* r3 = ...4 */
2607         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
2608         mov     r1, r1, lsl #8          /* r1 = BA9. */
2609         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
2610 #endif
2611         str     r3, [r0, #0x04]
2612         str     r1, [r0, #0x08]
2613         RET
2614         LMEMCPY_C_PAD
2615
2616 /*
2617  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2618  */
2619         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2620         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2621         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
2622         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
2623         strh    r1, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2624 #ifdef __ARMEB__
2625         mov     r1, r2, lsr #24         /* r1 = ...0 */
2626         strb    r1, [r0]                /* store byte 0 */
2627         mov     r1, r2, lsl #24         /* r1 = 3... */
2628         orr     r2, r1, r3, lsr #8      /* r2 = 3456 */
2629         mov     r1, r3, lsl #24         /* r1 = 7... */
2630         orr     r1, r1, ip, lsr #8      /* r1 = 789A */
2631 #else
2632         strb    r2, [r0]                /* store byte 0 (low byte of r2) */
2633         mov     r1, r2, lsr #24         /* r1 = ...3 */
2634         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
2635         mov     r1, r3, lsr #24         /* r1 = ...7 */
2636         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
2637         mov     ip, ip, lsr #24         /* ip = ...B */
2638 #endif
2639         str     r2, [r0, #0x03]         /* store bytes 3-6 */
2640         str     r1, [r0, #0x07]         /* store bytes 7-A */
2641         strb    ip, [r0, #0x0b]         /* store byte B (low byte of ip) */
2642         RET
2643         LMEMCPY_C_PAD
2644
2645 /*
2646  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2647  */
2648         ldrb    r2, [r1]                /* r2 = ...0 */
2649         ldrh    r3, [r1, #0x01]         /* BE:r3 = ..12  LE:r3 = ..21 */
2650         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2651         strb    r2, [r0]                /* store byte 0 */
2652         ldr     r2, [r1, #0x07]         /* BE:r2 = 789A  LE:r2 = A987 */
2653         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2654         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2655         str     ip, [r0, #0x03]         /* store bytes 3-6 */
2656         str     r2, [r0, #0x07]         /* store bytes 7-A */
2657         strb    r1, [r0, #0x0b]         /* store byte B */
2658         RET
2659         LMEMCPY_C_PAD
2661 /*
2662  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2663  */
2664         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2665         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2666         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2667         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2668 #ifdef __ARMEB__
2669         mov     r2, r2, ror #8          /* r2 = 1..0 */
2670         strb    r2, [r0]                /* store byte 0 */
2671         mov     r2, r2, lsr #16         /* r2 = ..1. */
2672         orr     r2, r2, r3, lsr #24     /* r2 = ..12 */
2673         strh    r2, [r0, #0x01]         /* store bytes 1-2 */
2674         mov     r2, r3, lsl #8          /* r2 = 345. */
2675         orr     r3, r2, ip, lsr #24     /* r3 = 3456 */
2676         mov     r2, ip, lsl #8          /* r2 = 789. */
2677         orr     r2, r2, r1, lsr #8      /* r2 = 789A */
2678 #else
2679         strb    r2, [r0]                /* store byte 0 */
2680         mov     r2, r2, lsr #8          /* r2 = ...1 */
2681         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2682         strh    r2, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2683         mov     r2, r3, lsr #8          /* r2 = .543 */
2684         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
2685         mov     r2, ip, lsr #8          /* r2 = .987 */
2686         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
2687         mov     r1, r1, lsr #8          /* r1 = ...B */
2688 #endif
2689         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2690         str     r2, [r0, #0x07]         /* store bytes 7-A */
2691         strb    r1, [r0, #0x0b]         /* store byte B (low byte of r1) */
2692         RET
2693         LMEMCPY_C_PAD
2695 /*
2696  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2697  */
2698         ldrb    r2, [r1]                /* r2 = ...0 */
2699         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2700         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2701         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2702         strb    r2, [r0]                /* store byte 0 */
2703 #ifdef __ARMEB__
2704         mov     r2, r3, lsr #16         /* r2 = ..12 */
2705         strh    r2, [r0, #0x01]         /* store bytes 1-2 */
2706         mov     r3, r3, lsl #16         /* r3 = 34.. */
2707         orr     r3, r3, ip, lsr #16     /* r3 = 3456 */
2708         mov     ip, ip, lsl #16         /* ip = 78.. */
2709         orr     ip, ip, r1, lsr #16     /* ip = 789A */
2710         mov     r1, r1, lsr #8          /* r1 = .9AB */
2711 #else
2712         strh    r3, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2713         mov     r3, r3, lsr #16         /* r3 = ..43 */
2714         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
2715         mov     ip, ip, lsr #16         /* ip = ..87 */
2716         orr     ip, ip, r1, lsl #16     /* ip = A987 */
2717         mov     r1, r1, lsr #16         /* r1 = ..xB */
2718 #endif
2719         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2720         str     ip, [r0, #0x07]         /* store bytes 7-A */
2721         strb    r1, [r0, #0x0b]         /* store byte B (low byte of r1) */
2722         RET
2723         LMEMCPY_C_PAD
2725 /*
2726  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2727  */
2728         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
2729         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2730         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
2731         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2732 #ifdef __ARMEB__
2733         strh    r1, [r0]                /* store bytes 0-1 */
2734         mov     r1, ip, lsl #16         /* r1 = 23.. */
2735         orr     r1, r1, r3, lsr #16     /* r1 = 2345 */
2736         mov     r3, r3, lsl #16         /* r3 = 67.. */
2737         orr     r3, r3, r2, lsr #16     /* r3 = 6789 */
2738 #else
2739         strh    ip, [r0]                /* store bytes 0-1 (low halfword) */
2740         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
2741         mov     r3, r3, lsr #16         /* r3 = ..76 */
2742         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
2743         mov     r2, r2, lsr #16         /* r2 = ..BA */
2744 #endif
2745         str     r1, [r0, #0x02]         /* store bytes 2-5 */
2746         str     r3, [r0, #0x06]         /* store bytes 6-9 */
2747         strh    r2, [r0, #0x0a]         /* store bytes A-B (low halfword) */
2748         RET
2749         LMEMCPY_C_PAD
2751 /*
2752  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2753  */
2754         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2755         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2756         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
2757         strh    ip, [r0]                /* store bytes 0-1 (low halfword) */
2758         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2759         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2760 #ifdef __ARMEB__
2761         mov     r2, r2, lsl #24         /* r2 = 2... */
2762         orr     r2, r2, r3, lsr #8      /* r2 = 2345 */
2763         mov     r3, r3, lsl #24         /* r3 = 6... */
2764         orr     r3, r3, ip, lsr #8      /* r3 = 6789 */
2765         orr     r1, r1, ip, lsl #8      /* r1 = 89AB */
2766 #else
2767         mov     r2, r2, lsr #24         /* r2 = ...2 */
2768         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
2769         mov     r3, r3, lsr #24         /* r3 = ...6 */
2770         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
2771         mov     r1, r1, lsl #8          /* r1 = ..B. */
2772         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
2773 #endif
2774         str     r2, [r0, #0x02]         /* store bytes 2-5 */
2775         str     r3, [r0, #0x06]         /* store bytes 6-9 */
2776         strh    r1, [r0, #0x0a]         /* store bytes A-B (low halfword) */
2777         RET
2778         LMEMCPY_C_PAD
2780 /*
2781  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2782  */
2783         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2784         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2785         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2786         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2787         strh    r2, [r0]                /* store bytes 0-1 */
2788         str     r3, [r0, #0x02]         /* store bytes 2-5 */
2789         str     ip, [r0, #0x06]         /* store bytes 6-9 */
2790         strh    r1, [r0, #0x0a]         /* store bytes A-B */
2791         RET
2792         LMEMCPY_C_PAD
2794 /*
2795  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2796  */
2797         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 */
2798         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
2799         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
2800         strh    ip, [r0, #0x0a]         /* store bytes A-B (low halfword) */
2801         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2802         ldrb    r1, [r1]                /* r1 = ...0 */
2803 #ifdef __ARMEB__
2804         mov     r2, r2, lsr #24         /* r2 = ...9 */
2805         orr     r2, r2, r3, lsl #8      /* r2 = 6789 */
2806         mov     r3, r3, lsr #24         /* r3 = ...5 */
2807         orr     r3, r3, ip, lsl #8      /* r3 = 2345 */
2808         mov     r1, r1, lsl #8          /* r1 = ..0. */
2809         orr     r1, r1, ip, lsr #24     /* r1 = ..01 */
2810 #else
2811         mov     r2, r2, lsl #24         /* r2 = 9... */
2812         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
2813         mov     r3, r3, lsl #24         /* r3 = 5... */
2814         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
2815         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
2816 #endif
2817         str     r2, [r0, #0x06]         /* store bytes 6-9 */
2818         str     r3, [r0, #0x02]         /* store bytes 2-5 */
2819         strh    r1, [r0]                /* store bytes 0-1 (low halfword) */
2820         RET
2821         LMEMCPY_C_PAD
2823 /*
2824  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2825  */
2826         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2827         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2828         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2829 #ifdef __ARMEB__
2830         mov     r3, r2, lsr #24         /* r3 = ...0 */
2831         strb    r3, [r0]                /* store byte 0 */
2832         mov     r2, r2, lsl #8          /* r2 = 123. */
2833         orr     r2, r2, ip, lsr #24     /* r2 = 1234 */
2834         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2835         mov     r2, ip, lsl #8          /* r2 = 567. */
2836         orr     r2, r2, r1, lsr #24     /* r2 = 5678 */
2837         str     r2, [r0, #0x05]         /* store bytes 5-8 */
2838         mov     r2, r1, lsr #8          /* r2 = .89A */
2839         strh    r2, [r0, #0x09]         /* store bytes 9-A (low halfword) */
2840         strb    r1, [r0, #0x0b]         /* store byte B (low byte of r1) */
2841 #else
2842         strb    r2, [r0]                /* store byte 0 (low byte of r2) */
2843         mov     r3, r2, lsr #8          /* r3 = .321 */
2844         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2845         str     r3, [r0, #0x01]         /* store bytes 1-4 */
2846         mov     r3, ip, lsr #8          /* r3 = .765 */
2847         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2848         str     r3, [r0, #0x05]         /* store bytes 5-8 */
2849         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2850         strh    r1, [r0, #0x09]         /* store bytes 9-A (low halfword) */
2851         mov     r1, r1, lsr #16         /* r1 = ...B */
2852         strb    r1, [r0, #0x0b]         /* store byte B */
2853 #endif
2854         RET
2855         LMEMCPY_C_PAD
2857 /*
2858  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2859  */
2860         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2861         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2862         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2863         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2864         strb    r2, [r0, #0x0b]         /* store byte B */
2865 #ifdef __ARMEB__
2866         strh    r3, [r0, #0x09]         /* store bytes 9-A (low halfword) */
2867         mov     r3, r3, lsr #16         /* r3 = ..78 */
2868         orr     r3, r3, ip, lsl #16     /* r3 = 5678 */
2869         mov     ip, ip, lsr #16         /* ip = ..34 */
2870         orr     ip, ip, r1, lsl #16     /* ip = 1234 */
2871         mov     r1, r1, lsr #16         /* r1 = ..x0 */
2872 #else
2873         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2874         strh    r2, [r0, #0x09]         /* store bytes 9-A */
2875         mov     r3, r3, lsl #16         /* r3 = 87.. */
2876         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2877         mov     ip, ip, lsl #16         /* ip = 43.. */
2878         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2879         mov     r1, r1, lsr #8          /* r1 = .210 */
2880 #endif
2881         str     r3, [r0, #0x05]         /* store bytes 5-8 */
2882         str     ip, [r0, #0x01]         /* store bytes 1-4 */
2883         strb    r1, [r0]                /* store byte 0 (low byte of r1) */
2884         RET
2885         LMEMCPY_C_PAD
2887 /*
2888  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2889  */
2890 #ifdef __ARMEB__
2891         ldrh    r2, [r1, #0x0a]         /* r2 = ..AB */
2892         ldr     ip, [r1, #0x06]         /* ip = 6789 */
2893         ldr     r3, [r1, #0x02]         /* r3 = 2345 */
2894         ldrh    r1, [r1]                /* r1 = ..01 */
2895         strb    r2, [r0, #0x0b]         /* store byte B (low byte of r2) */
2896         mov     r2, r2, lsr #8          /* r2 = ...A */
2897         orr     r2, r2, ip, lsl #8      /* r2 = 789A */
2898         mov     ip, ip, lsr #8          /* ip = .678 */
2899         orr     ip, ip, r3, lsl #24     /* ip = 5678 */
2900         mov     r3, r3, lsr #8          /* r3 = .234 */
2901         orr     r3, r3, r1, lsl #24     /* r3 = 1234 */
2902         mov     r1, r1, lsr #8          /* r1 = ...0 */
2903         strb    r1, [r0]                /* store byte 0 */
2904         str     r3, [r0, #0x01]         /* store bytes 1-4 */
2905         str     ip, [r0, #0x05]         /* store bytes 5-8 */
2906         strh    r2, [r0, #0x09]         /* store bytes 9-A (low halfword) */
2907 #else
2908         ldrh    r2, [r1]                /* r2 = ..10 */
2909         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2910         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2911         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2912         strb    r2, [r0]                /* store byte 0 (low byte of r2) */
2913         mov     r2, r2, lsr #8          /* r2 = ...1 */
2914         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2915         mov     r3, r3, lsr #24         /* r3 = ...5 */
2916         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2917         mov     ip, ip, lsr #24         /* ip = ...9 */
2918         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2919         mov     r1, r1, lsr #8          /* r1 = ...B */
2920         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2921         str     r3, [r0, #0x05]         /* store bytes 5-8 */
2922         strh    ip, [r0, #0x09]         /* store bytes 9-A (low halfword) */
2923         strb    r1, [r0, #0x0b]         /* store byte B */
2924 #endif
2925         RET
2926         LMEMCPY_C_PAD
2928 /*
2929  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2930  */
2931         ldrb    r2, [r1]                /* r2 = ...0 */
2932         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2933         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2934         strb    r2, [r0]                /* store byte 0 */
2935         ldrh    r2, [r1, #0x09]         /* BE:r2 = ..9A  LE:r2 = ..A9 */
2936         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2937         str     r3, [r0, #0x01]         /* store bytes 1-4 */
2938         str     ip, [r0, #0x05]         /* store bytes 5-8 */
2939         strh    r2, [r0, #0x09]         /* store bytes 9-A */
2940         strb    r1, [r0, #0x0b]         /* store byte B */
2941         RET
2942 #endif /* _ARM_ARCH_5E */
2943
2944 #ifdef GPROF
2945 /* GPROF builds only: five ENTRY labels, one nop each; presumably profiler address markers -- TODO confirm. */
2946 ENTRY(user)
2947         nop
2948 ENTRY(btrap)
2949         nop
2950 ENTRY(etrap)
2951         nop
2952 ENTRY(bintr)
2953         nop
2954 ENTRY(eintr)
2955         nop
2956
2957 #endif /* GPROF */