sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  *
  76  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  77  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  78  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  79  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  80  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  81  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  82  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  83  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  84  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  85  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  86  * POSSIBILITY OF SUCH DAMAGE.
  87  */
  88
  89 #include <machine/asm.h>
  90 __FBSDID("$FreeBSD$");
  91
  92 #include "assym.s"
  93
  94 .L_arm_memcpy:                          /* Literal pool: addresses of C-level variables, reachable with pc-relative ldr */
  95         .word   _C_LABEL(_arm_memcpy)   /* Not referenced in the part of the file reviewed here; presumably for memcpy - verify */
  96 .L_arm_bzero:
  97         .word   _C_LABEL(_arm_bzero)    /* Address of _arm_bzero; bzero loads the value stored there and jumps through it */
  98 .L_min_memcpy_size:
  99         .word   _C_LABEL(_min_memcpy_size)      /* Not referenced in the part of the file reviewed here; presumably for memcpy - verify */
 100 .L_min_bzero_size:
 101         .word   _C_LABEL(_min_bzero_size)       /* Address of _min_bzero_size; bzero compares its length against the value stored there */
 102 /*
 103  * memset: Sets a block of memory to the specified value
 104  *
 105  * On entry:
 106  *   r0 - dest address
 107  *   r1 - byte to write
 108  *   r2 - number of bytes to write
 109  *   (bzero, which comes first below: r0 - dest address, r1 - number of bytes to zero)
 110  * On exit:
 111  *   r0 - dest address
 112  */
 113 /* LINTSTUB: Func: void bzero(void *, size_t) */
 114 ENTRY(bzero)                            /* Zero r1 bytes at r0: try the routine stored in _arm_bzero, else use memset's code */
 115         ldr     r3, .L_arm_bzero        /* r3 = address of _arm_bzero (from the literal pool) */
 116         ldr     r3, [r3]                /* r3 = value currently stored in _arm_bzero */
 117         cmp     r3, #0
 118         beq     .Lnormal0               /* Nothing stored there: use the generic code */
 119         ldr     r2, .L_min_bzero_size
 120         ldr     r2, [r2]                /* r2 = value of _min_bzero_size */
 121         cmp     r1, r2
 122         blt     .Lnormal0               /* Length below that threshold (signed compare): generic code */
 123         stmfd   sp!, {r0, r1, lr}       /* Keep dest, length and return address across the call */
 124         mov     r2, #0                  /* Third argument is 0; its meaning is defined by the callee, not visible here */
 125         mov     lr, pc                  /* lr = address of the cmp below (pc reads two instructions ahead) */
 126         mov     pc, r3                  /* Indirect call through the value loaded from _arm_bzero */
 127         cmp     r0, #0                  /* Test the callee's return value */
 128         ldmfd   sp!, {r0, r1, lr}       /* Restore dest, length, lr; ldm leaves the flags from the cmp alone */
 129         RETeq                           /* Callee returned 0: nothing more to do */
 130 .Lnormal0:                              /* Generic path; also reached when the call above returned non-zero */
 131         mov     r3, #0x00               /* Fill value is zero */
 132         b       do_memset               /* Join memset with r0 = dest, r1 = length, r3 = fill value */
 133 EEND(bzero)
 134 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 135 ENTRY(memset)                           /* r0 = dest, r1 = fill value, r2 = length; r0 is never written, so it is still dest on return */
 136         and     r3, r1, #0xff           /* We deal with bytes */
 137         mov     r1, r2                  /* From here on: r1 = bytes left, r3 = fill pattern */
 138 do_memset:                              /* bzero joins here with r0 = dest, r1 = length, r3 = 0 */
 139         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 140         mov     ip, r0                  /* ip is the store cursor; mov keeps the flags from the cmp */
 141         blt     .Lmemset_lessthanfour
 142
 143         /* Ok first we will word align the address */
 144         ands    r2, ip, #0x03           /* Get the bottom two bits */
 145         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 146
 147         /* We are now word aligned */
 148 .Lmemset_wordaligned:
 149         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 150 #ifdef _ARM_ARCH_5E
 151         tst     ip, #0x04               /* Quad-align for armv5e */
 152 #else
 153         cmp     r1, #0x10               /* Flags are used by the blt below; the orr in between does not set flags */
 154 #endif
 155         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 156 #ifdef _ARM_ARCH_5E
 157         subne   r1, r1, #0x04           /* Quad-align if necessary */
 158         strne   r3, [ip], #0x04         /* One word store brings ip to an 8-byte boundary */
 159         cmp     r1, #0x10
 160 #endif
 161         blt     .Lmemset_loop4          /* If less than 16 then use words */
 162         mov     r2, r3                  /* Duplicate data: r2 and r3 are stored together below */
 163         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 164         blt     .Lmemset_loop32
 165
 166         /* Do 128 bytes at a time */
 167 .Lmemset_loop128:
 168         subs    r1, r1, #0x80           /* The "ge" stores below run only if a full 128 bytes were left */
 169 #ifdef _ARM_ARCH_5E
 170         strged  r2, [ip], #0x08         /* 16 stores of 8 bytes (r2, r3) each */
 171         strged  r2, [ip], #0x08
 172         strged  r2, [ip], #0x08
 173         strged  r2, [ip], #0x08
 174         strged  r2, [ip], #0x08
 175         strged  r2, [ip], #0x08
 176         strged  r2, [ip], #0x08
 177         strged  r2, [ip], #0x08
 178         strged  r2, [ip], #0x08
 179         strged  r2, [ip], #0x08
 180         strged  r2, [ip], #0x08
 181         strged  r2, [ip], #0x08
 182         strged  r2, [ip], #0x08
 183         strged  r2, [ip], #0x08
 184         strged  r2, [ip], #0x08
 185         strged  r2, [ip], #0x08
 186 #else
 187         stmgeia ip!, {r2-r3}            /* 16 stores of 8 bytes (r2, r3) each */
 188         stmgeia ip!, {r2-r3}
 189         stmgeia ip!, {r2-r3}
 190         stmgeia ip!, {r2-r3}
 191         stmgeia ip!, {r2-r3}
 192         stmgeia ip!, {r2-r3}
 193         stmgeia ip!, {r2-r3}
 194         stmgeia ip!, {r2-r3}
 195         stmgeia ip!, {r2-r3}
 196         stmgeia ip!, {r2-r3}
 197         stmgeia ip!, {r2-r3}
 198         stmgeia ip!, {r2-r3}
 199         stmgeia ip!, {r2-r3}
 200         stmgeia ip!, {r2-r3}
 201         stmgeia ip!, {r2-r3}
 202         stmgeia ip!, {r2-r3}
 203 #endif
 204         bgt     .Lmemset_loop128        /* Bytes remain after this chunk: go round again */
 205         RETeq                   /* Zero length so just exit */
 206
 207         add     r1, r1, #0x80           /* Adjust for extra sub */
 208
 209         /* Do 32 bytes at a time */
 210 .Lmemset_loop32:
 211         subs    r1, r1, #0x20           /* The "ge" stores below run only if a full 32 bytes were left */
 212 #ifdef _ARM_ARCH_5E
 213         strged  r2, [ip], #0x08
 214         strged  r2, [ip], #0x08
 215         strged  r2, [ip], #0x08
 216         strged  r2, [ip], #0x08
 217 #else
 218         stmgeia ip!, {r2-r3}
 219         stmgeia ip!, {r2-r3}
 220         stmgeia ip!, {r2-r3}
 221         stmgeia ip!, {r2-r3}
 222 #endif
 223         bgt     .Lmemset_loop32         /* Bytes remain after this chunk: go round again */
 224         RETeq                   /* Zero length so just exit */
 225
 226         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 227
 228         /* Deal with 16 bytes or more */
 229 #ifdef _ARM_ARCH_5E
 230         strged  r2, [ip], #0x08         /* r1 >= 0 here means at least 16 bytes were left */
 231         strged  r2, [ip], #0x08
 232 #else
 233         stmgeia ip!, {r2-r3}            /* r1 >= 0 here means at least 16 bytes were left */
 234         stmgeia ip!, {r2-r3}
 235 #endif
 236         RETeq                   /* Zero length so just exit */
 237
 238         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 239
 240         /* We have at least 4 bytes so copy as words */
 241 .Lmemset_loop4:
 242         subs    r1, r1, #0x04
 243         strge   r3, [ip], #0x04         /* Store a word only if 4 or more bytes were left */
 244         bgt     .Lmemset_loop4
 245         RETeq                   /* Zero length so just exit */
 246
 247 #ifdef _ARM_ARCH_5E
 248         /* Compensate for 64-bit alignment check */
 249         adds    r1, r1, #0x04           /* r1 = bytes left; can be 0 after the quad-align store above */
 250         RETeq
 251         cmp     r1, #2                  /* lt/eq/gt = 1/2/3 bytes left */
 252 #else
 253         cmp     r1, #-2                 /* r1 is bytes left minus 4 here; lt/eq/gt = 1/2/3 bytes left */
 254 #endif
 255
 256         strb    r3, [ip], #0x01         /* Set 1 byte */
 257         strgeb  r3, [ip], #0x01         /* Set another byte */
 258         strgtb  r3, [ip]                /* and a third */
 259         RET                     /* Exit */
 260
 261 .Lmemset_wordunaligned:                 /* r2 = ip & 3 (non-zero), r1 >= 4 */
 262         rsb     r2, r2, #0x004          /* r2 = bytes needed to reach word alignment (1..3) */
 263         strb    r3, [ip], #0x01         /* Set 1 byte */
 264         cmp     r2, #0x02
 265         strgeb  r3, [ip], #0x01         /* Set another byte */
 266         sub     r1, r1, r2              /* Account for them; plain sub keeps the flags for the strgtb */
 267         strgtb  r3, [ip], #0x01         /* and a third */
 268         cmp     r1, #0x04               /* More than 4 bytes left? */
 269         bge     .Lmemset_wordaligned    /* Yup */
 270
 271 .Lmemset_lessthanfour:                  /* r1 = bytes left, fewer than 4 */
 272         cmp     r1, #0x00
 273         RETeq                   /* Zero length so exit */
 274         strb    r3, [ip], #0x01         /* Set 1 byte */
 275         cmp     r1, #0x02
 276         strgeb  r3, [ip], #0x01         /* Set another byte */
 277         strgtb  r3, [ip]                /* and a third */
 278         RET                     /* Exit */
 279 END(memset)
 280
 281 ENTRY(bcmp)                             /* r0 = b1, r1 = b2, r2 = length; r0 returns 0 if equal, else (b1 byte - b2 byte) at the first mismatch */
 282         mov     ip, r0                  /* ip walks b1 so that r0 is free for the result */
 283         cmp     r2, #0x06
 284         beq     .Lmemcmp_6bytes         /* Dedicated path for length == 6 */
 285         mov     r0, #0x00               /* Result for the early return below */
 286
 287         /* Are both addresses aligned the same way? */
 288         cmp     r2, #0x00
 289         eornes  r3, ip, r1              /* r3 = b1 ^ b2; skipped (Z stays set) when length == 0 */
 290         RETeq                   /* len == 0, or same addresses! */
 291         tst     r3, #0x03               /* Do the low two address bits differ? */
 292         subne   r2, r2, #0x01           /* The byte loop expects r2 = bytes left - 1 */
 293         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
 294
 295         /* Word-align the addresses, if necessary */
 296         sub     r3, r1, #0x05
 297         ands    r3, r3, #0x03           /* r3 = 0, 1, 2, 3 for (r1 & 3) = 1, 2, 3, 0 */
 298         add     r3, r3, r3, lsl #1      /* r3 *= 3; the flags from the ands are kept */
 299         addne   pc, pc, r3, lsl #3      /* Skip r3 of the byte stanzas below; each must stay 6 instructions (24 bytes) */
 300         nop                             /* Padding: pc in the add reads as the address after this nop */
 301
 302         /* Compare up to 3 bytes */
 303         ldrb    r0, [ip], #0x01
 304         ldrb    r3, [r1], #0x01
 305         subs    r0, r0, r3
 306         RETne                           /* Mismatch: r0 = difference of the two bytes */
 307         subs    r2, r2, #0x01
 308         RETeq                           /* Length exhausted: r0 is 0 */
 309
 310         /* Compare up to 2 bytes */
 311         ldrb    r0, [ip], #0x01
 312         ldrb    r3, [r1], #0x01
 313         subs    r0, r0, r3
 314         RETne
 315         subs    r2, r2, #0x01
 316         RETeq
 317
 318         /* Compare 1 byte */
 319         ldrb    r0, [ip], #0x01
 320         ldrb    r3, [r1], #0x01
 321         subs    r0, r0, r3
 322         RETne
 323         subs    r2, r2, #0x01
 324         RETeq
 325
 326         /* Compare 4 bytes at a time, if possible */
 327         subs    r2, r2, #0x04           /* r2 = bytes left - 4 */
 328         bcc     .Lmemcmp_bytewise       /* Borrow: fewer than 4 bytes left */
 329 .Lmemcmp_word_aligned:
 330         ldr     r0, [ip], #0x04
 331         ldr     r3, [r1], #0x04
 332         subs    r2, r2, #0x04           /* Carry clear: no further full word follows this one */
 333         cmpcs   r0, r3                  /* Compare only while another full word follows */
 334         beq     .Lmemcmp_word_aligned   /* Not taken after a borrow: the subs result is then non-zero */
 335         sub     r0, r0, r3              /* Zero iff the last two words loaded were equal */
 336
 337         /* Correct for extra subtraction, and check if done */
 338         adds    r2, r2, #0x04           /* r2 = bytes left after the last word loaded */
 339         cmpeq   r0, #0x00               /* If done, did all bytes match? */
 340         RETeq                   /* Yup. Just return */
 341
 342         /* Re-do the final word byte-wise */
 343         sub     ip, ip, #0x04
 344         sub     r1, r1, #0x04
 345
 346 .Lmemcmp_bytewise:
 347         add     r2, r2, #0x03           /* Convert r2 to "bytes left - 1" for the loop below */
 348 .Lmemcmp_bytewise2:
 349         ldrb    r0, [ip], #0x01
 350         ldrb    r3, [r1], #0x01
 351         subs    r2, r2, #0x01           /* Borrow (carry clear) marks the last byte */
 352         cmpcs   r0, r3                  /* Skipped on the last byte, so the loop ends there */
 353         beq     .Lmemcmp_bytewise2
 354         sub     r0, r0, r3              /* Difference of the last pair of bytes loaded */
 355         RET
 356
 357         /*
 358          * 6 byte compares are very common, thanks to the network stack.
 359          * This code is hand-scheduled to reduce the number of stalls for
 360          * load results. Everything else being equal, this will be ~32%
 361          * faster than a byte-wise memcmp.
 362          */
 363         .align  5
 364 .Lmemcmp_6bytes:                        /* ip = b1, r1 = b2; byte loads only, so no alignment is assumed */
 365         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
 366         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
 367         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
 368         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
 369         ldreqb  r3, [ip, #0x01]         /* r3 = b1#1 */
 370         RETne                   /* Return if mismatch on #0 */
 371         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
 372         ldreqb  r3, [r1, #0x02]         /* r3 = b2#2 */
 373         ldreqb  r0, [ip, #0x02]         /* r0 = b1#2 */
 374         RETne                   /* Return if mismatch on #1 */
 375         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
 376         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
 377         ldreqb  r3, [ip, #0x03]         /* r3 = b1#3 */
 378         RETne                   /* Return if mismatch on #2 */
 379         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
 380         ldreqb  r3, [r1, #0x04]         /* r3 = b2#4 */
 381         ldreqb  r0, [ip, #0x04]         /* r0 = b1#4 */
 382         RETne                   /* Return if mismatch on #3 */
 383         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
 384         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
 385         ldreqb  r3, [ip, #0x05]         /* r3 = b1#5 */
 386         RETne                   /* Return if mismatch on #4 */
 387         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
 388         RET
 389 END(bcmp)
 390
 391 ENTRY(bcopy)
 392         /* switch the source and destination registers */
 393         eor     r0, r1, r0
 394         eor     r1, r0, r1
 395         eor     r0, r1, r0
 396 EENTRY(memmove)
 397         /* Do the buffers overlap? */
 398         cmp     r0, r1
 399         RETeq           /* Bail now if src/dst are the same */
 400         subcc   r3, r0, r1      /* if (dst > src) r3 = dst - src */
 401         subcs   r3, r1, r0      /* if (src > dsr) r3 = src - dst */
 402         cmp     r3, r2          /* if (r3 < len) we have an overlap */
 403         bcc     PIC_SYM(_C_LABEL(memcpy), PLT)
 404
 405         /* Determine copy direction */
 406         cmp     r1, r0
 407         bcc     .Lmemmove_backwards
 408
 409         moveq   r0, #0                  /* Quick abort for len=0 */
 410         RETeq
 411
 412         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
 413         subs    r2, r2, #4
 414         blt     .Lmemmove_fl4           /* less than 4 bytes */
 415         ands    r12, r0, #3
 416         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
 417         ands    r12, r1, #3
 418         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
 419
 420 .Lmemmove_ft8:
 421         /* We have aligned source and destination */
 422         subs    r2, r2, #8
 423         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
 424         subs    r2, r2, #0x14
 425         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
 426         stmdb   sp!, {r4}               /* borrow r4 */
 427
 428         /* blat 32 bytes at a time */
 429         /* XXX for really big copies perhaps we should use more registers */
 430 .Lmemmove_floop32:
 431         ldmia   r1!, {r3, r4, r12, lr}
 432         stmia   r0!, {r3, r4, r12, lr}
 433         ldmia   r1!, {r3, r4, r12, lr}
 434         stmia   r0!, {r3, r4, r12, lr}
 435         subs    r2, r2, #0x20
 436         bge     .Lmemmove_floop32
 437
 438         cmn     r2, #0x10
 439         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 440         stmgeia r0!, {r3, r4, r12, lr}
 441         subge   r2, r2, #0x10
 442         ldmia   sp!, {r4}               /* return r4 */
 443
 444 .Lmemmove_fl32:
 445         adds    r2, r2, #0x14
 446
 447         /* blat 12 bytes at a time */
 448 .Lmemmove_floop12:
 449         ldmgeia r1!, {r3, r12, lr}
 450         stmgeia r0!, {r3, r12, lr}
 451         subges  r2, r2, #0x0c
 452         bge     .Lmemmove_floop12
 453
 454 .Lmemmove_fl12:
 455         adds    r2, r2, #8
 456         blt     .Lmemmove_fl4
 457
 458         subs    r2, r2, #4
 459         ldrlt   r3, [r1], #4
 460         strlt   r3, [r0], #4
 461         ldmgeia r1!, {r3, r12}
 462         stmgeia r0!, {r3, r12}
 463         subge   r2, r2, #4
 464
 465 .Lmemmove_fl4:
 466         /* less than 4 bytes to go */
 467         adds    r2, r2, #4
 468         ldmeqia sp!, {r0, pc}           /* done */
 469
 470         /* copy the crud byte at a time */
 471         cmp     r2, #2
 472         ldrb    r3, [r1], #1
 473         strb    r3, [r0], #1
 474         ldrgeb  r3, [r1], #1
 475         strgeb  r3, [r0], #1
 476         ldrgtb  r3, [r1], #1
 477         strgtb  r3, [r0], #1
 478         ldmia   sp!, {r0, pc}
 479
 480         /* erg - unaligned destination */
 481 .Lmemmove_fdestul:
 482         rsb     r12, r12, #4
 483         cmp     r12, #2
 484
 485         /* align destination with byte copies */
 486         ldrb    r3, [r1], #1
 487         strb    r3, [r0], #1
 488         ldrgeb  r3, [r1], #1
 489         strgeb  r3, [r0], #1
 490         ldrgtb  r3, [r1], #1
 491         strgtb  r3, [r0], #1
 492         subs    r2, r2, r12
 493         blt     .Lmemmove_fl4           /* less the 4 bytes */
 494
 495         ands    r12, r1, #3
 496         beq     .Lmemmove_ft8           /* we have an aligned source */
 497
 498         /* erg - unaligned source */
 499         /* This is where it gets nasty ... */
 500 .Lmemmove_fsrcul:
 501         bic     r1, r1, #3
 502         ldr     lr, [r1], #4
 503         cmp     r12, #2
 504         bgt     .Lmemmove_fsrcul3
 505         beq     .Lmemmove_fsrcul2
 506         cmp     r2, #0x0c
 507         blt     .Lmemmove_fsrcul1loop4
 508         sub     r2, r2, #0x0c
 509         stmdb   sp!, {r4, r5}
 510
 511 .Lmemmove_fsrcul1loop16:
 512 #ifdef __ARMEB__
 513         mov     r3, lr, lsl #8
 514 #else
 515         mov     r3, lr, lsr #8
 516 #endif
 517         ldmia   r1!, {r4, r5, r12, lr}
 518 #ifdef __ARMEB__
 519         orr     r3, r3, r4, lsr #24
 520         mov     r4, r4, lsl #8
 521         orr     r4, r4, r5, lsr #24
 522         mov     r5, r5, lsl #8
 523         orr     r5, r5, r12, lsr #24
 524         mov     r12, r12, lsl #8
 525         orr     r12, r12, lr, lsr #24
 526 #else
 527         orr     r3, r3, r4, lsl #24
 528         mov     r4, r4, lsr #8
 529         orr     r4, r4, r5, lsl #24
 530         mov     r5, r5, lsr #8
 531         orr     r5, r5, r12, lsl #24
 532         mov     r12, r12, lsr #8
 533         orr     r12, r12, lr, lsl #24
 534 #endif
 535         stmia   r0!, {r3-r5, r12}
 536         subs    r2, r2, #0x10
 537         bge     .Lmemmove_fsrcul1loop16
 538         ldmia   sp!, {r4, r5}
 539         adds    r2, r2, #0x0c
 540         blt     .Lmemmove_fsrcul1l4
 541
 542 .Lmemmove_fsrcul1loop4:
 543 #ifdef __ARMEB__
 544         mov     r12, lr, lsl #8
 545 #else
 546         mov     r12, lr, lsr #8
 547 #endif
 548         ldr     lr, [r1], #4
 549 #ifdef __ARMEB__
 550         orr     r12, r12, lr, lsr #24
 551 #else
 552         orr     r12, r12, lr, lsl #24
 553 #endif
 554         str     r12, [r0], #4
 555         subs    r2, r2, #4
 556         bge     .Lmemmove_fsrcul1loop4
 557
 558 .Lmemmove_fsrcul1l4:
 559         sub     r1, r1, #3
 560         b       .Lmemmove_fl4
 561
 562 .Lmemmove_fsrcul2:
 563         cmp     r2, #0x0c
 564         blt     .Lmemmove_fsrcul2loop4
 565         sub     r2, r2, #0x0c
 566         stmdb   sp!, {r4, r5}
 567
 568 .Lmemmove_fsrcul2loop16:
 569 #ifdef __ARMEB__
 570         mov     r3, lr, lsl #16
 571 #else
 572         mov     r3, lr, lsr #16
 573 #endif
 574         ldmia   r1!, {r4, r5, r12, lr}
 575 #ifdef __ARMEB__
 576         orr     r3, r3, r4, lsr #16
 577         mov     r4, r4, lsl #16
 578         orr     r4, r4, r5, lsr #16
 579         mov     r5, r5, lsl #16
 580         orr     r5, r5, r12, lsr #16
 581         mov     r12, r12, lsl #16
 582         orr     r12, r12, lr, lsr #16
 583 #else
 584         orr     r3, r3, r4, lsl #16
 585         mov     r4, r4, lsr #16
 586         orr     r4, r4, r5, lsl #16
 587         mov     r5, r5, lsr #16
 588         orr     r5, r5, r12, lsl #16
 589         mov     r12, r12, lsr #16
 590         orr     r12, r12, lr, lsl #16
 591 #endif
 592         stmia   r0!, {r3-r5, r12}
 593         subs    r2, r2, #0x10
 594         bge     .Lmemmove_fsrcul2loop16
 595         ldmia   sp!, {r4, r5}
 596         adds    r2, r2, #0x0c
 597         blt     .Lmemmove_fsrcul2l4
 598
 599 .Lmemmove_fsrcul2loop4:
 600 #ifdef __ARMEB__
 601         mov     r12, lr, lsl #16
 602 #else
 603         mov     r12, lr, lsr #16
 604 #endif
 605         ldr     lr, [r1], #4
 606 #ifdef __ARMEB__
 607         orr     r12, r12, lr, lsr #16
 608 #else
 609         orr     r12, r12, lr, lsl #16
 610 #endif
 611         str     r12, [r0], #4
 612         subs    r2, r2, #4
 613         bge     .Lmemmove_fsrcul2loop4
 614
 615 .Lmemmove_fsrcul2l4:
 616         sub     r1, r1, #2
 617         b       .Lmemmove_fl4
 618
 619 .Lmemmove_fsrcul3:
 620         cmp     r2, #0x0c
 621         blt     .Lmemmove_fsrcul3loop4
 622         sub     r2, r2, #0x0c
 623         stmdb   sp!, {r4, r5}
 624
 625 .Lmemmove_fsrcul3loop16:
 626 #ifdef __ARMEB__
 627         mov     r3, lr, lsl #24
 628 #else
 629         mov     r3, lr, lsr #24
 630 #endif
 631         ldmia   r1!, {r4, r5, r12, lr}
 632 #ifdef __ARMEB__
 633         orr     r3, r3, r4, lsr #8
 634         mov     r4, r4, lsl #24
 635         orr     r4, r4, r5, lsr #8
 636         mov     r5, r5, lsl #24
 637         orr     r5, r5, r12, lsr #8
 638         mov     r12, r12, lsl #24
 639         orr     r12, r12, lr, lsr #8
 640 #else
 641         orr     r3, r3, r4, lsl #8
 642         mov     r4, r4, lsr #24
 643         orr     r4, r4, r5, lsl #8
 644         mov     r5, r5, lsr #24
 645         orr     r5, r5, r12, lsl #8
 646         mov     r12, r12, lsr #24
 647         orr     r12, r12, lr, lsl #8
 648 #endif
 649         stmia   r0!, {r3-r5, r12}
 650         subs    r2, r2, #0x10
 651         bge     .Lmemmove_fsrcul3loop16
 652         ldmia   sp!, {r4, r5}
 653         adds    r2, r2, #0x0c
 654         blt     .Lmemmove_fsrcul3l4
 655
 656 .Lmemmove_fsrcul3loop4:
 657 #ifdef __ARMEB__
 658         mov     r12, lr, lsl #24
 659 #else
 660         mov     r12, lr, lsr #24
 661 #endif
 662         ldr     lr, [r1], #4
 663 #ifdef __ARMEB__
 664         orr     r12, r12, lr, lsr #8
 665 #else
 666         orr     r12, r12, lr, lsl #8
 667 #endif
 668         str     r12, [r0], #4
 669         subs    r2, r2, #4
 670         bge     .Lmemmove_fsrcul3loop4
 671
 672 .Lmemmove_fsrcul3l4:
 673         sub     r1, r1, #1
 674         b       .Lmemmove_fl4
 675
 676 .Lmemmove_backwards:
 677         add     r1, r1, r2
 678         add     r0, r0, r2
 679         subs    r2, r2, #4
 680         blt     .Lmemmove_bl4           /* less than 4 bytes */
 681         ands    r12, r0, #3
 682         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
 683         ands    r12, r1, #3
 684         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
 685
 686 .Lmemmove_bt8:
 687         /* We have aligned source and destination */
 688         subs    r2, r2, #8
 689         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
 690         stmdb   sp!, {r4, lr}
 691         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
 692         blt     .Lmemmove_bl32
 693
 694         /* blat 32 bytes at a time */
 695         /* XXX for really big copies perhaps we should use more registers */
 696 .Lmemmove_bloop32:
 697         ldmdb   r1!, {r3, r4, r12, lr}
 698         stmdb   r0!, {r3, r4, r12, lr}
 699         ldmdb   r1!, {r3, r4, r12, lr}
 700         stmdb   r0!, {r3, r4, r12, lr}
 701         subs    r2, r2, #0x20
 702         bge     .Lmemmove_bloop32
 703
 704 .Lmemmove_bl32:
 705         cmn     r2, #0x10
 706         ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 707         stmgedb r0!, {r3, r4, r12, lr}
 708         subge   r2, r2, #0x10
 709         adds    r2, r2, #0x14
 710         ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
 711         stmgedb r0!, {r3, r12, lr}
 712         subge   r2, r2, #0x0c
 713         ldmia   sp!, {r4, lr}
 714
 715 .Lmemmove_bl12:
 716         adds    r2, r2, #8
 717         blt     .Lmemmove_bl4
 718         subs    r2, r2, #4
 719         ldrlt   r3, [r1, #-4]!
 720         strlt   r3, [r0, #-4]!
 721         ldmgedb r1!, {r3, r12}
 722         stmgedb r0!, {r3, r12}
 723         subge   r2, r2, #4
 724
 725 .Lmemmove_bl4:
 726         /* less than 4 bytes to go */
 727         adds    r2, r2, #4
 728         RETeq                   /* done */
 729
 730         /* copy the crud byte at a time */
 731         cmp     r2, #2
 732         ldrb    r3, [r1, #-1]!
 733         strb    r3, [r0, #-1]!
 734         ldrgeb  r3, [r1, #-1]!
 735         strgeb  r3, [r0, #-1]!
 736         ldrgtb  r3, [r1, #-1]!
 737         strgtb  r3, [r0, #-1]!
 738         RET
 739
 740         /* erg - unaligned destination */
 741 .Lmemmove_bdestul:
 742         cmp     r12, #2
 743
 744         /* align destination with byte copies */
 745         ldrb    r3, [r1, #-1]!
 746         strb    r3, [r0, #-1]!
 747         ldrgeb  r3, [r1, #-1]!
 748         strgeb  r3, [r0, #-1]!
 749         ldrgtb  r3, [r1, #-1]!
 750         strgtb  r3, [r0, #-1]!
 751         subs    r2, r2, r12
 752         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
 753         ands    r12, r1, #3
 754         beq     .Lmemmove_bt8           /* we have an aligned source */
 755
 756         /* erg - unaligned source */
 757         /* This is where it gets nasty ... */
 758 .Lmemmove_bsrcul:
 759         bic     r1, r1, #3
 760         ldr     r3, [r1, #0]
 761         cmp     r12, #2
 762         blt     .Lmemmove_bsrcul1
 763         beq     .Lmemmove_bsrcul2
 764         cmp     r2, #0x0c
 765         blt     .Lmemmove_bsrcul3loop4
 766         sub     r2, r2, #0x0c
 767         stmdb   sp!, {r4, r5, lr}
 768
 769 .Lmemmove_bsrcul3loop16:
 770 #ifdef __ARMEB__
 771         mov     lr, r3, lsr #8
 772 #else
 773         mov     lr, r3, lsl #8
 774 #endif
 775         ldmdb   r1!, {r3-r5, r12}
 776 #ifdef __ARMEB__
 777         orr     lr, lr, r12, lsl #24
 778         mov     r12, r12, lsr #8
 779         orr     r12, r12, r5, lsl #24
 780         mov     r5, r5, lsr #8
 781         orr     r5, r5, r4, lsl #24
 782         mov     r4, r4, lsr #8
 783         orr     r4, r4, r3, lsl #24
 784 #else
 785         orr     lr, lr, r12, lsr #24
 786         mov     r12, r12, lsl #8
 787         orr     r12, r12, r5, lsr #24
 788         mov     r5, r5, lsl #8
 789         orr     r5, r5, r4, lsr #24
 790         mov     r4, r4, lsl #8
 791         orr     r4, r4, r3, lsr #24
 792 #endif
 793         stmdb   r0!, {r4, r5, r12, lr}
 794         subs    r2, r2, #0x10
 795         bge     .Lmemmove_bsrcul3loop16
 796         ldmia   sp!, {r4, r5, lr}
 797         adds    r2, r2, #0x0c
 798         blt     .Lmemmove_bsrcul3l4
 799
 800 .Lmemmove_bsrcul3loop4:
 801 #ifdef __ARMEB__
 802         mov     r12, r3, lsr #8
 803 #else
 804         mov     r12, r3, lsl #8
 805 #endif
 806         ldr     r3, [r1, #-4]!
 807 #ifdef __ARMEB__
 808         orr     r12, r12, r3, lsl #24
 809 #else
 810         orr     r12, r12, r3, lsr #24
 811 #endif
 812         str     r12, [r0, #-4]!
 813         subs    r2, r2, #4
 814         bge     .Lmemmove_bsrcul3loop4
 815
 816 .Lmemmove_bsrcul3l4:
 817         add     r1, r1, #3
 818         b       .Lmemmove_bl4
 819
 820 .Lmemmove_bsrcul2:
 821         cmp     r2, #0x0c
 822         blt     .Lmemmove_bsrcul2loop4
 823         sub     r2, r2, #0x0c
 824         stmdb   sp!, {r4, r5, lr}
 825
 826 .Lmemmove_bsrcul2loop16:
 827 #ifdef __ARMEB__
 828         mov     lr, r3, lsr #16
 829 #else
 830         mov     lr, r3, lsl #16
 831 #endif
 832         ldmdb   r1!, {r3-r5, r12}
 833 #ifdef __ARMEB__
 834         orr     lr, lr, r12, lsl #16
 835         mov     r12, r12, lsr #16
 836         orr     r12, r12, r5, lsl #16
 837         mov     r5, r5, lsr #16
 838         orr     r5, r5, r4, lsl #16
 839         mov     r4, r4, lsr #16
 840         orr     r4, r4, r3, lsl #16
 841 #else
 842         orr     lr, lr, r12, lsr #16
 843         mov     r12, r12, lsl #16
 844         orr     r12, r12, r5, lsr #16
 845         mov     r5, r5, lsl #16
 846         orr     r5, r5, r4, lsr #16
 847         mov     r4, r4, lsl #16
 848         orr     r4, r4, r3, lsr #16
 849 #endif
 850         stmdb   r0!, {r4, r5, r12, lr}
 851         subs    r2, r2, #0x10
 852         bge     .Lmemmove_bsrcul2loop16
 853         ldmia   sp!, {r4, r5, lr}
 854         adds    r2, r2, #0x0c
 855         blt     .Lmemmove_bsrcul2l4
 856
 857 .Lmemmove_bsrcul2loop4:
 858 #ifdef __ARMEB__
 859         mov     r12, r3, lsr #16
 860 #else
 861         mov     r12, r3, lsl #16
 862 #endif
 863         ldr     r3, [r1, #-4]!
 864 #ifdef __ARMEB__
 865         orr     r12, r12, r3, lsl #16
 866 #else
 867         orr     r12, r12, r3, lsr #16
 868 #endif
 869         str     r12, [r0, #-4]!
 870         subs    r2, r2, #4
 871         bge     .Lmemmove_bsrcul2loop4
 872
 873 .Lmemmove_bsrcul2l4:
 874         add     r1, r1, #2
 875         b       .Lmemmove_bl4
 876
 877 .Lmemmove_bsrcul1:
 878         cmp     r2, #0x0c
 879         blt     .Lmemmove_bsrcul1loop4
 880         sub     r2, r2, #0x0c
 881         stmdb   sp!, {r4, r5, lr}
 882
 883 .Lmemmove_bsrcul1loop32:
 884 #ifdef __ARMEB__
 885         mov     lr, r3, lsr #24
 886 #else
 887         mov     lr, r3, lsl #24
 888 #endif
 889         ldmdb   r1!, {r3-r5, r12}
 890 #ifdef __ARMEB__
 891         orr     lr, lr, r12, lsl #8
 892         mov     r12, r12, lsr #24
 893         orr     r12, r12, r5, lsl #8
 894         mov     r5, r5, lsr #24
 895         orr     r5, r5, r4, lsl #8
 896         mov     r4, r4, lsr #24
 897         orr     r4, r4, r3, lsl #8
 898 #else
 899         orr     lr, lr, r12, lsr #8
 900         mov     r12, r12, lsl #24
 901         orr     r12, r12, r5, lsr #8
 902         mov     r5, r5, lsl #24
 903         orr     r5, r5, r4, lsr #8
 904         mov     r4, r4, lsl #24
 905         orr     r4, r4, r3, lsr #8
 906 #endif
 907         stmdb   r0!, {r4, r5, r12, lr}
 908         subs    r2, r2, #0x10
 909         bge     .Lmemmove_bsrcul1loop32
 910         ldmia   sp!, {r4, r5, lr}
 911         adds    r2, r2, #0x0c
 912         blt     .Lmemmove_bsrcul1l4
 913
 914 .Lmemmove_bsrcul1loop4:
 915 #ifdef __ARMEB__
 916         mov     r12, r3, lsr #24
 917 #else
 918         mov     r12, r3, lsl #24
 919 #endif
 920         ldr     r3, [r1, #-4]!
 921 #ifdef __ARMEB__
 922         orr     r12, r12, r3, lsl #8
 923 #else
 924         orr     r12, r12, r3, lsr #8
 925 #endif
 926         str     r12, [r0, #-4]!
 927         subs    r2, r2, #4
 928         bge     .Lmemmove_bsrcul1loop4
 929
 930 .Lmemmove_bsrcul1l4:
 931         add     r1, r1, #1
 932         b       .Lmemmove_bl4
 933 EEND(memmove)
 934 END(bcopy)
 935
 936 #if !defined(_ARM_ARCH_5E)
 937 ENTRY(memcpy)
     /*
      * void *memcpy(void *dst, const void *src, size_t len)
      *
      * Variant built when _ARM_ARCH_5E is not defined.  Copies forwards.
      *
      * In:    r0 = dst, r1 = src, r2 = len
      * Out:   r0 = dst as passed in
      * Uses:  r1-r3, r12 and the flags as scratch.  r4 and r5 are pushed
      *        before use and popped afterwards.  lr is pushed at .Lnormal and
      *        popped straight into pc on return, which frees it as a scratch
      *        register in the unaligned-source code.
      *
      * If the pointer stored behind .L_arm_memcpy is non-NULL and len is not
      * below the value stored behind .L_min_memcpy_size, that routine is
      * tried first; a zero return from it means the copy is done.  In every
      * other case the code from .Lnormal on performs the copy.
      *
      * From .Lnormal on, r2 holds (bytes remaining - bias), so the flags
      * left by each subs/adds tell whether another unit of the current size
      * is available.  The bias in force is noted at each stage.
      */
 938         /* save leaf functions having to store this away */
 939         /* Do not check arm_memcpy if we're running from flash */
 940 #if defined(FLASHADDR) && defined(PHYSADDR)
 941 #if FLASHADDR > PHYSADDR
 942         ldr     r3, =FLASHADDR
 943         cmp     r3, pc
 944         bls     .Lnormal
 945 #else
 946         ldr     r3, =FLASHADDR
 947         cmp     r3, pc
 948         bhi     .Lnormal
 949 #endif
 950 #endif
             /* No hook installed (pointer is NULL): copy here */
 951         ldr     r3, .L_arm_memcpy
 952         ldr     r3, [r3]
 953         cmp     r3, #0
 954         beq     .Lnormal
             /* len below the stored minimum: copy here */
 955         ldr     r3, .L_min_memcpy_size
 956         ldr     r3, [r3]
 957         cmp     r2, r3
 958         blt     .Lnormal
             /*
              * Call the hook with r0-r2 untouched and r3 = 0.  r0-r2 are
              * saved so the original arguments are back in place afterwards,
              * both as the return value and for the fallback copy below.
              */
 959         stmfd   sp!, {r0-r2, r4, lr}
 960         mov     r3, #0
 961         ldr     r4, .L_arm_memcpy
 962         mov     lr, pc                  /* pc reads as . + 8: the cmp below */
 963         ldr     pc, [r4]
 964         cmp     r0, #0
 965         ldmfd   sp!, {r0-r2, r4, lr}    /* ldm leaves the flags alone */
 966         RETeq                           /* hook returned 0: all done */
 967
 968 .Lnormal:
 969         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
 970
             /* r2 = remaining - 4 from here on */
 971         subs    r2, r2, #4
 972         blt     .Lmemcpy_l4             /* less than 4 bytes */
 973         ands    r12, r0, #3
 974         bne     .Lmemcpy_destul         /* oh unaligned destination addr */
 975         ands    r12, r1, #3
 976         bne     .Lmemcpy_srcul          /* oh unaligned source addr */
 977
 978 .Lmemcpy_t8:
 979         /* We have aligned source and destination */
 980         subs    r2, r2, #8
 981         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
 982         subs    r2, r2, #0x14
 983         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
 984         stmdb   sp!, {r4}               /* borrow r4 */
 985
 986         /* blat 32 bytes at a time */
 987         /* XXX for really big copies perhaps we should use more registers */
             /* r2 = remaining - 32 throughout this loop */
 988 .Lmemcpy_loop32:
 989         ldmia   r1!, {r3, r4, r12, lr}
 990         stmia   r0!, {r3, r4, r12, lr}
 991         ldmia   r1!, {r3, r4, r12, lr}
 992         stmia   r0!, {r3, r4, r12, lr}
 993         subs    r2, r2, #0x20
 994         bge     .Lmemcpy_loop32
 995
             /* cmn sets ge when r2 + 16 >= 0, i.e. at least 16 bytes remain */
 996         cmn     r2, #0x10
 997         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 998         stmgeia r0!, {r3, r4, r12, lr}
 999         subge   r2, r2, #0x10
1000         ldmia   sp!, {r4}               /* return r4 */
1001
1002 .Lmemcpy_l32:
             /* Re-bias: r2 = remaining - 12; the flags feed the loop below */
1003         adds    r2, r2, #0x14
1004
1005         /* blat 12 bytes at a time */
1006 .Lmemcpy_loop12:
1007         ldmgeia r1!, {r3, r12, lr}
1008         stmgeia r0!, {r3, r12, lr}
1009         subges  r2, r2, #0x0c
1010         bge     .Lmemcpy_loop12
1011
1012 .Lmemcpy_l12:
             /* Re-bias: r2 = remaining - 4 */
1013         adds    r2, r2, #8
1014         blt     .Lmemcpy_l4
1015
             /*
              * 4..11 bytes remain.  lt: copy one word.  ge: copy two words
              * and take the second word off the count as well.
              */
1016         subs    r2, r2, #4
1017         ldrlt   r3, [r1], #4
1018         strlt   r3, [r0], #4
1019         ldmgeia r1!, {r3, r12}
1020         stmgeia r0!, {r3, r12}
1021         subge   r2, r2, #4
1022
1023 .Lmemcpy_l4:
1024         /* less than 4 bytes to go */
             /* Drop the bias: r2 = bytes remaining (0..3) */
1025         adds    r2, r2, #4
             /*
              * NOTE(review): GCC's macro is spelled __APCS_26__; with the
              * single trailing underscore this guard presumably never
              * matches -- confirm before relying on the first branch.
              */
1026 #ifdef __APCS_26_
1027         ldmeqia sp!, {r0, pc}^          /* done */
1028 #else
1029         ldmeqia sp!, {r0, pc}           /* done */
1030 #endif
1031         /* copy the crud byte at a time */
             /* One byte always; a second if r2 >= 2; a third if r2 > 2 */
1032         cmp     r2, #2
1033         ldrb    r3, [r1], #1
1034         strb    r3, [r0], #1
1035         ldrgeb  r3, [r1], #1
1036         strgeb  r3, [r0], #1
1037         ldrgtb  r3, [r1], #1
1038         strgtb  r3, [r0], #1
1039         ldmia   sp!, {r0, pc}
1040
1041         /* erg - unaligned destination */
1042 .Lmemcpy_destul:
             /* r12 = dst & 3 on entry; turn it into bytes-to-align (1..3) */
1043         rsb     r12, r12, #4
1044         cmp     r12, #2
1045
1046         /* align destination with byte copies */
1047         ldrb    r3, [r1], #1
1048         strb    r3, [r0], #1
1049         ldrgeb  r3, [r1], #1
1050         strgeb  r3, [r0], #1
1051         ldrgtb  r3, [r1], #1
1052         strgtb  r3, [r0], #1
1053         subs    r2, r2, r12
1054         blt     .Lmemcpy_l4             /* less than 4 bytes */
1055
1056         ands    r12, r1, #3
1057         beq     .Lmemcpy_t8             /* we have an aligned source */
1058
1059         /* erg - unaligned source */
1060         /* This is where it gets nasty ... */
             /*
              * dst is word aligned, src is not; r12 = src & 3 (1..3) and
              * r2 = remaining - 4 (>= 0).  Round src down, preload the
              * word holding the first source bytes into lr, then build each
              * destination word from the unused bytes of the held word plus
              * the leading bytes of the next one.  Which end of a register
              * holds the lower-addressed bytes depends on endianness, so the
              * shift directions swap under __ARMEB__, as they do in the
              * other unaligned copy loops in this file.
              */
1061 .Lmemcpy_srcul:
1062         bic     r1, r1, #3
1063         ldr     lr, [r1], #4
1064         cmp     r12, #2
1065         bgt     .Lmemcpy_srcul3
1066         beq     .Lmemcpy_srcul2
             /* src & 3 == 1 */
1067         cmp     r2, #0x0c
1068         blt     .Lmemcpy_srcul1loop4
1069         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1070         stmdb   sp!, {r4, r5}
1071
1072 .Lmemcpy_srcul1loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #8
     #else
1073         mov     r3, lr, lsr #8
     #endif
1074         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #24
             mov     r4, r4, lsl #8
             orr     r4, r4, r5, lsr #24
             mov     r5, r5, lsl #8
             orr     r5, r5, r12, lsr #24
             mov     r12, r12, lsl #8
             orr     r12, r12, lr, lsr #24
     #else
1075         orr     r3, r3, r4, lsl #24
1076         mov     r4, r4, lsr #8
1077         orr     r4, r4, r5, lsl #24
1078         mov     r5, r5, lsr #8
1079         orr     r5, r5, r12, lsl #24
1080         mov     r12, r12, lsr #8
1081         orr     r12, r12, lr, lsl #24
     #endif
1082         stmia   r0!, {r3-r5, r12}
1083         subs    r2, r2, #0x10
1084         bge     .Lmemcpy_srcul1loop16
1085         ldmia   sp!, {r4, r5}
1086         adds    r2, r2, #0x0c           /* back to r2 = remaining - 4 */
1087         blt     .Lmemcpy_srcul1l4
1088
1089 .Lmemcpy_srcul1loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #8
     #else
1090         mov     r12, lr, lsr #8
     #endif
1091         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #24
     #else
1092         orr     r12, r12, lr, lsl #24
     #endif
1093         str     r12, [r0], #4
1094         subs    r2, r2, #4
1095         bge     .Lmemcpy_srcul1loop4
1096
1097 .Lmemcpy_srcul1l4:
             /* r1 is one word ahead; step back to the first uncopied byte */
1098         sub     r1, r1, #3
1099         b       .Lmemcpy_l4
1100
             /* src & 3 == 2 */
1101 .Lmemcpy_srcul2:
1102         cmp     r2, #0x0c
1103         blt     .Lmemcpy_srcul2loop4
1104         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1105         stmdb   sp!, {r4, r5}
1106
1107 .Lmemcpy_srcul2loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #16
     #else
1108         mov     r3, lr, lsr #16
     #endif
1109         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #16
             mov     r4, r4, lsl #16
             orr     r4, r4, r5, lsr #16
             mov     r5, r5, lsl #16
             orr     r5, r5, r12, lsr #16
             mov     r12, r12, lsl #16
             orr     r12, r12, lr, lsr #16
     #else
1110         orr     r3, r3, r4, lsl #16
1111         mov     r4, r4, lsr #16
1112         orr     r4, r4, r5, lsl #16
1113         mov     r5, r5, lsr #16
1114         orr     r5, r5, r12, lsl #16
1115         mov     r12, r12, lsr #16
1116         orr     r12, r12, lr, lsl #16
     #endif
1117         stmia   r0!, {r3-r5, r12}
1118         subs    r2, r2, #0x10
1119         bge     .Lmemcpy_srcul2loop16
1120         ldmia   sp!, {r4, r5}
1121         adds    r2, r2, #0x0c           /* back to r2 = remaining - 4 */
1122         blt     .Lmemcpy_srcul2l4
1123
1124 .Lmemcpy_srcul2loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #16
     #else
1125         mov     r12, lr, lsr #16
     #endif
1126         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #16
     #else
1127         orr     r12, r12, lr, lsl #16
     #endif
1128         str     r12, [r0], #4
1129         subs    r2, r2, #4
1130         bge     .Lmemcpy_srcul2loop4
1131
1132 .Lmemcpy_srcul2l4:
             /* r1 is one word ahead; step back to the first uncopied byte */
1133         sub     r1, r1, #2
1134         b       .Lmemcpy_l4
1135
             /* src & 3 == 3 */
1136 .Lmemcpy_srcul3:
1137         cmp     r2, #0x0c
1138         blt     .Lmemcpy_srcul3loop4
1139         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1140         stmdb   sp!, {r4, r5}
1141
1142 .Lmemcpy_srcul3loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #24
     #else
1143         mov     r3, lr, lsr #24
     #endif
1144         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #8
             mov     r4, r4, lsl #24
             orr     r4, r4, r5, lsr #8
             mov     r5, r5, lsl #24
             orr     r5, r5, r12, lsr #8
             mov     r12, r12, lsl #24
             orr     r12, r12, lr, lsr #8
     #else
1145         orr     r3, r3, r4, lsl #8
1146         mov     r4, r4, lsr #24
1147         orr     r4, r4, r5, lsl #8
1148         mov     r5, r5, lsr #24
1149         orr     r5, r5, r12, lsl #8
1150         mov     r12, r12, lsr #24
1151         orr     r12, r12, lr, lsl #8
     #endif
1152         stmia   r0!, {r3-r5, r12}
1153         subs    r2, r2, #0x10
1154         bge     .Lmemcpy_srcul3loop16
1155         ldmia   sp!, {r4, r5}
1156         adds    r2, r2, #0x0c           /* back to r2 = remaining - 4 */
1157         blt     .Lmemcpy_srcul3l4
1158
1159 .Lmemcpy_srcul3loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #24
     #else
1160         mov     r12, lr, lsr #24
     #endif
1161         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #8
     #else
1162         orr     r12, r12, lr, lsl #8
     #endif
1163         str     r12, [r0], #4
1164         subs    r2, r2, #4
1165         bge     .Lmemcpy_srcul3loop4
1166
1167 .Lmemcpy_srcul3l4:
             /* r1 is one word ahead; step back to the first uncopied byte */
1168         sub     r1, r1, #1
1169         b       .Lmemcpy_l4
1170 END(memcpy)
1171
1172 #else
1173 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1174 ENTRY(memcpy)
1175         pld     [r1]
1176         cmp     r2, #0x0c
1177         ble     .Lmemcpy_short          /* <= 12 bytes */
1178 #ifdef FLASHADDR
1179 #if FLASHADDR > PHYSADDR
1180         ldr     r3, =FLASHADDR
1181         cmp     r3, pc
1182         bls     .Lnormal
1183 #else
1184         ldr     r3, =FLASHADDR
1185         cmp     r3, pc
1186         bhi     .Lnormal
1187 #endif
1188 #endif
1189         ldr     r3, .L_arm_memcpy
1190         ldr     r3, [r3]
1191         cmp     r3, #0
1192         beq     .Lnormal
1193         ldr     r3, .L_min_memcpy_size
1194         ldr     r3, [r3]
1195         cmp     r2, r3
1196         blt     .Lnormal
1197         stmfd   sp!, {r0-r2, r4, lr}
1198         mov     r3, #0
1199         ldr     r4, .L_arm_memcpy
1200         mov     lr, pc
1201         ldr     pc, [r4]
1202         cmp     r0, #0
1203         ldmfd   sp!, {r0-r2, r4, lr}
1204         RETeq
1205 .Lnormal:
1206         mov     r3, r0                  /* We must not clobber r0 */
1207
1208         /* Word-align the destination buffer */
1209         ands    ip, r3, #0x03           /* Already word aligned? */
1210         beq     .Lmemcpy_wordaligned    /* Yup */
1211         cmp     ip, #0x02
1212         ldrb    ip, [r1], #0x01
1213         sub     r2, r2, #0x01
1214         strb    ip, [r3], #0x01
1215         ldrleb  ip, [r1], #0x01
1216         suble   r2, r2, #0x01
1217         strleb  ip, [r3], #0x01
1218         ldrltb  ip, [r1], #0x01
1219         sublt   r2, r2, #0x01
1220         strltb  ip, [r3], #0x01
1221
1222         /* Destination buffer is now word aligned */
1223 .Lmemcpy_wordaligned:
1224         ands    ip, r1, #0x03           /* Is src also word-aligned? */
1225         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
1226
1227         /* Quad-align the destination buffer */
1228         tst     r3, #0x07               /* Already quad aligned? */
1229         ldrne   ip, [r1], #0x04
1230         stmfd   sp!, {r4-r9}            /* Free up some registers */
1231         subne   r2, r2, #0x04
1232         strne   ip, [r3], #0x04
1233
1234         /* Destination buffer quad aligned, source is at least word aligned */
1235         subs    r2, r2, #0x80
1236         blt     .Lmemcpy_w_lessthan128
1237
1238         /* Copy 128 bytes at a time */
1239 .Lmemcpy_w_loop128:
1240         ldr     r4, [r1], #0x04         /* LD:00-03 */
1241         ldr     r5, [r1], #0x04         /* LD:04-07 */
1242         pld     [r1, #0x18]             /* Prefetch 0x20 */
1243         ldr     r6, [r1], #0x04         /* LD:08-0b */
1244         ldr     r7, [r1], #0x04         /* LD:0c-0f */
1245         ldr     r8, [r1], #0x04         /* LD:10-13 */
1246         ldr     r9, [r1], #0x04         /* LD:14-17 */
1247         strd    r4, [r3], #0x08         /* ST:00-07 */
1248         ldr     r4, [r1], #0x04         /* LD:18-1b */
1249         ldr     r5, [r1], #0x04         /* LD:1c-1f */
1250         strd    r6, [r3], #0x08         /* ST:08-0f */
1251         ldr     r6, [r1], #0x04         /* LD:20-23 */
1252         ldr     r7, [r1], #0x04         /* LD:24-27 */
1253         pld     [r1, #0x18]             /* Prefetch 0x40 */
1254         strd    r8, [r3], #0x08         /* ST:10-17 */
1255         ldr     r8, [r1], #0x04         /* LD:28-2b */
1256         ldr     r9, [r1], #0x04         /* LD:2c-2f */
1257         strd    r4, [r3], #0x08         /* ST:18-1f */
1258         ldr     r4, [r1], #0x04         /* LD:30-33 */
1259         ldr     r5, [r1], #0x04         /* LD:34-37 */
1260         strd    r6, [r3], #0x08         /* ST:20-27 */
1261         ldr     r6, [r1], #0x04         /* LD:38-3b */
1262         ldr     r7, [r1], #0x04         /* LD:3c-3f */
1263         strd    r8, [r3], #0x08         /* ST:28-2f */
1264         ldr     r8, [r1], #0x04         /* LD:40-43 */
1265         ldr     r9, [r1], #0x04         /* LD:44-47 */
1266         pld     [r1, #0x18]             /* Prefetch 0x60 */
1267         strd    r4, [r3], #0x08         /* ST:30-37 */
1268         ldr     r4, [r1], #0x04         /* LD:48-4b */
1269         ldr     r5, [r1], #0x04         /* LD:4c-4f */
1270         strd    r6, [r3], #0x08         /* ST:38-3f */
1271         ldr     r6, [r1], #0x04         /* LD:50-53 */
1272         ldr     r7, [r1], #0x04         /* LD:54-57 */
1273         strd    r8, [r3], #0x08         /* ST:40-47 */
1274         ldr     r8, [r1], #0x04         /* LD:58-5b */
1275         ldr     r9, [r1], #0x04         /* LD:5c-5f */
1276         strd    r4, [r3], #0x08         /* ST:48-4f */
1277         ldr     r4, [r1], #0x04         /* LD:60-63 */
1278         ldr     r5, [r1], #0x04         /* LD:64-67 */
1279         pld     [r1, #0x18]             /* Prefetch 0x80 */
1280         strd    r6, [r3], #0x08         /* ST:50-57 */
1281         ldr     r6, [r1], #0x04         /* LD:68-6b */
1282         ldr     r7, [r1], #0x04         /* LD:6c-6f */
1283         strd    r8, [r3], #0x08         /* ST:58-5f */
1284         ldr     r8, [r1], #0x04         /* LD:70-73 */
1285         ldr     r9, [r1], #0x04         /* LD:74-77 */
1286         strd    r4, [r3], #0x08         /* ST:60-67 */
1287         ldr     r4, [r1], #0x04         /* LD:78-7b */
1288         ldr     r5, [r1], #0x04         /* LD:7c-7f */
1289         strd    r6, [r3], #0x08         /* ST:68-6f */
1290         strd    r8, [r3], #0x08         /* ST:70-77 */
1291         subs    r2, r2, #0x80
1292         strd    r4, [r3], #0x08         /* ST:78-7f */
1293         bge     .Lmemcpy_w_loop128
1294
1295 .Lmemcpy_w_lessthan128:
1296         adds    r2, r2, #0x80           /* Adjust for extra sub */
1297         ldmeqfd sp!, {r4-r9}
1298         RETeq                   /* Return now if done */
1299         subs    r2, r2, #0x20
1300         blt     .Lmemcpy_w_lessthan32
1301
1302         /* Copy 32 bytes at a time */
1303 .Lmemcpy_w_loop32:
1304         ldr     r4, [r1], #0x04
1305         ldr     r5, [r1], #0x04
1306         pld     [r1, #0x18]
1307         ldr     r6, [r1], #0x04
1308         ldr     r7, [r1], #0x04
1309         ldr     r8, [r1], #0x04
1310         ldr     r9, [r1], #0x04
1311         strd    r4, [r3], #0x08
1312         ldr     r4, [r1], #0x04
1313         ldr     r5, [r1], #0x04
1314         strd    r6, [r3], #0x08
1315         strd    r8, [r3], #0x08
1316         subs    r2, r2, #0x20
1317         strd    r4, [r3], #0x08
1318         bge     .Lmemcpy_w_loop32
1319
1320 .Lmemcpy_w_lessthan32:
1321         adds    r2, r2, #0x20           /* Adjust for extra sub */
1322         ldmeqfd sp!, {r4-r9}
1323         RETeq                   /* Return now if done */
1324
1325         and     r4, r2, #0x18
1326         rsbs    r4, r4, #0x18
1327         addne   pc, pc, r4, lsl #1
1328         nop
1329
1330         /* At least 24 bytes remaining */
1331         ldr     r4, [r1], #0x04
1332         ldr     r5, [r1], #0x04
1333         sub     r2, r2, #0x08
1334         strd    r4, [r3], #0x08
1335
1336         /* At least 16 bytes remaining */
1337         ldr     r4, [r1], #0x04
1338         ldr     r5, [r1], #0x04
1339         sub     r2, r2, #0x08
1340         strd    r4, [r3], #0x08
1341
1342         /* At least 8 bytes remaining */
1343         ldr     r4, [r1], #0x04
1344         ldr     r5, [r1], #0x04
1345         subs    r2, r2, #0x08
1346         strd    r4, [r3], #0x08
1347
1348         /* Less than 8 bytes remaining */
1349         ldmfd   sp!, {r4-r9}
1350         RETeq                   /* Return now if done */
1351         subs    r2, r2, #0x04
1352         ldrge   ip, [r1], #0x04
1353         strge   ip, [r3], #0x04
1354         RETeq                   /* Return now if done */
1355         addlt   r2, r2, #0x04
1356         ldrb    ip, [r1], #0x01
1357         cmp     r2, #0x02
1358         ldrgeb  r2, [r1], #0x01
1359         strb    ip, [r3], #0x01
1360         ldrgtb  ip, [r1]
1361         strgeb  r2, [r3], #0x01
1362         strgtb  ip, [r3]
1363         RET
1364
1365
1366 /*
1367  * At this point, it has not been possible to word align both buffers.
1368  * The destination buffer is word aligned, but the source buffer is not.
1369  */
1370 .Lmemcpy_bad_align:
1371         stmfd   sp!, {r4-r7}
1372         bic     r1, r1, #0x03
1373         cmp     ip, #2
1374         ldr     ip, [r1], #0x04
1375         bgt     .Lmemcpy_bad3
1376         beq     .Lmemcpy_bad2
1377         b       .Lmemcpy_bad1
1378
1379 .Lmemcpy_bad1_loop16:
1380 #ifdef __ARMEB__
1381         mov     r4, ip, lsl #8
1382 #else
1383         mov     r4, ip, lsr #8
1384 #endif
1385         ldr     r5, [r1], #0x04
1386         pld     [r1, #0x018]
1387         ldr     r6, [r1], #0x04
1388         ldr     r7, [r1], #0x04
1389         ldr     ip, [r1], #0x04
1390 #ifdef __ARMEB__
1391         orr     r4, r4, r5, lsr #24
1392         mov     r5, r5, lsl #8
1393         orr     r5, r5, r6, lsr #24
1394         mov     r6, r6, lsl #8
1395         orr     r6, r6, r7, lsr #24
1396         mov     r7, r7, lsl #8
1397         orr     r7, r7, ip, lsr #24
1398 #else
1399         orr     r4, r4, r5, lsl #24
1400         mov     r5, r5, lsr #8
1401         orr     r5, r5, r6, lsl #24
1402         mov     r6, r6, lsr #8
1403         orr     r6, r6, r7, lsl #24
1404         mov     r7, r7, lsr #8
1405         orr     r7, r7, ip, lsl #24
1406 #endif
1407         str     r4, [r3], #0x04
1408         str     r5, [r3], #0x04
1409         str     r6, [r3], #0x04
1410         str     r7, [r3], #0x04
1411 .Lmemcpy_bad1:
1412         subs    r2, r2, #0x10
1413         bge     .Lmemcpy_bad1_loop16
1414
1415         adds    r2, r2, #0x10
1416         ldmeqfd sp!, {r4-r7}
1417         RETeq                   /* Return now if done */
1418         subs    r2, r2, #0x04
1419         sublt   r1, r1, #0x03
1420         blt     .Lmemcpy_bad_done
1421
1422 .Lmemcpy_bad1_loop4:
1423 #ifdef __ARMEB__
1424         mov     r4, ip, lsl #8
1425 #else
1426         mov     r4, ip, lsr #8
1427 #endif
1428         ldr     ip, [r1], #0x04
1429         subs    r2, r2, #0x04
1430 #ifdef __ARMEB__
1431         orr     r4, r4, ip, lsr #24
1432 #else
1433         orr     r4, r4, ip, lsl #24
1434 #endif
1435         str     r4, [r3], #0x04
1436         bge     .Lmemcpy_bad1_loop4
1437         sub     r1, r1, #0x03
1438         b       .Lmemcpy_bad_done
1439
1440 .Lmemcpy_bad2_loop16:
1441 #ifdef __ARMEB__
1442         mov     r4, ip, lsl #16
1443 #else
1444         mov     r4, ip, lsr #16
1445 #endif
1446         ldr     r5, [r1], #0x04
1447         pld     [r1, #0x018]
1448         ldr     r6, [r1], #0x04
1449         ldr     r7, [r1], #0x04
1450         ldr     ip, [r1], #0x04
1451 #ifdef __ARMEB__
1452         orr     r4, r4, r5, lsr #16
1453         mov     r5, r5, lsl #16
1454         orr     r5, r5, r6, lsr #16
1455         mov     r6, r6, lsl #16
1456         orr     r6, r6, r7, lsr #16
1457         mov     r7, r7, lsl #16
1458         orr     r7, r7, ip, lsr #16
1459 #else
1460         orr     r4, r4, r5, lsl #16
1461         mov     r5, r5, lsr #16
1462         orr     r5, r5, r6, lsl #16
1463         mov     r6, r6, lsr #16
1464         orr     r6, r6, r7, lsl #16
1465         mov     r7, r7, lsr #16
1466         orr     r7, r7, ip, lsl #16
1467 #endif
1468         str     r4, [r3], #0x04
1469         str     r5, [r3], #0x04
1470         str     r6, [r3], #0x04
1471         str     r7, [r3], #0x04
1472 .Lmemcpy_bad2:
1473         subs    r2, r2, #0x10
1474         bge     .Lmemcpy_bad2_loop16
1475
1476         adds    r2, r2, #0x10
1477         ldmeqfd sp!, {r4-r7}
1478         RETeq                   /* Return now if done */
1479         subs    r2, r2, #0x04
1480         sublt   r1, r1, #0x02
1481         blt     .Lmemcpy_bad_done
1482
1483 .Lmemcpy_bad2_loop4:
1484 #ifdef __ARMEB__
1485         mov     r4, ip, lsl #16
1486 #else
1487         mov     r4, ip, lsr #16
1488 #endif
1489         ldr     ip, [r1], #0x04
1490         subs    r2, r2, #0x04
1491 #ifdef __ARMEB__
1492         orr     r4, r4, ip, lsr #16
1493 #else
1494         orr     r4, r4, ip, lsl #16
1495 #endif
1496         str     r4, [r3], #0x04
1497         bge     .Lmemcpy_bad2_loop4
1498         sub     r1, r1, #0x02
1499         b       .Lmemcpy_bad_done
1500
1501 .Lmemcpy_bad3_loop16:
1502 #ifdef __ARMEB__
1503         mov     r4, ip, lsl #24
1504 #else
1505         mov     r4, ip, lsr #24
1506 #endif
1507         ldr     r5, [r1], #0x04
1508         pld     [r1, #0x018]
1509         ldr     r6, [r1], #0x04
1510         ldr     r7, [r1], #0x04
1511         ldr     ip, [r1], #0x04
1512 #ifdef __ARMEB__
1513         orr     r4, r4, r5, lsr #8
1514         mov     r5, r5, lsl #24
1515         orr     r5, r5, r6, lsr #8
1516         mov     r6, r6, lsl #24
1517         orr     r6, r6, r7, lsr #8
1518         mov     r7, r7, lsl #24
1519         orr     r7, r7, ip, lsr #8
1520 #else
1521         orr     r4, r4, r5, lsl #8
1522         mov     r5, r5, lsr #24
1523         orr     r5, r5, r6, lsl #8
1524         mov     r6, r6, lsr #24
1525         orr     r6, r6, r7, lsl #8
1526         mov     r7, r7, lsr #24
1527         orr     r7, r7, ip, lsl #8
1528 #endif
1529         str     r4, [r3], #0x04
1530         str     r5, [r3], #0x04
1531         str     r6, [r3], #0x04
1532         str     r7, [r3], #0x04
1533 .Lmemcpy_bad3:
1534         subs    r2, r2, #0x10
1535         bge     .Lmemcpy_bad3_loop16
1536
1537         adds    r2, r2, #0x10
1538         ldmeqfd sp!, {r4-r7}
1539         RETeq                   /* Return now if done */
1540         subs    r2, r2, #0x04
1541         sublt   r1, r1, #0x01
1542         blt     .Lmemcpy_bad_done
1543
1544 .Lmemcpy_bad3_loop4:
1545 #ifdef __ARMEB__
1546         mov     r4, ip, lsl #24
1547 #else
1548         mov     r4, ip, lsr #24
1549 #endif
1550         ldr     ip, [r1], #0x04
1551         subs    r2, r2, #0x04
1552 #ifdef __ARMEB__
1553         orr     r4, r4, ip, lsr #8
1554 #else
1555         orr     r4, r4, ip, lsl #8
1556 #endif
1557         str     r4, [r3], #0x04
1558         bge     .Lmemcpy_bad3_loop4
1559         sub     r1, r1, #0x01
1560
1561 .Lmemcpy_bad_done:
1562         ldmfd   sp!, {r4-r7}
1563         adds    r2, r2, #0x04
1564         RETeq
1565         ldrb    ip, [r1], #0x01
1566         cmp     r2, #0x02
1567         ldrgeb  r2, [r1], #0x01
1568         strb    ip, [r3], #0x01
1569         ldrgtb  ip, [r1]
1570         strgeb  r2, [r3], #0x01
1571         strgtb  ip, [r3]
1572         RET
1573
1574
1575 /*
1576  * Handle short copies (12 bytes or fewer), possibly misaligned.
1577  * Some of these are *very* common, thanks to the network stack,
1578  * and so are handled specially.
1579  */
1580 .Lmemcpy_short:
1581         add     pc, pc, r2, lsl #2
1582         nop
1583         RET                     /* 0x00 */
1584         b       .Lmemcpy_bytewise       /* 0x01 */
1585         b       .Lmemcpy_bytewise       /* 0x02 */
1586         b       .Lmemcpy_bytewise       /* 0x03 */
1587         b       .Lmemcpy_4              /* 0x04 */
1588         b       .Lmemcpy_bytewise       /* 0x05 */
1589         b       .Lmemcpy_6              /* 0x06 */
1590         b       .Lmemcpy_bytewise       /* 0x07 */
1591         b       .Lmemcpy_8              /* 0x08 */
1592         b       .Lmemcpy_bytewise       /* 0x09 */
1593         b       .Lmemcpy_bytewise       /* 0x0a */
1594         b       .Lmemcpy_bytewise       /* 0x0b */
1595         b       .Lmemcpy_c              /* 0x0c */
1596 .Lmemcpy_bytewise:
1597         mov     r3, r0                  /* We must not clobber r0 */
1598         ldrb    ip, [r1], #0x01
1599 1:      subs    r2, r2, #0x01
1600         strb    ip, [r3], #0x01
1601         ldrneb  ip, [r1], #0x01
1602         bne     1b
1603         RET
1604
1605 /******************************************************************************
1606  * Special case for 4 byte copies
1607  */
1608 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1609 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1610         LMEMCPY_4_PAD
1611 .Lmemcpy_4:
1612         and     r2, r1, #0x03
1613         orr     r2, r2, r0, lsl #2
1614         ands    r2, r2, #0x0f
1615         sub     r3, pc, #0x14
1616         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1617
1618 /*
1619  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1620  */
1621         ldr     r2, [r1]
1622         str     r2, [r0]
1623         RET
1624         LMEMCPY_4_PAD
1625
1626 /*
1627  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1628  */
1629         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1630         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1631 #ifdef __ARMEB__
1632         mov     r3, r3, lsl #8          /* r3 = 012. */
1633         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
1634 #else
1635         mov     r3, r3, lsr #8          /* r3 = .210 */
1636         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1637 #endif
1638         str     r3, [r0]
1639         RET
1640         LMEMCPY_4_PAD
1641
1642 /*
1643  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1644  */
1645 #ifdef __ARMEB__
1646         ldrh    r3, [r1]
1647         ldrh    r2, [r1, #0x02]
1648 #else
1649         ldrh    r3, [r1, #0x02]
1650         ldrh    r2, [r1]
1651 #endif
1652         orr     r3, r2, r3, lsl #16
1653         str     r3, [r0]
1654         RET
1655         LMEMCPY_4_PAD
1656
1657 /*
1658  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1659  */
1660         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1661         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1662 #ifdef __ARMEB__
1663         mov     r3, r3, lsl #24         /* r3 = 0... */
1664         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
1665 #else
1666         mov     r3, r3, lsr #24         /* r3 = ...0 */
1667         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1668 #endif
1669         str     r3, [r0]
1670         RET
1671         LMEMCPY_4_PAD
1672
1673 /*
1674  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1675  */
1676         ldr     r2, [r1]
1677 #ifdef __ARMEB__
1678         strb    r2, [r0, #0x03]
1679         mov     r3, r2, lsr #8
1680         mov     r1, r2, lsr #24
1681         strb    r1, [r0]
1682 #else
1683         strb    r2, [r0]
1684         mov     r3, r2, lsr #8
1685         mov     r1, r2, lsr #24
1686         strb    r1, [r0, #0x03]
1687 #endif
1688         strh    r3, [r0, #0x01]
1689         RET
1690         LMEMCPY_4_PAD
1691
1692 /*
1693  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1694  */
1695         ldrb    r2, [r1]
1696         ldrh    r3, [r1, #0x01]
1697         ldrb    r1, [r1, #0x03]
1698         strb    r2, [r0]
1699         strh    r3, [r0, #0x01]
1700         strb    r1, [r0, #0x03]
1701         RET
1702         LMEMCPY_4_PAD
1703
1704 /*
1705  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1706  */
1707         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1708         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1709 #ifdef __ARMEB__
1710         mov     r1, r2, lsr #8          /* r1 = ...0 */
1711         strb    r1, [r0]
1712         mov     r2, r2, lsl #8          /* r2 = .01. */
1713         orr     r2, r2, r3, lsr #8      /* r2 = .012 */
1714 #else
1715         strb    r2, [r0]
1716         mov     r2, r2, lsr #8          /* r2 = ...1 */
1717         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1718         mov     r3, r3, lsr #8          /* r3 = ...3 */
1719 #endif
1720         strh    r2, [r0, #0x01]
1721         strb    r3, [r0, #0x03]
1722         RET
1723         LMEMCPY_4_PAD
1724
1725 /*
1726  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1727  */
1728         ldrb    r2, [r1]
1729         ldrh    r3, [r1, #0x01]
1730         ldrb    r1, [r1, #0x03]
1731         strb    r2, [r0]
1732         strh    r3, [r0, #0x01]
1733         strb    r1, [r0, #0x03]
1734         RET
1735         LMEMCPY_4_PAD
1736
1737 /*
1738  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1739  */
1740         ldr     r2, [r1]
1741 #ifdef __ARMEB__
1742         strh    r2, [r0, #0x02]
1743         mov     r3, r2, lsr #16
1744         strh    r3, [r0]
1745 #else
1746         strh    r2, [r0]
1747         mov     r3, r2, lsr #16
1748         strh    r3, [r0, #0x02]
1749 #endif
1750         RET
1751         LMEMCPY_4_PAD
1752
1753 /*
1754  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1755  */
1756         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1757         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1758         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1759         strh    r1, [r0]
1760 #ifdef __ARMEB__
1761         mov     r2, r2, lsl #8          /* r2 = 012. */
1762         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1763 #else
1764         mov     r2, r2, lsr #24         /* r2 = ...2 */
1765         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1766 #endif
1767         strh    r2, [r0, #0x02]
1768         RET
1769         LMEMCPY_4_PAD
1770
1771 /*
1772  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1773  */
1774         ldrh    r2, [r1]
1775         ldrh    r3, [r1, #0x02]
1776         strh    r2, [r0]
1777         strh    r3, [r0, #0x02]
1778         RET
1779         LMEMCPY_4_PAD
1780
1781 /*
1782  * 1011: dst is 16-bit aligned (dst & 3 == 2), src is 8-bit aligned (src & 3 == 3)
      * Both word loads are word aligned (src + 1 and src - 3); 'x' marks
      * bytes that are not part of the 4-byte copy.
1783  */
1784         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1785         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1786         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1787         strh    r1, [r0, #0x02]         /* low halfword holds bytes 2-3 */
1788 #ifdef __ARMEB__
1789         mov     r3, r3, lsr #24         /* r3 = ...1 */
1790         orr     r3, r3, r2, lsl #8      /* r3 = xx01 */
1791 #else
1792         mov     r3, r3, lsl #8          /* r3 = 321. */
1793         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1794 #endif
1795         strh    r3, [r0]                /* low halfword holds bytes 0-1 */
1796         RET
1797         LMEMCPY_4_PAD
1798
1799 /*
1800  * 1100: dst is 8-bit aligned (dst & 3 == 3), src is 32-bit aligned
      * One word load, written back as byte / halfword / byte; dst + 1 is
      * the only halfword-aligned address inside the destination.
1801  */
1802         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1803 #ifdef __ARMEB__
1804         strb    r2, [r0, #0x03]         /* store byte 3 */
1805         mov     r3, r2, lsr #8          /* r3 = .012 */
1806         mov     r1, r2, lsr #24         /* r1 = ...0 */
1807         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
1808         strb    r1, [r0]                /* store byte 0 */
1809 #else
1810         strb    r2, [r0]                /* store byte 0 */
1811         mov     r3, r2, lsr #8          /* r3 = .321 */
1812         mov     r1, r2, lsr #24         /* r1 = ...3 */
1813         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
1814         strb    r1, [r0, #0x03]         /* store byte 3 */
1815 #endif
1816         RET
1817         LMEMCPY_4_PAD
1818
1819 /*
1820  * 1101: dst is 8-bit aligned (dst & 3 == 3), src is 8-bit aligned (src & 3 == 1)
      * src + 1 and dst + 1 are both halfword aligned, so the copy is
      * byte / halfword / byte with no shifting.
1821  */
1822         ldrb    r2, [r1]                /* byte 0 */
1823         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
1824         ldrb    r1, [r1, #0x03]         /* byte 3; r1 (src) is overwritten by its last load */
1825         strb    r2, [r0]
1826         strh    r3, [r0, #0x01]
1827         strb    r1, [r0, #0x03]
1828         RET
1829         LMEMCPY_4_PAD
1830
1831 /*
1832  * 1110: dst is 8-bit aligned (dst & 3 == 3), src is 16-bit aligned (src & 3 == 2)
      * Two halfword loads are re-split into byte / halfword / byte stores.
1833  */
1834 #ifdef __ARMEB__
1835         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1836         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1837         strb    r3, [r0, #0x03]         /* store byte 3 */
1838         mov     r3, r3, lsr #8          /* r3 = ...2 */
1839         orr     r3, r3, r2, lsl #8      /* r3 = .012 */
1840         strh    r3, [r0, #0x01]         /* low halfword holds bytes 1-2 */
1841         mov     r2, r2, lsr #8          /* r2 = ...0 */
1842         strb    r2, [r0]                /* store byte 0 */
1843 #else
1844         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1845         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1846         strb    r2, [r0]                /* store byte 0 */
1847         mov     r2, r2, lsr #8          /* r2 = ...1 */
1848         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1849         strh    r2, [r0, #0x01]         /* low halfword holds bytes 1-2 */
1850         mov     r3, r3, lsr #8          /* r3 = ...3 */
1851         strb    r3, [r0, #0x03]         /* store byte 3 */
1852 #endif
1853         RET
1854         LMEMCPY_4_PAD
1855
1856 /*
1857  * 1111: dst is 8-bit aligned (dst & 3 == 3), src is 8-bit aligned (src & 3 == 3)
      * src + 1 and dst + 1 are both word aligned, hence halfword aligned:
      * copy as byte / halfword / byte with no shifting.
1858  */
1859         ldrb    r2, [r1]                /* byte 0 */
1860         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
1861         ldrb    r1, [r1, #0x03]         /* byte 3; r1 (src) is overwritten by its last load */
1862         strb    r2, [r0]
1863         strh    r3, [r0, #0x01]
1864         strb    r1, [r0, #0x03]
1865         RET
1866         LMEMCPY_4_PAD
1867
1868
1869 /******************************************************************************
1870  * Special case for 6 byte copies
      *
      * r0 = dst, r1 = src.  A 4-bit index ((dst & 3) << 2) | (src & 3) selects
      * one of 16 hand-written copy sequences.  Every sequence is followed by
      * LMEMCPY_6_PAD, so sequence N starts at .Lmemcpy_6 + N * 64 and must fit
      * in a single 64-byte slot.  Slot 0 holds the dispatch code itself plus
      * the fully aligned (0000) copy, which is reached by falling through.
1871  */
1872 #define LMEMCPY_6_LOG2  6       /* 64 bytes */
1873 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2
1874         LMEMCPY_6_PAD
1875 .Lmemcpy_6:
1876         and     r2, r1, #0x03           /* r2 = src & 3 */
1877         orr     r2, r2, r0, lsl #2      /* merge in dst << 2 */
1878         ands    r2, r2, #0x0f           /* r2 = slot index; Z set when both are word aligned */
1879         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_6: pc reads as this insn + 8 = label + 0x14 */
1880         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2 /* index != 0: jump to its slot */
1881
1882 /*
1883  * 0000: dst is 32-bit aligned, src is 32-bit aligned
      * Reached by fall-through from the dispatch code above.
1884  */
1885         ldr     r2, [r1]                /* bytes 0-3 */
1886         ldrh    r3, [r1, #0x04]         /* bytes 4-5 */
1887         str     r2, [r0]
1888         strh    r3, [r0, #0x04]
1889         RET
1890         LMEMCPY_6_PAD
1891
1892 /*
1893  * 0001: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 1)
      * Register pictures list source byte indices, most significant byte
      * first; '.' is a zero byte and 'x' a byte outside the 6-byte copy.
      * Both word loads are word aligned (src - 1 and src + 3).
1894  */
1895         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1896         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1897 #ifdef __ARMEB__
1898         mov     r2, r2, lsl #8          /* r2 = 012. */
1899         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1900 #else
1901         mov     r2, r2, lsr #8          /* r2 = .210 */
1902         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1903 #endif
1904         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1905         str     r2, [r0]                /* store bytes 0-3 */
1906         strh    r3, [r0, #0x04]         /* low halfword holds bytes 4-5 */
1907         RET
1908         LMEMCPY_6_PAD
1909
1910 /*
1911  * 0010: dst is 32-bit aligned, src is 16-bit aligned (src & 3 == 2)
      * src + 2 is word aligned, so bytes 2-5 come from one word load.
1912  */
1913         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1914         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1915 #ifdef __ARMEB__
1916         mov     r1, r3, lsr #16         /* r1 = ..23 */
1917         orr     r1, r1, r2, lsl #16     /* r1 = 0123 */
1918         str     r1, [r0]                /* store bytes 0-3 */
1919         strh    r3, [r0, #0x04]         /* low halfword of r3 holds bytes 4-5 */
1920 #else
1921         mov     r1, r3, lsr #16         /* r1 = ..54 */
1922         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1923         str     r2, [r0]                /* store bytes 0-3 */
1924         strh    r1, [r0, #0x04]         /* store bytes 4-5 */
1925 #endif
1926         RET
1927         LMEMCPY_6_PAD
1928
1929 /*
1930  * 0011: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 3)
      * All three word loads are word aligned (src - 3, src + 1, src + 5);
      * 'x' marks bytes outside the 6-byte copy.
1931  */
1932         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1933         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1934         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r1 = xxx5 */
1935 #ifdef __ARMEB__
1936         mov     r2, r2, lsl #24         /* r2 = 0... */
1937         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
1938         mov     r3, r3, lsl #8          /* r3 = 234. */
1939         orr     r1, r3, r1, lsr #24     /* r1 = 2345 */
1940 #else
1941         mov     r2, r2, lsr #24         /* r2 = ...0 */
1942         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1943         mov     r1, r1, lsl #8          /* r1 = xx5. */
1944         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1945 #endif
1946         str     r2, [r0]                /* store bytes 0-3 */
1947         strh    r1, [r0, #0x04]         /* low halfword holds bytes 4-5 */
1948         RET
1949         LMEMCPY_6_PAD
1950
1951 /*
1952  * 0100: dst is 8-bit aligned (dst & 3 == 1), src is 32-bit aligned
      * Stored as byte / halfword / halfword / byte (dst + 1 and dst + 3 are
      * halfword aligned).
1953  */
1954         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1955         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1956         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1957         strh    r1, [r0, #0x01]         /* low halfword holds bytes 1-2 */
1958 #ifdef __ARMEB__
1959         mov     r1, r3, lsr #24         /* r1 = ...0 */
1960         strb    r1, [r0]                /* store byte 0 */
1961         mov     r3, r3, lsl #8          /* r3 = 123. */
1962         orr     r3, r3, r2, lsr #8      /* r3 = 1234 */
1963 #else
1964         strb    r3, [r0]                /* store byte 0 */
1965         mov     r3, r3, lsr #24         /* r3 = ...3 */
1966         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1967         mov     r2, r2, lsr #8          /* r2 = ...5 */
1968 #endif
1969         strh    r3, [r0, #0x03]         /* low halfword holds bytes 3-4 */
1970         strb    r2, [r0, #0x05]         /* low byte holds byte 5 */
1971         RET
1972         LMEMCPY_6_PAD
1973
1974 /*
1975  * 0101: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 1)
      * Identical misalignment: copy byte / halfword / halfword / byte with
      * no shifting.  All loads are issued before the first store.
1976  */
1977         ldrb    r2, [r1]                /* byte 0 */
1978         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
1979         ldrh    ip, [r1, #0x03]         /* bytes 3-4 */
1980         ldrb    r1, [r1, #0x05]         /* byte 5; r1 (src) is overwritten by its last load */
1981         strb    r2, [r0]
1982         strh    r3, [r0, #0x01]
1983         strh    ip, [r0, #0x03]
1984         strb    r1, [r0, #0x05]
1985         RET
1986         LMEMCPY_6_PAD
1987
1988 /*
1989  * 0110: dst is 8-bit aligned (dst & 3 == 1), src is 16-bit aligned (src & 3 == 2)
      * Loaded as halfword + word, stored as byte / halfword / halfword / byte.
1990  */
1991         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1992         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1993 #ifdef __ARMEB__
1994         mov     r3, r2, lsr #8          /* r3 = ...0 */
1995         strb    r3, [r0]                /* store byte 0 */
1996         strb    r1, [r0, #0x05]         /* low byte of r1 is byte 5 */
1997         mov     r3, r1, lsr #8          /* r3 = .234 */
1998         strh    r3, [r0, #0x03]         /* low halfword holds bytes 3-4 */
1999         mov     r3, r2, lsl #8          /* r3 = .01. */
2000         orr     r3, r3, r1, lsr #24     /* r3 = .012 */
2001         strh    r3, [r0, #0x01]         /* low halfword holds bytes 1-2 */
2002 #else
2003         strb    r2, [r0]                /* low byte of r2 is byte 0 */
2004         mov     r3, r1, lsr #24         /* r3 = ...5 */
2005         strb    r3, [r0, #0x05]         /* store byte 5 */
2006         mov     r3, r1, lsr #8          /* r3 = .543 */
2007         strh    r3, [r0, #0x03]         /* low halfword holds bytes 3-4 */
2008         mov     r3, r2, lsr #8          /* r3 = ...1 */
2009         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
2010         strh    r3, [r0, #0x01]         /* low halfword holds bytes 1-2 */
2011 #endif
2012         RET
2013         LMEMCPY_6_PAD
2014
2015 /*
2016  * 0111: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 3)
      * src and dst differ by 2 modulo 4, so odd offsets are halfword
      * aligned on both sides: copy byte / halfword / halfword / byte.
2017  */
2018         ldrb    r2, [r1]                /* byte 0 */
2019         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
2020         ldrh    ip, [r1, #0x03]         /* bytes 3-4 */
2021         ldrb    r1, [r1, #0x05]         /* byte 5; r1 (src) is overwritten by its last load */
2022         strb    r2, [r0]
2023         strh    r3, [r0, #0x01]
2024         strh    ip, [r0, #0x03]
2025         strb    r1, [r0, #0x05]
2026         RET
2027         LMEMCPY_6_PAD
2028
2029 /*
2030  * 1000: dst is 16-bit aligned (dst & 3 == 2), src is 32-bit aligned
      * Stored as halfword + word (dst + 2 is word aligned).
2031  */
2032 #ifdef __ARMEB__
2033         ldr     r2, [r1]                /* r2 = 0123 */
2034         ldrh    r3, [r1, #0x04]         /* r3 = ..45 */
2035         mov     r1, r2, lsr #16         /* r1 = ..01 */
2036         orr     r3, r3, r2, lsl#16      /* r3 = 2345 */
2037         strh    r1, [r0]                /* store bytes 0-1 */
2038         str     r3, [r0, #0x02]         /* store bytes 2-5 */
2039 #else
2040         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
2041         ldr     r3, [r1]                /* r3 = 3210 */
2042         mov     r2, r2, lsl #16         /* r2 = 54.. */
2043         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
2044         strh    r3, [r0]                /* low halfword of r3 holds bytes 0-1 */
2045         str     r2, [r0, #0x02]         /* store bytes 2-5 */
2046 #endif
2047         RET
2048         LMEMCPY_6_PAD
2049
2050 /*
2051  * 1001: dst is 16-bit aligned (dst & 3 == 2), src is 8-bit aligned (src & 3 == 1)
      * Both word loads are word aligned (src - 1 and src + 3); 'x' marks
      * bytes outside the 6-byte copy.
2052  */
2053         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2054         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
2055         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2056 #ifdef __ARMEB__
2057         mov     r2, r2, lsr #8          /* r2 = .345 */
2058         orr     r2, r2, r3, lsl #24     /* r2 = 2345 */
2059 #else
2060         mov     r2, r2, lsl #8          /* r2 = 543. */
2061         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
2062 #endif
2063         strh    r1, [r0]                /* low halfword holds bytes 0-1 */
2064         str     r2, [r0, #0x02]         /* store bytes 2-5 */
2065         RET
2066         LMEMCPY_6_PAD
2067
2068 /*
2069  * 1010: dst is 16-bit aligned (dst & 3 == 2), src is 16-bit aligned (src & 3 == 2)
      * Identical misalignment: copy halfword + word with no shifting.
2070  */
2071         ldrh    r2, [r1]                /* bytes 0-1 */
2072         ldr     r3, [r1, #0x02]         /* bytes 2-5 */
2073         strh    r2, [r0]
2074         str     r3, [r0, #0x02]
2075         RET
2076         LMEMCPY_6_PAD
2077
2078 /*
2079  * 1011: dst is 16-bit aligned (dst & 3 == 2), src is 8-bit aligned (src & 3 == 3)
      * Loaded as byte / word / byte (src + 1 is word aligned), stored as
      * halfword + word.
2080  */
2081         ldrb    r3, [r1]                /* r3 = ...0 */
2082         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2083         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2084 #ifdef __ARMEB__
2085         mov     r3, r3, lsl #8          /* r3 = ..0. */
2086         orr     r3, r3, r2, lsr #24     /* r3 = ..01 */
2087         orr     r1, r1, r2, lsl #8      /* r1 = 2345 */
2088 #else
2089         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2090         mov     r1, r1, lsl #24         /* r1 = 5... */
2091         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
2092 #endif
2093         strh    r3, [r0]                /* low halfword holds bytes 0-1 */
2094         str     r1, [r0, #0x02]         /* store bytes 2-5 */
2095         RET
2096         LMEMCPY_6_PAD
2097
2098 /*
2099  * 1100: dst is 8-bit aligned (dst & 3 == 3), src is 32-bit aligned
      * Stored as byte / word / byte (dst + 1 is word aligned).
2100  */
2101         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2102         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
2103 #ifdef __ARMEB__
2104         mov     r3, r2, lsr #24         /* r3 = ...0 */
2105         strb    r3, [r0]                /* store byte 0 */
2106         mov     r2, r2, lsl #8          /* r2 = 123. */
2107         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2108 #else
2109         strb    r2, [r0]                /* low byte of r2 is byte 0 */
2110         mov     r2, r2, lsr #8          /* r2 = .321 */
2111         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
2112         mov     r1, r1, lsr #8          /* r1 = ...5 */
2113 #endif
2114         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2115         strb    r1, [r0, #0x05]         /* low byte holds byte 5 */
2116         RET
2117         LMEMCPY_6_PAD
2118
2119 /*
2120  * 1101: dst is 8-bit aligned (dst & 3 == 3), src is 8-bit aligned (src & 3 == 1)
      * src and dst differ by 2 modulo 4, so odd offsets are halfword
      * aligned on both sides: copy byte / halfword / halfword / byte.
2121  */
2122         ldrb    r2, [r1]                /* byte 0 */
2123         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
2124         ldrh    ip, [r1, #0x03]         /* bytes 3-4 */
2125         ldrb    r1, [r1, #0x05]         /* byte 5; r1 (src) is overwritten by its last load */
2126         strb    r2, [r0]
2127         strh    r3, [r0, #0x01]
2128         strh    ip, [r0, #0x03]
2129         strb    r1, [r0, #0x05]
2130         RET
2131         LMEMCPY_6_PAD
2132
2133 /*
2134  * 1110: dst is 8-bit aligned (dst & 3 == 3), src is 16-bit aligned (src & 3 == 2)
      * Loaded as halfword + word, stored as byte / word / byte.
2135  */
2136         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2137         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
2138 #ifdef __ARMEB__
2139         mov     r3, r2, lsr #8          /* r3 = ...0 */
2140         strb    r3, [r0]                /* store byte 0 */
2141         mov     r2, r2, lsl #24         /* r2 = 1... */
2142         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2143 #else
2144         strb    r2, [r0]                /* low byte of r2 is byte 0 */
2145         mov     r2, r2, lsr #8          /* r2 = ...1 */
2146         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
2147         mov     r1, r1, lsr #24         /* r1 = ...5 */
2148 #endif
2149         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2150         strb    r1, [r0, #0x05]         /* low byte holds byte 5 */
2151         RET
2152         LMEMCPY_6_PAD
2153
2154 /*
2155  * 1111: dst is 8-bit aligned (dst & 3 == 3), src is 8-bit aligned (src & 3 == 3)
      * Identical misalignment: src + 1 and dst + 1 are word aligned, so the
      * copy is byte / word / byte with no shifting.
2156  */
2157         ldrb    r2, [r1]                /* byte 0 */
2158         ldr     r3, [r1, #0x01]         /* bytes 1-4 */
2159         ldrb    r1, [r1, #0x05]         /* byte 5; r1 (src) is overwritten by its last load */
2160         strb    r2, [r0]
2161         str     r3, [r0, #0x01]
2162         strb    r1, [r0, #0x05]
2163         RET
2164         LMEMCPY_6_PAD
2165
2166
2167 /******************************************************************************
2168  * Special case for 8 byte copies
      *
      * r0 = dst, r1 = src.  A 4-bit index ((dst & 3) << 2) | (src & 3) selects
      * one of 16 hand-written copy sequences.  Every sequence is followed by
      * LMEMCPY_8_PAD, so sequence N starts at .Lmemcpy_8 + N * 64 and must fit
      * in a single 64-byte slot.  Slot 0 holds the dispatch code itself plus
      * the fully aligned (0000) copy, which is reached by falling through.
2169  */
2170 #define LMEMCPY_8_LOG2  6       /* 64 bytes */
2171 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2
2172         LMEMCPY_8_PAD
2173 .Lmemcpy_8:
2174         and     r2, r1, #0x03           /* r2 = src & 3 */
2175         orr     r2, r2, r0, lsl #2      /* merge in dst << 2 */
2176         ands    r2, r2, #0x0f           /* r2 = slot index; Z set when both are word aligned */
2177         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_8: pc reads as this insn + 8 = label + 0x14 */
2178         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* index != 0: jump to its slot */
2179
2180 /*
2181  * 0000: dst is 32-bit aligned, src is 32-bit aligned
      * Reached by fall-through from the dispatch code above.
2182  */
2183         ldr     r2, [r1]                /* bytes 0-3 */
2184         ldr     r3, [r1, #0x04]         /* bytes 4-7 */
2185         str     r2, [r0]
2186         str     r3, [r0, #0x04]
2187         RET
2188         LMEMCPY_8_PAD
2189
2190 /*
2191  * 0001: dst is 32-bit aligned, src is 8-bit aligned
2192  */
2193         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2194         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
2195         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2196 #ifdef __ARMEB__
2197         mov     r3, r3, lsl #8          /* r3 = 012. */
2198         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
2199         orr     r2, r1, r2, lsl #8      /* r2 = 4567 */
2200 #else
2201         mov     r3, r3, lsr #8          /* r3 = .210 */
2202         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
2203         mov     r1, r1, lsl #24         /* r1 = 7... */
2204         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
2205 #endif
2206         str     r3, [r0]
2207         str     r2, [r0, #0x04]
2208         RET
2209         LMEMCPY_8_PAD
2210
2211 /*
2212  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2213  */
2214         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2215         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2216         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2217 #ifdef __ARMEB__
2218         mov     r2, r2, lsl #16         /* r2 = 01.. */
2219         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2220         orr     r3, r1, r3, lsl #16     /* r3 = 4567 */
2221 #else
2222         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2223         mov     r3, r3, lsr #16         /* r3 = ..54 */
2224         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
2225 #endif
2226         str     r2, [r0]
2227         str     r3, [r0, #0x04]
2228         RET
2229         LMEMCPY_8_PAD
2230
2231 /*
2232  * 0011: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 3)
      * The word loads at src + 1 and src + 5 are word aligned; 'x' marks
      * the byte past the end of the 8-byte copy.
2233  */
2234         ldrb    r3, [r1]                /* r3 = ...0 */
2235         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2236         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
2237 #ifdef __ARMEB__
2238         mov     r3, r3, lsl #24         /* r3 = 0... */
2239         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
2240         mov     r2, r2, lsl #24         /* r2 = 4... */
2241         orr     r2, r2, r1, lsr #8      /* r2 = 4567 */
2242 #else
2243         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2244         mov     r2, r2, lsr #24         /* r2 = ...4 */
2245         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
2246 #endif
2247         str     r3, [r0]                /* store bytes 0-3 */
2248         str     r2, [r0, #0x04]         /* store bytes 4-7 */
2249         RET
2250         LMEMCPY_8_PAD
2251
2252 /*
2253  * 0100: dst is 8-bit aligned (dst & 3 == 1), src is 32-bit aligned
      * Stored as byte / halfword / word / byte (dst + 1 is halfword aligned,
      * dst + 3 is word aligned).
2254  */
2255         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
2256         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
2257 #ifdef __ARMEB__
2258         mov     r1, r3, lsr #24         /* r1 = ...0 */
2259         strb    r1, [r0]                /* store byte 0 */
2260         mov     r1, r3, lsr #8          /* r1 = .012 */
2261         strb    r2, [r0, #0x07]         /* low byte of r2 is byte 7 */
2262         mov     r3, r3, lsl #24         /* r3 = 3... */
2263         orr     r3, r3, r2, lsr #8      /* r3 = 3456 */
2264 #else
2265         strb    r3, [r0]                /* low byte of r3 is byte 0 */
2266         mov     r1, r2, lsr #24         /* r1 = ...7 */
2267         strb    r1, [r0, #0x07]         /* store byte 7 */
2268         mov     r1, r3, lsr #8          /* r1 = .321 */
2269         mov     r3, r3, lsr #24         /* r3 = ...3 */
2270         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
2271 #endif
2272         strh    r1, [r0, #0x01]         /* low halfword holds bytes 1-2 */
2273         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2274         RET
2275         LMEMCPY_8_PAD
2276
2277 /*
2278  * 0101: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 1)
      * Identical misalignment: copy byte / halfword / word / byte with no
      * shifting.  All loads are issued before the first store.
2279  */
2280         ldrb    r2, [r1]                /* byte 0 */
2281         ldrh    r3, [r1, #0x01]         /* bytes 1-2 */
2282         ldr     ip, [r1, #0x03]         /* bytes 3-6 */
2283         ldrb    r1, [r1, #0x07]         /* byte 7; r1 (src) is overwritten by its last load */
2284         strb    r2, [r0]
2285         strh    r3, [r0, #0x01]
2286         str     ip, [r0, #0x03]
2287         strb    r1, [r0, #0x07]
2288         RET
2289         LMEMCPY_8_PAD
2290
2291 /*
2292  * 0110: dst is 8-bit aligned (dst & 3 == 1), src is 16-bit aligned (src & 3 == 2)
      * Loaded as halfword / word / halfword, stored as byte / halfword /
      * word / byte.
2293  */
2294         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2295         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2296         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2297 #ifdef __ARMEB__
2298         mov     ip, r2, lsr #8          /* ip = ...0 */
2299         strb    ip, [r0]                /* store byte 0 */
2300         mov     ip, r2, lsl #8          /* ip = .01. */
2301         orr     ip, ip, r3, lsr #24     /* ip = .012 */
2302         strb    r1, [r0, #0x07]         /* low byte of r1 is byte 7 */
2303         mov     r3, r3, lsl #8          /* r3 = 345. */
2304         orr     r3, r3, r1, lsr #8      /* r3 = 3456 */
2305 #else
2306         strb    r2, [r0]                /* 0 */
2307         mov     ip, r1, lsr #8          /* ip = ...7 */
2308         strb    ip, [r0, #0x07]         /* 7 */
2309         mov     ip, r2, lsr #8          /* ip = ...1 */
2310         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2311         mov     r3, r3, lsr #8          /* r3 = .543 */
2312         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
2313 #endif
2314         strh    ip, [r0, #0x01]         /* low halfword holds bytes 1-2 */
2315         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2316         RET
2317         LMEMCPY_8_PAD
2318
2319 /*
2320  * 0111: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 3)
      * Loaded as byte / word / halfword / byte, stored as byte / halfword /
      * word / byte.
2321  */
2322         ldrb    r3, [r1]                /* r3 = ...0 */
2323         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2324         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
2325         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2326         strb    r3, [r0]                /* store byte 0 */
2327         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
2328 #ifdef __ARMEB__
2329         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2330         orr     r2, r2, ip, lsl #16     /* r2 = 3456 */
2331 #else
2332         strh    ip, [r0, #0x01]         /* low halfword of ip holds bytes 1-2 */
2333         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
2334 #endif
2335         str     r2, [r0, #0x03]         /* store bytes 3-6 */
2336         strb    r1, [r0, #0x07]         /* store byte 7 */
2337         RET
2338         LMEMCPY_8_PAD
2339
2340 /*
2341  * 1000: dst is 16-bit aligned (dst & 3 == 2), src is 32-bit aligned
      * Stored as halfword / word / halfword (dst + 2 is word aligned).
2342  */
2343         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2344         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2345         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2346 #ifdef __ARMEB__
2347         strh    r1, [r0]                /* store bytes 0-1 */
2348         mov     r1, r3, lsr #16         /* r1 = ..45 */
2349         orr     r2, r1 ,r2, lsl #16     /* r2 = 2345 */
2350 #else
2351         strh    r2, [r0]                /* low halfword of r2 holds bytes 0-1 */
2352         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
2353         mov     r3, r3, lsr #16         /* r3 = ..76 */
2354 #endif
2355         str     r2, [r0, #0x02]         /* store bytes 2-5 */
2356         strh    r3, [r0, #0x06]         /* low halfword holds bytes 6-7 */
2357         RET
2358         LMEMCPY_8_PAD
2359
2360 /*
2361  * 1001: dst is 16-bit aligned (dst & 3 == 2), src is 8-bit aligned (src & 3 == 1)
      * The two word loads are word aligned (src - 1 and src + 3); 'x' marks
      * the byte before the start of the 8-byte copy.
2362  */
2363         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2364         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2365         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
2366         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2367         strh    r1, [r0]                /* low halfword holds bytes 0-1 */
2368 #ifdef __ARMEB__
2369         mov     r1, r2, lsl #24         /* r1 = 2... */
2370         orr     r1, r1, r3, lsr #8      /* r1 = 2345 */
2371         orr     r3, ip, r3, lsl #8      /* r3 = 4567 */
2372 #else
2373         mov     r1, r2, lsr #24         /* r1 = ...2 */
2374         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
2375         mov     r3, r3, lsr #24         /* r3 = ...6 */
2376         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
2377 #endif
2378         str     r1, [r0, #0x02]         /* store bytes 2-5 */
2379         strh    r3, [r0, #0x06]         /* low halfword holds bytes 6-7 */
2380         RET
2381         LMEMCPY_8_PAD
2382
2383 /*
2384  * 1010: dst is 16-bit aligned (dst & 3 == 2), src is 16-bit aligned (src & 3 == 2)
      * Identical misalignment: copy halfword / word / halfword with no
      * shifting.
2385  */
2386         ldrh    r2, [r1]                /* bytes 0-1 */
2387         ldr     ip, [r1, #0x02]         /* bytes 2-5 */
2388         ldrh    r3, [r1, #0x06]         /* bytes 6-7 */
2389         strh    r2, [r0]
2390         str     ip, [r0, #0x02]
2391         strh    r3, [r0, #0x06]
2392         RET
2393         LMEMCPY_8_PAD
2394
2395 /*
2396  * 1011: dst is 16-bit aligned (dst & 3 == 2), src is 8-bit aligned (src & 3 == 3)
      * The word loads at src + 5 and src + 1 are word aligned; 'x' marks
      * the byte past the end of the 8-byte copy.
2397  */
2398         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 */
2399         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2400         ldrb    ip, [r1]                /* ip = ...0 */
2401         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
2402         strh    r1, [r0, #0x06]         /* low halfword holds bytes 6-7 */
2403 #ifdef __ARMEB__
2404         mov     r3, r3, lsr #24         /* r3 = ...5 */
2405         orr     r3, r3, r2, lsl #8      /* r3 = 2345 */
2406         mov     r2, r2, lsr #24         /* r2 = ...1 */
2407         orr     r2, r2, ip, lsl #8      /* r2 = ..01 */
2408 #else
2409         mov     r3, r3, lsl #24         /* r3 = 5... */
2410         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
2411         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
2412 #endif
2413         str     r3, [r0, #0x02]         /* store bytes 2-5 */
2414         strh    r2, [r0]                /* low halfword holds bytes 0-1 */
2415         RET
2416         LMEMCPY_8_PAD
2417
2418 /*
2419  * 1100: dst is 8-bit aligned, src is 32-bit aligned
2420  */
2421         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2422         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2423         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
2424         strh    r1, [r0, #0x05]
2425 #ifdef __ARMEB__
2426         strb    r3, [r0, #0x07]
2427         mov     r1, r2, lsr #24         /* r1 = ...0 */
2428         strb    r1, [r0]
2429         mov     r2, r2, lsl #8          /* r2 = 123. */
2430         orr     r2, r2, r3, lsr #24     /* r2 = 1234 */
2431         str     r2, [r0, #0x01]
2432 #else
2433         strb    r2, [r0]
2434         mov     r1, r3, lsr #24         /* r1 = ...7 */
2435         strb    r1, [r0, #0x07]
2436         mov     r2, r2, lsr #8          /* r2 = .321 */
2437         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
2438         str     r2, [r0, #0x01]
2439 #endif
2440         RET
2441         LMEMCPY_8_PAD
2442
2443 /*
2444  * 1101: dst is 8-bit aligned, src is 8-bit aligned
2445  */
2446         ldrb    r3, [r1]                /* r3 = ...0 */
2447         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
2448         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2449         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2450         strb    r3, [r0]
2451         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
2452 #ifdef __ARMEB__
2453         strh    ip, [r0, #0x05]
2454         orr     r2, r3, r2, lsl #16     /* r2 = 1234 */
2455 #else
2456         strh    r3, [r0, #0x05]
2457         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
2458 #endif
2459         str     r2, [r0, #0x01]
2460         strb    r1, [r0, #0x07]
2461         RET
2462         LMEMCPY_8_PAD
2463
2464 /*
2465  * 1110: dst is 8-bit aligned (dst & 3 == 3), src is 16-bit aligned (src & 3 == 2)
      * Loaded as halfword / word / halfword, stored as byte / word /
      * halfword / byte.
2466  */
2467         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2468         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2469         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2470 #ifdef __ARMEB__
2471         mov     ip, r2, lsr #8          /* ip = ...0 */
2472         strb    ip, [r0]                /* store byte 0 */
2473         mov     ip, r2, lsl #24         /* ip = 1... */
2474         orr     ip, ip, r3, lsr #8      /* ip = 1234 */
2475         strb    r1, [r0, #0x07]         /* low byte of r1 is byte 7 */
2476         mov     r1, r1, lsr #8          /* r1 = ...6 */
2477         orr     r1, r1, r3, lsl #8      /* r1 = 3456 */
2478 #else
2479         strb    r2, [r0]                /* low byte of r2 is byte 0 */
2480         mov     ip, r2, lsr #8          /* ip = ...1 */
2481         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2482         mov     r2, r1, lsr #8          /* r2 = ...7 */
2483         strb    r2, [r0, #0x07]         /* store byte 7 */
2484         mov     r1, r1, lsl #8          /* r1 = .76. */
2485         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
2486 #endif
2487         str     ip, [r0, #0x01]         /* store bytes 1-4 */
2488         strh    r1, [r0, #0x05]         /* low halfword holds bytes 5-6 */
2489         RET
2490         LMEMCPY_8_PAD
2491
2492 /*
2493  * 1111: dst is 8-bit aligned (dst & 3 == 3), src is 8-bit aligned (src & 3 == 3)
      * Identical misalignment: copy byte / word / halfword / byte with no
      * shifting.  All loads are issued before the first store.
2494  */
2495         ldrb    r2, [r1]                /* byte 0 */
2496         ldr     ip, [r1, #0x01]         /* bytes 1-4 */
2497         ldrh    r3, [r1, #0x05]         /* bytes 5-6 */
2498         ldrb    r1, [r1, #0x07]         /* byte 7; r1 (src) is overwritten by its last load */
2499         strb    r2, [r0]
2500         str     ip, [r0, #0x01]
2501         strh    r3, [r0, #0x05]
2502         strb    r1, [r0, #0x07]
2503         RET
2504         LMEMCPY_8_PAD
2505
2506 /******************************************************************************
2507  * Special case for 12 byte copies
      *
      * r0 = dst, r1 = src.  A 4-bit index ((dst & 3) << 2) | (src & 3) selects
      * one of 16 hand-written copy sequences.  Every sequence is followed by
      * LMEMCPY_C_PAD, so sequence N starts at .Lmemcpy_c + N * 128 and must
      * fit in a single 128-byte slot.  Slot 0 holds the dispatch code itself
      * plus the fully aligned (0000) copy, which is reached by falling through.
2508  */
2509 #define LMEMCPY_C_LOG2  7       /* 128 bytes */
2510 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2
2511         LMEMCPY_C_PAD
2512 .Lmemcpy_c:
2513         and     r2, r1, #0x03           /* r2 = src & 3 */
2514         orr     r2, r2, r0, lsl #2      /* merge in dst << 2 */
2515         ands    r2, r2, #0x0f           /* r2 = slot index; Z set when both are word aligned */
2516         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_c: pc reads as this insn + 8 = label + 0x14 */
2517         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* index != 0: jump to its slot */
2518
2519 /*
2520  * 0000: dst is 32-bit aligned, src is 32-bit aligned
      * Reached by fall-through from the dispatch code above.
2521  */
2522         ldr     r2, [r1]                /* bytes 0-3 */
2523         ldr     r3, [r1, #0x04]         /* bytes 4-7 */
2524         ldr     r1, [r1, #0x08]         /* bytes 8-B; r1 (src) is overwritten by its last load */
2525         str     r2, [r0]
2526         str     r3, [r0, #0x04]
2527         str     r1, [r0, #0x08]
2528         RET
2529         LMEMCPY_C_PAD
2530
2531 /*
2532  * 0001: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 1)
      * Register pictures list source byte indices (0-9, A, B), most
      * significant byte first; '.' is a zero byte and 'x' a byte outside the
      * 12-byte copy.  The three word loads are word aligned (src + 7,
      * src + 3, src - 1); r1 (src) is overwritten by its last load.
2533  */
2534         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
2535         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2536         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2537         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2538 #ifdef __ARMEB__
2539         orr     r2, r2, ip, lsl #8      /* r2 = 89AB */
2540         str     r2, [r0, #0x08]         /* store bytes 8-B */
2541         mov     r2, ip, lsr #24         /* r2 = ...7 */
2542         orr     r2, r2, r3, lsl #8      /* r2 = 4567 */
2543         mov     r1, r1, lsl #8          /* r1 = 012. */
2544         orr     r1, r1, r3, lsr #24     /* r1 = 0123 */
2545 #else
2546         mov     r2, r2, lsl #24         /* r2 = B... */
2547         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
2548         str     r2, [r0, #0x08]         /* store bytes 8-B */
2549         mov     r2, ip, lsl #24         /* r2 = 7... */
2550         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
2551         mov     r1, r1, lsr #8          /* r1 = .210 */
2552         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
2553 #endif
2554         str     r2, [r0, #0x04]         /* store bytes 4-7 */
2555         str     r1, [r0]                /* store bytes 0-3 */
2556         RET
2557         LMEMCPY_C_PAD
2558
2559 /*
2560  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2561  */
2562         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2563         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2564         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2565         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2566 #ifdef __ARMEB__
2567         mov     r2, r2, lsl #16         /* r2 = 01.. */
2568         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2569         str     r2, [r0]
2570         mov     r3, r3, lsl #16         /* r3 = 45.. */
2571         orr     r3, r3, ip, lsr #16     /* r3 = 4567 */
2572         orr     r1, r1, ip, lsl #16     /* r1 = 89AB */
2573 #else
2574         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2575         str     r2, [r0]
2576         mov     r3, r3, lsr #16         /* r3 = ..54 */
2577         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
2578         mov     r1, r1, lsl #16         /* r1 = BA.. */
2579         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
2580 #endif
2581         str     r3, [r0, #0x04]
2582         str     r1, [r0, #0x08]
2583         RET
2584         LMEMCPY_C_PAD
2585
2586 /*
2587  * 0011: dst is 32-bit aligned, src is 8-bit aligned
2588  */
2589         ldrb    r2, [r1]                /* r2 = ...0 */
2590         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2591         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2592         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2593 #ifdef __ARMEB__
2594         mov     r2, r2, lsl #24         /* r2 = 0... */
2595         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
2596         str     r2, [r0]
2597         mov     r3, r3, lsl #24         /* r3 = 4... */
2598         orr     r3, r3, ip, lsr #8      /* r3 = 4567 */
2599         mov     r1, r1, lsr #8          /* r1 = .9AB */
2600         orr     r1, r1, ip, lsl #24     /* r1 = 89AB */
2601 #else
2602         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
2603         str     r2, [r0]
2604         mov     r3, r3, lsr #24         /* r3 = ...4 */
2605         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
2606         mov     r1, r1, lsl #8          /* r1 = BA9. */
2607         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
2608 #endif
2609         str     r3, [r0, #0x04]
2610         str     r1, [r0, #0x08]
2611         RET
2612         LMEMCPY_C_PAD
2613
2614 /*
2615  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2616  */
2617         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2618         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2619         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
2620         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
2621         strh    r1, [r0, #0x01]
2622 #ifdef __ARMEB__
2623         mov     r1, r2, lsr #24         /* r1 = ...0 */
2624         strb    r1, [r0]
2625         mov     r1, r2, lsl #24         /* r1 = 3... */
2626         orr     r2, r1, r3, lsr #8      /* r2 = 3456 */
2627         mov     r1, r3, lsl #24         /* r1 = 7... */
2628         orr     r1, r1, ip, lsr #8      /* r1 = 789A */
2629 #else
2630         strb    r2, [r0]
2631         mov     r1, r2, lsr #24         /* r1 = ...3 */
2632         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
2633         mov     r1, r3, lsr #24         /* r1 = ...7 */
2634         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
2635         mov     ip, ip, lsr #24         /* ip = ...B */
2636 #endif
2637         str     r2, [r0, #0x03]
2638         str     r1, [r0, #0x07]
2639         strb    ip, [r0, #0x0b]
2640         RET
2641         LMEMCPY_C_PAD
2642
2643 /*
2644  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2645  */
2646         ldrb    r2, [r1]
2647         ldrh    r3, [r1, #0x01]
2648         ldr     ip, [r1, #0x03]
2649         strb    r2, [r0]
2650         ldr     r2, [r1, #0x07]
2651         ldrb    r1, [r1, #0x0b]
2652         strh    r3, [r0, #0x01]
2653         str     ip, [r0, #0x03]
2654         str     r2, [r0, #0x07]
2655         strb    r1, [r0, #0x0b]
2656         RET
2657         LMEMCPY_C_PAD
2658
2659 /*
2660  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2661  */
2662         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2663         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2664         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2665         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2666 #ifdef __ARMEB__
2667         mov     r2, r2, ror #8          /* r2 = 1..0 */
2668         strb    r2, [r0]
2669         mov     r2, r2, lsr #16         /* r2 = ..1. */
2670         orr     r2, r2, r3, lsr #24     /* r2 = ..12 */
2671         strh    r2, [r0, #0x01]
2672         mov     r2, r3, lsl #8          /* r2 = 345. */
2673         orr     r3, r2, ip, lsr #24     /* r3 = 3456 */
2674         mov     r2, ip, lsl #8          /* r2 = 789. */
2675         orr     r2, r2, r1, lsr #8      /* r2 = 789A */
2676 #else
2677         strb    r2, [r0]
2678         mov     r2, r2, lsr #8          /* r2 = ...1 */
2679         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2680         strh    r2, [r0, #0x01]
2681         mov     r2, r3, lsr #8          /* r2 = .543 */
2682         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
2683         mov     r2, ip, lsr #8          /* r2 = .987 */
2684         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
2685         mov     r1, r1, lsr #8          /* r1 = ...B */
2686 #endif
2687         str     r3, [r0, #0x03]
2688         str     r2, [r0, #0x07]
2689         strb    r1, [r0, #0x0b]
2690         RET
2691         LMEMCPY_C_PAD
2692
2693 /*
2694  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2695  */
2696         ldrb    r2, [r1]
2697         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2698         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2699         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2700         strb    r2, [r0]
2701 #ifdef __ARMEB__
2702         mov     r2, r3, lsr #16         /* r2 = ..12 */
2703         strh    r2, [r0, #0x01]
2704         mov     r3, r3, lsl #16         /* r3 = 34.. */
2705         orr     r3, r3, ip, lsr #16     /* r3 = 3456 */
2706         mov     ip, ip, lsl #16         /* ip = 78.. */
2707         orr     ip, ip, r1, lsr #16     /* ip = 789A */
2708         mov     r1, r1, lsr #8          /* r1 = .9AB */
2709 #else
2710         strh    r3, [r0, #0x01]
2711         mov     r3, r3, lsr #16         /* r3 = ..43 */
2712         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
2713         mov     ip, ip, lsr #16         /* ip = ..87 */
2714         orr     ip, ip, r1, lsl #16     /* ip = A987 */
2715         mov     r1, r1, lsr #16         /* r1 = ..xB */
2716 #endif
2717         str     r3, [r0, #0x03]
2718         str     ip, [r0, #0x07]
2719         strb    r1, [r0, #0x0b]
2720         RET
2721         LMEMCPY_C_PAD
2722
2723 /*
2724  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2725  */
2726         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
2727         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2728         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
2729         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2730 #ifdef __ARMEB__
2731         strh    r1, [r0]
2732         mov     r1, ip, lsl #16         /* r1 = 23.. */
2733         orr     r1, r1, r3, lsr #16     /* r1 = 2345 */
2734         mov     r3, r3, lsl #16         /* r3 = 67.. */
2735         orr     r3, r3, r2, lsr #16     /* r3 = 6789 */
2736 #else
2737         strh    ip, [r0]
2738         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
2739         mov     r3, r3, lsr #16         /* r3 = ..76 */
2740         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
2741         mov     r2, r2, lsr #16         /* r2 = ..BA */
2742 #endif
2743         str     r1, [r0, #0x02]
2744         str     r3, [r0, #0x06]
2745         strh    r2, [r0, #0x0a]
2746         RET
2747         LMEMCPY_C_PAD
2748
2749 /*
2750  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2751  */
2752         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2753         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2754         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
2755         strh    ip, [r0]
2756         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2757         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2758 #ifdef __ARMEB__
2759         mov     r2, r2, lsl #24         /* r2 = 2... */
2760         orr     r2, r2, r3, lsr #8      /* r2 = 2345 */
2761         mov     r3, r3, lsl #24         /* r3 = 6... */
2762         orr     r3, r3, ip, lsr #8      /* r3 = 6789 */
2763         orr     r1, r1, ip, lsl #8      /* r1 = 89AB */
2764 #else
2765         mov     r2, r2, lsr #24         /* r2 = ...2 */
2766         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
2767         mov     r3, r3, lsr #24         /* r3 = ...6 */
2768         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
2769         mov     r1, r1, lsl #8          /* r1 = ..B. */
2770         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
2771 #endif
2772         str     r2, [r0, #0x02]
2773         str     r3, [r0, #0x06]
2774         strh    r1, [r0, #0x0a]
2775         RET
2776         LMEMCPY_C_PAD
2777
2778 /*
2779  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2780  */
2781         ldrh    r2, [r1]
2782         ldr     r3, [r1, #0x02]
2783         ldr     ip, [r1, #0x06]
2784         ldrh    r1, [r1, #0x0a]
2785         strh    r2, [r0]
2786         str     r3, [r0, #0x02]
2787         str     ip, [r0, #0x06]
2788         strh    r1, [r0, #0x0a]
2789         RET
2790         LMEMCPY_C_PAD
2791
2792 /*
2793  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2794  */
2795         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 */
2796         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
2797         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
2798         strh    ip, [r0, #0x0a]
2799         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2800         ldrb    r1, [r1]                /* r1 = ...0 */
2801 #ifdef __ARMEB__
2802         mov     r2, r2, lsr #24         /* r2 = ...9 */
2803         orr     r2, r2, r3, lsl #8      /* r2 = 6789 */
2804         mov     r3, r3, lsr #24         /* r3 = ...5 */
2805         orr     r3, r3, ip, lsl #8      /* r3 = 2345 */
2806         mov     r1, r1, lsl #8          /* r1 = ..0. */
2807         orr     r1, r1, ip, lsr #24     /* r1 = ..01 */
2808 #else
2809         mov     r2, r2, lsl #24         /* r2 = 9... */
2810         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
2811         mov     r3, r3, lsl #24         /* r3 = 5... */
2812         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
2813         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
2814 #endif
2815         str     r2, [r0, #0x06]
2816         str     r3, [r0, #0x02]
2817         strh    r1, [r0]
2818         RET
2819         LMEMCPY_C_PAD
2820
2821 /*
2822  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2823  */
2824         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2825         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2826         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2827 #ifdef __ARMEB__
2828         mov     r3, r2, lsr #24         /* r3 = ...0 */
2829         strb    r3, [r0]
2830         mov     r2, r2, lsl #8          /* r2 = 123. */
2831         orr     r2, r2, ip, lsr #24     /* r2 = 1234 */
2832         str     r2, [r0, #0x01]
2833         mov     r2, ip, lsl #8          /* r2 = 567. */
2834         orr     r2, r2, r1, lsr #24     /* r2 = 5678 */
2835         str     r2, [r0, #0x05]
2836         mov     r2, r1, lsr #8          /* r2 = .89A */
2837         strh    r2, [r0, #0x09]
2838         strb    r1, [r0, #0x0b]
2839 #else
2840         strb    r2, [r0]
2841         mov     r3, r2, lsr #8          /* r3 = .321 */
2842         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2843         str     r3, [r0, #0x01]
2844         mov     r3, ip, lsr #8          /* r3 = .765 */
2845         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2846         str     r3, [r0, #0x05]
2847         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2848         strh    r1, [r0, #0x09]
2849         mov     r1, r1, lsr #16         /* r1 = ...B */
2850         strb    r1, [r0, #0x0b]
2851 #endif
2852         RET
2853         LMEMCPY_C_PAD
2854
2855 /*
2856  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2857  */
2858         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2859         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2860         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2861         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2862         strb    r2, [r0, #0x0b]
2863 #ifdef __ARMEB__
2864         strh    r3, [r0, #0x09]
2865         mov     r3, r3, lsr #16         /* r3 = ..78 */
2866         orr     r3, r3, ip, lsl #16     /* r3 = 5678 */
2867         mov     ip, ip, lsr #16         /* ip = ..34 */
2868         orr     ip, ip, r1, lsl #16     /* ip = 1234 */
2869         mov     r1, r1, lsr #16         /* r1 = ..x0 */
2870 #else
2871         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2872         strh    r2, [r0, #0x09]
2873         mov     r3, r3, lsl #16         /* r3 = 87.. */
2874         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2875         mov     ip, ip, lsl #16         /* ip = 43.. */
2876         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2877         mov     r1, r1, lsr #8          /* r1 = .210 */
2878 #endif
2879         str     r3, [r0, #0x05]
2880         str     ip, [r0, #0x01]
2881         strb    r1, [r0]
2882         RET
2883         LMEMCPY_C_PAD
2884
2885 /*
2886  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2887  */
2888 #ifdef __ARMEB__
2889         ldrh    r2, [r1, #0x0a]         /* r2 = ..AB */
2890         ldr     ip, [r1, #0x06]         /* ip = 6789 */
2891         ldr     r3, [r1, #0x02]         /* r3 = 2345 */
2892         ldrh    r1, [r1]                /* r1 = ..01 */
2893         strb    r2, [r0, #0x0b]
2894         mov     r2, r2, lsr #8          /* r2 = ...A */
2895         orr     r2, r2, ip, lsl #8      /* r2 = 789A */
2896         mov     ip, ip, lsr #8          /* ip = .678 */
2897         orr     ip, ip, r3, lsl #24     /* ip = 5678 */
2898         mov     r3, r3, lsr #8          /* r3 = .234 */
2899         orr     r3, r3, r1, lsl #24     /* r3 = 1234 */
2900         mov     r1, r1, lsr #8          /* r1 = ...0 */
2901         strb    r1, [r0]
2902         str     r3, [r0, #0x01]
2903         str     ip, [r0, #0x05]
2904         strh    r2, [r0, #0x09]
2905 #else
2906         ldrh    r2, [r1]                /* r2 = ..10 */
2907         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2908         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2909         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2910         strb    r2, [r0]
2911         mov     r2, r2, lsr #8          /* r2 = ...1 */
2912         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2913         mov     r3, r3, lsr #24         /* r3 = ...5 */
2914         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2915         mov     ip, ip, lsr #24         /* ip = ...9 */
2916         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2917         mov     r1, r1, lsr #8          /* r1 = ...B */
2918         str     r2, [r0, #0x01]
2919         str     r3, [r0, #0x05]
2920         strh    ip, [r0, #0x09]
2921         strb    r1, [r0, #0x0b]
2922 #endif
2923         RET
2924         LMEMCPY_C_PAD
2925
2926 /*
2927  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2928  */
2929         ldrb    r2, [r1]
2930         ldr     r3, [r1, #0x01]
2931         ldr     ip, [r1, #0x05]
2932         strb    r2, [r0]
2933         ldrh    r2, [r1, #0x09]
2934         ldrb    r1, [r1, #0x0b]
2935         str     r3, [r0, #0x01]
2936         str     ip, [r0, #0x05]
2937         strh    r2, [r0, #0x09]
2938         strb    r1, [r0, #0x0b]
2939         RET
2940 END(memcpy)
2941 #endif /* _ARM_ARCH_5E */
2942
2943 #ifdef GPROF
2944 /* GPROF-only stubs: each ENTRY/END pair below wraps a single nop. */
2945 ENTRY(user)             /* NOTE(review): no RET in any of these stubs, */
2946         nop             /* so they look like address markers for the   */
2947 END(user)               /* profiler, not callable routines -- confirm. */
2948 ENTRY(btrap)            /* presumably "begin trap" -- TODO confirm */
2949         nop
2950 END(btrap)
2951 ENTRY(etrap)            /* presumably "end trap" -- TODO confirm */
2952         nop
2953 END(etrap)
2954 ENTRY(bintr)            /* presumably "begin interrupt" -- TODO confirm */
2955         nop
2956 END(bintr)
2957 ENTRY(eintr)            /* presumably "end interrupt" -- TODO confirm */
2958         nop
2959 END(eintr)
2960 #endif /* GPROF */