sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  *
  76  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  77  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  78  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  79  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  80  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  81  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  82  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  83  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  84  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  85  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  86  * POSSIBILITY OF SUCH DAMAGE.
  87  */
  88
  89 #include <machine/asm.h>
  90 __FBSDID("$FreeBSD$");
  91
  92 #include "assym.s"
  93
  94         .syntax unified
  95
   96 .L_arm_memcpy:                  /* Literal word: address of the _arm_memcpy variable */
   97         .word   _C_LABEL(_arm_memcpy)
   98 .L_arm_bzero:                   /* Literal word: address of _arm_bzero; bzero loads the pointer stored there and calls it when non-zero */
   99         .word   _C_LABEL(_arm_bzero)
  100 .L_min_memcpy_size:             /* Literal word: address of the _min_memcpy_size variable */
  101         .word   _C_LABEL(_min_memcpy_size)
  102 .L_min_bzero_size:              /* Literal word: address of _min_bzero_size; bzero skips the _arm_bzero call for lengths below that value */
  103         .word   _C_LABEL(_min_bzero_size)
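  104 /*
  105  * memset: Sets a block of memory to the specified value.
  106  * bzero:  Sets a block of memory to zero; it shares memset's fill code.
  107  *
  108  * On entry:
  109  *   memset: r0 - dest address, r1 - byte to write, r2 - number of bytes
  110  *   bzero:  r0 - dest address, r1 - number of bytes to write
  111  *
  112  * On exit:
  113  *   r0 - dest address
  114  */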
 115 /* LINTSTUB: Func: void bzero(void *, size_t) */
 116 ENTRY(bzero)
 117         ldr     r3, .L_arm_bzero
 118         ldr     r3, [r3]
 119         cmp     r3, #0
 120         beq     .Lnormal0
 121         ldr     r2, .L_min_bzero_size
 122         ldr     r2, [r2]
 123         cmp     r1, r2
 124         blt     .Lnormal0
 125         stmfd   sp!, {r0, r1, lr}
 126         mov     r2, #0
 127         mov     lr, pc
 128         mov     pc, r3
 129         cmp     r0, #0
 130         ldmfd   sp!, {r0, r1, lr}
 131         RETeq
 132 .Lnormal0:
 133         mov     r3, #0x00
 134         b       do_memset
 135 END(bzero)
 136 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 137 ENTRY(memset)
 138         and     r3, r1, #0xff           /* We deal with bytes */
 139         mov     r1, r2
 140 do_memset:
 141         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 142         mov     ip, r0
 143         blt     .Lmemset_lessthanfour
 144
 145         /* Ok first we will word align the address */
 146         ands    r2, ip, #0x03           /* Get the bottom two bits */
 147         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 148
 149         /* We are now word aligned */
 150 .Lmemset_wordaligned:
 151         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 152 #ifdef _ARM_ARCH_5E
 153         tst     ip, #0x04               /* Quad-align for armv5e */
 154 #else
 155         cmp     r1, #0x10
 156 #endif
 157         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 158 #ifdef _ARM_ARCH_5E
 159         subne   r1, r1, #0x04           /* Quad-align if necessary */
 160         strne   r3, [ip], #0x04
 161         cmp     r1, #0x10
 162 #endif
 163         blt     .Lmemset_loop4          /* If less than 16 then use words */
 164         mov     r2, r3                  /* Duplicate data */
 165         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 166         blt     .Lmemset_loop32
 167
 168         /* Do 128 bytes at a time */
 169 .Lmemset_loop128:
 170         subs    r1, r1, #0x80
 171 #ifdef _ARM_ARCH_5E
 172         strdge  r2, [ip], #0x08
 173         strdge  r2, [ip], #0x08
 174         strdge  r2, [ip], #0x08
 175         strdge  r2, [ip], #0x08
 176         strdge  r2, [ip], #0x08
 177         strdge  r2, [ip], #0x08
 178         strdge  r2, [ip], #0x08
 179         strdge  r2, [ip], #0x08
 180         strdge  r2, [ip], #0x08
 181         strdge  r2, [ip], #0x08
 182         strdge  r2, [ip], #0x08
 183         strdge  r2, [ip], #0x08
 184         strdge  r2, [ip], #0x08
 185         strdge  r2, [ip], #0x08
 186         strdge  r2, [ip], #0x08
 187         strdge  r2, [ip], #0x08
 188 #else
 189         stmiage ip!, {r2-r3}
 190         stmiage ip!, {r2-r3}
 191         stmiage ip!, {r2-r3}
 192         stmiage ip!, {r2-r3}
 193         stmiage ip!, {r2-r3}
 194         stmiage ip!, {r2-r3}
 195         stmiage ip!, {r2-r3}
 196         stmiage ip!, {r2-r3}
 197         stmiage ip!, {r2-r3}
 198         stmiage ip!, {r2-r3}
 199         stmiage ip!, {r2-r3}
 200         stmiage ip!, {r2-r3}
 201         stmiage ip!, {r2-r3}
 202         stmiage ip!, {r2-r3}
 203         stmiage ip!, {r2-r3}
 204         stmiage ip!, {r2-r3}
 205 #endif
 206         bgt     .Lmemset_loop128
 207         RETeq                   /* Zero length so just exit */
 208
 209         add     r1, r1, #0x80           /* Adjust for extra sub */
 210
 211         /* Do 32 bytes at a time */
 212 .Lmemset_loop32:
 213         subs    r1, r1, #0x20
 214 #ifdef _ARM_ARCH_5E
 215         strdge  r2, [ip], #0x08
 216         strdge  r2, [ip], #0x08
 217         strdge  r2, [ip], #0x08
 218         strdge  r2, [ip], #0x08
 219 #else
 220         stmiage ip!, {r2-r3}
 221         stmiage ip!, {r2-r3}
 222         stmiage ip!, {r2-r3}
 223         stmiage ip!, {r2-r3}
 224 #endif
 225         bgt     .Lmemset_loop32
 226         RETeq                   /* Zero length so just exit */
 227
 228         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 229
 230         /* Deal with 16 bytes or more */
 231 #ifdef _ARM_ARCH_5E
 232         strdge  r2, [ip], #0x08
 233         strdge  r2, [ip], #0x08
 234 #else
 235         stmiage ip!, {r2-r3}
 236         stmiage ip!, {r2-r3}
 237 #endif
 238         RETeq                   /* Zero length so just exit */
 239
 240         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 241
 242         /* We have at least 4 bytes so copy as words */
 243 .Lmemset_loop4:
 244         subs    r1, r1, #0x04
 245         strge   r3, [ip], #0x04
 246         bgt     .Lmemset_loop4
 247         RETeq                   /* Zero length so just exit */
 248
 249 #ifdef _ARM_ARCH_5E
 250         /* Compensate for 64-bit alignment check */
 251         adds    r1, r1, #0x04
 252         RETeq
 253         cmp     r1, #2
 254 #else
 255         cmp     r1, #-2
 256 #endif
 257
 258         strb    r3, [ip], #0x01         /* Set 1 byte */
 259         strbge  r3, [ip], #0x01         /* Set another byte */
 260         strbgt  r3, [ip]                /* and a third */
 261         RET                     /* Exit */
 262
 263 .Lmemset_wordunaligned:
 264         rsb     r2, r2, #0x004
 265         strb    r3, [ip], #0x01         /* Set 1 byte */
 266         cmp     r2, #0x02
 267         strbge  r3, [ip], #0x01         /* Set another byte */
 268         sub     r1, r1, r2
 269         strbgt  r3, [ip], #0x01         /* and a third */
 270         cmp     r1, #0x04               /* More than 4 bytes left? */
 271         bge     .Lmemset_wordaligned    /* Yup */
 272
 273 .Lmemset_lessthanfour:
 274         cmp     r1, #0x00
 275         RETeq                   /* Zero length so exit */
 276         strb    r3, [ip], #0x01         /* Set 1 byte */
 277         cmp     r1, #0x02
 278         strbge  r3, [ip], #0x01         /* Set another byte */
 279         strbgt  r3, [ip]                /* and a third */
 280         RET                     /* Exit */
 281 EEND(memset)
 282 END(bzero)
 283
  284 ENTRY(bcmp)                     /* In: r0 = b1, r1 = b2, r2 = length; Out: r0 = 0 if equal, else difference of the first mismatching bytes */
  285         mov     ip, r0                  /* ip = b1, freeing r0 for the result */
  286         cmp     r2, #0x06
  287         beq     .Lmemcmp_6bytes         /* Dedicated path for length == 6 */
  288         mov     r0, #0x00               /* Default result: equal */
  289
  290         /* Are both addresses aligned the same way? */
  291         cmp     r2, #0x00
  292         eorsne  r3, ip, r1              /* r3 = b1 ^ b2; Z set if the pointers are identical */
  293         RETeq                   /* len == 0, or same addresses! */
  294         tst     r3, #0x03               /* Do the low two address bits differ? */
  295         subne   r2, r2, #0x01           /* The byte loop counts with r2 = bytes left - 1 */
  296         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
  297
  298         /* Word-align the addresses, if necessary */
  299         sub     r3, r1, #0x05
  300         ands    r3, r3, #0x03           /* r3 = (b2 - 5) & 3: 0 = 3 bytes to align, 1 = 2, 2 = 1, 3 = already aligned */
  301         add     r3, r3, r3, lsl #1      /* r3 *= 3 (flags from ands are kept) */
  302         addne   pc, pc, r3, lsl #3      /* Skip r3 stanzas of 6 instructions (24 bytes) each; pc reads as . + 8 */
  303         nop                             /* Filler so that . + 8 above is the first stanza */
  304
  305         /* Compare up to 3 bytes */
  306         ldrb    r0, [ip], #0x01
  307         ldrb    r3, [r1], #0x01
  308         subs    r0, r0, r3
  309         RETne                           /* Mismatch: r0 = byte difference */
  310         subs    r2, r2, #0x01
  311         RETeq                           /* Length exhausted, r0 = 0 */
  312
  313         /* Compare up to 2 bytes */
  314         ldrb    r0, [ip], #0x01
  315         ldrb    r3, [r1], #0x01
  316         subs    r0, r0, r3
  317         RETne
  318         subs    r2, r2, #0x01
  319         RETeq
  320
  321         /* Compare 1 byte */
  322         ldrb    r0, [ip], #0x01
  323         ldrb    r3, [r1], #0x01
  324         subs    r0, r0, r3
  325         RETne
  326         subs    r2, r2, #0x01
  327         RETeq
  328
  329         /* Compare 4 bytes at a time, if possible */
  330         subs    r2, r2, #0x04
  331         bcc     .Lmemcmp_bytewise       /* Fewer than 4 bytes left */
  332 .Lmemcmp_word_aligned:
  333         ldr     r0, [ip], #0x04
  334         ldr     r3, [r1], #0x04
  335         subs    r2, r2, #0x04
  336         cmpcs   r0, r3                  /* Compare only while the count has not gone negative */
  337         beq     .Lmemcmp_word_aligned   /* Words equal and more to go */
  338         sub     r0, r0, r3
  339
  340         /* Correct for extra subtraction, and check if done */
  341         adds    r2, r2, #0x04
  342         cmpeq   r0, #0x00               /* If done, did all bytes match? */
  343         RETeq                   /* Yup. Just return */
  344
  345         /* Re-do the final word byte-wise */
  346         sub     ip, ip, #0x04
  347         sub     r1, r1, #0x04
  348
  349 .Lmemcmp_bytewise:
  350         add     r2, r2, #0x03           /* Convert to the byte loop's count (bytes left - 1) */
  351 .Lmemcmp_bytewise2:
  352         ldrb    r0, [ip], #0x01
  353         ldrb    r3, [r1], #0x01
  354         subs    r2, r2, #0x01
  355         cmpcs   r0, r3                  /* Skipped once the count borrows, which ends the loop */
  356         beq     .Lmemcmp_bytewise2
  357         sub     r0, r0, r3              /* Result = difference of the last byte pair loaded */
  358         RET
  359
  360         /*
  361          * 6 byte compares are very common, thanks to the network stack.
  362          * This code is hand-scheduled to reduce the number of stalls for
  363          * load results. Everything else being equal, this will be ~32%
  364          * faster than a byte-wise memcmp.
  365          */
  366         .align  5
  367 .Lmemcmp_6bytes:
  368         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
  369         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
  370         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
  371         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
  372         ldrbeq  r3, [ip, #0x01]         /* r3 = b1#1 */
  373         RETne                   /* Return if mismatch on #0 */
  374         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
  375         ldrbeq  r3, [r1, #0x02]         /* r3 = b2#2 */
  376         ldrbeq  r0, [ip, #0x02]         /* r0 = b1#2 */
  377         RETne                   /* Return if mismatch on #1 */
  378         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
  379         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
  380         ldrbeq  r3, [ip, #0x03]         /* r3 = b1#3 */
  381         RETne                   /* Return if mismatch on #2 */
  382         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
  383         ldrbeq  r3, [r1, #0x04]         /* r3 = b2#4 */
  384         ldrbeq  r0, [ip, #0x04]         /* r0 = b1#4 */
  385         RETne                   /* Return if mismatch on #3 */
  386         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
  387         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
  388         ldrbeq  r3, [ip, #0x05]         /* r3 = b1#5 */
  389         RETne                   /* Return if mismatch on #4 */
  390         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
  391         RET
  392 END(bcmp)
 393
 394 ENTRY(bcopy)
 395         /* switch the source and destination registers */
 396         eor     r0, r1, r0
 397         eor     r1, r0, r1
 398         eor     r0, r1, r0
 399 EENTRY(memmove)
 400         /* Do the buffers overlap? */
 401         cmp     r0, r1
 402         RETeq           /* Bail now if src/dst are the same */
 403         subcc   r3, r0, r1      /* if (dst > src) r3 = dst - src */
 404         subcs   r3, r1, r0      /* if (src > dsr) r3 = src - dst */
 405         cmp     r3, r2          /* if (r3 < len) we have an overlap */
 406         bcc     PIC_SYM(_C_LABEL(memcpy), PLT)
 407
 408         /* Determine copy direction */
 409         cmp     r1, r0
 410         bcc     .Lmemmove_backwards
 411
 412         moveq   r0, #0                  /* Quick abort for len=0 */
 413         RETeq
 414
 415         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
 416         subs    r2, r2, #4
 417         blt     .Lmemmove_fl4           /* less than 4 bytes */
 418         ands    r12, r0, #3
 419         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
 420         ands    r12, r1, #3
 421         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
 422
 423 .Lmemmove_ft8:
 424         /* We have aligned source and destination */
 425         subs    r2, r2, #8
 426         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
 427         subs    r2, r2, #0x14
 428         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
 429         stmdb   sp!, {r4}               /* borrow r4 */
 430
 431         /* blat 32 bytes at a time */
 432         /* XXX for really big copies perhaps we should use more registers */
 433 .Lmemmove_floop32:
 434         ldmia   r1!, {r3, r4, r12, lr}
 435         stmia   r0!, {r3, r4, r12, lr}
 436         ldmia   r1!, {r3, r4, r12, lr}
 437         stmia   r0!, {r3, r4, r12, lr}
 438         subs    r2, r2, #0x20
 439         bge     .Lmemmove_floop32
 440
 441         cmn     r2, #0x10
 442         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 443         stmiage r0!, {r3, r4, r12, lr}
 444         subge   r2, r2, #0x10
 445         ldmia   sp!, {r4}               /* return r4 */
 446
 447 .Lmemmove_fl32:
 448         adds    r2, r2, #0x14
 449
 450         /* blat 12 bytes at a time */
 451 .Lmemmove_floop12:
 452         ldmiage r1!, {r3, r12, lr}
 453         stmiage r0!, {r3, r12, lr}
 454         subsge  r2, r2, #0x0c
 455         bge     .Lmemmove_floop12
 456
 457 .Lmemmove_fl12:
 458         adds    r2, r2, #8
 459         blt     .Lmemmove_fl4
 460
 461         subs    r2, r2, #4
 462         ldrlt   r3, [r1], #4
 463         strlt   r3, [r0], #4
 464         ldmiage r1!, {r3, r12}
 465         stmiage r0!, {r3, r12}
 466         subge   r2, r2, #4
 467
 468 .Lmemmove_fl4:
 469         /* less than 4 bytes to go */
 470         adds    r2, r2, #4
 471         ldmiaeq sp!, {r0, pc}           /* done */
 472
 473         /* copy the crud byte at a time */
 474         cmp     r2, #2
 475         ldrb    r3, [r1], #1
 476         strb    r3, [r0], #1
 477         ldrbge  r3, [r1], #1
 478         strbge  r3, [r0], #1
 479         ldrbgt  r3, [r1], #1
 480         strbgt  r3, [r0], #1
 481         ldmia   sp!, {r0, pc}
 482
 483         /* erg - unaligned destination */
 484 .Lmemmove_fdestul:
 485         rsb     r12, r12, #4
 486         cmp     r12, #2
 487
 488         /* align destination with byte copies */
 489         ldrb    r3, [r1], #1
 490         strb    r3, [r0], #1
 491         ldrbge  r3, [r1], #1
 492         strbge  r3, [r0], #1
 493         ldrbgt  r3, [r1], #1
 494         strbgt  r3, [r0], #1
 495         subs    r2, r2, r12
 496         blt     .Lmemmove_fl4           /* less the 4 bytes */
 497
 498         ands    r12, r1, #3
 499         beq     .Lmemmove_ft8           /* we have an aligned source */
 500
 501         /* erg - unaligned source */
 502         /* This is where it gets nasty ... */
 503 .Lmemmove_fsrcul:
 504         bic     r1, r1, #3
 505         ldr     lr, [r1], #4
 506         cmp     r12, #2
 507         bgt     .Lmemmove_fsrcul3
 508         beq     .Lmemmove_fsrcul2
 509         cmp     r2, #0x0c
 510         blt     .Lmemmove_fsrcul1loop4
 511         sub     r2, r2, #0x0c
 512         stmdb   sp!, {r4, r5}
 513
 514 .Lmemmove_fsrcul1loop16:
 515 #ifdef __ARMEB__
 516         mov     r3, lr, lsl #8
 517 #else
 518         mov     r3, lr, lsr #8
 519 #endif
 520         ldmia   r1!, {r4, r5, r12, lr}
 521 #ifdef __ARMEB__
 522         orr     r3, r3, r4, lsr #24
 523         mov     r4, r4, lsl #8
 524         orr     r4, r4, r5, lsr #24
 525         mov     r5, r5, lsl #8
 526         orr     r5, r5, r12, lsr #24
 527         mov     r12, r12, lsl #8
 528         orr     r12, r12, lr, lsr #24
 529 #else
 530         orr     r3, r3, r4, lsl #24
 531         mov     r4, r4, lsr #8
 532         orr     r4, r4, r5, lsl #24
 533         mov     r5, r5, lsr #8
 534         orr     r5, r5, r12, lsl #24
 535         mov     r12, r12, lsr #8
 536         orr     r12, r12, lr, lsl #24
 537 #endif
 538         stmia   r0!, {r3-r5, r12}
 539         subs    r2, r2, #0x10
 540         bge     .Lmemmove_fsrcul1loop16
 541         ldmia   sp!, {r4, r5}
 542         adds    r2, r2, #0x0c
 543         blt     .Lmemmove_fsrcul1l4
 544
 545 .Lmemmove_fsrcul1loop4:
 546 #ifdef __ARMEB__
 547         mov     r12, lr, lsl #8
 548 #else
 549         mov     r12, lr, lsr #8
 550 #endif
 551         ldr     lr, [r1], #4
 552 #ifdef __ARMEB__
 553         orr     r12, r12, lr, lsr #24
 554 #else
 555         orr     r12, r12, lr, lsl #24
 556 #endif
 557         str     r12, [r0], #4
 558         subs    r2, r2, #4
 559         bge     .Lmemmove_fsrcul1loop4
 560
 561 .Lmemmove_fsrcul1l4:
 562         sub     r1, r1, #3
 563         b       .Lmemmove_fl4
 564
 565 .Lmemmove_fsrcul2:
 566         cmp     r2, #0x0c
 567         blt     .Lmemmove_fsrcul2loop4
 568         sub     r2, r2, #0x0c
 569         stmdb   sp!, {r4, r5}
 570
 571 .Lmemmove_fsrcul2loop16:
 572 #ifdef __ARMEB__
 573         mov     r3, lr, lsl #16
 574 #else
 575         mov     r3, lr, lsr #16
 576 #endif
 577         ldmia   r1!, {r4, r5, r12, lr}
 578 #ifdef __ARMEB__
 579         orr     r3, r3, r4, lsr #16
 580         mov     r4, r4, lsl #16
 581         orr     r4, r4, r5, lsr #16
 582         mov     r5, r5, lsl #16
 583         orr     r5, r5, r12, lsr #16
 584         mov     r12, r12, lsl #16
 585         orr     r12, r12, lr, lsr #16
 586 #else
 587         orr     r3, r3, r4, lsl #16
 588         mov     r4, r4, lsr #16
 589         orr     r4, r4, r5, lsl #16
 590         mov     r5, r5, lsr #16
 591         orr     r5, r5, r12, lsl #16
 592         mov     r12, r12, lsr #16
 593         orr     r12, r12, lr, lsl #16
 594 #endif
 595         stmia   r0!, {r3-r5, r12}
 596         subs    r2, r2, #0x10
 597         bge     .Lmemmove_fsrcul2loop16
 598         ldmia   sp!, {r4, r5}
 599         adds    r2, r2, #0x0c
 600         blt     .Lmemmove_fsrcul2l4
 601
 602 .Lmemmove_fsrcul2loop4:
 603 #ifdef __ARMEB__
 604         mov     r12, lr, lsl #16
 605 #else
 606         mov     r12, lr, lsr #16
 607 #endif
 608         ldr     lr, [r1], #4
 609 #ifdef __ARMEB__
 610         orr     r12, r12, lr, lsr #16
 611 #else
 612         orr     r12, r12, lr, lsl #16
 613 #endif
 614         str     r12, [r0], #4
 615         subs    r2, r2, #4
 616         bge     .Lmemmove_fsrcul2loop4
 617
 618 .Lmemmove_fsrcul2l4:
 619         sub     r1, r1, #2
 620         b       .Lmemmove_fl4
 621
 622 .Lmemmove_fsrcul3:
 623         cmp     r2, #0x0c
 624         blt     .Lmemmove_fsrcul3loop4
 625         sub     r2, r2, #0x0c
 626         stmdb   sp!, {r4, r5}
 627
 628 .Lmemmove_fsrcul3loop16:
 629 #ifdef __ARMEB__
 630         mov     r3, lr, lsl #24
 631 #else
 632         mov     r3, lr, lsr #24
 633 #endif
 634         ldmia   r1!, {r4, r5, r12, lr}
 635 #ifdef __ARMEB__
 636         orr     r3, r3, r4, lsr #8
 637         mov     r4, r4, lsl #24
 638         orr     r4, r4, r5, lsr #8
 639         mov     r5, r5, lsl #24
 640         orr     r5, r5, r12, lsr #8
 641         mov     r12, r12, lsl #24
 642         orr     r12, r12, lr, lsr #8
 643 #else
 644         orr     r3, r3, r4, lsl #8
 645         mov     r4, r4, lsr #24
 646         orr     r4, r4, r5, lsl #8
 647         mov     r5, r5, lsr #24
 648         orr     r5, r5, r12, lsl #8
 649         mov     r12, r12, lsr #24
 650         orr     r12, r12, lr, lsl #8
 651 #endif
 652         stmia   r0!, {r3-r5, r12}
 653         subs    r2, r2, #0x10
 654         bge     .Lmemmove_fsrcul3loop16
 655         ldmia   sp!, {r4, r5}
 656         adds    r2, r2, #0x0c
 657         blt     .Lmemmove_fsrcul3l4
 658
 659 .Lmemmove_fsrcul3loop4:
 660 #ifdef __ARMEB__
 661         mov     r12, lr, lsl #24
 662 #else
 663         mov     r12, lr, lsr #24
 664 #endif
 665         ldr     lr, [r1], #4
 666 #ifdef __ARMEB__
 667         orr     r12, r12, lr, lsr #8
 668 #else
 669         orr     r12, r12, lr, lsl #8
 670 #endif
 671         str     r12, [r0], #4
 672         subs    r2, r2, #4
 673         bge     .Lmemmove_fsrcul3loop4
 674
 675 .Lmemmove_fsrcul3l4:
 676         sub     r1, r1, #1
 677         b       .Lmemmove_fl4
 678
 679 .Lmemmove_backwards:
 680         add     r1, r1, r2
 681         add     r0, r0, r2
 682         subs    r2, r2, #4
 683         blt     .Lmemmove_bl4           /* less than 4 bytes */
 684         ands    r12, r0, #3
 685         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
 686         ands    r12, r1, #3
 687         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
 688
 689 .Lmemmove_bt8:
 690         /* We have aligned source and destination */
 691         subs    r2, r2, #8
 692         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
 693         stmdb   sp!, {r4, lr}
 694         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
 695         blt     .Lmemmove_bl32
 696
 697         /* blat 32 bytes at a time */
 698         /* XXX for really big copies perhaps we should use more registers */
 699 .Lmemmove_bloop32:
 700         ldmdb   r1!, {r3, r4, r12, lr}
 701         stmdb   r0!, {r3, r4, r12, lr}
 702         ldmdb   r1!, {r3, r4, r12, lr}
 703         stmdb   r0!, {r3, r4, r12, lr}
 704         subs    r2, r2, #0x20
 705         bge     .Lmemmove_bloop32
 706
 707 .Lmemmove_bl32:
 708         cmn     r2, #0x10
 709         ldmdbge r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 710         stmdbge r0!, {r3, r4, r12, lr}
 711         subge   r2, r2, #0x10
 712         adds    r2, r2, #0x14
 713         ldmdbge r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
 714         stmdbge r0!, {r3, r12, lr}
 715         subge   r2, r2, #0x0c
 716         ldmia   sp!, {r4, lr}
 717
 718 .Lmemmove_bl12:
 719         adds    r2, r2, #8
 720         blt     .Lmemmove_bl4
 721         subs    r2, r2, #4
 722         ldrlt   r3, [r1, #-4]!
 723         strlt   r3, [r0, #-4]!
 724         ldmdbge r1!, {r3, r12}
 725         stmdbge r0!, {r3, r12}
 726         subge   r2, r2, #4
 727
 728 .Lmemmove_bl4:
 729         /* less than 4 bytes to go */
 730         adds    r2, r2, #4
 731         RETeq                   /* done */
 732
 733         /* copy the crud byte at a time */
 734         cmp     r2, #2
 735         ldrb    r3, [r1, #-1]!
 736         strb    r3, [r0, #-1]!
 737         ldrbge  r3, [r1, #-1]!
 738         strbge  r3, [r0, #-1]!
 739         ldrbgt  r3, [r1, #-1]!
 740         strbgt  r3, [r0, #-1]!
 741         RET
 742
 743         /* erg - unaligned destination */
 744 .Lmemmove_bdestul:
 745         cmp     r12, #2
 746
 747         /* align destination with byte copies */
 748         ldrb    r3, [r1, #-1]!
 749         strb    r3, [r0, #-1]!
 750         ldrbge  r3, [r1, #-1]!
 751         strbge  r3, [r0, #-1]!
 752         ldrbgt  r3, [r1, #-1]!
 753         strbgt  r3, [r0, #-1]!
 754         subs    r2, r2, r12
 755         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
 756         ands    r12, r1, #3
 757         beq     .Lmemmove_bt8           /* we have an aligned source */
 758
 759         /* erg - unaligned source */
 760         /* This is where it gets nasty ... */
 761 .Lmemmove_bsrcul:
 762         bic     r1, r1, #3
 763         ldr     r3, [r1, #0]
 764         cmp     r12, #2
 765         blt     .Lmemmove_bsrcul1
 766         beq     .Lmemmove_bsrcul2
 767         cmp     r2, #0x0c
 768         blt     .Lmemmove_bsrcul3loop4
 769         sub     r2, r2, #0x0c
 770         stmdb   sp!, {r4, r5, lr}
 771
 772 .Lmemmove_bsrcul3loop16:
 773 #ifdef __ARMEB__
 774         mov     lr, r3, lsr #8
 775 #else
 776         mov     lr, r3, lsl #8
 777 #endif
 778         ldmdb   r1!, {r3-r5, r12}
 779 #ifdef __ARMEB__
 780         orr     lr, lr, r12, lsl #24
 781         mov     r12, r12, lsr #8
 782         orr     r12, r12, r5, lsl #24
 783         mov     r5, r5, lsr #8
 784         orr     r5, r5, r4, lsl #24
 785         mov     r4, r4, lsr #8
 786         orr     r4, r4, r3, lsl #24
 787 #else
 788         orr     lr, lr, r12, lsr #24
 789         mov     r12, r12, lsl #8
 790         orr     r12, r12, r5, lsr #24
 791         mov     r5, r5, lsl #8
 792         orr     r5, r5, r4, lsr #24
 793         mov     r4, r4, lsl #8
 794         orr     r4, r4, r3, lsr #24
 795 #endif
 796         stmdb   r0!, {r4, r5, r12, lr}
 797         subs    r2, r2, #0x10
 798         bge     .Lmemmove_bsrcul3loop16
 799         ldmia   sp!, {r4, r5, lr}
 800         adds    r2, r2, #0x0c
 801         blt     .Lmemmove_bsrcul3l4
 802
 803 .Lmemmove_bsrcul3loop4:
 804 #ifdef __ARMEB__
 805         mov     r12, r3, lsr #8
 806 #else
 807         mov     r12, r3, lsl #8
 808 #endif
 809         ldr     r3, [r1, #-4]!
 810 #ifdef __ARMEB__
 811         orr     r12, r12, r3, lsl #24
 812 #else
 813         orr     r12, r12, r3, lsr #24
 814 #endif
 815         str     r12, [r0, #-4]!
 816         subs    r2, r2, #4
 817         bge     .Lmemmove_bsrcul3loop4
 818
 819 .Lmemmove_bsrcul3l4:
 820         add     r1, r1, #3
 821         b       .Lmemmove_bl4
 822
 823 .Lmemmove_bsrcul2:
 824         cmp     r2, #0x0c
 825         blt     .Lmemmove_bsrcul2loop4
 826         sub     r2, r2, #0x0c
 827         stmdb   sp!, {r4, r5, lr}
 828
 829 .Lmemmove_bsrcul2loop16:
 830 #ifdef __ARMEB__
 831         mov     lr, r3, lsr #16
 832 #else
 833         mov     lr, r3, lsl #16
 834 #endif
 835         ldmdb   r1!, {r3-r5, r12}
 836 #ifdef __ARMEB__
 837         orr     lr, lr, r12, lsl #16
 838         mov     r12, r12, lsr #16
 839         orr     r12, r12, r5, lsl #16
 840         mov     r5, r5, lsr #16
 841         orr     r5, r5, r4, lsl #16
 842         mov     r4, r4, lsr #16
 843         orr     r4, r4, r3, lsl #16
 844 #else
 845         orr     lr, lr, r12, lsr #16
 846         mov     r12, r12, lsl #16
 847         orr     r12, r12, r5, lsr #16
 848         mov     r5, r5, lsl #16
 849         orr     r5, r5, r4, lsr #16
 850         mov     r4, r4, lsl #16
 851         orr     r4, r4, r3, lsr #16
 852 #endif
 853         stmdb   r0!, {r4, r5, r12, lr}
 854         subs    r2, r2, #0x10
 855         bge     .Lmemmove_bsrcul2loop16
 856         ldmia   sp!, {r4, r5, lr}
 857         adds    r2, r2, #0x0c
 858         blt     .Lmemmove_bsrcul2l4
 859
 860 .Lmemmove_bsrcul2loop4:
 861 #ifdef __ARMEB__
 862         mov     r12, r3, lsr #16
 863 #else
 864         mov     r12, r3, lsl #16
 865 #endif
 866         ldr     r3, [r1, #-4]!
 867 #ifdef __ARMEB__
 868         orr     r12, r12, r3, lsl #16
 869 #else
 870         orr     r12, r12, r3, lsr #16
 871 #endif
 872         str     r12, [r0, #-4]!
 873         subs    r2, r2, #4
 874         bge     .Lmemmove_bsrcul2loop4
 875
 876 .Lmemmove_bsrcul2l4:
 877         add     r1, r1, #2
 878         b       .Lmemmove_bl4
 879
 880 .Lmemmove_bsrcul1:
 881         cmp     r2, #0x0c
 882         blt     .Lmemmove_bsrcul1loop4
 883         sub     r2, r2, #0x0c
 884         stmdb   sp!, {r4, r5, lr}
 885
 886 .Lmemmove_bsrcul1loop32:
 887 #ifdef __ARMEB__
 888         mov     lr, r3, lsr #24
 889 #else
 890         mov     lr, r3, lsl #24
 891 #endif
 892         ldmdb   r1!, {r3-r5, r12}
 893 #ifdef __ARMEB__
 894         orr     lr, lr, r12, lsl #8
 895         mov     r12, r12, lsr #24
 896         orr     r12, r12, r5, lsl #8
 897         mov     r5, r5, lsr #24
 898         orr     r5, r5, r4, lsl #8
 899         mov     r4, r4, lsr #24
 900         orr     r4, r4, r3, lsl #8
 901 #else
 902         orr     lr, lr, r12, lsr #8
 903         mov     r12, r12, lsl #24
 904         orr     r12, r12, r5, lsr #8
 905         mov     r5, r5, lsl #24
 906         orr     r5, r5, r4, lsr #8
 907         mov     r4, r4, lsl #24
 908         orr     r4, r4, r3, lsr #8
 909 #endif
 910         stmdb   r0!, {r4, r5, r12, lr}
 911         subs    r2, r2, #0x10
 912         bge     .Lmemmove_bsrcul1loop32
 913         ldmia   sp!, {r4, r5, lr}
 914         adds    r2, r2, #0x0c
 915         blt     .Lmemmove_bsrcul1l4
 916
 917 .Lmemmove_bsrcul1loop4:
 918 #ifdef __ARMEB__
 919         mov     r12, r3, lsr #24
 920 #else
 921         mov     r12, r3, lsl #24
 922 #endif
 923         ldr     r3, [r1, #-4]!
 924 #ifdef __ARMEB__
 925         orr     r12, r12, r3, lsl #8
 926 #else
 927         orr     r12, r12, r3, lsr #8
 928 #endif
 929         str     r12, [r0, #-4]!
 930         subs    r2, r2, #4
 931         bge     .Lmemmove_bsrcul1loop4
 932
 933 .Lmemmove_bsrcul1l4:
 934         add     r1, r1, #1
 935         b       .Lmemmove_bl4
 936 EEND(memmove)
 937 END(bcopy)
 938
 939 #if !defined(_ARM_ARCH_5E)
 940 ENTRY(memcpy)
 941         /*
              * void *memcpy(void *dst, const void *src, size_t len)
              *
              * Generic version, assembled when _ARM_ARCH_5E is not defined.
              *
              *   In:   r0 = dst, r1 = src, r2 = len (bytes)
              *   Out:  r0 = dst as passed in
              *   Uses: r3, r12 and lr as scratch.  r4 and r5 are borrowed by
              *         the bulk loops and restored from the stack before
              *         those loops are left.
              *
              * Throughout the routine r2 holds the number of bytes still to
              * copy minus a stage-dependent bias, so that one subs/bge pair
              * both counts down and tests.  Each stage adds its bias back
              * before handing over to the next one.  All tests on r2 use
              * signed conditions, so lengths of 2GB or more are not handled.
              *
              * Before the software copy an optional hook is tried: when the
              * pointer read through .L_arm_memcpy is non-zero and len is at
              * least the value read through .L_min_memcpy_size, the hook is
              * called with r0-r2 untouched and r3 = 0.  If it returns 0 we
              * return to our caller straight away (presumably the hook did
              * the copy); otherwise the software copy below runs.
              */
 942         /* Do not check arm_memcpy if we're running from flash */
 943 #if defined(FLASHADDR) && defined(PHYSADDR)
 944 #if FLASHADDR > PHYSADDR
 945         ldr     r3, =FLASHADDR
 946         cmp     r3, pc
 947         bls     .Lnormal
 948 #else
 949         ldr     r3, =FLASHADDR
 950         cmp     r3, pc
 951         bhi     .Lnormal
 952 #endif
 953 #endif
 954         ldr     r3, .L_arm_memcpy
 955         ldr     r3, [r3]
 956         cmp     r3, #0
 957         beq     .Lnormal                /* no hook installed */
 958         ldr     r3, .L_min_memcpy_size
 959         ldr     r3, [r3]
 960         cmp     r2, r3
 961         blt     .Lnormal                /* below the hook's minimum size */
 962         stmfd   sp!, {r0-r2, r4, lr}    /* keep our arguments for later */
 963         mov     r3, #0
 964         ldr     r4, .L_arm_memcpy
 965         mov     lr, pc                  /* return address = the cmp below */
 966         ldr     pc, [r4]
 967         cmp     r0, #0
 968         ldmfd   sp!, {r0-r2, r4, lr}    /* ldm does not touch the flags */
 969         RETeq
 970
 971 .Lnormal:
 972         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
 973
 974         subs    r2, r2, #4              /* r2 = remaining - 4 */
 975         blt     .Lmemcpy_l4             /* less than 4 bytes */
 976         ands    r12, r0, #3
 977         bne     .Lmemcpy_destul         /* destination is not word aligned */
 978         ands    r12, r1, #3
 979         bne     .Lmemcpy_srcul          /* source is not word aligned */
 980
 981 .Lmemcpy_t8:
 982         /* We have aligned source and destination */
 983         subs    r2, r2, #8
 984         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
 985         subs    r2, r2, #0x14
 986         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
 987         stmdb   sp!, {r4}               /* borrow r4 */
 988
 989         /* blat 32 bytes at a time */
 990         /* XXX for really big copies perhaps we should use more registers */
 991 .Lmemcpy_loop32:
 992         ldmia   r1!, {r3, r4, r12, lr}
 993         stmia   r0!, {r3, r4, r12, lr}
 994         ldmia   r1!, {r3, r4, r12, lr}
 995         stmia   r0!, {r3, r4, r12, lr}
 996         subs    r2, r2, #0x20
 997         bge     .Lmemcpy_loop32
 998
 999         cmn     r2, #0x10               /* r2 + 16 >= 0: 16 or more left */
1000         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
1001         stmiage r0!, {r3, r4, r12, lr}
1002         subge   r2, r2, #0x10
1003         ldmia   sp!, {r4}               /* return r4 */
1004
1005 .Lmemcpy_l32:
1006         adds    r2, r2, #0x14           /* r2 = remaining - 12 */
1007
1008         /* blat 12 bytes at a time */
1009 .Lmemcpy_loop12:
1010         ldmiage r1!, {r3, r12, lr}
1011         stmiage r0!, {r3, r12, lr}
1012         subsge  r2, r2, #0x0c
1013         bge     .Lmemcpy_loop12
1014
1015 .Lmemcpy_l12:
1016         adds    r2, r2, #8              /* r2 = remaining - 4 */
1017         blt     .Lmemcpy_l4
1018
1019         subs    r2, r2, #4              /* lt: 4..7 left, ge: 8..11 left */
1020         ldrlt   r3, [r1], #4
1021         strlt   r3, [r0], #4
1022         ldmiage r1!, {r3, r12}
1023         stmiage r0!, {r3, r12}
1024         subge   r2, r2, #4              /* either way r2 = remaining - 4 */
1025
1026 .Lmemcpy_l4:
1027         /* less than 4 bytes to go */
1028         adds    r2, r2, #4              /* r2 = remaining (0..3) */
             /*
              * NOTE(review): the usual GCC spelling is __APCS_26__; confirm
              * whether the guard below is ever meant to be true.
              */
1029 #ifdef __APCS_26_
1030         ldmiaeq sp!, {r0, pc}^          /* done */
1031 #else
1032         ldmiaeq sp!, {r0, pc}           /* done */
1033 #endif
1034         /* copy the crud byte at a time */
1035         cmp     r2, #2                  /* always 1 byte, ge: 2nd, gt: 3rd */
1036         ldrb    r3, [r1], #1
1037         strb    r3, [r0], #1
1038         ldrbge  r3, [r1], #1
1039         strbge  r3, [r0], #1
1040         ldrbgt  r3, [r1], #1
1041         strbgt  r3, [r0], #1
1042         ldmia   sp!, {r0, pc}           /* restore dst and return */
1043
1044         /* erg - unaligned destination */
1045 .Lmemcpy_destul:
1046         rsb     r12, r12, #4            /* r12 = bytes up to next word (1..3) */
1047         cmp     r12, #2
1048
1049         /* align destination with byte copies */
1050         ldrb    r3, [r1], #1
1051         strb    r3, [r0], #1
1052         ldrbge  r3, [r1], #1
1053         strbge  r3, [r0], #1
1054         ldrbgt  r3, [r1], #1
1055         strbgt  r3, [r0], #1
1056         subs    r2, r2, r12
1057         blt     .Lmemcpy_l4             /* less than 4 bytes left */
1058
1059         ands    r12, r1, #3
1060         beq     .Lmemcpy_t8             /* we have an aligned source */
1061
1062         /* erg - unaligned source */
1063         /*
              * This is where it gets nasty ...
              *
              * The destination is word aligned but the source is r12 = 1, 2
              * or 3 bytes past a word boundary.  The source is read one
              * aligned word at a time; every output word is stitched
              * together from the unused bytes of the previous source word
              * (always carried in lr between iterations) and the leading
              * bytes of the next one.
              *
              * Which end of a register holds the lower-addressed bytes
              * depends on the byte order, so the two shift directions swap
              * under __ARMEB__, exactly as in memmove in this file.
              *
              * r1 runs a whole word ahead of the real source position; the
              * .Lmemcpy_srculNl4 exits pull it back before the final byte
              * copy in .Lmemcpy_l4.
              */
1064 .Lmemcpy_srcul:
1065         bic     r1, r1, #3              /* round src down to a word boundary */
1066         ldr     lr, [r1], #4            /* word holding the first source bytes */
1067         cmp     r12, #2
1068         bgt     .Lmemcpy_srcul3
1069         beq     .Lmemcpy_srcul2
1070         cmp     r2, #0x0c
1071         blt     .Lmemcpy_srcul1loop4    /* under 16 bytes: word loop only */
1072         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1073         stmdb   sp!, {r4, r5}
1074
1075 .Lmemcpy_srcul1loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #8
     #else
1076         mov     r3, lr, lsr #8
     #endif
1077         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #24
             mov     r4, r4, lsl #8
             orr     r4, r4, r5, lsr #24
             mov     r5, r5, lsl #8
             orr     r5, r5, r12, lsr #24
             mov     r12, r12, lsl #8
             orr     r12, r12, lr, lsr #24
     #else
1078         orr     r3, r3, r4, lsl #24
1079         mov     r4, r4, lsr #8
1080         orr     r4, r4, r5, lsl #24
1081         mov     r5, r5, lsr #8
1082         orr     r5, r5, r12, lsl #24
1083         mov     r12, r12, lsr #8
1084         orr     r12, r12, lr, lsl #24
     #endif
1085         stmia   r0!, {r3-r5, r12}
1086         subs    r2, r2, #0x10
1087         bge     .Lmemcpy_srcul1loop16
1088         ldmia   sp!, {r4, r5}
1089         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
1090         blt     .Lmemcpy_srcul1l4
1091
1092 .Lmemcpy_srcul1loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #8
     #else
1093         mov     r12, lr, lsr #8
     #endif
1094         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #24
     #else
1095         orr     r12, r12, lr, lsl #24
     #endif
1096         str     r12, [r0], #4
1097         subs    r2, r2, #4
1098         bge     .Lmemcpy_srcul1loop4
1099
1100 .Lmemcpy_srcul1l4:
1101         sub     r1, r1, #3              /* 3 bytes of lr are still unread */
1102         b       .Lmemcpy_l4
1103
1104 .Lmemcpy_srcul2:
1105         cmp     r2, #0x0c
1106         blt     .Lmemcpy_srcul2loop4    /* under 16 bytes: word loop only */
1107         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1108         stmdb   sp!, {r4, r5}
1109
1110 .Lmemcpy_srcul2loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #16
     #else
1111         mov     r3, lr, lsr #16
     #endif
1112         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #16
             mov     r4, r4, lsl #16
             orr     r4, r4, r5, lsr #16
             mov     r5, r5, lsl #16
             orr     r5, r5, r12, lsr #16
             mov     r12, r12, lsl #16
             orr     r12, r12, lr, lsr #16
     #else
1113         orr     r3, r3, r4, lsl #16
1114         mov     r4, r4, lsr #16
1115         orr     r4, r4, r5, lsl #16
1116         mov     r5, r5, lsr #16
1117         orr     r5, r5, r12, lsl #16
1118         mov     r12, r12, lsr #16
1119         orr     r12, r12, lr, lsl #16
     #endif
1120         stmia   r0!, {r3-r5, r12}
1121         subs    r2, r2, #0x10
1122         bge     .Lmemcpy_srcul2loop16
1123         ldmia   sp!, {r4, r5}
1124         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
1125         blt     .Lmemcpy_srcul2l4
1126
1127 .Lmemcpy_srcul2loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #16
     #else
1128         mov     r12, lr, lsr #16
     #endif
1129         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #16
     #else
1130         orr     r12, r12, lr, lsl #16
     #endif
1131         str     r12, [r0], #4
1132         subs    r2, r2, #4
1133         bge     .Lmemcpy_srcul2loop4
1134
1135 .Lmemcpy_srcul2l4:
1136         sub     r1, r1, #2              /* 2 bytes of lr are still unread */
1137         b       .Lmemcpy_l4
1138
1139 .Lmemcpy_srcul3:
1140         cmp     r2, #0x0c
1141         blt     .Lmemcpy_srcul3loop4    /* under 16 bytes: word loop only */
1142         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1143         stmdb   sp!, {r4, r5}
1144
1145 .Lmemcpy_srcul3loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #24
     #else
1146         mov     r3, lr, lsr #24
     #endif
1147         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #8
             mov     r4, r4, lsl #24
             orr     r4, r4, r5, lsr #8
             mov     r5, r5, lsl #24
             orr     r5, r5, r12, lsr #8
             mov     r12, r12, lsl #24
             orr     r12, r12, lr, lsr #8
     #else
1148         orr     r3, r3, r4, lsl #8
1149         mov     r4, r4, lsr #24
1150         orr     r4, r4, r5, lsl #8
1151         mov     r5, r5, lsr #24
1152         orr     r5, r5, r12, lsl #8
1153         mov     r12, r12, lsr #24
1154         orr     r12, r12, lr, lsl #8
     #endif
1155         stmia   r0!, {r3-r5, r12}
1156         subs    r2, r2, #0x10
1157         bge     .Lmemcpy_srcul3loop16
1158         ldmia   sp!, {r4, r5}
1159         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
1160         blt     .Lmemcpy_srcul3l4
1161
1162 .Lmemcpy_srcul3loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #24
     #else
1163         mov     r12, lr, lsr #24
     #endif
1164         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #8
     #else
1165         orr     r12, r12, lr, lsl #8
     #endif
1166         str     r12, [r0], #4
1167         subs    r2, r2, #4
1168         bge     .Lmemcpy_srcul3loop4
1169
1170 .Lmemcpy_srcul3l4:
1171         sub     r1, r1, #1              /* 1 byte of lr is still unread */
1172         b       .Lmemcpy_l4
1173 END(memcpy)
1174
1175 #else
1176 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1177 ENTRY(memcpy)
1178         pld     [r1]
1179         cmp     r2, #0x0c
1180         ble     .Lmemcpy_short          /* <= 12 bytes */
1181 #ifdef FLASHADDR
1182 #if FLASHADDR > PHYSADDR
1183         ldr     r3, =FLASHADDR
1184         cmp     r3, pc
1185         bls     .Lnormal
1186 #else
1187         ldr     r3, =FLASHADDR
1188         cmp     r3, pc
1189         bhi     .Lnormal
1190 #endif
1191 #endif
1192         ldr     r3, .L_arm_memcpy
1193         ldr     r3, [r3]
1194         cmp     r3, #0
1195         beq     .Lnormal
1196         ldr     r3, .L_min_memcpy_size
1197         ldr     r3, [r3]
1198         cmp     r2, r3
1199         blt     .Lnormal
1200         stmfd   sp!, {r0-r2, r4, lr}
1201         mov     r3, #0
1202         ldr     r4, .L_arm_memcpy
1203         mov     lr, pc
1204         ldr     pc, [r4]
1205         cmp     r0, #0
1206         ldmfd   sp!, {r0-r2, r4, lr}
1207         RETeq
1208 .Lnormal:
1209         mov     r3, r0                  /* We must not clobber r0 */
1210
1211         /* Word-align the destination buffer */
1212         ands    ip, r3, #0x03           /* Already word aligned? */
1213         beq     .Lmemcpy_wordaligned    /* Yup */
1214         cmp     ip, #0x02
1215         ldrb    ip, [r1], #0x01
1216         sub     r2, r2, #0x01
1217         strb    ip, [r3], #0x01
1218         ldrble  ip, [r1], #0x01
1219         suble   r2, r2, #0x01
1220         strble  ip, [r3], #0x01
1221         ldrblt  ip, [r1], #0x01
1222         sublt   r2, r2, #0x01
1223         strblt  ip, [r3], #0x01
1224
1225         /* Destination buffer is now word aligned */
1226 .Lmemcpy_wordaligned:
1227         ands    ip, r1, #0x03           /* Is src also word-aligned? */
1228         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
1229
1230         /* Quad-align the destination buffer */
1231         tst     r3, #0x07               /* Already quad aligned? */
1232         ldrne   ip, [r1], #0x04
1233         stmfd   sp!, {r4-r9}            /* Free up some registers */
1234         subne   r2, r2, #0x04
1235         strne   ip, [r3], #0x04
1236
1237         /* Destination buffer quad aligned, source is at least word aligned */
1238         subs    r2, r2, #0x80
1239         blt     .Lmemcpy_w_lessthan128
1240
1241         /* Copy 128 bytes at a time */
1242 .Lmemcpy_w_loop128:
1243         ldr     r4, [r1], #0x04         /* LD:00-03 */
1244         ldr     r5, [r1], #0x04         /* LD:04-07 */
1245         pld     [r1, #0x18]             /* Prefetch 0x20 */
1246         ldr     r6, [r1], #0x04         /* LD:08-0b */
1247         ldr     r7, [r1], #0x04         /* LD:0c-0f */
1248         ldr     r8, [r1], #0x04         /* LD:10-13 */
1249         ldr     r9, [r1], #0x04         /* LD:14-17 */
1250         strd    r4, [r3], #0x08         /* ST:00-07 */
1251         ldr     r4, [r1], #0x04         /* LD:18-1b */
1252         ldr     r5, [r1], #0x04         /* LD:1c-1f */
1253         strd    r6, [r3], #0x08         /* ST:08-0f */
1254         ldr     r6, [r1], #0x04         /* LD:20-23 */
1255         ldr     r7, [r1], #0x04         /* LD:24-27 */
1256         pld     [r1, #0x18]             /* Prefetch 0x40 */
1257         strd    r8, [r3], #0x08         /* ST:10-17 */
1258         ldr     r8, [r1], #0x04         /* LD:28-2b */
1259         ldr     r9, [r1], #0x04         /* LD:2c-2f */
1260         strd    r4, [r3], #0x08         /* ST:18-1f */
1261         ldr     r4, [r1], #0x04         /* LD:30-33 */
1262         ldr     r5, [r1], #0x04         /* LD:34-37 */
1263         strd    r6, [r3], #0x08         /* ST:20-27 */
1264         ldr     r6, [r1], #0x04         /* LD:38-3b */
1265         ldr     r7, [r1], #0x04         /* LD:3c-3f */
1266         strd    r8, [r3], #0x08         /* ST:28-2f */
1267         ldr     r8, [r1], #0x04         /* LD:40-43 */
1268         ldr     r9, [r1], #0x04         /* LD:44-47 */
1269         pld     [r1, #0x18]             /* Prefetch 0x60 */
1270         strd    r4, [r3], #0x08         /* ST:30-37 */
1271         ldr     r4, [r1], #0x04         /* LD:48-4b */
1272         ldr     r5, [r1], #0x04         /* LD:4c-4f */
1273         strd    r6, [r3], #0x08         /* ST:38-3f */
1274         ldr     r6, [r1], #0x04         /* LD:50-53 */
1275         ldr     r7, [r1], #0x04         /* LD:54-57 */
1276         strd    r8, [r3], #0x08         /* ST:40-47 */
1277         ldr     r8, [r1], #0x04         /* LD:58-5b */
1278         ldr     r9, [r1], #0x04         /* LD:5c-5f */
1279         strd    r4, [r3], #0x08         /* ST:48-4f */
1280         ldr     r4, [r1], #0x04         /* LD:60-63 */
1281         ldr     r5, [r1], #0x04         /* LD:64-67 */
1282         pld     [r1, #0x18]             /* Prefetch 0x80 */
1283         strd    r6, [r3], #0x08         /* ST:50-57 */
1284         ldr     r6, [r1], #0x04         /* LD:68-6b */
1285         ldr     r7, [r1], #0x04         /* LD:6c-6f */
1286         strd    r8, [r3], #0x08         /* ST:58-5f */
1287         ldr     r8, [r1], #0x04         /* LD:70-73 */
1288         ldr     r9, [r1], #0x04         /* LD:74-77 */
1289         strd    r4, [r3], #0x08         /* ST:60-67 */
1290         ldr     r4, [r1], #0x04         /* LD:78-7b */
1291         ldr     r5, [r1], #0x04         /* LD:7c-7f */
1292         strd    r6, [r3], #0x08         /* ST:68-6f */
1293         strd    r8, [r3], #0x08         /* ST:70-77 */
1294         subs    r2, r2, #0x80
1295         strd    r4, [r3], #0x08         /* ST:78-7f */
1296         bge     .Lmemcpy_w_loop128
1297
1298 .Lmemcpy_w_lessthan128:
1299         adds    r2, r2, #0x80           /* Adjust for extra sub */
1300         ldmfdeq sp!, {r4-r9}
1301         RETeq                   /* Return now if done */
1302         subs    r2, r2, #0x20
1303         blt     .Lmemcpy_w_lessthan32
1304
1305         /* Copy 32 bytes at a time */
1306 .Lmemcpy_w_loop32:
1307         ldr     r4, [r1], #0x04
1308         ldr     r5, [r1], #0x04
1309         pld     [r1, #0x18]
1310         ldr     r6, [r1], #0x04
1311         ldr     r7, [r1], #0x04
1312         ldr     r8, [r1], #0x04
1313         ldr     r9, [r1], #0x04
1314         strd    r4, [r3], #0x08
1315         ldr     r4, [r1], #0x04
1316         ldr     r5, [r1], #0x04
1317         strd    r6, [r3], #0x08
1318         strd    r8, [r3], #0x08
1319         subs    r2, r2, #0x20
1320         strd    r4, [r3], #0x08
1321         bge     .Lmemcpy_w_loop32
1322
1323 .Lmemcpy_w_lessthan32:
1324         adds    r2, r2, #0x20           /* Adjust for extra sub */
1325         ldmfdeq sp!, {r4-r9}
1326         RETeq                   /* Return now if done */
1327
1328         and     r4, r2, #0x18
1329         rsbs    r4, r4, #0x18
1330         addne   pc, pc, r4, lsl #1
1331         nop
1332
1333         /* At least 24 bytes remaining */
1334         ldr     r4, [r1], #0x04
1335         ldr     r5, [r1], #0x04
1336         sub     r2, r2, #0x08
1337         strd    r4, [r3], #0x08
1338
1339         /* At least 16 bytes remaining */
1340         ldr     r4, [r1], #0x04
1341         ldr     r5, [r1], #0x04
1342         sub     r2, r2, #0x08
1343         strd    r4, [r3], #0x08
1344
1345         /* At least 8 bytes remaining */
1346         ldr     r4, [r1], #0x04
1347         ldr     r5, [r1], #0x04
1348         subs    r2, r2, #0x08
1349         strd    r4, [r3], #0x08
1350
1351         /* Less than 8 bytes remaining */
1352         ldmfd   sp!, {r4-r9}
1353         RETeq                   /* Return now if done */
1354         subs    r2, r2, #0x04
1355         ldrge   ip, [r1], #0x04
1356         strge   ip, [r3], #0x04
1357         RETeq                   /* Return now if done */
1358         addlt   r2, r2, #0x04
1359         ldrb    ip, [r1], #0x01
1360         cmp     r2, #0x02
1361         ldrbge  r2, [r1], #0x01
1362         strb    ip, [r3], #0x01
1363         ldrbgt  ip, [r1]
1364         strbge  r2, [r3], #0x01
1365         strbgt  ip, [r3]
1366         RET
1367 /* Place a literal pool here for the above ldr instructions to use */
1368 .ltorg
1369
1370
1371 /*
1372  * At this point, it has not been possible to word align both buffers.
1373  * The destination buffer is word aligned, but the source buffer is not.
1374  */
1375 .Lmemcpy_bad_align:
1376         stmfd   sp!, {r4-r7}
1377         bic     r1, r1, #0x03
1378         cmp     ip, #2
1379         ldr     ip, [r1], #0x04
1380         bgt     .Lmemcpy_bad3
1381         beq     .Lmemcpy_bad2
1382         b       .Lmemcpy_bad1
1383
1384 .Lmemcpy_bad1_loop16:
1385 #ifdef __ARMEB__
1386         mov     r4, ip, lsl #8
1387 #else
1388         mov     r4, ip, lsr #8
1389 #endif
1390         ldr     r5, [r1], #0x04
1391         pld     [r1, #0x018]
1392         ldr     r6, [r1], #0x04
1393         ldr     r7, [r1], #0x04
1394         ldr     ip, [r1], #0x04
1395 #ifdef __ARMEB__
1396         orr     r4, r4, r5, lsr #24
1397         mov     r5, r5, lsl #8
1398         orr     r5, r5, r6, lsr #24
1399         mov     r6, r6, lsl #8
1400         orr     r6, r6, r7, lsr #24
1401         mov     r7, r7, lsl #8
1402         orr     r7, r7, ip, lsr #24
1403 #else
1404         orr     r4, r4, r5, lsl #24
1405         mov     r5, r5, lsr #8
1406         orr     r5, r5, r6, lsl #24
1407         mov     r6, r6, lsr #8
1408         orr     r6, r6, r7, lsl #24
1409         mov     r7, r7, lsr #8
1410         orr     r7, r7, ip, lsl #24
1411 #endif
1412         str     r4, [r3], #0x04
1413         str     r5, [r3], #0x04
1414         str     r6, [r3], #0x04
1415         str     r7, [r3], #0x04
1416 .Lmemcpy_bad1:
1417         subs    r2, r2, #0x10
1418         bge     .Lmemcpy_bad1_loop16
1419
1420         adds    r2, r2, #0x10
1421         ldmfdeq sp!, {r4-r7}
1422         RETeq                   /* Return now if done */
1423         subs    r2, r2, #0x04
1424         sublt   r1, r1, #0x03
1425         blt     .Lmemcpy_bad_done
1426
1427 .Lmemcpy_bad1_loop4:
1428 #ifdef __ARMEB__
1429         mov     r4, ip, lsl #8
1430 #else
1431         mov     r4, ip, lsr #8
1432 #endif
1433         ldr     ip, [r1], #0x04
1434         subs    r2, r2, #0x04
1435 #ifdef __ARMEB__
1436         orr     r4, r4, ip, lsr #24
1437 #else
1438         orr     r4, r4, ip, lsl #24
1439 #endif
1440         str     r4, [r3], #0x04
1441         bge     .Lmemcpy_bad1_loop4
1442         sub     r1, r1, #0x03
1443         b       .Lmemcpy_bad_done
1444
1445 .Lmemcpy_bad2_loop16:
1446 #ifdef __ARMEB__
1447         mov     r4, ip, lsl #16
1448 #else
1449         mov     r4, ip, lsr #16
1450 #endif
1451         ldr     r5, [r1], #0x04
1452         pld     [r1, #0x018]
1453         ldr     r6, [r1], #0x04
1454         ldr     r7, [r1], #0x04
1455         ldr     ip, [r1], #0x04
1456 #ifdef __ARMEB__
1457         orr     r4, r4, r5, lsr #16
1458         mov     r5, r5, lsl #16
1459         orr     r5, r5, r6, lsr #16
1460         mov     r6, r6, lsl #16
1461         orr     r6, r6, r7, lsr #16
1462         mov     r7, r7, lsl #16
1463         orr     r7, r7, ip, lsr #16
1464 #else
1465         orr     r4, r4, r5, lsl #16
1466         mov     r5, r5, lsr #16
1467         orr     r5, r5, r6, lsl #16
1468         mov     r6, r6, lsr #16
1469         orr     r6, r6, r7, lsl #16
1470         mov     r7, r7, lsr #16
1471         orr     r7, r7, ip, lsl #16
1472 #endif
1473         str     r4, [r3], #0x04
1474         str     r5, [r3], #0x04
1475         str     r6, [r3], #0x04
1476         str     r7, [r3], #0x04
1477 .Lmemcpy_bad2:
1478         subs    r2, r2, #0x10
1479         bge     .Lmemcpy_bad2_loop16
1480
1481         adds    r2, r2, #0x10
1482         ldmfdeq sp!, {r4-r7}
1483         RETeq                   /* Return now if done */
1484         subs    r2, r2, #0x04
1485         sublt   r1, r1, #0x02
1486         blt     .Lmemcpy_bad_done
1487
1488 .Lmemcpy_bad2_loop4:
1489 #ifdef __ARMEB__
1490         mov     r4, ip, lsl #16
1491 #else
1492         mov     r4, ip, lsr #16
1493 #endif
1494         ldr     ip, [r1], #0x04
1495         subs    r2, r2, #0x04
1496 #ifdef __ARMEB__
1497         orr     r4, r4, ip, lsr #16
1498 #else
1499         orr     r4, r4, ip, lsl #16
1500 #endif
1501         str     r4, [r3], #0x04
1502         bge     .Lmemcpy_bad2_loop4
1503         sub     r1, r1, #0x02
1504         b       .Lmemcpy_bad_done
1505
1506 .Lmemcpy_bad3_loop16:
1507 #ifdef __ARMEB__
1508         mov     r4, ip, lsl #24
1509 #else
1510         mov     r4, ip, lsr #24
1511 #endif
1512         ldr     r5, [r1], #0x04
1513         pld     [r1, #0x018]
1514         ldr     r6, [r1], #0x04
1515         ldr     r7, [r1], #0x04
1516         ldr     ip, [r1], #0x04
1517 #ifdef __ARMEB__
1518         orr     r4, r4, r5, lsr #8
1519         mov     r5, r5, lsl #24
1520         orr     r5, r5, r6, lsr #8
1521         mov     r6, r6, lsl #24
1522         orr     r6, r6, r7, lsr #8
1523         mov     r7, r7, lsl #24
1524         orr     r7, r7, ip, lsr #8
1525 #else
1526         orr     r4, r4, r5, lsl #8
1527         mov     r5, r5, lsr #24
1528         orr     r5, r5, r6, lsl #8
1529         mov     r6, r6, lsr #24
1530         orr     r6, r6, r7, lsl #8
1531         mov     r7, r7, lsr #24
1532         orr     r7, r7, ip, lsl #8
1533 #endif
1534         str     r4, [r3], #0x04
1535         str     r5, [r3], #0x04
1536         str     r6, [r3], #0x04
1537         str     r7, [r3], #0x04
1538 .Lmemcpy_bad3:
1539         subs    r2, r2, #0x10
1540         bge     .Lmemcpy_bad3_loop16
1541
1542         adds    r2, r2, #0x10
1543         ldmfdeq sp!, {r4-r7}
1544         RETeq                   /* Return now if done */
1545         subs    r2, r2, #0x04
1546         sublt   r1, r1, #0x01
1547         blt     .Lmemcpy_bad_done
1548
1549 .Lmemcpy_bad3_loop4:
1550 #ifdef __ARMEB__
1551         mov     r4, ip, lsl #24
1552 #else
1553         mov     r4, ip, lsr #24
1554 #endif
1555         ldr     ip, [r1], #0x04
1556         subs    r2, r2, #0x04
1557 #ifdef __ARMEB__
1558         orr     r4, r4, ip, lsr #8
1559 #else
1560         orr     r4, r4, ip, lsl #8
1561 #endif
1562         str     r4, [r3], #0x04
1563         bge     .Lmemcpy_bad3_loop4
1564         sub     r1, r1, #0x01
1565
1566 .Lmemcpy_bad_done:
1567         ldmfd   sp!, {r4-r7}
1568         adds    r2, r2, #0x04
1569         RETeq
1570         ldrb    ip, [r1], #0x01
1571         cmp     r2, #0x02
1572         ldrbge  r2, [r1], #0x01
1573         strb    ip, [r3], #0x01
1574         ldrbgt  ip, [r1]
1575         strbge  r2, [r3], #0x01
1576         strbgt  ip, [r3]
1577         RET
1578
1579
1580 /*
1581  * Handle short copies (12 bytes or fewer), possibly misaligned.
1582  * Some of these are *very* common, thanks to the network stack,
1583  * and so are handled specially.
1584  */
1585 .Lmemcpy_short:
1586         add     pc, pc, r2, lsl #2
1587         nop
1588         RET                     /* 0x00 */
1589         b       .Lmemcpy_bytewise       /* 0x01 */
1590         b       .Lmemcpy_bytewise       /* 0x02 */
1591         b       .Lmemcpy_bytewise       /* 0x03 */
1592         b       .Lmemcpy_4              /* 0x04 */
1593         b       .Lmemcpy_bytewise       /* 0x05 */
1594         b       .Lmemcpy_6              /* 0x06 */
1595         b       .Lmemcpy_bytewise       /* 0x07 */
1596         b       .Lmemcpy_8              /* 0x08 */
1597         b       .Lmemcpy_bytewise       /* 0x09 */
1598         b       .Lmemcpy_bytewise       /* 0x0a */
1599         b       .Lmemcpy_bytewise       /* 0x0b */
1600         b       .Lmemcpy_c              /* 0x0c */
1601 .Lmemcpy_bytewise:
1602         mov     r3, r0                  /* We must not clobber r0 */
1603         ldrb    ip, [r1], #0x01
1604 1:      subs    r2, r2, #0x01
1605         strb    ip, [r3], #0x01
1606         ldrbne  ip, [r1], #0x01
1607         bne     1b
1608         RET
1609
1610 /******************************************************************************
1611  * Special case for 4 byte copies
1612  */
1613 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1614 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1615         LMEMCPY_4_PAD
1616 .Lmemcpy_4:
1617         and     r2, r1, #0x03
1618         orr     r2, r2, r0, lsl #2
1619         ands    r2, r2, #0x0f
1620         sub     r3, pc, #0x14
1621         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1622
1623 /*
1624  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1625  */
1626         ldr     r2, [r1]
1627         str     r2, [r0]
1628         RET
1629         LMEMCPY_4_PAD
1630
1631 /*
1632  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1633  */
1634         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1635         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1636 #ifdef __ARMEB__
1637         mov     r3, r3, lsl #8          /* r3 = 012. */
1638         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
1639 #else
1640         mov     r3, r3, lsr #8          /* r3 = .210 */
1641         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1642 #endif
1643         str     r3, [r0]
1644         RET
1645         LMEMCPY_4_PAD
1646
1647 /*
1648  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1649  */
1650 #ifdef __ARMEB__
1651         ldrh    r3, [r1]
1652         ldrh    r2, [r1, #0x02]
1653 #else
1654         ldrh    r3, [r1, #0x02]
1655         ldrh    r2, [r1]
1656 #endif
1657         orr     r3, r2, r3, lsl #16
1658         str     r3, [r0]
1659         RET
1660         LMEMCPY_4_PAD
1661
1662 /*
1663  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1664  */
1665         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1666         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1667 #ifdef __ARMEB__
1668         mov     r3, r3, lsl #24         /* r3 = 0... */
1669         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
1670 #else
1671         mov     r3, r3, lsr #24         /* r3 = ...0 */
1672         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1673 #endif
1674         str     r3, [r0]
1675         RET
1676         LMEMCPY_4_PAD
1677
1678 /*
1679  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1680  */
1681         ldr     r2, [r1]
1682 #ifdef __ARMEB__
1683         strb    r2, [r0, #0x03]
1684         mov     r3, r2, lsr #8
1685         mov     r1, r2, lsr #24
1686         strb    r1, [r0]
1687 #else
1688         strb    r2, [r0]
1689         mov     r3, r2, lsr #8
1690         mov     r1, r2, lsr #24
1691         strb    r1, [r0, #0x03]
1692 #endif
1693         strh    r3, [r0, #0x01]
1694         RET
1695         LMEMCPY_4_PAD
1696
1697 /*
1698  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1699  */
1700         ldrb    r2, [r1]
1701         ldrh    r3, [r1, #0x01]
1702         ldrb    r1, [r1, #0x03]
1703         strb    r2, [r0]
1704         strh    r3, [r0, #0x01]
1705         strb    r1, [r0, #0x03]
1706         RET
1707         LMEMCPY_4_PAD
1708
1709 /*
1710  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1711  */
1712         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1713         ldrh    r3, [r1, #0x02]         /* LE:r3 = ..23  LE:r3 = ..32 */
1714 #ifdef __ARMEB__
1715         mov     r1, r2, lsr #8          /* r1 = ...0 */
1716         strb    r1, [r0]
1717         mov     r2, r2, lsl #8          /* r2 = .01. */
1718         orr     r2, r2, r3, lsr #8      /* r2 = .012 */
1719 #else
1720         strb    r2, [r0]
1721         mov     r2, r2, lsr #8          /* r2 = ...1 */
1722         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1723         mov     r3, r3, lsr #8          /* r3 = ...3 */
1724 #endif
1725         strh    r2, [r0, #0x01]
1726         strb    r3, [r0, #0x03]
1727         RET
1728         LMEMCPY_4_PAD
1729
1730 /*
1731  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1732  */
1733         ldrb    r2, [r1]
1734         ldrh    r3, [r1, #0x01]
1735         ldrb    r1, [r1, #0x03]
1736         strb    r2, [r0]
1737         strh    r3, [r0, #0x01]
1738         strb    r1, [r0, #0x03]
1739         RET
1740         LMEMCPY_4_PAD
1741
1742 /*
1743  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1744  */
1745         ldr     r2, [r1]
1746 #ifdef __ARMEB__
1747         strh    r2, [r0, #0x02]
1748         mov     r3, r2, lsr #16
1749         strh    r3, [r0]
1750 #else
1751         strh    r2, [r0]
1752         mov     r3, r2, lsr #16
1753         strh    r3, [r0, #0x02]
1754 #endif
1755         RET
1756         LMEMCPY_4_PAD
1757
1758 /*
1759  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1760  */
1761         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1762         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1763         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1764         strh    r1, [r0]
1765 #ifdef __ARMEB__
1766         mov     r2, r2, lsl #8          /* r2 = 012. */
1767         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1768 #else
1769         mov     r2, r2, lsr #24         /* r2 = ...2 */
1770         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1771 #endif
1772         strh    r2, [r0, #0x02]
1773         RET
1774         LMEMCPY_4_PAD
1775
1776 /*
1777  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1778  */
1779         ldrh    r2, [r1]
1780         ldrh    r3, [r1, #0x02]
1781         strh    r2, [r0]
1782         strh    r3, [r0, #0x02]
1783         RET
1784         LMEMCPY_4_PAD
1785
1786 /*
1787  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1788  */
1789         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1790         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1791         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1792         strh    r1, [r0, #0x02]
1793 #ifdef __ARMEB__
1794         mov     r3, r3, lsr #24         /* r3 = ...1 */
1795         orr     r3, r3, r2, lsl #8      /* r3 = xx01 */
1796 #else
1797         mov     r3, r3, lsl #8          /* r3 = 321. */
1798         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1799 #endif
1800         strh    r3, [r0]
1801         RET
1802         LMEMCPY_4_PAD
1803
1804 /*
1805  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1806  */
1807         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1808 #ifdef __ARMEB__
1809         strb    r2, [r0, #0x03]
1810         mov     r3, r2, lsr #8
1811         mov     r1, r2, lsr #24
1812         strh    r3, [r0, #0x01]
1813         strb    r1, [r0]
1814 #else
1815         strb    r2, [r0]
1816         mov     r3, r2, lsr #8
1817         mov     r1, r2, lsr #24
1818         strh    r3, [r0, #0x01]
1819         strb    r1, [r0, #0x03]
1820 #endif
1821         RET
1822         LMEMCPY_4_PAD
1823
1824 /*
1825  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1826  */
1827         ldrb    r2, [r1]
1828         ldrh    r3, [r1, #0x01]
1829         ldrb    r1, [r1, #0x03]
1830         strb    r2, [r0]
1831         strh    r3, [r0, #0x01]
1832         strb    r1, [r0, #0x03]
1833         RET
1834         LMEMCPY_4_PAD
1835
1836 /*
1837  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1838  */
1839 #ifdef __ARMEB__
1840         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1841         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1842         strb    r3, [r0, #0x03]
1843         mov     r3, r3, lsr #8          /* r3 = ...2 */
1844         orr     r3, r3, r2, lsl #8      /* r3 = ..12 */
1845         strh    r3, [r0, #0x01]
1846         mov     r2, r2, lsr #8          /* r2 = ...0 */
1847         strb    r2, [r0]
1848 #else
1849         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1850         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1851         strb    r2, [r0]
1852         mov     r2, r2, lsr #8          /* r2 = ...1 */
1853         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1854         strh    r2, [r0, #0x01]
1855         mov     r3, r3, lsr #8          /* r3 = ...3 */
1856         strb    r3, [r0, #0x03]
1857 #endif
1858         RET
1859         LMEMCPY_4_PAD
1860
1861 /*
1862  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1863  */
1864         ldrb    r2, [r1]
1865         ldrh    r3, [r1, #0x01]
1866         ldrb    r1, [r1, #0x03]
1867         strb    r2, [r0]
1868         strh    r3, [r0, #0x01]
1869         strb    r1, [r0, #0x03]
1870         RET
1871         LMEMCPY_4_PAD
1872
1873
1874 /******************************************************************************
1875  * Special case for 6 byte copies: r0 = dst, r1 = src, one 64-byte code slot per (dst & 3, src & 3) pair.
1876  * Register comments: digits = source byte offsets (MSB first), '.' = zero, 'x' = byte outside the copy. */
1877 #define LMEMCPY_6_LOG2  6       /* 64 bytes per alignment-case slot */
1878 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2
1879         LMEMCPY_6_PAD
1880 .Lmemcpy_6:
1881         and     r2, r1, #0x03           /* r2 = src & 3 */
1882         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
1883         ands    r2, r2, #0x0f           /* r2 = (dst & 3) << 2 | (src & 3) */
1884         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_6 (pc reads insn + 8) */
1885         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2 /* r2 != 0: jump to slot r2 */
1886
1887 /*
1888  * 0000: dst is 32-bit aligned, src is 32-bit aligned (dispatcher falls through)
1889  */
1890         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1891         ldrh    r3, [r1, #0x04]         /* BE:r3 = ..45  LE:r3 = ..54 */
1892         str     r2, [r0]
1893         strh    r3, [r0, #0x04]
1894         RET
1895         LMEMCPY_6_PAD
1896
1897 /*
1898  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
1899  */
1900         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1901         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1902 #ifdef __ARMEB__
1903         mov     r2, r2, lsl #8          /* r2 = 012. */
1904         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1905 #else
1906         mov     r2, r2, lsr #8          /* r2 = .210 */
1907         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1908 #endif
1909         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1910         str     r2, [r0]
1911         strh    r3, [r0, #0x04]
1912         RET
1913         LMEMCPY_6_PAD
1914
1915 /*
1916  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1917  */
1918         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1919         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1920 #ifdef __ARMEB__
1921         mov     r1, r3, lsr #16         /* r1 = ..23 */
1922         orr     r1, r1, r2, lsl #16     /* r1 = 0123 */
1923         str     r1, [r0]
1924         strh    r3, [r0, #0x04]
1925 #else
1926         mov     r1, r3, lsr #16         /* r1 = ..54 */
1927         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1928         str     r2, [r0]
1929         strh    r1, [r0, #0x04]
1930 #endif
1931         RET
1932         LMEMCPY_6_PAD
1933
1934 /*
1935  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
1936  */
1937         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1938         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1939         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r1 = xxx5 */
1940 #ifdef __ARMEB__
1941         mov     r2, r2, lsl #24         /* r2 = 0... */
1942         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
1943         mov     r3, r3, lsl #8          /* r3 = 234. */
1944         orr     r1, r3, r1, lsr #24     /* r1 = 2345 */
1945 #else
1946         mov     r2, r2, lsr #24         /* r2 = ...0 */
1947         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1948         mov     r1, r1, lsl #8          /* r1 = xx5. */
1949         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1950 #endif
1951         str     r2, [r0]
1952         strh    r1, [r0, #0x04]
1953         RET
1954         LMEMCPY_6_PAD
1955
1956 /*
1957  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1958  */
1959         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1960         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1961         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1962         strh    r1, [r0, #0x01]         /* low halfword of r1 = bytes 1-2 */
1963 #ifdef __ARMEB__
1964         mov     r1, r3, lsr #24         /* r1 = ...0 */
1965         strb    r1, [r0]
1966         mov     r3, r3, lsl #8          /* r3 = 123. */
1967         orr     r3, r3, r2, lsr #8      /* r3 = 1234 */
1968 #else
1969         strb    r3, [r0]                /* low byte of r3 = byte 0 */
1970         mov     r3, r3, lsr #24         /* r3 = ...3 */
1971         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1972         mov     r2, r2, lsr #8          /* r2 = ...5 */
1973 #endif
1974         strh    r3, [r0, #0x03]
1975         strb    r2, [r0, #0x05]
1976         RET
1977         LMEMCPY_6_PAD
1978
1979 /*
1980  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1981  */
1982         ldrb    r2, [r1]                /* r2 = ...0 */
1983         ldrh    r3, [r1, #0x01]         /* BE:r3 = ..12  LE:r3 = ..21 */
1984         ldrh    ip, [r1, #0x03]         /* BE:ip = ..34  LE:ip = ..43 */
1985         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
1986         strb    r2, [r0]
1987         strh    r3, [r0, #0x01]
1988         strh    ip, [r0, #0x03]
1989         strb    r1, [r0, #0x05]
1990         RET
1991         LMEMCPY_6_PAD
1992
1993 /*
1994  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1995  */
1996         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1997         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1998 #ifdef __ARMEB__
1999         mov     r3, r2, lsr #8          /* r3 = ...0 */
2000         strb    r3, [r0]
2001         strb    r1, [r0, #0x05]
2002         mov     r3, r1, lsr #8          /* r3 = .234 */
2003         strh    r3, [r0, #0x03]
2004         mov     r3, r2, lsl #8          /* r3 = .01. */
2005         orr     r3, r3, r1, lsr #24     /* r3 = .012 */
2006         strh    r3, [r0, #0x01]
2007 #else
2008         strb    r2, [r0]
2009         mov     r3, r1, lsr #24         /* r3 = ...5 */
2010         strb    r3, [r0, #0x05]
2011         mov     r3, r1, lsr #8          /* r3 = .543 */
2012         strh    r3, [r0, #0x03]
2013         mov     r3, r2, lsr #8          /* r3 = ...1 */
2014         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
2015         strh    r3, [r0, #0x01]
2016 #endif
2017         RET
2018         LMEMCPY_6_PAD
2019
2020 /*
2021  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2022  */
2023         ldrb    r2, [r1]                /* r2 = ...0 */
2024         ldrh    r3, [r1, #0x01]         /* BE:r3 = ..12  LE:r3 = ..21 */
2025         ldrh    ip, [r1, #0x03]         /* BE:ip = ..34  LE:ip = ..43 */
2026         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2027         strb    r2, [r0]
2028         strh    r3, [r0, #0x01]
2029         strh    ip, [r0, #0x03]
2030         strb    r1, [r0, #0x05]
2031         RET
2032         LMEMCPY_6_PAD
2033
2034 /*
2035  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2036  */
2037 #ifdef __ARMEB__
2038         ldr     r2, [r1]                /* r2 = 0123 */
2039         ldrh    r3, [r1, #0x04]         /* r3 = ..45 */
2040         mov     r1, r2, lsr #16         /* r1 = ..01 */
2041         orr     r3, r3, r2, lsl#16      /* r3 = 2345 */
2042         strh    r1, [r0]
2043         str     r3, [r0, #0x02]
2044 #else
2045         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
2046         ldr     r3, [r1]                /* r3 = 3210 */
2047         mov     r2, r2, lsl #16         /* r2 = 54.. */
2048         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
2049         strh    r3, [r0]
2050         str     r2, [r0, #0x02]
2051 #endif
2052         RET
2053         LMEMCPY_6_PAD
2054
2055 /*
2056  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2057  */
2058         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2059         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
2060         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2061 #ifdef __ARMEB__
2062         mov     r2, r2, lsr #8          /* r2 = .345 */
2063         orr     r2, r2, r3, lsl #24     /* r2 = 2345 */
2064 #else
2065         mov     r2, r2, lsl #8          /* r2 = 543. */
2066         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
2067 #endif
2068         strh    r1, [r0]
2069         str     r2, [r0, #0x02]
2070         RET
2071         LMEMCPY_6_PAD
2072
2073 /*
2074  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2075  */
2076         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2077         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2078         strh    r2, [r0]
2079         str     r3, [r0, #0x02]
2080         RET
2081         LMEMCPY_6_PAD
2082
2083 /*
2084  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2085  */
2086         ldrb    r3, [r1]                /* r3 = ...0 */
2087         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2088         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2089 #ifdef __ARMEB__
2090         mov     r3, r3, lsl #8          /* r3 = ..0. */
2091         orr     r3, r3, r2, lsr #24     /* r3 = ..01 */
2092         orr     r1, r1, r2, lsl #8      /* r1 = 2345 */
2093 #else
2094         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2095         mov     r1, r1, lsl #24         /* r1 = 5... */
2096         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
2097 #endif
2098         strh    r3, [r0]
2099         str     r1, [r0, #0x02]
2100         RET
2101         LMEMCPY_6_PAD
2102
2103 /*
2104  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2105  */
2106         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2107         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
2108 #ifdef __ARMEB__
2109         mov     r3, r2, lsr #24         /* r3 = ...0 */
2110         strb    r3, [r0]
2111         mov     r2, r2, lsl #8          /* r2 = 123. */
2112         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2113 #else
2114         strb    r2, [r0]
2115         mov     r2, r2, lsr #8          /* r2 = .321 */
2116         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
2117         mov     r1, r1, lsr #8          /* r1 = ...5 */
2118 #endif
2119         str     r2, [r0, #0x01]
2120         strb    r1, [r0, #0x05]
2121         RET
2122         LMEMCPY_6_PAD
2123
2124 /*
2125  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2126  */
2127         ldrb    r2, [r1]                /* r2 = ...0 */
2128         ldrh    r3, [r1, #0x01]         /* BE:r3 = ..12  LE:r3 = ..21 */
2129         ldrh    ip, [r1, #0x03]         /* BE:ip = ..34  LE:ip = ..43 */
2130         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2131         strb    r2, [r0]
2132         strh    r3, [r0, #0x01]
2133         strh    ip, [r0, #0x03]
2134         strb    r1, [r0, #0x05]
2135         RET
2136         LMEMCPY_6_PAD
2137
2138 /*
2139  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2140  */
2141         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2142         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
2143 #ifdef __ARMEB__
2144         mov     r3, r2, lsr #8          /* r3 = ...0 */
2145         strb    r3, [r0]
2146         mov     r2, r2, lsl #24         /* r2 = 1... */
2147         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2148 #else
2149         strb    r2, [r0]
2150         mov     r2, r2, lsr #8          /* r2 = ...1 */
2151         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
2152         mov     r1, r1, lsr #24         /* r1 = ...5 */
2153 #endif
2154         str     r2, [r0, #0x01]
2155         strb    r1, [r0, #0x05]
2156         RET
2157         LMEMCPY_6_PAD
2158
2159 /*
2160  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2161  */
2162         ldrb    r2, [r1]                /* r2 = ...0 */
2163         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2164         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2165         strb    r2, [r0]
2166         str     r3, [r0, #0x01]
2167         strb    r1, [r0, #0x05]
2168         RET
2169         LMEMCPY_6_PAD
2170
2171
2172 /******************************************************************************
2173  * Special case for 8 byte copies: r0 = dst, r1 = src, one 64-byte code slot per (dst & 3, src & 3) pair.
2174  * Register comments: digits = source byte offsets (MSB first), '.' = zero, 'x' = byte outside the copy. */
2175 #define LMEMCPY_8_LOG2  6       /* 64 bytes per alignment-case slot */
2176 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2
2177         LMEMCPY_8_PAD
2178 .Lmemcpy_8:
2179         and     r2, r1, #0x03           /* r2 = src & 3 */
2180         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
2181         ands    r2, r2, #0x0f           /* r2 = (dst & 3) << 2 | (src & 3) */
2182         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_8 (pc reads insn + 8) */
2183         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* r2 != 0: jump to slot r2 */
2184
2185 /*
2186  * 0000: dst is 32-bit aligned, src is 32-bit aligned (dispatcher falls through)
2187  */
2188         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2189         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2190         str     r2, [r0]
2191         str     r3, [r0, #0x04]
2192         RET
2193         LMEMCPY_8_PAD
2194
2195 /*
2196  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
2197  */
2198         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2199         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
2200         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2201 #ifdef __ARMEB__
2202         mov     r3, r3, lsl #8          /* r3 = 012. */
2203         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
2204         orr     r2, r1, r2, lsl #8      /* r2 = 4567 */
2205 #else
2206         mov     r3, r3, lsr #8          /* r3 = .210 */
2207         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
2208         mov     r1, r1, lsl #24         /* r1 = 7... */
2209         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
2210 #endif
2211         str     r3, [r0]
2212         str     r2, [r0, #0x04]
2213         RET
2214         LMEMCPY_8_PAD
2215
2216 /*
2217  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2218  */
2219         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2220         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2221         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2222 #ifdef __ARMEB__
2223         mov     r2, r2, lsl #16         /* r2 = 01.. */
2224         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2225         orr     r3, r1, r3, lsl #16     /* r3 = 4567 */
2226 #else
2227         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2228         mov     r3, r3, lsr #16         /* r3 = ..54 */
2229         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
2230 #endif
2231         str     r2, [r0]
2232         str     r3, [r0, #0x04]
2233         RET
2234         LMEMCPY_8_PAD
2235
2236 /*
2237  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
2238  */
2239         ldrb    r3, [r1]                /* r3 = ...0 */
2240         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2241         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
2242 #ifdef __ARMEB__
2243         mov     r3, r3, lsl #24         /* r3 = 0... */
2244         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
2245         mov     r2, r2, lsl #24         /* r2 = 4... */
2246         orr     r2, r2, r1, lsr #8      /* r2 = 4567 */
2247 #else
2248         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2249         mov     r2, r2, lsr #24         /* r2 = ...4 */
2250         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
2251 #endif
2252         str     r3, [r0]
2253         str     r2, [r0, #0x04]
2254         RET
2255         LMEMCPY_8_PAD
2256
2257 /*
2258  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2259  */
2260         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
2261         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
2262 #ifdef __ARMEB__
2263         mov     r1, r3, lsr #24         /* r1 = ...0 */
2264         strb    r1, [r0]
2265         mov     r1, r3, lsr #8          /* r1 = .012 */
2266         strb    r2, [r0, #0x07]
2267         mov     r3, r3, lsl #24         /* r3 = 3... */
2268         orr     r3, r3, r2, lsr #8      /* r3 = 3456 */
2269 #else
2270         strb    r3, [r0]
2271         mov     r1, r2, lsr #24         /* r1 = ...7 */
2272         strb    r1, [r0, #0x07]
2273         mov     r1, r3, lsr #8          /* r1 = .321 */
2274         mov     r3, r3, lsr #24         /* r3 = ...3 */
2275         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
2276 #endif
2277         strh    r1, [r0, #0x01]         /* low halfword of r1 = bytes 1-2 */
2278         str     r3, [r0, #0x03]
2279         RET
2280         LMEMCPY_8_PAD
2281
2282 /*
2283  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2284  */
2285         ldrb    r2, [r1]                /* r2 = ...0 */
2286         ldrh    r3, [r1, #0x01]         /* BE:r3 = ..12  LE:r3 = ..21 */
2287         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2288         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2289         strb    r2, [r0]
2290         strh    r3, [r0, #0x01]
2291         str     ip, [r0, #0x03]
2292         strb    r1, [r0, #0x07]
2293         RET
2294         LMEMCPY_8_PAD
2295
2296 /*
2297  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2298  */
2299         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2300         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2301         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2302 #ifdef __ARMEB__
2303         mov     ip, r2, lsr #8          /* ip = ...0 */
2304         strb    ip, [r0]
2305         mov     ip, r2, lsl #8          /* ip = .01. */
2306         orr     ip, ip, r3, lsr #24     /* ip = .012 */
2307         strb    r1, [r0, #0x07]
2308         mov     r3, r3, lsl #8          /* r3 = 345. */
2309         orr     r3, r3, r1, lsr #8      /* r3 = 3456 */
2310 #else
2311         strb    r2, [r0]                /* 0 */
2312         mov     ip, r1, lsr #8          /* ip = ...7 */
2313         strb    ip, [r0, #0x07]         /* 7 */
2314         mov     ip, r2, lsr #8          /* ip = ...1 */
2315         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2316         mov     r3, r3, lsr #8          /* r3 = .543 */
2317         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
2318 #endif
2319         strh    ip, [r0, #0x01]         /* low halfword of ip = bytes 1-2 */
2320         str     r3, [r0, #0x03]
2321         RET
2322         LMEMCPY_8_PAD
2323
2324 /*
2325  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2326  */
2327         ldrb    r3, [r1]                /* r3 = ...0 */
2328         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2329         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
2330         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2331         strb    r3, [r0]
2332         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
2333 #ifdef __ARMEB__
2334         strh    r3, [r0, #0x01]
2335         orr     r2, r2, ip, lsl #16     /* r2 = 3456 */
2336 #else
2337         strh    ip, [r0, #0x01]
2338         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
2339 #endif
2340         str     r2, [r0, #0x03]
2341         strb    r1, [r0, #0x07]
2342         RET
2343         LMEMCPY_8_PAD
2344
2345 /*
2346  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2347  */
2348         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2349         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2350         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2351 #ifdef __ARMEB__
2352         strh    r1, [r0]
2353         mov     r1, r3, lsr #16         /* r1 = ..45 */
2354         orr     r2, r1 ,r2, lsl #16     /* r2 = 2345 */
2355 #else
2356         strh    r2, [r0]
2357         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
2358         mov     r3, r3, lsr #16         /* r3 = ..76 */
2359 #endif
2360         str     r2, [r0, #0x02]
2361         strh    r3, [r0, #0x06]         /* low halfword of r3 = bytes 6-7 */
2362         RET
2363         LMEMCPY_8_PAD
2364
2365 /*
2366  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2367  */
2368         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2369         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2370         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
2371         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2372         strh    r1, [r0]
2373 #ifdef __ARMEB__
2374         mov     r1, r2, lsl #24         /* r1 = 2... */
2375         orr     r1, r1, r3, lsr #8      /* r1 = 2345 */
2376         orr     r3, ip, r3, lsl #8      /* r3 = 4567 */
2377 #else
2378         mov     r1, r2, lsr #24         /* r1 = ...2 */
2379         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
2380         mov     r3, r3, lsr #24         /* r3 = ...6 */
2381         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
2382 #endif
2383         str     r1, [r0, #0x02]
2384         strh    r3, [r0, #0x06]
2385         RET
2386         LMEMCPY_8_PAD
2387
2388 /*
2389  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2390  */
2391         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2392         ldr     ip, [r1, #0x02]         /* BE:ip = 2345  LE:ip = 5432 */
2393         ldrh    r3, [r1, #0x06]         /* BE:r3 = ..67  LE:r3 = ..76 */
2394         strh    r2, [r0]
2395         str     ip, [r0, #0x02]
2396         strh    r3, [r0, #0x06]
2397         RET
2398         LMEMCPY_8_PAD
2399
2400 /*
2401  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2402  */
2403         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 */
2404         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2405         ldrb    ip, [r1]                /* ip = ...0 */
2406         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
2407         strh    r1, [r0, #0x06]
2408 #ifdef __ARMEB__
2409         mov     r3, r3, lsr #24         /* r3 = ...5 */
2410         orr     r3, r3, r2, lsl #8      /* r3 = 2345 */
2411         mov     r2, r2, lsr #24         /* r2 = ...1 */
2412         orr     r2, r2, ip, lsl #8      /* r2 = ..01 */
2413 #else
2414         mov     r3, r3, lsl #24         /* r3 = 5... */
2415         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
2416         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
2417 #endif
2418         str     r3, [r0, #0x02]
2419         strh    r2, [r0]
2420         RET
2421         LMEMCPY_8_PAD
2422
2423 /*
2424  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2425  */
2426         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2427         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2428         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
2429         strh    r1, [r0, #0x05]         /* low halfword of r1 = bytes 5-6 */
2430 #ifdef __ARMEB__
2431         strb    r3, [r0, #0x07]
2432         mov     r1, r2, lsr #24         /* r1 = ...0 */
2433         strb    r1, [r0]
2434         mov     r2, r2, lsl #8          /* r2 = 123. */
2435         orr     r2, r2, r3, lsr #24     /* r2 = 1234 */
2436         str     r2, [r0, #0x01]
2437 #else
2438         strb    r2, [r0]
2439         mov     r1, r3, lsr #24         /* r1 = ...7 */
2440         strb    r1, [r0, #0x07]
2441         mov     r2, r2, lsr #8          /* r2 = .321 */
2442         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
2443         str     r2, [r0, #0x01]
2444 #endif
2445         RET
2446         LMEMCPY_8_PAD
2447
2448 /*
2449  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2450  */
2451         ldrb    r3, [r1]                /* r3 = ...0 */
2452         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
2453         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2454         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2455         strb    r3, [r0]
2456         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
2457 #ifdef __ARMEB__
2458         strh    ip, [r0, #0x05]
2459         orr     r2, r3, r2, lsl #16     /* r2 = 1234 */
2460 #else
2461         strh    r3, [r0, #0x05]
2462         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
2463 #endif
2464         str     r2, [r0, #0x01]
2465         strb    r1, [r0, #0x07]
2466         RET
2467         LMEMCPY_8_PAD
2468
2469 /*
2470  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2471  */
2472         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2473         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2474         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2475 #ifdef __ARMEB__
2476         mov     ip, r2, lsr #8          /* ip = ...0 */
2477         strb    ip, [r0]
2478         mov     ip, r2, lsl #24         /* ip = 1... */
2479         orr     ip, ip, r3, lsr #8      /* ip = 1234 */
2480         strb    r1, [r0, #0x07]
2481         mov     r1, r1, lsr #8          /* r1 = ...6 */
2482         orr     r1, r1, r3, lsl #8      /* r1 = 3456 */
2483 #else
2484         strb    r2, [r0]
2485         mov     ip, r2, lsr #8          /* ip = ...1 */
2486         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2487         mov     r2, r1, lsr #8          /* r2 = ...7 */
2488         strb    r2, [r0, #0x07]
2489         mov     r1, r1, lsl #8          /* r1 = .76. */
2490         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
2491 #endif
2492         str     ip, [r0, #0x01]
2493         strh    r1, [r0, #0x05]         /* low halfword of r1 = bytes 5-6 */
2494         RET
2495         LMEMCPY_8_PAD
2496
2497 /*
2498  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2499  */
2500         ldrb    r2, [r1]                /* r2 = ...0 */
2501         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2502         ldrh    r3, [r1, #0x05]         /* BE:r3 = ..56  LE:r3 = ..65 */
2503         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2504         strb    r2, [r0]
2505         str     ip, [r0, #0x01]
2506         strh    r3, [r0, #0x05]
2507         strb    r1, [r0, #0x07]
2508         RET
2509         LMEMCPY_8_PAD
2510
2511 /******************************************************************************
2512  * Special case for 12 byte copies
2513  */
2514 #define LMEMCPY_C_LOG2  7       /* 128 bytes */
2515 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2
2516         LMEMCPY_C_PAD
2517 .Lmemcpy_c:
2518         and     r2, r1, #0x03
2519         orr     r2, r2, r0, lsl #2
2520         ands    r2, r2, #0x0f
2521         sub     r3, pc, #0x14
2522         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2
2523
2524 /*
2525  * 0000: dst is 32-bit aligned, src is 32-bit aligned
2526  */
2527         ldr     r2, [r1]
2528         ldr     r3, [r1, #0x04]
2529         ldr     r1, [r1, #0x08]
2530         str     r2, [r0]
2531         str     r3, [r0, #0x04]
2532         str     r1, [r0, #0x08]
2533         RET
2534         LMEMCPY_C_PAD
2535
2536 /*
2537  * 0001: dst is 32-bit aligned, src is 8-bit aligned
2538  */
2539         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
2540         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2541         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2542         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2543 #ifdef __ARMEB__
2544         orr     r2, r2, ip, lsl #8      /* r2 = 89AB */
2545         str     r2, [r0, #0x08]
2546         mov     r2, ip, lsr #24         /* r2 = ...7 */
2547         orr     r2, r2, r3, lsl #8      /* r2 = 4567 */
2548         mov     r1, r1, lsl #8          /* r1 = 012. */
2549         orr     r1, r1, r3, lsr #24     /* r1 = 0123 */
2550 #else
2551         mov     r2, r2, lsl #24         /* r2 = B... */
2552         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
2553         str     r2, [r0, #0x08]
2554         mov     r2, ip, lsl #24         /* r2 = 7... */
2555         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
2556         mov     r1, r1, lsr #8          /* r1 = .210 */
2557         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
2558 #endif
2559         str     r2, [r0, #0x04]
2560         str     r1, [r0]
2561         RET
2562         LMEMCPY_C_PAD
2563
2564 /*
2565  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2566  */
2567         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2568         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2569         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2570         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2571 #ifdef __ARMEB__
2572         mov     r2, r2, lsl #16         /* r2 = 01.. */
2573         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2574         str     r2, [r0]
2575         mov     r3, r3, lsl #16         /* r3 = 45.. */
2576         orr     r3, r3, ip, lsr #16     /* r3 = 4567 */
2577         orr     r1, r1, ip, lsl #16     /* r1 = 89AB */
2578 #else
2579         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2580         str     r2, [r0]
2581         mov     r3, r3, lsr #16         /* r3 = ..54 */
2582         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
2583         mov     r1, r1, lsl #16         /* r1 = BA.. */
2584         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
2585 #endif
2586         str     r3, [r0, #0x04]
2587         str     r1, [r0, #0x08]
2588         RET
2589         LMEMCPY_C_PAD
2590
2591 /*
2592  * 0011: dst is 32-bit aligned, src is 8-bit aligned
2593  */
2594         ldrb    r2, [r1]                /* r2 = ...0 */
2595         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2596         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2597         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2598 #ifdef __ARMEB__
2599         mov     r2, r2, lsl #24         /* r2 = 0... */
2600         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
2601         str     r2, [r0]
2602         mov     r3, r3, lsl #24         /* r3 = 4... */
2603         orr     r3, r3, ip, lsr #8      /* r3 = 4567 */
2604         mov     r1, r1, lsr #8          /* r1 = .9AB */
2605         orr     r1, r1, ip, lsl #24     /* r1 = 89AB */
2606 #else
2607         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
2608         str     r2, [r0]
2609         mov     r3, r3, lsr #24         /* r3 = ...4 */
2610         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
2611         mov     r1, r1, lsl #8          /* r1 = BA9. */
2612         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
2613 #endif
2614         str     r3, [r0, #0x04]
2615         str     r1, [r0, #0x08]
2616         RET
2617         LMEMCPY_C_PAD
2618
2619 /*
2620  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2621  */
2622         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2623         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2624         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
2625         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
2626         strh    r1, [r0, #0x01]
2627 #ifdef __ARMEB__
2628         mov     r1, r2, lsr #24         /* r1 = ...0 */
2629         strb    r1, [r0]
2630         mov     r1, r2, lsl #24         /* r1 = 3... */
2631         orr     r2, r1, r3, lsr #8      /* r2 = 3456 */
2632         mov     r1, r3, lsl #24         /* r1 = 7... */
2633         orr     r1, r1, ip, lsr #8      /* r1 = 789A */
2634 #else
2635         strb    r2, [r0]
2636         mov     r1, r2, lsr #24         /* r1 = ...3 */
2637         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
2638         mov     r1, r3, lsr #24         /* r1 = ...7 */
2639         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
2640         mov     ip, ip, lsr #24         /* ip = ...B */
2641 #endif
2642         str     r2, [r0, #0x03]
2643         str     r1, [r0, #0x07]
2644         strb    ip, [r0, #0x0b]
2645         RET
2646         LMEMCPY_C_PAD
2647
2648 /*
2649  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2650  */
2651         ldrb    r2, [r1]
2652         ldrh    r3, [r1, #0x01]
2653         ldr     ip, [r1, #0x03]
2654         strb    r2, [r0]
2655         ldr     r2, [r1, #0x07]
2656         ldrb    r1, [r1, #0x0b]
2657         strh    r3, [r0, #0x01]
2658         str     ip, [r0, #0x03]
2659         str     r2, [r0, #0x07]
2660         strb    r1, [r0, #0x0b]
2661         RET
2662         LMEMCPY_C_PAD
2663
2664 /*
2665  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2666  */
2667         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2668         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2669         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2670         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2671 #ifdef __ARMEB__
2672         mov     r2, r2, ror #8          /* r2 = 1..0 */
2673         strb    r2, [r0]
2674         mov     r2, r2, lsr #16         /* r2 = ..1. */
2675         orr     r2, r2, r3, lsr #24     /* r2 = ..12 */
2676         strh    r2, [r0, #0x01]
2677         mov     r2, r3, lsl #8          /* r2 = 345. */
2678         orr     r3, r2, ip, lsr #24     /* r3 = 3456 */
2679         mov     r2, ip, lsl #8          /* r2 = 789. */
2680         orr     r2, r2, r1, lsr #8      /* r2 = 789A */
2681 #else
2682         strb    r2, [r0]
2683         mov     r2, r2, lsr #8          /* r2 = ...1 */
2684         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2685         strh    r2, [r0, #0x01]
2686         mov     r2, r3, lsr #8          /* r2 = .543 */
2687         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
2688         mov     r2, ip, lsr #8          /* r2 = .987 */
2689         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
2690         mov     r1, r1, lsr #8          /* r1 = ...B */
2691 #endif
2692         str     r3, [r0, #0x03]
2693         str     r2, [r0, #0x07]
2694         strb    r1, [r0, #0x0b]
2695         RET
2696         LMEMCPY_C_PAD
2697
2698 /*
2699  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2700  */
2701         ldrb    r2, [r1]
2702         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2703         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2704         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2705         strb    r2, [r0]
2706 #ifdef __ARMEB__
2707         mov     r2, r3, lsr #16         /* r2 = ..12 */
2708         strh    r2, [r0, #0x01]
2709         mov     r3, r3, lsl #16         /* r3 = 34.. */
2710         orr     r3, r3, ip, lsr #16     /* r3 = 3456 */
2711         mov     ip, ip, lsl #16         /* ip = 78.. */
2712         orr     ip, ip, r1, lsr #16     /* ip = 789A */
2713         mov     r1, r1, lsr #8          /* r1 = .9AB */
2714 #else
2715         strh    r3, [r0, #0x01]
2716         mov     r3, r3, lsr #16         /* r3 = ..43 */
2717         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
2718         mov     ip, ip, lsr #16         /* ip = ..87 */
2719         orr     ip, ip, r1, lsl #16     /* ip = A987 */
2720         mov     r1, r1, lsr #16         /* r1 = ..xB */
2721 #endif
2722         str     r3, [r0, #0x03]
2723         str     ip, [r0, #0x07]
2724         strb    r1, [r0, #0x0b]
2725         RET
2726         LMEMCPY_C_PAD
2727
2728 /*
2729  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2730  */
2731         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
2732         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2733         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
2734         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2735 #ifdef __ARMEB__
2736         strh    r1, [r0]
2737         mov     r1, ip, lsl #16         /* r1 = 23.. */
2738         orr     r1, r1, r3, lsr #16     /* r1 = 2345 */
2739         mov     r3, r3, lsl #16         /* r3 = 67.. */
2740         orr     r3, r3, r2, lsr #16     /* r3 = 6789 */
2741 #else
2742         strh    ip, [r0]
2743         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
2744         mov     r3, r3, lsr #16         /* r3 = ..76 */
2745         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
2746         mov     r2, r2, lsr #16         /* r2 = ..BA */
2747 #endif
2748         str     r1, [r0, #0x02]
2749         str     r3, [r0, #0x06]
2750         strh    r2, [r0, #0x0a]
2751         RET
2752         LMEMCPY_C_PAD
2753
2754 /*
2755  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2756  */
2757         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2758         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2759         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
2760         strh    ip, [r0]
2761         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2762         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2763 #ifdef __ARMEB__
2764         mov     r2, r2, lsl #24         /* r2 = 2... */
2765         orr     r2, r2, r3, lsr #8      /* r2 = 2345 */
2766         mov     r3, r3, lsl #24         /* r3 = 6... */
2767         orr     r3, r3, ip, lsr #8      /* r3 = 6789 */
2768         orr     r1, r1, ip, lsl #8      /* r1 = 89AB */
2769 #else
2770         mov     r2, r2, lsr #24         /* r2 = ...2 */
2771         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
2772         mov     r3, r3, lsr #24         /* r3 = ...6 */
2773         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
2774         mov     r1, r1, lsl #8          /* r1 = ..B. */
2775         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
2776 #endif
2777         str     r2, [r0, #0x02]
2778         str     r3, [r0, #0x06]
2779         strh    r1, [r0, #0x0a]
2780         RET
2781         LMEMCPY_C_PAD
2782
2783 /*
2784  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2785  */
2786         ldrh    r2, [r1]
2787         ldr     r3, [r1, #0x02]
2788         ldr     ip, [r1, #0x06]
2789         ldrh    r1, [r1, #0x0a]
2790         strh    r2, [r0]
2791         str     r3, [r0, #0x02]
2792         str     ip, [r0, #0x06]
2793         strh    r1, [r0, #0x0a]
2794         RET
2795         LMEMCPY_C_PAD
2796
2797 /*
2798  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2799  */
2800         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 */
2801         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
2802         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
2803         strh    ip, [r0, #0x0a]
2804         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2805         ldrb    r1, [r1]                /* r1 = ...0 */
2806 #ifdef __ARMEB__
2807         mov     r2, r2, lsr #24         /* r2 = ...9 */
2808         orr     r2, r2, r3, lsl #8      /* r2 = 6789 */
2809         mov     r3, r3, lsr #24         /* r3 = ...5 */
2810         orr     r3, r3, ip, lsl #8      /* r3 = 2345 */
2811         mov     r1, r1, lsl #8          /* r1 = ..0. */
2812         orr     r1, r1, ip, lsr #24     /* r1 = ..01 */
2813 #else
2814         mov     r2, r2, lsl #24         /* r2 = 9... */
2815         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
2816         mov     r3, r3, lsl #24         /* r3 = 5... */
2817         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
2818         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
2819 #endif
2820         str     r2, [r0, #0x06]
2821         str     r3, [r0, #0x02]
2822         strh    r1, [r0]
2823         RET
2824         LMEMCPY_C_PAD
2825
2826 /*
2827  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2828  */
2829         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2830         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2831         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2832 #ifdef __ARMEB__
2833         mov     r3, r2, lsr #24         /* r3 = ...0 */
2834         strb    r3, [r0]
2835         mov     r2, r2, lsl #8          /* r2 = 123. */
2836         orr     r2, r2, ip, lsr #24     /* r2 = 1234 */
2837         str     r2, [r0, #0x01]
2838         mov     r2, ip, lsl #8          /* r2 = 567. */
2839         orr     r2, r2, r1, lsr #24     /* r2 = 5678 */
2840         str     r2, [r0, #0x05]
2841         mov     r2, r1, lsr #8          /* r2 = .89A (strh stores 9A) */
2842         strh    r2, [r0, #0x09]
2843         strb    r1, [r0, #0x0b]
2844 #else
2845         strb    r2, [r0]
2846         mov     r3, r2, lsr #8          /* r3 = .321 */
2847         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2848         str     r3, [r0, #0x01]
2849         mov     r3, ip, lsr #8          /* r3 = .765 */
2850         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2851         str     r3, [r0, #0x05]
2852         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2853         strh    r1, [r0, #0x09]
2854         mov     r1, r1, lsr #16         /* r1 = ...B */
2855         strb    r1, [r0, #0x0b]
2856 #endif
2857         RET
2858         LMEMCPY_C_PAD
2859
2860 /*
2861  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2862  */
2863         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2864         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2865         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2866         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2867         strb    r2, [r0, #0x0b]
2868 #ifdef __ARMEB__
2869         strh    r3, [r0, #0x09]
2870         mov     r3, r3, lsr #16         /* r3 = ..78 */
2871         orr     r3, r3, ip, lsl #16     /* r3 = 5678 */
2872         mov     ip, ip, lsr #16         /* ip = ..34 */
2873         orr     ip, ip, r1, lsl #16     /* ip = 1234 */
2874         mov     r1, r1, lsr #16         /* r1 = ..x0 */
2875 #else
2876         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2877         strh    r2, [r0, #0x09]
2878         mov     r3, r3, lsl #16         /* r3 = 87.. */
2879         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2880         mov     ip, ip, lsl #16         /* ip = 43.. */
2881         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2882         mov     r1, r1, lsr #8          /* r1 = .210 */
2883 #endif
2884         str     r3, [r0, #0x05]
2885         str     ip, [r0, #0x01]
2886         strb    r1, [r0]
2887         RET
2888         LMEMCPY_C_PAD
2889
2890 /*
2891  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2892  */
2893 #ifdef __ARMEB__
2894         ldrh    r2, [r1, #0x0a]         /* r2 = ..AB */
2895         ldr     ip, [r1, #0x06]         /* ip = 6789 */
2896         ldr     r3, [r1, #0x02]         /* r3 = 2345 */
2897         ldrh    r1, [r1]                /* r1 = ..01 */
2898         strb    r2, [r0, #0x0b]
2899         mov     r2, r2, lsr #8          /* r2 = ...A */
2900         orr     r2, r2, ip, lsl #8      /* r2 = 789A */
2901         mov     ip, ip, lsr #8          /* ip = .678 */
2902         orr     ip, ip, r3, lsl #24     /* ip = 5678 */
2903         mov     r3, r3, lsr #8          /* r3 = .234 */
2904         orr     r3, r3, r1, lsl #24     /* r3 = 1234 */
2905         mov     r1, r1, lsr #8          /* r1 = ...0 */
2906         strb    r1, [r0]
2907         str     r3, [r0, #0x01]
2908         str     ip, [r0, #0x05]
2909         strh    r2, [r0, #0x09]
2910 #else
2911         ldrh    r2, [r1]                /* r2 = ..10 */
2912         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2913         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2914         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2915         strb    r2, [r0]
2916         mov     r2, r2, lsr #8          /* r2 = ...1 */
2917         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2918         mov     r3, r3, lsr #24         /* r3 = ...5 */
2919         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2920         mov     ip, ip, lsr #24         /* ip = ...9 */
2921         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2922         mov     r1, r1, lsr #8          /* r1 = ...B */
2923         str     r2, [r0, #0x01]
2924         str     r3, [r0, #0x05]
2925         strh    ip, [r0, #0x09]
2926         strb    r1, [r0, #0x0b]
2927 #endif
2928         RET
2929         LMEMCPY_C_PAD
2930
2931 /*
2932  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2933  */
2934         ldrb    r2, [r1]
2935         ldr     r3, [r1, #0x01]
2936         ldr     ip, [r1, #0x05]
2937         strb    r2, [r0]
2938         ldrh    r2, [r1, #0x09]
2939         ldrb    r1, [r1, #0x0b]
2940         str     r3, [r0, #0x01]
2941         str     ip, [r0, #0x05]
2942         strh    r2, [r0, #0x09]
2943         strb    r1, [r0, #0x0b]
2944         RET
2945 END(memcpy)
2946 #endif /* _ARM_ARCH_5E */
2947
2948 #ifdef GPROF                    /* the five stubs below are assembled only when GPROF is defined */
2949
2950 ENTRY(user)                     /* stub `user': its whole body is one nop */
2951         nop                     /* no RET follows; NOTE(review): presumably an address marker for the profiler, not a callable routine -- confirm */
2952 END(user)
2953 ENTRY(btrap)                    /* stub `btrap': same single-nop shape as `user' */
2954         nop                     /* placeholder instruction only, no RET */
2955 END(btrap)
2956 ENTRY(etrap)                    /* stub `etrap': same single-nop shape */
2957         nop                     /* placeholder instruction only, no RET */
2958 END(etrap)
2959 ENTRY(bintr)                    /* stub `bintr': same single-nop shape */
2960         nop                     /* placeholder instruction only, no RET */
2961 END(bintr)
2962 ENTRY(eintr)                    /* stub `eintr': same single-nop shape; last of the group */
2963         nop                     /* placeholder instruction only, no RET */
2964 END(eintr)
2965 #endif                          /* GPROF */