sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  *
  76  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  77  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  78  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  79  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  80  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  81  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  82  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  83  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  84  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  85  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  86  * POSSIBILITY OF SUCH DAMAGE.
  87  */
  88
  89 #include <machine/asm.h>
  90 __FBSDID("$FreeBSD$");
  91
  92 #include "assym.inc"
  93
  94         .syntax unified
  95
  96 .L_arm_memcpy:                      /* -> _arm_memcpy; memcpy calls through the word stored there when non-zero */
  97         .word   _C_LABEL(_arm_memcpy)
  98 .L_arm_bzero:                       /* -> _arm_bzero; bzero calls through the word stored there when non-zero */
  99         .word   _C_LABEL(_arm_bzero)
 100 .L_min_memcpy_size:                 /* -> _min_memcpy_size; shorter copies skip the _arm_memcpy call */
 101         .word   _C_LABEL(_min_memcpy_size)
 102 .L_min_bzero_size:                  /* -> _min_bzero_size; shorter clears skip the _arm_bzero call */
 103         .word   _C_LABEL(_min_bzero_size)
 104 /*
 105  * memset: Sets a block of memory to the specified value
 106  * bzero (first below) enters with r0 = dest, r1 = length and joins do_memset.
 107  * On entry (memset):
 108  *   r0 - dest address
 109  *   r1 - byte to write
 110  *   r2 - number of bytes to write
 111  *
 112  * On exit:
 113  *   r0 - dest address
 114  */
 115 /* LINTSTUB: Func: void bzero(void *, size_t) */
 116 ENTRY(bzero)
 117         ldr     r3, .L_arm_bzero        /* r3 = &_arm_bzero */
 118         ldr     r3, [r3]                /* r3 = word stored in _arm_bzero */
 119         cmp     r3, #0
 120         beq     .Lnormal0               /* Nothing installed: use the generic code */
 121         ldr     r2, .L_min_bzero_size
 122         ldr     r2, [r2]                /* r2 = _min_bzero_size */
 123         cmp     r1, r2
 124         blt     .Lnormal0               /* Length below the threshold (signed compare) */
 125         stmfd   sp!, {r0, r1, lr}       /* Keep dest, length and return address across the call */
 126         mov     r2, #0                  /* Third argument is 0; its meaning is not visible here */
 127         mov     lr, pc                  /* Return address = the cmp below (pc reads 8 bytes ahead) */
 128         mov     pc, r3                  /* Call the routine held in _arm_bzero */
 129         cmp     r0, #0                  /* Result 0: done; anything else falls back to do_memset */
 130         ldmfd   sp!, {r0, r1, lr}       /* Restore; ldm leaves the flags from the cmp intact */
 131         RETeq
 132 .Lnormal0:
 133         mov     r3, #0x00               /* Fill value = 0 */
 134         b       do_memset               /* r0 = dest, r1 = length, r3 = fill byte */
 135 END(bzero)
 136 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 137 ENTRY(memset)
 138         and     r3, r1, #0xff           /* We deal with bytes */
 139         mov     r1, r2                  /* r1 = byte count from here on */
 140 do_memset:                              /* bzero joins here: r0 = dest, r1 = count, r3 = fill byte */
 141         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 142         mov     ip, r0                  /* Work through ip so r0 survives as the return value */
 143         blt     .Lmemset_lessthanfour
 144
 145         /* Ok first we will word align the address */
 146         ands    r2, ip, #0x03           /* Get the bottom two bits */
 147         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 148
 149         /* We are now word aligned */
 150 .Lmemset_wordaligned:
 151         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 152 #ifdef _ARM_ARCH_5E
 153         tst     ip, #0x04               /* Quad-align for armv5e */
 154 #else
 155         cmp     r1, #0x10
 156 #endif
 157         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 158 #ifdef _ARM_ARCH_5E
 159         subne   r1, r1, #0x04           /* Quad-align if necessary */
 160         strne   r3, [ip], #0x04
 161         cmp     r1, #0x10
 162 #endif
 163         blt     .Lmemset_loop4          /* If less than 16 then use words */
 164         mov     r2, r3                  /* Duplicate data */
 165         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 166         blt     .Lmemset_loop32
 167
 168         /* Do 128 bytes at a time */
 169 .Lmemset_loop128:
 170         subs    r1, r1, #0x80           /* Negative result: fewer than 128 bytes were left, stores are skipped */
 171 #ifdef _ARM_ARCH_5E
 172         strdge  r2, [ip], #0x08
 173         strdge  r2, [ip], #0x08
 174         strdge  r2, [ip], #0x08
 175         strdge  r2, [ip], #0x08
 176         strdge  r2, [ip], #0x08
 177         strdge  r2, [ip], #0x08
 178         strdge  r2, [ip], #0x08
 179         strdge  r2, [ip], #0x08
 180         strdge  r2, [ip], #0x08
 181         strdge  r2, [ip], #0x08
 182         strdge  r2, [ip], #0x08
 183         strdge  r2, [ip], #0x08
 184         strdge  r2, [ip], #0x08
 185         strdge  r2, [ip], #0x08
 186         strdge  r2, [ip], #0x08
 187         strdge  r2, [ip], #0x08
 188 #else
 189         stmiage ip!, {r2-r3}
 190         stmiage ip!, {r2-r3}
 191         stmiage ip!, {r2-r3}
 192         stmiage ip!, {r2-r3}
 193         stmiage ip!, {r2-r3}
 194         stmiage ip!, {r2-r3}
 195         stmiage ip!, {r2-r3}
 196         stmiage ip!, {r2-r3}
 197         stmiage ip!, {r2-r3}
 198         stmiage ip!, {r2-r3}
 199         stmiage ip!, {r2-r3}
 200         stmiage ip!, {r2-r3}
 201         stmiage ip!, {r2-r3}
 202         stmiage ip!, {r2-r3}
 203         stmiage ip!, {r2-r3}
 204         stmiage ip!, {r2-r3}
 205 #endif
 206         bgt     .Lmemset_loop128        /* Flags still come from the subs above */
 207         RETeq                   /* Zero length so just exit */
 208
 209         add     r1, r1, #0x80           /* Adjust for extra sub */
 210
 211         /* Do 32 bytes at a time */
 212 .Lmemset_loop32:
 213         subs    r1, r1, #0x20
 214 #ifdef _ARM_ARCH_5E
 215         strdge  r2, [ip], #0x08
 216         strdge  r2, [ip], #0x08
 217         strdge  r2, [ip], #0x08
 218         strdge  r2, [ip], #0x08
 219 #else
 220         stmiage ip!, {r2-r3}
 221         stmiage ip!, {r2-r3}
 222         stmiage ip!, {r2-r3}
 223         stmiage ip!, {r2-r3}
 224 #endif
 225         bgt     .Lmemset_loop32
 226         RETeq                   /* Zero length so just exit */
 227
 228         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 229
 230         /* Deal with 16 bytes or more */
 231 #ifdef _ARM_ARCH_5E
 232         strdge  r2, [ip], #0x08
 233         strdge  r2, [ip], #0x08
 234 #else
 235         stmiage ip!, {r2-r3}
 236         stmiage ip!, {r2-r3}
 237 #endif
 238         RETeq                   /* Zero length so just exit */
 239
 240         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 241
 242         /* We have at least 4 bytes so copy as words */
 243 .Lmemset_loop4:
 244         subs    r1, r1, #0x04
 245         strge   r3, [ip], #0x04
 246         bgt     .Lmemset_loop4
 247         RETeq                   /* Zero length so just exit */
 248
 249 #ifdef _ARM_ARCH_5E
 250         /* Compensate for 64-bit alignment check */
 251         adds    r1, r1, #0x04
 252         RETeq
 253         cmp     r1, #2
 254 #else
 255         cmp     r1, #-2                 /* r1 is (bytes left - 4) here, so this tests bytes left against 2 */
 256 #endif
 257
 258         strb    r3, [ip], #0x01         /* Set 1 byte */
 259         strbge  r3, [ip], #0x01         /* Set another byte */
 260         strbgt  r3, [ip]                /* and a third */
 261         RET                     /* Exit */
 262
 263 .Lmemset_wordunaligned:
 264         rsb     r2, r2, #0x004          /* r2 = 4 - (addr & 3): bytes needed to reach word alignment */
 265         strb    r3, [ip], #0x01         /* Set 1 byte */
 266         cmp     r2, #0x02
 267         strbge  r3, [ip], #0x01         /* Set another byte */
 268         sub     r1, r1, r2              /* Account for the alignment bytes (flags untouched) */
 269         strbgt  r3, [ip], #0x01         /* and a third */
 270         cmp     r1, #0x04               /* At least 4 bytes left? */
 271         bge     .Lmemset_wordaligned    /* Yup */
 272
 273 .Lmemset_lessthanfour:
 274         cmp     r1, #0x00
 275         RETeq                   /* Zero length so exit */
 276         strb    r3, [ip], #0x01         /* Set 1 byte */
 277         cmp     r1, #0x02
 278         strbge  r3, [ip], #0x01         /* Set another byte */
 279         strbgt  r3, [ip]                /* and a third */
 280         RET                     /* Exit */
 281 EEND(memset)
 282 END(bzero)                      /* NOTE(review): memset opens with ENTRY but closes with EEND, and END(bzero) also appears before memset - confirm the pairing is intended */
 283
 284 ENTRY(bcmp)                     /* r0 = b1, r1 = b2, r2 = len; returns r0 = 0 if equal, else b1 byte - b2 byte at the first mismatch */
 285         mov     ip, r0                  /* ip walks b1 so r0 is free for the result */
 286         cmp     r2, #0x06
 287         beq     .Lmemcmp_6bytes         /* Dedicated path for exactly 6 bytes */
 288         mov     r0, #0x00
 289
 290         /* Are both addresses aligned the same way? */
 291         cmp     r2, #0x00
 292         eorsne  r3, ip, r1              /* Only when len != 0: r3 = b1 ^ b2, Z set if identical */
 293         RETeq                   /* len == 0, or same addresses! */
 294         tst     r3, #0x03               /* Low two address bits differ? */
 295         subne   r2, r2, #0x01           /* The byte loop counts from len - 1 */
 296         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
 297
 298         /* Word-align the addresses, if necessary */
 299         sub     r3, r1, #0x05
 300         ands    r3, r3, #0x03           /* r3 = (b2 - 5) & 3: 0 when three bytes are needed to align */
 301         add     r3, r3, r3, lsl #1      /* r3 *= 3 (ands flags are kept) */
 302         addne   pc, pc, r3, lsl #3      /* Skip r3 / 3 compare blocks of 6 instructions (24 bytes) each */
 303         nop                             /* Padding: pc above reads as the address just past this nop */
 304
 305         /* Compare up to 3 bytes */
 306         ldrb    r0, [ip], #0x01
 307         ldrb    r3, [r1], #0x01
 308         subs    r0, r0, r3
 309         RETne
 310         subs    r2, r2, #0x01
 311         RETeq
 312
 313         /* Compare up to 2 bytes */
 314         ldrb    r0, [ip], #0x01
 315         ldrb    r3, [r1], #0x01
 316         subs    r0, r0, r3
 317         RETne
 318         subs    r2, r2, #0x01
 319         RETeq
 320
 321         /* Compare 1 byte */
 322         ldrb    r0, [ip], #0x01
 323         ldrb    r3, [r1], #0x01
 324         subs    r0, r0, r3
 325         RETne
 326         subs    r2, r2, #0x01
 327         RETeq
 328
 329         /* Compare 4 bytes at a time, if possible */
 330         subs    r2, r2, #0x04
 331         bcc     .Lmemcmp_bytewise       /* Fewer than 4 bytes left */
 332 .Lmemcmp_word_aligned:
 333         ldr     r0, [ip], #0x04
 334         ldr     r3, [r1], #0x04
 335         subs    r2, r2, #0x04
 336         cmpcs   r0, r3                  /* Compare only while another full word follows */
 337         beq     .Lmemcmp_word_aligned
 338         sub     r0, r0, r3              /* Word difference; non-zero means this word mismatched */
 339
 340         /* Correct for extra subtraction, and check if done */
 341         adds    r2, r2, #0x04
 342         cmpeq   r0, #0x00               /* If done, did all bytes match? */
 343         RETeq                   /* Yup. Just return */
 344
 345         /* Re-do the final word byte-wise */
 346         sub     ip, ip, #0x04
 347         sub     r1, r1, #0x04
 348
 349 .Lmemcmp_bytewise:
 350         add     r2, r2, #0x03           /* r2 = bytes left - 1 */
 351 .Lmemcmp_bytewise2:
 352         ldrb    r0, [ip], #0x01
 353         ldrb    r3, [r1], #0x01
 354         subs    r2, r2, #0x01
 355         cmpcs   r0, r3                  /* Skipped on the last byte, which ends the loop */
 356         beq     .Lmemcmp_bytewise2
 357         sub     r0, r0, r3              /* Difference of the last pair of bytes loaded */
 358         RET
 359
 360         /*
 361          * 6 byte compares are very common, thanks to the network stack.
 362          * This code is hand-scheduled to reduce the number of stalls for
 363          * load results. Everything else being equal, this will be ~32%
 364          * faster than a byte-wise memcmp.
 365          */
 366         .align  5
 367 .Lmemcmp_6bytes:
 368         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
 369         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
 370         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
 371         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
 372         ldrbeq  r3, [ip, #0x01]         /* r3 = b1#1 */
 373         RETne                   /* Return if mismatch on #0 */
 374         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
 375         ldrbeq  r3, [r1, #0x02]         /* r3 = b2#2 */
 376         ldrbeq  r0, [ip, #0x02]         /* r0 = b1#2 */
 377         RETne                   /* Return if mismatch on #1 */
 378         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
 379         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
 380         ldrbeq  r3, [ip, #0x03]         /* r3 = b1#3 */
 381         RETne                   /* Return if mismatch on #2 */
 382         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
 383         ldrbeq  r3, [r1, #0x04]         /* r3 = b2#4 */
 384         ldrbeq  r0, [ip, #0x04]         /* r0 = b1#4 */
 385         RETne                   /* Return if mismatch on #3 */
 386         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
 387         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
 388         ldrbeq  r3, [ip, #0x05]         /* r3 = b1#5 */
 389         RETne                   /* Return if mismatch on #4 */
 390         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
 391         RET
 392 END(bcmp)
 393
 394 ENTRY(bcopy)
 395         /* XOR-swap r0 and r1 (no scratch register) so they are in memmove's dst, src order */
 396         eor     r0, r1, r0
 397         eor     r1, r0, r1
 398         eor     r0, r1, r0
 399 EENTRY(memmove)                 /* r0 = dst, r1 = src, r2 = len */
 400         /* Do the buffers overlap? */
 401         cmp     r0, r1
 402         RETeq           /* Bail now if src/dst are the same */
 403         subcc   r3, r0, r1      /* carry clear, dst below src: r3 = dst - src */
 404         subcs   r3, r1, r0      /* carry set, dst above src: r3 = src - dst */
 405         cmp     r3, r2          /* unsigned compare of r3 with len */
 406         bcc     PIC_SYM(_C_LABEL(memcpy), PLT)  /* r3 < len: continue in memcpy. NOTE(review): r3 is lower minus higher address, so it wraps - confirm the intended test */
 407
 408         /* Determine copy direction */
 409         cmp     r1, r0
 410         bcc     .Lmemmove_backwards     /* src below dst: copy from the end downwards */
 411
 412         moveq   r0, #0                  /* Flags are from cmp r1, r0; the equal case already returned above */
 413         RETeq
 414
 415         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
 416         subs    r2, r2, #4              /* r2 is kept biased by -4 from here */
 417         blt     .Lmemmove_fl4           /* less than 4 bytes */
 418         ands    r12, r0, #3
 419         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
 420         ands    r12, r1, #3
 421         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
 422
 423 .Lmemmove_ft8:
 424         /* We have aligned source and destination */
 425         subs    r2, r2, #8
 426         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
 427         subs    r2, r2, #0x14
 428         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
 429         stmdb   sp!, {r4}               /* borrow r4 */
 430
 431         /* blat 32 bytes at a time */
 432         /* XXX for really big copies perhaps we should use more registers */
 433 .Lmemmove_floop32:
 434         ldmia   r1!, {r3, r4, r12, lr}
 435         stmia   r0!, {r3, r4, r12, lr}
 436         ldmia   r1!, {r3, r4, r12, lr}
 437         stmia   r0!, {r3, r4, r12, lr}
 438         subs    r2, r2, #0x20
 439         bge     .Lmemmove_floop32
 440
 441         cmn     r2, #0x10               /* r2 + 16 >= 0: at least 16 bytes remain */
 442         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 443         stmiage r0!, {r3, r4, r12, lr}
 444         subge   r2, r2, #0x10
 445         ldmia   sp!, {r4}               /* return r4 */
 446
 447 .Lmemmove_fl32:
 448         adds    r2, r2, #0x14           /* Undo the -0x14 bias; flags select the 12-byte loop */
 449
 450         /* blat 12 bytes at a time */
 451 .Lmemmove_floop12:
 452         ldmiage r1!, {r3, r12, lr}
 453         stmiage r0!, {r3, r12, lr}
 454         subsge  r2, r2, #0x0c
 455         bge     .Lmemmove_floop12
 456
 457 .Lmemmove_fl12:
 458         adds    r2, r2, #8              /* Undo the -8 bias */
 459         blt     .Lmemmove_fl4
 460
 461         subs    r2, r2, #4
 462         ldrlt   r3, [r1], #4            /* 4..7 bytes left: move one word */
 463         strlt   r3, [r0], #4
 464         ldmiage r1!, {r3, r12}          /* 8..11 bytes left: move two words */
 465         stmiage r0!, {r3, r12}
 466         subge   r2, r2, #4
 467
 468 .Lmemmove_fl4:
 469         /* less than 4 bytes to go */
 470         adds    r2, r2, #4              /* Undo the -4 bias: r2 = bytes left (0..3) */
 471         ldmiaeq sp!, {r0, pc}           /* done */
 472
 473         /* copy the crud byte at a time */
 474         cmp     r2, #2
 475         ldrb    r3, [r1], #1
 476         strb    r3, [r0], #1
 477         ldrbge  r3, [r1], #1
 478         strbge  r3, [r0], #1
 479         ldrbgt  r3, [r1], #1
 480         strbgt  r3, [r0], #1
 481         ldmia   sp!, {r0, pc}           /* Reload the saved dest into r0 and return */
 482
 483         /* erg - unaligned destination (forward copy); r12 = dst & 3 on entry */
 484 .Lmemmove_fdestul:
 485         rsb     r12, r12, #4            /* r12 = bytes needed to word-align dst (1..3) */
 486         cmp     r12, #2
 487
 488         /* align destination with byte copies */
 489         ldrb    r3, [r1], #1
 490         strb    r3, [r0], #1
 491         ldrbge  r3, [r1], #1
 492         strbge  r3, [r0], #1
 493         ldrbgt  r3, [r1], #1
 494         strbgt  r3, [r0], #1
 495         subs    r2, r2, r12
 496         blt     .Lmemmove_fl4           /* less than 4 bytes */
 497
 498         ands    r12, r1, #3
 499         beq     .Lmemmove_ft8           /* we have an aligned source */
 500
 501         /* erg - unaligned source */
 502         /* This is where it gets nasty ... */
 503 .Lmemmove_fsrcul:
 504         bic     r1, r1, #3              /* Round src down to a word boundary */
 505         ldr     lr, [r1], #4            /* lr = word holding the first wanted source bytes */
 506         cmp     r12, #2                 /* r12 = src & 3 selects the shift amounts */
 507         bgt     .Lmemmove_fsrcul3
 508         beq     .Lmemmove_fsrcul2
 509         cmp     r2, #0x0c
 510         blt     .Lmemmove_fsrcul1loop4
 511         sub     r2, r2, #0x0c
 512         stmdb   sp!, {r4, r5}
 513
 514 .Lmemmove_fsrcul1loop16:
 515         mov     r3, lr, lsr #8          /* Each output word = previous word >> 8 | next word << 24 */
 516         ldmia   r1!, {r4, r5, r12, lr}
 517         orr     r3, r3, r4, lsl #24
 518         mov     r4, r4, lsr #8
 519         orr     r4, r4, r5, lsl #24
 520         mov     r5, r5, lsr #8
 521         orr     r5, r5, r12, lsl #24
 522         mov     r12, r12, lsr #8
 523         orr     r12, r12, lr, lsl #24
 524         stmia   r0!, {r3-r5, r12}
 525         subs    r2, r2, #0x10
 526         bge     .Lmemmove_fsrcul1loop16
 527         ldmia   sp!, {r4, r5}
 528         adds    r2, r2, #0x0c
 529         blt     .Lmemmove_fsrcul1l4
 530
 531 .Lmemmove_fsrcul1loop4:
 532         mov     r12, lr, lsr #8
 533         ldr     lr, [r1], #4
 534         orr     r12, r12, lr, lsl #24
 535         str     r12, [r0], #4
 536         subs    r2, r2, #4
 537         bge     .Lmemmove_fsrcul1loop4
 538
 539 .Lmemmove_fsrcul1l4:
 540         sub     r1, r1, #3              /* Point r1 back at the first byte not yet copied */
 541         b       .Lmemmove_fl4
 542
 543 .Lmemmove_fsrcul2:
 544         cmp     r2, #0x0c
 545         blt     .Lmemmove_fsrcul2loop4
 546         sub     r2, r2, #0x0c
 547         stmdb   sp!, {r4, r5}
 548
 549 .Lmemmove_fsrcul2loop16:
 550         mov     r3, lr, lsr #16         /* Each output word = previous word >> 16 | next word << 16 */
 551         ldmia   r1!, {r4, r5, r12, lr}
 552         orr     r3, r3, r4, lsl #16
 553         mov     r4, r4, lsr #16
 554         orr     r4, r4, r5, lsl #16
 555         mov     r5, r5, lsr #16
 556         orr     r5, r5, r12, lsl #16
 557         mov     r12, r12, lsr #16
 558         orr     r12, r12, lr, lsl #16
 559         stmia   r0!, {r3-r5, r12}
 560         subs    r2, r2, #0x10
 561         bge     .Lmemmove_fsrcul2loop16
 562         ldmia   sp!, {r4, r5}
 563         adds    r2, r2, #0x0c
 564         blt     .Lmemmove_fsrcul2l4
 565
 566 .Lmemmove_fsrcul2loop4:
 567         mov     r12, lr, lsr #16
 568         ldr     lr, [r1], #4
 569         orr     r12, r12, lr, lsl #16
 570         str     r12, [r0], #4
 571         subs    r2, r2, #4
 572         bge     .Lmemmove_fsrcul2loop4
 573
 574 .Lmemmove_fsrcul2l4:
 575         sub     r1, r1, #2              /* Point r1 back at the first byte not yet copied */
 576         b       .Lmemmove_fl4
 577
 578 .Lmemmove_fsrcul3:
 579         cmp     r2, #0x0c
 580         blt     .Lmemmove_fsrcul3loop4
 581         sub     r2, r2, #0x0c
 582         stmdb   sp!, {r4, r5}
 583
 584 .Lmemmove_fsrcul3loop16:
 585         mov     r3, lr, lsr #24         /* Each output word = previous word >> 24 | next word << 8 */
 586         ldmia   r1!, {r4, r5, r12, lr}
 587         orr     r3, r3, r4, lsl #8
 588         mov     r4, r4, lsr #24
 589         orr     r4, r4, r5, lsl #8
 590         mov     r5, r5, lsr #24
 591         orr     r5, r5, r12, lsl #8
 592         mov     r12, r12, lsr #24
 593         orr     r12, r12, lr, lsl #8
 594         stmia   r0!, {r3-r5, r12}
 595         subs    r2, r2, #0x10
 596         bge     .Lmemmove_fsrcul3loop16
 597         ldmia   sp!, {r4, r5}
 598         adds    r2, r2, #0x0c
 599         blt     .Lmemmove_fsrcul3l4
 600
 601 .Lmemmove_fsrcul3loop4:
 602         mov     r12, lr, lsr #24
 603         ldr     lr, [r1], #4
 604         orr     r12, r12, lr, lsl #8
 605         str     r12, [r0], #4
 606         subs    r2, r2, #4
 607         bge     .Lmemmove_fsrcul3loop4
 608
 609 .Lmemmove_fsrcul3l4:
 610         sub     r1, r1, #1              /* Point r1 back at the first byte not yet copied */
 611         b       .Lmemmove_fl4
 612
 613 .Lmemmove_backwards:
 614         add     r1, r1, r2              /* r1 = one past the end of src */
 615         add     r0, r0, r2              /* r0 = one past the end of dst; it counts back down to dst */
 616         subs    r2, r2, #4              /* r2 is kept biased by -4 from here */
 617         blt     .Lmemmove_bl4           /* less than 4 bytes */
 618         ands    r12, r0, #3
 619         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
 620         ands    r12, r1, #3
 621         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
 622
 623 .Lmemmove_bt8:
 624         /* We have aligned source and destination */
 625         subs    r2, r2, #8
 626         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
 627         stmdb   sp!, {r4, lr}           /* r4 and lr are used as copy registers below */
 628         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
 629         blt     .Lmemmove_bl32
 630
 631         /* blat 32 bytes at a time */
 632         /* XXX for really big copies perhaps we should use more registers */
 633 .Lmemmove_bloop32:
 634         ldmdb   r1!, {r3, r4, r12, lr}
 635         stmdb   r0!, {r3, r4, r12, lr}
 636         ldmdb   r1!, {r3, r4, r12, lr}
 637         stmdb   r0!, {r3, r4, r12, lr}
 638         subs    r2, r2, #0x20
 639         bge     .Lmemmove_bloop32
 640
 641 .Lmemmove_bl32:
 642         cmn     r2, #0x10               /* r2 + 16 >= 0: at least 16 bytes remain */
 643         ldmdbge r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 644         stmdbge r0!, {r3, r4, r12, lr}
 645         subge   r2, r2, #0x10
 646         adds    r2, r2, #0x14           /* Undo the -0x14 bias */
 647         ldmdbge r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
 648         stmdbge r0!, {r3, r12, lr}
 649         subge   r2, r2, #0x0c
 650         ldmia   sp!, {r4, lr}           /* Restore r4 and the return address */
 651
 652 .Lmemmove_bl12:
 653         adds    r2, r2, #8              /* Undo the -8 bias */
 654         blt     .Lmemmove_bl4
 655         subs    r2, r2, #4
 656         ldrlt   r3, [r1, #-4]!          /* 4..7 bytes left: move one word */
 657         strlt   r3, [r0, #-4]!
 658         ldmdbge r1!, {r3, r12}          /* 8..11 bytes left: move two words */
 659         stmdbge r0!, {r3, r12}
 660         subge   r2, r2, #4
 661
 662 .Lmemmove_bl4:
 663         /* less than 4 bytes to go */
 664         adds    r2, r2, #4              /* Undo the -4 bias: r2 = bytes left (0..3) */
 665         RETeq                   /* done */
 666
 667         /* copy the crud byte at a time */
 668         cmp     r2, #2
 669         ldrb    r3, [r1, #-1]!
 670         strb    r3, [r0, #-1]!
 671         ldrbge  r3, [r1, #-1]!
 672         strbge  r3, [r0, #-1]!
 673         ldrbgt  r3, [r1, #-1]!
 674         strbgt  r3, [r0, #-1]!
 675         RET
 676
 677         /* erg - unaligned destination (backward copy); r12 = dst end & 3 on entry */
 678 .Lmemmove_bdestul:
 679         cmp     r12, #2                 /* r12 bytes (1..3) bring the dst end down to a word boundary */
 680
 681         /* align destination with byte copies */
 682         ldrb    r3, [r1, #-1]!
 683         strb    r3, [r0, #-1]!
 684         ldrbge  r3, [r1, #-1]!
 685         strbge  r3, [r0, #-1]!
 686         ldrbgt  r3, [r1, #-1]!
 687         strbgt  r3, [r0, #-1]!
 688         subs    r2, r2, r12
 689         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
 690         ands    r12, r1, #3
 691         beq     .Lmemmove_bt8           /* we have an aligned source */
 692
 693         /* erg - unaligned source */
 694         /* This is where it gets nasty ... */
 695 .Lmemmove_bsrcul:
 696         bic     r1, r1, #3              /* Round src down to a word boundary */
 697         ldr     r3, [r1, #0]            /* r3 = word holding the last wanted source bytes */
 698         cmp     r12, #2                 /* r12 = src & 3 selects the shift amounts */
 699         blt     .Lmemmove_bsrcul1
 700         beq     .Lmemmove_bsrcul2
 701         cmp     r2, #0x0c
 702         blt     .Lmemmove_bsrcul3loop4
 703         sub     r2, r2, #0x0c
 704         stmdb   sp!, {r4, r5, lr}       /* lr is used as a data register in the loop */
 705
 706 .Lmemmove_bsrcul3loop16:
 707         mov     lr, r3, lsl #8          /* Each output word = higher word << 8 | lower word >> 24 */
 708         ldmdb   r1!, {r3-r5, r12}
 709         orr     lr, lr, r12, lsr #24
 710         mov     r12, r12, lsl #8
 711         orr     r12, r12, r5, lsr #24
 712         mov     r5, r5, lsl #8
 713         orr     r5, r5, r4, lsr #24
 714         mov     r4, r4, lsl #8
 715         orr     r4, r4, r3, lsr #24
 716         stmdb   r0!, {r4, r5, r12, lr}
 717         subs    r2, r2, #0x10
 718         bge     .Lmemmove_bsrcul3loop16
 719         ldmia   sp!, {r4, r5, lr}
 720         adds    r2, r2, #0x0c
 721         blt     .Lmemmove_bsrcul3l4
 722
 723 .Lmemmove_bsrcul3loop4:
 724         mov     r12, r3, lsl #8
 725         ldr     r3, [r1, #-4]!
 726         orr     r12, r12, r3, lsr #24
 727         str     r12, [r0, #-4]!
 728         subs    r2, r2, #4
 729         bge     .Lmemmove_bsrcul3loop4
 730
 731 .Lmemmove_bsrcul3l4:
 732         add     r1, r1, #3              /* Point r1 just past the last byte not yet copied */
 733         b       .Lmemmove_bl4
 734
 735 .Lmemmove_bsrcul2:
 736         cmp     r2, #0x0c
 737         blt     .Lmemmove_bsrcul2loop4
 738         sub     r2, r2, #0x0c
 739         stmdb   sp!, {r4, r5, lr}
 740
 741 .Lmemmove_bsrcul2loop16:
 742         mov     lr, r3, lsl #16         /* Each output word = higher word << 16 | lower word >> 16 */
 743         ldmdb   r1!, {r3-r5, r12}
 744         orr     lr, lr, r12, lsr #16
 745         mov     r12, r12, lsl #16
 746         orr     r12, r12, r5, lsr #16
 747         mov     r5, r5, lsl #16
 748         orr     r5, r5, r4, lsr #16
 749         mov     r4, r4, lsl #16
 750         orr     r4, r4, r3, lsr #16
 751         stmdb   r0!, {r4, r5, r12, lr}
 752         subs    r2, r2, #0x10
 753         bge     .Lmemmove_bsrcul2loop16
 754         ldmia   sp!, {r4, r5, lr}
 755         adds    r2, r2, #0x0c
 756         blt     .Lmemmove_bsrcul2l4
 757
 758 .Lmemmove_bsrcul2loop4:
 759         mov     r12, r3, lsl #16
 760         ldr     r3, [r1, #-4]!
 761         orr     r12, r12, r3, lsr #16
 762         str     r12, [r0, #-4]!
 763         subs    r2, r2, #4
 764         bge     .Lmemmove_bsrcul2loop4
 765
 766 .Lmemmove_bsrcul2l4:
 767         add     r1, r1, #2              /* Point r1 just past the last byte not yet copied */
 768         b       .Lmemmove_bl4
 769
 770 .Lmemmove_bsrcul1:
 771         cmp     r2, #0x0c
 772         blt     .Lmemmove_bsrcul1loop4
 773         sub     r2, r2, #0x0c
 774         stmdb   sp!, {r4, r5, lr}
 775
 776 .Lmemmove_bsrcul1loop32:                /* Despite the name, each pass moves 16 bytes */
 777         mov     lr, r3, lsl #24         /* Each output word = higher word << 24 | lower word >> 8 */
 778         ldmdb   r1!, {r3-r5, r12}
 779         orr     lr, lr, r12, lsr #8
 780         mov     r12, r12, lsl #24
 781         orr     r12, r12, r5, lsr #8
 782         mov     r5, r5, lsl #24
 783         orr     r5, r5, r4, lsr #8
 784         mov     r4, r4, lsl #24
 785         orr     r4, r4, r3, lsr #8
 786         stmdb   r0!, {r4, r5, r12, lr}
 787         subs    r2, r2, #0x10
 788         bge     .Lmemmove_bsrcul1loop32
 789         ldmia   sp!, {r4, r5, lr}
 790         adds    r2, r2, #0x0c
 791         blt     .Lmemmove_bsrcul1l4
 792
 793 .Lmemmove_bsrcul1loop4:
 794         mov     r12, r3, lsl #24
 795         ldr     r3, [r1, #-4]!
 796         orr     r12, r12, r3, lsr #8
 797         str     r12, [r0, #-4]!
 798         subs    r2, r2, #4
 799         bge     .Lmemmove_bsrcul1loop4
 800
 801 .Lmemmove_bsrcul1l4:
 802         add     r1, r1, #1              /* Point r1 just past the last byte not yet copied */
 803         b       .Lmemmove_bl4
 804 EEND(memmove)
 805 END(bcopy)
 806
 807 #if !defined(_ARM_ARCH_5E)
 808 ENTRY(memcpy)
             /*
              * memcpy variant assembled when _ARM_ARCH_5E is not defined.
              *
              * In:   r0 = destination, r1 = source, r2 = byte count
              * Out:  r0 = the destination address that was passed in
              * Uses: r3, r12 and lr as scratch (lr is saved on the stack at
              *       .Lnormal); r4/r5 are pushed and popped around the loops
              *       that borrow them.
              *
              * Throughout the copy r2 is kept biased low by the size of the
              * next chunk, so the flags left by each subs/adds say directly
              * whether another chunk fits.  All count tests are signed.
              *
              * NOTE(review): .L_arm_memcpy and .L_min_memcpy_size are literals
              * defined outside this block.  From the way they are used here
              * they appear to hold the addresses of an optional copy-hook
              * pointer and of the minimum length handed to that hook --
              * confirm against their definitions.
              */
 809         /* save leaf functions having to store this away */
 810         /* Do not check arm_memcpy if we're running from flash */
 811 #if defined(FLASHADDR) && defined(PHYSADDR)
 812 #if FLASHADDR > PHYSADDR
 813         ldr     r3, =FLASHADDR
 814         cmp     r3, pc
 815         bls     .Lnormal                /* FLASHADDR <= pc: skip the hook */
 816 #else
 817         ldr     r3, =FLASHADDR
 818         cmp     r3, pc
 819         bhi     .Lnormal                /* FLASHADDR > pc: skip the hook */
 820 #endif
 821 #endif
 822         ldr     r3, .L_arm_memcpy
 823         ldr     r3, [r3]                /* r3 = current hook pointer */
 824         cmp     r3, #0
 825         beq     .Lnormal                /* pointer is zero: no hook */
 826         ldr     r3, .L_min_memcpy_size
 827         ldr     r3, [r3]
 828         cmp     r2, r3
 829         blt     .Lnormal                /* count below the threshold (signed) */
 830         stmfd   sp!, {r0-r2, r4, lr}    /* keep the arguments for a fallback */
 831         mov     r3, #0                  /* r3 = 0 is passed along with r0-r2 */
 832         ldr     r4, .L_arm_memcpy
 833         mov     lr, pc                  /* pc reads two instructions ahead: lr = the cmp below */
 834         ldr     pc, [r4]                /* indirect call through the pointer */
 835         cmp     r0, #0
 836         ldmfd   sp!, {r0-r2, r4, lr}    /* restore arguments; flags are not changed */
 837         RETeq                           /* hook returned 0: done, r0 = dst again */
 838
 839 .Lnormal:
 840         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
 841
 842         subs    r2, r2, #4              /* r2 = count - 4 */
 843         blt     .Lmemcpy_l4             /* less than 4 bytes */
 844         ands    r12, r0, #3
 845         bne     .Lmemcpy_destul         /* oh unaligned destination addr */
 846         ands    r12, r1, #3
 847         bne     .Lmemcpy_srcul          /* oh unaligned source addr */
 848
 849 .Lmemcpy_t8:
 850         /* We have aligned source and destination */
 851         subs    r2, r2, #8              /* r2 = remaining - 12 */
 852         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
 853         subs    r2, r2, #0x14           /* r2 = remaining - 32 */
 854         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
 855         stmdb   sp!, {r4}               /* borrow r4 */
 856
 857         /* blat 32 bytes at a time */
 858         /* XXX for really big copies perhaps we should use more registers */
 859 .Lmemcpy_loop32:
 860         ldmia   r1!, {r3, r4, r12, lr}
 861         stmia   r0!, {r3, r4, r12, lr}
 862         ldmia   r1!, {r3, r4, r12, lr}
 863         stmia   r0!, {r3, r4, r12, lr}
 864         subs    r2, r2, #0x20
 865         bge     .Lmemcpy_loop32
 866
 867         cmn     r2, #0x10               /* flags from r2 + 16: ge when >= 16 bytes remain */
 868         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 869         stmiage r0!, {r3, r4, r12, lr}
 870         subge   r2, r2, #0x10           /* r2 is still remaining - 32 */
 871         ldmia   sp!, {r4}               /* return r4 */
 872
 873 .Lmemcpy_l32:
 874         adds    r2, r2, #0x14           /* r2 = remaining - 12; flags drive the loop below */
 875
 876         /* blat 12 bytes at a time */
 877 .Lmemcpy_loop12:
 878         ldmiage r1!, {r3, r12, lr}
 879         stmiage r0!, {r3, r12, lr}
 880         subsge  r2, r2, #0x0c
 881         bge     .Lmemcpy_loop12         /* not taken at all when fewer than 12 remained */
 882
 883 .Lmemcpy_l12:
 884         adds    r2, r2, #8              /* r2 = remaining - 4 */
 885         blt     .Lmemcpy_l4
 886
 887         subs    r2, r2, #4              /* r2 = remaining - 8 */
 888         ldrlt   r3, [r1], #4            /* 4..7 bytes left: copy one word */
 889         strlt   r3, [r0], #4
 890         ldmiage r1!, {r3, r12}          /* 8..11 bytes left: copy two words */
 891         stmiage r0!, {r3, r12}
 892         subge   r2, r2, #4              /* either way r2 = remaining - 4 again */
 893
 894 .Lmemcpy_l4:
 895         /* less than 4 bytes to go */
 896         adds    r2, r2, #4              /* undo the bias: r2 = bytes left (0..3) */
 897 #ifdef __APCS_26_
 898         ldmiaeq sp!, {r0, pc}^          /* done */
 899 #else
 900         ldmiaeq sp!, {r0, pc}           /* done */
 901 #endif
 902         /* copy the crud byte at a time */
 903         cmp     r2, #2                  /* one byte always, a 2nd if >= 2, a 3rd if > 2 */
 904         ldrb    r3, [r1], #1
 905         strb    r3, [r0], #1
 906         ldrbge  r3, [r1], #1
 907         strbge  r3, [r0], #1
 908         ldrbgt  r3, [r1], #1
 909         strbgt  r3, [r0], #1
 910         ldmia   sp!, {r0, pc}           /* pop the saved dst into r0 and return */
 911
 912         /* erg - unaligned destination */
 913 .Lmemcpy_destul:
 914         rsb     r12, r12, #4            /* r12 = 4 - (dst & 3) = bytes needed to align dst */
 915         cmp     r12, #2
 916
 917         /* align destination with byte copies */
 918         ldrb    r3, [r1], #1
 919         strb    r3, [r0], #1
 920         ldrbge  r3, [r1], #1
 921         strbge  r3, [r0], #1
 922         ldrbgt  r3, [r1], #1
 923         strbgt  r3, [r0], #1
 924         subs    r2, r2, r12             /* account for the bytes just copied */
 925         blt     .Lmemcpy_l4             /* less than 4 bytes */
 926
 927         ands    r12, r1, #3
 928         beq     .Lmemcpy_t8             /* we have an aligned source */
 929
 930         /* erg - unaligned source */
 931         /* This is where it gets nasty ... */
             /*
              * Here dst is word aligned, r12 = src & 3 (1..3) and
              * r2 = remaining - 4.  Source data is fetched as aligned words;
              * each output word is assembled from the unused upper bytes of
              * the previous source word (kept in lr) and the low bytes of the
              * next one.  The shift directions correspond to little-endian
              * byte order.
              */
 932 .Lmemcpy_srcul:
 933         bic     r1, r1, #3              /* round src down to a word boundary */
 934         ldr     lr, [r1], #4            /* lr = aligned word holding the first source bytes */
 935         cmp     r12, #2
 936         bgt     .Lmemcpy_srcul3         /* src & 3 == 3 */
 937         beq     .Lmemcpy_srcul2         /* src & 3 == 2 */
             /* fall through: src & 3 == 1 */
 938         cmp     r2, #0x0c
 939         blt     .Lmemcpy_srcul1loop4    /* fewer than 16 bytes: word loop only */
 940         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
 941         stmdb   sp!, {r4, r5}
 942
 943 .Lmemcpy_srcul1loop16:
 944         mov     r3, lr, lsr #8          /* 3 bytes left over from the previous word */
 945         ldmia   r1!, {r4, r5, r12, lr}  /* next four aligned source words */
 946         orr     r3, r3, r4, lsl #24     /* ... topped up with 1 byte of the next word */
 947         mov     r4, r4, lsr #8
 948         orr     r4, r4, r5, lsl #24
 949         mov     r5, r5, lsr #8
 950         orr     r5, r5, r12, lsl #24
 951         mov     r12, r12, lsr #8
 952         orr     r12, r12, lr, lsl #24   /* lr keeps its upper bytes for the next pass */
 953         stmia   r0!, {r3-r5, r12}
 954         subs    r2, r2, #0x10
 955         bge     .Lmemcpy_srcul1loop16
 956         ldmia   sp!, {r4, r5}
 957         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
 958         blt     .Lmemcpy_srcul1l4
 959
 960 .Lmemcpy_srcul1loop4:
 961         mov     r12, lr, lsr #8
 962         ldr     lr, [r1], #4
 963         orr     r12, r12, lr, lsl #24
 964         str     r12, [r0], #4
 965         subs    r2, r2, #4
 966         bge     .Lmemcpy_srcul1loop4
 967
 968 .Lmemcpy_srcul1l4:
 969         sub     r1, r1, #3              /* 3 bytes of the last loaded word are unused: point r1 at them */
 970         b       .Lmemcpy_l4
 971
             /* src & 3 == 2: same scheme, 2 bytes from each neighbouring word */
 972 .Lmemcpy_srcul2:
 973         cmp     r2, #0x0c
 974         blt     .Lmemcpy_srcul2loop4    /* fewer than 16 bytes: word loop only */
 975         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
 976         stmdb   sp!, {r4, r5}
 977
 978 .Lmemcpy_srcul2loop16:
 979         mov     r3, lr, lsr #16
 980         ldmia   r1!, {r4, r5, r12, lr}
 981         orr     r3, r3, r4, lsl #16
 982         mov     r4, r4, lsr #16
 983         orr     r4, r4, r5, lsl #16
 984         mov     r5, r5, lsr #16
 985         orr     r5, r5, r12, lsl #16
 986         mov     r12, r12, lsr #16
 987         orr     r12, r12, lr, lsl #16
 988         stmia   r0!, {r3-r5, r12}
 989         subs    r2, r2, #0x10
 990         bge     .Lmemcpy_srcul2loop16
 991         ldmia   sp!, {r4, r5}
 992         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
 993         blt     .Lmemcpy_srcul2l4
 994
 995 .Lmemcpy_srcul2loop4:
 996         mov     r12, lr, lsr #16
 997         ldr     lr, [r1], #4
 998         orr     r12, r12, lr, lsl #16
 999         str     r12, [r0], #4
1000         subs    r2, r2, #4
1001         bge     .Lmemcpy_srcul2loop4
1002
1003 .Lmemcpy_srcul2l4:
1004         sub     r1, r1, #2              /* 2 bytes of the last loaded word are unused */
1005         b       .Lmemcpy_l4
1006
             /* src & 3 == 3: 1 byte from the previous word, 3 from the next */
1007 .Lmemcpy_srcul3:
1008         cmp     r2, #0x0c
1009         blt     .Lmemcpy_srcul3loop4    /* fewer than 16 bytes: word loop only */
1010         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
1011         stmdb   sp!, {r4, r5}
1012
1013 .Lmemcpy_srcul3loop16:
1014         mov     r3, lr, lsr #24
1015         ldmia   r1!, {r4, r5, r12, lr}
1016         orr     r3, r3, r4, lsl #8
1017         mov     r4, r4, lsr #24
1018         orr     r4, r4, r5, lsl #8
1019         mov     r5, r5, lsr #24
1020         orr     r5, r5, r12, lsl #8
1021         mov     r12, r12, lsr #24
1022         orr     r12, r12, lr, lsl #8
1023         stmia   r0!, {r3-r5, r12}
1024         subs    r2, r2, #0x10
1025         bge     .Lmemcpy_srcul3loop16
1026         ldmia   sp!, {r4, r5}
1027         adds    r2, r2, #0x0c           /* r2 = remaining - 4 */
1028         blt     .Lmemcpy_srcul3l4
1029
1030 .Lmemcpy_srcul3loop4:
1031         mov     r12, lr, lsr #24
1032         ldr     lr, [r1], #4
1033         orr     r12, r12, lr, lsl #8
1034         str     r12, [r0], #4
1035         subs    r2, r2, #4
1036         bge     .Lmemcpy_srcul3loop4
1037
1038 .Lmemcpy_srcul3l4:
1039         sub     r1, r1, #1              /* 1 byte of the last loaded word is unused */
1040         b       .Lmemcpy_l4
1041 END(memcpy)
1042
1043 #else
1044 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1045 ENTRY(memcpy)
1046         pld     [r1]
1047         cmp     r2, #0x0c
1048         ble     .Lmemcpy_short          /* <= 12 bytes */
1049 #ifdef FLASHADDR
1050 #if FLASHADDR > PHYSADDR
1051         ldr     r3, =FLASHADDR
1052         cmp     r3, pc
1053         bls     .Lnormal
1054 #else
1055         ldr     r3, =FLASHADDR
1056         cmp     r3, pc
1057         bhi     .Lnormal
1058 #endif
1059 #endif
1060         ldr     r3, .L_arm_memcpy
1061         ldr     r3, [r3]
1062         cmp     r3, #0
1063         beq     .Lnormal
1064         ldr     r3, .L_min_memcpy_size
1065         ldr     r3, [r3]
1066         cmp     r2, r3
1067         blt     .Lnormal
1068         stmfd   sp!, {r0-r2, r4, lr}
1069         mov     r3, #0
1070         ldr     r4, .L_arm_memcpy
1071         mov     lr, pc
1072         ldr     pc, [r4]
1073         cmp     r0, #0
1074         ldmfd   sp!, {r0-r2, r4, lr}
1075         RETeq
1076 .Lnormal:
1077         mov     r3, r0                  /* We must not clobber r0 */
1078
1079         /* Word-align the destination buffer */
1080         ands    ip, r3, #0x03           /* Already word aligned? */
1081         beq     .Lmemcpy_wordaligned    /* Yup */
1082         cmp     ip, #0x02
1083         ldrb    ip, [r1], #0x01
1084         sub     r2, r2, #0x01
1085         strb    ip, [r3], #0x01
1086         ldrble  ip, [r1], #0x01
1087         suble   r2, r2, #0x01
1088         strble  ip, [r3], #0x01
1089         ldrblt  ip, [r1], #0x01
1090         sublt   r2, r2, #0x01
1091         strblt  ip, [r3], #0x01
1092
1093         /* Destination buffer is now word aligned */
1094 .Lmemcpy_wordaligned:
1095         ands    ip, r1, #0x03           /* Is src also word-aligned? */
1096         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
1097
1098         /* Quad-align the destination buffer */
1099         tst     r3, #0x07               /* Already quad aligned? */
1100         ldrne   ip, [r1], #0x04
1101         stmfd   sp!, {r4-r9}            /* Free up some registers */
1102         subne   r2, r2, #0x04
1103         strne   ip, [r3], #0x04
1104
1105         /* Destination buffer quad aligned, source is at least word aligned */
1106         subs    r2, r2, #0x80
1107         blt     .Lmemcpy_w_lessthan128
1108
1109         /* Copy 128 bytes at a time */
1110 .Lmemcpy_w_loop128:
1111         ldr     r4, [r1], #0x04         /* LD:00-03 */
1112         ldr     r5, [r1], #0x04         /* LD:04-07 */
1113         pld     [r1, #0x18]             /* Prefetch 0x20 */
1114         ldr     r6, [r1], #0x04         /* LD:08-0b */
1115         ldr     r7, [r1], #0x04         /* LD:0c-0f */
1116         ldr     r8, [r1], #0x04         /* LD:10-13 */
1117         ldr     r9, [r1], #0x04         /* LD:14-17 */
1118         strd    r4, [r3], #0x08         /* ST:00-07 */
1119         ldr     r4, [r1], #0x04         /* LD:18-1b */
1120         ldr     r5, [r1], #0x04         /* LD:1c-1f */
1121         strd    r6, [r3], #0x08         /* ST:08-0f */
1122         ldr     r6, [r1], #0x04         /* LD:20-23 */
1123         ldr     r7, [r1], #0x04         /* LD:24-27 */
1124         pld     [r1, #0x18]             /* Prefetch 0x40 */
1125         strd    r8, [r3], #0x08         /* ST:10-17 */
1126         ldr     r8, [r1], #0x04         /* LD:28-2b */
1127         ldr     r9, [r1], #0x04         /* LD:2c-2f */
1128         strd    r4, [r3], #0x08         /* ST:18-1f */
1129         ldr     r4, [r1], #0x04         /* LD:30-33 */
1130         ldr     r5, [r1], #0x04         /* LD:34-37 */
1131         strd    r6, [r3], #0x08         /* ST:20-27 */
1132         ldr     r6, [r1], #0x04         /* LD:38-3b */
1133         ldr     r7, [r1], #0x04         /* LD:3c-3f */
1134         strd    r8, [r3], #0x08         /* ST:28-2f */
1135         ldr     r8, [r1], #0x04         /* LD:40-43 */
1136         ldr     r9, [r1], #0x04         /* LD:44-47 */
1137         pld     [r1, #0x18]             /* Prefetch 0x60 */
1138         strd    r4, [r3], #0x08         /* ST:30-37 */
1139         ldr     r4, [r1], #0x04         /* LD:48-4b */
1140         ldr     r5, [r1], #0x04         /* LD:4c-4f */
1141         strd    r6, [r3], #0x08         /* ST:38-3f */
1142         ldr     r6, [r1], #0x04         /* LD:50-53 */
1143         ldr     r7, [r1], #0x04         /* LD:54-57 */
1144         strd    r8, [r3], #0x08         /* ST:40-47 */
1145         ldr     r8, [r1], #0x04         /* LD:58-5b */
1146         ldr     r9, [r1], #0x04         /* LD:5c-5f */
1147         strd    r4, [r3], #0x08         /* ST:48-4f */
1148         ldr     r4, [r1], #0x04         /* LD:60-63 */
1149         ldr     r5, [r1], #0x04         /* LD:64-67 */
1150         pld     [r1, #0x18]             /* Prefetch 0x80 */
1151         strd    r6, [r3], #0x08         /* ST:50-57 */
1152         ldr     r6, [r1], #0x04         /* LD:68-6b */
1153         ldr     r7, [r1], #0x04         /* LD:6c-6f */
1154         strd    r8, [r3], #0x08         /* ST:58-5f */
1155         ldr     r8, [r1], #0x04         /* LD:70-73 */
1156         ldr     r9, [r1], #0x04         /* LD:74-77 */
1157         strd    r4, [r3], #0x08         /* ST:60-67 */
1158         ldr     r4, [r1], #0x04         /* LD:78-7b */
1159         ldr     r5, [r1], #0x04         /* LD:7c-7f */
1160         strd    r6, [r3], #0x08         /* ST:68-6f */
1161         strd    r8, [r3], #0x08         /* ST:70-77 */
1162         subs    r2, r2, #0x80
1163         strd    r4, [r3], #0x08         /* ST:78-7f */
1164         bge     .Lmemcpy_w_loop128
1165
1166 .Lmemcpy_w_lessthan128:
1167         adds    r2, r2, #0x80           /* Adjust for extra sub */
1168         ldmfdeq sp!, {r4-r9}
1169         RETeq                   /* Return now if done */
1170         subs    r2, r2, #0x20
1171         blt     .Lmemcpy_w_lessthan32
1172
1173         /* Copy 32 bytes at a time */
1174 .Lmemcpy_w_loop32:
1175         ldr     r4, [r1], #0x04
1176         ldr     r5, [r1], #0x04
1177         pld     [r1, #0x18]
1178         ldr     r6, [r1], #0x04
1179         ldr     r7, [r1], #0x04
1180         ldr     r8, [r1], #0x04
1181         ldr     r9, [r1], #0x04
1182         strd    r4, [r3], #0x08
1183         ldr     r4, [r1], #0x04
1184         ldr     r5, [r1], #0x04
1185         strd    r6, [r3], #0x08
1186         strd    r8, [r3], #0x08
1187         subs    r2, r2, #0x20
1188         strd    r4, [r3], #0x08
1189         bge     .Lmemcpy_w_loop32
1190
1191 .Lmemcpy_w_lessthan32:
1192         adds    r2, r2, #0x20           /* Adjust for extra sub */
1193         ldmfdeq sp!, {r4-r9}
1194         RETeq                   /* Return now if done */
1195
1196         and     r4, r2, #0x18
1197         rsbs    r4, r4, #0x18
1198         addne   pc, pc, r4, lsl #1
1199         nop
1200
1201         /* At least 24 bytes remaining */
1202         ldr     r4, [r1], #0x04
1203         ldr     r5, [r1], #0x04
1204         sub     r2, r2, #0x08
1205         strd    r4, [r3], #0x08
1206
1207         /* At least 16 bytes remaining */
1208         ldr     r4, [r1], #0x04
1209         ldr     r5, [r1], #0x04
1210         sub     r2, r2, #0x08
1211         strd    r4, [r3], #0x08
1212
1213         /* At least 8 bytes remaining */
1214         ldr     r4, [r1], #0x04
1215         ldr     r5, [r1], #0x04
1216         subs    r2, r2, #0x08
1217         strd    r4, [r3], #0x08
1218
1219         /* Less than 8 bytes remaining */
1220         ldmfd   sp!, {r4-r9}
1221         RETeq                   /* Return now if done */
1222         subs    r2, r2, #0x04
1223         ldrge   ip, [r1], #0x04
1224         strge   ip, [r3], #0x04
1225         RETeq                   /* Return now if done */
1226         addlt   r2, r2, #0x04
1227         ldrb    ip, [r1], #0x01
1228         cmp     r2, #0x02
1229         ldrbge  r2, [r1], #0x01
1230         strb    ip, [r3], #0x01
1231         ldrbgt  ip, [r1]
1232         strbge  r2, [r3], #0x01
1233         strbgt  ip, [r3]
1234         RET
1235 /* Place a literal pool here for the above ldr instructions to use */
1236 .ltorg
1237
1238
1239 /*
1240  * At this point, it has not been possible to word align both buffers.
1241  * The destination buffer is word aligned, but the source buffer is not.
1242  */
1243 .Lmemcpy_bad_align:
1244         stmfd   sp!, {r4-r7}
1245         bic     r1, r1, #0x03
1246         cmp     ip, #2
1247         ldr     ip, [r1], #0x04
1248         bgt     .Lmemcpy_bad3
1249         beq     .Lmemcpy_bad2
1250         b       .Lmemcpy_bad1
1251
1252 .Lmemcpy_bad1_loop16:
1253         mov     r4, ip, lsr #8
1254         ldr     r5, [r1], #0x04
1255         pld     [r1, #0x018]
1256         ldr     r6, [r1], #0x04
1257         ldr     r7, [r1], #0x04
1258         ldr     ip, [r1], #0x04
1259         orr     r4, r4, r5, lsl #24
1260         mov     r5, r5, lsr #8
1261         orr     r5, r5, r6, lsl #24
1262         mov     r6, r6, lsr #8
1263         orr     r6, r6, r7, lsl #24
1264         mov     r7, r7, lsr #8
1265         orr     r7, r7, ip, lsl #24
1266         str     r4, [r3], #0x04
1267         str     r5, [r3], #0x04
1268         str     r6, [r3], #0x04
1269         str     r7, [r3], #0x04
1270 .Lmemcpy_bad1:
1271         subs    r2, r2, #0x10
1272         bge     .Lmemcpy_bad1_loop16
1273
1274         adds    r2, r2, #0x10
1275         ldmfdeq sp!, {r4-r7}
1276         RETeq                   /* Return now if done */
1277         subs    r2, r2, #0x04
1278         sublt   r1, r1, #0x03
1279         blt     .Lmemcpy_bad_done
1280
1281 .Lmemcpy_bad1_loop4:
1282         mov     r4, ip, lsr #8
1283         ldr     ip, [r1], #0x04
1284         subs    r2, r2, #0x04
1285         orr     r4, r4, ip, lsl #24
1286         str     r4, [r3], #0x04
1287         bge     .Lmemcpy_bad1_loop4
1288         sub     r1, r1, #0x03
1289         b       .Lmemcpy_bad_done
1290
1291 .Lmemcpy_bad2_loop16:
1292         mov     r4, ip, lsr #16
1293         ldr     r5, [r1], #0x04
1294         pld     [r1, #0x018]
1295         ldr     r6, [r1], #0x04
1296         ldr     r7, [r1], #0x04
1297         ldr     ip, [r1], #0x04
1298         orr     r4, r4, r5, lsl #16
1299         mov     r5, r5, lsr #16
1300         orr     r5, r5, r6, lsl #16
1301         mov     r6, r6, lsr #16
1302         orr     r6, r6, r7, lsl #16
1303         mov     r7, r7, lsr #16
1304         orr     r7, r7, ip, lsl #16
1305         str     r4, [r3], #0x04
1306         str     r5, [r3], #0x04
1307         str     r6, [r3], #0x04
1308         str     r7, [r3], #0x04
1309 .Lmemcpy_bad2:
1310         subs    r2, r2, #0x10
1311         bge     .Lmemcpy_bad2_loop16
1312
1313         adds    r2, r2, #0x10
1314         ldmfdeq sp!, {r4-r7}
1315         RETeq                   /* Return now if done */
1316         subs    r2, r2, #0x04
1317         sublt   r1, r1, #0x02
1318         blt     .Lmemcpy_bad_done
1319
1320 .Lmemcpy_bad2_loop4:
1321         mov     r4, ip, lsr #16
1322         ldr     ip, [r1], #0x04
1323         subs    r2, r2, #0x04
1324         orr     r4, r4, ip, lsl #16
1325         str     r4, [r3], #0x04
1326         bge     .Lmemcpy_bad2_loop4
1327         sub     r1, r1, #0x02
1328         b       .Lmemcpy_bad_done
1329
1330 .Lmemcpy_bad3_loop16:
1331         mov     r4, ip, lsr #24
1332         ldr     r5, [r1], #0x04
1333         pld     [r1, #0x018]
1334         ldr     r6, [r1], #0x04
1335         ldr     r7, [r1], #0x04
1336         ldr     ip, [r1], #0x04
1337         orr     r4, r4, r5, lsl #8
1338         mov     r5, r5, lsr #24
1339         orr     r5, r5, r6, lsl #8
1340         mov     r6, r6, lsr #24
1341         orr     r6, r6, r7, lsl #8
1342         mov     r7, r7, lsr #24
1343         orr     r7, r7, ip, lsl #8
1344         str     r4, [r3], #0x04
1345         str     r5, [r3], #0x04
1346         str     r6, [r3], #0x04
1347         str     r7, [r3], #0x04
1348 .Lmemcpy_bad3:
1349         subs    r2, r2, #0x10
1350         bge     .Lmemcpy_bad3_loop16
1351
1352         adds    r2, r2, #0x10
1353         ldmfdeq sp!, {r4-r7}
1354         RETeq                   /* Return now if done */
1355         subs    r2, r2, #0x04
1356         sublt   r1, r1, #0x01
1357         blt     .Lmemcpy_bad_done
1358
1359 .Lmemcpy_bad3_loop4:
1360         mov     r4, ip, lsr #24
1361         ldr     ip, [r1], #0x04
1362         subs    r2, r2, #0x04
1363         orr     r4, r4, ip, lsl #8
1364         str     r4, [r3], #0x04
1365         bge     .Lmemcpy_bad3_loop4
1366         sub     r1, r1, #0x01
1367
1368 .Lmemcpy_bad_done:
1369         ldmfd   sp!, {r4-r7}
1370         adds    r2, r2, #0x04
1371         RETeq
1372         ldrb    ip, [r1], #0x01
1373         cmp     r2, #0x02
1374         ldrbge  r2, [r1], #0x01
1375         strb    ip, [r3], #0x01
1376         ldrbgt  ip, [r1]
1377         strbge  r2, [r3], #0x01
1378         strbgt  ip, [r3]
1379         RET
1380
1381
1382 /*
1383  * Handle short copies (12 bytes or fewer), possibly misaligned.
1384  * Some of these are *very* common, thanks to the network stack,
1385  * and so are handled specially.
1386  */
1387 .Lmemcpy_short:
1388         add     pc, pc, r2, lsl #2
1389         nop
1390         RET                     /* 0x00 */
1391         b       .Lmemcpy_bytewise       /* 0x01 */
1392         b       .Lmemcpy_bytewise       /* 0x02 */
1393         b       .Lmemcpy_bytewise       /* 0x03 */
1394         b       .Lmemcpy_4              /* 0x04 */
1395         b       .Lmemcpy_bytewise       /* 0x05 */
1396         b       .Lmemcpy_6              /* 0x06 */
1397         b       .Lmemcpy_bytewise       /* 0x07 */
1398         b       .Lmemcpy_8              /* 0x08 */
1399         b       .Lmemcpy_bytewise       /* 0x09 */
1400         b       .Lmemcpy_bytewise       /* 0x0a */
1401         b       .Lmemcpy_bytewise       /* 0x0b */
1402         b       .Lmemcpy_c              /* 0x0c */
1403 .Lmemcpy_bytewise:
1404         mov     r3, r0                  /* We must not clobber r0 */
1405         ldrb    ip, [r1], #0x01
1406 1:      subs    r2, r2, #0x01
1407         strb    ip, [r3], #0x01
1408         ldrbne  ip, [r1], #0x01
1409         bne     1b
1410         RET
1411
1412 /******************************************************************************
1413  * Special case for 4 byte copies
1414  */
1415 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1416 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1417         LMEMCPY_4_PAD
1418 .Lmemcpy_4:
1419         and     r2, r1, #0x03
1420         orr     r2, r2, r0, lsl #2
1421         ands    r2, r2, #0x0f
1422         sub     r3, pc, #0x14
1423         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1424
1425 /*
1426  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1427  */
1428         ldr     r2, [r1]
1429         str     r2, [r0]
1430         RET
1431         LMEMCPY_4_PAD
1432
1433 /*
1434  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1435  */
1436         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1437         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1438         mov     r3, r3, lsr #8          /* r3 = .210 */
1439         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1440         str     r3, [r0]
1441         RET
1442         LMEMCPY_4_PAD
1443
1444 /*
1445  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1446  */
1447         ldrh    r3, [r1, #0x02]
1448         ldrh    r2, [r1]
1449         orr     r3, r2, r3, lsl #16
1450         str     r3, [r0]
1451         RET
1452         LMEMCPY_4_PAD
1453
1454 /*
1455  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1456  */
1457         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1458         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1459         mov     r3, r3, lsr #24         /* r3 = ...0 */
1460         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1461         str     r3, [r0]
1462         RET
1463         LMEMCPY_4_PAD
1464
1465 /*
1466  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1467  */
1468         ldr     r2, [r1]
1469         strb    r2, [r0]
1470         mov     r3, r2, lsr #8
1471         mov     r1, r2, lsr #24
1472         strb    r1, [r0, #0x03]
1473         strh    r3, [r0, #0x01]
1474         RET
1475         LMEMCPY_4_PAD
1476
1477 /*
1478  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1479  */
1480         ldrb    r2, [r1]
1481         ldrh    r3, [r1, #0x01]
1482         ldrb    r1, [r1, #0x03]
1483         strb    r2, [r0]
1484         strh    r3, [r0, #0x01]
1485         strb    r1, [r0, #0x03]
1486         RET
1487         LMEMCPY_4_PAD
1488
1489 /*
1490  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1491  */
1492         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1493         ldrh    r3, [r1, #0x02]         /* LE:r3 = ..23  LE:r3 = ..32 */
1494         mov     r1, r2, lsr #8          /* r1 = ...0 */
1495         strb    r1, [r0]
1496         mov     r2, r2, lsl #8          /* r2 = .01. */
1497         orr     r2, r2, r3, lsr #8      /* r2 = .012 */
1498         strh    r2, [r0, #0x01]
1499         strb    r3, [r0, #0x03]
1500         RET
1501         LMEMCPY_4_PAD
1502
1503 /*
1504  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1505  */
1506         ldrb    r2, [r1]
1507         ldrh    r3, [r1, #0x01]
1508         ldrb    r1, [r1, #0x03]
1509         strb    r2, [r0]
1510         strh    r3, [r0, #0x01]
1511         strb    r1, [r0, #0x03]
1512         RET
1513         LMEMCPY_4_PAD
1514
1515 /*
1516  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1517  */
1518         ldr     r2, [r1]
1519         strh    r2, [r0]
1520         mov     r3, r2, lsr #16
1521         strh    r3, [r0, #0x02]
1522         RET
1523         LMEMCPY_4_PAD
1524
1525 /*
1526  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1527  */
1528         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1529         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1530         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1531         strh    r1, [r0]
1532         mov     r2, r2, lsr #24         /* r2 = ...2 */
1533         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1534         strh    r2, [r0, #0x02]
1535         RET
1536         LMEMCPY_4_PAD
1537
1538 /*
1539  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1540  */
1541         ldrh    r2, [r1]
1542         ldrh    r3, [r1, #0x02]
1543         strh    r2, [r0]
1544         strh    r3, [r0, #0x02]
1545         RET
1546         LMEMCPY_4_PAD
1547
1548 /*
1549  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1550  */
1551         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1552         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1553         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1554         strh    r1, [r0, #0x02]
1555         mov     r3, r3, lsl #8          /* r3 = 321. */
1556         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1557         strh    r3, [r0]
1558         RET
1559         LMEMCPY_4_PAD
1560
1561 /*
1562  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1563  */
1564         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1565         strb    r2, [r0]
1566         mov     r3, r2, lsr #8
1567         mov     r1, r2, lsr #24
1568         strh    r3, [r0, #0x01]
1569         strb    r1, [r0, #0x03]
1570         RET
1571         LMEMCPY_4_PAD
1572
1573 /*
1574  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1575  */
1576         ldrb    r2, [r1]
1577         ldrh    r3, [r1, #0x01]
1578         ldrb    r1, [r1, #0x03]
1579         strb    r2, [r0]
1580         strh    r3, [r0, #0x01]
1581         strb    r1, [r0, #0x03]
1582         RET
1583         LMEMCPY_4_PAD
1584
1585 /*
1586  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1587  */
1588         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1589         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1590         strb    r2, [r0]
1591         mov     r2, r2, lsr #8          /* r2 = ...1 */
1592         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1593         strh    r2, [r0, #0x01]
1594         mov     r3, r3, lsr #8          /* r3 = ...3 */
1595         strb    r3, [r0, #0x03]
1596         RET
1597         LMEMCPY_4_PAD
1598
1599 /*
1600  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1601  */
1602         ldrb    r2, [r1]
1603         ldrh    r3, [r1, #0x01]
1604         ldrb    r1, [r1, #0x03]
1605         strb    r2, [r0]
1606         strh    r3, [r0, #0x01]
1607         strb    r1, [r0, #0x03]
1608         RET
1609         LMEMCPY_4_PAD
1610
1611
1612 /******************************************************************************
1613  * Special case for 6 byte copies
1614  */
1615 #define LMEMCPY_6_LOG2  6       /* 64 bytes */
1616 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2
1617         LMEMCPY_6_PAD
1618 .Lmemcpy_6:
1619         and     r2, r1, #0x03
1620         orr     r2, r2, r0, lsl #2
1621         ands    r2, r2, #0x0f
1622         sub     r3, pc, #0x14
1623         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2
1624
1625 /*
1626  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1627  */
1628         ldr     r2, [r1]
1629         ldrh    r3, [r1, #0x04]
1630         str     r2, [r0]
1631         strh    r3, [r0, #0x04]
1632         RET
1633         LMEMCPY_6_PAD
1634
1635 /*
1636  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1637  */
1638         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1639         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1640         mov     r2, r2, lsr #8          /* r2 = .210 */
1641         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1642         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1643         str     r2, [r0]
1644         strh    r3, [r0, #0x04]
1645         RET
1646         LMEMCPY_6_PAD
1647
1648 /*
1649  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1650  */
1651         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1652         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1653         mov     r1, r3, lsr #16         /* r1 = ..54 */
1654         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1655         str     r2, [r0]
1656         strh    r1, [r0, #0x04]
1657         RET
1658         LMEMCPY_6_PAD
1659
1660 /*
1661  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1662  */
1663         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1664         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1665         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r3 = xxx5 */
1666         mov     r2, r2, lsr #24         /* r2 = ...0 */
1667         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1668         mov     r1, r1, lsl #8          /* r1 = xx5. */
1669         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1670         str     r2, [r0]
1671         strh    r1, [r0, #0x04]
1672         RET
1673         LMEMCPY_6_PAD
1674
1675 /*
1676  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1677  */
1678         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1679         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1680         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1681         strh    r1, [r0, #0x01]
1682         strb    r3, [r0]
1683         mov     r3, r3, lsr #24         /* r3 = ...3 */
1684         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1685         mov     r2, r2, lsr #8          /* r2 = ...5 */
1686         strh    r3, [r0, #0x03]
1687         strb    r2, [r0, #0x05]
1688         RET
1689         LMEMCPY_6_PAD
1690
1691 /*
1692  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1693  */
1694         ldrb    r2, [r1]
1695         ldrh    r3, [r1, #0x01]
1696         ldrh    ip, [r1, #0x03]
1697         ldrb    r1, [r1, #0x05]
1698         strb    r2, [r0]
1699         strh    r3, [r0, #0x01]
1700         strh    ip, [r0, #0x03]
1701         strb    r1, [r0, #0x05]
1702         RET
1703         LMEMCPY_6_PAD
1704
1705 /*
1706  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1707  */
1708         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1709         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1710         strb    r2, [r0]
1711         mov     r3, r1, lsr #24
1712         strb    r3, [r0, #0x05]
1713         mov     r3, r1, lsr #8          /* r3 = .543 */
1714         strh    r3, [r0, #0x03]
1715         mov     r3, r2, lsr #8          /* r3 = ...1 */
1716         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
1717         strh    r3, [r0, #0x01]
1718         RET
1719         LMEMCPY_6_PAD
1720
1721 /*
1722  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1723  */
1724         ldrb    r2, [r1]
1725         ldrh    r3, [r1, #0x01]
1726         ldrh    ip, [r1, #0x03]
1727         ldrb    r1, [r1, #0x05]
1728         strb    r2, [r0]
1729         strh    r3, [r0, #0x01]
1730         strh    ip, [r0, #0x03]
1731         strb    r1, [r0, #0x05]
1732         RET
1733         LMEMCPY_6_PAD
1734
1735 /*
1736  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1737  */
1738         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
1739         ldr     r3, [r1]                /* r3 = 3210 */
1740         mov     r2, r2, lsl #16         /* r2 = 54.. */
1741         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
1742         strh    r3, [r0]
1743         str     r2, [r0, #0x02]
1744         RET
1745         LMEMCPY_6_PAD
1746
1747 /*
1748  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1749  */
1750         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1751         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
1752         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1753         mov     r2, r2, lsl #8          /* r2 = 543. */
1754         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
1755         strh    r1, [r0]
1756         str     r2, [r0, #0x02]
1757         RET
1758         LMEMCPY_6_PAD
1759
1760 /*
1761  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1762  */
1763         ldrh    r2, [r1]
1764         ldr     r3, [r1, #0x02]
1765         strh    r2, [r0]
1766         str     r3, [r0, #0x02]
1767         RET
1768         LMEMCPY_6_PAD
1769
1770 /*
1771  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1772  */
1773         ldrb    r3, [r1]                /* r3 = ...0 */
1774         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
1775         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
1776         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1777         mov     r1, r1, lsl #24         /* r1 = 5... */
1778         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
1779         strh    r3, [r0]
1780         str     r1, [r0, #0x02]
1781         RET
1782         LMEMCPY_6_PAD
1783
1784 /*
1785  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1786  */
1787         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1788         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
1789         strb    r2, [r0]
1790         mov     r2, r2, lsr #8          /* r2 = .321 */
1791         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
1792         mov     r1, r1, lsr #8          /* r1 = ...5 */
1793         str     r2, [r0, #0x01]
1794         strb    r1, [r0, #0x05]
1795         RET
1796         LMEMCPY_6_PAD
1797
1798 /*
1799  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1800  */
1801         ldrb    r2, [r1]
1802         ldrh    r3, [r1, #0x01]
1803         ldrh    ip, [r1, #0x03]
1804         ldrb    r1, [r1, #0x05]
1805         strb    r2, [r0]
1806         strh    r3, [r0, #0x01]
1807         strh    ip, [r0, #0x03]
1808         strb    r1, [r0, #0x05]
1809         RET
1810         LMEMCPY_6_PAD
1811
1812 /*
1813  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1814  */
1815         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1816         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1817         strb    r2, [r0]
1818         mov     r2, r2, lsr #8          /* r2 = ...1 */
1819         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
1820         mov     r1, r1, lsr #24         /* r1 = ...5 */
1821         str     r2, [r0, #0x01]
1822         strb    r1, [r0, #0x05]
1823         RET
1824         LMEMCPY_6_PAD
1825
1826 /*
1827  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1828  */
1829         ldrb    r2, [r1]
1830         ldr     r3, [r1, #0x01]
1831         ldrb    r1, [r1, #0x05]
1832         strb    r2, [r0]
1833         str     r3, [r0, #0x01]
1834         strb    r1, [r0, #0x05]
1835         RET
1836         LMEMCPY_6_PAD
1837
1838
1839 /******************************************************************************
1840  * Special case for 8 byte copies
 *
 * r0 is the destination pointer and r1 the source pointer.  The low two bits
 * of each are combined into a 4-bit index, ((dst & 3) << 2) | (src & 3), that
 * selects one of the 16 code sequences below.  Every sequence is padded with
 * LMEMCPY_8_PAD to (1 << LMEMCPY_8_LOG2) bytes, so case N starts at
 * .Lmemcpy_8 + (N << LMEMCPY_8_LOG2); index 0 falls through the dispatch.
 * Each case is headed by its index in binary: dst bits first, then src bits.
 *
 * Every halfword/word load and store in a case is naturally aligned for that
 * case's pointer alignment.  Some cases therefore load an aligned word that
 * contains one byte outside the 8-byte source; that byte is discarded.
 *
 * Comment notation: digits 0-7 name source bytes by offset and a register is
 * written most significant byte first; '.' is a zero byte and 'x' is the
 * discarded out-of-range byte.  The values shown after each shift/merge, and
 * so the sequences as written, are for little-endian byte order.  strb/strh
 * store only the low 8/16 bits, e.g. strh of '.321' writes bytes 1 and 2.
 *
 * No case modifies r0; r1, r2, r3 and ip are used as scratch.
1841  */
1842 #define LMEMCPY_8_LOG2  6       /* 64 bytes per case */
1843 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2
1844         LMEMCPY_8_PAD
1845 .Lmemcpy_8:
1846         and     r2, r1, #0x03           /* r2 = src & 3 */
1847         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
1848         ands    r2, r2, #0x0f           /* r2 = case index; Z set if 0000 */
1849         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_8 (pc reads as this insn + 8) */
1850         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2 /* index != 0: jump to its slot */
1851
1852 /*
1853  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1854  */
1855         ldr     r2, [r1]
1856         ldr     r3, [r1, #0x04]
1857         str     r2, [r0]
1858         str     r3, [r0, #0x04]
1859         RET
1860         LMEMCPY_8_PAD
1861
1862 /*
1863  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
1864  */
1865         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1866         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
1867         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
1868         mov     r3, r3, lsr #8          /* r3 = .210 */
1869         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1870         mov     r1, r1, lsl #24         /* r1 = 7... */
1871         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
1872         str     r3, [r0]
1873         str     r2, [r0, #0x04]
1874         RET
1875         LMEMCPY_8_PAD
1876
1877 /*
1878  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1879  */
1880         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1881         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1882         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
1883         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1884         mov     r3, r3, lsr #16         /* r3 = ..54 */
1885         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
1886         str     r2, [r0]
1887         str     r3, [r0, #0x04]
1888         RET
1889         LMEMCPY_8_PAD
1890
1891 /*
1892  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
1893  */
1894         ldrb    r3, [r1]                /* r3 = ...0 */
1895         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
1896         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
1897         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1898         mov     r2, r2, lsr #24         /* r2 = ...4 */
1899         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
1900         str     r3, [r0]
1901         str     r2, [r0, #0x04]
1902         RET
1903         LMEMCPY_8_PAD
1904
1905 /*
1906  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1907  */
1908         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1909         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
1910         strb    r3, [r0]
1911         mov     r1, r2, lsr #24         /* r1 = ...7 */
1912         strb    r1, [r0, #0x07]
1913         mov     r1, r3, lsr #8          /* r1 = .321 */
1914         mov     r3, r3, lsr #24         /* r3 = ...3 */
1915         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
1916         strh    r1, [r0, #0x01]
1917         str     r3, [r0, #0x03]
1918         RET
1919         LMEMCPY_8_PAD
1920
1921 /*
1922  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1923  */
1924         ldrb    r2, [r1]
1925         ldrh    r3, [r1, #0x01]
1926         ldr     ip, [r1, #0x03]
1927         ldrb    r1, [r1, #0x07]
1928         strb    r2, [r0]
1929         strh    r3, [r0, #0x01]
1930         str     ip, [r0, #0x03]
1931         strb    r1, [r0, #0x07]
1932         RET
1933         LMEMCPY_8_PAD
1934
1935 /*
1936  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1937  */
1938         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1939         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1940         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
1941         strb    r2, [r0]                /* 0 */
1942         mov     ip, r1, lsr #8          /* ip = ...7 */
1943         strb    ip, [r0, #0x07]         /* 7 */
1944         mov     ip, r2, lsr #8          /* ip = ...1 */
1945         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
1946         mov     r3, r3, lsr #8          /* r3 = .543 */
1947         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
1948         strh    ip, [r0, #0x01]
1949         str     r3, [r0, #0x03]
1950         RET
1951         LMEMCPY_8_PAD
1952
1953 /*
1954  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1955  */
1956         ldrb    r3, [r1]                /* r3 = ...0 */
1957         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
1958         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
1959         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
1960         strb    r3, [r0]
1961         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
1962         strh    ip, [r0, #0x01]
1963         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
1964         str     r2, [r0, #0x03]
1965         strb    r1, [r0, #0x07]
1966         RET
1967         LMEMCPY_8_PAD
1968
1969 /*
1970  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1971  */
1972         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1973         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
1974         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
1975         strh    r2, [r0]
1976         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
1977         mov     r3, r3, lsr #16         /* r3 = ..76 */
1978         str     r2, [r0, #0x02]
1979         strh    r3, [r0, #0x06]
1980         RET
1981         LMEMCPY_8_PAD
1982
1983 /*
1984  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1985  */
1986         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1987         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
1988         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
1989         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1990         strh    r1, [r0]
1991         mov     r1, r2, lsr #24         /* r1 = ...2 */
1992         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
1993         mov     r3, r3, lsr #24         /* r3 = ...6 */
1994         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
1995         str     r1, [r0, #0x02]
1996         strh    r3, [r0, #0x06]
1997         RET
1998         LMEMCPY_8_PAD
1999
2000 /*
2001  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2002  */
2003         ldrh    r2, [r1]
2004         ldr     ip, [r1, #0x02]
2005         ldrh    r3, [r1, #0x06]
2006         strh    r2, [r0]
2007         str     ip, [r0, #0x02]
2008         strh    r3, [r0, #0x06]
2009         RET
2010         LMEMCPY_8_PAD
2011
2012 /*
2013  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2014  */
2015         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 */
2016         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2017         ldrb    ip, [r1]                /* ip = ...0 */
2018         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
2019         strh    r1, [r0, #0x06]
2020         mov     r3, r3, lsl #24         /* r3 = 5... */
2021         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
2022         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
2023         str     r3, [r0, #0x02]
2024         strh    r2, [r0]
2025         RET
2026         LMEMCPY_8_PAD
2027
2028 /*
2029  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2030  */
2031         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2032         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2033         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
2034         strh    r1, [r0, #0x05]
2035         strb    r2, [r0]
2036         mov     r1, r3, lsr #24         /* r1 = ...7 */
2037         strb    r1, [r0, #0x07]
2038         mov     r2, r2, lsr #8          /* r2 = .321 */
2039         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
2040         str     r2, [r0, #0x01]
2041         RET
2042         LMEMCPY_8_PAD
2043
2044 /*
2045  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2046  */
2047         ldrb    r3, [r1]                /* r3 = ...0 */
2048         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
2049         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2050         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2051         strb    r3, [r0]
2052         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
2053         strh    r3, [r0, #0x05]
2054         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
2055         str     r2, [r0, #0x01]
2056         strb    r1, [r0, #0x07]
2057         RET
2058         LMEMCPY_8_PAD
2059
2060 /*
2061  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2062  */
2063         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2064         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2065         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2066         strb    r2, [r0]
2067         mov     ip, r2, lsr #8          /* ip = ...1 */
2068         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2069         mov     r2, r1, lsr #8          /* r2 = ...7 */
2070         strb    r2, [r0, #0x07]
2071         mov     r1, r1, lsl #8          /* r1 = .76. */
2072         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
2073         str     ip, [r0, #0x01]
2074         strh    r1, [r0, #0x05]
2075         RET
2076         LMEMCPY_8_PAD
2077
2078 /*
2079  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2080  */
2081         ldrb    r2, [r1]
2082         ldr     ip, [r1, #0x01]
2083         ldrh    r3, [r1, #0x05]
2084         ldrb    r1, [r1, #0x07]
2085         strb    r2, [r0]
2086         str     ip, [r0, #0x01]
2087         strh    r3, [r0, #0x05]
2088         strb    r1, [r0, #0x07]
2089         RET
2090         LMEMCPY_8_PAD
2091
2092 /******************************************************************************
2093  * Special case for 12 byte copies
 *
 * r0 is the destination pointer and r1 the source pointer.  The low two bits
 * of each are combined into a 4-bit index, ((dst & 3) << 2) | (src & 3), that
 * selects one of the 16 code sequences below.  Every sequence except the
 * last is padded with LMEMCPY_C_PAD to (1 << LMEMCPY_C_LOG2) bytes, so case N
 * starts at .Lmemcpy_c + (N << LMEMCPY_C_LOG2); index 0 falls through the
 * dispatch.  Each case is headed by its index in binary: dst bits first,
 * then src bits.
 *
 * Every halfword/word load and store in a case is naturally aligned for that
 * case's pointer alignment.  Some cases therefore load an aligned word that
 * contains one byte outside the 12-byte source; that byte is discarded.
 *
 * Comment notation: 0-9, A and B name source bytes by offset and a register
 * is written most significant byte first; '.' is a zero byte and 'x' is the
 * discarded out-of-range byte.  The values shown after each shift/merge, and
 * so the sequences as written, are for little-endian byte order.  strb/strh
 * store only the low 8/16 bits, e.g. strh of '.321' writes bytes 1 and 2.
 *
 * No case modifies r0; r1, r2, r3 and ip are used as scratch.
2094  */
2095 #define LMEMCPY_C_LOG2  7       /* 128 bytes per case */
2096 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2
2097         LMEMCPY_C_PAD
2098 .Lmemcpy_c:
2099         and     r2, r1, #0x03           /* r2 = src & 3 */
2100         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
2101         ands    r2, r2, #0x0f           /* r2 = case index; Z set if 0000 */
2102         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_c (pc reads as this insn + 8) */
2103         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* index != 0: jump to its slot */
2104
2105 /*
2106  * 0000: dst is 32-bit aligned, src is 32-bit aligned
2107  */
2108         ldr     r2, [r1]
2109         ldr     r3, [r1, #0x04]
2110         ldr     r1, [r1, #0x08]
2111         str     r2, [r0]
2112         str     r3, [r0, #0x04]
2113         str     r1, [r0, #0x08]
2114         RET
2115         LMEMCPY_C_PAD
2116
2117 /*
2118  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
2119  */
2120         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
2121         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2122         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2123         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2124         mov     r2, r2, lsl #24         /* r2 = B... */
2125         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
2126         str     r2, [r0, #0x08]
2127         mov     r2, ip, lsl #24         /* r2 = 7... */
2128         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
2129         mov     r1, r1, lsr #8          /* r1 = .210 */
2130         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
2131         str     r2, [r0, #0x04]
2132         str     r1, [r0]
2133         RET
2134         LMEMCPY_C_PAD
2135
2136 /*
2137  * 0010: dst is 32-bit aligned, src is 16-bit aligned
2138  */
2139         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2140         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2141         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2142         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2143         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2144         str     r2, [r0]
2145         mov     r3, r3, lsr #16         /* r3 = ..54 */
2146         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
2147         mov     r1, r1, lsl #16         /* r1 = BA.. */
2148         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
2149         str     r3, [r0, #0x04]
2150         str     r1, [r0, #0x08]
2151         RET
2152         LMEMCPY_C_PAD
2153
2154 /*
2155  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
2156  */
2157         ldrb    r2, [r1]                /* r2 = ...0 */
2158         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2159         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2160         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2161         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
2162         str     r2, [r0]
2163         mov     r3, r3, lsr #24         /* r3 = ...4 */
2164         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
2165         mov     r1, r1, lsl #8          /* r1 = BA9. */
2166         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
2167         str     r3, [r0, #0x04]
2168         str     r1, [r0, #0x08]
2169         RET
2170         LMEMCPY_C_PAD
2171
2172 /*
2173  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2174  */
2175         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2176         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2177         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
2178         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
2179         strh    r1, [r0, #0x01]
2180         strb    r2, [r0]
2181         mov     r1, r2, lsr #24         /* r1 = ...3 */
2182         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
2183         mov     r1, r3, lsr #24         /* r1 = ...7 */
2184         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
2185         mov     ip, ip, lsr #24         /* ip = ...B */
2186         str     r2, [r0, #0x03]
2187         str     r1, [r0, #0x07]
2188         strb    ip, [r0, #0x0b]
2189         RET
2190         LMEMCPY_C_PAD
2191
2192 /*
2193  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2194  */
2195         ldrb    r2, [r1]
2196         ldrh    r3, [r1, #0x01]
2197         ldr     ip, [r1, #0x03]
2198         strb    r2, [r0]
2199         ldr     r2, [r1, #0x07]
2200         ldrb    r1, [r1, #0x0b]
2201         strh    r3, [r0, #0x01]
2202         str     ip, [r0, #0x03]
2203         str     r2, [r0, #0x07]
2204         strb    r1, [r0, #0x0b]
2205         RET
2206         LMEMCPY_C_PAD
2207
2208 /*
2209  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2210  */
2211         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2212         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2213         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2214         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2215         strb    r2, [r0]
2216         mov     r2, r2, lsr #8          /* r2 = ...1 */
2217         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2218         strh    r2, [r0, #0x01]
2219         mov     r2, r3, lsr #8          /* r2 = .543 */
2220         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
2221         mov     r2, ip, lsr #8          /* r2 = .987 */
2222         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
2223         mov     r1, r1, lsr #8          /* r1 = ...B */
2224         str     r3, [r0, #0x03]
2225         str     r2, [r0, #0x07]
2226         strb    r1, [r0, #0x0b]
2227         RET
2228         LMEMCPY_C_PAD
2229
2230 /*
2231  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2232  */
2233         ldrb    r2, [r1]
2234         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2235         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2236         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2237         strb    r2, [r0]
2238         strh    r3, [r0, #0x01]
2239         mov     r3, r3, lsr #16         /* r3 = ..43 */
2240         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
2241         mov     ip, ip, lsr #16         /* ip = ..87 */
2242         orr     ip, ip, r1, lsl #16     /* ip = A987 */
2243         mov     r1, r1, lsr #16         /* r1 = ..xB */
2244         str     r3, [r0, #0x03]
2245         str     ip, [r0, #0x07]
2246         strb    r1, [r0, #0x0b]
2247         RET
2248         LMEMCPY_C_PAD
2249
2250 /*
2251  * 1000: dst is 16-bit aligned, src is 32-bit aligned
2252  */
2253         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
2254         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2255         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
2256         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2257         strh    ip, [r0]
2258         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
2259         mov     r3, r3, lsr #16         /* r3 = ..76 */
2260         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
2261         mov     r2, r2, lsr #16         /* r2 = ..BA */
2262         str     r1, [r0, #0x02]
2263         str     r3, [r0, #0x06]
2264         strh    r2, [r0, #0x0a]
2265         RET
2266         LMEMCPY_C_PAD
2267
2268 /*
2269  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2270  */
2271         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2272         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2273         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
2274         strh    ip, [r0]
2275         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2276         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2277         mov     r2, r2, lsr #24         /* r2 = ...2 */
2278         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
2279         mov     r3, r3, lsr #24         /* r3 = ...6 */
2280         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
2281         mov     r1, r1, lsl #8          /* r1 = ..B. */
2282         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
2283         str     r2, [r0, #0x02]
2284         str     r3, [r0, #0x06]
2285         strh    r1, [r0, #0x0a]
2286         RET
2287         LMEMCPY_C_PAD
2288
2289 /*
2290  * 1010: dst is 16-bit aligned, src is 16-bit aligned
2291  */
2292         ldrh    r2, [r1]
2293         ldr     r3, [r1, #0x02]
2294         ldr     ip, [r1, #0x06]
2295         ldrh    r1, [r1, #0x0a]
2296         strh    r2, [r0]
2297         str     r3, [r0, #0x02]
2298         str     ip, [r0, #0x06]
2299         strh    r1, [r0, #0x0a]
2300         RET
2301         LMEMCPY_C_PAD
2302
2303 /*
2304  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2305  */
2306         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 */
2307         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
2308         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
2309         strh    ip, [r0, #0x0a]
2310         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2311         ldrb    r1, [r1]                /* r1 = ...0 */
2312         mov     r2, r2, lsl #24         /* r2 = 9... */
2313         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
2314         mov     r3, r3, lsl #24         /* r3 = 5... */
2315         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
2316         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
2317         str     r2, [r0, #0x06]
2318         str     r3, [r0, #0x02]
2319         strh    r1, [r0]
2320         RET
2321         LMEMCPY_C_PAD
2322
2323 /*
2324  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2325  */
2326         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2327         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2328         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2329         strb    r2, [r0]
2330         mov     r3, r2, lsr #8          /* r3 = .321 */
2331         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2332         str     r3, [r0, #0x01]
2333         mov     r3, ip, lsr #8          /* r3 = .765 */
2334         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2335         str     r3, [r0, #0x05]
2336         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2337         strh    r1, [r0, #0x09]
2338         mov     r1, r1, lsr #16         /* r1 = ...B */
2339         strb    r1, [r0, #0x0b]
2340         RET
2341         LMEMCPY_C_PAD
2342
2343 /*
2344  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2345  */
2346         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2347         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2348         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2349         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2350         strb    r2, [r0, #0x0b]
2351         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2352         strh    r2, [r0, #0x09]
2353         mov     r3, r3, lsl #16         /* r3 = 87.. */
2354         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2355         mov     ip, ip, lsl #16         /* ip = 43.. */
2356         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2357         mov     r1, r1, lsr #8          /* r1 = .210 */
2358         str     r3, [r0, #0x05]
2359         str     ip, [r0, #0x01]
2360         strb    r1, [r0]
2361         RET
2362         LMEMCPY_C_PAD
2363
2364 /*
2365  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2366  */
2367         ldrh    r2, [r1]                /* r2 = ..10 */
2368         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2369         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2370         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2371         strb    r2, [r0]
2372         mov     r2, r2, lsr #8          /* r2 = ...1 */
2373         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2374         mov     r3, r3, lsr #24         /* r3 = ...5 */
2375         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2376         mov     ip, ip, lsr #24         /* ip = ...9 */
2377         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2378         mov     r1, r1, lsr #8          /* r1 = ...B */
2379         str     r2, [r0, #0x01]
2380         str     r3, [r0, #0x05]
2381         strh    ip, [r0, #0x09]
2382         strb    r1, [r0, #0x0b]
2383         RET
2384         LMEMCPY_C_PAD
2385
2386 /*
2387  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2388  */
2389         ldrb    r2, [r1]
2390         ldr     r3, [r1, #0x01]
2391         ldr     ip, [r1, #0x05]
2392         strb    r2, [r0]
2393         ldrh    r2, [r1, #0x09]
2394         ldrb    r1, [r1, #0x0b]
2395         str     r3, [r0, #0x01]
2396         str     ip, [r0, #0x05]
2397         strh    r2, [r0, #0x09]
2398         strb    r1, [r0, #0x0b]
2399         RET
2400 END(memcpy)
2401 #endif /* _ARM_ARCH_5E */
2402
2403 #ifdef GPROF
/*
 * Five symbols declared with ENTRY()/END() -- user, btrap, etrap, bintr and
 * eintr -- assembled only when GPROF is defined.  Each body is a single nop
 * with no return instruction, so execution entering one of them would run on
 * into whatever is placed after it.
 * NOTE(review): presumably these serve as address markers for the kernel
 * profiling code rather than as callable functions -- confirm before
 * changing their order or contents.
 */
2404
2405 ENTRY(user)
2406         nop
2407 END(user)
2408 ENTRY(btrap)
2409         nop
2410 END(btrap)
2411 ENTRY(etrap)
2412         nop
2413 END(etrap)
2414 ENTRY(bintr)
2415         nop
2416 END(bintr)
2417 ENTRY(eintr)
2418         nop
2419 END(eintr)
2420 #endif