6/sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  * 3. All advertising materials mentioning features or use of this software
  76  *    must display the following acknowledgement:
  77  *      This product includes software developed by the NetBSD
  78  *      Foundation, Inc. and its contributors.
  79  * 4. Neither the name of The NetBSD Foundation nor the names of its
  80  *    contributors may be used to endorse or promote products derived
  81  *    from this software without specific prior written permission.
  82  *
  83  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  84  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  85  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  86  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  87  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  88  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  89  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  90  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  91  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  92  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  93  * POSSIBILITY OF SUCH DAMAGE.
  94  */
  95
  96 #include <machine/asm.h>
  97 #include <machine/asmacros.h>
  98 __FBSDID("$FreeBSD$");
  99
 100 #include "assym.s"
 101
 102 /*
 103  * memset / bzero: Set a block of memory to the specified byte value
 104  *
 105  * On entry (memset):
 106  *   r0 - dest address
 107  *   r1 - byte to write
 108  *   r2 - number of bytes to write
 109  * On entry (bzero): r0 - dest address, r1 - number of bytes to zero
 110  * On exit:
 111  *   r0 - dest address (never written here; ip is the moving store pointer)
 112  */
 113 /* LINTSTUB: Func: void bzero(void *, size_t) */
 114 ENTRY(bzero)
 115         mov     r3, #0x00               /* Fill value is zero */
 116         b       do_memset               /* r1 already holds the length */
 117
 118 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 119 ENTRY(memset)
 120         and     r3, r1, #0xff           /* We deal with bytes */
 121         mov     r1, r2                  /* r1 = length, same layout as bzero */
 122 do_memset:
 123         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 124         mov     ip, r0                  /* Store through ip so r0 stays intact */
 125         blt     .Lmemset_lessthanfour   /* NOTE(review): signed compare on the length */
 126
 127         /* Ok first we will word align the address */
 128         ands    r2, ip, #0x03           /* Get the bottom two bits */
 129         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 130
 131         /* We are now word aligned */
 132 .Lmemset_wordaligned:
 133         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 134 #ifdef __XSCALE__
 135         tst     ip, #0x04               /* Quad-align (8 bytes) for Xscale */
 136 #else
 137         cmp     r1, #0x10               /* Flags are consumed by the blt below */
 138 #endif
 139         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 140 #ifdef __XSCALE__
 141         subne   r1, r1, #0x04           /* Quad-align if necessary */
 142         strne   r3, [ip], #0x04         /* One word store makes ip 8-byte aligned */
 143         cmp     r1, #0x10
 144 #endif
 145         blt     .Lmemset_loop4          /* If less than 16 then use words */
 146         mov     r2, r3                  /* Duplicate data */
 147         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 148         blt     .Lmemset_loop32
 149
 150         /* Do 128 bytes at a time */
 151 .Lmemset_loop128:
 152         subs    r1, r1, #0x80           /* ge: a whole 128-byte chunk remains */
 153 #ifdef __XSCALE__
 154         strged  r2, [ip], #0x08         /* Each strd writes the r2/r3 pair: 8 bytes */
 155         strged  r2, [ip], #0x08
 156         strged  r2, [ip], #0x08
 157         strged  r2, [ip], #0x08
 158         strged  r2, [ip], #0x08
 159         strged  r2, [ip], #0x08
 160         strged  r2, [ip], #0x08
 161         strged  r2, [ip], #0x08
 162         strged  r2, [ip], #0x08
 163         strged  r2, [ip], #0x08
 164         strged  r2, [ip], #0x08
 165         strged  r2, [ip], #0x08
 166         strged  r2, [ip], #0x08
 167         strged  r2, [ip], #0x08
 168         strged  r2, [ip], #0x08
 169         strged  r2, [ip], #0x08
 170 #else
 171         stmgeia ip!, {r2-r3}            /* Each stm writes r2 and r3: 8 bytes */
 172         stmgeia ip!, {r2-r3}
 173         stmgeia ip!, {r2-r3}
 174         stmgeia ip!, {r2-r3}
 175         stmgeia ip!, {r2-r3}
 176         stmgeia ip!, {r2-r3}
 177         stmgeia ip!, {r2-r3}
 178         stmgeia ip!, {r2-r3}
 179         stmgeia ip!, {r2-r3}
 180         stmgeia ip!, {r2-r3}
 181         stmgeia ip!, {r2-r3}
 182         stmgeia ip!, {r2-r3}
 183         stmgeia ip!, {r2-r3}
 184         stmgeia ip!, {r2-r3}
 185         stmgeia ip!, {r2-r3}
 186         stmgeia ip!, {r2-r3}
 187 #endif
 188         bgt     .Lmemset_loop128        /* Flags still come from the subs above */
 189         RETeq                   /* Nothing left so just exit */
 190
 191         add     r1, r1, #0x80           /* Adjust for extra sub */
 192
 193         /* Do 32 bytes at a time */
 194 .Lmemset_loop32:
 195         subs    r1, r1, #0x20           /* ge: a whole 32-byte chunk remains */
 196 #ifdef __XSCALE__
 197         strged  r2, [ip], #0x08
 198         strged  r2, [ip], #0x08
 199         strged  r2, [ip], #0x08
 200         strged  r2, [ip], #0x08
 201 #else
 202         stmgeia ip!, {r2-r3}
 203         stmgeia ip!, {r2-r3}
 204         stmgeia ip!, {r2-r3}
 205         stmgeia ip!, {r2-r3}
 206 #endif
 207         bgt     .Lmemset_loop32
 208         RETeq                   /* Nothing left so just exit */
 209
 210         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 211
 212         /* Deal with 16 bytes or more (ge from the adds: at least 16 were left) */
 213 #ifdef __XSCALE__
 214         strged  r2, [ip], #0x08
 215         strged  r2, [ip], #0x08
 216 #else
 217         stmgeia ip!, {r2-r3}
 218         stmgeia ip!, {r2-r3}
 219 #endif
 220         RETeq                   /* Exactly 16 were left so just exit */
 221
 222         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 223
 224         /* Store whole words while at least 4 bytes remain */
 225 .Lmemset_loop4:
 226         subs    r1, r1, #0x04
 227         strge   r3, [ip], #0x04
 228         bgt     .Lmemset_loop4
 229         RETeq                   /* Nothing left so just exit */
 230
 231 #ifdef __XSCALE__
 232         /* Compensate for 64-bit alignment check */
 233         adds    r1, r1, #0x04           /* r1 = bytes still to set (0..3) */
 234         RETeq
 235         cmp     r1, #2                  /* ge: two or more left, gt: three left */
 236 #else
 237         cmp     r1, #-2                 /* r1 = bytes left - 4; ge: 2+, gt: 3 */
 238 #endif
 239
 240         strb    r3, [ip], #0x01         /* Set 1 byte */
 241         strgeb  r3, [ip], #0x01         /* Set another byte */
 242         strgtb  r3, [ip]                /* and a third */
 243         RET                     /* Exit */
 244
 245 .Lmemset_wordunaligned:
 246         rsb     r2, r2, #0x004          /* r2 = bytes up to the next word (1..3) */
 247         strb    r3, [ip], #0x01         /* Set 1 byte */
 248         cmp     r2, #0x02
 249         strgeb  r3, [ip], #0x01         /* Set another byte */
 250         sub     r1, r1, r2              /* Account for them; flags are untouched */
 251         strgtb  r3, [ip], #0x01         /* and a third */
 252         cmp     r1, #0x04               /* At least 4 bytes left? */
 253         bge     .Lmemset_wordaligned    /* Yup */
 254
 255 .Lmemset_lessthanfour:
 256         cmp     r1, #0x00
 257         RETeq                   /* Zero length so exit */
 258         strb    r3, [ip], #0x01         /* Set 1 byte */
 259         cmp     r1, #0x02               /* ge: two or more left, gt: three left */
 260         strgeb  r3, [ip], #0x01         /* Set another byte */
 261         strgtb  r3, [ip]                /* and a third */
 262         RET                     /* Exit */
 263 /* bcmp(r0 = b1, r1 = b2, r2 = len): r0 = 0 if equal, else b1 byte - b2 byte */
 264 ENTRY(bcmp)
 265         mov     ip, r0                  /* ip = b1, freeing r0 for the result */
 266         cmp     r2, #0x06
 267         beq     .Lmemcmp_6bytes         /* len == 6 has its own unrolled path */
 268         mov     r0, #0x00
 269
 270         /* Are both addresses aligned the same way? */
 271         cmp     r2, #0x00
 272         eornes  r3, ip, r1              /* Z if len == 0 or b1 == b2 */
 273         RETeq                   /* len == 0, or same addresses! */
 274         tst     r3, #0x03               /* Do the low two address bits differ? */
 275         subne   r2, r2, #0x01           /* The byte loop counts down from len - 1 */
 276         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
 277
 278         /* Word-align the addresses, if necessary */
 279         sub     r3, r1, #0x05
 280         ands    r3, r3, #0x03           /* 0, 1, 2: need 3, 2, 1 bytes; 3: aligned */
 281         add     r3, r3, r3, lsl #1      /* r3 *= 3; flags still from the ands */
 282         addne   pc, pc, r3, lsl #3      /* Skip 24 bytes (6 insns) per stage not needed */
 283         nop                             /* Pad: pc reads as addne + 8, the first stage */
 284
 285         /* Compare up to 3 bytes; every stage must stay 6 insns for the jump above */
 286         ldrb    r0, [ip], #0x01
 287         ldrb    r3, [r1], #0x01
 288         subs    r0, r0, r3
 289         RETne                           /* Mismatch: r0 = b1 byte - b2 byte */
 290         subs    r2, r2, #0x01
 291         RETeq                           /* Length used up, r0 is 0 */
 292
 293         /* Compare up to 2 bytes */
 294         ldrb    r0, [ip], #0x01
 295         ldrb    r3, [r1], #0x01
 296         subs    r0, r0, r3
 297         RETne
 298         subs    r2, r2, #0x01
 299         RETeq
 300
 301         /* Compare 1 byte */
 302         ldrb    r0, [ip], #0x01
 303         ldrb    r3, [r1], #0x01
 304         subs    r0, r0, r3
 305         RETne
 306         subs    r2, r2, #0x01
 307         RETeq
 308
 309         /* Compare 4 bytes at a time, if possible */
 310         subs    r2, r2, #0x04           /* r2 = bytes left - 4 */
 311         bcc     .Lmemcmp_bytewise       /* Fewer than 4 bytes left */
 312 .Lmemcmp_word_aligned:
 313         ldr     r0, [ip], #0x04
 314         ldr     r3, [r1], #0x04
 315         subs    r2, r2, #0x04           /* Carry clear: this was the last whole word */
 316         cmpcs   r0, r3                  /* Otherwise compare and keep going if equal */
 317         beq     .Lmemcmp_word_aligned
 318         sub     r0, r0, r3              /* Word difference, only used as a zero test */
 319
 320         /* Correct for extra subtraction, and check if done */
 321         adds    r2, r2, #0x04
 322         cmpeq   r0, #0x00               /* If done, did all bytes match? */
 323         RETeq                   /* Yup. Just return */
 324
 325         /* Re-do the final word byte-wise */
 326         sub     ip, ip, #0x04
 327         sub     r1, r1, #0x04
 328
 329 .Lmemcmp_bytewise:
 330         add     r2, r2, #0x03           /* r2 = bytes left - 1 */
 331 .Lmemcmp_bytewise2:
 332         ldrb    r0, [ip], #0x01
 333         ldrb    r3, [r1], #0x01
 334         subs    r2, r2, #0x01           /* Carry clear: that was the last byte */
 335         cmpcs   r0, r3
 336         beq     .Lmemcmp_bytewise2      /* Equal and more to go */
 337         sub     r0, r0, r3              /* Difference of the last pair loaded */
 338         RET
 339
 340         /*
 341          * 6 byte compares are very common, thanks to the network stack.
 342          * This code is hand-scheduled to reduce the number of stalls for
 343          * load results. Everything else being equal, this will be ~32%
 344          * faster than a byte-wise memcmp.
 345          */
 346         .align  5
 347 .Lmemcmp_6bytes:
 348         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
 349         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
 350         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
 351         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
 352         ldreqb  r3, [ip, #0x01]         /* r3 = b1#1 */
 353         RETne                   /* Return if mismatch on #0 */
 354         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
 355         ldreqb  r3, [r1, #0x02]         /* r3 = b2#2 */
 356         ldreqb  r0, [ip, #0x02]         /* r0 = b1#2 */
 357         RETne                   /* Return if mismatch on #1 */
 358         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
 359         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
 360         ldreqb  r3, [ip, #0x03]         /* r3 = b1#3 */
 361         RETne                   /* Return if mismatch on #2 */
 362         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
 363         ldreqb  r3, [r1, #0x04]         /* r3 = b2#4 */
 364         ldreqb  r0, [ip, #0x04]         /* r0 = b1#4 */
 365         RETne                   /* Return if mismatch on #3 */
 366         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
 367         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
 368         ldreqb  r3, [ip, #0x05]         /* r3 = b1#5 */
 369         RETne                   /* Return if mismatch on #4 */
 370         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
 371         RET
 372
 373 ENTRY(bcopy)                            /* On entry r0 = src, r1 = dst, r2 = len */
 374         /* switch the source and destination registers (xor swap), then fall into memmove */
 375         eor     r0, r1, r0
 376         eor     r1, r0, r1
 377         eor     r0, r1, r0
 378 ENTRY(memmove)                          /* On entry r0 = dst, r1 = src, r2 = len */
 379         /* Do the buffers overlap? r3 = unsigned distance between them */
 380         cmp     r0, r1
 381         RETeq           /* Bail now if src/dst are the same */
 382         subcs   r3, r0, r1      /* if (dst > src) r3 = dst - src */
 383         subcc   r3, r1, r0      /* if (src > dst) r3 = src - dst */
 384         cmp     r3, r2          /* if (r3 >= len) there is no overlap, so ... */
 385         bcs     PIC_SYM(_C_LABEL(memcpy), PLT)  /* ... tail-call memcpy (same r0-r2, lr) */
 386
 387         /* Determine copy direction */
 388         cmp     r1, r0
 389         bcc     .Lmemmove_backwards     /* src < dst: copy from the top down */
 390
 391         moveq   r0, #0                  /* Not reached: src == dst returned above */
 392         RETeq
 393
 394         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
 395         subs    r2, r2, #4              /* r2 = bytes left - 4 from here on */
 396         blt     .Lmemmove_fl4           /* less than 4 bytes */
 397         ands    r12, r0, #3
 398         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
 399         ands    r12, r1, #3
 400         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
 401
 402 .Lmemmove_ft8:
 403         /* We have aligned source and destination */
 404         subs    r2, r2, #8              /* r2 = bytes left - 12 */
 405         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
 406         subs    r2, r2, #0x14           /* r2 = bytes left - 32 */
 407         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
 408         stmdb   sp!, {r4}               /* borrow r4 */
 409
 410         /* blat 32 bytes at a time */
 411         /* XXX for really big copies perhaps we should use more registers */
 412 .Lmemmove_floop32:
 413         ldmia   r1!, {r3, r4, r12, lr}  /* lr is scratch here; it was saved above */
 414         stmia   r0!, {r3, r4, r12, lr}
 415         ldmia   r1!, {r3, r4, r12, lr}
 416         stmia   r0!, {r3, r4, r12, lr}
 417         subs    r2, r2, #0x20
 418         bge     .Lmemmove_floop32
 419
 420         cmn     r2, #0x10               /* ge: at least 16 bytes are still left */
 421         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 422         stmgeia r0!, {r3, r4, r12, lr}
 423         subge   r2, r2, #0x10
 424         ldmia   sp!, {r4}               /* return r4 */
 425
 426 .Lmemmove_fl32:
 427         adds    r2, r2, #0x14           /* r2 = bytes left - 12 */
 428
 429         /* blat 12 bytes at a time */
 430 .Lmemmove_floop12:
 431         ldmgeia r1!, {r3, r12, lr}
 432         stmgeia r0!, {r3, r12, lr}
 433         subges  r2, r2, #0x0c           /* Only updates the flags when it ran */
 434         bge     .Lmemmove_floop12
 435
 436 .Lmemmove_fl12:
 437         adds    r2, r2, #8              /* r2 = bytes left - 4 */
 438         blt     .Lmemmove_fl4
 439
 440         subs    r2, r2, #4              /* lt: 4..7 bytes left, ge: 8..11 left */
 441         ldrlt   r3, [r1], #4            /* Copy one word ... */
 442         strlt   r3, [r0], #4
 443         ldmgeia r1!, {r3, r12}          /* ... or two */
 444         stmgeia r0!, {r3, r12}
 445         subge   r2, r2, #4              /* Either way r2 = bytes left - 4 again */
 446
 447 .Lmemmove_fl4:
 448         /* less than 4 bytes to go */
 449         adds    r2, r2, #4              /* r2 = bytes left (0..3) */
 450         ldmeqia sp!, {r0, pc}           /* done: restore dest addr and return */
 451
 452         /* copy the crud byte at a time */
 453         cmp     r2, #2                  /* ge: two or more left, gt: three left */
 454         ldrb    r3, [r1], #1
 455         strb    r3, [r0], #1
 456         ldrgeb  r3, [r1], #1
 457         strgeb  r3, [r0], #1
 458         ldrgtb  r3, [r1], #1
 459         strgtb  r3, [r0], #1
 460         ldmia   sp!, {r0, pc}           /* Restore dest addr and return */
 461
 462         /* erg - unaligned destination; r12 = dst & 3, r2 = bytes left - 4 */
 463 .Lmemmove_fdestul:
 464         rsb     r12, r12, #4            /* r12 = bytes up to the next word (1..3) */
 465         cmp     r12, #2                 /* ge: copy two or more, gt: copy three */
 466
 467         /* align destination with byte copies */
 468         ldrb    r3, [r1], #1
 469         strb    r3, [r0], #1
 470         ldrgeb  r3, [r1], #1
 471         strgeb  r3, [r0], #1
 472         ldrgtb  r3, [r1], #1
 473         strgtb  r3, [r0], #1
 474         subs    r2, r2, r12             /* Account for the bytes just copied */
 475         blt     .Lmemmove_fl4           /* less than 4 bytes */
 476
 477         ands    r12, r1, #3             /* Falls into the unaligned source code if nonzero */
 478         beq     .Lmemmove_ft8           /* we have an aligned source */
 479
 480         /* erg - unaligned source; dst is word aligned, r12 = src & 3 */
 481         /* This is where it gets nasty ... words are rebuilt with shifts */
 482 .Lmemmove_fsrcul:
 483         bic     r1, r1, #3              /* Round src down to a word boundary */
 484         ldr     lr, [r1], #4            /* lr = first word, only part of it is wanted */
 485         cmp     r12, #2
 486         bgt     .Lmemmove_fsrcul3       /* src offset 3 */
 487         beq     .Lmemmove_fsrcul2       /* src offset 2 */
 488         cmp     r2, #0x0c               /* src offset 1; r2 = bytes left - 4 */
 489         blt     .Lmemmove_fsrcul1loop4  /* Under 16 bytes: go a word at a time */
 490         sub     r2, r2, #0x0c           /* r2 = bytes left - 16 */
 491         stmdb   sp!, {r4, r5}
 492
 493 .Lmemmove_fsrcul1loop16:
 494 #ifdef __ARMEB__
 495         mov     r3, lr, lsl #8
 496 #else
 497         mov     r3, lr, lsr #8          /* Keep the 3 unused bytes of the old word */
 498 #endif
 499         ldmia   r1!, {r4, r5, r12, lr}  /* Next four aligned source words */
 500 #ifdef __ARMEB__
 501         orr     r3, r3, r4, lsr #24
 502         mov     r4, r4, lsl #8
 503         orr     r4, r4, r5, lsr #24
 504         mov     r5, r5, lsl #8
 505         orr     r5, r5, r12, lsr #24
 506         mov     r12, r12, lsl #8
 507         orr     r12, r12, lr, lsr #24
 508 #else
 509         orr     r3, r3, r4, lsl #24     /* Top each word up with 1 byte of the next */
 510         mov     r4, r4, lsr #8
 511         orr     r4, r4, r5, lsl #24
 512         mov     r5, r5, lsr #8
 513         orr     r5, r5, r12, lsl #24
 514         mov     r12, r12, lsr #8
 515         orr     r12, r12, lr, lsl #24
 516 #endif
 517         stmia   r0!, {r3-r5, r12}       /* 16 realigned bytes out */
 518         subs    r2, r2, #0x10
 519         bge     .Lmemmove_fsrcul1loop16
 520         ldmia   sp!, {r4, r5}
 521         adds    r2, r2, #0x0c           /* Back to r2 = bytes left - 4 */
 522         blt     .Lmemmove_fsrcul1l4
 523
 524 .Lmemmove_fsrcul1loop4:
 525 #ifdef __ARMEB__
 526         mov     r12, lr, lsl #8
 527 #else
 528         mov     r12, lr, lsr #8
 529 #endif
 530         ldr     lr, [r1], #4
 531 #ifdef __ARMEB__
 532         orr     r12, r12, lr, lsr #24
 533 #else
 534         orr     r12, r12, lr, lsl #24
 535 #endif
 536         str     r12, [r0], #4
 537         subs    r2, r2, #4
 538         bge     .Lmemmove_fsrcul1loop4
 539
 540 .Lmemmove_fsrcul1l4:
 541         sub     r1, r1, #3              /* r1 back to the next unread source byte */
 542         b       .Lmemmove_fl4
 543
 544 .Lmemmove_fsrcul2:
 545         cmp     r2, #0x0c               /* src offset 2; r2 = bytes left - 4 */
 546         blt     .Lmemmove_fsrcul2loop4
 547         sub     r2, r2, #0x0c           /* r2 = bytes left - 16 */
 548         stmdb   sp!, {r4, r5}
 549
 550 .Lmemmove_fsrcul2loop16:
 551 #ifdef __ARMEB__
 552         mov     r3, lr, lsl #16
 553 #else
 554         mov     r3, lr, lsr #16         /* Keep the 2 unused bytes of the old word */
 555 #endif
 556         ldmia   r1!, {r4, r5, r12, lr}
 557 #ifdef __ARMEB__
 558         orr     r3, r3, r4, lsr #16
 559         mov     r4, r4, lsl #16
 560         orr     r4, r4, r5, lsr #16
 561         mov     r5, r5, lsl #16
 562         orr     r5, r5, r12, lsr #16
 563         mov     r12, r12, lsl #16
 564         orr     r12, r12, lr, lsr #16
 565 #else
 566         orr     r3, r3, r4, lsl #16     /* Top each word up with 2 bytes of the next */
 567         mov     r4, r4, lsr #16
 568         orr     r4, r4, r5, lsl #16
 569         mov     r5, r5, lsr #16
 570         orr     r5, r5, r12, lsl #16
 571         mov     r12, r12, lsr #16
 572         orr     r12, r12, lr, lsl #16
 573 #endif
 574         stmia   r0!, {r3-r5, r12}
 575         subs    r2, r2, #0x10
 576         bge     .Lmemmove_fsrcul2loop16
 577         ldmia   sp!, {r4, r5}
 578         adds    r2, r2, #0x0c           /* Back to r2 = bytes left - 4 */
 579         blt     .Lmemmove_fsrcul2l4
 580
 581 .Lmemmove_fsrcul2loop4:
 582 #ifdef __ARMEB__
 583         mov     r12, lr, lsl #16
 584 #else
 585         mov     r12, lr, lsr #16
 586 #endif
 587         ldr     lr, [r1], #4
 588 #ifdef __ARMEB__
 589         orr     r12, r12, lr, lsr #16
 590 #else
 591         orr     r12, r12, lr, lsl #16
 592 #endif
 593         str     r12, [r0], #4
 594         subs    r2, r2, #4
 595         bge     .Lmemmove_fsrcul2loop4
 596
 597 .Lmemmove_fsrcul2l4:
 598         sub     r1, r1, #2              /* r1 back to the next unread source byte */
 599         b       .Lmemmove_fl4
 600
 601 .Lmemmove_fsrcul3:
 602         cmp     r2, #0x0c               /* src offset 3; r2 = bytes left - 4 */
 603         blt     .Lmemmove_fsrcul3loop4
 604         sub     r2, r2, #0x0c           /* r2 = bytes left - 16 */
 605         stmdb   sp!, {r4, r5}
 606
 607 .Lmemmove_fsrcul3loop16:
 608 #ifdef __ARMEB__
 609         mov     r3, lr, lsl #24
 610 #else
 611         mov     r3, lr, lsr #24         /* Keep the 1 unused byte of the old word */
 612 #endif
 613         ldmia   r1!, {r4, r5, r12, lr}
 614 #ifdef __ARMEB__
 615         orr     r3, r3, r4, lsr #8
 616         mov     r4, r4, lsl #24
 617         orr     r4, r4, r5, lsr #8
 618         mov     r5, r5, lsl #24
 619         orr     r5, r5, r12, lsr #8
 620         mov     r12, r12, lsl #24
 621         orr     r12, r12, lr, lsr #8
 622 #else
 623         orr     r3, r3, r4, lsl #8      /* Top each word up with 3 bytes of the next */
 624         mov     r4, r4, lsr #24
 625         orr     r4, r4, r5, lsl #8
 626         mov     r5, r5, lsr #24
 627         orr     r5, r5, r12, lsl #8
 628         mov     r12, r12, lsr #24
 629         orr     r12, r12, lr, lsl #8
 630 #endif
 631         stmia   r0!, {r3-r5, r12}
 632         subs    r2, r2, #0x10
 633         bge     .Lmemmove_fsrcul3loop16
 634         ldmia   sp!, {r4, r5}
 635         adds    r2, r2, #0x0c           /* Back to r2 = bytes left - 4 */
 636         blt     .Lmemmove_fsrcul3l4
 637
 638 .Lmemmove_fsrcul3loop4:
 639 #ifdef __ARMEB__
 640         mov     r12, lr, lsl #24
 641 #else
 642         mov     r12, lr, lsr #24
 643 #endif
 644         ldr     lr, [r1], #4
 645 #ifdef __ARMEB__
 646         orr     r12, r12, lr, lsr #8
 647 #else
 648         orr     r12, r12, lr, lsl #8
 649 #endif
 650         str     r12, [r0], #4
 651         subs    r2, r2, #4
 652         bge     .Lmemmove_fsrcul3loop4
 653
 654 .Lmemmove_fsrcul3l4:
 655         sub     r1, r1, #1              /* r1 back to the next unread source byte */
 656         b       .Lmemmove_fl4
 657
 658 .Lmemmove_backwards:                    /* Copy downwards from the end of both buffers */
 659         add     r1, r1, r2              /* r1 = one past the last source byte */
 660         add     r0, r0, r2              /* r0 = one past the last dest byte */
 661         subs    r2, r2, #4              /* r2 = bytes left - 4 from here on */
 662         blt     .Lmemmove_bl4           /* less than 4 bytes */
 663         ands    r12, r0, #3
 664         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
 665         ands    r12, r1, #3
 666         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
 667
 668 .Lmemmove_bt8:
 669         /* We have aligned source and destination */
 670         subs    r2, r2, #8              /* r2 = bytes left - 12 */
 671         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
 672         stmdb   sp!, {r4, lr}           /* r4 and lr are used as copy registers below */
 673         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
 674         blt     .Lmemmove_bl32
 675
 676         /* blat 32 bytes at a time */
 677         /* XXX for really big copies perhaps we should use more registers */
 678 .Lmemmove_bloop32:
 679         ldmdb   r1!, {r3, r4, r12, lr}
 680         stmdb   r0!, {r3, r4, r12, lr}
 681         ldmdb   r1!, {r3, r4, r12, lr}
 682         stmdb   r0!, {r3, r4, r12, lr}
 683         subs    r2, r2, #0x20
 684         bge     .Lmemmove_bloop32
 685
 686 .Lmemmove_bl32:
 687         cmn     r2, #0x10               /* ge: at least 16 bytes are still left */
 688         ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 689         stmgedb r0!, {r3, r4, r12, lr}
 690         subge   r2, r2, #0x10
 691         adds    r2, r2, #0x14           /* r2 = bytes left - 12 */
 692         ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
 693         stmgedb r0!, {r3, r12, lr}
 694         subge   r2, r2, #0x0c
 695         ldmia   sp!, {r4, lr}           /* Give r4 and lr back */
 696
 697 .Lmemmove_bl12:
 698         adds    r2, r2, #8              /* r2 = bytes left - 4 */
 699         blt     .Lmemmove_bl4
 700         subs    r2, r2, #4              /* lt: 4..7 bytes left, ge: 8..11 left */
 701         ldrlt   r3, [r1, #-4]!          /* Copy one word ... */
 702         strlt   r3, [r0, #-4]!
 703         ldmgedb r1!, {r3, r12}          /* ... or two */
 704         stmgedb r0!, {r3, r12}
 705         subge   r2, r2, #4              /* Either way r2 = bytes left - 4 again */
 706
 707 .Lmemmove_bl4:
 708         /* less than 4 bytes to go */
 709         adds    r2, r2, #4              /* r2 = bytes left (0..3) */
 710         RETeq                   /* done; r0 has walked back down to dest */
 711
 712         /* copy the crud byte at a time */
 713         cmp     r2, #2                  /* ge: two or more left, gt: three left */
 714         ldrb    r3, [r1, #-1]!
 715         strb    r3, [r0, #-1]!
 716         ldrgeb  r3, [r1, #-1]!
 717         strgeb  r3, [r0, #-1]!
 718         ldrgtb  r3, [r1, #-1]!
 719         strgtb  r3, [r0, #-1]!
 720         RET                             /* r0 has walked back down to dest */
 721
 722         /* erg - unaligned destination; r12 = (dst end) & 3 = bytes to copy (1..3) */
 723 .Lmemmove_bdestul:
 724         cmp     r12, #2                 /* ge: copy two or more, gt: copy three */
 725
 726         /* align destination with byte copies */
 727         ldrb    r3, [r1, #-1]!
 728         strb    r3, [r0, #-1]!
 729         ldrgeb  r3, [r1, #-1]!
 730         strgeb  r3, [r0, #-1]!
 731         ldrgtb  r3, [r1, #-1]!
 732         strgtb  r3, [r0, #-1]!
 733         subs    r2, r2, r12             /* Account for the bytes just copied */
 734         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
 735         ands    r12, r1, #3             /* Falls into the unaligned source code if nonzero */
 736         beq     .Lmemmove_bt8           /* we have an aligned source */
 737
 738         /* erg - unaligned source */
 739         /* This is where it gets nasty ... */
 740 .Lmemmove_bsrcul:
 741         bic     r1, r1, #3
 742         ldr     r3, [r1, #0]
 743         cmp     r12, #2
 744         blt     .Lmemmove_bsrcul1
 745         beq     .Lmemmove_bsrcul2
 746         cmp     r2, #0x0c
 747         blt     .Lmemmove_bsrcul3loop4
 748         sub     r2, r2, #0x0c
 749         stmdb   sp!, {r4, r5, lr}
 750
 751 .Lmemmove_bsrcul3loop16:
 752 #ifdef __ARMEB__
 753         mov     lr, r3, lsr #8
 754 #else
 755         mov     lr, r3, lsl #8
 756 #endif
 757         ldmdb   r1!, {r3-r5, r12}
 758 #ifdef __ARMEB__
 759         orr     lr, lr, r12, lsl #24
 760         mov     r12, r12, lsr #8
 761         orr     r12, r12, r5, lsl #24
 762         mov     r5, r5, lsr #8
 763         orr     r5, r5, r4, lsl #24
 764         mov     r4, r4, lsr #8
 765         orr     r4, r4, r3, lsl #24
 766 #else
 767         orr     lr, lr, r12, lsr #24
 768         mov     r12, r12, lsl #8
 769         orr     r12, r12, r5, lsr #24
 770         mov     r5, r5, lsl #8
 771         orr     r5, r5, r4, lsr #24
 772         mov     r4, r4, lsl #8
 773         orr     r4, r4, r3, lsr #24
 774 #endif
 775         stmdb   r0!, {r4, r5, r12, lr}
 776         subs    r2, r2, #0x10
 777         bge     .Lmemmove_bsrcul3loop16
 778         ldmia   sp!, {r4, r5, lr}
 779         adds    r2, r2, #0x0c
 780         blt     .Lmemmove_bsrcul3l4
 781
 782 .Lmemmove_bsrcul3loop4:
 783 #ifdef __ARMEB__
 784         mov     r12, r3, lsr #8
 785 #else
 786         mov     r12, r3, lsl #8
 787 #endif
 788         ldr     r3, [r1, #-4]!
 789 #ifdef __ARMEB__
 790         orr     r12, r12, r3, lsl #24
 791 #else
 792         orr     r12, r12, r3, lsr #24
 793 #endif
 794         str     r12, [r0, #-4]!
 795         subs    r2, r2, #4
 796         bge     .Lmemmove_bsrcul3loop4
 797
 798 .Lmemmove_bsrcul3l4:
 799         add     r1, r1, #3
 800         b       .Lmemmove_bl4
 801
 802 .Lmemmove_bsrcul2:
             /*
              * Descending (high-to-low address) word copy that splices 16-bit
              * halves: every stored word is one half of the previously loaded
              * source word (kept in r3) joined with one half of the next lower
              * source word.  r1 (source) and r0 (destination) are
              * pre-decremented; r2 is the loop counter.
              * NOTE(review): r3, the word-aligned r1 and the bias of r2 are
              * set up before this label, outside the visible chunk.  The
              * loops store a word and only then test "r2 - 4 >= 0", so r2
              * looks like "bytes left - 4" -- confirm against the entry code.
              */
 803         cmp     r2, #0x0c               /* not enough for one 16-byte pass? */
 804         blt     .Lmemmove_bsrcul2loop4
 805         sub     r2, r2, #0x0c           /* bias r2 so "bge" below means 16 more bytes */
 806         stmdb   sp!, {r4, r5, lr}       /* extra scratch for the unrolled loop */
 807
 808 .Lmemmove_bsrcul2loop16:
             /* lr starts as the half of the carried word that is still unused */
 809 #ifdef __ARMEB__
 810         mov     lr, r3, lsr #16
 811 #else
 812         mov     lr, r3, lsl #16
 813 #endif
 814         ldmdb   r1!, {r3-r5, r12}       /* four lower source words; r3 becomes the new carry */
 815 #ifdef __ARMEB__
 816         orr     lr, lr, r12, lsl #16
 817         mov     r12, r12, lsr #16
 818         orr     r12, r12, r5, lsl #16
 819         mov     r5, r5, lsr #16
 820         orr     r5, r5, r4, lsl #16
 821         mov     r4, r4, lsr #16
 822         orr     r4, r4, r3, lsl #16
 823 #else
 824         orr     lr, lr, r12, lsr #16
 825         mov     r12, r12, lsl #16
 826         orr     r12, r12, r5, lsr #16
 827         mov     r5, r5, lsl #16
 828         orr     r5, r5, r4, lsr #16
 829         mov     r4, r4, lsl #16
 830         orr     r4, r4, r3, lsr #16
 831 #endif
 832         stmdb   r0!, {r4, r5, r12, lr}  /* 16 bytes; lr lands at the highest address */
 833         subs    r2, r2, #0x10
 834         bge     .Lmemmove_bsrcul2loop16
 835         ldmia   sp!, {r4, r5, lr}       /* restore the registers saved above */
 836         adds    r2, r2, #0x0c           /* undo the bias applied before the loop */
 837         blt     .Lmemmove_bsrcul2l4     /* no whole word left */
 838
 839 .Lmemmove_bsrcul2loop4:
             /* Same splice, one word per pass */
 840 #ifdef __ARMEB__
 841         mov     r12, r3, lsr #16
 842 #else
 843         mov     r12, r3, lsl #16
 844 #endif
 845         ldr     r3, [r1, #-4]!          /* next lower source word becomes the carry */
 846 #ifdef __ARMEB__
 847         orr     r12, r12, r3, lsl #16
 848 #else
 849         orr     r12, r12, r3, lsr #16
 850 #endif
 851         str     r12, [r0, #-4]!
 852         subs    r2, r2, #4
 853         bge     .Lmemmove_bsrcul2loop4
 854
 855 .Lmemmove_bsrcul2l4:
             /*
              * Two bytes of the carried word are still unconsumed, so step r1
              * up by 2 before the shared tail (presumably a byte-wise copy --
              * .Lmemmove_bl4 is outside this view).
              */
 856         add     r1, r1, #2
 857         b       .Lmemmove_bl4
 858
 859 .Lmemmove_bsrcul1:
             /*
              * Descending word copy that splices one byte of the previously
              * loaded source word (kept in r3) with three bytes of the next
              * lower source word (24-bit / 8-bit shifts).  r1 and r0 are
              * pre-decremented; r2 is the loop counter.
              * NOTE(review): r3, the word-aligned r1 and the bias of r2 are
              * set up before this label, outside the visible chunk; r2 looks
              * like "bytes left - 4" -- confirm against the entry code.
              */
 860         cmp     r2, #0x0c               /* not enough for one 16-byte pass? */
 861         blt     .Lmemmove_bsrcul1loop4
 862         sub     r2, r2, #0x0c           /* bias r2 so "bge" below means 16 more bytes */
 863         stmdb   sp!, {r4, r5, lr}       /* extra scratch for the unrolled loop */
 864
 865 .Lmemmove_bsrcul1loop32:
             /* Despite the label name, each pass moves 16 bytes (four words) */
 866 #ifdef __ARMEB__
 867         mov     lr, r3, lsr #24
 868 #else
 869         mov     lr, r3, lsl #24
 870 #endif
 871         ldmdb   r1!, {r3-r5, r12}       /* four lower source words; r3 becomes the new carry */
 872 #ifdef __ARMEB__
 873         orr     lr, lr, r12, lsl #8
 874         mov     r12, r12, lsr #24
 875         orr     r12, r12, r5, lsl #8
 876         mov     r5, r5, lsr #24
 877         orr     r5, r5, r4, lsl #8
 878         mov     r4, r4, lsr #24
 879         orr     r4, r4, r3, lsl #8
 880 #else
 881         orr     lr, lr, r12, lsr #8
 882         mov     r12, r12, lsl #24
 883         orr     r12, r12, r5, lsr #8
 884         mov     r5, r5, lsl #24
 885         orr     r5, r5, r4, lsr #8
 886         mov     r4, r4, lsl #24
 887         orr     r4, r4, r3, lsr #8
 888 #endif
 889         stmdb   r0!, {r4, r5, r12, lr}  /* 16 bytes; lr lands at the highest address */
 890         subs    r2, r2, #0x10
 891         bge     .Lmemmove_bsrcul1loop32
 892         ldmia   sp!, {r4, r5, lr}       /* restore the registers saved above */
 893         adds    r2, r2, #0x0c           /* undo the bias applied before the loop */
 894         blt     .Lmemmove_bsrcul1l4     /* no whole word left */
 895
 896 .Lmemmove_bsrcul1loop4:
             /* Same splice, one word per pass */
 897 #ifdef __ARMEB__
 898         mov     r12, r3, lsr #24
 899 #else
 900         mov     r12, r3, lsl #24
 901 #endif
 902         ldr     r3, [r1, #-4]!          /* next lower source word becomes the carry */
 903 #ifdef __ARMEB__
 904         orr     r12, r12, r3, lsl #8
 905 #else
 906         orr     r12, r12, r3, lsr #8
 907 #endif
 908         str     r12, [r0, #-4]!
 909         subs    r2, r2, #4
 910         bge     .Lmemmove_bsrcul1loop4
 911
 912 .Lmemmove_bsrcul1l4:
             /*
              * One byte of the carried word is still unconsumed, so step r1
              * up by 1 before the shared tail (presumably a byte-wise copy --
              * .Lmemmove_bl4 is outside this view).
              */
 913         add     r1, r1, #1
 914         b       .Lmemmove_bl4
 915
 916 #if !defined(__XSCALE__)
 917 ENTRY(memcpy)
             /*
              * void *memcpy(void *dst, const void *src, size_t len)
              * (generic, non-XScale implementation)
              *
              * In:      r0 = dst, r1 = src, r2 = len
              * Out:     r0 = dst (reloaded from the stack on every exit)
              * Scratch: r3, r12, lr; r4 is borrowed and restored around the
              *          32-byte loop.
              *
              * r2 is kept as "bytes remaining minus a bias" (4, 12 or 32
              * depending on the stage) so that each blt/bge directly answers
              * "is there room for another chunk of this size?".
              */
 918         /* save leaf functions having to store this away */
 919         stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
 920
 921         subs    r2, r2, #4              /* r2 = len - 4 */
 922         blt     .Lmemcpy_l4             /* less than 4 bytes */
 923         ands    r12, r0, #3             /* r12 = dst & 3, consumed by .Lmemcpy_destul */
 924         bne     .Lmemcpy_destul         /* oh unaligned destination addr */
 925         ands    r12, r1, #3             /* r12 = src & 3, consumed by .Lmemcpy_srcul */
 926         bne     .Lmemcpy_srcul          /* oh unaligned source addr */
 927
 928 .Lmemcpy_t8:
 929         /* We have aligned source and destination */
 930         subs    r2, r2, #8              /* r2 = remaining - 12 */
 931         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
 932         subs    r2, r2, #0x14           /* r2 = remaining - 32 */
 933         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
 934         stmdb   sp!, {r4}               /* borrow r4 */
 935
 936         /* blat 32 bytes at a time */
 937         /* XXX for really big copies perhaps we should use more registers */
 938 .Lmemcpy_loop32:
 939         ldmia   r1!, {r3, r4, r12, lr}
 940         stmia   r0!, {r3, r4, r12, lr}
 941         ldmia   r1!, {r3, r4, r12, lr}
 942         stmia   r0!, {r3, r4, r12, lr}
 943         subs    r2, r2, #0x20
 944         bge     .Lmemcpy_loop32
 945
 946         cmn     r2, #0x10               /* r2 + 16 >= 0: at least 16 bytes remain */
 947         ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
 948         stmgeia r0!, {r3, r4, r12, lr}
 949         subge   r2, r2, #0x10
 950         ldmia   sp!, {r4}               /* return r4 */
 951
 952 .Lmemcpy_l32:
 953         adds    r2, r2, #0x14           /* r2 = remaining - 12; flags drive the loop below */
 954
 955         /* blat 12 bytes at a time */
 956 .Lmemcpy_loop12:
 957         ldmgeia r1!, {r3, r12, lr}
 958         stmgeia r0!, {r3, r12, lr}
 959         subges  r2, r2, #0x0c           /* flags only updated when a chunk was copied */
 960         bge     .Lmemcpy_loop12
 961
 962 .Lmemcpy_l12:
 963         adds    r2, r2, #8              /* r2 = remaining - 4 */
 964         blt     .Lmemcpy_l4
 965
 966         subs    r2, r2, #4              /* lt: 4-7 bytes left (one word), ge: 8-11 (two words) */
 967         ldrlt   r3, [r1], #4
 968         strlt   r3, [r0], #4
 969         ldmgeia r1!, {r3, r12}
 970         stmgeia r0!, {r3, r12}
 971         subge   r2, r2, #4              /* keep r2 = remaining - 4 after the 8-byte case */
 972
 973 .Lmemcpy_l4:
 974         /* less than 4 bytes to go */
 975         adds    r2, r2, #4              /* r2 = bytes left (0-3) */
             /*
              * NOTE(review): the macro below is spelled with a single trailing
              * underscore; if the toolchain's 26-bit macro is __APCS_26__ the
              * first branch is never assembled -- confirm.
              */
 976 #ifdef __APCS_26_
 977         ldmeqia sp!, {r0, pc}^          /* done */
 978 #else
 979         ldmeqia sp!, {r0, pc}           /* done */
 980 #endif
 981         /* copy the crud byte at a time */
 982         cmp     r2, #2                  /* 1 byte: lt, 2 bytes: eq, 3 bytes: gt */
 983         ldrb    r3, [r1], #1
 984         strb    r3, [r0], #1
 985         ldrgeb  r3, [r1], #1
 986         strgeb  r3, [r0], #1
 987         ldrgtb  r3, [r1], #1
 988         strgtb  r3, [r0], #1
 989         ldmia   sp!, {r0, pc}           /* restore dst as the return value and return */
 990
 991         /* erg - unaligned destination */
 992 .Lmemcpy_destul:
             /*
              * Destination is not word aligned.  On entry r12 = dst & 3 (1-3)
              * and r2 = len - 4 >= 0, so at least 4 bytes are available and
              * the up-to-3 alignment bytes copied here cannot overrun.
              */
 993         rsb     r12, r12, #4            /* r12 = bytes needed to reach alignment (1-3) */
 994         cmp     r12, #2                 /* 1: lt, 2: eq, 3: gt -- selects the byte copies */
 995
 996         /* align destination with byte copies */
 997         ldrb    r3, [r1], #1
 998         strb    r3, [r0], #1
 999         ldrgeb  r3, [r1], #1
1000         strgeb  r3, [r0], #1
1001         ldrgtb  r3, [r1], #1
1002         strgtb  r3, [r0], #1
1003         subs    r2, r2, r12             /* r2 = remaining - 4 again */
1004         blt     .Lmemcpy_l4             /* less than 4 bytes */
1005
1006         ands    r12, r1, #3             /* r12 = src & 3 for the unaligned-source path */
1007         beq     .Lmemcpy_t8             /* we have an aligned source */
1008
1009         /* erg - unaligned source */
1010         /* This is where it gets nasty ... */
1011 .Lmemcpy_srcul:
             /*
              * Destination is word aligned, source is not.
              * On entry: r12 = src & 3 (1-3), r2 = remaining - 4 (>= 0).
              *
              * The source pointer is rounded down to a word boundary and
              * whole words are loaded; every destination word is spliced
              * from the unused bytes of the previous source word (carried in
              * lr) and the leading bytes of the next one.  Which end of a
              * register holds the lower-addressed bytes depends on the byte
              * order, so the shift directions are mirrored for __ARMEB__
              * (big-endian), matching the memmove and XScale memcpy code in
              * this file.
              *
              * Each variant finishes by moving r1 back over the bytes of lr
              * that have not been consumed and handing the last 0-3 bytes to
              * .Lmemcpy_l4.
              */
1012         bic     r1, r1, #3              /* word-align the source pointer */
1013         ldr     lr, [r1], #4            /* first (partially wanted) source word */
1014         cmp     r12, #2
1015         bgt     .Lmemcpy_srcul3         /* src & 3 == 3 */
1016         beq     .Lmemcpy_srcul2         /* src & 3 == 2 */
             /* src & 3 == 1: three bytes of each carried word are used */
1017         cmp     r2, #0x0c
1018         blt     .Lmemcpy_srcul1loop4    /* fewer than 16 bytes left */
1019         sub     r2, r2, #0x0c           /* bias r2 so "bge" means 16 more bytes */
1020         stmdb   sp!, {r4, r5}
1021
1022 .Lmemcpy_srcul1loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #8
     #else
1023         mov     r3, lr, lsr #8
     #endif
1024         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #24
             mov     r4, r4, lsl #8
             orr     r4, r4, r5, lsr #24
             mov     r5, r5, lsl #8
             orr     r5, r5, r12, lsr #24
             mov     r12, r12, lsl #8
             orr     r12, r12, lr, lsr #24
     #else
1025         orr     r3, r3, r4, lsl #24
1026         mov     r4, r4, lsr #8
1027         orr     r4, r4, r5, lsl #24
1028         mov     r5, r5, lsr #8
1029         orr     r5, r5, r12, lsl #24
1030         mov     r12, r12, lsr #8
1031         orr     r12, r12, lr, lsl #24
     #endif
1032         stmia   r0!, {r3-r5, r12}
1033         subs    r2, r2, #0x10
1034         bge     .Lmemcpy_srcul1loop16
1035         ldmia   sp!, {r4, r5}
1036         adds    r2, r2, #0x0c           /* undo the bias */
1037         blt     .Lmemcpy_srcul1l4       /* no whole word left */
1038
1039 .Lmemcpy_srcul1loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #8
     #else
1040         mov     r12, lr, lsr #8
     #endif
1041         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #24
     #else
1042         orr     r12, r12, lr, lsl #24
     #endif
1043         str     r12, [r0], #4
1044         subs    r2, r2, #4
1045         bge     .Lmemcpy_srcul1loop4
1046
1047 .Lmemcpy_srcul1l4:
1048         sub     r1, r1, #3              /* three bytes of lr are still unread */
1049         b       .Lmemcpy_l4
1050
1051 .Lmemcpy_srcul2:
             /* src & 3 == 2: two bytes of each carried word are used */
1052         cmp     r2, #0x0c
1053         blt     .Lmemcpy_srcul2loop4    /* fewer than 16 bytes left */
1054         sub     r2, r2, #0x0c           /* bias r2 so "bge" means 16 more bytes */
1055         stmdb   sp!, {r4, r5}
1056
1057 .Lmemcpy_srcul2loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #16
     #else
1058         mov     r3, lr, lsr #16
     #endif
1059         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #16
             mov     r4, r4, lsl #16
             orr     r4, r4, r5, lsr #16
             mov     r5, r5, lsl #16
             orr     r5, r5, r12, lsr #16
             mov     r12, r12, lsl #16
             orr     r12, r12, lr, lsr #16
     #else
1060         orr     r3, r3, r4, lsl #16
1061         mov     r4, r4, lsr #16
1062         orr     r4, r4, r5, lsl #16
1063         mov     r5, r5, lsr #16
1064         orr     r5, r5, r12, lsl #16
1065         mov     r12, r12, lsr #16
1066         orr     r12, r12, lr, lsl #16
     #endif
1067         stmia   r0!, {r3-r5, r12}
1068         subs    r2, r2, #0x10
1069         bge     .Lmemcpy_srcul2loop16
1070         ldmia   sp!, {r4, r5}
1071         adds    r2, r2, #0x0c           /* undo the bias */
1072         blt     .Lmemcpy_srcul2l4       /* no whole word left */
1073
1074 .Lmemcpy_srcul2loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #16
     #else
1075         mov     r12, lr, lsr #16
     #endif
1076         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #16
     #else
1077         orr     r12, r12, lr, lsl #16
     #endif
1078         str     r12, [r0], #4
1079         subs    r2, r2, #4
1080         bge     .Lmemcpy_srcul2loop4
1081
1082 .Lmemcpy_srcul2l4:
1083         sub     r1, r1, #2              /* two bytes of lr are still unread */
1084         b       .Lmemcpy_l4
1085
1086 .Lmemcpy_srcul3:
             /* src & 3 == 3: one byte of each carried word is used */
1087         cmp     r2, #0x0c
1088         blt     .Lmemcpy_srcul3loop4    /* fewer than 16 bytes left */
1089         sub     r2, r2, #0x0c           /* bias r2 so "bge" means 16 more bytes */
1090         stmdb   sp!, {r4, r5}
1091
1092 .Lmemcpy_srcul3loop16:
     #ifdef __ARMEB__
             mov     r3, lr, lsl #24
     #else
1093         mov     r3, lr, lsr #24
     #endif
1094         ldmia   r1!, {r4, r5, r12, lr}
     #ifdef __ARMEB__
             orr     r3, r3, r4, lsr #8
             mov     r4, r4, lsl #24
             orr     r4, r4, r5, lsr #8
             mov     r5, r5, lsl #24
             orr     r5, r5, r12, lsr #8
             mov     r12, r12, lsl #24
             orr     r12, r12, lr, lsr #8
     #else
1095         orr     r3, r3, r4, lsl #8
1096         mov     r4, r4, lsr #24
1097         orr     r4, r4, r5, lsl #8
1098         mov     r5, r5, lsr #24
1099         orr     r5, r5, r12, lsl #8
1100         mov     r12, r12, lsr #24
1101         orr     r12, r12, lr, lsl #8
     #endif
1102         stmia   r0!, {r3-r5, r12}
1103         subs    r2, r2, #0x10
1104         bge     .Lmemcpy_srcul3loop16
1105         ldmia   sp!, {r4, r5}
1106         adds    r2, r2, #0x0c           /* undo the bias */
1107         blt     .Lmemcpy_srcul3l4       /* no whole word left */
1108
1109 .Lmemcpy_srcul3loop4:
     #ifdef __ARMEB__
             mov     r12, lr, lsl #24
     #else
1110         mov     r12, lr, lsr #24
     #endif
1111         ldr     lr, [r1], #4
     #ifdef __ARMEB__
             orr     r12, r12, lr, lsr #8
     #else
1112         orr     r12, r12, lr, lsl #8
     #endif
1113         str     r12, [r0], #4
1114         subs    r2, r2, #4
1115         bge     .Lmemcpy_srcul3loop4
1116
1117 .Lmemcpy_srcul3l4:
1118         sub     r1, r1, #1              /* one byte of lr is still unread */
1119         b       .Lmemcpy_l4
1120 #else
1121 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1122 ENTRY(memcpy)
             /*
              * XScale variant.  In: r0 = dst, r1 = src, r2 = len.
              * r0 is never written on the paths below (r3 is the working
              * destination pointer), so it still holds dst when RET is reached.
              * Copies of 12 bytes or fewer are dispatched to .Lmemcpy_short;
              * everything else first brings the destination to a word boundary.
              */
1123         pld     [r1]
1124         cmp     r2, #0x0c
1125         ble     .Lmemcpy_short          /* <= 12 bytes */
1126         mov     r3, r0                  /* We must not clobber r0 */
1127
1128         /* Word-align the destination buffer */
1129         ands    ip, r3, #0x03           /* Already word aligned? */
1130         beq     .Lmemcpy_wordaligned    /* Yup */
             /*
              * ip = dst & 3.  The flags from this compare survive the byte
              * copies (ldrb/strb/sub do not touch them):
              *   ip == 3 (gt): 1 byte,  ip == 2 (eq): 2 bytes,  ip == 1 (lt): 3 bytes
              */
1131         cmp     ip, #0x02
1132         ldrb    ip, [r1], #0x01
1133         sub     r2, r2, #0x01
1134         strb    ip, [r3], #0x01
1135         ldrleb  ip, [r1], #0x01
1136         suble   r2, r2, #0x01
1137         strleb  ip, [r3], #0x01
1138         ldrltb  ip, [r1], #0x01
1139         sublt   r2, r2, #0x01
1140         strltb  ip, [r3], #0x01
1141
1142         /* Destination buffer is now word aligned */
1143 .Lmemcpy_wordaligned:
             /* r3 = word-aligned destination, r1 = source, r2 = bytes remaining */
1144         ands    ip, r1, #0x03           /* Is src also word-aligned? */
1145         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
1146
1147         /* Quad-align the destination buffer ("quad" = 8 bytes, as strd needs) */
1148         tst     r3, #0x07               /* Already quad aligned? */
1149         ldrne   ip, [r1], #0x04         /* if not, move one word to get there */
1150         stmfd   sp!, {r4-r9}            /* Free up some registers */
1151         subne   r2, r2, #0x04
1152         strne   ip, [r3], #0x04
1153
1154         /* Destination buffer quad aligned, source is at least word aligned */
1155         subs    r2, r2, #0x80           /* r2 = remaining - 128 */
1156         blt     .Lmemcpy_w_lessthan128
1157
1158         /* Copy 128 bytes at a time */
1159 .Lmemcpy_w_loop128:
             /*
              * One pass moves 128 bytes.  r4/r5, r6/r7 and r8/r9 rotate as
              * three register pairs: word loads run ahead while each strd
              * stores an already-loaded pair (the LD:/ST: notes give the byte
              * offsets within the 128-byte chunk).  The pld instructions
              * touch the source 0x18 bytes ahead of r1 at four points in the
              * pass.  r2 = remaining - 128 on entry.
              */
1160         ldr     r4, [r1], #0x04         /* LD:00-03 */
1161         ldr     r5, [r1], #0x04         /* LD:04-07 */
1162         pld     [r1, #0x18]             /* Prefetch 0x20 */
1163         ldr     r6, [r1], #0x04         /* LD:08-0b */
1164         ldr     r7, [r1], #0x04         /* LD:0c-0f */
1165         ldr     r8, [r1], #0x04         /* LD:10-13 */
1166         ldr     r9, [r1], #0x04         /* LD:14-17 */
1167         strd    r4, [r3], #0x08         /* ST:00-07 */
1168         ldr     r4, [r1], #0x04         /* LD:18-1b */
1169         ldr     r5, [r1], #0x04         /* LD:1c-1f */
1170         strd    r6, [r3], #0x08         /* ST:08-0f */
1171         ldr     r6, [r1], #0x04         /* LD:20-23 */
1172         ldr     r7, [r1], #0x04         /* LD:24-27 */
1173         pld     [r1, #0x18]             /* Prefetch 0x40 */
1174         strd    r8, [r3], #0x08         /* ST:10-17 */
1175         ldr     r8, [r1], #0x04         /* LD:28-2b */
1176         ldr     r9, [r1], #0x04         /* LD:2c-2f */
1177         strd    r4, [r3], #0x08         /* ST:18-1f */
1178         ldr     r4, [r1], #0x04         /* LD:30-33 */
1179         ldr     r5, [r1], #0x04         /* LD:34-37 */
1180         strd    r6, [r3], #0x08         /* ST:20-27 */
1181         ldr     r6, [r1], #0x04         /* LD:38-3b */
1182         ldr     r7, [r1], #0x04         /* LD:3c-3f */
1183         strd    r8, [r3], #0x08         /* ST:28-2f */
1184         ldr     r8, [r1], #0x04         /* LD:40-43 */
1185         ldr     r9, [r1], #0x04         /* LD:44-47 */
1186         pld     [r1, #0x18]             /* Prefetch 0x60 */
1187         strd    r4, [r3], #0x08         /* ST:30-37 */
1188         ldr     r4, [r1], #0x04         /* LD:48-4b */
1189         ldr     r5, [r1], #0x04         /* LD:4c-4f */
1190         strd    r6, [r3], #0x08         /* ST:38-3f */
1191         ldr     r6, [r1], #0x04         /* LD:50-53 */
1192         ldr     r7, [r1], #0x04         /* LD:54-57 */
1193         strd    r8, [r3], #0x08         /* ST:40-47 */
1194         ldr     r8, [r1], #0x04         /* LD:58-5b */
1195         ldr     r9, [r1], #0x04         /* LD:5c-5f */
1196         strd    r4, [r3], #0x08         /* ST:48-4f */
1197         ldr     r4, [r1], #0x04         /* LD:60-63 */
1198         ldr     r5, [r1], #0x04         /* LD:64-67 */
1199         pld     [r1, #0x18]             /* Prefetch 0x80 */
1200         strd    r6, [r3], #0x08         /* ST:50-57 */
1201         ldr     r6, [r1], #0x04         /* LD:68-6b */
1202         ldr     r7, [r1], #0x04         /* LD:6c-6f */
1203         strd    r8, [r3], #0x08         /* ST:58-5f */
1204         ldr     r8, [r1], #0x04         /* LD:70-73 */
1205         ldr     r9, [r1], #0x04         /* LD:74-77 */
1206         strd    r4, [r3], #0x08         /* ST:60-67 */
1207         ldr     r4, [r1], #0x04         /* LD:78-7b */
1208         ldr     r5, [r1], #0x04         /* LD:7c-7f */
1209         strd    r6, [r3], #0x08         /* ST:68-6f */
1210         strd    r8, [r3], #0x08         /* ST:70-77 */
1211         subs    r2, r2, #0x80           /* flags for the bge; the strd below leaves them alone */
1212         strd    r4, [r3], #0x08         /* ST:78-7f */
1213         bge     .Lmemcpy_w_loop128
1214
1215 .Lmemcpy_w_lessthan128:
             /* Fewer than 128 bytes remain; r4-r9 are still saved on the stack */
1216         adds    r2, r2, #0x80           /* Adjust for extra sub */
1217         ldmeqfd sp!, {r4-r9}
1218         RETeq                   /* Return now if done */
1219         subs    r2, r2, #0x20           /* r2 = remaining - 32 */
1220         blt     .Lmemcpy_w_lessthan32
1221
1222         /* Copy 32 bytes at a time */
1223 .Lmemcpy_w_loop32:
             /* Same load-ahead / strd pattern as the 128-byte loop, one quarter the size */
1224         ldr     r4, [r1], #0x04
1225         ldr     r5, [r1], #0x04
1226         pld     [r1, #0x18]
1227         ldr     r6, [r1], #0x04
1228         ldr     r7, [r1], #0x04
1229         ldr     r8, [r1], #0x04
1230         ldr     r9, [r1], #0x04
1231         strd    r4, [r3], #0x08
1232         ldr     r4, [r1], #0x04
1233         ldr     r5, [r1], #0x04
1234         strd    r6, [r3], #0x08
1235         strd    r8, [r3], #0x08
1236         subs    r2, r2, #0x20           /* flags for the bge; the strd below leaves them alone */
1237         strd    r4, [r3], #0x08
1238         bge     .Lmemcpy_w_loop32
1239
1240 .Lmemcpy_w_lessthan32:
1241         adds    r2, r2, #0x20           /* Adjust for extra sub */
1242         ldmeqfd sp!, {r4-r9}
1243         RETeq                   /* Return now if done */
1244
             /*
              * Computed jump into the three 8-byte steps below.  Each step is
              * four instructions (16 bytes of code).  r4 = 0x18 - (r2 & 0x18)
              * is 8 times the number of steps to skip, so (r4 << 1) is the
              * code offset.  pc reads as the addne + 8, i.e. the first step
              * after the nop.  When r4 == 0 (24 or more bytes left) the add is
              * skipped and execution falls through the nop into the first
              * step.  When no step is needed the jump lands on the ldmfd
              * with the "ne" flags from the rsbs still in force, so the RETeq
              * after it is not taken.
              */
1245         and     r4, r2, #0x18
1246         rsbs    r4, r4, #0x18
1247         addne   pc, pc, r4, lsl #1
1248         nop
1249
1250         /* At least 24 bytes remaining */
1251         ldr     r4, [r1], #0x04
1252         ldr     r5, [r1], #0x04
1253         sub     r2, r2, #0x08
1254         strd    r4, [r3], #0x08
1255
1256         /* At least 16 bytes remaining */
1257         ldr     r4, [r1], #0x04
1258         ldr     r5, [r1], #0x04
1259         sub     r2, r2, #0x08
1260         strd    r4, [r3], #0x08
1261
1262         /* At least 8 bytes remaining */
1263         ldr     r4, [r1], #0x04
1264         ldr     r5, [r1], #0x04
1265         subs    r2, r2, #0x08           /* sets Z when nothing is left after this step */
1266         strd    r4, [r3], #0x08
1267
1268         /* Less than 8 bytes remaining */
1269         ldmfd   sp!, {r4-r9}
1270         RETeq                   /* Return now if done */
1271         subs    r2, r2, #0x04           /* ge: a whole word is left */
1272         ldrge   ip, [r1], #0x04
1273         strge   ip, [r3], #0x04
1274         RETeq                   /* Return now if done */
1275         addlt   r2, r2, #0x04           /* lt: no word copied, restore the 1-3 byte count */
             /* 1-3 trailing bytes: always one, a second if r2 >= 2, a third if r2 > 2 */
1276         ldrb    ip, [r1], #0x01
1277         cmp     r2, #0x02
1278         ldrgeb  r2, [r1], #0x01         /* r2 is reused as a data register from here on */
1279         strb    ip, [r3], #0x01
1280         ldrgtb  ip, [r1]
1281         strgeb  r2, [r3], #0x01
1282         strgtb  ip, [r3]
1283         RET
1284
1285
1286 /*
1287  * At this point, it has not been possible to word align both buffers.
1288  * The destination buffer is word aligned, but the source buffer is not.
1289  */
1290 .Lmemcpy_bad_align:
             /*
              * On entry ip = src & 3 (1-3), r3 = word-aligned destination and
              * r2 = bytes remaining.  The source pointer is rounded down to a
              * word boundary and its first word is loaded into ip; the
              * bad1/bad2/bad3 paths then splice destination words from pairs
              * of consecutive source words.
              */
1291         stmfd   sp!, {r4-r7}
1292         bic     r1, r1, #0x03
1293         cmp     ip, #2                  /* classify the misalignment ... */
1294         ldr     ip, [r1], #0x04         /* ... ldr leaves the flags intact */
1295         bgt     .Lmemcpy_bad3
1296         beq     .Lmemcpy_bad2
1297         b       .Lmemcpy_bad1
1298
1299 .Lmemcpy_bad1_loop16:
             /*
              * Source is 1 byte past a word boundary (entered at
              * .Lmemcpy_bad1 below).  ip carries the last loaded source
              * word, three bytes of which are still unused; each output word
              * is those three bytes plus the first byte of the next source
              * word.  This loop produces four words per pass.
              */
1300 #ifdef __ARMEB__
1301         mov     r4, ip, lsl #8
1302 #else
1303         mov     r4, ip, lsr #8
1304 #endif
1305         ldr     r5, [r1], #0x04
1306         pld     [r1, #0x018]
1307         ldr     r6, [r1], #0x04
1308         ldr     r7, [r1], #0x04
1309         ldr     ip, [r1], #0x04         /* becomes the carry for the next pass */
1310 #ifdef __ARMEB__
1311         orr     r4, r4, r5, lsr #24
1312         mov     r5, r5, lsl #8
1313         orr     r5, r5, r6, lsr #24
1314         mov     r6, r6, lsl #8
1315         orr     r6, r6, r7, lsr #24
1316         mov     r7, r7, lsl #8
1317         orr     r7, r7, ip, lsr #24
1318 #else
1319         orr     r4, r4, r5, lsl #24
1320         mov     r5, r5, lsr #8
1321         orr     r5, r5, r6, lsl #24
1322         mov     r6, r6, lsr #8
1323         orr     r6, r6, r7, lsl #24
1324         mov     r7, r7, lsr #8
1325         orr     r7, r7, ip, lsl #24
1326 #endif
1327         str     r4, [r3], #0x04
1328         str     r5, [r3], #0x04
1329         str     r6, [r3], #0x04
1330         str     r7, [r3], #0x04
1331 .Lmemcpy_bad1:
1332         subs    r2, r2, #0x10           /* room for another 16 bytes? */
1333         bge     .Lmemcpy_bad1_loop16
1334
1335         adds    r2, r2, #0x10           /* r2 = bytes remaining */
1336         ldmeqfd sp!, {r4-r7}
1337         RETeq                   /* Return now if done */
1338         subs    r2, r2, #0x04           /* r2 = remaining - 4 */
1339         sublt   r1, r1, #0x03           /* < 4 left: point r1 at the first unread byte */
1340         blt     .Lmemcpy_bad_done
1341
1342 .Lmemcpy_bad1_loop4:
             /* Same splice, one word per pass */
1343 #ifdef __ARMEB__
1344         mov     r4, ip, lsl #8
1345 #else
1346         mov     r4, ip, lsr #8
1347 #endif
1348         ldr     ip, [r1], #0x04
1349         subs    r2, r2, #0x04           /* flags are consumed by the bge after the store */
1350 #ifdef __ARMEB__
1351         orr     r4, r4, ip, lsr #24
1352 #else
1353         orr     r4, r4, ip, lsl #24
1354 #endif
1355         str     r4, [r3], #0x04
1356         bge     .Lmemcpy_bad1_loop4
1357         sub     r1, r1, #0x03           /* three bytes of ip are still unread */
1358         b       .Lmemcpy_bad_done
1359
1360 .Lmemcpy_bad2_loop16:
             /*
              * Source is 2 bytes past a word boundary (entered at
              * .Lmemcpy_bad2 below).  ip carries the last loaded source
              * word, two bytes of which are still unused; each output word is
              * those two bytes plus the first two bytes of the next source
              * word.  This loop produces four words per pass.
              */
1361 #ifdef __ARMEB__
1362         mov     r4, ip, lsl #16
1363 #else
1364         mov     r4, ip, lsr #16
1365 #endif
1366         ldr     r5, [r1], #0x04
1367         pld     [r1, #0x018]
1368         ldr     r6, [r1], #0x04
1369         ldr     r7, [r1], #0x04
1370         ldr     ip, [r1], #0x04         /* becomes the carry for the next pass */
1371 #ifdef __ARMEB__
1372         orr     r4, r4, r5, lsr #16
1373         mov     r5, r5, lsl #16
1374         orr     r5, r5, r6, lsr #16
1375         mov     r6, r6, lsl #16
1376         orr     r6, r6, r7, lsr #16
1377         mov     r7, r7, lsl #16
1378         orr     r7, r7, ip, lsr #16
1379 #else
1380         orr     r4, r4, r5, lsl #16
1381         mov     r5, r5, lsr #16
1382         orr     r5, r5, r6, lsl #16
1383         mov     r6, r6, lsr #16
1384         orr     r6, r6, r7, lsl #16
1385         mov     r7, r7, lsr #16
1386         orr     r7, r7, ip, lsl #16
1387 #endif
1388         str     r4, [r3], #0x04
1389         str     r5, [r3], #0x04
1390         str     r6, [r3], #0x04
1391         str     r7, [r3], #0x04
1392 .Lmemcpy_bad2:
1393         subs    r2, r2, #0x10           /* room for another 16 bytes? */
1394         bge     .Lmemcpy_bad2_loop16
1395
1396         adds    r2, r2, #0x10           /* r2 = bytes remaining */
1397         ldmeqfd sp!, {r4-r7}
1398         RETeq                   /* Return now if done */
1399         subs    r2, r2, #0x04           /* r2 = remaining - 4 */
1400         sublt   r1, r1, #0x02           /* < 4 left: point r1 at the first unread byte */
1401         blt     .Lmemcpy_bad_done
1402
1403 .Lmemcpy_bad2_loop4:
             /* Same splice, one word per pass */
1404 #ifdef __ARMEB__
1405         mov     r4, ip, lsl #16
1406 #else
1407         mov     r4, ip, lsr #16
1408 #endif
1409         ldr     ip, [r1], #0x04
1410         subs    r2, r2, #0x04           /* flags are consumed by the bge after the store */
1411 #ifdef __ARMEB__
1412         orr     r4, r4, ip, lsr #16
1413 #else
1414         orr     r4, r4, ip, lsl #16
1415 #endif
1416         str     r4, [r3], #0x04
1417         bge     .Lmemcpy_bad2_loop4
1418         sub     r1, r1, #0x02           /* two bytes of ip are still unread */
1419         b       .Lmemcpy_bad_done
1420
1421 .Lmemcpy_bad3_loop16:
             /*
              * Source is 3 bytes past a word boundary (entered at
              * .Lmemcpy_bad3 below).  ip carries the last loaded source
              * word, one byte of which is still unused; each output word is
              * that byte plus the first three bytes of the next source word.
              * This loop produces four words per pass.
              */
1422 #ifdef __ARMEB__
1423         mov     r4, ip, lsl #24
1424 #else
1425         mov     r4, ip, lsr #24
1426 #endif
1427         ldr     r5, [r1], #0x04
1428         pld     [r1, #0x018]
1429         ldr     r6, [r1], #0x04
1430         ldr     r7, [r1], #0x04
1431         ldr     ip, [r1], #0x04         /* becomes the carry for the next pass */
1432 #ifdef __ARMEB__
1433         orr     r4, r4, r5, lsr #8
1434         mov     r5, r5, lsl #24
1435         orr     r5, r5, r6, lsr #8
1436         mov     r6, r6, lsl #24
1437         orr     r6, r6, r7, lsr #8
1438         mov     r7, r7, lsl #24
1439         orr     r7, r7, ip, lsr #8
1440 #else
1441         orr     r4, r4, r5, lsl #8
1442         mov     r5, r5, lsr #24
1443         orr     r5, r5, r6, lsl #8
1444         mov     r6, r6, lsr #24
1445         orr     r6, r6, r7, lsl #8
1446         mov     r7, r7, lsr #24
1447         orr     r7, r7, ip, lsl #8
1448 #endif
1449         str     r4, [r3], #0x04
1450         str     r5, [r3], #0x04
1451         str     r6, [r3], #0x04
1452         str     r7, [r3], #0x04
1453 .Lmemcpy_bad3:
1454         subs    r2, r2, #0x10           /* room for another 16 bytes? */
1455         bge     .Lmemcpy_bad3_loop16
1456
1457         adds    r2, r2, #0x10           /* r2 = bytes remaining */
1458         ldmeqfd sp!, {r4-r7}
1459         RETeq                   /* Return now if done */
1460         subs    r2, r2, #0x04           /* r2 = remaining - 4 */
1461         sublt   r1, r1, #0x01           /* < 4 left: point r1 at the first unread byte */
1462         blt     .Lmemcpy_bad_done
1463
1464 .Lmemcpy_bad3_loop4:
             /* Same splice, one word per pass */
1465 #ifdef __ARMEB__
1466         mov     r4, ip, lsl #24
1467 #else
1468         mov     r4, ip, lsr #24
1469 #endif
1470         ldr     ip, [r1], #0x04
1471         subs    r2, r2, #0x04           /* flags are consumed by the bge after the store */
1472 #ifdef __ARMEB__
1473         orr     r4, r4, ip, lsr #8
1474 #else
1475         orr     r4, r4, ip, lsl #8
1476 #endif
1477         str     r4, [r3], #0x04
1478         bge     .Lmemcpy_bad3_loop4
1479         sub     r1, r1, #0x01           /* one byte of ip is still unread; falls into the tail */
1480
1481 .Lmemcpy_bad_done:
             /*
              * Shared tail of the misaligned-source paths.  r1 is a true byte
              * pointer again, r2 = remaining - 4 (so -4..-1), r3 = destination.
              */
1482         ldmfd   sp!, {r4-r7}
1483         adds    r2, r2, #0x04           /* r2 = bytes left (0-3) */
1484         RETeq
             /* 1-3 trailing bytes: always one, a second if r2 >= 2, a third if r2 > 2 */
1485         ldrb    ip, [r1], #0x01
1486         cmp     r2, #0x02
1487         ldrgeb  r2, [r1], #0x01         /* r2 is reused as a data register from here on */
1488         strb    ip, [r3], #0x01
1489         ldrgtb  ip, [r1]
1490         strgeb  r2, [r3], #0x01
1491         strgtb  ip, [r3]
1492         RET
1493
1494
1495 /*
1496  * Handle short copies (12 bytes or fewer), possibly misaligned.
1497  * Some of these are *very* common, thanks to the network stack,
1498  * and so are handled specially.
1499  *
      * Dispatch is a jump table indexed by the length in r2: pc reads as the
      * address of the "add" plus 8, which is the first table entry after the
      * nop, and every entry is expected to be a single 4-byte instruction
      * (hence "lsl #2").
      * NOTE(review): this relies on RET (a macro defined outside this chunk)
      * assembling to exactly one instruction -- confirm.
1499  */
1500 .Lmemcpy_short:
1501         add     pc, pc, r2, lsl #2
1502         nop
1503         RET                     /* 0x00 */
1504         b       .Lmemcpy_bytewise       /* 0x01 */
1505         b       .Lmemcpy_bytewise       /* 0x02 */
1506         b       .Lmemcpy_bytewise       /* 0x03 */
1507         b       .Lmemcpy_4              /* 0x04 */
1508         b       .Lmemcpy_bytewise       /* 0x05 */
1509         b       .Lmemcpy_6              /* 0x06 */
1510         b       .Lmemcpy_bytewise       /* 0x07 */
1511         b       .Lmemcpy_8              /* 0x08 */
1512         b       .Lmemcpy_bytewise       /* 0x09 */
1513         b       .Lmemcpy_bytewise       /* 0x0a */
1514         b       .Lmemcpy_bytewise       /* 0x0b */
1515         b       .Lmemcpy_c              /* 0x0c */
1516 .Lmemcpy_bytewise:
             /*
              * Generic byte loop; only reached from the table with r2 >= 1.
              * The next byte is loaded before the branch back, and the load is
              * conditional so nothing is read past the end of the source.
              */
1517         mov     r3, r0                  /* We must not clobber r0 */
1518         ldrb    ip, [r1], #0x01
1519 1:      subs    r2, r2, #0x01
1520         strb    ip, [r3], #0x01
1521         ldrneb  ip, [r1], #0x01
1522         bne     1b
1523         RET
1524
1525 /******************************************************************************
1526  * Special case for 4 byte copies
      *
      * Sixteen fixed-size code slots, one per combination of destination and
      * source alignment.  The slot index is ((dst & 3) << 2) | (src & 3) and
      * every slot is padded to 1 << LMEMCPY_4_LOG2 bytes, so the dispatcher can
      * jump to ".Lmemcpy_4 + index * slot size".  Slot 0000 holds the
      * dispatcher itself followed by the fully aligned copy.
      *
      * In the slot comments a register is pictured most-significant byte first;
      * digits are source byte numbers, 'x' is a byte outside the 4 being copied
      * and '.' is zero.
1527  */
1528 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1529 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1530         LMEMCPY_4_PAD
1531 .Lmemcpy_4:
1532         and     r2, r1, #0x03
1533         orr     r2, r2, r0, lsl #2
1534         ands    r2, r2, #0x0f           /* r2 = slot index; Z set for slot 0000 */
1535         sub     r3, pc, #0x14           /* pc reads as this insn + 8 = .Lmemcpy_4 + 0x14, so r3 = .Lmemcpy_4 */
1536         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1537
1538 /*
1539  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1540  */
1541         ldr     r2, [r1]
1542         str     r2, [r0]
1543         RET
1544         LMEMCPY_4_PAD
1545
1546 /*
1547  * 0001: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 1)
      * Both loads are word aligned: they fetch the two aligned words that
      * straddle the 4 source bytes.
1548  */
1549         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1550         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1551 #ifdef __ARMEB__
1552         mov     r3, r3, lsl #8          /* r3 = 012. */
1553         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
1554 #else
1555         mov     r3, r3, lsr #8          /* r3 = .210 */
1556         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1557 #endif
1558         str     r3, [r0]
1559         RET
1560         LMEMCPY_4_PAD
1561
1562 /*
1563  * 0010: dst is 32-bit aligned, src is 16-bit aligned
      * The halfword holding the higher-order 16 bits is loaded into r3.
1564  */
1565 #ifdef __ARMEB__
1566         ldrh    r3, [r1]
1567         ldrh    r2, [r1, #0x02]
1568 #else
1569         ldrh    r3, [r1, #0x02]
1570         ldrh    r2, [r1]
1571 #endif
1572         orr     r3, r2, r3, lsl #16
1573         str     r3, [r0]
1574         RET
1575         LMEMCPY_4_PAD
1576
1577 /*
1578  * 0011: dst is 32-bit aligned, src is 8-bit aligned (src & 3 == 3)
      * Both loads are word aligned: they fetch the two aligned words that
      * straddle the 4 source bytes.
1579  */
1580         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1581         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1582 #ifdef __ARMEB__
1583         mov     r3, r3, lsl #24         /* r3 = 0... */
1584         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
1585 #else
1586         mov     r3, r3, lsr #24         /* r3 = ...0 */
1587         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1588 #endif
1589         str     r3, [r0]
1590         RET
1591         LMEMCPY_4_PAD
1592
1593 /*
1594  * 0100: dst is 8-bit aligned (dst & 3 == 1), src is 32-bit aligned
      * dst + 1 is halfword aligned, so the store is byte / halfword / byte.
      * r1 is reused as scratch once the source word has been loaded.
1595  */
1596         ldr     r2, [r1]
1597 #ifdef __ARMEB__
1598         strb    r2, [r0, #0x03]
1599         mov     r3, r2, lsr #8
1600         mov     r1, r2, lsr #24
1601         strb    r1, [r0]
1602 #else
1603         strb    r2, [r0]
1604         mov     r3, r2, lsr #8
1605         mov     r1, r2, lsr #24
1606         strb    r1, [r0, #0x03]
1607 #endif
1608         strh    r3, [r0, #0x01]         /* strh stores the low 16 bits: source bytes 1 and 2 */
1609         RET
1610         LMEMCPY_4_PAD
1611
1612 /*
1613  * 0101: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 1)
      * src + 1 and dst + 1 are both halfword aligned: byte / halfword / byte.
1614  */
1615         ldrb    r2, [r1]
1616         ldrh    r3, [r1, #0x01]
1617         ldrb    r1, [r1, #0x03]
1618         strb    r2, [r0]
1619         strh    r3, [r0, #0x01]
1620         strb    r1, [r0, #0x03]
1621         RET
1622         LMEMCPY_4_PAD
1623
1624 /*
1625  * 0110: dst is 8-bit aligned (dst & 3 == 1), src is 16-bit aligned
1626  */
1627         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1628         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1629 #ifdef __ARMEB__
1630         mov     r1, r2, lsr #8          /* r1 = ...0 */
1631         strb    r1, [r0]
1632         mov     r2, r2, lsl #8          /* r2 = .01. */
1633         orr     r2, r2, r3, lsr #8      /* r2 = .012 */
1634 #else
1635         strb    r2, [r0]
1636         mov     r2, r2, lsr #8          /* r2 = ...1 */
1637         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1638         mov     r3, r3, lsr #8          /* r3 = ...3 */
1639 #endif
1640         strh    r2, [r0, #0x01]         /* low 16 bits of r2: source bytes 1 and 2 */
1641         strb    r3, [r0, #0x03]         /* low 8 bits of r3: source byte 3 */
1642         RET
1643         LMEMCPY_4_PAD
1644
1645 /*
1646  * 0111: dst is 8-bit aligned (dst & 3 == 1), src is 8-bit aligned (src & 3 == 3)
      * src + 1 and dst + 1 are both halfword aligned: byte / halfword / byte.
1647  */
1648         ldrb    r2, [r1]
1649         ldrh    r3, [r1, #0x01]
1650         ldrb    r1, [r1, #0x03]
1651         strb    r2, [r0]
1652         strh    r3, [r0, #0x01]
1653         strb    r1, [r0, #0x03]
1654         RET
1655         LMEMCPY_4_PAD
1656
1657 /*
1658  * 1000: dst is 16-bit aligned, src is 32-bit aligned
      * One word load, two halfword stores.
1659  */
1660         ldr     r2, [r1]
1661 #ifdef __ARMEB__
1662         strh    r2, [r0, #0x02]
1663         mov     r3, r2, lsr #16
1664         strh    r3, [r0]
1665 #else
1666         strh    r2, [r0]
1667         mov     r3, r2, lsr #16
1668         strh    r3, [r0, #0x02]
1669 #endif
1670         RET
1671         LMEMCPY_4_PAD
1672
1673 /*
1674  * 1001: dst is 16-bit aligned, src is 8-bit aligned (src & 3 == 1)
      * Both loads are word aligned; strh stores only the low 16 bits, so the
      * 'x' bytes left in the upper half of a register are never written out.
1675  */
1676         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1677         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1678         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1679         strh    r1, [r0]
1680 #ifdef __ARMEB__
1681         mov     r2, r2, lsl #8          /* r2 = 012. */
1682         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1683 #else
1684         mov     r2, r2, lsr #24         /* r2 = ...2 */
1685         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1686 #endif
1687         strh    r2, [r0, #0x02]
1688         RET
1689         LMEMCPY_4_PAD
1690
1691 /*
1692  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1693  */
1694         ldrh    r2, [r1]
1695         ldrh    r3, [r1, #0x02]
1696         strh    r2, [r0]
1697         strh    r3, [r0, #0x02]
1698         RET
1699         LMEMCPY_4_PAD
1700
1701 /*
1702  * 1011: dst is 16-bit aligned, src is 8-bit aligned (src & 3 == 3)
      * Both loads are word aligned; strh stores only the low 16 bits, so the
      * 'x' bytes left in the upper half of a register are never written out.
1703  */
1704         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1705         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1706         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1707         strh    r1, [r0, #0x02]
1708 #ifdef __ARMEB__
1709         mov     r3, r3, lsr #24         /* r3 = ...1 */
1710         orr     r3, r3, r2, lsl #8      /* r3 = xx01 */
1711 #else
1712         mov     r3, r3, lsl #8          /* r3 = 321. */
1713         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1714 #endif
1715         strh    r3, [r0]
1716         RET
1717         LMEMCPY_4_PAD
1718
1719 /*
1720  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1721  */
1722         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1723 #ifdef __ARMEB__
1724         strb    r2, [r0, #0x03]
1725         mov     r3, r2, lsr #8
1726         mov     r1, r2, lsr #24
1727         strh    r3, [r0, #0x01]
1728         strb    r1, [r0]
1729 #else
1730         strb    r2, [r0]
1731         mov     r3, r2, lsr #8
1732         mov     r1, r2, lsr #24
1733         strh    r3, [r0, #0x01]
1734         strb    r1, [r0, #0x03]
1735 #endif
1736         RET
1737         LMEMCPY_4_PAD
1738
1739 /*
1740  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1741  */
1742         ldrb    r2, [r1]
1743         ldrh    r3, [r1, #0x01]
1744         ldrb    r1, [r1, #0x03]
1745         strb    r2, [r0]
1746         strh    r3, [r0, #0x01]
1747         strb    r1, [r0, #0x03]
1748         RET
1749         LMEMCPY_4_PAD
1750
1751 /*
1752  * 1110: dst is 8-bit aligned, src is 16-bit aligned (4 bytes copied)
1753  */
1754 #ifdef __ARMEB__
1755         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1756         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1757         strb    r3, [r0, #0x03]         /* store byte 3 */
1758         mov     r3, r3, lsr #8          /* r3 = ...2 */
1759         orr     r3, r3, r2, lsl #8      /* r3 = .012 */
1760         strh    r3, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
1761         mov     r2, r2, lsr #8          /* r2 = ...0 */
1762         strb    r2, [r0]                /* store byte 0 */
1763 #else
1764         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1765         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1766         strb    r2, [r0]                /* store byte 0 */
1767         mov     r2, r2, lsr #8          /* r2 = ...1 */
1768         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1769         strh    r2, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
1770         mov     r3, r3, lsr #8          /* r3 = ...3 */
1771         strb    r3, [r0, #0x03]         /* store byte 3 */
1772 #endif
1773         RET
1774         LMEMCPY_4_PAD
1775
1776 /*
1777  * 1111: dst is 8-bit aligned, src is 8-bit aligned (byte, halfword, byte)
1778  */
1779         ldrb    r2, [r1]                /* r2 = byte 0 */
1780         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
1781         ldrb    r1, [r1, #0x03]         /* r1 = byte 3 (src pointer is dead now) */
1782         strb    r2, [r0]                /* store byte 0 */
1783         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
1784         strb    r1, [r0, #0x03]         /* store byte 3 */
1785         RET
1786         LMEMCPY_4_PAD
1787
1788
1789 /******************************************************************************
1790  * Special case for 6 byte copies: 16-way dispatch on dst (r0) / src (r1) alignment
1791  */
1792 #define LMEMCPY_6_LOG2  6       /* log2 of the per-case slot size: 64 bytes */
1793 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2   /* pad up to the next slot boundary */
1794         LMEMCPY_6_PAD
1795 .Lmemcpy_6:
1796         and     r2, r1, #0x03           /* r2 = src & 3 */
1797         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
1798         ands    r2, r2, #0x0f           /* r2 = ((dst & 3) << 2) | (src & 3); Z set when index is 0 */
1799         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_6 (pc reads as this insn + 8 = base + 0x14) */
1800         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2  /* index != 0: jump to slot base + index * 64 */
1801
1802 /*
1803  * 0000: dst is 32-bit aligned, src is 32-bit aligned (index 0 falls through to here)
1804  */
1805         ldr     r2, [r1]                /* r2 = bytes 0-3 */
1806         ldrh    r3, [r1, #0x04]         /* r3 = bytes 4-5 */
1807         str     r2, [r0]                /* store bytes 0-3 */
1808         strh    r3, [r0, #0x04]         /* store bytes 4-5 */
1809         RET
1810         LMEMCPY_6_PAD
1811
1812 /*
1813  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
1814  */
1815         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1816         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1817 #ifdef __ARMEB__
1818         mov     r2, r2, lsl #8          /* r2 = 012. */
1819         orr     r2, r2, r3, lsr #24     /* r2 = 0123 */
1820 #else
1821         mov     r2, r2, lsr #8          /* r2 = .210 */
1822         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1823 #endif
1824         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1825         str     r2, [r0]                /* store bytes 0-3 */
1826         strh    r3, [r0, #0x04]         /* store bytes 4-5 (low halfword) */
1827         RET
1828         LMEMCPY_6_PAD
1829
1830 /*
1831  * 0010: dst is 32-bit aligned, src is 16-bit aligned (byte 2)
1832  */
1833         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1834         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1835 #ifdef __ARMEB__
1836         mov     r1, r3, lsr #16         /* r1 = ..23 */
1837         orr     r1, r1, r2, lsl #16     /* r1 = 0123 */
1838         str     r1, [r0]                /* store bytes 0-3 */
1839         strh    r3, [r0, #0x04]         /* store bytes 4-5 (low halfword of 2345) */
1840 #else
1841         mov     r1, r3, lsr #16         /* r1 = ..54 */
1842         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1843         str     r2, [r0]                /* store bytes 0-3 */
1844         strh    r1, [r0, #0x04]         /* store bytes 4-5 */
1845 #endif
1846         RET
1847         LMEMCPY_6_PAD
1848
1849 /*
1850  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
1851  */
1852         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1853         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1854         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r1 = xxx5 */
1855 #ifdef __ARMEB__
1856         mov     r2, r2, lsl #24         /* r2 = 0... */
1857         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
1858         mov     r3, r3, lsl #8          /* r3 = 234. */
1859         orr     r1, r3, r1, lsr #24     /* r1 = 2345 */
1860 #else
1861         mov     r2, r2, lsr #24         /* r2 = ...0 */
1862         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1863         mov     r1, r1, lsl #8          /* r1 = xx5. */
1864         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1865 #endif
1866         str     r2, [r0]                /* store bytes 0-3 */
1867         strh    r1, [r0, #0x04]         /* store bytes 4-5 (low halfword) */
1868         RET
1869         LMEMCPY_6_PAD
1870
1871 /*
1872  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1873  */
1874         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1875         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1876         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1877         strh    r1, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
1878 #ifdef __ARMEB__
1879         mov     r1, r3, lsr #24         /* r1 = ...0 */
1880         strb    r1, [r0]                /* store byte 0 */
1881         mov     r3, r3, lsl #8          /* r3 = 123. */
1882         orr     r3, r3, r2, lsr #8      /* r3 = 1234 */
1883 #else
1884         strb    r3, [r0]                /* store byte 0 */
1885         mov     r3, r3, lsr #24         /* r3 = ...3 */
1886         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1887         mov     r2, r2, lsr #8          /* r2 = ...5 */
1888 #endif
1889         strh    r3, [r0, #0x03]         /* store bytes 3-4 (low halfword) */
1890         strb    r2, [r0, #0x05]         /* store byte 5 (low byte) */
1891         RET
1892         LMEMCPY_6_PAD
1893
1894 /*
1895  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1896  */
1897         ldrb    r2, [r1]                /* r2 = byte 0 */
1898         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
1899         ldrh    ip, [r1, #0x03]         /* ip = bytes 3-4 */
1900         ldrb    r1, [r1, #0x05]         /* r1 = byte 5 (src pointer is dead now) */
1901         strb    r2, [r0]                /* store byte 0 */
1902         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
1903         strh    ip, [r0, #0x03]         /* store bytes 3-4 */
1904         strb    r1, [r0, #0x05]         /* store byte 5 */
1905         RET
1906         LMEMCPY_6_PAD
1907
1908 /*
1909  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned (byte 2)
1910  */
1911         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1912         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1913 #ifdef __ARMEB__
1914         mov     r3, r2, lsr #8          /* r3 = ...0 */
1915         strb    r3, [r0]                /* store byte 0 */
1916         strb    r1, [r0, #0x05]         /* store byte 5 (low byte of 2345) */
1917         mov     r3, r1, lsr #8          /* r3 = .234 */
1918         strh    r3, [r0, #0x03]         /* store bytes 3-4 (low halfword) */
1919         mov     r3, r2, lsl #8          /* r3 = .01. */
1920         orr     r3, r3, r1, lsr #24     /* r3 = .012 */
1921         strh    r3, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
1922 #else
1923         strb    r2, [r0]                /* store byte 0 */
1924         mov     r3, r1, lsr #24         /* r3 = ...5 */
1925         strb    r3, [r0, #0x05]         /* store byte 5 */
1926         mov     r3, r1, lsr #8          /* r3 = .543 */
1927         strh    r3, [r0, #0x03]         /* store bytes 3-4 (low halfword) */
1928         mov     r3, r2, lsr #8          /* r3 = ...1 */
1929         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
1930         strh    r3, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
1931 #endif
1932         RET
1933         LMEMCPY_6_PAD
1934
1935 /*
1936  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1937  */
1938         ldrb    r2, [r1]                /* r2 = byte 0 */
1939         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
1940         ldrh    ip, [r1, #0x03]         /* ip = bytes 3-4 */
1941         ldrb    r1, [r1, #0x05]         /* r1 = byte 5 (src pointer is dead now) */
1942         strb    r2, [r0]                /* store byte 0 */
1943         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
1944         strh    ip, [r0, #0x03]         /* store bytes 3-4 */
1945         strb    r1, [r0, #0x05]         /* store byte 5 */
1946         RET
1947         LMEMCPY_6_PAD
1948
1949 /*
1950  * 1000: dst is 16-bit aligned (byte 2), src is 32-bit aligned
1951  */
1952 #ifdef __ARMEB__
1953         ldr     r2, [r1]                /* r2 = 0123 */
1954         ldrh    r3, [r1, #0x04]         /* r3 = ..45 */
1955         mov     r1, r2, lsr #16         /* r1 = ..01 */
1956         orr     r3, r3, r2, lsl#16      /* r3 = 2345 */
1957         strh    r1, [r0]                /* store bytes 0-1 */
1958         str     r3, [r0, #0x02]         /* store bytes 2-5 */
1959 #else
1960         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
1961         ldr     r3, [r1]                /* r3 = 3210 */
1962         mov     r2, r2, lsl #16         /* r2 = 54.. */
1963         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
1964         strh    r3, [r0]                /* store bytes 0-1 (low halfword of 3210) */
1965         str     r2, [r0, #0x02]         /* store bytes 2-5 */
1966 #endif
1967         RET
1968         LMEMCPY_6_PAD
1969
1970 /*
1971  * 1001: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 1)
1972  */
1973         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1974         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
1975         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1976 #ifdef __ARMEB__
1977         mov     r2, r2, lsr #8          /* r2 = .345 */
1978         orr     r2, r2, r3, lsl #24     /* r2 = 2345 */
1979 #else
1980         mov     r2, r2, lsl #8          /* r2 = 543. */
1981         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
1982 #endif
1983         strh    r1, [r0]                /* store bytes 0-1 (low halfword) */
1984         str     r2, [r0, #0x02]         /* store bytes 2-5 */
1985         RET
1986         LMEMCPY_6_PAD
1987
1988 /*
1989  * 1010: dst is 16-bit aligned (byte 2), src is 16-bit aligned (byte 2)
1990  */
1991         ldrh    r2, [r1]                /* r2 = bytes 0-1 */
1992         ldr     r3, [r1, #0x02]         /* r3 = bytes 2-5 */
1993         strh    r2, [r0]                /* store bytes 0-1 */
1994         str     r3, [r0, #0x02]         /* store bytes 2-5 */
1995         RET
1996         LMEMCPY_6_PAD
1997
1998 /*
1999  * 1011: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 3)
2000  */
2001         ldrb    r3, [r1]                /* r3 = ...0 */
2002         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2003         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
2004 #ifdef __ARMEB__
2005         mov     r3, r3, lsl #8          /* r3 = ..0. */
2006         orr     r3, r3, r2, lsr #24     /* r3 = ..01 */
2007         orr     r1, r1, r2, lsl #8      /* r1 = 2345 */
2008 #else
2009         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2010         mov     r1, r1, lsl #24         /* r1 = 5... */
2011         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
2012 #endif
2013         strh    r3, [r0]                /* store bytes 0-1 (low halfword) */
2014         str     r1, [r0, #0x02]         /* store bytes 2-5 */
2015         RET
2016         LMEMCPY_6_PAD
2017
2018 /*
2019  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2020  */
2021         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2022         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
2023 #ifdef __ARMEB__
2024         mov     r3, r2, lsr #24         /* r3 = ...0 */
2025         strb    r3, [r0]                /* store byte 0 */
2026         mov     r2, r2, lsl #8          /* r2 = 123. */
2027         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2028 #else
2029         strb    r2, [r0]                /* store byte 0 */
2030         mov     r2, r2, lsr #8          /* r2 = .321 */
2031         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
2032         mov     r1, r1, lsr #8          /* r1 = ...5 */
2033 #endif
2034         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2035         strb    r1, [r0, #0x05]         /* store byte 5 (low byte) */
2036         RET
2037         LMEMCPY_6_PAD
2038
2039 /*
2040  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2041  */
2042         ldrb    r2, [r1]                /* r2 = byte 0 */
2043         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
2044         ldrh    ip, [r1, #0x03]         /* ip = bytes 3-4 */
2045         ldrb    r1, [r1, #0x05]         /* r1 = byte 5 (src pointer is dead now) */
2046         strb    r2, [r0]                /* store byte 0 */
2047         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2048         strh    ip, [r0, #0x03]         /* store bytes 3-4 */
2049         strb    r1, [r0, #0x05]         /* store byte 5 */
2050         RET
2051         LMEMCPY_6_PAD
2052
2053 /*
2054  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned (byte 2)
2055  */
2056         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2057         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
2058 #ifdef __ARMEB__
2059         mov     r3, r2, lsr #8          /* r3 = ...0 */
2060         strb    r3, [r0]                /* store byte 0 */
2061         mov     r2, r2, lsl #24         /* r2 = 1... */
2062         orr     r2, r2, r1, lsr #8      /* r2 = 1234 */
2063 #else
2064         strb    r2, [r0]                /* store byte 0 */
2065         mov     r2, r2, lsr #8          /* r2 = ...1 */
2066         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
2067         mov     r1, r1, lsr #24         /* r1 = ...5 */
2068 #endif
2069         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2070         strb    r1, [r0, #0x05]         /* store byte 5 (low byte) */
2071         RET
2072         LMEMCPY_6_PAD
2073
2074 /*
2075  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2076  */
2077         ldrb    r2, [r1]                /* r2 = byte 0 */
2078         ldr     r3, [r1, #0x01]         /* r3 = bytes 1-4 */
2079         ldrb    r1, [r1, #0x05]         /* r1 = byte 5 (src pointer is dead now) */
2080         strb    r2, [r0]                /* store byte 0 */
2081         str     r3, [r0, #0x01]         /* store bytes 1-4 */
2082         strb    r1, [r0, #0x05]         /* store byte 5 */
2083         RET
2084         LMEMCPY_6_PAD
2085
2086
2087 /******************************************************************************
2088  * Special case for 8 byte copies: 16-way dispatch on dst (r0) / src (r1) alignment
2089  */
2090 #define LMEMCPY_8_LOG2  6       /* log2 of the per-case slot size: 64 bytes */
2091 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2   /* pad up to the next slot boundary */
2092         LMEMCPY_8_PAD
2093 .Lmemcpy_8:
2094         and     r2, r1, #0x03           /* r2 = src & 3 */
2095         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
2096         ands    r2, r2, #0x0f           /* r2 = ((dst & 3) << 2) | (src & 3); Z set when index is 0 */
2097         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_8 (pc reads as this insn + 8 = base + 0x14) */
2098         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2  /* index != 0: jump to slot base + index * 64 */
2099
2100 /*
2101  * 0000: dst is 32-bit aligned, src is 32-bit aligned (index 0 falls through to here)
2102  */
2103         ldr     r2, [r1]                /* r2 = bytes 0-3 */
2104         ldr     r3, [r1, #0x04]         /* r3 = bytes 4-7 */
2105         str     r2, [r0]                /* store bytes 0-3 */
2106         str     r3, [r0, #0x04]         /* store bytes 4-7 */
2107         RET
2108         LMEMCPY_8_PAD
2109
2110 /*
2111  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
2112  */
2113         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
2114         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
2115         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2116 #ifdef __ARMEB__
2117         mov     r3, r3, lsl #8          /* r3 = 012. */
2118         orr     r3, r3, r2, lsr #24     /* r3 = 0123 */
2119         orr     r2, r1, r2, lsl #8      /* r2 = 4567 */
2120 #else
2121         mov     r3, r3, lsr #8          /* r3 = .210 */
2122         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
2123         mov     r1, r1, lsl #24         /* r1 = 7... */
2124         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
2125 #endif
2126         str     r3, [r0]                /* store bytes 0-3 */
2127         str     r2, [r0, #0x04]         /* store bytes 4-7 */
2128         RET
2129         LMEMCPY_8_PAD
2130
2131 /*
2132  * 0010: dst is 32-bit aligned, src is 16-bit aligned (byte 2)
2133  */
2134         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2135         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2136         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2137 #ifdef __ARMEB__
2138         mov     r2, r2, lsl #16         /* r2 = 01.. */
2139         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2140         orr     r3, r1, r3, lsl #16     /* r3 = 4567 */
2141 #else
2142         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2143         mov     r3, r3, lsr #16         /* r3 = ..54 */
2144         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
2145 #endif
2146         str     r2, [r0]                /* store bytes 0-3 */
2147         str     r3, [r0, #0x04]         /* store bytes 4-7 */
2148         RET
2149         LMEMCPY_8_PAD
2150
2151 /*
2152  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
2153  */
2154         ldrb    r3, [r1]                /* r3 = ...0 */
2155         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2156         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
2157 #ifdef __ARMEB__
2158         mov     r3, r3, lsl #24         /* r3 = 0... */
2159         orr     r3, r3, r2, lsr #8      /* r3 = 0123 */
2160         mov     r2, r2, lsl #24         /* r2 = 4... */
2161         orr     r2, r2, r1, lsr #8      /* r2 = 4567 */
2162 #else
2163         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
2164         mov     r2, r2, lsr #24         /* r2 = ...4 */
2165         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
2166 #endif
2167         str     r3, [r0]                /* store bytes 0-3 */
2168         str     r2, [r0, #0x04]         /* store bytes 4-7 */
2169         RET
2170         LMEMCPY_8_PAD
2171
2172 /*
2173  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2174  */
2175         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
2176         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
2177 #ifdef __ARMEB__
2178         mov     r1, r3, lsr #24         /* r1 = ...0 */
2179         strb    r1, [r0]                /* store byte 0 */
2180         mov     r1, r3, lsr #8          /* r1 = .012 */
2181         strb    r2, [r0, #0x07]         /* store byte 7 (low byte of 4567) */
2182         mov     r3, r3, lsl #24         /* r3 = 3... */
2183         orr     r3, r3, r2, lsr #8      /* r3 = 3456 */
2184 #else
2185         strb    r3, [r0]                /* store byte 0 */
2186         mov     r1, r2, lsr #24         /* r1 = ...7 */
2187         strb    r1, [r0, #0x07]         /* store byte 7 */
2188         mov     r1, r3, lsr #8          /* r1 = .321 */
2189         mov     r3, r3, lsr #24         /* r3 = ...3 */
2190         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
2191 #endif
2192         strh    r1, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2193         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2194         RET
2195         LMEMCPY_8_PAD
2196
2197 /*
2198  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2199  */
2200         ldrb    r2, [r1]                /* r2 = byte 0 */
2201         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
2202         ldr     ip, [r1, #0x03]         /* ip = bytes 3-6 */
2203         ldrb    r1, [r1, #0x07]         /* r1 = byte 7 (src pointer is dead now) */
2204         strb    r2, [r0]                /* store byte 0 */
2205         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2206         str     ip, [r0, #0x03]         /* store bytes 3-6 */
2207         strb    r1, [r0, #0x07]         /* store byte 7 */
2208         RET
2209         LMEMCPY_8_PAD
2210
2211 /*
2212  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned (byte 2)
2213  */
2214         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2215         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2216         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2217 #ifdef __ARMEB__
2218         mov     ip, r2, lsr #8          /* ip = ...0 */
2219         strb    ip, [r0]                /* store byte 0 */
2220         mov     ip, r2, lsl #8          /* ip = .01. */
2221         orr     ip, ip, r3, lsr #24     /* ip = .012 */
2222         strb    r1, [r0, #0x07]         /* store byte 7 (low byte of ..67) */
2223         mov     r3, r3, lsl #8          /* r3 = 345. */
2224         orr     r3, r3, r1, lsr #8      /* r3 = 3456 */
2225 #else
2226         strb    r2, [r0]                /* 0 */
2227         mov     ip, r1, lsr #8          /* ip = ...7 */
2228         strb    ip, [r0, #0x07]         /* 7 */
2229         mov     ip, r2, lsr #8          /* ip = ...1 */
2230         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2231         mov     r3, r3, lsr #8          /* r3 = .543 */
2232         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
2233 #endif
2234         strh    ip, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2235         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2236         RET
2237         LMEMCPY_8_PAD
2238
2239 /*
2240  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2241  */
2242         ldrb    r3, [r1]                /* r3 = ...0 */
2243         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2244         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
2245         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2246         strb    r3, [r0]                /* store byte 0 */
2247         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
2248 #ifdef __ARMEB__
2249         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2250         orr     r2, r2, ip, lsl #16     /* r2 = 3456 */
2251 #else
2252         strh    ip, [r0, #0x01]         /* store bytes 1-2 (low halfword of 4321) */
2253         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
2254 #endif
2255         str     r2, [r0, #0x03]         /* store bytes 3-6 */
2256         strb    r1, [r0, #0x07]         /* store byte 7 */
2257         RET
2258         LMEMCPY_8_PAD
2259
2260 /*
2261  * 1000: dst is 16-bit aligned (byte 2), src is 32-bit aligned
2262  */
2263         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2264         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2265         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2266 #ifdef __ARMEB__
2267         strh    r1, [r0]                /* store bytes 0-1 */
2268         mov     r1, r3, lsr #16         /* r1 = ..45 */
2269         orr     r2, r1 ,r2, lsl #16     /* r2 = 2345 */
2270 #else
2271         strh    r2, [r0]                /* store bytes 0-1 (low halfword of 3210) */
2272         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
2273         mov     r3, r3, lsr #16         /* r3 = ..76 */
2274 #endif
2275         str     r2, [r0, #0x02]         /* store bytes 2-5 */
2276         strh    r3, [r0, #0x06]         /* store bytes 6-7 (low halfword) */
2277         RET
2278         LMEMCPY_8_PAD
2279
2280 /*
2281  * 1001: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 1)
2282  */
2283         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2284         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2285         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
2286         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
2287         strh    r1, [r0]                /* store bytes 0-1 (low halfword) */
2288 #ifdef __ARMEB__
2289         mov     r1, r2, lsl #24         /* r1 = 2... */
2290         orr     r1, r1, r3, lsr #8      /* r1 = 2345 */
2291         orr     r3, ip, r3, lsl #8      /* r3 = 4567 */
2292 #else
2293         mov     r1, r2, lsr #24         /* r1 = ...2 */
2294         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
2295         mov     r3, r3, lsr #24         /* r3 = ...6 */
2296         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
2297 #endif
2298         str     r1, [r0, #0x02]         /* store bytes 2-5 */
2299         strh    r3, [r0, #0x06]         /* store bytes 6-7 (low halfword) */
2300         RET
2301         LMEMCPY_8_PAD
2302
2303 /*
2304  * 1010: dst is 16-bit aligned (byte 2), src is 16-bit aligned (byte 2)
2305  */
2306         ldrh    r2, [r1]                /* r2 = bytes 0-1 */
2307         ldr     ip, [r1, #0x02]         /* ip = bytes 2-5 */
2308         ldrh    r3, [r1, #0x06]         /* r3 = bytes 6-7 */
2309         strh    r2, [r0]                /* store bytes 0-1 */
2310         str     ip, [r0, #0x02]         /* store bytes 2-5 */
2311         strh    r3, [r0, #0x06]         /* store bytes 6-7 */
2312         RET
2313         LMEMCPY_8_PAD
2314
2315 /*
2316  * 1011: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 3)
2317  */
2318         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 */
2319         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
2320         ldrb    ip, [r1]                /* ip = ...0 */
2321         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
2322         strh    r1, [r0, #0x06]         /* store bytes 6-7 (low halfword) */
2323 #ifdef __ARMEB__
2324         mov     r3, r3, lsr #24         /* r3 = ...5 */
2325         orr     r3, r3, r2, lsl #8      /* r3 = 2345 */
2326         mov     r2, r2, lsr #24         /* r2 = ...1 */
2327         orr     r2, r2, ip, lsl #8      /* r2 = ..01 */
2328 #else
2329         mov     r3, r3, lsl #24         /* r3 = 5... */
2330         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
2331         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
2332 #endif
2333         str     r3, [r0, #0x02]         /* store bytes 2-5 */
2334         strh    r2, [r0]                /* store bytes 0-1 (low halfword) */
2335         RET
2336         LMEMCPY_8_PAD
2337
2338 /*
2339  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2340  */
2341         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2342         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2343         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
2344         strh    r1, [r0, #0x05]         /* store bytes 5-6 (low halfword) */
2345 #ifdef __ARMEB__
2346         strb    r3, [r0, #0x07]         /* store byte 7 (low byte of 4567) */
2347         mov     r1, r2, lsr #24         /* r1 = ...0 */
2348         strb    r1, [r0]                /* store byte 0 */
2349         mov     r2, r2, lsl #8          /* r2 = 123. */
2350         orr     r2, r2, r3, lsr #24     /* r2 = 1234 */
2351         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2352 #else
2353         strb    r2, [r0]                /* store byte 0 */
2354         mov     r1, r3, lsr #24         /* r1 = ...7 */
2355         strb    r1, [r0, #0x07]         /* store byte 7 */
2356         mov     r2, r2, lsr #8          /* r2 = .321 */
2357         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
2358         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2359 #endif
2360         RET
2361         LMEMCPY_8_PAD
2362
2363 /*
2364  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2365  */
2366         ldrb    r3, [r1]                /* r3 = ...0 */
2367         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
2368         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2369         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
2370         strb    r3, [r0]                /* store byte 0 */
2371         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
2372 #ifdef __ARMEB__
2373         strh    ip, [r0, #0x05]         /* store bytes 5-6 (low halfword of 3456) */
2374         orr     r2, r3, r2, lsl #16     /* r2 = 1234 */
2375 #else
2376         strh    r3, [r0, #0x05]         /* store bytes 5-6 */
2377         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
2378 #endif
2379         str     r2, [r0, #0x01]         /* store bytes 1-4 */
2380         strb    r1, [r0, #0x07]         /* store byte 7 */
2381         RET
2382         LMEMCPY_8_PAD
2383
2384 /*
2385  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned (byte 2)
2386  */
2387         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2388         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2389         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
2390 #ifdef __ARMEB__
2391         mov     ip, r2, lsr #8          /* ip = ...0 */
2392         strb    ip, [r0]                /* store byte 0 */
2393         mov     ip, r2, lsl #24         /* ip = 1... */
2394         orr     ip, ip, r3, lsr #8      /* ip = 1234 */
2395         strb    r1, [r0, #0x07]         /* store byte 7 (low byte of ..67) */
2396         mov     r1, r1, lsr #8          /* r1 = ...6 */
2397         orr     r1, r1, r3, lsl #8      /* r1 = 3456 */
2398 #else
2399         strb    r2, [r0]                /* store byte 0 */
2400         mov     ip, r2, lsr #8          /* ip = ...1 */
2401         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
2402         mov     r2, r1, lsr #8          /* r2 = ...7 */
2403         strb    r2, [r0, #0x07]         /* store byte 7 */
2404         mov     r1, r1, lsl #8          /* r1 = .76. */
2405         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
2406 #endif
2407         str     ip, [r0, #0x01]         /* store bytes 1-4 */
2408         strh    r1, [r0, #0x05]         /* store bytes 5-6 (low halfword) */
2409         RET
2410         LMEMCPY_8_PAD
2411
2412 /*
2413  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2414  */
2415         ldrb    r2, [r1]                /* r2 = byte 0 */
2416         ldr     ip, [r1, #0x01]         /* ip = bytes 1-4 */
2417         ldrh    r3, [r1, #0x05]         /* r3 = bytes 5-6 */
2418         ldrb    r1, [r1, #0x07]         /* r1 = byte 7 (src pointer is dead now) */
2419         strb    r2, [r0]                /* store byte 0 */
2420         str     ip, [r0, #0x01]         /* store bytes 1-4 */
2421         strh    r3, [r0, #0x05]         /* store bytes 5-6 */
2422         strb    r1, [r0, #0x07]         /* store byte 7 */
2423         RET
2424         LMEMCPY_8_PAD
2425
2426 /******************************************************************************
2427  * Special case for 12 byte copies: 16-way dispatch on dst (r0) / src (r1) alignment
2428  */
2429 #define LMEMCPY_C_LOG2  7       /* log2 of the per-case slot size: 128 bytes */
2430 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2   /* pad up to the next slot boundary */
2431         LMEMCPY_C_PAD
2432 .Lmemcpy_c:
2433         and     r2, r1, #0x03           /* r2 = src & 3 */
2434         orr     r2, r2, r0, lsl #2      /* r2 |= dst << 2 */
2435         ands    r2, r2, #0x0f           /* r2 = ((dst & 3) << 2) | (src & 3); Z set when index is 0 */
2436         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_c (pc reads as this insn + 8 = base + 0x14) */
2437         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2  /* index != 0: jump to slot base + index * 128 */
2438
2439 /*
2440  * 0000: dst is 32-bit aligned, src is 32-bit aligned (index 0 falls through to here)
2441  */
2442         ldr     r2, [r1]                /* r2 = bytes 0-3 */
2443         ldr     r3, [r1, #0x04]         /* r3 = bytes 4-7 */
2444         ldr     r1, [r1, #0x08]         /* r1 = bytes 8-B (src pointer is dead now) */
2445         str     r2, [r0]                /* store bytes 0-3 */
2446         str     r3, [r0, #0x04]         /* store bytes 4-7 */
2447         str     r1, [r0, #0x08]         /* store bytes 8-B */
2448         RET
2449         LMEMCPY_C_PAD
2450
2451 /*
2452  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
2453  */
2454         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
2455         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2456         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2457         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2458 #ifdef __ARMEB__
2459         orr     r2, r2, ip, lsl #8      /* r2 = 89AB */
2460         str     r2, [r0, #0x08]         /* store bytes 8-B */
2461         mov     r2, ip, lsr #24         /* r2 = ...7 */
2462         orr     r2, r2, r3, lsl #8      /* r2 = 4567 */
2463         mov     r1, r1, lsl #8          /* r1 = 012. */
2464         orr     r1, r1, r3, lsr #24     /* r1 = 0123 */
2465 #else
2466         mov     r2, r2, lsl #24         /* r2 = B... */
2467         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
2468         str     r2, [r0, #0x08]         /* store bytes 8-B */
2469         mov     r2, ip, lsl #24         /* r2 = 7... */
2470         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
2471         mov     r1, r1, lsr #8          /* r1 = .210 */
2472         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
2473 #endif
2474         str     r2, [r0, #0x04]         /* store bytes 4-7 */
2475         str     r1, [r0]                /* store bytes 0-3 */
2476         RET
2477         LMEMCPY_C_PAD
2478
2479 /*
2480  * 0010: dst is 32-bit aligned, src is 16-bit aligned (byte 2)
2481  */
2482         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2483         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2484         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2485         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2486 #ifdef __ARMEB__
2487         mov     r2, r2, lsl #16         /* r2 = 01.. */
2488         orr     r2, r2, r3, lsr #16     /* r2 = 0123 */
2489         str     r2, [r0]                /* store bytes 0-3 */
2490         mov     r3, r3, lsl #16         /* r3 = 45.. */
2491         orr     r3, r3, ip, lsr #16     /* r3 = 4567 */
2492         orr     r1, r1, ip, lsl #16     /* r1 = 89AB */
2493 #else
2494         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
2495         str     r2, [r0]                /* store bytes 0-3 */
2496         mov     r3, r3, lsr #16         /* r3 = ..54 */
2497         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
2498         mov     r1, r1, lsl #16         /* r1 = BA.. */
2499         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
2500 #endif
2501         str     r3, [r0, #0x04]         /* store bytes 4-7 */
2502         str     r1, [r0, #0x08]         /* store bytes 8-B */
2503         RET
2504         LMEMCPY_C_PAD
2505
2506 /*
2507  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
2508  */
2509         ldrb    r2, [r1]                /* r2 = ...0 */
2510         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2511         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2512         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2513 #ifdef __ARMEB__
2514         mov     r2, r2, lsl #24         /* r2 = 0... */
2515         orr     r2, r2, r3, lsr #8      /* r2 = 0123 */
2516         str     r2, [r0]                /* store bytes 0-3 */
2517         mov     r3, r3, lsl #24         /* r3 = 4... */
2518         orr     r3, r3, ip, lsr #8      /* r3 = 4567 */
2519         mov     r1, r1, lsr #8          /* r1 = .9AB */
2520         orr     r1, r1, ip, lsl #24     /* r1 = 89AB */
2521 #else
2522         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
2523         str     r2, [r0]                /* store bytes 0-3 */
2524         mov     r3, r3, lsr #24         /* r3 = ...4 */
2525         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
2526         mov     r1, r1, lsl #8          /* r1 = BA9. */
2527         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
2528 #endif
2529         str     r3, [r0, #0x04]         /* store bytes 4-7 */
2530         str     r1, [r0, #0x08]         /* store bytes 8-B */
2531         RET
2532         LMEMCPY_C_PAD
2533
2534 /*
2535  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2536  */
2537         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2538         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2539         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
2540         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
2541         strh    r1, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2542 #ifdef __ARMEB__
2543         mov     r1, r2, lsr #24         /* r1 = ...0 */
2544         strb    r1, [r0]                /* store byte 0 */
2545         mov     r1, r2, lsl #24         /* r1 = 3... */
2546         orr     r2, r1, r3, lsr #8      /* r2 = 3456 */
2547         mov     r1, r3, lsl #24         /* r1 = 7... */
2548         orr     r1, r1, ip, lsr #8      /* r1 = 789A */
2549 #else
2550         strb    r2, [r0]                /* store byte 0 */
2551         mov     r1, r2, lsr #24         /* r1 = ...3 */
2552         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
2553         mov     r1, r3, lsr #24         /* r1 = ...7 */
2554         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
2555         mov     ip, ip, lsr #24         /* ip = ...B */
2556 #endif
2557         str     r2, [r0, #0x03]         /* store bytes 3-6 */
2558         str     r1, [r0, #0x07]         /* store bytes 7-A */
2559         strb    ip, [r0, #0x0b]         /* store byte B (low byte) */
2560         RET
2561         LMEMCPY_C_PAD
2562
2563 /*
2564  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2565  */
2566         ldrb    r2, [r1]                /* r2 = byte 0 */
2567         ldrh    r3, [r1, #0x01]         /* r3 = bytes 1-2 */
2568         ldr     ip, [r1, #0x03]         /* ip = bytes 3-6 */
2569         strb    r2, [r0]                /* store byte 0; frees r2 for the next load */
2570         ldr     r2, [r1, #0x07]         /* r2 = bytes 7-A */
2571         ldrb    r1, [r1, #0x0b]         /* r1 = byte B (src pointer is dead now) */
2572         strh    r3, [r0, #0x01]         /* store bytes 1-2 */
2573         str     ip, [r0, #0x03]         /* store bytes 3-6 */
2574         str     r2, [r0, #0x07]         /* store bytes 7-A */
2575         strb    r1, [r0, #0x0b]         /* store byte B */
2576         RET
2577         LMEMCPY_C_PAD
2578
2579 /*
2580  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2581  */
2582         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
2583         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
2584         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
2585         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
2586 #ifdef __ARMEB__
2587         mov     r2, r2, ror #8          /* r2 = 1..0 */
2588         strb    r2, [r0]                /* store byte 0 (low byte) */
2589         mov     r2, r2, lsr #16         /* r2 = ..1. */
2590         orr     r2, r2, r3, lsr #24     /* r2 = ..12 */
2591         strh    r2, [r0, #0x01]         /* store bytes 1-2 */
2592         mov     r2, r3, lsl #8          /* r2 = 345. */
2593         orr     r3, r2, ip, lsr #24     /* r3 = 3456 */
2594         mov     r2, ip, lsl #8          /* r2 = 789. */
2595         orr     r2, r2, r1, lsr #8      /* r2 = 789A */
2596 #else
2597         strb    r2, [r0]                /* store byte 0 (low byte) */
2598         mov     r2, r2, lsr #8          /* r2 = ...1 */
2599         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2600         strh    r2, [r0, #0x01]         /* store bytes 1-2 (low halfword) */
2601         mov     r2, r3, lsr #8          /* r2 = .543 */
2602         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
2603         mov     r2, ip, lsr #8          /* r2 = .987 */
2604         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
2605         mov     r1, r1, lsr #8          /* r1 = ...B */
2606 #endif
2607         str     r3, [r0, #0x03]         /* store bytes 3-6 */
2608         str     r2, [r0, #0x07]         /* store bytes 7-A */
2609         strb    r1, [r0, #0x0b]         /* store byte B (low byte) */
2610         RET
2611         LMEMCPY_C_PAD
2612
2613 /*
2614  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
      *
      * The source is read as one byte followed by three words at offsets
      * 1, 5 and 9.  The last word load also fetches one byte past the 12
      * being copied (shown as 'x'); that byte is never stored.
      * The data is then re-packed with a 16-bit skew and written as
      * byte + halfword + word + word + byte (1+2+4+4+1).
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets, '.' is a zero byte and 'x' is
      * the extra byte beyond the copied range.
2615  */
2616         ldrb    r2, [r1]                /* r2 = ...0 */
2617         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
2618         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
2619         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 */
2620         strb    r2, [r0]                /* dst[0] */
2621 #ifdef __ARMEB__
2622         mov     r2, r3, lsr #16         /* r2 = ..12 */
2623         strh    r2, [r0, #0x01]         /* dst[1..2] */
2624         mov     r3, r3, lsl #16         /* r3 = 34.. */
2625         orr     r3, r3, ip, lsr #16     /* r3 = 3456 */
2626         mov     ip, ip, lsl #16         /* ip = 78.. */
2627         orr     ip, ip, r1, lsr #16     /* ip = 789A */
2628         mov     r1, r1, lsr #8          /* r1 = .9AB */
2629 #else
2630         strh    r3, [r0, #0x01]         /* dst[1..2] = low half of r3 */
2631         mov     r3, r3, lsr #16         /* r3 = ..43 */
2632         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
2633         mov     ip, ip, lsr #16         /* ip = ..87 */
2634         orr     ip, ip, r1, lsl #16     /* ip = A987 */
2635         mov     r1, r1, lsr #16         /* r1 = ..xB */
2636 #endif
2637         str     r3, [r0, #0x03]         /* dst[3..6] */
2638         str     ip, [r0, #0x07]         /* dst[7..10] */
2639         strb    r1, [r0, #0x0b]         /* dst[11] = low byte of r1 (byte B) */
2640         RET
2641         LMEMCPY_C_PAD
2642
2643 /*
2644  * 1000: dst is 16-bit aligned, src is 32-bit aligned
      *
      * The source is read as three words and re-packed with a 16-bit skew
      * so that it can be written as halfword + word + word + halfword
      * (2+4+4+2).
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets and '.' is a zero byte.
2645  */
2646         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
2647         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
2648         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
2649         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
2650 #ifdef __ARMEB__
2651         strh    r1, [r0]                /* dst[0..1] */
2652         mov     r1, ip, lsl #16         /* r1 = 23.. */
2653         orr     r1, r1, r3, lsr #16     /* r1 = 2345 */
2654         mov     r3, r3, lsl #16         /* r3 = 67.. */
2655         orr     r3, r3, r2, lsr #16     /* r3 = 6789 */
                                             /* r2 keeps 89AB; its low half is AB */
2656 #else
2657         strh    ip, [r0]                /* dst[0..1] = low half of ip */
2658         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
2659         mov     r3, r3, lsr #16         /* r3 = ..76 */
2660         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
2661         mov     r2, r2, lsr #16         /* r2 = ..BA */
2662 #endif
2663         str     r1, [r0, #0x02]         /* dst[2..5] */
2664         str     r3, [r0, #0x06]         /* dst[6..9] */
2665         strh    r2, [r0, #0x0a]         /* dst[10..11] = low half of r2 */
2666         RET
2667         LMEMCPY_C_PAD
2668
2669 /*
2670  * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
      *
      * The first word load starts one byte before src, so it also fetches a
      * byte that is not part of the copy (shown as 'x'); that byte is never
      * stored.  Two more words and a final byte complete the 12 bytes,
      * which are re-packed and written as halfword + word + word + halfword
      * (2+4+4+2).
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets, '.' is a zero byte and 'x' is
      * the extra byte in front of the copied range.
2671  */
2672         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
2673         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
2674         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
2675         strh    ip, [r0]                /* dst[0..1] = low half of ip */
2676         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
2677         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
2678 #ifdef __ARMEB__
2679         mov     r2, r2, lsl #24         /* r2 = 2... */
2680         orr     r2, r2, r3, lsr #8      /* r2 = 2345 */
2681         mov     r3, r3, lsl #24         /* r3 = 6... */
2682         orr     r3, r3, ip, lsr #8      /* r3 = 6789 */
2683         orr     r1, r1, ip, lsl #8      /* r1 = 89AB */
2684 #else
2685         mov     r2, r2, lsr #24         /* r2 = ...2 */
2686         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
2687         mov     r3, r3, lsr #24         /* r3 = ...6 */
2688         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
2689         mov     r1, r1, lsl #8          /* r1 = ..B. */
2690         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
2691 #endif
2692         str     r2, [r0, #0x02]         /* dst[2..5] */
2693         str     r3, [r0, #0x06]         /* dst[6..9] */
2694         strh    r1, [r0, #0x0a]         /* dst[10..11] = low half of r1 */
2695         RET
2696         LMEMCPY_C_PAD
2697
2698 /*
2699  * 1010: dst is 16-bit aligned, src is 16-bit aligned
      *
      * Both pointers share the same offset within a word, so no shifting or
      * merging is required.  The 12 bytes are moved as halfword + word +
      * word + halfword (2+4+4+2): all four loads are issued first, then
      * the four matching stores.
2700  */
2701         ldrh    r2, [r1]                /* r2 = source bytes 0-1 */
2702         ldr     r3, [r1, #0x02]         /* r3 = source bytes 2-5 */
2703         ldr     ip, [r1, #0x06]         /* ip = source bytes 6-9 */
2704         ldrh    r1, [r1, #0x0a]         /* r1 = source bytes A-B (src ptr consumed) */
2705         strh    r2, [r0]                /* dst[0..1] */
2706         str     r3, [r0, #0x02]         /* dst[2..5] */
2707         str     ip, [r0, #0x06]         /* dst[6..9] */
2708         strh    r1, [r0, #0x0a]         /* dst[10..11] */
2709         RET
2710         LMEMCPY_C_PAD
2711
2712 /*
2713  * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
      *
      * This case works from the end of the buffer towards the start: the
      * word at offset 9 is loaded first (it also fetches one byte past the
      * 12 being copied, shown as 'x', which is never stored) and the last
      * destination halfword is written before the remaining loads.
      * The output is written as halfword + word + word + halfword
      * (2+4+4+2), in descending address order.
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets, '.' is a zero byte and 'x' is
      * the extra byte beyond the copied range.
2714  */
2715         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 */
2716         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
2717         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
2718         strh    ip, [r0, #0x0a]         /* dst[10..11] = low half of ip */
2719         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
2720         ldrb    r1, [r1]                /* r1 = ...0 */
2721 #ifdef __ARMEB__
2722         mov     r2, r2, lsr #24         /* r2 = ...9 */
2723         orr     r2, r2, r3, lsl #8      /* r2 = 6789 */
2724         mov     r3, r3, lsr #24         /* r3 = ...5 */
2725         orr     r3, r3, ip, lsl #8      /* r3 = 2345 */
2726         mov     r1, r1, lsl #8          /* r1 = ..0. */
2727         orr     r1, r1, ip, lsr #24     /* r1 = ..01 */
2728 #else
2729         mov     r2, r2, lsl #24         /* r2 = 9... */
2730         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
2731         mov     r3, r3, lsl #24         /* r3 = 5... */
2732         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
2733         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
2734 #endif
2735         str     r2, [r0, #0x06]         /* dst[6..9] */
2736         str     r3, [r0, #0x02]         /* dst[2..5] */
2737         strh    r1, [r0]                /* dst[0..1] = low half of r1 */
2738         RET
2739         LMEMCPY_C_PAD
2740
2741 /*
2742  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
      *
      * The source is read as three words and re-packed with an 8-bit skew
      * so that it can be written as byte + word + word + halfword + byte
      * (1+4+4+2+1).  Each endianness variant issues its own stores, so
      * nothing but RET follows the conditional section.
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets and '.' is a zero byte.
2743  */
2744         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2745         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2746         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2747 #ifdef __ARMEB__
2748         mov     r3, r2, lsr #24         /* r3 = ...0 */
2749         strb    r3, [r0]                /* dst[0] */
2750         mov     r2, r2, lsl #8          /* r2 = 123. */
2751         orr     r2, r2, ip, lsr #24     /* r2 = 1234 */
2752         str     r2, [r0, #0x01]         /* dst[1..4] */
2753         mov     r2, ip, lsl #8          /* r2 = 567. */
2754         orr     r2, r2, r1, lsr #24     /* r2 = 5678 */
2755         str     r2, [r0, #0x05]         /* dst[5..8] */
2756         mov     r2, r1, lsr #8          /* r2 = .89A */
2757         strh    r2, [r0, #0x09]         /* dst[9..10] = low half of r2 (9A) */
2758         strb    r1, [r0, #0x0b]         /* dst[11] = low byte of r1 (byte B) */
2759 #else
2760         strb    r2, [r0]                /* dst[0] = low byte of r2 (byte 0) */
2761         mov     r3, r2, lsr #8          /* r3 = .321 */
2762         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2763         str     r3, [r0, #0x01]         /* dst[1..4] */
2764         mov     r3, ip, lsr #8          /* r3 = .765 */
2765         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2766         str     r3, [r0, #0x05]         /* dst[5..8] */
2767         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2768         strh    r1, [r0, #0x09]         /* dst[9..10] = low half of r1 */
2769         mov     r1, r1, lsr #16         /* r1 = ...B */
2770         strb    r1, [r0, #0x0b]         /* dst[11] */
2771 #endif
2772         RET
2773         LMEMCPY_C_PAD
2774
2775 /*
2776  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
      *
      * This case works from the end of the buffer towards the start.  The
      * source is read as a byte at offset 0xb plus words at offsets 7, 3
      * and -1; the last of these starts one byte before src, so it also
      * fetches a byte that is not part of the copy (shown as 'x'), which
      * is never stored.  The data is re-packed with a 16-bit skew and
      * written as byte + halfword + word + word + byte in descending
      * address order.
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets, '.' is a zero byte and 'x' is
      * the extra byte in front of the copied range.
2777  */
2778         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2779         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2780         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2781         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x */
2782         strb    r2, [r0, #0x0b]         /* dst[11] */
2783 #ifdef __ARMEB__
2784         strh    r3, [r0, #0x09]         /* dst[9..10] = low half of r3 (9A) */
2785         mov     r3, r3, lsr #16         /* r3 = ..78 */
2786         orr     r3, r3, ip, lsl #16     /* r3 = 5678 */
2787         mov     ip, ip, lsr #16         /* ip = ..34 */
2788         orr     ip, ip, r1, lsl #16     /* ip = 1234 */
2789         mov     r1, r1, lsr #16         /* r1 = ..x0 */
2790 #else
2791         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2792         strh    r2, [r0, #0x09]         /* dst[9..10] */
2793         mov     r3, r3, lsl #16         /* r3 = 87.. */
2794         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2795         mov     ip, ip, lsl #16         /* ip = 43.. */
2796         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2797         mov     r1, r1, lsr #8          /* r1 = .210 */
2798 #endif
2799         str     r3, [r0, #0x05]         /* dst[5..8] */
2800         str     ip, [r0, #0x01]         /* dst[1..4] */
2801         strb    r1, [r0]                /* dst[0] = low byte of r1 (byte 0) */
2802         RET
2803         LMEMCPY_C_PAD
2804
2805 /*
2806  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
      *
      * The source is read as halfword + word + word + halfword (2+4+4+2)
      * and re-packed so that it can be written as byte + word + word +
      * halfword + byte (1+4+4+2+1).  The two endianness variants are fully
      * separate: the big-endian path loads from the end of the buffer
      * backwards, the little-endian path loads from the start forwards.
      *
      * Diagram notation: one character per byte, most significant byte
      * first; 0-B are source byte offsets and '.' is a zero byte.
2807  */
2808 #ifdef __ARMEB__
2809         ldrh    r2, [r1, #0x0a]         /* r2 = ..AB */
2810         ldr     ip, [r1, #0x06]         /* ip = 6789 */
2811         ldr     r3, [r1, #0x02]         /* r3 = 2345 */
2812         ldrh    r1, [r1]                /* r1 = ..01 */
2813         strb    r2, [r0, #0x0b]         /* dst[11] = low byte of r2 (byte B) */
2814         mov     r2, r2, lsr #8          /* r2 = ...A */
2815         orr     r2, r2, ip, lsl #8      /* r2 = 789A */
2816         mov     ip, ip, lsr #8          /* ip = .678 */
2817         orr     ip, ip, r3, lsl #24     /* ip = 5678 */
2818         mov     r3, r3, lsr #8          /* r3 = .234 */
2819         orr     r3, r3, r1, lsl #24     /* r3 = 1234 */
2820         mov     r1, r1, lsr #8          /* r1 = ...0 */
2821         strb    r1, [r0]                /* dst[0] */
2822         str     r3, [r0, #0x01]         /* dst[1..4] */
2823         str     ip, [r0, #0x05]         /* dst[5..8] */
2824         strh    r2, [r0, #0x09]         /* dst[9..10] = low half of r2 (9A) */
2825 #else
2826         ldrh    r2, [r1]                /* r2 = ..10 */
2827         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2828         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2829         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2830         strb    r2, [r0]                /* dst[0] = low byte of r2 (byte 0) */
2831         mov     r2, r2, lsr #8          /* r2 = ...1 */
2832         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2833         mov     r3, r3, lsr #24         /* r3 = ...5 */
2834         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2835         mov     ip, ip, lsr #24         /* ip = ...9 */
2836         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2837         mov     r1, r1, lsr #8          /* r1 = ...B */
2838         str     r2, [r0, #0x01]         /* dst[1..4] */
2839         str     r3, [r0, #0x05]         /* dst[5..8] */
2840         strh    ip, [r0, #0x09]         /* dst[9..10] = low half of ip (A9) */
2841         strb    r1, [r0, #0x0b]         /* dst[11] */
2842 #endif
2843         RET
2844         LMEMCPY_C_PAD
2845
2846 /*
2847  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
      *
      * Both pointers share the same offset within a word, so no shifting or
      * merging is required.  The 12 bytes are moved as byte + word + word +
      * halfword + byte (1+4+4+2+1); with the alignment stated above each of
      * those accesses falls on its natural boundary.
      * Loads and stores are interleaved and r2/r1 are reused, so the
      * instruction order matters.  No LMEMCPY_C_PAD follows this case in
      * the code shown here.
2848  */
2849         ldrb    r2, [r1]                /* r2 = source byte 0 */
2850         ldr     r3, [r1, #0x01]         /* r3 = source bytes 1-4 */
2851         ldr     ip, [r1, #0x05]         /* ip = source bytes 5-8 */
2852         strb    r2, [r0]                /* dst[0]; r2 is free again */
2853         ldrh    r2, [r1, #0x09]         /* r2 = source bytes 9-A */
2854         ldrb    r1, [r1, #0x0b]         /* r1 = source byte B (src ptr consumed) */
2855         str     r3, [r0, #0x01]         /* dst[1..4] */
2856         str     ip, [r0, #0x05]         /* dst[5..8] */
2857         strh    r2, [r0, #0x09]         /* dst[9..10] */
2858         strb    r1, [r0, #0x0b]         /* dst[11] */
2859         RET
2860 #endif /* __XSCALE__ */
2861
2862 #ifdef GPROF
2863
/*
 * Five entry points (user, btrap, etrap, bintr, eintr), assembled only when
 * GPROF is defined.  Each consists of a single nop and there is no return
 * instruction between them, so each one runs straight into the next.
 *
 * NOTE(review): the names suggest these are address markers for the
 * profiler (user mode, begin/end of trap handling, begin/end of interrupt
 * handling) rather than routines meant to be called -- confirm against the
 * profiling code that references them.  ENTRY() is defined outside this
 * part of the file.
 */
2864 ENTRY(user)
2865         nop
2866 ENTRY(btrap)
2867         nop
2868 ENTRY(etrap)
2869         nop
2870 ENTRY(bintr)
2871         nop
2872 ENTRY(eintr)
2873         nop
2874
2875 #endif