sys/arm/arm/support.S

   1 /*-
   2  * Copyright (c) 2004 Olivier Houchard
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26 /*
  27  * Copyright 2003 Wasabi Systems, Inc.
  28  * All rights reserved.
  29  *
  30  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *    notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *    notice, this list of conditions and the following disclaimer in the
  39  *    documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *    must display the following acknowledgement:
  42  *      This product includes software developed for the NetBSD Project by
  43  *      Wasabi Systems, Inc.
  44  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  45  *    or promote products derived from this software without specific prior
  46  *    written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  50  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  51  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  52  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  53  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  54  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  55  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  56  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  57  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  58  * POSSIBILITY OF SUCH DAMAGE.
  59  */
  60 /*
  61  * Copyright (c) 1997 The NetBSD Foundation, Inc.
  62  * All rights reserved.
  63  *
  64  * This code is derived from software contributed to The NetBSD Foundation
  65  * by Neil A. Carson and Mark Brinicombe
  66  *
  67  * Redistribution and use in source and binary forms, with or without
  68  * modification, are permitted provided that the following conditions
  69  * are met:
  70  * 1. Redistributions of source code must retain the above copyright
  71  *    notice, this list of conditions and the following disclaimer.
  72  * 2. Redistributions in binary form must reproduce the above copyright
  73  *    notice, this list of conditions and the following disclaimer in the
  74  *    documentation and/or other materials provided with the distribution.
  75  *
  76  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  77  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  78  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  79  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  80  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  81  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  82  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  83  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  84  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  85  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  86  * POSSIBILITY OF SUCH DAMAGE.
  87  */
  88
  89 #include <machine/asm.h>
  90 __FBSDID("$FreeBSD$");
  91
  92 #include "assym.inc"
  93
  94         .syntax unified
  95
  96 /*
  97  * memset: Sets a block of memory to the specified value
  98  *
  99  * On entry:
 100  *   r0 - dest address
 101  *   r1 - byte to write
 102  *   r2 - number of bytes to write
 103  *
 104  * On exit:
 105  *   r0 - dest address
 106  */
 107 /* LINTSTUB: Func: void bzero(void *, size_t) */
 108 ENTRY(bzero)
 109         mov     r3, #0x00
 110         b       do_memset
 111 END(bzero)
 112 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
 113 ENTRY(memset)
 114         and     r3, r1, #0xff           /* We deal with bytes */
 115         mov     r1, r2
 116 do_memset:
 117         cmp     r1, #0x04               /* Do we have less than 4 bytes */
 118         mov     ip, r0
 119         blt     .Lmemset_lessthanfour
 120
 121         /* Ok first we will word align the address */
 122         ands    r2, ip, #0x03           /* Get the bottom two bits */
 123         bne     .Lmemset_wordunaligned  /* The address is not word aligned */
 124
 125         /* We are now word aligned */
 126 .Lmemset_wordaligned:
 127         orr     r3, r3, r3, lsl #8      /* Extend value to 16-bits */
 128         tst     ip, #0x04               /* Quad-align for armv5e */
 129         orr     r3, r3, r3, lsl #16     /* Extend value to 32-bits */
 130         subne   r1, r1, #0x04           /* Quad-align if necessary */
 131         strne   r3, [ip], #0x04
 132         cmp     r1, #0x10
 133         blt     .Lmemset_loop4          /* If less than 16 then use words */
 134         mov     r2, r3                  /* Duplicate data */
 135         cmp     r1, #0x80               /* If < 128 then skip the big loop */
 136         blt     .Lmemset_loop32
 137
 138         /* Do 128 bytes at a time */
 139 .Lmemset_loop128:
 140         subs    r1, r1, #0x80
 141         strdge  r2, [ip], #0x08
 142         strdge  r2, [ip], #0x08
 143         strdge  r2, [ip], #0x08
 144         strdge  r2, [ip], #0x08
 145         strdge  r2, [ip], #0x08
 146         strdge  r2, [ip], #0x08
 147         strdge  r2, [ip], #0x08
 148         strdge  r2, [ip], #0x08
 149         strdge  r2, [ip], #0x08
 150         strdge  r2, [ip], #0x08
 151         strdge  r2, [ip], #0x08
 152         strdge  r2, [ip], #0x08
 153         strdge  r2, [ip], #0x08
 154         strdge  r2, [ip], #0x08
 155         strdge  r2, [ip], #0x08
 156         strdge  r2, [ip], #0x08
 157         bgt     .Lmemset_loop128
 158         RETeq                   /* Zero length so just exit */
 159
 160         add     r1, r1, #0x80           /* Adjust for extra sub */
 161
 162         /* Do 32 bytes at a time */
 163 .Lmemset_loop32:
 164         subs    r1, r1, #0x20
 165         strdge  r2, [ip], #0x08
 166         strdge  r2, [ip], #0x08
 167         strdge  r2, [ip], #0x08
 168         strdge  r2, [ip], #0x08
 169         bgt     .Lmemset_loop32
 170         RETeq                   /* Zero length so just exit */
 171
 172         adds    r1, r1, #0x10           /* Partially adjust for extra sub */
 173
 174         /* Deal with 16 bytes or more */
 175         strdge  r2, [ip], #0x08
 176         strdge  r2, [ip], #0x08
 177         RETeq                   /* Zero length so just exit */
 178
 179         addlt   r1, r1, #0x10           /* Possibly adjust for extra sub */
 180
 181         /* We have at least 4 bytes so copy as words */
 182 .Lmemset_loop4:
 183         subs    r1, r1, #0x04
 184         strge   r3, [ip], #0x04
 185         bgt     .Lmemset_loop4
 186         RETeq                   /* Zero length so just exit */
 187
 188         /* Compensate for 64-bit alignment check */
 189         adds    r1, r1, #0x04
 190         RETeq
 191         cmp     r1, #2
 192
 193         strb    r3, [ip], #0x01         /* Set 1 byte */
 194         strbge  r3, [ip], #0x01         /* Set another byte */
 195         strbgt  r3, [ip]                /* and a third */
 196         RET                     /* Exit */
 197
 198 .Lmemset_wordunaligned:
 199         rsb     r2, r2, #0x004
 200         strb    r3, [ip], #0x01         /* Set 1 byte */
 201         cmp     r2, #0x02
 202         strbge  r3, [ip], #0x01         /* Set another byte */
 203         sub     r1, r1, r2
 204         strbgt  r3, [ip], #0x01         /* and a third */
 205         cmp     r1, #0x04               /* More than 4 bytes left? */
 206         bge     .Lmemset_wordaligned    /* Yup */
 207
 208 .Lmemset_lessthanfour:
 209         cmp     r1, #0x00
 210         RETeq                   /* Zero length so exit */
 211         strb    r3, [ip], #0x01         /* Set 1 byte */
 212         cmp     r1, #0x02
 213         strbge  r3, [ip], #0x01         /* Set another byte */
 214         strbgt  r3, [ip]                /* and a third */
 215         RET                     /* Exit */
 216 EEND(memset)
 217 END(bzero)
  218
      /*
       * memcmp: compare two blocks of memory byte by byte
       *
       * On entry:
       *   r0 - first buffer (b1); copied to ip, which is the read cursor
       *   r1 - second buffer (b2)
       *   r2 - number of bytes to compare
       *
       * On exit:
       *   r0 - 0 when the length is 0, both addresses are identical or every
       *        byte matches; otherwise b1[i] - b2[i] for the first differing
       *        byte (bytes are loaded zero-extended with ldrb)
       */
  219 ENTRY(memcmp)
  220         mov     ip, r0                  /* r0 is reused for the result */
  221         cmp     r2, #0x06
  222         beq     .Lmemcmp_6bytes         /* Dedicated path for len == 6 */
  223         mov     r0, #0x00               /* Result for the early exits below */
  224
  225         /* Are both addresses aligned the same way? */
  226         cmp     r2, #0x00
  227         eorsne  r3, ip, r1              /* r3 = b1 ^ b2, skipped if len == 0 */
  228         RETeq                   /* len == 0, or same addresses! */
  229         tst     r3, #0x03               /* Do the low two bits differ? */
  230         subne   r2, r2, #0x01           /* Loop below counts from len - 1 */
  231         bne     .Lmemcmp_bytewise2      /* Badly aligned. Do it the slow way */
  232
  233         /* Word-align the addresses, if necessary */
      /*
       * r3 = ((r1 - 5) & 3) * 3, then pc += r3 * 8, i.e. 24 bytes per step.
       * Each compare stanza below is six instructions, so (assuming 4-byte
       * ARM encodings and pc reading two instructions ahead) a step skips
       * exactly one stanza; the nop is the slot that pc-relative add skips.
       * The add without "s" keeps the flags of the ands for the addne.
       */
  234         sub     r3, r1, #0x05
  235         ands    r3, r3, #0x03
  236         add     r3, r3, r3, lsl #1
  237         addne   pc, pc, r3, lsl #3
  238         nop
  239
  240         /* Compare up to 3 bytes */
  241         ldrb    r0, [ip], #0x01
  242         ldrb    r3, [r1], #0x01
  243         subs    r0, r0, r3
  244         RETne
  245         subs    r2, r2, #0x01
  246         RETeq
  247
  248         /* Compare up to 2 bytes */
  249         ldrb    r0, [ip], #0x01
  250         ldrb    r3, [r1], #0x01
  251         subs    r0, r0, r3
  252         RETne
  253         subs    r2, r2, #0x01
  254         RETeq
  255
  256         /* Compare 1 byte */
  257         ldrb    r0, [ip], #0x01
  258         ldrb    r3, [r1], #0x01
  259         subs    r0, r0, r3
  260         RETne
  261         subs    r2, r2, #0x01
  262         RETeq
  263
  264         /* Compare 4 bytes at a time, if possible */
  265         subs    r2, r2, #0x04
  266         bcc     .Lmemcmp_bytewise       /* Fewer than 4 bytes remain */
  267 .Lmemcmp_word_aligned:
  268         ldr     r0, [ip], #0x04
  269         ldr     r3, [r1], #0x04
  270         subs    r2, r2, #0x04
  271         cmpcs   r0, r3                  /* Only compare if no borrow above */
  272         beq     .Lmemcmp_word_aligned
  273         sub     r0, r0, r3              /* Zero only if the last words match */
  274
  275         /* Correct for extra subtraction, and check if done */
  276         adds    r2, r2, #0x04
  277         cmpeq   r0, #0x00               /* If done, did all bytes match? */
  278         RETeq                   /* Yup. Just return */
  279
  280         /* Re-do the final word byte-wise */
  281         sub     ip, ip, #0x04
  282         sub     r1, r1, #0x04
  283
  284 .Lmemcmp_bytewise:
  285         add     r2, r2, #0x03           /* Loop below counts from count - 1 */
  286 .Lmemcmp_bytewise2:
  287         ldrb    r0, [ip], #0x01
  288         ldrb    r3, [r1], #0x01
  289         subs    r2, r2, #0x01
  290         cmpcs   r0, r3                  /* Borrow (last byte) ends the loop */
  291         beq     .Lmemcmp_bytewise2
  292         sub     r0, r0, r3              /* Difference of the last pair read */
  293         RET
  294
  295         /*
  296          * 6 byte compares are very common, thanks to the network stack.
  297          * This code is hand-scheduled to reduce the number of stalls for
  298          * load results. Everything else being equal, this will be ~32%
  299          * faster than a byte-wise memcmp.
  300          */
  301         .align  5
  302 .Lmemcmp_6bytes:
  303         ldrb    r3, [r1, #0x00]         /* r3 = b2#0 */
  304         ldrb    r0, [ip, #0x00]         /* r0 = b1#0 */
  305         ldrb    r2, [r1, #0x01]         /* r2 = b2#1 */
  306         subs    r0, r0, r3              /* r0 = b1#0 - b2#0 */
  307         ldrbeq  r3, [ip, #0x01]         /* r3 = b1#1 */
  308         RETne                   /* Return if mismatch on #0 */
  309         subs    r0, r3, r2              /* r0 = b1#1 - b2#1 */
  310         ldrbeq  r3, [r1, #0x02]         /* r3 = b2#2 */
  311         ldrbeq  r0, [ip, #0x02]         /* r0 = b1#2 */
  312         RETne                   /* Return if mismatch on #1 */
  313         ldrb    r2, [r1, #0x03]         /* r2 = b2#3 */
  314         subs    r0, r0, r3              /* r0 = b1#2 - b2#2 */
  315         ldrbeq  r3, [ip, #0x03]         /* r3 = b1#3 */
  316         RETne                   /* Return if mismatch on #2 */
  317         subs    r0, r3, r2              /* r0 = b1#3 - b2#3 */
  318         ldrbeq  r3, [r1, #0x04]         /* r3 = b2#4 */
  319         ldrbeq  r0, [ip, #0x04]         /* r0 = b1#4 */
  320         RETne                   /* Return if mismatch on #3 */
  321         ldrb    r2, [r1, #0x05]         /* r2 = b2#5 */
  322         subs    r0, r0, r3              /* r0 = b1#4 - b2#4 */
  323         ldrbeq  r3, [ip, #0x05]         /* r3 = b1#5 */
  324         RETne                   /* Return if mismatch on #4 */
  325         sub     r0, r3, r2              /* r0 = b1#5 - b2#5 */
  326         RET
  327 END(memcmp)
  328
      /*
       * memmove: copy a block of memory
       *
       * On entry:
       *   r0 - dest address
       *   r1 - source address
       *   r2 - number of bytes to copy
       *
       * On exit:
       *   r0 - dest address
       *
       * Copies with ascending addresses when src is not below dest, and with
       * descending addresses (starting from the end of both buffers) when
       * src is below dest.
       */
  329 ENTRY(memmove)
  330         /* Compare the distance between the buffers with the length */
  331         cmp     r0, r1
  332         RETeq           /* Bail now if src/dst are the same */
  333         subcc   r3, r0, r1      /* dst < src (carry clear): r3 = dst - src, wraps */
  334         subcs   r3, r1, r0      /* dst > src (carry set): r3 = src - dst, wraps */
  335         cmp     r3, r2          /* unsigned compare of r3 with len */
  336         bcc     PIC_SYM(_C_LABEL(memcpy), PLT)
      /*
       * NOTE(review): with the condition codes as written, r3 is the negated
       * (wrapped) distance between the buffers rather than the distance, so
       * the branch to memcpy is taken only when 2^32 - |dst - src| < len.
       * This looks like cc/cs are swapped relative to a plain "no overlap,
       * use memcpy" test; confirm the intent before changing the code.
       */
  337
  338         /* Determine copy direction */
  339         cmp     r1, r0
  340         bcc     .Lmemmove_backwards     /* src below dst: copy descending */
  341
      /*
       * NOTE(review): the flags are still those of "cmp r1, r0" and the
       * src == dst case already returned at entry, so this moveq/RETeq pair
       * looks unreachable; it does not test the length in r2.
       */
  342         moveq   r0, #0
  343         RETeq
  344
  345         stmdb   sp!, {r0, lr}           /* memmove() returns dest addr */
  346         subs    r2, r2, #4              /* r2 is kept biased by -4 from here */
  347         blt     .Lmemmove_fl4           /* less than 4 bytes */
  348         ands    r12, r0, #3
  349         bne     .Lmemmove_fdestul       /* oh unaligned destination addr */
  350         ands    r12, r1, #3
  351         bne     .Lmemmove_fsrcul                /* oh unaligned source addr */
  352
  353 .Lmemmove_ft8:
  354         /* We have aligned source and destination */
  355         subs    r2, r2, #8
  356         blt     .Lmemmove_fl12          /* less than 12 bytes (4 from above) */
  357         subs    r2, r2, #0x14
  358         blt     .Lmemmove_fl32          /* less than 32 bytes (12 from above) */
  359         stmdb   sp!, {r4}               /* borrow r4 */
  360
  361         /* blat 32 bytes at a time */
  362         /* XXX for really big copies perhaps we should use more registers */
  363 .Lmemmove_floop32:
  364         ldmia   r1!, {r3, r4, r12, lr}
  365         stmia   r0!, {r3, r4, r12, lr}
  366         ldmia   r1!, {r3, r4, r12, lr}
  367         stmia   r0!, {r3, r4, r12, lr}
  368         subs    r2, r2, #0x20
  369         bge     .Lmemmove_floop32
  370
  371         cmn     r2, #0x10               /* ge if at least 16 bytes remain */
  372         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
  373         stmiage r0!, {r3, r4, r12, lr}
  374         subge   r2, r2, #0x10
  375         ldmia   sp!, {r4}               /* return r4 */
  376
  377 .Lmemmove_fl32:
  378         adds    r2, r2, #0x14           /* undo the 0x14 bias; flags gate the loop */
  379
  380         /* blat 12 bytes at a time */
  381 .Lmemmove_floop12:
  382         ldmiage r1!, {r3, r12, lr}
  383         stmiage r0!, {r3, r12, lr}
  384         subsge  r2, r2, #0x0c
  385         bge     .Lmemmove_floop12
  386
  387 .Lmemmove_fl12:
  388         adds    r2, r2, #8              /* undo the 8 bias */
  389         blt     .Lmemmove_fl4
  390
  391         subs    r2, r2, #4
  392         ldrlt   r3, [r1], #4            /* lt: exactly one word left */
  393         strlt   r3, [r0], #4
  394         ldmiage r1!, {r3, r12}          /* ge: two words left */
  395         stmiage r0!, {r3, r12}
  396         subge   r2, r2, #4
  397
  398 .Lmemmove_fl4:
  399         /* less than 4 bytes to go */
  400         adds    r2, r2, #4              /* undo the 4 bias: r2 = bytes left */
  401         ldmiaeq sp!, {r0, pc}           /* done */
  402
  403         /* copy the crud byte at a time */
  404         cmp     r2, #2                  /* ge: 2 or more left, gt: 3 left */
  405         ldrb    r3, [r1], #1
  406         strb    r3, [r0], #1
  407         ldrbge  r3, [r1], #1
  408         strbge  r3, [r0], #1
  409         ldrbgt  r3, [r1], #1
  410         strbgt  r3, [r0], #1
  411         ldmia   sp!, {r0, pc}           /* restore dest addr and return */
  412
  413         /* erg - unaligned destination */
  414 .Lmemmove_fdestul:
  415         rsb     r12, r12, #4            /* r12 = bytes up to the next word */
  416         cmp     r12, #2
  417
  418         /* align destination with byte copies */
  419         ldrb    r3, [r1], #1
  420         strb    r3, [r0], #1
  421         ldrbge  r3, [r1], #1
  422         strbge  r3, [r0], #1
  423         ldrbgt  r3, [r1], #1
  424         strbgt  r3, [r0], #1
  425         subs    r2, r2, r12
  426         blt     .Lmemmove_fl4           /* less than 4 bytes */
  427
  428         ands    r12, r1, #3
  429         beq     .Lmemmove_ft8           /* we have an aligned source */
  430
  431         /* erg - unaligned source */
  432         /* This is where it gets nasty ... */
      /*
       * dest is word aligned, src is not: r12 = src & 3.  src is rounded
       * down to a word boundary and each output word is assembled from two
       * neighbouring source words with a pair of shifts; lr carries the
       * previously loaded word between iterations.  The shift directions
       * presumably assume little-endian byte order - TODO confirm.
       */
  433 .Lmemmove_fsrcul:
  434         bic     r1, r1, #3
  435         ldr     lr, [r1], #4
  436         cmp     r12, #2
  437         bgt     .Lmemmove_fsrcul3
  438         beq     .Lmemmove_fsrcul2
  439         cmp     r2, #0x0c
  440         blt     .Lmemmove_fsrcul1loop4
  441         sub     r2, r2, #0x0c
  442         stmdb   sp!, {r4, r5}
  443
  444 .Lmemmove_fsrcul1loop16:
  445         mov     r3, lr, lsr #8
  446         ldmia   r1!, {r4, r5, r12, lr}
  447         orr     r3, r3, r4, lsl #24
  448         mov     r4, r4, lsr #8
  449         orr     r4, r4, r5, lsl #24
  450         mov     r5, r5, lsr #8
  451         orr     r5, r5, r12, lsl #24
  452         mov     r12, r12, lsr #8
  453         orr     r12, r12, lr, lsl #24
  454         stmia   r0!, {r3-r5, r12}
  455         subs    r2, r2, #0x10
  456         bge     .Lmemmove_fsrcul1loop16
  457         ldmia   sp!, {r4, r5}
  458         adds    r2, r2, #0x0c
  459         blt     .Lmemmove_fsrcul1l4
  460
  461 .Lmemmove_fsrcul1loop4:
  462         mov     r12, lr, lsr #8
  463         ldr     lr, [r1], #4
  464         orr     r12, r12, lr, lsl #24
  465         str     r12, [r0], #4
  466         subs    r2, r2, #4
  467         bge     .Lmemmove_fsrcul1loop4
  468
  469 .Lmemmove_fsrcul1l4:
  470         sub     r1, r1, #3              /* back to the unaligned src position */
  471         b       .Lmemmove_fl4
  472
  473 .Lmemmove_fsrcul2:
  474         cmp     r2, #0x0c
  475         blt     .Lmemmove_fsrcul2loop4
  476         sub     r2, r2, #0x0c
  477         stmdb   sp!, {r4, r5}
  478
  479 .Lmemmove_fsrcul2loop16:
  480         mov     r3, lr, lsr #16
  481         ldmia   r1!, {r4, r5, r12, lr}
  482         orr     r3, r3, r4, lsl #16
  483         mov     r4, r4, lsr #16
  484         orr     r4, r4, r5, lsl #16
  485         mov     r5, r5, lsr #16
  486         orr     r5, r5, r12, lsl #16
  487         mov     r12, r12, lsr #16
  488         orr     r12, r12, lr, lsl #16
  489         stmia   r0!, {r3-r5, r12}
  490         subs    r2, r2, #0x10
  491         bge     .Lmemmove_fsrcul2loop16
  492         ldmia   sp!, {r4, r5}
  493         adds    r2, r2, #0x0c
  494         blt     .Lmemmove_fsrcul2l4
  495
  496 .Lmemmove_fsrcul2loop4:
  497         mov     r12, lr, lsr #16
  498         ldr     lr, [r1], #4
  499         orr     r12, r12, lr, lsl #16
  500         str     r12, [r0], #4
  501         subs    r2, r2, #4
  502         bge     .Lmemmove_fsrcul2loop4
  503
  504 .Lmemmove_fsrcul2l4:
  505         sub     r1, r1, #2              /* back to the unaligned src position */
  506         b       .Lmemmove_fl4
  507
  508 .Lmemmove_fsrcul3:
  509         cmp     r2, #0x0c
  510         blt     .Lmemmove_fsrcul3loop4
  511         sub     r2, r2, #0x0c
  512         stmdb   sp!, {r4, r5}
  513
  514 .Lmemmove_fsrcul3loop16:
  515         mov     r3, lr, lsr #24
  516         ldmia   r1!, {r4, r5, r12, lr}
  517         orr     r3, r3, r4, lsl #8
  518         mov     r4, r4, lsr #24
  519         orr     r4, r4, r5, lsl #8
  520         mov     r5, r5, lsr #24
  521         orr     r5, r5, r12, lsl #8
  522         mov     r12, r12, lsr #24
  523         orr     r12, r12, lr, lsl #8
  524         stmia   r0!, {r3-r5, r12}
  525         subs    r2, r2, #0x10
  526         bge     .Lmemmove_fsrcul3loop16
  527         ldmia   sp!, {r4, r5}
  528         adds    r2, r2, #0x0c
  529         blt     .Lmemmove_fsrcul3l4
  530
  531 .Lmemmove_fsrcul3loop4:
  532         mov     r12, lr, lsr #24
  533         ldr     lr, [r1], #4
  534         orr     r12, r12, lr, lsl #8
  535         str     r12, [r0], #4
  536         subs    r2, r2, #4
  537         bge     .Lmemmove_fsrcul3loop4
  538
  539 .Lmemmove_fsrcul3l4:
  540         sub     r1, r1, #1              /* back to the unaligned src position */
  541         b       .Lmemmove_fl4
  542
      /*
       * Descending copy: both pointers are first moved to one past the end
       * of their buffers and every access pre-decrements.  r0 is not saved
       * on the stack here; after len bytes have been stored it is back at
       * the original dest address.
       */
  543 .Lmemmove_backwards:
  544         add     r1, r1, r2
  545         add     r0, r0, r2
  546         subs    r2, r2, #4              /* r2 is kept biased by -4 from here */
  547         blt     .Lmemmove_bl4           /* less than 4 bytes */
  548         ands    r12, r0, #3
  549         bne     .Lmemmove_bdestul       /* oh unaligned destination addr */
  550         ands    r12, r1, #3
  551         bne     .Lmemmove_bsrcul                /* oh unaligned source addr */
  552
  553 .Lmemmove_bt8:
  554         /* We have aligned source and destination */
  555         subs    r2, r2, #8
  556         blt     .Lmemmove_bl12          /* less than 12 bytes (4 from above) */
  557         stmdb   sp!, {r4, lr}           /* r4 and lr are used as copy registers */
  558         subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
  559         blt     .Lmemmove_bl32
  560
  561         /* blat 32 bytes at a time */
  562         /* XXX for really big copies perhaps we should use more registers */
  563 .Lmemmove_bloop32:
  564         ldmdb   r1!, {r3, r4, r12, lr}
  565         stmdb   r0!, {r3, r4, r12, lr}
  566         ldmdb   r1!, {r3, r4, r12, lr}
  567         stmdb   r0!, {r3, r4, r12, lr}
  568         subs    r2, r2, #0x20
  569         bge     .Lmemmove_bloop32
  570
  571 .Lmemmove_bl32:
  572         cmn     r2, #0x10               /* ge if at least 16 bytes remain */
  573         ldmdbge r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
  574         stmdbge r0!, {r3, r4, r12, lr}
  575         subge   r2, r2, #0x10
  576         adds    r2, r2, #0x14           /* undo the 0x14 bias */
  577         ldmdbge r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
  578         stmdbge r0!, {r3, r12, lr}
  579         subge   r2, r2, #0x0c
  580         ldmia   sp!, {r4, lr}           /* restore r4 and the return address */
  581
  582 .Lmemmove_bl12:
  583         adds    r2, r2, #8              /* undo the 8 bias */
  584         blt     .Lmemmove_bl4
  585         subs    r2, r2, #4
  586         ldrlt   r3, [r1, #-4]!          /* lt: exactly one word left */
  587         strlt   r3, [r0, #-4]!
  588         ldmdbge r1!, {r3, r12}          /* ge: two words left */
  589         stmdbge r0!, {r3, r12}
  590         subge   r2, r2, #4
  591
  592 .Lmemmove_bl4:
  593         /* less than 4 bytes to go */
  594         adds    r2, r2, #4              /* undo the 4 bias: r2 = bytes left */
  595         RETeq                   /* done */
  596
  597         /* copy the crud byte at a time */
  598         cmp     r2, #2                  /* ge: 2 or more left, gt: 3 left */
  599         ldrb    r3, [r1, #-1]!
  600         strb    r3, [r0, #-1]!
  601         ldrbge  r3, [r1, #-1]!
  602         strbge  r3, [r0, #-1]!
  603         ldrbgt  r3, [r1, #-1]!
  604         strbgt  r3, [r0, #-1]!
  605         RET
  606
  607         /* erg - unaligned destination */
  608 .Lmemmove_bdestul:
  609         cmp     r12, #2                 /* r12 = dest & 3 = bytes down to a word */
  610
  611         /* align destination with byte copies */
  612         ldrb    r3, [r1, #-1]!
  613         strb    r3, [r0, #-1]!
  614         ldrbge  r3, [r1, #-1]!
  615         strbge  r3, [r0, #-1]!
  616         ldrbgt  r3, [r1, #-1]!
  617         strbgt  r3, [r0, #-1]!
  618         subs    r2, r2, r12
  619         blt     .Lmemmove_bl4           /* less than 4 bytes to go */
  620         ands    r12, r1, #3
  621         beq     .Lmemmove_bt8           /* we have an aligned source */
  622
  623         /* erg - unaligned source */
  624         /* This is where it gets nasty ... */
      /*
       * Mirror of the ascending unaligned-source case: r12 = src & 3, src is
       * rounded down to a word boundary and r3 carries the previously loaded
       * word between iterations.  The shift directions presumably assume
       * little-endian byte order - TODO confirm.
       */
  625 .Lmemmove_bsrcul:
  626         bic     r1, r1, #3
  627         ldr     r3, [r1, #0]
  628         cmp     r12, #2
  629         blt     .Lmemmove_bsrcul1
  630         beq     .Lmemmove_bsrcul2
  631         cmp     r2, #0x0c
  632         blt     .Lmemmove_bsrcul3loop4
  633         sub     r2, r2, #0x0c
  634         stmdb   sp!, {r4, r5, lr}
  635
  636 .Lmemmove_bsrcul3loop16:
  637         mov     lr, r3, lsl #8
  638         ldmdb   r1!, {r3-r5, r12}
  639         orr     lr, lr, r12, lsr #24
  640         mov     r12, r12, lsl #8
  641         orr     r12, r12, r5, lsr #24
  642         mov     r5, r5, lsl #8
  643         orr     r5, r5, r4, lsr #24
  644         mov     r4, r4, lsl #8
  645         orr     r4, r4, r3, lsr #24
  646         stmdb   r0!, {r4, r5, r12, lr}
  647         subs    r2, r2, #0x10
  648         bge     .Lmemmove_bsrcul3loop16
  649         ldmia   sp!, {r4, r5, lr}
  650         adds    r2, r2, #0x0c
  651         blt     .Lmemmove_bsrcul3l4
  652
  653 .Lmemmove_bsrcul3loop4:
  654         mov     r12, r3, lsl #8
  655         ldr     r3, [r1, #-4]!
  656         orr     r12, r12, r3, lsr #24
  657         str     r12, [r0, #-4]!
  658         subs    r2, r2, #4
  659         bge     .Lmemmove_bsrcul3loop4
  660
  661 .Lmemmove_bsrcul3l4:
  662         add     r1, r1, #3              /* back to the unaligned src position */
  663         b       .Lmemmove_bl4
  664
  665 .Lmemmove_bsrcul2:
  666         cmp     r2, #0x0c
  667         blt     .Lmemmove_bsrcul2loop4
  668         sub     r2, r2, #0x0c
  669         stmdb   sp!, {r4, r5, lr}
  670
  671 .Lmemmove_bsrcul2loop16:
  672         mov     lr, r3, lsl #16
  673         ldmdb   r1!, {r3-r5, r12}
  674         orr     lr, lr, r12, lsr #16
  675         mov     r12, r12, lsl #16
  676         orr     r12, r12, r5, lsr #16
  677         mov     r5, r5, lsl #16
  678         orr     r5, r5, r4, lsr #16
  679         mov     r4, r4, lsl #16
  680         orr     r4, r4, r3, lsr #16
  681         stmdb   r0!, {r4, r5, r12, lr}
  682         subs    r2, r2, #0x10
  683         bge     .Lmemmove_bsrcul2loop16
  684         ldmia   sp!, {r4, r5, lr}
  685         adds    r2, r2, #0x0c
  686         blt     .Lmemmove_bsrcul2l4
  687
  688 .Lmemmove_bsrcul2loop4:
  689         mov     r12, r3, lsl #16
  690         ldr     r3, [r1, #-4]!
  691         orr     r12, r12, r3, lsr #16
  692         str     r12, [r0, #-4]!
  693         subs    r2, r2, #4
  694         bge     .Lmemmove_bsrcul2loop4
  695
  696 .Lmemmove_bsrcul2l4:
  697         add     r1, r1, #2              /* back to the unaligned src position */
  698         b       .Lmemmove_bl4
  699
  700 .Lmemmove_bsrcul1:
  701         cmp     r2, #0x0c
  702         blt     .Lmemmove_bsrcul1loop4
  703         sub     r2, r2, #0x0c
  704         stmdb   sp!, {r4, r5, lr}
  705
      /* Despite the label name, each pass of this loop moves 16 bytes */
  706 .Lmemmove_bsrcul1loop32:
  707         mov     lr, r3, lsl #24
  708         ldmdb   r1!, {r3-r5, r12}
  709         orr     lr, lr, r12, lsr #8
  710         mov     r12, r12, lsl #24
  711         orr     r12, r12, r5, lsr #8
  712         mov     r5, r5, lsl #24
  713         orr     r5, r5, r4, lsr #8
  714         mov     r4, r4, lsl #24
  715         orr     r4, r4, r3, lsr #8
  716         stmdb   r0!, {r4, r5, r12, lr}
  717         subs    r2, r2, #0x10
  718         bge     .Lmemmove_bsrcul1loop32
  719         ldmia   sp!, {r4, r5, lr}
  720         adds    r2, r2, #0x0c
  721         blt     .Lmemmove_bsrcul1l4
  722
  723 .Lmemmove_bsrcul1loop4:
  724         mov     r12, r3, lsl #24
  725         ldr     r3, [r1, #-4]!
  726         orr     r12, r12, r3, lsr #8
  727         str     r12, [r0, #-4]!
  728         subs    r2, r2, #4
  729         bge     .Lmemmove_bsrcul1loop4
  730
  731 .Lmemmove_bsrcul1l4:
  732         add     r1, r1, #1              /* back to the unaligned src position */
  733         b       .Lmemmove_bl4
  734 END(memmove)
 735
 736 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
 737 ENTRY(memcpy)
 738         pld     [r1]
 739         cmp     r2, #0x0c
 740         ble     .Lmemcpy_short          /* <= 12 bytes */
 741 #ifdef FLASHADDR
 742 #if FLASHADDR > PHYSADDR
 743         ldr     r3, =FLASHADDR
 744         cmp     r3, pc
 745         bls     .Lnormal
 746 #else
 747         ldr     r3, =FLASHADDR
 748         cmp     r3, pc
 749         bhi     .Lnormal
 750 #endif
 751 #endif
 752         mov     r3, r0                  /* We must not clobber r0 */
 753
 754         /* Word-align the destination buffer */
 755         ands    ip, r3, #0x03           /* Already word aligned? */
 756         beq     .Lmemcpy_wordaligned    /* Yup */
 757         cmp     ip, #0x02
 758         ldrb    ip, [r1], #0x01
 759         sub     r2, r2, #0x01
 760         strb    ip, [r3], #0x01
 761         ldrble  ip, [r1], #0x01
 762         suble   r2, r2, #0x01
 763         strble  ip, [r3], #0x01
 764         ldrblt  ip, [r1], #0x01
 765         sublt   r2, r2, #0x01
 766         strblt  ip, [r3], #0x01
 767
 768         /* Destination buffer is now word aligned */
 769 .Lmemcpy_wordaligned:
 770         ands    ip, r1, #0x03           /* Is src also word-aligned? */
 771         bne     .Lmemcpy_bad_align      /* Nope. Things just got bad */
 772
 773         /* Quad-align the destination buffer */
             /*
              * Registers as used from here on: r1 = source pointer, r3 =
              * destination pointer, r2 = bytes still to copy, ip = scratch.
              * When r3 is not a multiple of 8, one word is copied up front.
              * NOTE(review): that only leaves r3 8-byte aligned if r3 is
              * already word aligned on arrival; this is set up before the
              * code shown here - confirm.  stmfd does not modify the flags,
              * so the "ne" result of the tst still governs the subne/strne
              * placed after it.
              */
 774         tst     r3, #0x07               /* Already quad aligned? */
 775         ldrne   ip, [r1], #0x04         /* No: fetch one word ... */
 776         stmfd   sp!, {r4-r9}            /* Free up some registers */
 777         subne   r2, r2, #0x04           /* ... account for it ... */
 778         strne   ip, [r3], #0x04         /* ... and store it */
 779
 780         /* Destination buffer quad aligned, source is at least word aligned */
 781         subs    r2, r2, #0x80           /* Bias r2 by -128 for the loop test */
 782         blt     .Lmemcpy_w_lessthan128
 783
 784         /* Copy 128 bytes at a time */
             /*
              * Per pass: 32 word loads from [r1] and 16 strd stores to [r3],
              * all post-incremented.  r4/r5, r6/r7 and r8/r9 serve as three
              * rotating register pairs so the loads run ahead of the stores
              * (strd rN stores rN and rN+1).  The LD:/ST: tags are the byte
              * offsets inside the current 128-byte chunk.  r2 stays biased
              * by -0x80, so "bge" repeats while another full chunk remains.
              */
 785 .Lmemcpy_w_loop128:
 786         ldr     r4, [r1], #0x04         /* LD:00-03 */
 787         ldr     r5, [r1], #0x04         /* LD:04-07 */
 788         pld     [r1, #0x18]             /* Prefetch 0x20 */
 789         ldr     r6, [r1], #0x04         /* LD:08-0b */
 790         ldr     r7, [r1], #0x04         /* LD:0c-0f */
 791         ldr     r8, [r1], #0x04         /* LD:10-13 */
 792         ldr     r9, [r1], #0x04         /* LD:14-17 */
 793         strd    r4, [r3], #0x08         /* ST:00-07 */
 794         ldr     r4, [r1], #0x04         /* LD:18-1b */
 795         ldr     r5, [r1], #0x04         /* LD:1c-1f */
 796         strd    r6, [r3], #0x08         /* ST:08-0f */
 797         ldr     r6, [r1], #0x04         /* LD:20-23 */
 798         ldr     r7, [r1], #0x04         /* LD:24-27 */
 799         pld     [r1, #0x18]             /* Prefetch 0x40 */
 800         strd    r8, [r3], #0x08         /* ST:10-17 */
 801         ldr     r8, [r1], #0x04         /* LD:28-2b */
 802         ldr     r9, [r1], #0x04         /* LD:2c-2f */
 803         strd    r4, [r3], #0x08         /* ST:18-1f */
 804         ldr     r4, [r1], #0x04         /* LD:30-33 */
 805         ldr     r5, [r1], #0x04         /* LD:34-37 */
 806         strd    r6, [r3], #0x08         /* ST:20-27 */
 807         ldr     r6, [r1], #0x04         /* LD:38-3b */
 808         ldr     r7, [r1], #0x04         /* LD:3c-3f */
 809         strd    r8, [r3], #0x08         /* ST:28-2f */
 810         ldr     r8, [r1], #0x04         /* LD:40-43 */
 811         ldr     r9, [r1], #0x04         /* LD:44-47 */
 812         pld     [r1, #0x18]             /* Prefetch 0x60 */
 813         strd    r4, [r3], #0x08         /* ST:30-37 */
 814         ldr     r4, [r1], #0x04         /* LD:48-4b */
 815         ldr     r5, [r1], #0x04         /* LD:4c-4f */
 816         strd    r6, [r3], #0x08         /* ST:38-3f */
 817         ldr     r6, [r1], #0x04         /* LD:50-53 */
 818         ldr     r7, [r1], #0x04         /* LD:54-57 */
 819         strd    r8, [r3], #0x08         /* ST:40-47 */
 820         ldr     r8, [r1], #0x04         /* LD:58-5b */
 821         ldr     r9, [r1], #0x04         /* LD:5c-5f */
 822         strd    r4, [r3], #0x08         /* ST:48-4f */
 823         ldr     r4, [r1], #0x04         /* LD:60-63 */
 824         ldr     r5, [r1], #0x04         /* LD:64-67 */
 825         pld     [r1, #0x18]             /* Prefetch 0x80 */
 826         strd    r6, [r3], #0x08         /* ST:50-57 */
 827         ldr     r6, [r1], #0x04         /* LD:68-6b */
 828         ldr     r7, [r1], #0x04         /* LD:6c-6f */
 829         strd    r8, [r3], #0x08         /* ST:58-5f */
 830         ldr     r8, [r1], #0x04         /* LD:70-73 */
 831         ldr     r9, [r1], #0x04         /* LD:74-77 */
 832         strd    r4, [r3], #0x08         /* ST:60-67 */
 833         ldr     r4, [r1], #0x04         /* LD:78-7b */
 834         ldr     r5, [r1], #0x04         /* LD:7c-7f */
 835         strd    r6, [r3], #0x08         /* ST:68-6f */
 836         strd    r8, [r3], #0x08         /* ST:70-77 */
 837         subs    r2, r2, #0x80           /* Another full 128 bytes left? */
 838         strd    r4, [r3], #0x08         /* ST:78-7f */
 839         bge     .Lmemcpy_w_loop128      /* Tests the subs; strd keeps flags */
 840
 841 .Lmemcpy_w_lessthan128:
             /*
              * r2 arrives here biased by -0x80.  Undo the bias; if nothing is
              * left, restore r4-r9 and return.  Otherwise bias r2 by -0x20
              * and copy 32-byte chunks: 8 word loads from [r1] and 4 strd
              * stores to [r3] per pass, with the loads kept ahead of the
              * stores.
              */
 842         adds    r2, r2, #0x80           /* Adjust for extra sub */
 843         ldmfdeq sp!, {r4-r9}
 844         RETeq                   /* Return now if done */
 845         subs    r2, r2, #0x20
 846         blt     .Lmemcpy_w_lessthan32
 847
 848         /* Copy 32 bytes at a time */
 849 .Lmemcpy_w_loop32:
 850         ldr     r4, [r1], #0x04
 851         ldr     r5, [r1], #0x04
 852         pld     [r1, #0x18]
 853         ldr     r6, [r1], #0x04
 854         ldr     r7, [r1], #0x04
 855         ldr     r8, [r1], #0x04
 856         ldr     r9, [r1], #0x04
 857         strd    r4, [r3], #0x08         /* Bytes 00-07 of this chunk */
 858         ldr     r4, [r1], #0x04
 859         ldr     r5, [r1], #0x04
 860         strd    r6, [r3], #0x08         /* Bytes 08-0f */
 861         strd    r8, [r3], #0x08         /* Bytes 10-17 */
 862         subs    r2, r2, #0x20           /* Another full 32 bytes left? */
 863         strd    r4, [r3], #0x08         /* Bytes 18-1f */
 864         bge     .Lmemcpy_w_loop32       /* Tests the subs; strd keeps flags */
 865
 866 .Lmemcpy_w_lessthan32:
 867         adds    r2, r2, #0x20           /* Adjust for extra sub */
 868         ldmfdeq sp!, {r4-r9}
 869         RETeq                   /* Return now if done */
 870
             /*
              * 1..31 bytes remain.  Jump into the ladder of 8-byte copies
              * below so that exactly (r2 & 0x18) / 8 of them execute.
              * r4 = 0x18 - (r2 & 0x18) is the number of data bytes to skip;
              * each rung copies 8 bytes and is four instructions (16 bytes)
              * long, hence the "lsl #1".  In ARM state pc reads as the
              * address of the addne plus 8, which is the first rung; the
              * nop only fills the slot in between.  When r4 is 0 the addne
              * is not executed and all three rungs run.  Adding or removing
              * an instruction in any rung breaks this arithmetic.
              */
 871         and     r4, r2, #0x18
 872         rsbs    r4, r4, #0x18
 873         addne   pc, pc, r4, lsl #1
 874         nop
 875
 876         /* At least 24 bytes remaining */
 877         ldr     r4, [r1], #0x04
 878         ldr     r5, [r1], #0x04
 879         sub     r2, r2, #0x08
 880         strd    r4, [r3], #0x08
 881
 882         /* At least 16 bytes remaining */
 883         ldr     r4, [r1], #0x04
 884         ldr     r5, [r1], #0x04
 885         sub     r2, r2, #0x08
 886         strd    r4, [r3], #0x08
 887
 888         /* At least 8 bytes remaining */
 889         ldr     r4, [r1], #0x04
 890         ldr     r5, [r1], #0x04
 891         subs    r2, r2, #0x08           /* Sets Z when nothing is left */
 892         strd    r4, [r3], #0x08
 893
 894         /* Less than 8 bytes remaining */
             /*
              * The first RETeq below tests either the subs of the last rung
              * (Z set: the copy is complete) or, when the whole ladder was
              * skipped, the "ne" left by the rsbs above.  After that: copy
              * one more word if at least 4 bytes remain, return if that was
              * everything, otherwise finish with 1..3 single-byte copies.
              */
 895         ldmfd   sp!, {r4-r9}
 896         RETeq                   /* Return now if done */
 897         subs    r2, r2, #0x04
 898         ldrge   ip, [r1], #0x04
 899         strge   ip, [r3], #0x04
 900         RETeq                   /* Return now if done */
 901         addlt   r2, r2, #0x04           /* No word copied: undo the subs */
 902         ldrb    ip, [r1], #0x01         /* Byte 1 (always present here) */
 903         cmp     r2, #0x02
 904         ldrbge  r2, [r1], #0x01         /* Byte 2; the count in r2 is spent */
 905         strb    ip, [r3], #0x01
 906         ldrbgt  ip, [r1]                /* Byte 3 */
 907         strbge  r2, [r3], #0x01
 908         strbgt  ip, [r3]
 909         RET
 910 /* Place a literal pool here for the above ldr instructions to use */
 911 .ltorg
 912
 913
 914 /*
 915  * At this point, it has not been possible to word align both buffers.
 916  * The destination buffer is word aligned, but the source buffer is not.
 917  */
     /*
      * r4-r7 are saved for use by the merge loops.  r1 is rounded down to a
      * word boundary and the first aligned source word is preloaded into ip.
      * The value ip held on arrival selects the path: greater than 2 goes to
      * .Lmemcpy_bad3, equal to 2 to .Lmemcpy_bad2, anything else to
      * .Lmemcpy_bad1.  The ldr in between does not alter the flags set by
      * the cmp.
      * NOTE(review): ip presumably holds the source pointer's offset within
      * its word (1..3) on arrival; it is set before the code shown here.
      */
 918 .Lmemcpy_bad_align:
 919         stmfd   sp!, {r4-r7}
 920         bic     r1, r1, #0x03
 921         cmp     ip, #2
 922         ldr     ip, [r1], #0x04
 923         bgt     .Lmemcpy_bad3
 924         beq     .Lmemcpy_bad2
 925         b       .Lmemcpy_bad1
 926
 927 .Lmemcpy_bad1_loop16:
             /*
              * ip holds the most recently loaded aligned source word.  Each
              * output word is the upper 24 bits of one source word (lsr #8)
              * merged with the low 8 bits of the next one (lsl #24); four
              * output words are produced per pass and ip ends up holding
              * the last word loaded.  Execution enters at .Lmemcpy_bad1
              * below, not at this label.
              */
 928         mov     r4, ip, lsr #8
 929         ldr     r5, [r1], #0x04
 930         pld     [r1, #0x018]
 931         ldr     r6, [r1], #0x04
 932         ldr     r7, [r1], #0x04
 933         ldr     ip, [r1], #0x04
 934         orr     r4, r4, r5, lsl #24
 935         mov     r5, r5, lsr #8
 936         orr     r5, r5, r6, lsl #24
 937         mov     r6, r6, lsr #8
 938         orr     r6, r6, r7, lsl #24
 939         mov     r7, r7, lsr #8
 940         orr     r7, r7, ip, lsl #24
 941         str     r4, [r3], #0x04
 942         str     r5, [r3], #0x04
 943         str     r6, [r3], #0x04
 944         str     r7, [r3], #0x04
 945 .Lmemcpy_bad1:
 946         subs    r2, r2, #0x10           /* At least 16 bytes left? */
 947         bge     .Lmemcpy_bad1_loop16
 948
 949         adds    r2, r2, #0x10           /* Undo the extra sub */
 950         ldmfdeq sp!, {r4-r7}
 951         RETeq                   /* Return now if done */
 952         subs    r2, r2, #0x04           /* At least one more word? */
             /*
              * r1 is 4 past the word held in ip.  Stepping back by 3 leaves
              * r1 at the first source byte not yet stored (the upper three
              * bytes of ip in little-endian order), ready for the byte tail.
              */
 953         sublt   r1, r1, #0x03
 954         blt     .Lmemcpy_bad_done
 955
 956 .Lmemcpy_bad1_loop4:
 957         mov     r4, ip, lsr #8
 958         ldr     ip, [r1], #0x04
 959         subs    r2, r2, #0x04
 960         orr     r4, r4, ip, lsl #24
 961         str     r4, [r3], #0x04
 962         bge     .Lmemcpy_bad1_loop4     /* orr/str leave the flags alone */
 963         sub     r1, r1, #0x03           /* Same fix-up as the sublt above */
 964         b       .Lmemcpy_bad_done
 965
 966 .Lmemcpy_bad2_loop16:
             /*
              * ip holds the most recently loaded aligned source word.  Each
              * output word is the upper 16 bits of one source word (lsr #16)
              * merged with the low 16 bits of the next one (lsl #16); four
              * output words are produced per pass and ip ends up holding
              * the last word loaded.  Execution enters at .Lmemcpy_bad2
              * below, not at this label.
              */
 967         mov     r4, ip, lsr #16
 968         ldr     r5, [r1], #0x04
 969         pld     [r1, #0x018]
 970         ldr     r6, [r1], #0x04
 971         ldr     r7, [r1], #0x04
 972         ldr     ip, [r1], #0x04
 973         orr     r4, r4, r5, lsl #16
 974         mov     r5, r5, lsr #16
 975         orr     r5, r5, r6, lsl #16
 976         mov     r6, r6, lsr #16
 977         orr     r6, r6, r7, lsl #16
 978         mov     r7, r7, lsr #16
 979         orr     r7, r7, ip, lsl #16
 980         str     r4, [r3], #0x04
 981         str     r5, [r3], #0x04
 982         str     r6, [r3], #0x04
 983         str     r7, [r3], #0x04
 984 .Lmemcpy_bad2:
 985         subs    r2, r2, #0x10           /* At least 16 bytes left? */
 986         bge     .Lmemcpy_bad2_loop16
 987
 988         adds    r2, r2, #0x10           /* Undo the extra sub */
 989         ldmfdeq sp!, {r4-r7}
 990         RETeq                   /* Return now if done */
 991         subs    r2, r2, #0x04           /* At least one more word? */
             /*
              * r1 is 4 past the word held in ip.  Stepping back by 2 leaves
              * r1 at the first source byte not yet stored (the upper two
              * bytes of ip in little-endian order), ready for the byte tail.
              */
 992         sublt   r1, r1, #0x02
 993         blt     .Lmemcpy_bad_done
 994
 995 .Lmemcpy_bad2_loop4:
 996         mov     r4, ip, lsr #16
 997         ldr     ip, [r1], #0x04
 998         subs    r2, r2, #0x04
 999         orr     r4, r4, ip, lsl #16
1000         str     r4, [r3], #0x04
1001         bge     .Lmemcpy_bad2_loop4     /* orr/str leave the flags alone */
1002         sub     r1, r1, #0x02           /* Same fix-up as the sublt above */
1003         b       .Lmemcpy_bad_done
1004
1005 .Lmemcpy_bad3_loop16:
             /*
              * ip holds the most recently loaded aligned source word.  Each
              * output word is the upper 8 bits of one source word (lsr #24)
              * merged with the low 24 bits of the next one (lsl #8); four
              * output words are produced per pass and ip ends up holding
              * the last word loaded.  Execution enters at .Lmemcpy_bad3
              * below, not at this label.
              */
1006         mov     r4, ip, lsr #24
1007         ldr     r5, [r1], #0x04
1008         pld     [r1, #0x018]
1009         ldr     r6, [r1], #0x04
1010         ldr     r7, [r1], #0x04
1011         ldr     ip, [r1], #0x04
1012         orr     r4, r4, r5, lsl #8
1013         mov     r5, r5, lsr #24
1014         orr     r5, r5, r6, lsl #8
1015         mov     r6, r6, lsr #24
1016         orr     r6, r6, r7, lsl #8
1017         mov     r7, r7, lsr #24
1018         orr     r7, r7, ip, lsl #8
1019         str     r4, [r3], #0x04
1020         str     r5, [r3], #0x04
1021         str     r6, [r3], #0x04
1022         str     r7, [r3], #0x04
1023 .Lmemcpy_bad3:
1024         subs    r2, r2, #0x10           /* At least 16 bytes left? */
1025         bge     .Lmemcpy_bad3_loop16
1026
1027         adds    r2, r2, #0x10           /* Undo the extra sub */
1028         ldmfdeq sp!, {r4-r7}
1029         RETeq                   /* Return now if done */
1030         subs    r2, r2, #0x04           /* At least one more word? */
             /*
              * r1 is 4 past the word held in ip.  Stepping back by 1 leaves
              * r1 at the first source byte not yet stored (the top byte of
              * ip in little-endian order), ready for the byte tail.
              */
1031         sublt   r1, r1, #0x01
1032         blt     .Lmemcpy_bad_done
1033
1034 .Lmemcpy_bad3_loop4:
1035         mov     r4, ip, lsr #24
1036         ldr     ip, [r1], #0x04
1037         subs    r2, r2, #0x04
1038         orr     r4, r4, ip, lsl #8
1039         str     r4, [r3], #0x04
1040         bge     .Lmemcpy_bad3_loop4     /* orr/str leave the flags alone */
1041         sub     r1, r1, #0x01           /* Then fall into .Lmemcpy_bad_done */
1042
1043 .Lmemcpy_bad_done:
             /*
              * Common tail of the three misaligned-source paths.  r2 arrives
              * biased by -4 (every path gets here after a "subs r2, r2, #4"
              * that went negative) and r1 has been adjusted to point at the
              * next unread source byte.  Restore r4-r7, undo the bias, and
              * copy the remaining 0..3 bytes one at a time.
              */
1044         ldmfd   sp!, {r4-r7}
1045         adds    r2, r2, #0x04
1046         RETeq
1047         ldrb    ip, [r1], #0x01         /* Byte 1 (always present here) */
1048         cmp     r2, #0x02
1049         ldrbge  r2, [r1], #0x01         /* Byte 2; the count in r2 is spent */
1050         strb    ip, [r3], #0x01
1051         ldrbgt  ip, [r1]                /* Byte 3 */
1052         strbge  r2, [r3], #0x01
1053         strbgt  ip, [r3]
1054         RET
1055
1056
1057 /*
1058  * Handle short copies (less than 16 bytes), possibly misaligned.
1059  * Some of these are *very* common, thanks to the network stack,
1060  * and so are handled specially.
1061  */
     /*
      * Jump table indexed by the length in r2.  In ARM state pc reads as the
      * address of the add plus 8, which is the RET for length 0, so adding
      * r2 * 4 selects table entry r2; the nop only fills the slot in between.
      * Lengths 4, 6, 8 and 0x0c have dedicated routines, every other length
      * is copied byte by byte.
      * NOTE(review): the table has entries for lengths 0x00..0x0c only,
      * although the comment above says "less than 16 bytes".  The check that
      * sends control here is outside the code shown - confirm that it limits
      * r2 to 0..0x0c.
      */
1062 .Lmemcpy_short:
1063         add     pc, pc, r2, lsl #2
1064         nop
1065         RET                     /* 0x00 */
1066         b       .Lmemcpy_bytewise       /* 0x01 */
1067         b       .Lmemcpy_bytewise       /* 0x02 */
1068         b       .Lmemcpy_bytewise       /* 0x03 */
1069         b       .Lmemcpy_4              /* 0x04 */
1070         b       .Lmemcpy_bytewise       /* 0x05 */
1071         b       .Lmemcpy_6              /* 0x06 */
1072         b       .Lmemcpy_bytewise       /* 0x07 */
1073         b       .Lmemcpy_8              /* 0x08 */
1074         b       .Lmemcpy_bytewise       /* 0x09 */
1075         b       .Lmemcpy_bytewise       /* 0x0a */
1076         b       .Lmemcpy_bytewise       /* 0x0b */
1077         b       .Lmemcpy_c              /* 0x0c */
1078 .Lmemcpy_bytewise:
             /*
              * Copy r2 bytes from [r1] to [r0] one byte at a time, using r3
              * as the moving destination pointer.  The first byte is loaded
              * before the count is tested, so r2 must be at least 1 on entry
              * (the jump table above only branches here for non-zero
              * lengths).  Inside the loop the next byte is loaded only while
              * bytes remain.
              */
1079         mov     r3, r0                  /* We must not clobber r0 */
1080         ldrb    ip, [r1], #0x01
1081 1:      subs    r2, r2, #0x01
1082         strb    ip, [r3], #0x01
1083         ldrbne  ip, [r1], #0x01
1084         bne     1b
1085         RET
1086
1087 /******************************************************************************
1088  * Special case for 4 byte copies
1089  */
     /*
      * Dispatch on the low two bits of both pointers:
      *     r2 = ((r0 & 3) << 2) | (r1 & 3)
      * so the four-bit tag on each case below reads "ddss", with dd = dst & 3
      * and ss = src & 3.  Every case is followed by LMEMCPY_4_PAD, which pads
      * it to the slot size (64 bytes according to the comment on the
      * #define), so case N starts at .Lmemcpy_4 + (N << LMEMCPY_4_LOG2).
      * The sub is the fourth instruction (offset 0x0c) and pc reads as its
      * own address plus 8, hence pc - 0x14 is the address of .Lmemcpy_4.
      * Case 0000 shares the first slot with this dispatch code and is reached
      * by falling through when r2 is zero.  No case may outgrow its slot.
      */
1090 #define LMEMCPY_4_LOG2  6       /* 64 bytes */
1091 #define LMEMCPY_4_PAD   .align LMEMCPY_4_LOG2
1092         LMEMCPY_4_PAD
1093 .Lmemcpy_4:
1094         and     r2, r1, #0x03           /* r2 = src & 3 */
1095         orr     r2, r2, r0, lsl #2      /* Merge in dst << 2 */
1096         ands    r2, r2, #0x0f           /* Keep the 4-bit case index */
1097         sub     r3, pc, #0x14           /* r3 = address of .Lmemcpy_4 */
1098         addne   pc, r3, r2, lsl #LMEMCPY_4_LOG2
1099
1100 /*
1101  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1102  */
1103         ldr     r2, [r1]
1104         str     r2, [r0]
1105         RET
1106         LMEMCPY_4_PAD
1107
1108 /*
1109  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1110  */
             /*
              * src & 3 == 1, so r1 - 1 and r1 + 3 are both word aligned.
              * The two loads also fetch one byte before and three bytes
              * after the four source bytes (the "x" positions); those are
              * shifted out before the single word store.
              */
1111         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1112         ldr     r2, [r1, #3]            /* BE:r2 = 3xxx  LE:r2 = xxx3 */
1113         mov     r3, r3, lsr #8          /* r3 = .210 */
1114         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1115         str     r3, [r0]
1116         RET
1117         LMEMCPY_4_PAD
1118
1119 /*
1120  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1121  */
1122         ldrh    r3, [r1, #0x02]         /* r3 = ..32 */
1123         ldrh    r2, [r1]                /* r2 = ..10 */
1124         orr     r3, r2, r3, lsl #16     /* r3 = 3210 */
1125         str     r3, [r0]
1126         RET
1127         LMEMCPY_4_PAD
1128
1129 /*
1130  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1131  */
             /*
              * src & 3 == 3, so r1 - 3 and r1 + 1 are both word aligned.
              * Only the top byte of the first word and the low three bytes
              * of the second belong to the source.
              */
1132         ldr     r3, [r1, #-3]           /* BE:r3 = xxx0  LE:r3 = 0xxx */
1133         ldr     r2, [r1, #1]            /* BE:r2 = 123x  LE:r2 = x321 */
1134         mov     r3, r3, lsr #24         /* r3 = ...0 */
1135         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1136         str     r3, [r0]
1137         RET
1138         LMEMCPY_4_PAD
1139
1140 /*
1141  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1142  */
             /*
              * In the four cases of this group dst & 3 == 1: r0 + 1 is
              * halfword aligned while r0 and r0 + 3 are only byte aligned,
              * so the four bytes are written as byte, halfword, byte.
              */
1143         ldr     r2, [r1]                /* r2 = 3210 */
1144         strb    r2, [r0]
1145         mov     r3, r2, lsr #8          /* r3 = .321 */
1146         mov     r1, r2, lsr #24         /* r1 = ...3 */
1147         strb    r1, [r0, #0x03]
1148         strh    r3, [r0, #0x01]         /* Stores the low halfword: 21 */
1149         RET
1150         LMEMCPY_4_PAD
1151
1152 /*
1153  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1154  */
             /* src & 3 == 1: r1 + 1 is halfword aligned, so ldrh is safe */
1155         ldrb    r2, [r1]
1156         ldrh    r3, [r1, #0x01]
1157         ldrb    r1, [r1, #0x03]
1158         strb    r2, [r0]
1159         strh    r3, [r0, #0x01]
1160         strb    r1, [r0, #0x03]
1161         RET
1162         LMEMCPY_4_PAD
1163
1164 /*
1165  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1166  */
1167         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1168         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1169         strb    r2, [r0]
1170         mov     r2, r2, lsr #8          /* r2 = ...1 */
1171         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1172         mov     r3, r3, lsr #8          /* r3 = ...3 */
1173         strh    r2, [r0, #0x01]
1174         strb    r3, [r0, #0x03]
1175         RET
1176         LMEMCPY_4_PAD
1177
1178 /*
1179  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1180  */
             /* src & 3 == 3: r1 + 1 is word aligned, so ldrh is safe */
1181         ldrb    r2, [r1]
1182         ldrh    r3, [r1, #0x01]
1183         ldrb    r1, [r1, #0x03]
1184         strb    r2, [r0]
1185         strh    r3, [r0, #0x01]
1186         strb    r1, [r0, #0x03]
1187         RET
1188         LMEMCPY_4_PAD
1189
1190 /*
1191  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1192  */
             /*
              * In the four cases of this group dst & 3 == 2: r0 and r0 + 2
              * are halfword aligned, so the four bytes are written as two
              * halfword stores.
              */
1193         ldr     r2, [r1]                /* r2 = 3210 */
1194         strh    r2, [r0]                /* Stores the low halfword: 10 */
1195         mov     r3, r2, lsr #16         /* r3 = ..32 */
1196         strh    r3, [r0, #0x02]
1197         RET
1198         LMEMCPY_4_PAD
1199
1200 /*
1201  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1202  */
             /* src & 3 == 1: r1 - 1 and r1 + 3 are word aligned */
1203         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1204         ldr     r3, [r1, #3]            /* BE:r3 = 3xxx  LE:r3 = xxx3 */
1205         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1206         strh    r1, [r0]
1207         mov     r2, r2, lsr #24         /* r2 = ...2 */
1208         orr     r2, r2, r3, lsl #8      /* r2 = xx32 */
1209         strh    r2, [r0, #0x02]
1210         RET
1211         LMEMCPY_4_PAD
1212
1213 /*
1214  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1215  */
1216         ldrh    r2, [r1]
1217         ldrh    r3, [r1, #0x02]
1218         strh    r2, [r0]
1219         strh    r3, [r0, #0x02]
1220         RET
1221         LMEMCPY_4_PAD
1222
1223 /*
1224  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1225  */
             /* src & 3 == 3: r1 + 1 and r1 - 3 are word aligned */
1226         ldr     r3, [r1, #1]            /* BE:r3 = 123x  LE:r3 = x321 */
1227         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1228         mov     r1, r3, lsr #8          /* BE:r1 = .123  LE:r1 = .x32 */
1229         strh    r1, [r0, #0x02]
1230         mov     r3, r3, lsl #8          /* r3 = 321. */
1231         orr     r3, r3, r2, lsr #24     /* r3 = 3210 */
1232         strh    r3, [r0]
1233         RET
1234         LMEMCPY_4_PAD
1235
1236 /*
1237  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1238  */
             /*
              * In the four cases of this group dst & 3 == 3: r0 + 1 is word
              * aligned while r0 and r0 + 3 are only byte aligned, so the
              * four bytes are written as byte, halfword, byte.
              */
1239         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1240         strb    r2, [r0]
1241         mov     r3, r2, lsr #8          /* r3 = .321 */
1242         mov     r1, r2, lsr #24         /* r1 = ...3 */
1243         strh    r3, [r0, #0x01]         /* Stores the low halfword: 21 */
1244         strb    r1, [r0, #0x03]
1245         RET
1246         LMEMCPY_4_PAD
1247
1248 /*
1249  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1250  */
             /* src & 3 == 1: r1 + 1 is halfword aligned, so ldrh is safe */
1251         ldrb    r2, [r1]
1252         ldrh    r3, [r1, #0x01]
1253         ldrb    r1, [r1, #0x03]
1254         strb    r2, [r0]
1255         strh    r3, [r0, #0x01]
1256         strb    r1, [r0, #0x03]
1257         RET
1258         LMEMCPY_4_PAD
1259
1260 /*
1261  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1262  */
1263         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1264         ldrh    r3, [r1, #0x02]         /* BE:r3 = ..23  LE:r3 = ..32 */
1265         strb    r2, [r0]
1266         mov     r2, r2, lsr #8          /* r2 = ...1 */
1267         orr     r2, r2, r3, lsl #8      /* r2 = .321 */
1268         strh    r2, [r0, #0x01]
1269         mov     r3, r3, lsr #8          /* r3 = ...3 */
1270         strb    r3, [r0, #0x03]
1271         RET
1272         LMEMCPY_4_PAD
1273
1274 /*
1275  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1276  */
             /* src & 3 == 3: r1 + 1 is word aligned, so ldrh is safe */
1277         ldrb    r2, [r1]
1278         ldrh    r3, [r1, #0x01]
1279         ldrb    r1, [r1, #0x03]
1280         strb    r2, [r0]
1281         strh    r3, [r0, #0x01]
1282         strb    r1, [r0, #0x03]
1283         RET
1284         LMEMCPY_4_PAD
1285
1286
1287 /******************************************************************************
1288  * Special case for 6 byte copies
1289  */
     /*
      * Dispatch on the low two bits of both pointers:
      *     r2 = ((r0 & 3) << 2) | (r1 & 3)
      * so the four-bit tag on each case below reads "ddss", with dd = dst & 3
      * and ss = src & 3.  Every case is followed by LMEMCPY_6_PAD, which pads
      * it to the slot size (64 bytes according to the comment on the
      * #define), so case N starts at .Lmemcpy_6 + (N << LMEMCPY_6_LOG2).
      * The sub is the fourth instruction (offset 0x0c) and pc reads as its
      * own address plus 8, hence pc - 0x14 is the address of .Lmemcpy_6.
      * Case 0000 shares the first slot with this dispatch code and is reached
      * by falling through when r2 is zero.  No case may outgrow its slot.
      */
1290 #define LMEMCPY_6_LOG2  6       /* 64 bytes */
1291 #define LMEMCPY_6_PAD   .align LMEMCPY_6_LOG2
1292         LMEMCPY_6_PAD
1293 .Lmemcpy_6:
1294         and     r2, r1, #0x03           /* r2 = src & 3 */
1295         orr     r2, r2, r0, lsl #2      /* Merge in dst << 2 */
1296         ands    r2, r2, #0x0f           /* Keep the 4-bit case index */
1297         sub     r3, pc, #0x14           /* r3 = address of .Lmemcpy_6 */
1298         addne   pc, r3, r2, lsl #LMEMCPY_6_LOG2
1299
1300 /*
1301  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1302  */
1303         ldr     r2, [r1]
1304         ldrh    r3, [r1, #0x04]
1305         str     r2, [r0]
1306         strh    r3, [r0, #0x04]
1307         RET
1308         LMEMCPY_6_PAD
1309
1310 /*
1311  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1312  */
             /*
              * In the three cases of this group the destination is word
              * aligned, so the six bytes are written as one word store at
              * r0 plus one halfword store at r0 + 4.
              * Here src & 3 == 1, so r1 - 1 and r1 + 3 are word aligned;
              * the "x" bytes the loads pick up outside the source range
              * are shifted out or fall outside the halfword that is stored.
              */
1313         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1314         ldr     r3, [r1, #0x03]         /* BE:r3 = 345x  LE:r3 = x543 */
1315         mov     r2, r2, lsr #8          /* r2 = .210 */
1316         orr     r2, r2, r3, lsl #24     /* r2 = 3210 */
1317         mov     r3, r3, lsr #8          /* BE:r3 = .345  LE:r3 = .x54 */
1318         str     r2, [r0]
1319         strh    r3, [r0, #0x04]
1320         RET
1321         LMEMCPY_6_PAD
1322
1323 /*
1324  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1325  */
             /* src & 3 == 2: r1 + 2 is word aligned */
1326         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1327         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1328         mov     r1, r3, lsr #16         /* r1 = ..54 */
1329         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1330         str     r2, [r0]
1331         strh    r1, [r0, #0x04]
1332         RET
1333         LMEMCPY_6_PAD
1334
1335 /*
1336  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1337  */
             /* src & 3 == 3: r1 - 3, r1 + 1 and r1 + 5 are word aligned */
1338         ldr     r2, [r1, #-3]           /* BE:r2 = xxx0  LE:r2 = 0xxx */
1339         ldr     r3, [r1, #1]            /* BE:r3 = 1234  LE:r3 = 4321 */
1340         ldr     r1, [r1, #5]            /* BE:r1 = 5xxx  LE:r1 = xxx5 */
1341         mov     r2, r2, lsr #24         /* r2 = ...0 */
1342         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1343         mov     r1, r1, lsl #8          /* r1 = xx5. */
1344         orr     r1, r1, r3, lsr #24     /* r1 = xx54 */
1345         str     r2, [r0]
1346         strh    r1, [r0, #0x04]
1347         RET
1348         LMEMCPY_6_PAD
1349
1350 /*
1351  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1352  */
             /*
              * In the four cases of this group dst & 3 == 1: r0 + 1 is
              * halfword aligned and r0 + 3 is word aligned, so the six
              * bytes are written as a byte at r0, halfwords at r0 + 1 and
              * r0 + 3, and a byte at r0 + 5.
              */
1353         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1354         ldrh    r2, [r1, #0x04]         /* BE:r2 = ..45  LE:r2 = ..54 */
1355         mov     r1, r3, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1356         strh    r1, [r0, #0x01]
1357         strb    r3, [r0]
1358         mov     r3, r3, lsr #24         /* r3 = ...3 */
1359         orr     r3, r3, r2, lsl #8      /* r3 = .543 */
1360         mov     r2, r2, lsr #8          /* r2 = ...5 */
1361         strh    r3, [r0, #0x03]
1362         strb    r2, [r0, #0x05]
1363         RET
1364         LMEMCPY_6_PAD
1365
1366 /*
1367  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1368  */
             /* src & 3 == 1: r1 + 1 and r1 + 3 are halfword aligned */
1369         ldrb    r2, [r1]
1370         ldrh    r3, [r1, #0x01]
1371         ldrh    ip, [r1, #0x03]
1372         ldrb    r1, [r1, #0x05]
1373         strb    r2, [r0]
1374         strh    r3, [r0, #0x01]
1375         strh    ip, [r0, #0x03]
1376         strb    r1, [r0, #0x05]
1377         RET
1378         LMEMCPY_6_PAD
1379
1380 /*
1381  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1382  */
             /* src & 3 == 2: r1 + 2 is word aligned */
1383         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1384         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1385         strb    r2, [r0]
1386         mov     r3, r1, lsr #24         /* r3 = ...5 */
1387         strb    r3, [r0, #0x05]
1388         mov     r3, r1, lsr #8          /* r3 = .543 */
1389         strh    r3, [r0, #0x03]
1390         mov     r3, r2, lsr #8          /* r3 = ...1 */
1391         orr     r3, r3, r1, lsl #8      /* r3 = 4321 */
1392         strh    r3, [r0, #0x01]
1393         RET
1394         LMEMCPY_6_PAD
1395
1396 /*
1397  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1398  */
             /* src & 3 == 3: r1 + 1 and r1 + 3 are halfword aligned */
1399         ldrb    r2, [r1]
1400         ldrh    r3, [r1, #0x01]
1401         ldrh    ip, [r1, #0x03]
1402         ldrb    r1, [r1, #0x05]
1403         strb    r2, [r0]
1404         strh    r3, [r0, #0x01]
1405         strh    ip, [r0, #0x03]
1406         strb    r1, [r0, #0x05]
1407         RET
1408         LMEMCPY_6_PAD
1409
1410 /*
1411  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1412  */
             /*
              * In the four cases of this group dst & 3 == 2: r0 is halfword
              * aligned and r0 + 2 is word aligned, so the six bytes are
              * written as a halfword at r0 plus a word at r0 + 2.
              */
1413         ldrh    r2, [r1, #0x04]         /* r2 = ..54 */
1414         ldr     r3, [r1]                /* r3 = 3210 */
1415         mov     r2, r2, lsl #16         /* r2 = 54.. */
1416         orr     r2, r2, r3, lsr #16     /* r2 = 5432 */
1417         strh    r3, [r0]
1418         str     r2, [r0, #0x02]
1419         RET
1420         LMEMCPY_6_PAD
1421
1422 /*
1423  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1424  */
             /* src & 3 == 1: r1 - 1 and r1 + 3 are word aligned */
1425         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1426         ldr     r2, [r1, #3]            /* BE:r2 = 345x  LE:r2 = x543 */
1427         mov     r1, r3, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1428         mov     r2, r2, lsl #8          /* r2 = 543. */
1429         orr     r2, r2, r3, lsr #24     /* r2 = 5432 */
1430         strh    r1, [r0]
1431         str     r2, [r0, #0x02]
1432         RET
1433         LMEMCPY_6_PAD
1434
1435 /*
1436  * 1010: dst is 16-bit aligned, src is 16-bit aligned
1437  */
             /* src & 3 == 2: r1 + 2 is word aligned */
1438         ldrh    r2, [r1]
1439         ldr     r3, [r1, #0x02]
1440         strh    r2, [r0]
1441         str     r3, [r0, #0x02]
1442         RET
1443         LMEMCPY_6_PAD
1444
1445 /*
1446  * 1011: dst is 16-bit aligned, src is 8-bit aligned
1447  */
             /* src & 3 == 3: r1 + 1 is word aligned */
1448         ldrb    r3, [r1]                /* r3 = ...0 */
1449         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
1450         ldrb    r1, [r1, #0x05]         /* r1 = ...5 */
1451         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1452         mov     r1, r1, lsl #24         /* r1 = 5... */
1453         orr     r1, r1, r2, lsr #8      /* r1 = 5432 */
1454         strh    r3, [r0]
1455         str     r1, [r0, #0x02]
1456         RET
1457         LMEMCPY_6_PAD
1458
1459 /*
1460  * 1100: dst is 8-bit aligned, src is 32-bit aligned
1461  */
             /*
              * In this group dst & 3 == 3, so r0 + 1 is word aligned and
              * r0 + 3 is halfword aligned.  Cases 1100, 1110 and 1111 write
              * the six bytes as a byte at r0, a word at r0 + 1 and a byte
              * at r0 + 5; case 1101 uses two halfword stores in place of
              * the word store.
              */
1462         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1463         ldrh    r1, [r1, #0x04]         /* BE:r1 = ..45  LE:r1 = ..54 */
1464         strb    r2, [r0]
1465         mov     r2, r2, lsr #8          /* r2 = .321 */
1466         orr     r2, r2, r1, lsl #24     /* r2 = 4321 */
1467         mov     r1, r1, lsr #8          /* r1 = ...5 */
1468         str     r2, [r0, #0x01]
1469         strb    r1, [r0, #0x05]
1470         RET
1471         LMEMCPY_6_PAD
1472
1473 /*
1474  * 1101: dst is 8-bit aligned, src is 8-bit aligned
1475  */
             /* src & 3 == 1: r1 + 1 and r1 + 3 are halfword aligned */
1476         ldrb    r2, [r1]
1477         ldrh    r3, [r1, #0x01]
1478         ldrh    ip, [r1, #0x03]
1479         ldrb    r1, [r1, #0x05]
1480         strb    r2, [r0]
1481         strh    r3, [r0, #0x01]
1482         strh    ip, [r0, #0x03]
1483         strb    r1, [r0, #0x05]
1484         RET
1485         LMEMCPY_6_PAD
1486
1487 /*
1488  * 1110: dst is 8-bit aligned, src is 16-bit aligned
1489  */
             /* src & 3 == 2: r1 + 2 is word aligned */
1490         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1491         ldr     r1, [r1, #0x02]         /* BE:r1 = 2345  LE:r1 = 5432 */
1492         strb    r2, [r0]
1493         mov     r2, r2, lsr #8          /* r2 = ...1 */
1494         orr     r2, r2, r1, lsl #8      /* r2 = 4321 */
1495         mov     r1, r1, lsr #24         /* r1 = ...5 */
1496         str     r2, [r0, #0x01]
1497         strb    r1, [r0, #0x05]
1498         RET
1499         LMEMCPY_6_PAD
1500
1501 /*
1502  * 1111: dst is 8-bit aligned, src is 8-bit aligned
1503  */
             /* src & 3 == 3: r1 + 1 is word aligned, as is r0 + 1 */
1504         ldrb    r2, [r1]
1505         ldr     r3, [r1, #0x01]
1506         ldrb    r1, [r1, #0x05]
1507         strb    r2, [r0]
1508         str     r3, [r0, #0x01]
1509         strb    r1, [r0, #0x05]
1510         RET
1511         LMEMCPY_6_PAD
1512
1513
1514 /******************************************************************************
1515  * Special case for 8 byte copies
1516  */
1517 #define LMEMCPY_8_LOG2  6       /* 64 bytes */
1518 #define LMEMCPY_8_PAD   .align LMEMCPY_8_LOG2
1519         LMEMCPY_8_PAD
1520 .Lmemcpy_8:
1521         and     r2, r1, #0x03
1522         orr     r2, r2, r0, lsl #2
1523         ands    r2, r2, #0x0f
1524         sub     r3, pc, #0x14
1525         addne   pc, r3, r2, lsl #LMEMCPY_8_LOG2
1526
1527 /*
1528  * 0000: dst is 32-bit aligned, src is 32-bit aligned
1529  */
1530         ldr     r2, [r1]
1531         ldr     r3, [r1, #0x04]
1532         str     r2, [r0]
1533         str     r3, [r0, #0x04]
1534         RET
1535         LMEMCPY_8_PAD
1536
1537 /*
1538  * 0001: dst is 32-bit aligned, src is 8-bit aligned
1539  */
1540         ldr     r3, [r1, #-1]           /* BE:r3 = x012  LE:r3 = 210x */
1541         ldr     r2, [r1, #0x03]         /* BE:r2 = 3456  LE:r2 = 6543 */
1542         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
1543         mov     r3, r3, lsr #8          /* r3 = .210 */
1544         orr     r3, r3, r2, lsl #24     /* r3 = 3210 */
1545         mov     r1, r1, lsl #24         /* r1 = 7... */
1546         orr     r2, r1, r2, lsr #8      /* r2 = 7654 */
1547         str     r3, [r0]
1548         str     r2, [r0, #0x04]
1549         RET
1550         LMEMCPY_8_PAD
1551
1552 /*
1553  * 0010: dst is 32-bit aligned, src is 16-bit aligned
1554  */
1555         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1556         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1557         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
1558         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1559         mov     r3, r3, lsr #16         /* r3 = ..54 */
1560         orr     r3, r3, r1, lsl #16     /* r3 = 7654 */
1561         str     r2, [r0]
1562         str     r3, [r0, #0x04]
1563         RET
1564         LMEMCPY_8_PAD
1565
1566 /*
1567  * 0011: dst is 32-bit aligned, src is 8-bit aligned
1568  */
1569         ldrb    r3, [r1]                /* r3 = ...0 */
1570         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
1571         ldr     r1, [r1, #0x05]         /* BE:r1 = 567x  LE:r1 = x765 */
1572         orr     r3, r3, r2, lsl #8      /* r3 = 3210 */
1573         mov     r2, r2, lsr #24         /* r2 = ...4 */
1574         orr     r2, r2, r1, lsl #8      /* r2 = 7654 */
1575         str     r3, [r0]
1576         str     r2, [r0, #0x04]
1577         RET
1578         LMEMCPY_8_PAD
1579
1580 /*
1581  * 0100: dst is 8-bit aligned, src is 32-bit aligned
1582  */
1583         ldr     r3, [r1]                /* BE:r3 = 0123  LE:r3 = 3210 */
1584         ldr     r2, [r1, #0x04]         /* BE:r2 = 4567  LE:r2 = 7654 */
1585         strb    r3, [r0]
1586         mov     r1, r2, lsr #24         /* r1 = ...7 */
1587         strb    r1, [r0, #0x07]
1588         mov     r1, r3, lsr #8          /* r1 = .321 */
1589         mov     r3, r3, lsr #24         /* r3 = ...3 */
1590         orr     r3, r3, r2, lsl #8      /* r3 = 6543 */
1591         strh    r1, [r0, #0x01]
1592         str     r3, [r0, #0x03]
1593         RET
1594         LMEMCPY_8_PAD
1595
1596 /*
1597  * 0101: dst is 8-bit aligned, src is 8-bit aligned
1598  */
1599         ldrb    r2, [r1]
1600         ldrh    r3, [r1, #0x01]
1601         ldr     ip, [r1, #0x03]
1602         ldrb    r1, [r1, #0x07]
1603         strb    r2, [r0]
1604         strh    r3, [r0, #0x01]
1605         str     ip, [r0, #0x03]
1606         strb    r1, [r0, #0x07]
1607         RET
1608         LMEMCPY_8_PAD
1609
1610 /*
1611  * 0110: dst is 8-bit aligned, src is 16-bit aligned
1612  */
1613         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1614         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1615         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
1616         strb    r2, [r0]                /* 0 */
1617         mov     ip, r1, lsr #8          /* ip = ...7 */
1618         strb    ip, [r0, #0x07]         /* 7 */
1619         mov     ip, r2, lsr #8          /* ip = ...1 */
1620         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
1621         mov     r3, r3, lsr #8          /* r3 = .543 */
1622         orr     r3, r3, r1, lsl #24     /* r3 = 6543 */
1623         strh    ip, [r0, #0x01]
1624         str     r3, [r0, #0x03]
1625         RET
1626         LMEMCPY_8_PAD
1627
1628 /*
1629  * 0111: dst is 8-bit aligned, src is 8-bit aligned
1630  */
1631         ldrb    r3, [r1]                /* r3 = ...0 */
1632         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
1633         ldrh    r2, [r1, #0x05]         /* BE:r2 = ..56  LE:r2 = ..65 */
1634         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
1635         strb    r3, [r0]
1636         mov     r3, ip, lsr #16         /* BE:r3 = ..12  LE:r3 = ..43 */
1637         strh    ip, [r0, #0x01]
1638         orr     r2, r3, r2, lsl #16     /* r2 = 6543 */
1639         str     r2, [r0, #0x03]
1640         strb    r1, [r0, #0x07]
1641         RET
1642         LMEMCPY_8_PAD
1643
1644 /*
1645  * 1000: dst is 16-bit aligned, src is 32-bit aligned
1646  */
1647         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1648         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
1649         mov     r1, r2, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
1650         strh    r2, [r0]
1651         orr     r2, r1, r3, lsl #16     /* r2 = 5432 */
1652         mov     r3, r3, lsr #16         /* r3 = ..76 */
1653         str     r2, [r0, #0x02]
1654         strh    r3, [r0, #0x06]
1655         RET
1656         LMEMCPY_8_PAD
1657
1658 /*
1659  * 1001: dst is 16-bit aligned, src is 8-bit aligned
1660  */
1661         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x */
1662         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
1663         ldrb    ip, [r1, #0x07]         /* ip = ...7 */
1664         mov     r1, r2, lsr #8          /* BE:r1 = .x01  LE:r1 = .210 */
1665         strh    r1, [r0]
1666         mov     r1, r2, lsr #24         /* r1 = ...2 */
1667         orr     r1, r1, r3, lsl #8      /* r1 = 5432 */
1668         mov     r3, r3, lsr #24         /* r3 = ...6 */
1669         orr     r3, r3, ip, lsl #8      /* r3 = ..76 */
1670         str     r1, [r0, #0x02]
1671         strh    r3, [r0, #0x06]
1672         RET
1673         LMEMCPY_8_PAD
1674
1675 /*
1676  * 1010: dst is 16-bit aligned (byte 2), src is 16-bit aligned (byte 2); no shifting needed
1677  */
1678         ldrh    r2, [r1]                /* r2 = src[0..1] */
1679         ldr     ip, [r1, #0x02]         /* ip = src[2..5] */
1680         ldrh    r3, [r1, #0x06]         /* r3 = src[6..7] */
1681         strh    r2, [r0]                /* dst[0..1] */
1682         str     ip, [r0, #0x02]         /* dst[2..5] */
1683         strh    r3, [r0, #0x06]         /* dst[6..7] */
1684         RET
1685         LMEMCPY_8_PAD
1686
1687 /*
1688  * 1011: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 3)
1689  */
1690         ldr     r3, [r1, #0x05]         /* BE:r3 = 567x  LE:r3 = x765 (x = byte after src[7]) */
1691         ldr     r2, [r1, #0x01]         /* BE:r2 = 1234  LE:r2 = 4321 */
1692         ldrb    ip, [r1]                /* ip = ...0 */
1693         mov     r1, r3, lsr #8          /* BE:r1 = .567  LE:r1 = .x76 */
1694         strh    r1, [r0, #0x06]         /* LE: dst[6..7] = 76 (low half of r1) */
1695         mov     r3, r3, lsl #24         /* r3 = 5... */
1696         orr     r3, r3, r2, lsr #8      /* r3 = 5432 */
1697         orr     r2, ip, r2, lsl #8      /* r2 = 3210 */
1698         str     r3, [r0, #0x02]         /* dst[2..5] = 5432 */
1699         strh    r2, [r0]                /* dst[0..1] = 10 (low half of r2) */
1700         RET
1701         LMEMCPY_8_PAD
1702
1703 /*
1704  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1705  */
1706         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
1707         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1708         mov     r1, r3, lsr #8          /* BE:r1 = .456  LE:r1 = .765 */
1709         strh    r1, [r0, #0x05]         /* LE: dst[5..6] = 65 (low half of r1) */
1710         strb    r2, [r0]                /* LE: dst[0] = byte 0 (low byte of r2) */
1711         mov     r1, r3, lsr #24         /* r1 = ...7 */
1712         strb    r1, [r0, #0x07]         /* dst[7] = byte 7 */
1713         mov     r2, r2, lsr #8          /* r2 = .321 */
1714         orr     r2, r2, r3, lsl #24     /* r2 = 4321 */
1715         str     r2, [r0, #0x01]         /* dst[1..4] = 4321 */
1716         RET
1717         LMEMCPY_8_PAD
1718
1719 /*
1720  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
1721  */
1722         ldrb    r3, [r1]                /* r3 = ...0 */
1723         ldrh    r2, [r1, #0x01]         /* BE:r2 = ..12  LE:r2 = ..21 */
1724         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
1725         ldrb    r1, [r1, #0x07]         /* r1 = ...7 */
1726         strb    r3, [r0]                /* dst[0] = byte 0 */
1727         mov     r3, ip, lsr #16         /* BE:r3 = ..34  LE:r3 = ..65 */
1728         strh    r3, [r0, #0x05]         /* LE: dst[5..6] = 65 */
1729         orr     r2, r2, ip, lsl #16     /* r2 = 4321 */
1730         str     r2, [r0, #0x01]         /* dst[1..4] = 4321 */
1731         strb    r1, [r0, #0x07]         /* dst[7] = byte 7 */
1732         RET
1733         LMEMCPY_8_PAD
1734
1735 /*
1736  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned (byte 2)
1737  */
1738         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1739         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1740         ldrh    r1, [r1, #0x06]         /* BE:r1 = ..67  LE:r1 = ..76 */
1741         strb    r2, [r0]                /* LE: dst[0] = byte 0 (low byte of r2) */
1742         mov     ip, r2, lsr #8          /* ip = ...1 */
1743         orr     ip, ip, r3, lsl #8      /* ip = 4321 */
1744         mov     r2, r1, lsr #8          /* r2 = ...7 */
1745         strb    r2, [r0, #0x07]         /* dst[7] = byte 7 */
1746         mov     r1, r1, lsl #8          /* r1 = .76. */
1747         orr     r1, r1, r3, lsr #24     /* r1 = .765 */
1748         str     ip, [r0, #0x01]         /* dst[1..4] = 4321 */
1749         strh    r1, [r0, #0x05]         /* dst[5..6] = 65 (low half of r1) */
1750         RET
1751         LMEMCPY_8_PAD
1752
1753 /*
1754  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3); no shifting needed
1755  */
1756         ldrb    r2, [r1]                /* r2 = src[0] */
1757         ldr     ip, [r1, #0x01]         /* ip = src[1..4] */
1758         ldrh    r3, [r1, #0x05]         /* r3 = src[5..6] */
1759         ldrb    r1, [r1, #0x07]         /* r1 = src[7]; src pointer is overwritten here */
1760         strb    r2, [r0]                /* dst[0] */
1761         str     ip, [r0, #0x01]         /* dst[1..4] */
1762         strh    r3, [r0, #0x05]         /* dst[5..6] */
1763         strb    r1, [r0, #0x07]         /* dst[7] */
1764         RET
1765         LMEMCPY_8_PAD
1766
1767 /******************************************************************************
1768  * Special case for 12 byte copies: jump table indexed by (dst & 3) << 2 | (src & 3)
1769  */
1770 #define LMEMCPY_C_LOG2  7       /* log2 of the slot size: each case occupies 128 bytes */
1771 #define LMEMCPY_C_PAD   .align LMEMCPY_C_LOG2
1772         LMEMCPY_C_PAD
1773 .Lmemcpy_c:
1774         and     r2, r1, #0x03           /* r2 = src & 3 */
1775         orr     r2, r2, r0, lsl #2      /* dst bits 0-1 go into bits 2-3 */
1776         ands    r2, r2, #0x0f           /* r2 = slot index 0..15; Z is set when both are word aligned */
1777         sub     r3, pc, #0x14           /* r3 = .Lmemcpy_c (pc reads as this insn + 8 = label + 0x14) */
1778         addne   pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* jump to slot r2; index 0 falls through to 0000 */
1779
1780 /*
1781  * 0000: dst is 32-bit aligned, src is 32-bit aligned (fall-through from the dispatch above)
1782  */
1783         ldr     r2, [r1]                /* r2 = src[0..3] */
1784         ldr     r3, [r1, #0x04]         /* r3 = src[4..7] */
1785         ldr     r1, [r1, #0x08]         /* r1 = src[8..11]; src pointer is overwritten here */
1786         str     r2, [r0]                /* dst[0..3] */
1787         str     r3, [r0, #0x04]         /* dst[4..7] */
1788         str     r1, [r0, #0x08]         /* dst[8..11] */
1789         RET
1790         LMEMCPY_C_PAD
1791
1792 /*
1793  * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
1794  */
1795         ldrb    r2, [r1, #0xb]          /* r2 = ...B */
1796         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
1797         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
1798         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x (x = byte before src) */
1799         mov     r2, r2, lsl #24         /* r2 = B... */
1800         orr     r2, r2, ip, lsr #8      /* r2 = BA98 */
1801         str     r2, [r0, #0x08]         /* LE: dst[8..11] = BA98 */
1802         mov     r2, ip, lsl #24         /* r2 = 7... */
1803         orr     r2, r2, r3, lsr #8      /* r2 = 7654 */
1804         mov     r1, r1, lsr #8          /* r1 = .210 */
1805         orr     r1, r1, r3, lsl #24     /* r1 = 3210 */
1806         str     r2, [r0, #0x04]         /* dst[4..7] = 7654 */
1807         str     r1, [r0]                /* dst[0..3] = 3210 */
1808         RET
1809         LMEMCPY_C_PAD
1810
1811 /*
1812  * 0010: dst is 32-bit aligned, src is 16-bit aligned (byte 2)
1813  */
1814         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1815         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1816         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
1817         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
1818         orr     r2, r2, r3, lsl #16     /* r2 = 3210 */
1819         str     r2, [r0]                /* LE: dst[0..3] = 3210 */
1820         mov     r3, r3, lsr #16         /* r3 = ..54 */
1821         orr     r3, r3, ip, lsl #16     /* r3 = 7654 */
1822         mov     r1, r1, lsl #16         /* r1 = BA.. */
1823         orr     r1, r1, ip, lsr #16     /* r1 = BA98 */
1824         str     r3, [r0, #0x04]         /* dst[4..7] = 7654 */
1825         str     r1, [r0, #0x08]         /* dst[8..11] = BA98 */
1826         RET
1827         LMEMCPY_C_PAD
1828
1829 /*
1830  * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
1831  */
1832         ldrb    r2, [r1]                /* r2 = ...0 */
1833         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
1834         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
1835         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 (x = byte after src[11]) */
1836         orr     r2, r2, r3, lsl #8      /* r2 = 3210 */
1837         str     r2, [r0]                /* LE: dst[0..3] = 3210 */
1838         mov     r3, r3, lsr #24         /* r3 = ...4 */
1839         orr     r3, r3, ip, lsl #8      /* r3 = 7654 */
1840         mov     r1, r1, lsl #8          /* r1 = BA9. */
1841         orr     r1, r1, ip, lsr #24     /* r1 = BA98 */
1842         str     r3, [r0, #0x04]         /* dst[4..7] = 7654 */
1843         str     r1, [r0, #0x08]         /* dst[8..11] = BA98 */
1844         RET
1845         LMEMCPY_C_PAD
1846
1847 /*
1848  * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1849  */
1850         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
1851         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
1852         ldr     ip, [r1, #0x08]         /* BE:ip = 89AB  LE:ip = BA98 */
1853         mov     r1, r2, lsr #8          /* BE:r1 = .012  LE:r1 = .321 */
1854         strh    r1, [r0, #0x01]         /* LE: dst[1..2] = 21 (low half of r1) */
1855         strb    r2, [r0]                /* LE: dst[0] = byte 0 (low byte of r2) */
1856         mov     r1, r2, lsr #24         /* r1 = ...3 */
1857         orr     r2, r1, r3, lsl #8      /* r2 = 6543 */
1858         mov     r1, r3, lsr #24         /* r1 = ...7 */
1859         orr     r1, r1, ip, lsl #8      /* r1 = A987 */
1860         mov     ip, ip, lsr #24         /* ip = ...B */
1861         str     r2, [r0, #0x03]         /* dst[3..6] = 6543 */
1862         str     r1, [r0, #0x07]         /* dst[7..10] = A987 */
1863         strb    ip, [r0, #0x0b]         /* dst[11] = byte B */
1864         RET
1865         LMEMCPY_C_PAD
1866
1867 /*
1868  * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1); no shifting needed
1869  */
1870         ldrb    r2, [r1]                /* r2 = src[0] */
1871         ldrh    r3, [r1, #0x01]         /* r3 = src[1..2] */
1872         ldr     ip, [r1, #0x03]         /* ip = src[3..6] */
1873         strb    r2, [r0]                /* dst[0]; frees r2 for the next load */
1874         ldr     r2, [r1, #0x07]         /* r2 = src[7..10] */
1875         ldrb    r1, [r1, #0x0b]         /* r1 = src[11]; src pointer is overwritten here */
1876         strh    r3, [r0, #0x01]         /* dst[1..2] */
1877         str     ip, [r0, #0x03]         /* dst[3..6] */
1878         str     r2, [r0, #0x07]         /* dst[7..10] */
1879         strb    r1, [r0, #0x0b]         /* dst[11] */
1880         RET
1881         LMEMCPY_C_PAD
1882
1883 /*
1884  * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned (byte 2)
1885  */
1886         ldrh    r2, [r1]                /* BE:r2 = ..01  LE:r2 = ..10 */
1887         ldr     r3, [r1, #0x02]         /* BE:r3 = 2345  LE:r3 = 5432 */
1888         ldr     ip, [r1, #0x06]         /* BE:ip = 6789  LE:ip = 9876 */
1889         ldrh    r1, [r1, #0x0a]         /* BE:r1 = ..AB  LE:r1 = ..BA */
1890         strb    r2, [r0]                /* LE: dst[0] = byte 0 (low byte of r2) */
1891         mov     r2, r2, lsr #8          /* r2 = ...1 */
1892         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
1893         strh    r2, [r0, #0x01]         /* dst[1..2] = 21 (low half of r2) */
1894         mov     r2, r3, lsr #8          /* r2 = .543 */
1895         orr     r3, r2, ip, lsl #24     /* r3 = 6543 */
1896         mov     r2, ip, lsr #8          /* r2 = .987 */
1897         orr     r2, r2, r1, lsl #24     /* r2 = A987 */
1898         mov     r1, r1, lsr #8          /* r1 = ...B */
1899         str     r3, [r0, #0x03]         /* dst[3..6] = 6543 */
1900         str     r2, [r0, #0x07]         /* dst[7..10] = A987 */
1901         strb    r1, [r0, #0x0b]         /* dst[11] = byte B */
1902         RET
1903         LMEMCPY_C_PAD
1904
1905 /*
1906  * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1907  */
1908         ldrb    r2, [r1]                /* r2 = ...0 */
1909         ldr     r3, [r1, #0x01]         /* BE:r3 = 1234  LE:r3 = 4321 */
1910         ldr     ip, [r1, #0x05]         /* BE:ip = 5678  LE:ip = 8765 */
1911         ldr     r1, [r1, #0x09]         /* BE:r1 = 9ABx  LE:r1 = xBA9 (x = byte after src[11]) */
1912         strb    r2, [r0]                /* dst[0] = byte 0 */
1913         strh    r3, [r0, #0x01]         /* LE: dst[1..2] = 21 (low half of r3) */
1914         mov     r3, r3, lsr #16         /* r3 = ..43 */
1915         orr     r3, r3, ip, lsl #16     /* r3 = 6543 */
1916         mov     ip, ip, lsr #16         /* ip = ..87 */
1917         orr     ip, ip, r1, lsl #16     /* ip = A987 */
1918         mov     r1, r1, lsr #16         /* r1 = ..xB */
1919         str     r3, [r0, #0x03]         /* dst[3..6] = 6543 */
1920         str     ip, [r0, #0x07]         /* dst[7..10] = A987 */
1921         strb    r1, [r0, #0x0b]         /* dst[11] = byte B (only the low byte is stored; x is dropped) */
1922         RET
1923         LMEMCPY_C_PAD
1924
1925 /*
1926  * 1000: dst is 16-bit aligned (byte 2), src is 32-bit aligned
1927  */
1928         ldr     ip, [r1]                /* BE:ip = 0123  LE:ip = 3210 */
1929         ldr     r3, [r1, #0x04]         /* BE:r3 = 4567  LE:r3 = 7654 */
1930         ldr     r2, [r1, #0x08]         /* BE:r2 = 89AB  LE:r2 = BA98 */
1931         mov     r1, ip, lsr #16         /* BE:r1 = ..01  LE:r1 = ..32 */
1932         strh    ip, [r0]                /* LE: dst[0..1] = 10 (low half of ip) */
1933         orr     r1, r1, r3, lsl #16     /* r1 = 5432 */
1934         mov     r3, r3, lsr #16         /* r3 = ..76 */
1935         orr     r3, r3, r2, lsl #16     /* r3 = 9876 */
1936         mov     r2, r2, lsr #16         /* r2 = ..BA */
1937         str     r1, [r0, #0x02]         /* dst[2..5] = 5432 */
1938         str     r3, [r0, #0x06]         /* dst[6..9] = 9876 */
1939         strh    r2, [r0, #0x0a]         /* dst[10..11] = BA */
1940         RET
1941         LMEMCPY_C_PAD
1942
1943 /*
1944  * 1001: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 1)
1945  */
1946         ldr     r2, [r1, #-1]           /* BE:r2 = x012  LE:r2 = 210x (x = byte before src) */
1947         ldr     r3, [r1, #0x03]         /* BE:r3 = 3456  LE:r3 = 6543 */
1948         mov     ip, r2, lsr #8          /* BE:ip = .x01  LE:ip = .210 */
1949         strh    ip, [r0]                /* LE: dst[0..1] = 10; frees ip for the next load */
1950         ldr     ip, [r1, #0x07]         /* BE:ip = 789A  LE:ip = A987 */
1951         ldrb    r1, [r1, #0x0b]         /* r1 = ...B */
1952         mov     r2, r2, lsr #24         /* r2 = ...2 */
1953         orr     r2, r2, r3, lsl #8      /* r2 = 5432 */
1954         mov     r3, r3, lsr #24         /* r3 = ...6 */
1955         orr     r3, r3, ip, lsl #8      /* r3 = 9876 */
1956         mov     r1, r1, lsl #8          /* r1 = ..B. */
1957         orr     r1, r1, ip, lsr #24     /* r1 = ..BA */
1958         str     r2, [r0, #0x02]         /* dst[2..5] = 5432 */
1959         str     r3, [r0, #0x06]         /* dst[6..9] = 9876 */
1960         strh    r1, [r0, #0x0a]         /* dst[10..11] = BA */
1961         RET
1962         LMEMCPY_C_PAD
1963
1964 /*
1965  * 1010: dst is 16-bit aligned (byte 2), src is 16-bit aligned (byte 2); no shifting needed
1966  */
1967         ldrh    r2, [r1]                /* r2 = src[0..1] */
1968         ldr     r3, [r1, #0x02]         /* r3 = src[2..5] */
1969         ldr     ip, [r1, #0x06]         /* ip = src[6..9] */
1970         ldrh    r1, [r1, #0x0a]         /* r1 = src[10..11]; src pointer is overwritten here */
1971         strh    r2, [r0]                /* dst[0..1] */
1972         str     r3, [r0, #0x02]         /* dst[2..5] */
1973         str     ip, [r0, #0x06]         /* dst[6..9] */
1974         strh    r1, [r0, #0x0a]         /* dst[10..11] */
1975         RET
1976         LMEMCPY_C_PAD
1977
1978 /*
1979  * 1011: dst is 16-bit aligned (byte 2), src is 8-bit aligned (byte 3)
1980  */
1981         ldr     r2, [r1, #0x09]         /* BE:r2 = 9ABx  LE:r2 = xBA9 (x = byte after src[11]) */
1982         ldr     r3, [r1, #0x05]         /* BE:r3 = 5678  LE:r3 = 8765 */
1983         mov     ip, r2, lsr #8          /* BE:ip = .9AB  LE:ip = .xBA */
1984         strh    ip, [r0, #0x0a]         /* LE: dst[10..11] = BA; frees ip for the next load */
1985         ldr     ip, [r1, #0x01]         /* BE:ip = 1234  LE:ip = 4321 */
1986         ldrb    r1, [r1]                /* r1 = ...0 */
1987         mov     r2, r2, lsl #24         /* r2 = 9... */
1988         orr     r2, r2, r3, lsr #8      /* r2 = 9876 */
1989         mov     r3, r3, lsl #24         /* r3 = 5... */
1990         orr     r3, r3, ip, lsr #8      /* r3 = 5432 */
1991         orr     r1, r1, ip, lsl #8      /* r1 = 3210 */
1992         str     r2, [r0, #0x06]         /* dst[6..9] = 9876 */
1993         str     r3, [r0, #0x02]         /* dst[2..5] = 5432 */
1994         strh    r1, [r0]                /* dst[0..1] = 10 (low half of r1) */
1995         RET
1996         LMEMCPY_C_PAD
1997
1998 /*
1999  * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2000  */
2001         ldr     r2, [r1]                /* BE:r2 = 0123  LE:r2 = 3210 */
2002         ldr     ip, [r1, #0x04]         /* BE:ip = 4567  LE:ip = 7654 */
2003         ldr     r1, [r1, #0x08]         /* BE:r1 = 89AB  LE:r1 = BA98 */
2004         strb    r2, [r0]                /* LE: dst[0] = byte 0 (low byte of r2) */
2005         mov     r3, r2, lsr #8          /* r3 = .321 */
2006         orr     r3, r3, ip, lsl #24     /* r3 = 4321 */
2007         str     r3, [r0, #0x01]         /* dst[1..4] = 4321 */
2008         mov     r3, ip, lsr #8          /* r3 = .765 */
2009         orr     r3, r3, r1, lsl #24     /* r3 = 8765 */
2010         str     r3, [r0, #0x05]         /* dst[5..8] = 8765 */
2011         mov     r1, r1, lsr #8          /* r1 = .BA9 */
2012         strh    r1, [r0, #0x09]         /* dst[9..10] = A9 (low half of r1) */
2013         mov     r1, r1, lsr #16         /* r1 = ...B */
2014         strb    r1, [r0, #0x0b]         /* dst[11] = byte B */
2015         RET
2016         LMEMCPY_C_PAD
2017
2018 /*
2019  * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2020  */
2021         ldrb    r2, [r1, #0x0b]         /* r2 = ...B */
2022         ldr     r3, [r1, #0x07]         /* BE:r3 = 789A  LE:r3 = A987 */
2023         ldr     ip, [r1, #0x03]         /* BE:ip = 3456  LE:ip = 6543 */
2024         ldr     r1, [r1, #-1]           /* BE:r1 = x012  LE:r1 = 210x (x = byte before src) */
2025         strb    r2, [r0, #0x0b]         /* dst[11] = byte B */
2026         mov     r2, r3, lsr #16         /* r2 = ..A9 */
2027         strh    r2, [r0, #0x09]         /* LE: dst[9..10] = A9 */
2028         mov     r3, r3, lsl #16         /* r3 = 87.. */
2029         orr     r3, r3, ip, lsr #16     /* r3 = 8765 */
2030         mov     ip, ip, lsl #16         /* ip = 43.. */
2031         orr     ip, ip, r1, lsr #16     /* ip = 4321 */
2032         mov     r1, r1, lsr #8          /* r1 = .210 */
2033         str     r3, [r0, #0x05]         /* dst[5..8] = 8765 */
2034         str     ip, [r0, #0x01]         /* dst[1..4] = 4321 */
2035         strb    r1, [r0]                /* dst[0] = byte 0 (low byte of r1) */
2036         RET
2037         LMEMCPY_C_PAD
2038
2039 /*
2040  * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned (byte 2); comments show the LE layout
2041  */
2042         ldrh    r2, [r1]                /* r2 = ..10 */
2043         ldr     r3, [r1, #0x02]         /* r3 = 5432 */
2044         ldr     ip, [r1, #0x06]         /* ip = 9876 */
2045         ldrh    r1, [r1, #0x0a]         /* r1 = ..BA */
2046         strb    r2, [r0]                /* dst[0] = byte 0 (low byte of r2) */
2047         mov     r2, r2, lsr #8          /* r2 = ...1 */
2048         orr     r2, r2, r3, lsl #8      /* r2 = 4321 */
2049         mov     r3, r3, lsr #24         /* r3 = ...5 */
2050         orr     r3, r3, ip, lsl #8      /* r3 = 8765 */
2051         mov     ip, ip, lsr #24         /* ip = ...9 */
2052         orr     ip, ip, r1, lsl #8      /* ip = .BA9 */
2053         mov     r1, r1, lsr #8          /* r1 = ...B */
2054         str     r2, [r0, #0x01]         /* dst[1..4] = 4321 */
2055         str     r3, [r0, #0x05]         /* dst[5..8] = 8765 */
2056         strh    ip, [r0, #0x09]         /* dst[9..10] = A9 (low half of ip) */
2057         strb    r1, [r0, #0x0b]         /* dst[11] = byte B */
2058         RET
2059         LMEMCPY_C_PAD
2060
2061 /*
2062  * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3); no shifting needed
2063  */
2064         ldrb    r2, [r1]                /* r2 = src[0] */
2065         ldr     r3, [r1, #0x01]         /* r3 = src[1..4] */
2066         ldr     ip, [r1, #0x05]         /* ip = src[5..8] */
2067         strb    r2, [r0]                /* dst[0]; frees r2 for the next load */
2068         ldrh    r2, [r1, #0x09]         /* r2 = src[9..10] */
2069         ldrb    r1, [r1, #0x0b]         /* r1 = src[11]; src pointer is overwritten here */
2070         str     r3, [r0, #0x01]         /* dst[1..4] */
2071         str     ip, [r0, #0x05]         /* dst[5..8] */
2072         strh    r2, [r0, #0x09]         /* dst[9..10] */
2073         strb    r1, [r0, #0x0b]         /* dst[11] */
2074         RET
2075 END(memcpy)