2 * Copyright (c) 2004 Olivier Houchard
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed for the NetBSD Project by
43 * Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 * or promote products derived from this software without specific prior
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution.
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
89 #include <machine/asm.h>
90 __FBSDID("$FreeBSD$");
97 .word _C_LABEL(_arm_memcpy)
99 .word _C_LABEL(_arm_bzero)
101 .word _C_LABEL(_min_memcpy_size)
103 .word _C_LABEL(_min_bzero_size)
105 * memset: Sets a block of memory to the specified value
110 * r2 - number of bytes to write
115 /* LINTSTUB: Func: void bzero(void *, size_t) */
121 ldr r2, .L_min_bzero_size
125 stmfd sp!, {r0, r1, lr}
130 ldmfd sp!, {r0, r1, lr}
136 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
138 and r3, r1, #0xff /* We deal with bytes */
141 cmp r1, #0x04 /* Do we have less than 4 bytes */
143 blt .Lmemset_lessthanfour
145 /* Ok first we will word align the address */
146 ands r2, ip, #0x03 /* Get the bottom two bits */
147 bne .Lmemset_wordunaligned /* The address is not word aligned */
149 /* We are now word aligned */
150 .Lmemset_wordaligned:
151 orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
152 tst ip, #0x04 /* Quad-align for armv5e */
153 orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
154 subne r1, r1, #0x04 /* Quad-align if necessary */
155 strne r3, [ip], #0x04
157 blt .Lmemset_loop4 /* If less than 16 then use words */
158 mov r2, r3 /* Duplicate data */
159 cmp r1, #0x80 /* If < 128 then skip the big loop */
162 /* Do 128 bytes at a time */
165 strdge r2, [ip], #0x08
166 strdge r2, [ip], #0x08
167 strdge r2, [ip], #0x08
168 strdge r2, [ip], #0x08
169 strdge r2, [ip], #0x08
170 strdge r2, [ip], #0x08
171 strdge r2, [ip], #0x08
172 strdge r2, [ip], #0x08
173 strdge r2, [ip], #0x08
174 strdge r2, [ip], #0x08
175 strdge r2, [ip], #0x08
176 strdge r2, [ip], #0x08
177 strdge r2, [ip], #0x08
178 strdge r2, [ip], #0x08
179 strdge r2, [ip], #0x08
180 strdge r2, [ip], #0x08
182 RETeq /* Zero length so just exit */
184 add r1, r1, #0x80 /* Adjust for extra sub */
186 /* Do 32 bytes at a time */
189 strdge r2, [ip], #0x08
190 strdge r2, [ip], #0x08
191 strdge r2, [ip], #0x08
192 strdge r2, [ip], #0x08
194 RETeq /* Zero length so just exit */
196 adds r1, r1, #0x10 /* Partially adjust for extra sub */
198 /* Deal with 16 bytes or more */
199 strdge r2, [ip], #0x08
200 strdge r2, [ip], #0x08
201 RETeq /* Zero length so just exit */
203 addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
205 /* We have at least 4 bytes so copy as words */
208 strge r3, [ip], #0x04
210 RETeq /* Zero length so just exit */
212 /* Compensate for 64-bit alignment check */
217 strb r3, [ip], #0x01 /* Set 1 byte */
218 strbge r3, [ip], #0x01 /* Set another byte */
219 strbgt r3, [ip] /* and a third */
222 .Lmemset_wordunaligned:
224 strb r3, [ip], #0x01 /* Set 1 byte */
226 strbge r3, [ip], #0x01 /* Set another byte */
228 strbgt r3, [ip], #0x01 /* and a third */
229 cmp r1, #0x04 /* More than 4 bytes left? */
230 bge .Lmemset_wordaligned /* Yup */
232 .Lmemset_lessthanfour:
234 RETeq /* Zero length so exit */
235 strb r3, [ip], #0x01 /* Set 1 byte */
237 strbge r3, [ip], #0x01 /* Set another byte */
238 strbgt r3, [ip] /* and a third */
249 /* Are both addresses aligned the same way? */
252 RETeq /* len == 0, or same addresses! */
255 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
257 /* Word-align the addresses, if necessary */
260 add r3, r3, r3, lsl #1
261 addne pc, pc, r3, lsl #3
264 /* Compare up to 3 bytes */
272 /* Compare up to 2 bytes */
288 /* Compare 4 bytes at a time, if possible */
290 bcc .Lmemcmp_bytewise
291 .Lmemcmp_word_aligned:
296 beq .Lmemcmp_word_aligned
299 /* Correct for extra subtraction, and check if done */
301 cmpeq r0, #0x00 /* If done, did all bytes match? */
302 RETeq /* Yup. Just return */
304 /* Re-do the final word byte-wise */
315 beq .Lmemcmp_bytewise2
320 * 6 byte compares are very common, thanks to the network stack.
321 * This code is hand-scheduled to reduce the number of stalls for
322 * load results. Everything else being equal, this will be ~32%
323 * faster than a byte-wise memcmp.
327 ldrb r3, [r1, #0x00] /* r3 = b2#0 */
328 ldrb r0, [ip, #0x00] /* r0 = b1#0 */
329 ldrb r2, [r1, #0x01] /* r2 = b2#1 */
330 subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
331 ldrbeq r3, [ip, #0x01] /* r3 = b1#1 */
332 RETne /* Return if mismatch on #0 */
333 subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
334 ldrbeq r3, [r1, #0x02] /* r3 = b2#2 */
335 ldrbeq r0, [ip, #0x02] /* r0 = b1#2 */
336 RETne /* Return if mismatch on #1 */
337 ldrb r2, [r1, #0x03] /* r2 = b2#3 */
338 subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
339 ldrbeq r3, [ip, #0x03] /* r3 = b1#3 */
340 RETne /* Return if mismatch on #2 */
341 subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
342 ldrbeq r3, [r1, #0x04] /* r3 = b2#4 */
343 ldrbeq r0, [ip, #0x04] /* r0 = b1#4 */
344 RETne /* Return if mismatch on #3 */
345 ldrb r2, [r1, #0x05] /* r2 = b2#5 */
346 subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
347 ldrbeq r3, [ip, #0x05] /* r3 = b1#5 */
348 RETne /* Return if mismatch on #4 */
349 sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
354 /* switch the source and destination registers */
359 /* Do the buffers overlap? */
361 RETeq /* Bail now if src/dst are the same */
362 subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
363 subcs r3, r1, r0 /* if (src > dsr) r3 = src - dst */
364 cmp r3, r2 /* if (r3 < len) we have an overlap */
365 bcc PIC_SYM(_C_LABEL(memcpy), PLT)
367 /* Determine copy direction */
369 bcc .Lmemmove_backwards
371 moveq r0, #0 /* Quick abort for len=0 */
374 stmdb sp!, {r0, lr} /* memmove() returns dest addr */
376 blt .Lmemmove_fl4 /* less than 4 bytes */
378 bne .Lmemmove_fdestul /* oh unaligned destination addr */
380 bne .Lmemmove_fsrcul /* oh unaligned source addr */
383 /* We have aligned source and destination */
385 blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
387 blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
388 stmdb sp!, {r4} /* borrow r4 */
390 /* blat 32 bytes at a time */
391 /* XXX for really big copies perhaps we should use more registers */
393 ldmia r1!, {r3, r4, r12, lr}
394 stmia r0!, {r3, r4, r12, lr}
395 ldmia r1!, {r3, r4, r12, lr}
396 stmia r0!, {r3, r4, r12, lr}
398 bge .Lmemmove_floop32
401 ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
402 stmiage r0!, {r3, r4, r12, lr}
404 ldmia sp!, {r4} /* return r4 */
409 /* blat 12 bytes at a time */
411 ldmiage r1!, {r3, r12, lr}
412 stmiage r0!, {r3, r12, lr}
414 bge .Lmemmove_floop12
423 ldmiage r1!, {r3, r12}
424 stmiage r0!, {r3, r12}
428 /* less than 4 bytes to go */
430 ldmiaeq sp!, {r0, pc} /* done */
432 /* copy the crud byte at a time */
442 /* erg - unaligned destination */
447 /* align destination with byte copies */
455 blt .Lmemmove_fl4 /* less the 4 bytes */
458 beq .Lmemmove_ft8 /* we have an aligned source */
460 /* erg - unaligned source */
461 /* This is where it gets nasty ... */
466 bgt .Lmemmove_fsrcul3
467 beq .Lmemmove_fsrcul2
469 blt .Lmemmove_fsrcul1loop4
473 .Lmemmove_fsrcul1loop16:
475 ldmia r1!, {r4, r5, r12, lr}
476 orr r3, r3, r4, lsl #24
478 orr r4, r4, r5, lsl #24
480 orr r5, r5, r12, lsl #24
482 orr r12, r12, lr, lsl #24
483 stmia r0!, {r3-r5, r12}
485 bge .Lmemmove_fsrcul1loop16
488 blt .Lmemmove_fsrcul1l4
490 .Lmemmove_fsrcul1loop4:
493 orr r12, r12, lr, lsl #24
496 bge .Lmemmove_fsrcul1loop4
504 blt .Lmemmove_fsrcul2loop4
508 .Lmemmove_fsrcul2loop16:
510 ldmia r1!, {r4, r5, r12, lr}
511 orr r3, r3, r4, lsl #16
513 orr r4, r4, r5, lsl #16
515 orr r5, r5, r12, lsl #16
516 mov r12, r12, lsr #16
517 orr r12, r12, lr, lsl #16
518 stmia r0!, {r3-r5, r12}
520 bge .Lmemmove_fsrcul2loop16
523 blt .Lmemmove_fsrcul2l4
525 .Lmemmove_fsrcul2loop4:
528 orr r12, r12, lr, lsl #16
531 bge .Lmemmove_fsrcul2loop4
539 blt .Lmemmove_fsrcul3loop4
543 .Lmemmove_fsrcul3loop16:
545 ldmia r1!, {r4, r5, r12, lr}
546 orr r3, r3, r4, lsl #8
548 orr r4, r4, r5, lsl #8
550 orr r5, r5, r12, lsl #8
551 mov r12, r12, lsr #24
552 orr r12, r12, lr, lsl #8
553 stmia r0!, {r3-r5, r12}
555 bge .Lmemmove_fsrcul3loop16
558 blt .Lmemmove_fsrcul3l4
560 .Lmemmove_fsrcul3loop4:
563 orr r12, r12, lr, lsl #8
566 bge .Lmemmove_fsrcul3loop4
576 blt .Lmemmove_bl4 /* less than 4 bytes */
578 bne .Lmemmove_bdestul /* oh unaligned destination addr */
580 bne .Lmemmove_bsrcul /* oh unaligned source addr */
583 /* We have aligned source and destination */
585 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
587 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
590 /* blat 32 bytes at a time */
591 /* XXX for really big copies perhaps we should use more registers */
593 ldmdb r1!, {r3, r4, r12, lr}
594 stmdb r0!, {r3, r4, r12, lr}
595 ldmdb r1!, {r3, r4, r12, lr}
596 stmdb r0!, {r3, r4, r12, lr}
598 bge .Lmemmove_bloop32
602 ldmdbge r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
603 stmdbge r0!, {r3, r4, r12, lr}
606 ldmdbge r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
607 stmdbge r0!, {r3, r12, lr}
617 ldmdbge r1!, {r3, r12}
618 stmdbge r0!, {r3, r12}
622 /* less than 4 bytes to go */
626 /* copy the crud byte at a time */
630 ldrbge r3, [r1, #-1]!
631 strbge r3, [r0, #-1]!
632 ldrbgt r3, [r1, #-1]!
633 strbgt r3, [r0, #-1]!
636 /* erg - unaligned destination */
640 /* align destination with byte copies */
643 ldrbge r3, [r1, #-1]!
644 strbge r3, [r0, #-1]!
645 ldrbgt r3, [r1, #-1]!
646 strbgt r3, [r0, #-1]!
648 blt .Lmemmove_bl4 /* less than 4 bytes to go */
650 beq .Lmemmove_bt8 /* we have an aligned source */
652 /* erg - unaligned source */
653 /* This is where it gets nasty ... */
658 blt .Lmemmove_bsrcul1
659 beq .Lmemmove_bsrcul2
661 blt .Lmemmove_bsrcul3loop4
663 stmdb sp!, {r4, r5, lr}
665 .Lmemmove_bsrcul3loop16:
667 ldmdb r1!, {r3-r5, r12}
668 orr lr, lr, r12, lsr #24
670 orr r12, r12, r5, lsr #24
672 orr r5, r5, r4, lsr #24
674 orr r4, r4, r3, lsr #24
675 stmdb r0!, {r4, r5, r12, lr}
677 bge .Lmemmove_bsrcul3loop16
678 ldmia sp!, {r4, r5, lr}
680 blt .Lmemmove_bsrcul3l4
682 .Lmemmove_bsrcul3loop4:
685 orr r12, r12, r3, lsr #24
688 bge .Lmemmove_bsrcul3loop4
696 blt .Lmemmove_bsrcul2loop4
698 stmdb sp!, {r4, r5, lr}
700 .Lmemmove_bsrcul2loop16:
702 ldmdb r1!, {r3-r5, r12}
703 orr lr, lr, r12, lsr #16
704 mov r12, r12, lsl #16
705 orr r12, r12, r5, lsr #16
707 orr r5, r5, r4, lsr #16
709 orr r4, r4, r3, lsr #16
710 stmdb r0!, {r4, r5, r12, lr}
712 bge .Lmemmove_bsrcul2loop16
713 ldmia sp!, {r4, r5, lr}
715 blt .Lmemmove_bsrcul2l4
717 .Lmemmove_bsrcul2loop4:
720 orr r12, r12, r3, lsr #16
723 bge .Lmemmove_bsrcul2loop4
731 blt .Lmemmove_bsrcul1loop4
733 stmdb sp!, {r4, r5, lr}
735 .Lmemmove_bsrcul1loop32:
737 ldmdb r1!, {r3-r5, r12}
738 orr lr, lr, r12, lsr #8
739 mov r12, r12, lsl #24
740 orr r12, r12, r5, lsr #8
742 orr r5, r5, r4, lsr #8
744 orr r4, r4, r3, lsr #8
745 stmdb r0!, {r4, r5, r12, lr}
747 bge .Lmemmove_bsrcul1loop32
748 ldmia sp!, {r4, r5, lr}
750 blt .Lmemmove_bsrcul1l4
752 .Lmemmove_bsrcul1loop4:
755 orr r12, r12, r3, lsr #8
758 bge .Lmemmove_bsrcul1loop4
766 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
770 ble .Lmemcpy_short /* <= 12 bytes */
772 #if FLASHADDR > PHYSADDR
782 ldr r3, .L_arm_memcpy
786 ldr r3, .L_min_memcpy_size
790 stmfd sp!, {r0-r2, r4, lr}
792 ldr r4, .L_arm_memcpy
796 ldmfd sp!, {r0-r2, r4, lr}
799 mov r3, r0 /* We must not clobber r0 */
801 /* Word-align the destination buffer */
802 ands ip, r3, #0x03 /* Already word aligned? */
803 beq .Lmemcpy_wordaligned /* Yup */
808 ldrble ip, [r1], #0x01
810 strble ip, [r3], #0x01
811 ldrblt ip, [r1], #0x01
813 strblt ip, [r3], #0x01
815 /* Destination buffer is now word aligned */
816 .Lmemcpy_wordaligned:
817 ands ip, r1, #0x03 /* Is src also word-aligned? */
818 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
820 /* Quad-align the destination buffer */
821 tst r3, #0x07 /* Already quad aligned? */
822 ldrne ip, [r1], #0x04
823 stmfd sp!, {r4-r9} /* Free up some registers */
825 strne ip, [r3], #0x04
827 /* Destination buffer quad aligned, source is at least word aligned */
829 blt .Lmemcpy_w_lessthan128
831 /* Copy 128 bytes at a time */
833 ldr r4, [r1], #0x04 /* LD:00-03 */
834 ldr r5, [r1], #0x04 /* LD:04-07 */
835 pld [r1, #0x18] /* Prefetch 0x20 */
836 ldr r6, [r1], #0x04 /* LD:08-0b */
837 ldr r7, [r1], #0x04 /* LD:0c-0f */
838 ldr r8, [r1], #0x04 /* LD:10-13 */
839 ldr r9, [r1], #0x04 /* LD:14-17 */
840 strd r4, [r3], #0x08 /* ST:00-07 */
841 ldr r4, [r1], #0x04 /* LD:18-1b */
842 ldr r5, [r1], #0x04 /* LD:1c-1f */
843 strd r6, [r3], #0x08 /* ST:08-0f */
844 ldr r6, [r1], #0x04 /* LD:20-23 */
845 ldr r7, [r1], #0x04 /* LD:24-27 */
846 pld [r1, #0x18] /* Prefetch 0x40 */
847 strd r8, [r3], #0x08 /* ST:10-17 */
848 ldr r8, [r1], #0x04 /* LD:28-2b */
849 ldr r9, [r1], #0x04 /* LD:2c-2f */
850 strd r4, [r3], #0x08 /* ST:18-1f */
851 ldr r4, [r1], #0x04 /* LD:30-33 */
852 ldr r5, [r1], #0x04 /* LD:34-37 */
853 strd r6, [r3], #0x08 /* ST:20-27 */
854 ldr r6, [r1], #0x04 /* LD:38-3b */
855 ldr r7, [r1], #0x04 /* LD:3c-3f */
856 strd r8, [r3], #0x08 /* ST:28-2f */
857 ldr r8, [r1], #0x04 /* LD:40-43 */
858 ldr r9, [r1], #0x04 /* LD:44-47 */
859 pld [r1, #0x18] /* Prefetch 0x60 */
860 strd r4, [r3], #0x08 /* ST:30-37 */
861 ldr r4, [r1], #0x04 /* LD:48-4b */
862 ldr r5, [r1], #0x04 /* LD:4c-4f */
863 strd r6, [r3], #0x08 /* ST:38-3f */
864 ldr r6, [r1], #0x04 /* LD:50-53 */
865 ldr r7, [r1], #0x04 /* LD:54-57 */
866 strd r8, [r3], #0x08 /* ST:40-47 */
867 ldr r8, [r1], #0x04 /* LD:58-5b */
868 ldr r9, [r1], #0x04 /* LD:5c-5f */
869 strd r4, [r3], #0x08 /* ST:48-4f */
870 ldr r4, [r1], #0x04 /* LD:60-63 */
871 ldr r5, [r1], #0x04 /* LD:64-67 */
872 pld [r1, #0x18] /* Prefetch 0x80 */
873 strd r6, [r3], #0x08 /* ST:50-57 */
874 ldr r6, [r1], #0x04 /* LD:68-6b */
875 ldr r7, [r1], #0x04 /* LD:6c-6f */
876 strd r8, [r3], #0x08 /* ST:58-5f */
877 ldr r8, [r1], #0x04 /* LD:70-73 */
878 ldr r9, [r1], #0x04 /* LD:74-77 */
879 strd r4, [r3], #0x08 /* ST:60-67 */
880 ldr r4, [r1], #0x04 /* LD:78-7b */
881 ldr r5, [r1], #0x04 /* LD:7c-7f */
882 strd r6, [r3], #0x08 /* ST:68-6f */
883 strd r8, [r3], #0x08 /* ST:70-77 */
885 strd r4, [r3], #0x08 /* ST:78-7f */
886 bge .Lmemcpy_w_loop128
888 .Lmemcpy_w_lessthan128:
889 adds r2, r2, #0x80 /* Adjust for extra sub */
891 RETeq /* Return now if done */
893 blt .Lmemcpy_w_lessthan32
895 /* Copy 32 bytes at a time */
911 bge .Lmemcpy_w_loop32
913 .Lmemcpy_w_lessthan32:
914 adds r2, r2, #0x20 /* Adjust for extra sub */
916 RETeq /* Return now if done */
920 addne pc, pc, r4, lsl #1
923 /* At least 24 bytes remaining */
929 /* At least 16 bytes remaining */
935 /* At least 8 bytes remaining */
941 /* Less than 8 bytes remaining */
943 RETeq /* Return now if done */
945 ldrge ip, [r1], #0x04
946 strge ip, [r3], #0x04
947 RETeq /* Return now if done */
951 ldrbge r2, [r1], #0x01
954 strbge r2, [r3], #0x01
957 /* Place a literal pool here for the above ldr instructions to use */
962 * At this point, it has not been possible to word align both buffers.
963 * The destination buffer is word aligned, but the source buffer is not.
974 .Lmemcpy_bad1_loop16:
981 orr r4, r4, r5, lsl #24
983 orr r5, r5, r6, lsl #24
985 orr r6, r6, r7, lsl #24
987 orr r7, r7, ip, lsl #24
994 bge .Lmemcpy_bad1_loop16
998 RETeq /* Return now if done */
1001 blt .Lmemcpy_bad_done
1003 .Lmemcpy_bad1_loop4:
1007 orr r4, r4, ip, lsl #24
1009 bge .Lmemcpy_bad1_loop4
1013 .Lmemcpy_bad2_loop16:
1020 orr r4, r4, r5, lsl #16
1022 orr r5, r5, r6, lsl #16
1024 orr r6, r6, r7, lsl #16
1026 orr r7, r7, ip, lsl #16
1033 bge .Lmemcpy_bad2_loop16
1036 ldmfdeq sp!, {r4-r7}
1037 RETeq /* Return now if done */
1040 blt .Lmemcpy_bad_done
1042 .Lmemcpy_bad2_loop4:
1046 orr r4, r4, ip, lsl #16
1048 bge .Lmemcpy_bad2_loop4
1052 .Lmemcpy_bad3_loop16:
1059 orr r4, r4, r5, lsl #8
1061 orr r5, r5, r6, lsl #8
1063 orr r6, r6, r7, lsl #8
1065 orr r7, r7, ip, lsl #8
1072 bge .Lmemcpy_bad3_loop16
1075 ldmfdeq sp!, {r4-r7}
1076 RETeq /* Return now if done */
1079 blt .Lmemcpy_bad_done
1081 .Lmemcpy_bad3_loop4:
1085 orr r4, r4, ip, lsl #8
1087 bge .Lmemcpy_bad3_loop4
1094 ldrb ip, [r1], #0x01
1096 ldrbge r2, [r1], #0x01
1097 strb ip, [r3], #0x01
1099 strbge r2, [r3], #0x01
1105 * Handle short copies (less than 16 bytes), possibly misaligned.
1106 * Some of these are *very* common, thanks to the network stack,
1107 * and so are handled specially.
1110 add pc, pc, r2, lsl #2
1113 b .Lmemcpy_bytewise /* 0x01 */
1114 b .Lmemcpy_bytewise /* 0x02 */
1115 b .Lmemcpy_bytewise /* 0x03 */
1116 b .Lmemcpy_4 /* 0x04 */
1117 b .Lmemcpy_bytewise /* 0x05 */
1118 b .Lmemcpy_6 /* 0x06 */
1119 b .Lmemcpy_bytewise /* 0x07 */
1120 b .Lmemcpy_8 /* 0x08 */
1121 b .Lmemcpy_bytewise /* 0x09 */
1122 b .Lmemcpy_bytewise /* 0x0a */
1123 b .Lmemcpy_bytewise /* 0x0b */
1124 b .Lmemcpy_c /* 0x0c */
1126 mov r3, r0 /* We must not clobber r0 */
1127 ldrb ip, [r1], #0x01
1128 1: subs r2, r2, #0x01
1129 strb ip, [r3], #0x01
1130 ldrbne ip, [r1], #0x01
1134 /******************************************************************************
1135 * Special case for 4 byte copies
1137 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
1138 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
1142 orr r2, r2, r0, lsl #2
1145 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
1148 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1156 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1158 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1159 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
1160 mov r3, r3, lsr #8 /* r3 = .210 */
1161 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1167 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1169 ldrh r3, [r1, #0x02]
1171 orr r3, r2, r3, lsl #16
1177 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1179 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
1180 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
1181 mov r3, r3, lsr #24 /* r3 = ...0 */
1182 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1188 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1194 strb r1, [r0, #0x03]
1195 strh r3, [r0, #0x01]
1200 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1203 ldrh r3, [r1, #0x01]
1204 ldrb r1, [r1, #0x03]
1206 strh r3, [r0, #0x01]
1207 strb r1, [r0, #0x03]
1212 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1214 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1215 ldrh r3, [r1, #0x02] /* LE:r3 = ..23 LE:r3 = ..32 */
1217 mov r2, r2, lsr #8 /* r2 = ...1 */
1218 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1219 mov r3, r3, lsr #8 /* r3 = ...3 */
1220 strh r2, [r0, #0x01]
1221 strb r3, [r0, #0x03]
1226 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1229 ldrh r3, [r1, #0x01]
1230 ldrb r1, [r1, #0x03]
1232 strh r3, [r0, #0x01]
1233 strb r1, [r0, #0x03]
1238 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1243 strh r3, [r0, #0x02]
1248 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1250 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1251 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
1252 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1254 mov r2, r2, lsr #24 /* r2 = ...2 */
1255 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
1256 strh r2, [r0, #0x02]
1261 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1264 ldrh r3, [r1, #0x02]
1266 strh r3, [r0, #0x02]
1271 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1273 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
1274 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1275 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
1276 strh r1, [r0, #0x02]
1277 mov r3, r3, lsl #8 /* r3 = 321. */
1278 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
1284 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1286 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1290 strh r3, [r0, #0x01]
1291 strb r1, [r0, #0x03]
1296 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1299 ldrh r3, [r1, #0x01]
1300 ldrb r1, [r1, #0x03]
1302 strh r3, [r0, #0x01]
1303 strb r1, [r0, #0x03]
1308 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1310 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1311 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1313 mov r2, r2, lsr #8 /* r2 = ...1 */
1314 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1315 strh r2, [r0, #0x01]
1316 mov r3, r3, lsr #8 /* r3 = ...3 */
1317 strb r3, [r0, #0x03]
1322 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1325 ldrh r3, [r1, #0x01]
1326 ldrb r1, [r1, #0x03]
1328 strh r3, [r0, #0x01]
1329 strb r1, [r0, #0x03]
1334 /******************************************************************************
1335 * Special case for 6 byte copies
1337 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
1338 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
1342 orr r2, r2, r0, lsl #2
1345 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
1348 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1351 ldrh r3, [r1, #0x04]
1353 strh r3, [r0, #0x04]
1358 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1360 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1361 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
1362 mov r2, r2, lsr #8 /* r2 = .210 */
1363 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
1364 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
1366 strh r3, [r0, #0x04]
1371 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1373 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1374 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1375 mov r1, r3, lsr #16 /* r1 = ..54 */
1376 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1378 strh r1, [r0, #0x04]
1383 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1385 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1386 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
1387 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r3 = xxx5 */
1388 mov r2, r2, lsr #24 /* r2 = ...0 */
1389 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1390 mov r1, r1, lsl #8 /* r1 = xx5. */
1391 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
1393 strh r1, [r0, #0x04]
1398 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1400 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1401 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
1402 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1403 strh r1, [r0, #0x01]
1405 mov r3, r3, lsr #24 /* r3 = ...3 */
1406 orr r3, r3, r2, lsl #8 /* r3 = .543 */
1407 mov r2, r2, lsr #8 /* r2 = ...5 */
1408 strh r3, [r0, #0x03]
1409 strb r2, [r0, #0x05]
1414 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1417 ldrh r3, [r1, #0x01]
1418 ldrh ip, [r1, #0x03]
1419 ldrb r1, [r1, #0x05]
1421 strh r3, [r0, #0x01]
1422 strh ip, [r0, #0x03]
1423 strb r1, [r0, #0x05]
1428 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1430 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1431 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1434 strb r3, [r0, #0x05]
1435 mov r3, r1, lsr #8 /* r3 = .543 */
1436 strh r3, [r0, #0x03]
1437 mov r3, r2, lsr #8 /* r3 = ...1 */
1438 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
1439 strh r3, [r0, #0x01]
1444 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1447 ldrh r3, [r1, #0x01]
1448 ldrh ip, [r1, #0x03]
1449 ldrb r1, [r1, #0x05]
1451 strh r3, [r0, #0x01]
1452 strh ip, [r0, #0x03]
1453 strb r1, [r0, #0x05]
1458 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1460 ldrh r2, [r1, #0x04] /* r2 = ..54 */
1461 ldr r3, [r1] /* r3 = 3210 */
1462 mov r2, r2, lsl #16 /* r2 = 54.. */
1463 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
1470 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1472 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1473 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
1474 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1475 mov r2, r2, lsl #8 /* r2 = 543. */
1476 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
1483 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1493 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1495 ldrb r3, [r1] /* r3 = ...0 */
1496 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1497 ldrb r1, [r1, #0x05] /* r1 = ...5 */
1498 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1499 mov r1, r1, lsl #24 /* r1 = 5... */
1500 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
1507 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1509 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1510 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
1512 mov r2, r2, lsr #8 /* r2 = .321 */
1513 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
1514 mov r1, r1, lsr #8 /* r1 = ...5 */
1516 strb r1, [r0, #0x05]
1521 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1524 ldrh r3, [r1, #0x01]
1525 ldrh ip, [r1, #0x03]
1526 ldrb r1, [r1, #0x05]
1528 strh r3, [r0, #0x01]
1529 strh ip, [r0, #0x03]
1530 strb r1, [r0, #0x05]
1535 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1537 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1538 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1540 mov r2, r2, lsr #8 /* r2 = ...1 */
1541 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
1542 mov r1, r1, lsr #24 /* r1 = ...5 */
1544 strb r1, [r0, #0x05]
1549 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1553 ldrb r1, [r1, #0x05]
1556 strb r1, [r0, #0x05]
1561 /******************************************************************************
1562 * Special case for 8 byte copies
1564 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
1565 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
1569 orr r2, r2, r0, lsl #2
1572 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
1575 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1585 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1587 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1588 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
1589 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1590 mov r3, r3, lsr #8 /* r3 = .210 */
1591 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1592 mov r1, r1, lsl #24 /* r1 = 7... */
1593 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
1600 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1602 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1603 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1604 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1605 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1606 mov r3, r3, lsr #16 /* r3 = ..54 */
1607 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
1614 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1616 ldrb r3, [r1] /* r3 = ...0 */
1617 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1618 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
1619 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1620 mov r2, r2, lsr #24 /* r2 = ...4 */
1621 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
1628 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1630 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1631 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
1633 mov r1, r2, lsr #24 /* r1 = ...7 */
1634 strb r1, [r0, #0x07]
1635 mov r1, r3, lsr #8 /* r1 = .321 */
1636 mov r3, r3, lsr #24 /* r3 = ...3 */
1637 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
1638 strh r1, [r0, #0x01]
1644 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1647 ldrh r3, [r1, #0x01]
1649 ldrb r1, [r1, #0x07]
1651 strh r3, [r0, #0x01]
1653 strb r1, [r0, #0x07]
1658 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1660 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1661 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1662 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1663 strb r2, [r0] /* 0 */
1664 mov ip, r1, lsr #8 /* ip = ...7 */
1665 strb ip, [r0, #0x07] /* 7 */
1666 mov ip, r2, lsr #8 /* ip = ...1 */
1667 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1668 mov r3, r3, lsr #8 /* r3 = .543 */
1669 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
1670 strh ip, [r0, #0x01] /* store bytes 1-2 */
1676 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1678 ldrb r3, [r1] /* r3 = ...0 */
1679 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1680 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
1681 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1683 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
1684 strh ip, [r0, #0x01] /* store bytes 1-2 */
1685 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
1687 strb r1, [r0, #0x07] /* store byte 7 */
1692 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1694 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1695 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1696 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1698 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
1699 mov r3, r3, lsr #16 /* r3 = ..76 */
1701 strh r3, [r0, #0x06] /* store bytes 6-7 */
1706 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1708 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1709 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1710 ldrb ip, [r1, #0x07] /* ip = ...7 */
1711 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */ /* NOTE(review): a store of r1 appears elided between these lines — r1 is overwritten next; confirm against full source */
1713 mov r1, r2, lsr #24 /* r1 = ...2 */
1714 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
1715 mov r3, r3, lsr #24 /* r3 = ...6 */
1716 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
1718 strh r3, [r0, #0x06] /* store bytes 6-7 */
1723 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1727 ldrh r3, [r1, #0x06] /* r3 = ..76 */
1730 strh r3, [r0, #0x06] /* store bytes 6-7 */
1735 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1737 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
1738 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1739 ldrb ip, [r1] /* ip = ...0 */
1740 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
1741 strh r1, [r0, #0x06] /* store bytes 6-7 */
1742 mov r3, r3, lsl #24 /* r3 = 5... */
1743 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
1744 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
1751 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1753 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1754 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1755 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
1756 strh r1, [r0, #0x05] /* store bytes 5-6 */
1758 mov r1, r3, lsr #24 /* r1 = ...7 */
1759 strb r1, [r0, #0x07] /* store byte 7 */
1760 mov r2, r2, lsr #8 /* r2 = .321 */
1761 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
1767 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1769 ldrb r3, [r1] /* r3 = ...0 */
1770 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
1771 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
1772 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1774 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
1775 strh r3, [r0, #0x05] /* store bytes 5-6 */
1776 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
1778 strb r1, [r0, #0x07] /* store byte 7 */
1783 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1785 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1786 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1787 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1789 mov ip, r2, lsr #8 /* ip = ...1 */
1790 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1791 mov r2, r1, lsr #8 /* r2 = ...7 */
1792 strb r2, [r0, #0x07] /* store byte 7 */
1793 mov r1, r1, lsl #8 /* r1 = .76. */
1794 orr r1, r1, r3, lsr #24 /* r1 = .765 */
1796 strh r1, [r0, #0x05] /* store bytes 5-6 */
1801 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1805 ldrh r3, [r1, #0x05] /* r3 = ..65 */
1806 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1809 strh r3, [r0, #0x05] /* store bytes 5-6 */
1810 strb r1, [r0, #0x07] /* store byte 7 */
1814 /******************************************************************************
1815 * Special case for 12 byte copies
1817 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
1818 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2 /* pad each case block out to 128 bytes */
1822 orr r2, r2, r0, lsl #2 /* NOTE(review): presumably builds the 4-bit case index from dst (high 2 bits) and src (low 2 bits) alignment — confirm against full source */
1825 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2 /* jump into 128-byte-spaced case table; case 0 (all aligned) falls through */
1828 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1840 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1842 ldrb r2, [r1, #0xb] /* r2 = ...B */
1843 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1844 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1845 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */ /* src-1 is word-aligned: src&3==1; x = don't-care byte */
1846 mov r2, r2, lsl #24 /* r2 = B... */
1847 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
1849 mov r2, ip, lsl #24 /* r2 = 7... */ /* NOTE(review): a store of r2 (=BA98) appears elided before this line — r2 is overwritten here; confirm against full source */
1850 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
1851 mov r1, r1, lsr #8 /* r1 = .210 */
1852 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
1859 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1861 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1862 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1863 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1864 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1865 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1867 mov r3, r3, lsr #16 /* r3 = ..54 */
1868 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
1869 mov r1, r1, lsl #16 /* r1 = BA.. */
1870 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
1877 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1879 ldrb r2, [r1] /* r2 = ...0 */
1880 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */ /* src+1 is word-aligned: src&3==3 in this case */
1881 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1882 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */ /* x = don't-care byte beyond the 12-byte copy */
1883 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1885 mov r3, r3, lsr #24 /* r3 = ...4 */
1886 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
1887 mov r1, r1, lsl #8 /* r1 = BA9. */
1888 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
1895 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1897 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1898 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1899 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
1900 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1901 strh r1, [r0, #0x01] /* store bytes 1-2 */
1903 mov r1, r2, lsr #24 /* r1 = ...3 */
1904 orr r2, r1, r3, lsl #8 /* r2 = 6543 */
1905 mov r1, r3, lsr #24 /* r1 = ...7 */
1906 orr r1, r1, ip, lsl #8 /* r1 = A987 */
1907 mov ip, ip, lsr #24 /* ip = ...B */
1910 strb ip, [r0, #0x0b] /* store byte 11 */
1915 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1918 ldrh r3, [r1, #0x01] /* r3 = ..21 (src+1 is 16-bit aligned here) */
1922 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1923 strh r3, [r0, #0x01] /* store bytes 1-2 */
1926 strb r1, [r0, #0x0b] /* store byte 11 */
1931 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1933 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1934 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1935 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1936 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1938 mov r2, r2, lsr #8 /* r2 = ...1 */
1939 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
1940 strh r2, [r0, #0x01] /* store bytes 1-2 */
1941 mov r2, r3, lsr #8 /* r2 = .543 */
1942 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
1943 mov r2, ip, lsr #8 /* r2 = .987 */
1944 orr r2, r2, r1, lsl #24 /* r2 = A987 */
1945 mov r1, r1, lsr #8 /* r1 = ...B */
1948 strb r1, [r0, #0x0b] /* store byte 11 */
1953 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1956 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1957 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1958 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1960 strh r3, [r0, #0x01] /* store bytes 1-2 */
1961 mov r3, r3, lsr #16 /* r3 = ..43 */
1962 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
1963 mov ip, ip, lsr #16 /* ip = ..87 */
1964 orr ip, ip, r1, lsl #16 /* ip = A987 */
1965 mov r1, r1, lsr #16 /* r1 = ..xB */
1968 strb r1, [r0, #0x0b] /* store byte 11 */
1973 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1975 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
1976 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1977 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
1978 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1980 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
1981 mov r3, r3, lsr #16 /* r3 = ..76 */
1982 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
1983 mov r2, r2, lsr #16 /* r2 = ..BA */
1986 strh r2, [r0, #0x0a] /* store bytes 10-11 */
1991 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1993 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1994 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1995 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */ /* NOTE(review): a store of ip appears elided before the reload below — ip is overwritten next; confirm against full source */
1997 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1998 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1999 mov r2, r2, lsr #24 /* r2 = ...2 */
2000 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
2001 mov r3, r3, lsr #24 /* r3 = ...6 */
2002 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
2003 mov r1, r1, lsl #8 /* r1 = ..B. */
2004 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
2007 strh r1, [r0, #0x0a] /* store bytes 10-11 */
2012 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2017 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2021 strh r1, [r0, #0x0a] /* store bytes 10-11 */
2026 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2028 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
2029 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
2030 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
2031 strh ip, [r0, #0x0a] /* store bytes 10-11 */
2032 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
2033 ldrb r1, [r1] /* r1 = ...0 */
2034 mov r2, r2, lsl #24 /* r2 = 9... */
2035 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
2036 mov r3, r3, lsl #24 /* r3 = 5... */
2037 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
2038 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
2046 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2048 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2049 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
2050 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
2052 mov r3, r2, lsr #8 /* r3 = .321 */
2053 orr r3, r3, ip, lsl #24 /* r3 = 4321 */ /* NOTE(review): a store of r3 appears elided before the next line — r3 is overwritten; confirm against full source */
2055 mov r3, ip, lsr #8 /* r3 = .765 */
2056 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
2058 mov r1, r1, lsr #8 /* r1 = .BA9 */
2059 strh r1, [r0, #0x09] /* store bytes 9-10 */
2060 mov r1, r1, lsr #16 /* r1 = ...B */
2061 strb r1, [r0, #0x0b] /* store byte 11 */
2066 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2068 ldrb r2, [r1, #0x0b] /* r2 = ...B */
2069 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
2070 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2071 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2072 strb r2, [r0, #0x0b] /* store byte 11 */
2073 mov r2, r3, lsr #16 /* r2 = ..A9 */
2074 strh r2, [r0, #0x09] /* store bytes 9-10 */
2075 mov r3, r3, lsl #16 /* r3 = 87.. */
2076 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
2077 mov ip, ip, lsl #16 /* ip = 43.. */
2078 orr ip, ip, r1, lsr #16 /* ip = 4321 */
2079 mov r1, r1, lsr #8 /* r1 = .210 */
2087 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2089 ldrh r2, [r1] /* r2 = ..10 */
2090 ldr r3, [r1, #0x02] /* r3 = 5432 */
2091 ldr ip, [r1, #0x06] /* ip = 9876 */
2092 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2094 mov r2, r2, lsr #8 /* r2 = ...1 */
2095 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2096 mov r3, r3, lsr #24 /* r3 = ...5 */
2097 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
2098 mov ip, ip, lsr #24 /* ip = ...9 */
2099 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
2100 mov r1, r1, lsr #8 /* r1 = ...B */
2103 strh ip, [r0, #0x09] /* store bytes 9-10 */
2104 strb r1, [r0, #0x0b] /* store byte 11 */
2109 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2115 ldrh r2, [r1, #0x09] /* r2 = ..A9 */
2116 ldrb r1, [r1, #0x0b] /* r1 = ...B */
2119 strh r2, [r0, #0x09] /* store bytes 9-10 */
2120 strb r1, [r0, #0x0b] /* store byte 11 */