 * Copyright (c) 2004 Olivier Houchard
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.

 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

 * memset: Sets a block of memory to the specified value
 * r2 - number of bytes to write
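 *
 * A rough sketch of the strategy below (illustrative, not the exact
 * instruction sequence): replicate the fill byte into all four byte
 * lanes of a word (v |= v << 8; v |= v << 16), word- and then
 * doubleword-align the destination with small stores, blast out 128,
 * 32 and 16 bytes at a time with strd, and mop up the remainder with
 * word and byte stores.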
/* LINTSTUB: Func: void bzero(void *, size_t) */
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
	and r3, r1, #0xff /* We deal with bytes */
	cmp r1, #0x04 /* Do we have less than 4 bytes */
	blt .Lmemset_lessthanfour
	/* Ok first we will word align the address */
	ands r2, ip, #0x03 /* Get the bottom two bits */
	bne .Lmemset_wordunaligned /* The address is not word aligned */
	/* We are now word aligned */
.Lmemset_wordaligned:
	orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
	tst ip, #0x04 /* Quad-align for armv5e */
	orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
	subne r1, r1, #0x04 /* Quad-align if necessary */
	strne r3, [ip], #0x04
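	/*
	 * r3 now holds the fill byte in all four byte lanes and ip is
	 * doubleword aligned, so the loops below can store 8 bytes per
	 * strd.
	 */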
	blt .Lmemset_loop4 /* If less than 16 then use words */
	mov r2, r3 /* Duplicate data */
	cmp r1, #0x80 /* If < 128 then skip the big loop */
	/* Do 128 bytes at a time */
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	RETeq /* Zero length so just exit */
	add r1, r1, #0x80 /* Adjust for extra sub */
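	/*
	 * Loop-control idiom used throughout this file: the block size
	 * is subtracted at the top of each pass and every store is
	 * conditional on the count staying >= 0 (strdge); once the
	 * subtraction goes negative, the block size is added back to
	 * recover the true remainder for the smaller loops that follow.
	 */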
	/* Do 32 bytes at a time */
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	RETeq /* Zero length so just exit */
	adds r1, r1, #0x10 /* Partially adjust for extra sub */
	/* Deal with 16 bytes or more */
	strdge r2, [ip], #0x08
	strdge r2, [ip], #0x08
	RETeq /* Zero length so just exit */
	addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
	/* We have at least 4 bytes so copy as words */
	strge r3, [ip], #0x04
	RETeq /* Zero length so just exit */
	/* Compensate for 64-bit alignment check */
	strb r3, [ip], #0x01 /* Set 1 byte */
	strbge r3, [ip], #0x01 /* Set another byte */
	strbgt r3, [ip] /* and a third */
.Lmemset_wordunaligned:
	strb r3, [ip], #0x01 /* Set 1 byte */
	strbge r3, [ip], #0x01 /* Set another byte */
	strbgt r3, [ip], #0x01 /* and a third */
	cmp r1, #0x04 /* At least 4 bytes left? */
	bge .Lmemset_wordaligned /* Yup */
.Lmemset_lessthanfour:
	RETeq /* Zero length so exit */
	strb r3, [ip], #0x01 /* Set 1 byte */
	strbge r3, [ip], #0x01 /* Set another byte */
	strbgt r3, [ip] /* and a third */
	/* Are both addresses aligned the same way? */
	RETeq /* len == 0, or same addresses! */
	bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
	/* Word-align the addresses, if necessary */
	add r3, r3, r3, lsl #1
	addne pc, pc, r3, lsl #3
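	/*
	 * Computed branch: r3 was scaled by three above and is shifted
	 * left by three here (3 * 8 = 24), so each alignment fix-up
	 * stanza below is assumed to be exactly 24 bytes, i.e. six
	 * instructions, long.
	 */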
	/* Compare up to 3 bytes */
	/* Compare up to 2 bytes */
	/* Compare 4 bytes at a time, if possible */
	bcc .Lmemcmp_bytewise
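	/*
	 * Word-at-a-time compare: whole words are compared until they
	 * differ or the count runs out, and only the first differing
	 * word is re-read byte by byte to produce the signed result.
	 */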
.Lmemcmp_word_aligned:
	beq .Lmemcmp_word_aligned
	/* Correct for extra subtraction, and check if done */
	cmpeq r0, #0x00 /* If done, did all bytes match? */
	RETeq /* Yup. Just return */
	/* Re-do the final word byte-wise */
	beq .Lmemcmp_bytewise2

 * 6 byte compares are very common, thanks to the network stack.
 * This code is hand-scheduled to reduce the number of stalls for
 * load results. Everything else being equal, this will be ~32%
 * faster than a byte-wise memcmp.
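 *
 * In plain C (illustrative), the scheduled code below computes:
 *
 *	for (i = 0; i < 6; i++)
 *		if ((r = b1[i] - b2[i]) != 0)
 *			return (r);
 *	return (0);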
	ldrb r3, [r1, #0x00] /* r3 = b2#0 */
	ldrb r0, [ip, #0x00] /* r0 = b1#0 */
	ldrb r2, [r1, #0x01] /* r2 = b2#1 */
	subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
	ldrbeq r3, [ip, #0x01] /* r3 = b1#1 */
	RETne /* Return if mismatch on #0 */
	subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
	ldrbeq r3, [r1, #0x02] /* r3 = b2#2 */
	ldrbeq r0, [ip, #0x02] /* r0 = b1#2 */
	RETne /* Return if mismatch on #1 */
	ldrb r2, [r1, #0x03] /* r2 = b2#3 */
	subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
	ldrbeq r3, [ip, #0x03] /* r3 = b1#3 */
	RETne /* Return if mismatch on #2 */
	subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
	ldrbeq r3, [r1, #0x04] /* r3 = b2#4 */
	ldrbeq r0, [ip, #0x04] /* r0 = b1#4 */
	RETne /* Return if mismatch on #3 */
	ldrb r2, [r1, #0x05] /* r2 = b2#5 */
	subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
	ldrbeq r3, [ip, #0x05] /* r3 = b1#5 */
	RETne /* Return if mismatch on #4 */
	sub r0, r3, r2 /* r0 = b1#5 - b2#5 */

	/* Do the buffers overlap? */
	RETeq /* Bail now if src/dst are the same */
	subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
	subcs r3, r1, r0 /* if (src > dst) r3 = src - dst */
	cmp r3, r2 /* if (r3 < len) we have an overlap */
	bcc PIC_SYM(_C_LABEL(memcpy), PLT)
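	/*
	 * In C terms (illustrative): identical pointers mean there is
	 * nothing to do, and otherwise the copy direction is chosen so
	 * that an overlap is never clobbered:
	 *
	 *	if (dst == src)
	 *		return (dst);
	 *	if (src < dst)
	 *		copy backwards;
	 *	else
	 *		copy forwards;
	 */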
	/* Determine copy direction */
	bcc .Lmemmove_backwards
	moveq r0, #0 /* Quick abort for len=0 */
	stmdb sp!, {r0, lr} /* memmove() returns dest addr */
	blt .Lmemmove_fl4 /* less than 4 bytes */
	bne .Lmemmove_fdestul /* oh unaligned destination addr */
	bne .Lmemmove_fsrcul /* oh unaligned source addr */
	/* We have aligned source and destination */
	blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
	blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
	stmdb sp!, {r4} /* borrow r4 */
	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	ldmia r1!, {r3, r4, r12, lr}
	stmia r0!, {r3, r4, r12, lr}
	ldmia r1!, {r3, r4, r12, lr}
	stmia r0!, {r3, r4, r12, lr}
	bge .Lmemmove_floop32
	ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
	stmiage r0!, {r3, r4, r12, lr}
	ldmia sp!, {r4} /* return r4 */
	/* blat 12 bytes at a time */
	ldmiage r1!, {r3, r12, lr}
	stmiage r0!, {r3, r12, lr}
	bge .Lmemmove_floop12
	ldmiage r1!, {r3, r12}
	stmiage r0!, {r3, r12}
	/* less than 4 bytes to go */
	ldmiaeq sp!, {r0, pc} /* done */
	/* copy the crud byte at a time */
	/* erg - unaligned destination */
	/* align destination with byte copies */
	blt .Lmemmove_fl4 /* less than 4 bytes */
	beq .Lmemmove_ft8 /* we have an aligned source */
	/* erg - unaligned source */
	/* This is where it gets nasty ... */
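	/*
	 * The source words are re-aligned in registers: the leftover
	 * bytes of the previous word are merged with the next word
	 * using a pair of shifts.  For a 1-byte misalignment on a
	 * little-endian CPU each output word is roughly
	 *
	 *	out = (prev >> 8) | (next << 24);
	 *
	 * and the three cases below differ only in the shift amounts
	 * (8/24, 16/16 and 24/8).
	 */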
	bgt .Lmemmove_fsrcul3
	beq .Lmemmove_fsrcul2
	blt .Lmemmove_fsrcul1loop4
.Lmemmove_fsrcul1loop16:
	ldmia r1!, {r4, r5, r12, lr}
	orr r3, r3, r4, lsl #24
	orr r4, r4, r5, lsl #24
	orr r5, r5, r12, lsl #24
	orr r12, r12, lr, lsl #24
	stmia r0!, {r3-r5, r12}
	bge .Lmemmove_fsrcul1loop16
	blt .Lmemmove_fsrcul1l4
.Lmemmove_fsrcul1loop4:
	orr r12, r12, lr, lsl #24
	bge .Lmemmove_fsrcul1loop4
	blt .Lmemmove_fsrcul2loop4
.Lmemmove_fsrcul2loop16:
	ldmia r1!, {r4, r5, r12, lr}
	orr r3, r3, r4, lsl #16
	orr r4, r4, r5, lsl #16
	orr r5, r5, r12, lsl #16
	mov r12, r12, lsr #16
	orr r12, r12, lr, lsl #16
	stmia r0!, {r3-r5, r12}
	bge .Lmemmove_fsrcul2loop16
	blt .Lmemmove_fsrcul2l4
.Lmemmove_fsrcul2loop4:
	orr r12, r12, lr, lsl #16
	bge .Lmemmove_fsrcul2loop4
	blt .Lmemmove_fsrcul3loop4
.Lmemmove_fsrcul3loop16:
	ldmia r1!, {r4, r5, r12, lr}
	orr r3, r3, r4, lsl #8
	orr r4, r4, r5, lsl #8
	orr r5, r5, r12, lsl #8
	mov r12, r12, lsr #24
	orr r12, r12, lr, lsl #8
	stmia r0!, {r3-r5, r12}
	bge .Lmemmove_fsrcul3loop16
	blt .Lmemmove_fsrcul3l4
.Lmemmove_fsrcul3loop4:
	orr r12, r12, lr, lsl #8
	bge .Lmemmove_fsrcul3loop4
	blt .Lmemmove_bl4 /* less than 4 bytes */
	bne .Lmemmove_bdestul /* oh unaligned destination addr */
	bne .Lmemmove_bsrcul /* oh unaligned source addr */
	/* We have aligned source and destination */
	blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
	subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	ldmdb r1!, {r3, r4, r12, lr}
	stmdb r0!, {r3, r4, r12, lr}
	ldmdb r1!, {r3, r4, r12, lr}
	stmdb r0!, {r3, r4, r12, lr}
	bge .Lmemmove_bloop32
	ldmdbge r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
	stmdbge r0!, {r3, r4, r12, lr}
	ldmdbge r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
	stmdbge r0!, {r3, r12, lr}
	ldmdbge r1!, {r3, r12}
	stmdbge r0!, {r3, r12}
	/* less than 4 bytes to go */
	/* copy the crud byte at a time */
	ldrbge r3, [r1, #-1]!
	strbge r3, [r0, #-1]!
	ldrbgt r3, [r1, #-1]!
	strbgt r3, [r0, #-1]!
	/* erg - unaligned destination */
	/* align destination with byte copies */
	ldrbge r3, [r1, #-1]!
	strbge r3, [r0, #-1]!
	ldrbgt r3, [r1, #-1]!
	strbgt r3, [r0, #-1]!
	blt .Lmemmove_bl4 /* less than 4 bytes to go */
	beq .Lmemmove_bt8 /* we have an aligned source */
	/* erg - unaligned source */
	/* This is where it gets nasty ... */
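	/*
	 * Mirror image of the forward unaligned cases above: the copy
	 * walks down through memory with ldmdb/stmdb and the shift
	 * pairs are reversed, lsl keeping the bytes already gathered
	 * and lsr merging in the next lower word.
	 */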
	blt .Lmemmove_bsrcul1
	beq .Lmemmove_bsrcul2
	blt .Lmemmove_bsrcul3loop4
	stmdb sp!, {r4, r5, lr}
.Lmemmove_bsrcul3loop16:
	ldmdb r1!, {r3-r5, r12}
	orr lr, lr, r12, lsr #24
	orr r12, r12, r5, lsr #24
	orr r5, r5, r4, lsr #24
	orr r4, r4, r3, lsr #24
	stmdb r0!, {r4, r5, r12, lr}
	bge .Lmemmove_bsrcul3loop16
	ldmia sp!, {r4, r5, lr}
	blt .Lmemmove_bsrcul3l4
.Lmemmove_bsrcul3loop4:
	orr r12, r12, r3, lsr #24
	bge .Lmemmove_bsrcul3loop4
	blt .Lmemmove_bsrcul2loop4
	stmdb sp!, {r4, r5, lr}
.Lmemmove_bsrcul2loop16:
	ldmdb r1!, {r3-r5, r12}
	orr lr, lr, r12, lsr #16
	mov r12, r12, lsl #16
	orr r12, r12, r5, lsr #16
	orr r5, r5, r4, lsr #16
	orr r4, r4, r3, lsr #16
	stmdb r0!, {r4, r5, r12, lr}
	bge .Lmemmove_bsrcul2loop16
	ldmia sp!, {r4, r5, lr}
	blt .Lmemmove_bsrcul2l4
.Lmemmove_bsrcul2loop4:
	orr r12, r12, r3, lsr #16
	bge .Lmemmove_bsrcul2loop4
	blt .Lmemmove_bsrcul1loop4
	stmdb sp!, {r4, r5, lr}
.Lmemmove_bsrcul1loop32:
	ldmdb r1!, {r3-r5, r12}
	orr lr, lr, r12, lsr #8
	mov r12, r12, lsl #24
	orr r12, r12, r5, lsr #8
	orr r5, r5, r4, lsr #8
	orr r4, r4, r3, lsr #8
	stmdb r0!, {r4, r5, r12, lr}
	bge .Lmemmove_bsrcul1loop32
	ldmia sp!, {r4, r5, lr}
	blt .Lmemmove_bsrcul1l4
.Lmemmove_bsrcul1loop4:
	orr r12, r12, r3, lsr #8
	bge .Lmemmove_bsrcul1loop4

/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
	ble .Lmemcpy_short /* <= 12 bytes */
#if FLASHADDR > PHYSADDR
	mov r3, r0 /* We must not clobber r0 */
	/* Word-align the destination buffer */
	ands ip, r3, #0x03 /* Already word aligned? */
	beq .Lmemcpy_wordaligned /* Yup */
	ldrble ip, [r1], #0x01
	strble ip, [r3], #0x01
	ldrblt ip, [r1], #0x01
	strblt ip, [r3], #0x01
	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands ip, r1, #0x03 /* Is src also word-aligned? */
	bne .Lmemcpy_bad_align /* Nope. Things just got bad */
	/* Quad-align the destination buffer */
	tst r3, #0x07 /* Already quad aligned? */
	ldrne ip, [r1], #0x04
	stmfd sp!, {r4-r9} /* Free up some registers */
	strne ip, [r3], #0x04
	/* Destination buffer quad aligned, source is at least word aligned */
	blt .Lmemcpy_w_lessthan128
	/* Copy 128 bytes at a time */
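	/*
	 * The loop below is software-pipelined: each strd writes a
	 * register pair that was loaded several instructions earlier,
	 * hiding the ldr latency, while the pld hints keep the next
	 * source cache lines on the way.
	 */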
	ldr r4, [r1], #0x04 /* LD:00-03 */
	ldr r5, [r1], #0x04 /* LD:04-07 */
	pld [r1, #0x18] /* Prefetch 0x20 */
	ldr r6, [r1], #0x04 /* LD:08-0b */
	ldr r7, [r1], #0x04 /* LD:0c-0f */
	ldr r8, [r1], #0x04 /* LD:10-13 */
	ldr r9, [r1], #0x04 /* LD:14-17 */
	strd r4, [r3], #0x08 /* ST:00-07 */
	ldr r4, [r1], #0x04 /* LD:18-1b */
	ldr r5, [r1], #0x04 /* LD:1c-1f */
	strd r6, [r3], #0x08 /* ST:08-0f */
	ldr r6, [r1], #0x04 /* LD:20-23 */
	ldr r7, [r1], #0x04 /* LD:24-27 */
	pld [r1, #0x18] /* Prefetch 0x40 */
	strd r8, [r3], #0x08 /* ST:10-17 */
	ldr r8, [r1], #0x04 /* LD:28-2b */
	ldr r9, [r1], #0x04 /* LD:2c-2f */
	strd r4, [r3], #0x08 /* ST:18-1f */
	ldr r4, [r1], #0x04 /* LD:30-33 */
	ldr r5, [r1], #0x04 /* LD:34-37 */
	strd r6, [r3], #0x08 /* ST:20-27 */
	ldr r6, [r1], #0x04 /* LD:38-3b */
	ldr r7, [r1], #0x04 /* LD:3c-3f */
	strd r8, [r3], #0x08 /* ST:28-2f */
	ldr r8, [r1], #0x04 /* LD:40-43 */
	ldr r9, [r1], #0x04 /* LD:44-47 */
	pld [r1, #0x18] /* Prefetch 0x60 */
	strd r4, [r3], #0x08 /* ST:30-37 */
	ldr r4, [r1], #0x04 /* LD:48-4b */
	ldr r5, [r1], #0x04 /* LD:4c-4f */
	strd r6, [r3], #0x08 /* ST:38-3f */
	ldr r6, [r1], #0x04 /* LD:50-53 */
	ldr r7, [r1], #0x04 /* LD:54-57 */
	strd r8, [r3], #0x08 /* ST:40-47 */
	ldr r8, [r1], #0x04 /* LD:58-5b */
	ldr r9, [r1], #0x04 /* LD:5c-5f */
	strd r4, [r3], #0x08 /* ST:48-4f */
	ldr r4, [r1], #0x04 /* LD:60-63 */
	ldr r5, [r1], #0x04 /* LD:64-67 */
	pld [r1, #0x18] /* Prefetch 0x80 */
	strd r6, [r3], #0x08 /* ST:50-57 */
	ldr r6, [r1], #0x04 /* LD:68-6b */
	ldr r7, [r1], #0x04 /* LD:6c-6f */
	strd r8, [r3], #0x08 /* ST:58-5f */
	ldr r8, [r1], #0x04 /* LD:70-73 */
	ldr r9, [r1], #0x04 /* LD:74-77 */
	strd r4, [r3], #0x08 /* ST:60-67 */
	ldr r4, [r1], #0x04 /* LD:78-7b */
	ldr r5, [r1], #0x04 /* LD:7c-7f */
	strd r6, [r3], #0x08 /* ST:68-6f */
	strd r8, [r3], #0x08 /* ST:70-77 */
	strd r4, [r3], #0x08 /* ST:78-7f */
	bge .Lmemcpy_w_loop128
.Lmemcpy_w_lessthan128:
	adds r2, r2, #0x80 /* Adjust for extra sub */
	RETeq /* Return now if done */
	blt .Lmemcpy_w_lessthan32
	/* Copy 32 bytes at a time */
	bge .Lmemcpy_w_loop32
.Lmemcpy_w_lessthan32:
	adds r2, r2, #0x20 /* Adjust for extra sub */
	RETeq /* Return now if done */
	addne pc, pc, r4, lsl #1
	/* At least 24 bytes remaining */
	/* At least 16 bytes remaining */
	/* At least 8 bytes remaining */
	/* Less than 8 bytes remaining */
	RETeq /* Return now if done */
	ldrge ip, [r1], #0x04
	strge ip, [r3], #0x04
	RETeq /* Return now if done */
	ldrbge r2, [r1], #0x01
	strbge r2, [r3], #0x01
	/* Place a literal pool here for the above ldr instructions to use */

 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
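 *
 * The loops below use the same shift-and-merge scheme as the unaligned
 * memmove() cases above, unrolled to produce 16 bytes per iteration.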
.Lmemcpy_bad1_loop16:
	orr r4, r4, r5, lsl #24
	orr r5, r5, r6, lsl #24
	orr r6, r6, r7, lsl #24
	orr r7, r7, ip, lsl #24
	bge .Lmemcpy_bad1_loop16
	RETeq /* Return now if done */
	blt .Lmemcpy_bad_done
	orr r4, r4, ip, lsl #24
	bge .Lmemcpy_bad1_loop4
.Lmemcpy_bad2_loop16:
	orr r4, r4, r5, lsl #16
	orr r5, r5, r6, lsl #16
	orr r6, r6, r7, lsl #16
	orr r7, r7, ip, lsl #16
	bge .Lmemcpy_bad2_loop16
	RETeq /* Return now if done */
	blt .Lmemcpy_bad_done
	orr r4, r4, ip, lsl #16
	bge .Lmemcpy_bad2_loop4
.Lmemcpy_bad3_loop16:
	orr r4, r4, r5, lsl #8
	orr r5, r5, r6, lsl #8
	orr r6, r6, r7, lsl #8
	orr r7, r7, ip, lsl #8
	bge .Lmemcpy_bad3_loop16
	ldmfdeq sp!, {r4-r7}
	RETeq /* Return now if done */
	blt .Lmemcpy_bad_done
.Lmemcpy_bad3_loop4:
	orr r4, r4, ip, lsl #8
	bge .Lmemcpy_bad3_loop4
	ldrb ip, [r1], #0x01
	ldrbge r2, [r1], #0x01
	strb ip, [r3], #0x01
	strbge r2, [r3], #0x01

 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
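 *
 * The computed branch that follows indexes a table holding one branch
 * per length: reading pc yields the address two words ahead, so
 * "add pc, pc, r2, lsl #2" lands on table entry r2.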
	add pc, pc, r2, lsl #2
	b .Lmemcpy_bytewise /* 0x01 */
	b .Lmemcpy_bytewise /* 0x02 */
	b .Lmemcpy_bytewise /* 0x03 */
	b .Lmemcpy_4 /* 0x04 */
	b .Lmemcpy_bytewise /* 0x05 */
	b .Lmemcpy_6 /* 0x06 */
	b .Lmemcpy_bytewise /* 0x07 */
	b .Lmemcpy_8 /* 0x08 */
	b .Lmemcpy_bytewise /* 0x09 */
	b .Lmemcpy_bytewise /* 0x0a */
	b .Lmemcpy_bytewise /* 0x0b */
	b .Lmemcpy_c /* 0x0c */
	mov r3, r0 /* We must not clobber r0 */
	ldrb ip, [r1], #0x01
1:	subs r2, r2, #0x01
	strb ip, [r3], #0x01
	ldrbne ip, [r1], #0x01

/******************************************************************************
 * Special case for 4 byte copies
#define LMEMCPY_4_LOG2 6 /* 64 bytes */
#define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
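/*
 * Dispatch on the alignment of both pointers: the jump index is
 * ((dst & 3) << 2) | (src & 3), and each of the 16 cases is padded to
 * 1 << LMEMCPY_4_LOG2 bytes, so the target is simply
 * base + (index << LMEMCPY_4_LOG2).  The 6, 8 and 12 byte copies
 * below use the same scheme.
 */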
	orr r2, r2, r0, lsl #2
	addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
 * 0000: dst is 32-bit aligned, src is 32-bit aligned

 * 0001: dst is 32-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
	ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
	mov r3, r3, lsr #8 /* r3 = .210 */
	orr r3, r3, r2, lsl #24 /* r3 = 3210 */

 * 0010: dst is 32-bit aligned, src is 16-bit aligned
	ldrh r3, [r1, #0x02]
	orr r3, r2, r3, lsl #16

 * 0011: dst is 32-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
	ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
	mov r3, r3, lsr #24 /* r3 = ...0 */
	orr r3, r3, r2, lsl #8 /* r3 = 3210 */

 * 0100: dst is 8-bit aligned, src is 32-bit aligned
	strb r1, [r0, #0x03]
	strh r3, [r0, #0x01]

 * 0101: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x03]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x03]

 * 0110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
	mov r2, r2, lsr #8 /* r2 = ...1 */
	orr r2, r2, r3, lsl #8 /* r2 = .321 */
	mov r3, r3, lsr #8 /* r3 = ...3 */
	strh r2, [r0, #0x01]
	strb r3, [r0, #0x03]

 * 0111: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x03]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x03]

 * 1000: dst is 16-bit aligned, src is 32-bit aligned
	strh r3, [r0, #0x02]

 * 1001: dst is 16-bit aligned, src is 8-bit aligned
	ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
	ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
	mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
	mov r2, r2, lsr #24 /* r2 = ...2 */
	orr r2, r2, r3, lsl #8 /* r2 = xx32 */
	strh r2, [r0, #0x02]

 * 1010: dst is 16-bit aligned, src is 16-bit aligned
	ldrh r3, [r1, #0x02]
	strh r3, [r0, #0x02]

 * 1011: dst is 16-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
	ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
	mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
	strh r1, [r0, #0x02]
	mov r3, r3, lsl #8 /* r3 = 321. */
	orr r3, r3, r2, lsr #24 /* r3 = 3210 */

 * 1100: dst is 8-bit aligned, src is 32-bit aligned
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x03]

 * 1101: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x03]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x03]

 * 1110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
	mov r2, r2, lsr #8 /* r2 = ...1 */
	orr r2, r2, r3, lsl #8 /* r2 = .321 */
	strh r2, [r0, #0x01]
	mov r3, r3, lsr #8 /* r3 = ...3 */
	strb r3, [r0, #0x03]

 * 1111: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x03]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x03]

/******************************************************************************
 * Special case for 6 byte copies
#define LMEMCPY_6_LOG2 6 /* 64 bytes */
#define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
	orr r2, r2, r0, lsl #2
	addne pc, r3, r2, lsl #LMEMCPY_6_LOG2

 * 0000: dst is 32-bit aligned, src is 32-bit aligned
	ldrh r3, [r1, #0x04]
	strh r3, [r0, #0x04]

 * 0001: dst is 32-bit aligned, src is 8-bit aligned
	ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
	ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
	mov r2, r2, lsr #8 /* r2 = .210 */
	orr r2, r2, r3, lsl #24 /* r2 = 3210 */
	mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
	strh r3, [r0, #0x04]

 * 0010: dst is 32-bit aligned, src is 16-bit aligned
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	mov r1, r3, lsr #16 /* r1 = ..54 */
	orr r2, r2, r3, lsl #16 /* r2 = 3210 */
	strh r1, [r0, #0x04]

 * 0011: dst is 32-bit aligned, src is 8-bit aligned
	ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
	ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
	ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r1 = xxx5 */
	mov r2, r2, lsr #24 /* r2 = ...0 */
	orr r2, r2, r3, lsl #8 /* r2 = 3210 */
	mov r1, r1, lsl #8 /* r1 = xx5. */
	orr r1, r1, r3, lsr #24 /* r1 = xx54 */
	strh r1, [r0, #0x04]

 * 0100: dst is 8-bit aligned, src is 32-bit aligned
	ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
	ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
	mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
	strh r1, [r0, #0x01]
	mov r3, r3, lsr #24 /* r3 = ...3 */
	orr r3, r3, r2, lsl #8 /* r3 = .543 */
	mov r2, r2, lsr #8 /* r2 = ...5 */
	strh r3, [r0, #0x03]
	strb r2, [r0, #0x05]

 * 0101: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrh ip, [r1, #0x03]
	ldrb r1, [r1, #0x05]
	strh r3, [r0, #0x01]
	strh ip, [r0, #0x03]
	strb r1, [r0, #0x05]

 * 0110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
	strb r3, [r0, #0x05]
	mov r3, r1, lsr #8 /* r3 = .543 */
	strh r3, [r0, #0x03]
	mov r3, r2, lsr #8 /* r3 = ...1 */
	orr r3, r3, r1, lsl #8 /* r3 = 4321 */
	strh r3, [r0, #0x01]

 * 0111: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrh ip, [r1, #0x03]
	ldrb r1, [r1, #0x05]
	strh r3, [r0, #0x01]
	strh ip, [r0, #0x03]
	strb r1, [r0, #0x05]

 * 1000: dst is 16-bit aligned, src is 32-bit aligned
	ldrh r2, [r1, #0x04] /* r2 = ..54 */
	ldr r3, [r1] /* r3 = 3210 */
	mov r2, r2, lsl #16 /* r2 = 54.. */
	orr r2, r2, r3, lsr #16 /* r2 = 5432 */

 * 1001: dst is 16-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
	ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
	mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
	mov r2, r2, lsl #8 /* r2 = 543. */
	orr r2, r2, r3, lsr #24 /* r2 = 5432 */

 * 1010: dst is 16-bit aligned, src is 16-bit aligned

 * 1011: dst is 16-bit aligned, src is 8-bit aligned
	ldrb r3, [r1] /* r3 = ...0 */
	ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
	ldrb r1, [r1, #0x05] /* r1 = ...5 */
	orr r3, r3, r2, lsl #8 /* r3 = 3210 */
	mov r1, r1, lsl #24 /* r1 = 5... */
	orr r1, r1, r2, lsr #8 /* r1 = 5432 */

 * 1100: dst is 8-bit aligned, src is 32-bit aligned
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
	mov r2, r2, lsr #8 /* r2 = .321 */
	orr r2, r2, r1, lsl #24 /* r2 = 4321 */
	mov r1, r1, lsr #8 /* r1 = ...5 */
	strb r1, [r0, #0x05]

 * 1101: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrh ip, [r1, #0x03]
	ldrb r1, [r1, #0x05]
	strh r3, [r0, #0x01]
	strh ip, [r0, #0x03]
	strb r1, [r0, #0x05]

 * 1110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
	mov r2, r2, lsr #8 /* r2 = ...1 */
	orr r2, r2, r1, lsl #8 /* r2 = 4321 */
	mov r1, r1, lsr #24 /* r1 = ...5 */
	strb r1, [r0, #0x05]

 * 1111: dst is 8-bit aligned, src is 8-bit aligned
	ldrb r1, [r1, #0x05]
	strb r1, [r0, #0x05]

/******************************************************************************
 * Special case for 8 byte copies
#define LMEMCPY_8_LOG2 6 /* 64 bytes */
#define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
	orr r2, r2, r0, lsl #2
	addne pc, r3, r2, lsl #LMEMCPY_8_LOG2

 * 0000: dst is 32-bit aligned, src is 32-bit aligned

 * 0001: dst is 32-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
	ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
	ldrb r1, [r1, #0x07] /* r1 = ...7 */
	mov r3, r3, lsr #8 /* r3 = .210 */
	orr r3, r3, r2, lsl #24 /* r3 = 3210 */
	mov r1, r1, lsl #24 /* r1 = 7... */
	orr r2, r1, r2, lsr #8 /* r2 = 7654 */

 * 0010: dst is 32-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
	orr r2, r2, r3, lsl #16 /* r2 = 3210 */
	mov r3, r3, lsr #16 /* r3 = ..54 */
	orr r3, r3, r1, lsl #16 /* r3 = 7654 */

 * 0011: dst is 32-bit aligned, src is 8-bit aligned
	ldrb r3, [r1] /* r3 = ...0 */
	ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
	ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
	orr r3, r3, r2, lsl #8 /* r3 = 3210 */
	mov r2, r2, lsr #24 /* r2 = ...4 */
	orr r2, r2, r1, lsl #8 /* r2 = 7654 */

 * 0100: dst is 8-bit aligned, src is 32-bit aligned
	ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
	ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
	mov r1, r2, lsr #24 /* r1 = ...7 */
	strb r1, [r0, #0x07]
	mov r1, r3, lsr #8 /* r1 = .321 */
	mov r3, r3, lsr #24 /* r3 = ...3 */
	orr r3, r3, r2, lsl #8 /* r3 = 6543 */
	strh r1, [r0, #0x01]

 * 0101: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x07]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x07]

 * 0110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
	strb r2, [r0] /* 0 */
	mov ip, r1, lsr #8 /* ip = ...7 */
	strb ip, [r0, #0x07] /* 7 */
	mov ip, r2, lsr #8 /* ip = ...1 */
	orr ip, ip, r3, lsl #8 /* ip = 4321 */
	mov r3, r3, lsr #8 /* r3 = .543 */
	orr r3, r3, r1, lsl #24 /* r3 = 6543 */
	strh ip, [r0, #0x01]

 * 0111: dst is 8-bit aligned, src is 8-bit aligned
	ldrb r3, [r1] /* r3 = ...0 */
	ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
	ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
	ldrb r1, [r1, #0x07] /* r1 = ...7 */
	mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
	strh ip, [r0, #0x01]
	orr r2, r3, r2, lsl #16 /* r2 = 6543 */
	strb r1, [r0, #0x07]

 * 1000: dst is 16-bit aligned, src is 32-bit aligned
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
	mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
	orr r2, r1, r3, lsl #16 /* r2 = 5432 */
	mov r3, r3, lsr #16 /* r3 = ..76 */
	strh r3, [r0, #0x06]

 * 1001: dst is 16-bit aligned, src is 8-bit aligned
	ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
	ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
	ldrb ip, [r1, #0x07] /* ip = ...7 */
	mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
	mov r1, r2, lsr #24 /* r1 = ...2 */
	orr r1, r1, r3, lsl #8 /* r1 = 5432 */
	mov r3, r3, lsr #24 /* r3 = ...6 */
	orr r3, r3, ip, lsl #8 /* r3 = ..76 */
	strh r3, [r0, #0x06]

 * 1010: dst is 16-bit aligned, src is 16-bit aligned
	ldrh r3, [r1, #0x06]
	strh r3, [r0, #0x06]

 * 1011: dst is 16-bit aligned, src is 8-bit aligned
	ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
	ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
	ldrb ip, [r1] /* ip = ...0 */
	mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
	strh r1, [r0, #0x06]
	mov r3, r3, lsl #24 /* r3 = 5... */
	orr r3, r3, r2, lsr #8 /* r3 = 5432 */
	orr r2, ip, r2, lsl #8 /* r2 = 3210 */

 * 1100: dst is 8-bit aligned, src is 32-bit aligned
	ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
	strh r1, [r0, #0x05]
	mov r1, r3, lsr #24 /* r1 = ...7 */
	strb r1, [r0, #0x07]
	mov r2, r2, lsr #8 /* r2 = .321 */
	orr r2, r2, r3, lsl #24 /* r2 = 4321 */

 * 1101: dst is 8-bit aligned, src is 8-bit aligned
	ldrb r3, [r1] /* r3 = ...0 */
	ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
	ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
	ldrb r1, [r1, #0x07] /* r1 = ...7 */
	mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
	strh r3, [r0, #0x05]
	orr r2, r2, ip, lsl #16 /* r2 = 4321 */
	strb r1, [r0, #0x07]

 * 1110: dst is 8-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
	mov ip, r2, lsr #8 /* ip = ...1 */
	orr ip, ip, r3, lsl #8 /* ip = 4321 */
	mov r2, r1, lsr #8 /* r2 = ...7 */
	strb r2, [r0, #0x07]
	mov r1, r1, lsl #8 /* r1 = .76. */
	orr r1, r1, r3, lsr #24 /* r1 = .765 */
	strh r1, [r0, #0x05]

 * 1111: dst is 8-bit aligned, src is 8-bit aligned
	ldrh r3, [r1, #0x05]
	ldrb r1, [r1, #0x07]
	strh r3, [r0, #0x05]
	strb r1, [r0, #0x07]

/******************************************************************************
 * Special case for 12 byte copies
#define LMEMCPY_C_LOG2 7 /* 128 bytes */
#define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2
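/*
 * Same dispatch as the 4, 6 and 8 byte cases, but the 12 byte copies
 * need more instructions, so each case slot here is 128 bytes rather
 * than 64.
 */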
	orr r2, r2, r0, lsl #2
	addne pc, r3, r2, lsl #LMEMCPY_C_LOG2

 * 0000: dst is 32-bit aligned, src is 32-bit aligned

 * 0001: dst is 32-bit aligned, src is 8-bit aligned
	ldrb r2, [r1, #0xb] /* r2 = ...B */
	ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
	ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
	ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
	mov r2, r2, lsl #24 /* r2 = B... */
	orr r2, r2, ip, lsr #8 /* r2 = BA98 */
	mov r2, ip, lsl #24 /* r2 = 7... */
	orr r2, r2, r3, lsr #8 /* r2 = 7654 */
	mov r1, r1, lsr #8 /* r1 = .210 */
	orr r1, r1, r3, lsl #24 /* r1 = 3210 */

 * 0010: dst is 32-bit aligned, src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
	ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
	orr r2, r2, r3, lsl #16 /* r2 = 3210 */
	mov r3, r3, lsr #16 /* r3 = ..54 */
	orr r3, r3, ip, lsl #16 /* r3 = 7654 */
	mov r1, r1, lsl #16 /* r1 = BA.. */
	orr r1, r1, ip, lsr #16 /* r1 = BA98 */

 * 0011: dst is 32-bit aligned, src is 8-bit aligned
	ldrb r2, [r1] /* r2 = ...0 */
	ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
	ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
	ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
	orr r2, r2, r3, lsl #8 /* r2 = 3210 */
	mov r3, r3, lsr #24 /* r3 = ...4 */
	orr r3, r3, ip, lsl #8 /* r3 = 7654 */
	mov r1, r1, lsl #8 /* r1 = BA9. */
	orr r1, r1, ip, lsr #24 /* r1 = BA98 */

 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
	ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
	mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
	strh r1, [r0, #0x01]
	mov r1, r2, lsr #24 /* r1 = ...3 */
	orr r2, r1, r3, lsl #8 /* r2 = 6543 */
	mov r1, r3, lsr #24 /* r1 = ...7 */
	orr r1, r1, ip, lsl #8 /* r1 = A987 */
	mov ip, ip, lsr #24 /* ip = ...B */
	strb ip, [r0, #0x0b]

 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
	ldrh r3, [r1, #0x01]
	ldrb r1, [r1, #0x0b]
	strh r3, [r0, #0x01]
	strb r1, [r0, #0x0b]

 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
	ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
	ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
	ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
	ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
	mov r2, r2, lsr #8 /* r2 = ...1 */
	orr r2, r2, r3, lsl #8 /* r2 = 4321 */
	strh r2, [r0, #0x01]
	mov r2, r3, lsr #8 /* r2 = .543 */
	orr r3, r2, ip, lsl #24 /* r3 = 6543 */
	mov r2, ip, lsr #8 /* r2 = .987 */
	orr r2, r2, r1, lsl #24 /* r2 = A987 */
	mov r1, r1, lsr #8 /* r1 = ...B */
	strb r1, [r0, #0x0b]

 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
	ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
	ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
	ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
	strh r3, [r0, #0x01]
	mov r3, r3, lsr #16 /* r3 = ..43 */
	orr r3, r3, ip, lsl #16 /* r3 = 6543 */
	mov ip, ip, lsr #16 /* ip = ..87 */
	orr ip, ip, r1, lsl #16 /* ip = A987 */
	mov r1, r1, lsr #16 /* r1 = ..xB */
	strb r1, [r0, #0x0b]

 * 1000: dst is 16-bit aligned, src is 32-bit aligned
	ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
	ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
	ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
	mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
	orr r1, r1, r3, lsl #16 /* r1 = 5432 */
	mov r3, r3, lsr #16 /* r3 = ..76 */
	orr r3, r3, r2, lsl #16 /* r3 = 9876 */
	mov r2, r2, lsr #16 /* r2 = ..BA */
	strh r2, [r0, #0x0a]

 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
	ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
	ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
	mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */
	ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
	ldrb r1, [r1, #0x0b] /* r1 = ...B */
	mov r2, r2, lsr #24 /* r2 = ...2 */
	orr r2, r2, r3, lsl #8 /* r2 = 5432 */
	mov r3, r3, lsr #24 /* r3 = ...6 */
	orr r3, r3, ip, lsl #8 /* r3 = 9876 */
	mov r1, r1, lsl #8 /* r1 = ..B. */
	orr r1, r1, ip, lsr #24 /* r1 = ..BA */
	strh r1, [r0, #0x0a]

 * 1010: dst is 16-bit aligned, src is 16-bit aligned
	ldrh r1, [r1, #0x0a]
	strh r1, [r0, #0x0a]

 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
	ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
	ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
	mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
	strh ip, [r0, #0x0a]
	ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
	ldrb r1, [r1] /* r1 = ...0 */
	mov r2, r2, lsl #24 /* r2 = 9... */
	orr r2, r2, r3, lsr #8 /* r2 = 9876 */
	mov r3, r3, lsl #24 /* r3 = 5... */
	orr r3, r3, ip, lsr #8 /* r3 = 5432 */
	orr r1, r1, ip, lsl #8 /* r1 = 3210 */

 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
	ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
	ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
	ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
	mov r3, r2, lsr #8 /* r3 = .321 */
	orr r3, r3, ip, lsl #24 /* r3 = 4321 */
	mov r3, ip, lsr #8 /* r3 = .765 */
	orr r3, r3, r1, lsl #24 /* r3 = 8765 */
	mov r1, r1, lsr #8 /* r1 = .BA9 */
	strh r1, [r0, #0x09]
	mov r1, r1, lsr #16 /* r1 = ...B */
	strb r1, [r0, #0x0b]

 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
	ldrb r2, [r1, #0x0b] /* r2 = ...B */
	ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
	ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
	ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
	strb r2, [r0, #0x0b]
	mov r2, r3, lsr #16 /* r2 = ..A9 */
	strh r2, [r0, #0x09]
	mov r3, r3, lsl #16 /* r3 = 87.. */
	orr r3, r3, ip, lsr #16 /* r3 = 8765 */
	mov ip, ip, lsl #16 /* ip = 43.. */
	orr ip, ip, r1, lsr #16 /* ip = 4321 */
	mov r1, r1, lsr #8 /* r1 = .210 */

 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
	ldrh r2, [r1] /* r2 = ..10 */
	ldr r3, [r1, #0x02] /* r3 = 5432 */
	ldr ip, [r1, #0x06] /* ip = 9876 */
	ldrh r1, [r1, #0x0a] /* r1 = ..BA */
	mov r2, r2, lsr #8 /* r2 = ...1 */
	orr r2, r2, r3, lsl #8 /* r2 = 4321 */
	mov r3, r3, lsr #24 /* r3 = ...5 */
	orr r3, r3, ip, lsl #8 /* r3 = 8765 */
	mov ip, ip, lsr #24 /* ip = ...9 */
	orr ip, ip, r1, lsl #8 /* ip = .BA9 */
	mov r1, r1, lsr #8 /* r1 = ...B */
	strh ip, [r0, #0x09]
	strb r1, [r0, #0x0b]

 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
	ldrh r2, [r1, #0x09]
	ldrb r1, [r1, #0x0b]
	strh r2, [r0, #0x09]
	strb r1, [r0, #0x0b]