/*-
 * Copyright (c) 2004 Olivier Houchard
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry (per the memset(3) signature; the surrounding lines are
 * elided in this excerpt):
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 */
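/*
 * A sketch of the approach used below (illustrative C, not part of
 * the build): the byte value is first replicated across a full
 * 32-bit word,
 *
 *	v &= 0xff;
 *	v |= (v << 8);
 *	v |= (v << 16);
 *
 * so the bulk of the buffer can be filled with word and double-word
 * stores; only the misaligned head and the sub-word tail fall back
 * to byte stores.
 */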
/* LINTSTUB: Func: void bzero(void *, size_t) */

/* LINTSTUB: Func: void *memset(void *, int, size_t) */

	and	r3, r1, #0xff		/* We deal with bytes */
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
	tst	ip, #0x04		/* Quad-align for armv5e */
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
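/*
 * From here on the fast paths use strd, which stores two registers
 * (8 bytes) per instruction but requires an 8-byte aligned address
 * on ARMv5E parts; that is what the word-then-quad alignment steps
 * above arrange.
 */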
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */

	/* Do 128 bytes at a time */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08

	RETeq				/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08

	RETeq				/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08

	RETeq				/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
	strge	r3, [ip], #0x04

	RETeq				/* Zero length so just exit */

	/* Compensate for 64-bit alignment check */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */

.Lmemset_wordunaligned:
	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	RETeq				/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	/* Are both addresses aligned the same way? */
	RETeq				/* len == 0, or same addresses! */
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
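/*
 * The two instructions above are a computed goto (a reading of the
 * code, offered for illustration): r3 holds the number of leading
 * bytes that must be compared before the pointers become word
 * aligned; r3 is tripled and then scaled left by 3 bits, selecting
 * one of the 24-byte (six instruction) fix-up stanzas below, and
 * adding that to pc (which in ARM state reads as the address of the
 * add plus 8) jumps straight into the chosen stanza.
 */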
	/* Compare up to 3 bytes */

	/* Compare up to 2 bytes */

	/* Compare 4 bytes at a time, if possible */
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	beq	.Lmemcmp_word_aligned

	/* Correct for extra subtraction, and check if done */
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq				/* Yup. Just return */

	/* Re-do the final word byte-wise */
	beq	.Lmemcmp_bytewise2

/*
 * 6 byte compares are very common, thanks to the network stack.
 * This code is hand-scheduled to reduce the number of stalls for
 * load results.  Everything else being equal, this will be ~32%
 * faster than a byte-wise memcmp.
 */
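/*
 * The scheduling pattern, spelled out: on these cores an ldrb result
 * is not usable in the immediately following cycle, so the subs for
 * byte N is overlapped with the loads for byte N+1, and those loads
 * are eq-conditional so they only issue while every earlier byte
 * still matched.
 */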
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne				/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne				/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne				/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne				/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne				/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	/* switch the source and destination registers */

	/* Do the buffers overlap? */
	RETeq				/* Bail now if src/dst are the same */
	subcc	r3, r0, r1		/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0		/* if (src > dst) r3 = src - dst */
	cmp	r3, r2			/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
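/*
 * In rough C, the intent of the test above as described by its
 * comments (an illustration only; the compare that sets the flags
 * for the conditional subtractions is elided in this excerpt):
 *
 *	delta = (dst > src) ? dst - src : src - dst;
 *	if (delta >= len)
 *		return (memcpy(dst, src, len));
 *
 * i.e. the plain forward memcpy() is used whenever the two regions
 * cannot interfere; otherwise a copy direction is chosen below.
 */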
	/* Determine copy direction */
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	bne	.Lmemmove_fsrcul	/* oh unaligned source addr */

	/* We have aligned source and destination */
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	bge	.Lmemmove_floop32
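/*
 * Each ldmia/stmia pair above moves 16 bytes through four registers,
 * so one loop iteration "blats" 32 bytes; r4 and lr are only
 * available as data registers because they were pushed onto the
 * stack at the top of this path.
 */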
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	ldmia	sp!, {r4}		/* return r4 */

	/* blat 12 bytes at a time */
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	bge	.Lmemmove_floop12

	ldmiage	r1!, {r3, r12}
	stmiage	r0!, {r3, r12}

	/* less than 4 bytes to go */
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */

	/* erg - unaligned destination */

	/* align destination with byte copies */
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
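/*
 * How the fsrcul1/2/3 paths below handle a misaligned source (a
 * sketch of the technique): the source pointer is backed up to a
 * word boundary, whole words are loaded, and each output word is
 * stitched together from the tail of one input word and the head of
 * the next.  For a source one byte past alignment, in little-endian
 * C terms:
 *
 *	out = (in0 >> 8) | (in1 << 24);
 *
 * The three paths are this pattern with shift pairs 8/24, 16/16 and
 * 24/8 respectively.
 */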
	blt	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	orr	r4, r4, r5, lsl #24
	orr	r5, r5, r12, lsl #24
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	bge	.Lmemmove_fsrcul1loop16
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
	orr	r12, r12, lr, lsl #24
	bge	.Lmemmove_fsrcul1loop4

	blt	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	orr	r4, r4, r5, lsl #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	bge	.Lmemmove_fsrcul2loop16
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
	orr	r12, r12, lr, lsl #16
	bge	.Lmemmove_fsrcul2loop4

	blt	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3loop16:
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	orr	r4, r4, r5, lsl #8
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	bge	.Lmemmove_fsrcul3loop16
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
	orr	r12, r12, lr, lsl #8
	bge	.Lmemmove_fsrcul3loop4

	blt	.Lmemmove_bl4		/* less than 4 bytes */
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	bne	.Lmemmove_bsrcul	/* oh unaligned source addr */

	/* We have aligned source and destination */
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	bge	.Lmemmove_bloop32

	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	ldmdbge	r1!, {r3, r12}
	stmdbge	r0!, {r3, r12}

	/* less than 4 bytes to go */

	/* copy the crud byte at a time */
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!

	/* erg - unaligned destination */

	/* align destination with byte copies */
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	blt	.Lmemmove_bsrcul3loop4
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	orr	r12, r12, r5, lsr #24
	orr	r5, r5, r4, lsr #24
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
	orr	r12, r12, r3, lsr #24
	bge	.Lmemmove_bsrcul3loop4

	blt	.Lmemmove_bsrcul2loop4
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	orr	r5, r5, r4, lsr #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
	orr	r12, r12, r3, lsr #16
	bge	.Lmemmove_bsrcul2loop4

	blt	.Lmemmove_bsrcul1loop4
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	orr	r5, r5, r4, lsr #8
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
	orr	r12, r12, r3, lsr #8
	bge	.Lmemmove_bsrcul1loop4
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
	ble	.Lmemcpy_short		/* <= 12 bytes */
#if FLASHADDR > PHYSADDR

	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	ldrble	ip, [r1], #0x01
	strble	ip, [r3], #0x01
	ldrblt	ip, [r1], #0x01
	strblt	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128
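/*
 * Notes on the loop above (an observation, not from the original
 * comments): loads and stores are interleaved so each strd has its
 * register pair ready well before it issues, and the four pld
 * instructions request data 0x20, 0x40, 0x60 and 0x80 bytes ahead of
 * the block being copied, keeping the prefetcher a cache line or two
 * in front of the loads on ARMv5E-class cores; the exact distance
 * matters less on CPUs with automatic prefetch.
 */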
.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	RETeq				/* Return now if done */
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	RETeq				/* Return now if done */

	addne	pc, pc, r4, lsl #1

	/* At least 24 bytes remaining */

	/* At least 16 bytes remaining */

	/* At least 8 bytes remaining */

	/* Less than 8 bytes remaining */
	RETeq				/* Return now if done */
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq				/* Return now if done */
	ldrbge	r2, [r1], #0x01
	strbge	r2, [r3], #0x01

	/* Place a literal pool here for the above ldr instructions to use */

/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */

.Lmemcpy_bad1_loop16:
	orr	r4, r4, r5, lsl #24
	orr	r5, r5, r6, lsl #24
	orr	r6, r6, r7, lsl #24
	orr	r7, r7, ip, lsl #24
	bge	.Lmemcpy_bad1_loop16
	RETeq				/* Return now if done */
	blt	.Lmemcpy_bad_done
	orr	r4, r4, ip, lsl #24
	bge	.Lmemcpy_bad1_loop4

.Lmemcpy_bad2_loop16:
	orr	r4, r4, r5, lsl #16
	orr	r5, r5, r6, lsl #16
	orr	r6, r6, r7, lsl #16
	orr	r7, r7, ip, lsl #16
	bge	.Lmemcpy_bad2_loop16
	RETeq				/* Return now if done */
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
	orr	r4, r4, ip, lsl #16
	bge	.Lmemcpy_bad2_loop4

.Lmemcpy_bad3_loop16:
	orr	r4, r4, r5, lsl #8
	orr	r5, r5, r6, lsl #8
	orr	r6, r6, r7, lsl #8
	orr	r7, r7, ip, lsl #8
	bge	.Lmemcpy_bad3_loop16
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
	orr	r4, r4, ip, lsl #8
	bge	.Lmemcpy_bad3_loop4

	ldrb	ip, [r1], #0x01
	ldrbge	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	strbge	r2, [r3], #0x01

/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
	add	pc, pc, r2, lsl #2

	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
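/*
 * How the dispatch above works (a reading of the code): "add pc, pc,
 * r2, lsl #2" jumps len*4 bytes past the value read for pc, which in
 * ARM state is the address of the add itself plus 8 -- i.e. onto the
 * branch slot for that length.  Lengths 4, 6, 8 and 12 get the tuned
 * copies below; everything else takes the generic byte loop.
 */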
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrbne	ip, [r1], #0x01

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	orr	r2, r2, r0, lsl #2
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2
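/*
 * Dispatch sketch (illustrative): the low two bits of src and dst are
 * combined into a four-bit case number, ((dst & 3) << 2) | (src & 3),
 * and each of the 16 alignment cases below is padded out to
 * 1 << LMEMCPY_4_LOG2 = 64 bytes, so shifting the case number left by
 * LMEMCPY_4_LOG2 indexes directly into the matching handler.  The 6,
 * 8 and 12 byte copies further down use the same trick.
 */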
/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r3, [r1, #0x02]
	orr	r3, r2, r3, lsl #16

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
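/*
 * A note on the negative offsets above (an observation about the
 * technique): with src only 8-bit aligned, [r1, #-1] and [r1, #-3]
 * round the access down to a word boundary, so every ldr is a plain
 * aligned word load that still contains at least one byte of the
 * source.  The extra bytes picked up on either side stay inside an
 * aligned word the source already touches, so they cannot fault on a
 * page the copy would not access anyway, and they are shifted out
 * before anything is stored.
 */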
/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	strb	r1, [r0, #0x03]
	strh	r3, [r0, #0x01]

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	strh	r3, [r0, #0x02]

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
	strh	r2, [r0, #0x02]

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r3, [r1, #0x02]
	strh	r3, [r0, #0x02]

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]

/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	orr	r2, r2, r0, lsl #2
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldrh	r3, [r1, #0x04]
	strh	r3, [r0, #0x04]

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	strh	r3, [r0, #0x04]

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	strh	r1, [r0, #0x04]
/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
	strh	r1, [r0, #0x04]
/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
	strb	r1, [r0, #0x05]

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
	strb	r1, [r0, #0x05]

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r1, [r1, #0x05]
	strb	r1, [r0, #0x05]

/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	orr	r2, r2, r0, lsl #2
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
	strh	r1, [r0, #0x01]

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x07]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x07]

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
	strh	ip, [r0, #0x01]

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
	strb	r1, [r0, #0x07]

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	strh	r3, [r0, #0x06]

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
	strh	r3, [r0, #0x06]

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r3, [r1, #0x06]
	strh	r3, [r0, #0x06]

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
	strb	r1, [r0, #0x07]

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
	strh	r1, [r0, #0x05]

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]

/******************************************************************************
 * Special case for 12 byte copies
 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	orr	r2, r2, r0, lsl #2
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
	strb	ip, [r0, #0x0b]

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x0b]

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	strb	r1, [r0, #0x0b]

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
	strb	r1, [r0, #0x0b]

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
	strh	r2, [r0, #0x0a]

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
	strh	r1, [r0, #0x0a]

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r1, [r1, #0x0a]
	strh	r1, [r0, #0x0a]

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]