2 * Copyright (c) 2004 Olivier Houchard
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed for the NetBSD Project by
43 * Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 * or promote products derived from this software without specific prior
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution.
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
89 #include <machine/asm.h>
90 __FBSDID("$FreeBSD$");
/*
 * Literal-pool words (fragment; surrounding .L_* labels are elided from
 * this view -- NOTE(review): confirm label names against the full file).
 * These hold the addresses of the _arm_memcpy/_arm_bzero function
 * pointers and their minimum-size thresholds, loaded below via
 * "ldr rX, .L_arm_memcpy" etc. before deciding whether to dispatch to
 * the platform-provided copy/zero routine.
 */
97 .word _C_LABEL(_arm_memcpy)
99 .word _C_LABEL(_arm_bzero)
101 .word _C_LABEL(_min_memcpy_size)
103 .word _C_LABEL(_min_bzero_size)
/*
 * bzero entry fragment (ENTRY directive and several instructions are
 * elided from this view).  Visible behavior: load the _min_bzero_size
 * threshold, and save/restore {r0, r1, lr} around an elided sequence --
 * presumably a call through the _arm_bzero hook; TODO confirm against
 * the full file.  Per the (partial) header comment, r2 carries a byte
 * count at some point; the fragment itself does not show the full
 * register contract.
 */
105 * memset: Sets a block of memory to the specified value
110 * r2 - number of bytes to write
115 /* LINTSTUB: Func: void bzero(void *, size_t) */
121 ldr r2, .L_min_bzero_size
125 stmfd sp!, {r0, r1, lr}
130 ldmfd sp!, {r0, r1, lr}
/*
 * memset body fragment (ENTRY and several interior lines elided).
 * Register roles visible in this fragment:
 *   ip = current write pointer (all stores go through [ip])
 *   r1 = remaining byte count
 *   r3 = fill byte, replicated to 16 then 32 bits
 *   r2 = copy of r3 so that strd r2 writes the pattern 64 bits at a time
 * Strategy: byte-align to a word boundary, optionally word-store to
 * reach 8-byte alignment (armv5e strd requires it), then unrolled
 * 128-/32-/16-byte strd loops, a word loop, and a 1-3 byte tail.
 * NOTE(review): the subs instructions that drive the "ge" conditions
 * in the unrolled loops are elided here -- the counts referenced by
 * the "Adjust for extra sub" comments are not visible.
 */
136 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
138 and r3, r1, #0xff /* We deal with bytes */
141 cmp r1, #0x04 /* Do we have less than 4 bytes */
143 blt .Lmemset_lessthanfour
145 /* Ok first we will word align the address */
146 ands r2, ip, #0x03 /* Get the bottom two bits */
147 bne .Lmemset_wordunaligned /* The address is not word aligned */
149 /* We are now word aligned */
150 .Lmemset_wordaligned:
151 orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
153 tst ip, #0x04 /* Quad-align for armv5e */
157 orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
159 subne r1, r1, #0x04 /* Quad-align if necessary */
160 strne r3, [ip], #0x04
163 blt .Lmemset_loop4 /* If less than 16 then use words */
164 mov r2, r3 /* Duplicate data */
165 cmp r1, #0x80 /* If < 128 then skip the big loop */
168 /* Do 128 bytes at a time */
/* 16 x 8-byte conditional doubleword stores = one 128-byte iteration */
172 strdge r2, [ip], #0x08
173 strdge r2, [ip], #0x08
174 strdge r2, [ip], #0x08
175 strdge r2, [ip], #0x08
176 strdge r2, [ip], #0x08
177 strdge r2, [ip], #0x08
178 strdge r2, [ip], #0x08
179 strdge r2, [ip], #0x08
180 strdge r2, [ip], #0x08
181 strdge r2, [ip], #0x08
182 strdge r2, [ip], #0x08
183 strdge r2, [ip], #0x08
184 strdge r2, [ip], #0x08
185 strdge r2, [ip], #0x08
186 strdge r2, [ip], #0x08
187 strdge r2, [ip], #0x08
/* NOTE(review): loop-back branch of the 128-byte loop elided here */
207 RETeq /* Zero length so just exit */
209 add r1, r1, #0x80 /* Adjust for extra sub */
211 /* Do 32 bytes at a time */
215 strdge r2, [ip], #0x08
216 strdge r2, [ip], #0x08
217 strdge r2, [ip], #0x08
218 strdge r2, [ip], #0x08
226 RETeq /* Zero length so just exit */
228 adds r1, r1, #0x10 /* Partially adjust for extra sub */
230 /* Deal with 16 bytes or more */
232 strdge r2, [ip], #0x08
233 strdge r2, [ip], #0x08
238 RETeq /* Zero length so just exit */
240 addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
242 /* We have at least 4 bytes so copy as words */
245 strge r3, [ip], #0x04
247 RETeq /* Zero length so just exit */
250 /* Compensate for 64-bit alignment check */
/* 1-3 trailing bytes after the word loop */
258 strb r3, [ip], #0x01 /* Set 1 byte */
259 strbge r3, [ip], #0x01 /* Set another byte */
260 strbgt r3, [ip] /* and a third */
263 .Lmemset_wordunaligned:
/* store 1-3 bytes to bring ip up to a word boundary, then retry */
265 strb r3, [ip], #0x01 /* Set 1 byte */
267 strbge r3, [ip], #0x01 /* Set another byte */
269 strbgt r3, [ip], #0x01 /* and a third */
270 cmp r1, #0x04 /* More than 4 bytes left? */
271 bge .Lmemset_wordaligned /* Yup */
273 .Lmemset_lessthanfour:
275 RETeq /* Zero length so exit */
276 strb r3, [ip], #0x01 /* Set 1 byte */
278 strbge r3, [ip], #0x01 /* Set another byte */
279 strbgt r3, [ip] /* and a third */
/*
 * memcmp fragment (ENTRY and many interior lines elided).  Visible
 * register roles: ip = buffer 1 pointer, r1 = buffer 2 pointer,
 * r0 = running byte difference / return value.  The front end picks a
 * strategy by alignment; "addne pc, pc, r3, lsl #3" with r3 = r3*3 is
 * a computed jump into a table of fixed-size code stanzas -- the table
 * entries themselves are elided from this view.
 */
290 /* Are both addresses aligned the same way? */
293 RETeq /* len == 0, or same addresses! */
296 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
298 /* Word-align the addresses, if necessary */
/* r3 = r3 * 3, then branch r3*24 bytes forward: computed goto */
301 add r3, r3, r3, lsl #1
302 addne pc, pc, r3, lsl #3
305 /* Compare up to 3 bytes */
313 /* Compare up to 2 bytes */
329 /* Compare 4 bytes at a time, if possible */
331 bcc .Lmemcmp_bytewise
332 .Lmemcmp_word_aligned:
337 beq .Lmemcmp_word_aligned
340 /* Correct for extra subtraction, and check if done */
342 cmpeq r0, #0x00 /* If done, did all bytes match? */
343 RETeq /* Yup. Just return */
345 /* Re-do the final word byte-wise */
356 beq .Lmemcmp_bytewise2
/*
 * Fast path for 6-byte compares (nearly intact in this view).
 * Loads of the two buffers are interleaved and made conditional (eq =
 * "all bytes equal so far") so each subs has its operands ready without
 * a load-use stall; any nonzero difference returns immediately in r0.
 */
361 * 6 byte compares are very common, thanks to the network stack.
362 * This code is hand-scheduled to reduce the number of stalls for
363 * load results. Everything else being equal, this will be ~32%
364 * faster than a byte-wise memcmp.
368 ldrb r3, [r1, #0x00] /* r3 = b2#0 */
369 ldrb r0, [ip, #0x00] /* r0 = b1#0 */
370 ldrb r2, [r1, #0x01] /* r2 = b2#1 */
371 subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
372 ldrbeq r3, [ip, #0x01] /* r3 = b1#1 */
373 RETne /* Return if mismatch on #0 */
374 subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
375 ldrbeq r3, [r1, #0x02] /* r3 = b2#2 */
376 ldrbeq r0, [ip, #0x02] /* r0 = b1#2 */
377 RETne /* Return if mismatch on #1 */
378 ldrb r2, [r1, #0x03] /* r2 = b2#3 */
379 subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
380 ldrbeq r3, [ip, #0x03] /* r3 = b1#3 */
381 RETne /* Return if mismatch on #2 */
382 subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
383 ldrbeq r3, [r1, #0x04] /* r3 = b2#4 */
384 ldrbeq r0, [ip, #0x04] /* r0 = b1#4 */
385 RETne /* Return if mismatch on #3 */
386 ldrb r2, [r1, #0x05] /* r2 = b2#5 */
387 subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
388 ldrbeq r3, [ip, #0x05] /* r3 = b1#5 */
389 RETne /* Return if mismatch on #4 */
390 sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
395 /* switch the source and destination registers */
400 /* Do the buffers overlap? */
402 RETeq /* Bail now if src/dst are the same */
403 subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
404 subcs r3, r1, r0 /* if (src > dst) r3 = src - dst */
405 cmp r3, r2 /* if (r3 < len) we have an overlap */
406 bcc PIC_SYM(_C_LABEL(memcpy), PLT)
408 /* Determine copy direction */
410 bcc .Lmemmove_backwards
412 moveq r0, #0 /* Quick abort for len=0 */
415 stmdb sp!, {r0, lr} /* memmove() returns dest addr */
417 blt .Lmemmove_fl4 /* less than 4 bytes */
419 bne .Lmemmove_fdestul /* oh unaligned destination addr */
421 bne .Lmemmove_fsrcul /* oh unaligned source addr */
424 /* We have aligned source and destination */
426 blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
428 blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
429 stmdb sp!, {r4} /* borrow r4 */
431 /* blat 32 bytes at a time */
432 /* XXX for really big copies perhaps we should use more registers */
434 ldmia r1!, {r3, r4, r12, lr}
435 stmia r0!, {r3, r4, r12, lr}
436 ldmia r1!, {r3, r4, r12, lr}
437 stmia r0!, {r3, r4, r12, lr}
439 bge .Lmemmove_floop32
442 ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
443 stmiage r0!, {r3, r4, r12, lr}
445 ldmia sp!, {r4} /* return r4 */
450 /* blat 12 bytes at a time */
452 ldmiage r1!, {r3, r12, lr}
453 stmiage r0!, {r3, r12, lr}
455 bge .Lmemmove_floop12
464 ldmiage r1!, {r3, r12}
465 stmiage r0!, {r3, r12}
469 /* less than 4 bytes to go */
471 ldmiaeq sp!, {r0, pc} /* done */
473 /* copy the crud byte at a time */
483 /* erg - unaligned destination */
488 /* align destination with byte copies */
496 blt .Lmemmove_fl4 /* less the 4 bytes */
499 beq .Lmemmove_ft8 /* we have an aligned source */
501 /* erg - unaligned source */
502 /* This is where it gets nasty ... */
507 bgt .Lmemmove_fsrcul3
508 beq .Lmemmove_fsrcul2
510 blt .Lmemmove_fsrcul1loop4
514 .Lmemmove_fsrcul1loop16:
516 ldmia r1!, {r4, r5, r12, lr}
517 orr r3, r3, r4, lsl #24
519 orr r4, r4, r5, lsl #24
521 orr r5, r5, r12, lsl #24
523 orr r12, r12, lr, lsl #24
524 stmia r0!, {r3-r5, r12}
526 bge .Lmemmove_fsrcul1loop16
529 blt .Lmemmove_fsrcul1l4
531 .Lmemmove_fsrcul1loop4:
534 orr r12, r12, lr, lsl #24
537 bge .Lmemmove_fsrcul1loop4
545 blt .Lmemmove_fsrcul2loop4
549 .Lmemmove_fsrcul2loop16:
551 ldmia r1!, {r4, r5, r12, lr}
552 orr r3, r3, r4, lsl #16
554 orr r4, r4, r5, lsl #16
556 orr r5, r5, r12, lsl #16
557 mov r12, r12, lsr #16
558 orr r12, r12, lr, lsl #16
559 stmia r0!, {r3-r5, r12}
561 bge .Lmemmove_fsrcul2loop16
564 blt .Lmemmove_fsrcul2l4
566 .Lmemmove_fsrcul2loop4:
569 orr r12, r12, lr, lsl #16
572 bge .Lmemmove_fsrcul2loop4
580 blt .Lmemmove_fsrcul3loop4
584 .Lmemmove_fsrcul3loop16:
586 ldmia r1!, {r4, r5, r12, lr}
587 orr r3, r3, r4, lsl #8
589 orr r4, r4, r5, lsl #8
591 orr r5, r5, r12, lsl #8
592 mov r12, r12, lsr #24
593 orr r12, r12, lr, lsl #8
594 stmia r0!, {r3-r5, r12}
596 bge .Lmemmove_fsrcul3loop16
599 blt .Lmemmove_fsrcul3l4
601 .Lmemmove_fsrcul3loop4:
604 orr r12, r12, lr, lsl #8
607 bge .Lmemmove_fsrcul3loop4
617 blt .Lmemmove_bl4 /* less than 4 bytes */
619 bne .Lmemmove_bdestul /* oh unaligned destination addr */
621 bne .Lmemmove_bsrcul /* oh unaligned source addr */
624 /* We have aligned source and destination */
626 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
628 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
631 /* blat 32 bytes at a time */
632 /* XXX for really big copies perhaps we should use more registers */
634 ldmdb r1!, {r3, r4, r12, lr}
635 stmdb r0!, {r3, r4, r12, lr}
636 ldmdb r1!, {r3, r4, r12, lr}
637 stmdb r0!, {r3, r4, r12, lr}
639 bge .Lmemmove_bloop32
643 ldmdbge r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
644 stmdbge r0!, {r3, r4, r12, lr}
647 ldmdbge r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
648 stmdbge r0!, {r3, r12, lr}
658 ldmdbge r1!, {r3, r12}
659 stmdbge r0!, {r3, r12}
663 /* less than 4 bytes to go */
667 /* copy the crud byte at a time */
671 ldrbge r3, [r1, #-1]!
672 strbge r3, [r0, #-1]!
673 ldrbgt r3, [r1, #-1]!
674 strbgt r3, [r0, #-1]!
677 /* erg - unaligned destination */
681 /* align destination with byte copies */
684 ldrbge r3, [r1, #-1]!
685 strbge r3, [r0, #-1]!
686 ldrbgt r3, [r1, #-1]!
687 strbgt r3, [r0, #-1]!
689 blt .Lmemmove_bl4 /* less than 4 bytes to go */
691 beq .Lmemmove_bt8 /* we have an aligned source */
693 /* erg - unaligned source */
694 /* This is where it gets nasty ... */
699 blt .Lmemmove_bsrcul1
700 beq .Lmemmove_bsrcul2
702 blt .Lmemmove_bsrcul3loop4
704 stmdb sp!, {r4, r5, lr}
706 .Lmemmove_bsrcul3loop16:
708 ldmdb r1!, {r3-r5, r12}
709 orr lr, lr, r12, lsr #24
711 orr r12, r12, r5, lsr #24
713 orr r5, r5, r4, lsr #24
715 orr r4, r4, r3, lsr #24
716 stmdb r0!, {r4, r5, r12, lr}
718 bge .Lmemmove_bsrcul3loop16
719 ldmia sp!, {r4, r5, lr}
721 blt .Lmemmove_bsrcul3l4
723 .Lmemmove_bsrcul3loop4:
726 orr r12, r12, r3, lsr #24
729 bge .Lmemmove_bsrcul3loop4
737 blt .Lmemmove_bsrcul2loop4
739 stmdb sp!, {r4, r5, lr}
741 .Lmemmove_bsrcul2loop16:
743 ldmdb r1!, {r3-r5, r12}
744 orr lr, lr, r12, lsr #16
745 mov r12, r12, lsl #16
746 orr r12, r12, r5, lsr #16
748 orr r5, r5, r4, lsr #16
750 orr r4, r4, r3, lsr #16
751 stmdb r0!, {r4, r5, r12, lr}
753 bge .Lmemmove_bsrcul2loop16
754 ldmia sp!, {r4, r5, lr}
756 blt .Lmemmove_bsrcul2l4
758 .Lmemmove_bsrcul2loop4:
761 orr r12, r12, r3, lsr #16
764 bge .Lmemmove_bsrcul2loop4
772 blt .Lmemmove_bsrcul1loop4
774 stmdb sp!, {r4, r5, lr}
776 .Lmemmove_bsrcul1loop32:
778 ldmdb r1!, {r3-r5, r12}
779 orr lr, lr, r12, lsr #8
780 mov r12, r12, lsl #24
781 orr r12, r12, r5, lsr #8
783 orr r5, r5, r4, lsr #8
785 orr r4, r4, r3, lsr #8
786 stmdb r0!, {r4, r5, r12, lr}
788 bge .Lmemmove_bsrcul1loop32
789 ldmia sp!, {r4, r5, lr}
791 blt .Lmemmove_bsrcul1l4
793 .Lmemmove_bsrcul1loop4:
796 orr r12, r12, r3, lsr #8
799 bge .Lmemmove_bsrcul1loop4
807 #if !defined(_ARM_ARCH_5E)
809 /* save leaf functions having to store this away */
810 /* Do not check arm_memcpy if we're running from flash */
811 #if defined(FLASHADDR) && defined(PHYSADDR)
812 #if FLASHADDR > PHYSADDR
822 ldr r3, .L_arm_memcpy
826 ldr r3, .L_min_memcpy_size
830 stmfd sp!, {r0-r2, r4, lr}
832 ldr r4, .L_arm_memcpy
836 ldmfd sp!, {r0-r2, r4, lr}
840 stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
843 blt .Lmemcpy_l4 /* less than 4 bytes */
845 bne .Lmemcpy_destul /* oh unaligned destination addr */
847 bne .Lmemcpy_srcul /* oh unaligned source addr */
850 /* We have aligned source and destination */
852 blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */
854 blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */
855 stmdb sp!, {r4} /* borrow r4 */
857 /* blat 32 bytes at a time */
858 /* XXX for really big copies perhaps we should use more registers */
860 ldmia r1!, {r3, r4, r12, lr}
861 stmia r0!, {r3, r4, r12, lr}
862 ldmia r1!, {r3, r4, r12, lr}
863 stmia r0!, {r3, r4, r12, lr}
868 ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
869 stmiage r0!, {r3, r4, r12, lr}
871 ldmia sp!, {r4} /* return r4 */
876 /* blat 12 bytes at a time */
878 ldmiage r1!, {r3, r12, lr}
879 stmiage r0!, {r3, r12, lr}
890 ldmiage r1!, {r3, r12}
891 stmiage r0!, {r3, r12}
895 /* less than 4 bytes to go */
898 ldmiaeq sp!, {r0, pc}^ /* done */
900 ldmiaeq sp!, {r0, pc} /* done */
902 /* copy the crud byte at a time */
912 /* erg - unaligned destination */
917 /* align destination with byte copies */
925 blt .Lmemcpy_l4 /* less the 4 bytes */
928 beq .Lmemcpy_t8 /* we have an aligned source */
930 /* erg - unaligned source */
931 /* This is where it gets nasty ... */
939 blt .Lmemcpy_srcul1loop4
943 .Lmemcpy_srcul1loop16:
945 ldmia r1!, {r4, r5, r12, lr}
946 orr r3, r3, r4, lsl #24
948 orr r4, r4, r5, lsl #24
950 orr r5, r5, r12, lsl #24
952 orr r12, r12, lr, lsl #24
953 stmia r0!, {r3-r5, r12}
955 bge .Lmemcpy_srcul1loop16
958 blt .Lmemcpy_srcul1l4
960 .Lmemcpy_srcul1loop4:
963 orr r12, r12, lr, lsl #24
966 bge .Lmemcpy_srcul1loop4
974 blt .Lmemcpy_srcul2loop4
978 .Lmemcpy_srcul2loop16:
980 ldmia r1!, {r4, r5, r12, lr}
981 orr r3, r3, r4, lsl #16
983 orr r4, r4, r5, lsl #16
985 orr r5, r5, r12, lsl #16
986 mov r12, r12, lsr #16
987 orr r12, r12, lr, lsl #16
988 stmia r0!, {r3-r5, r12}
990 bge .Lmemcpy_srcul2loop16
993 blt .Lmemcpy_srcul2l4
995 .Lmemcpy_srcul2loop4:
998 orr r12, r12, lr, lsl #16
1001 bge .Lmemcpy_srcul2loop4
1009 blt .Lmemcpy_srcul3loop4
1013 .Lmemcpy_srcul3loop16:
1015 ldmia r1!, {r4, r5, r12, lr}
1016 orr r3, r3, r4, lsl #8
1018 orr r4, r4, r5, lsl #8
1020 orr r5, r5, r12, lsl #8
1021 mov r12, r12, lsr #24
1022 orr r12, r12, lr, lsl #8
1023 stmia r0!, {r3-r5, r12}
1025 bge .Lmemcpy_srcul3loop16
1028 blt .Lmemcpy_srcul3l4
1030 .Lmemcpy_srcul3loop4:
1031 mov r12, lr, lsr #24
1033 orr r12, r12, lr, lsl #8
1036 bge .Lmemcpy_srcul3loop4
1044 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
1048 ble .Lmemcpy_short /* <= 12 bytes */
1050 #if FLASHADDR > PHYSADDR
1060 ldr r3, .L_arm_memcpy
1064 ldr r3, .L_min_memcpy_size
1068 stmfd sp!, {r0-r2, r4, lr}
1070 ldr r4, .L_arm_memcpy
1074 ldmfd sp!, {r0-r2, r4, lr}
1077 mov r3, r0 /* We must not clobber r0 */
1079 /* Word-align the destination buffer */
1080 ands ip, r3, #0x03 /* Already word aligned? */
1081 beq .Lmemcpy_wordaligned /* Yup */
1083 ldrb ip, [r1], #0x01
1085 strb ip, [r3], #0x01
1086 ldrble ip, [r1], #0x01
1088 strble ip, [r3], #0x01
1089 ldrblt ip, [r1], #0x01
1091 strblt ip, [r3], #0x01
1093 /* Destination buffer is now word aligned */
1094 .Lmemcpy_wordaligned:
1095 ands ip, r1, #0x03 /* Is src also word-aligned? */
1096 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
1098 /* Quad-align the destination buffer */
1099 tst r3, #0x07 /* Already quad aligned? */
1100 ldrne ip, [r1], #0x04
1101 stmfd sp!, {r4-r9} /* Free up some registers */
1103 strne ip, [r3], #0x04
/*
 * memcpy main loop fragment: 128 bytes per iteration.
 * Register roles visible here: r1 = source pointer (word aligned),
 * r3 = destination pointer (quad aligned, a copy of r0 so the return
 * value survives), r2 = remaining count (biased by -0x80, see the
 * "adds r2, r2, #0x80" readjustment at .Lmemcpy_w_lessthan128),
 * r4-r9 = data registers saved by the caller-side stmfd (elided).
 * Loads run 3-4 words ahead of the paired strd stores to hide load
 * latency; pld prefetches a cache line every 32 bytes copied.
 * NOTE(review): original line 1162 (between the last two strd) is
 * elided -- presumably the "subs r2, ..." that drives bge; confirm.
 */
1105 /* Destination buffer quad aligned, source is at least word aligned */
1107 blt .Lmemcpy_w_lessthan128
1109 /* Copy 128 bytes at a time */
1111 ldr r4, [r1], #0x04 /* LD:00-03 */
1112 ldr r5, [r1], #0x04 /* LD:04-07 */
1113 pld [r1, #0x18] /* Prefetch 0x20 */
1114 ldr r6, [r1], #0x04 /* LD:08-0b */
1115 ldr r7, [r1], #0x04 /* LD:0c-0f */
1116 ldr r8, [r1], #0x04 /* LD:10-13 */
1117 ldr r9, [r1], #0x04 /* LD:14-17 */
1118 strd r4, [r3], #0x08 /* ST:00-07 */
1119 ldr r4, [r1], #0x04 /* LD:18-1b */
1120 ldr r5, [r1], #0x04 /* LD:1c-1f */
1121 strd r6, [r3], #0x08 /* ST:08-0f */
1122 ldr r6, [r1], #0x04 /* LD:20-23 */
1123 ldr r7, [r1], #0x04 /* LD:24-27 */
1124 pld [r1, #0x18] /* Prefetch 0x40 */
1125 strd r8, [r3], #0x08 /* ST:10-17 */
1126 ldr r8, [r1], #0x04 /* LD:28-2b */
1127 ldr r9, [r1], #0x04 /* LD:2c-2f */
1128 strd r4, [r3], #0x08 /* ST:18-1f */
1129 ldr r4, [r1], #0x04 /* LD:30-33 */
1130 ldr r5, [r1], #0x04 /* LD:34-37 */
1131 strd r6, [r3], #0x08 /* ST:20-27 */
1132 ldr r6, [r1], #0x04 /* LD:38-3b */
1133 ldr r7, [r1], #0x04 /* LD:3c-3f */
1134 strd r8, [r3], #0x08 /* ST:28-2f */
1135 ldr r8, [r1], #0x04 /* LD:40-43 */
1136 ldr r9, [r1], #0x04 /* LD:44-47 */
1137 pld [r1, #0x18] /* Prefetch 0x60 */
1138 strd r4, [r3], #0x08 /* ST:30-37 */
1139 ldr r4, [r1], #0x04 /* LD:48-4b */
1140 ldr r5, [r1], #0x04 /* LD:4c-4f */
1141 strd r6, [r3], #0x08 /* ST:38-3f */
1142 ldr r6, [r1], #0x04 /* LD:50-53 */
1143 ldr r7, [r1], #0x04 /* LD:54-57 */
1144 strd r8, [r3], #0x08 /* ST:40-47 */
1145 ldr r8, [r1], #0x04 /* LD:58-5b */
1146 ldr r9, [r1], #0x04 /* LD:5c-5f */
1147 strd r4, [r3], #0x08 /* ST:48-4f */
1148 ldr r4, [r1], #0x04 /* LD:60-63 */
1149 ldr r5, [r1], #0x04 /* LD:64-67 */
1150 pld [r1, #0x18] /* Prefetch 0x80 */
1151 strd r6, [r3], #0x08 /* ST:50-57 */
1152 ldr r6, [r1], #0x04 /* LD:68-6b */
1153 ldr r7, [r1], #0x04 /* LD:6c-6f */
1154 strd r8, [r3], #0x08 /* ST:58-5f */
1155 ldr r8, [r1], #0x04 /* LD:70-73 */
1156 ldr r9, [r1], #0x04 /* LD:74-77 */
1157 strd r4, [r3], #0x08 /* ST:60-67 */
1158 ldr r4, [r1], #0x04 /* LD:78-7b */
1159 ldr r5, [r1], #0x04 /* LD:7c-7f */
1160 strd r6, [r3], #0x08 /* ST:68-6f */
1161 strd r8, [r3], #0x08 /* ST:70-77 */
1163 strd r4, [r3], #0x08 /* ST:78-7f */
1164 bge .Lmemcpy_w_loop128
1166 .Lmemcpy_w_lessthan128:
1167 adds r2, r2, #0x80 /* Adjust for extra sub */
1168 ldmfdeq sp!, {r4-r9}
1169 RETeq /* Return now if done */
1171 blt .Lmemcpy_w_lessthan32
1173 /* Copy 32 bytes at a time */
1182 strd r4, [r3], #0x08
1185 strd r6, [r3], #0x08
1186 strd r8, [r3], #0x08
1188 strd r4, [r3], #0x08
1189 bge .Lmemcpy_w_loop32
1191 .Lmemcpy_w_lessthan32:
1192 adds r2, r2, #0x20 /* Adjust for extra sub */
1193 ldmfdeq sp!, {r4-r9}
1194 RETeq /* Return now if done */
1198 addne pc, pc, r4, lsl #1
1201 /* At least 24 bytes remaining */
1205 strd r4, [r3], #0x08
1207 /* At least 16 bytes remaining */
1211 strd r4, [r3], #0x08
1213 /* At least 8 bytes remaining */
1217 strd r4, [r3], #0x08
1219 /* Less than 8 bytes remaining */
1221 RETeq /* Return now if done */
1223 ldrge ip, [r1], #0x04
1224 strge ip, [r3], #0x04
1225 RETeq /* Return now if done */
1227 ldrb ip, [r1], #0x01
1229 ldrbge r2, [r1], #0x01
1230 strb ip, [r3], #0x01
1232 strbge r2, [r3], #0x01
1235 /* Place a literal pool here for the above ldr instructions to use */
1240 * At this point, it has not been possible to word align both buffers.
1241 * The destination buffer is word aligned, but the source buffer is not.
1252 .Lmemcpy_bad1_loop16:
1259 orr r4, r4, r5, lsl #24
1261 orr r5, r5, r6, lsl #24
1263 orr r6, r6, r7, lsl #24
1265 orr r7, r7, ip, lsl #24
1272 bge .Lmemcpy_bad1_loop16
1275 ldmfdeq sp!, {r4-r7}
1276 RETeq /* Return now if done */
1279 blt .Lmemcpy_bad_done
1281 .Lmemcpy_bad1_loop4:
1285 orr r4, r4, ip, lsl #24
1287 bge .Lmemcpy_bad1_loop4
1291 .Lmemcpy_bad2_loop16:
1298 orr r4, r4, r5, lsl #16
1300 orr r5, r5, r6, lsl #16
1302 orr r6, r6, r7, lsl #16
1304 orr r7, r7, ip, lsl #16
1311 bge .Lmemcpy_bad2_loop16
1314 ldmfdeq sp!, {r4-r7}
1315 RETeq /* Return now if done */
1318 blt .Lmemcpy_bad_done
1320 .Lmemcpy_bad2_loop4:
1324 orr r4, r4, ip, lsl #16
1326 bge .Lmemcpy_bad2_loop4
1330 .Lmemcpy_bad3_loop16:
1337 orr r4, r4, r5, lsl #8
1339 orr r5, r5, r6, lsl #8
1341 orr r6, r6, r7, lsl #8
1343 orr r7, r7, ip, lsl #8
1350 bge .Lmemcpy_bad3_loop16
1353 ldmfdeq sp!, {r4-r7}
1354 RETeq /* Return now if done */
1357 blt .Lmemcpy_bad_done
1359 .Lmemcpy_bad3_loop4:
1363 orr r4, r4, ip, lsl #8
1365 bge .Lmemcpy_bad3_loop4
1372 ldrb ip, [r1], #0x01
1374 ldrbge r2, [r1], #0x01
1375 strb ip, [r3], #0x01
1377 strbge r2, [r3], #0x01
1383 * Handle short copies (less than 16 bytes), possibly misaligned.
1384 * Some of these are *very* common, thanks to the network stack,
1385 * and so are handled specially.
1388 add pc, pc, r2, lsl #2
1391 b .Lmemcpy_bytewise /* 0x01 */
1392 b .Lmemcpy_bytewise /* 0x02 */
1393 b .Lmemcpy_bytewise /* 0x03 */
1394 b .Lmemcpy_4 /* 0x04 */
1395 b .Lmemcpy_bytewise /* 0x05 */
1396 b .Lmemcpy_6 /* 0x06 */
1397 b .Lmemcpy_bytewise /* 0x07 */
1398 b .Lmemcpy_8 /* 0x08 */
1399 b .Lmemcpy_bytewise /* 0x09 */
1400 b .Lmemcpy_bytewise /* 0x0a */
1401 b .Lmemcpy_bytewise /* 0x0b */
1402 b .Lmemcpy_c /* 0x0c */
1404 mov r3, r0 /* We must not clobber r0 */
1405 ldrb ip, [r1], #0x01
1406 1: subs r2, r2, #0x01
1407 strb ip, [r3], #0x01
1408 ldrbne ip, [r1], #0x01
1412 /******************************************************************************
1413 * Special case for 4 byte copies
1415 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
1416 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
1420 orr r2, r2, r0, lsl #2
1423 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
1426 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1434 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1436 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1437 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
1438 mov r3, r3, lsr #8 /* r3 = .210 */
1439 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1445 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1447 ldrh r3, [r1, #0x02]
1449 orr r3, r2, r3, lsl #16
1455 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1457 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
1458 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
1459 mov r3, r3, lsr #24 /* r3 = ...0 */
1460 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1466 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1472 strb r1, [r0, #0x03]
1473 strh r3, [r0, #0x01]
1478 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1481 ldrh r3, [r1, #0x01]
1482 ldrb r1, [r1, #0x03]
1484 strh r3, [r0, #0x01]
1485 strb r1, [r0, #0x03]
1490 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1492 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1493 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1494 mov r1, r2, lsr #8 /* r1 = ...0 */
1496 mov r2, r2, lsl #8 /* r2 = .01. */
1497 orr r2, r2, r3, lsr #8 /* r2 = .012 */
1498 strh r2, [r0, #0x01]
1499 strb r3, [r0, #0x03]
1504 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1507 ldrh r3, [r1, #0x01]
1508 ldrb r1, [r1, #0x03]
1510 strh r3, [r0, #0x01]
1511 strb r1, [r0, #0x03]
1516 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1521 strh r3, [r0, #0x02]
1526 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1528 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1529 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
1530 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1532 mov r2, r2, lsr #24 /* r2 = ...2 */
1533 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
1534 strh r2, [r0, #0x02]
1539 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1542 ldrh r3, [r1, #0x02]
1544 strh r3, [r0, #0x02]
1549 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1551 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
1552 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1553 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
1554 strh r1, [r0, #0x02]
1555 mov r3, r3, lsl #8 /* r3 = 321. */
1556 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
1562 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1564 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1568 strh r3, [r0, #0x01]
1569 strb r1, [r0, #0x03]
1574 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1577 ldrh r3, [r1, #0x01]
1578 ldrb r1, [r1, #0x03]
1580 strh r3, [r0, #0x01]
1581 strb r1, [r0, #0x03]
1586 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1588 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1589 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1591 mov r2, r2, lsr #8 /* r2 = ...1 */
1592 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1593 strh r2, [r0, #0x01]
1594 mov r3, r3, lsr #8 /* r3 = ...3 */
1595 strb r3, [r0, #0x03]
1600 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1603 ldrh r3, [r1, #0x01]
1604 ldrb r1, [r1, #0x03]
1606 strh r3, [r0, #0x01]
1607 strb r1, [r0, #0x03]
1612 /******************************************************************************
1613 * Special case for 6 byte copies
1615 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
1616 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
1620 orr r2, r2, r0, lsl #2
1623 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
1626 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1629 ldrh r3, [r1, #0x04]
1631 strh r3, [r0, #0x04]
1636 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1638 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1639 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
1640 mov r2, r2, lsr #8 /* r2 = .210 */
1641 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
1642 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
1644 strh r3, [r0, #0x04]
1649 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1651 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1652 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1653 mov r1, r3, lsr #16 /* r1 = ..54 */
1654 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1656 strh r1, [r0, #0x04]
1661 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1663 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1664 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
1665 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r1 = xxx5 */
1666 mov r2, r2, lsr #24 /* r2 = ...0 */
1667 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1668 mov r1, r1, lsl #8 /* r1 = xx5. */
1669 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
1671 strh r1, [r0, #0x04]
1676 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1678 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1679 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
1680 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1681 strh r1, [r0, #0x01]
1683 mov r3, r3, lsr #24 /* r3 = ...3 */
1684 orr r3, r3, r2, lsl #8 /* r3 = .543 */
1685 mov r2, r2, lsr #8 /* r2 = ...5 */
1686 strh r3, [r0, #0x03]
1687 strb r2, [r0, #0x05]
1692 * 0101: dst is 8-bit aligned, src is 8-bit aligned
/*
 * Tail of the LMEMCPY_6 jump table: special-cased 6-byte copies.
 * Each case handles one (dst & 3, src & 3) combination; the 4-bit
 * pattern in the case headers is dst alignment bits then src
 * alignment bits.  r0 = dst, r1 = src on entry; r2, r3, ip are
 * scratch.  The per-line comments show the byte flow: digits are
 * source byte indices, "." is a zero byte, "x" is a don't-care byte.
 * NOTE(review): this excerpt is fragmentary -- padding directives,
 * case labels, some loads/stores and the return of each case are not
 * visible here; confirm against the full file before editing.
 */
1695 ldrh r3, [r1, #0x01]
1696 ldrh ip, [r1, #0x03]
1697 ldrb r1, [r1, #0x05]
1699 strh r3, [r0, #0x01]
1700 strh ip, [r0, #0x03]
1701 strb r1, [r0, #0x05]
1706 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1708 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1709 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1712 strb r3, [r0, #0x05]
1713 mov r3, r1, lsr #8 /* r3 = .543 */
1714 strh r3, [r0, #0x03]
1715 mov r3, r2, lsr #8 /* r3 = ...1 */
1716 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
1717 strh r3, [r0, #0x01]
1722 * 0111: dst is 8-bit aligned, src is 8-bit aligned
/*
 * Both pointers end in ...11, so ptr+1 is 2-byte aligned and the
 * halfword accesses below are legal.
 */
1725 ldrh r3, [r1, #0x01]
1726 ldrh ip, [r1, #0x03]
1727 ldrb r1, [r1, #0x05]
1729 strh r3, [r0, #0x01]
1730 strh ip, [r0, #0x03]
1731 strb r1, [r0, #0x05]
1736 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1738 ldrh r2, [r1, #0x04] /* r2 = ..54 */
1739 ldr r3, [r1] /* r3 = 3210 */
1740 mov r2, r2, lsl #16 /* r2 = 54.. */
1741 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
1748 * 1001: dst is 16-bit aligned, src is 8-bit aligned
/*
 * Word loads are deliberately rounded down/up to aligned addresses
 * ([r1, #-1] / [r1, #3]); the unwanted edge bytes are shifted out.
 */
1750 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1751 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
1752 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1753 mov r2, r2, lsl #8 /* r2 = 543. */
1754 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
1761 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1771 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1773 ldrb r3, [r1] /* r3 = ...0 */
1774 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1775 ldrb r1, [r1, #0x05] /* r1 = ...5 */
1776 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1777 mov r1, r1, lsl #24 /* r1 = 5... */
1778 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
1785 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1787 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1788 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
1790 mov r2, r2, lsr #8 /* r2 = .321 */
1791 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
1792 mov r1, r1, lsr #8 /* r1 = ...5 */
1794 strb r1, [r0, #0x05]
1799 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1802 ldrh r3, [r1, #0x01]
1803 ldrh ip, [r1, #0x03]
1804 ldrb r1, [r1, #0x05]
1806 strh r3, [r0, #0x01]
1807 strh ip, [r0, #0x03]
1808 strb r1, [r0, #0x05]
1813 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1815 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1816 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1818 mov r2, r2, lsr #8 /* r2 = ...1 */
1819 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
1820 mov r1, r1, lsr #24 /* r1 = ...5 */
1822 strb r1, [r0, #0x05]
1827 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1831 ldrb r1, [r1, #0x05]
1834 strb r1, [r0, #0x05]
1839 /******************************************************************************
1840 * Special case for 8 byte copies
/*
 * Each of the 16 alignment cases below is padded to LMEMCPY_8_LOG2
 * (64 bytes) so the dispatch can branch to entry = table +
 * (index << LMEMCPY_8_LOG2).
 */
1842 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
1843 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
/*
 * Dispatch: r2 apparently ends up as (dst & 3) << 2 | (src & 3)
 * -- TODO confirm, the instruction computing the src bits is not
 * visible in this excerpt -- then the pc is advanced into the
 * 64-byte-aligned jump table.
 */
1847 orr r2, r2, r0, lsl #2
1850 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
1853 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1863 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1865 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1866 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
1867 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1868 mov r3, r3, lsr #8 /* r3 = .210 */
1869 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1870 mov r1, r1, lsl #24 /* r1 = 7... */
1871 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
1878 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1880 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1881 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1882 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1883 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1884 mov r3, r3, lsr #16 /* r3 = ..54 */
1885 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
1892 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1894 ldrb r3, [r1] /* r3 = ...0 */
1895 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1896 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
1897 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1898 mov r2, r2, lsr #24 /* r2 = ...4 */
1899 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
1906 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1908 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1909 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
1911 mov r1, r2, lsr #24 /* r1 = ...7 */
1912 strb r1, [r0, #0x07]
1913 mov r1, r3, lsr #8 /* r1 = .321 */
1914 mov r3, r3, lsr #24 /* r3 = ...3 */
1915 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
1916 strh r1, [r0, #0x01]
1922 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1925 ldrh r3, [r1, #0x01]
1927 ldrb r1, [r1, #0x07]
1929 strh r3, [r0, #0x01]
1931 strb r1, [r0, #0x07]
1936 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1938 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1939 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1940 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1941 strb r2, [r0] /* 0 */
1942 mov ip, r1, lsr #8 /* ip = ...7 */
1943 strb ip, [r0, #0x07] /* 7 */
1944 mov ip, r2, lsr #8 /* ip = ...1 */
1945 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1946 mov r3, r3, lsr #8 /* r3 = .543 */
1947 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
1948 strh ip, [r0, #0x01]
1954 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1956 ldrb r3, [r1] /* r3 = ...0 */
1957 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1958 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
1959 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1961 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
1962 strh ip, [r0, #0x01]
1963 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
1965 strb r1, [r0, #0x07]
1970 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1972 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1973 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1974 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1976 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
1977 mov r3, r3, lsr #16 /* r3 = ..76 */
1979 strh r3, [r0, #0x06]
1984 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1986 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1987 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1988 ldrb ip, [r1, #0x07] /* ip = ...7 */
1989 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1991 mov r1, r2, lsr #24 /* r1 = ...2 */
1992 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
1993 mov r3, r3, lsr #24 /* r3 = ...6 */
1994 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
1996 strh r3, [r0, #0x06]
2001 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2005 ldrh r3, [r1, #0x06]
2008 strh r3, [r0, #0x06]
2013 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2015 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
2016 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
2017 ldrb ip, [r1] /* ip = ...0 */
2018 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
2019 strh r1, [r0, #0x06]
2020 mov r3, r3, lsl #24 /* r3 = 5... */
2021 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
2022 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
2029 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2031 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2032 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2033 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
2034 strh r1, [r0, #0x05]
2036 mov r1, r3, lsr #24 /* r1 = ...7 */
2037 strb r1, [r0, #0x07]
2038 mov r2, r2, lsr #8 /* r2 = .321 */
2039 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
2045 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2047 ldrb r3, [r1] /* r3 = ...0 */
2048 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
2049 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2050 ldrb r1, [r1, #0x07] /* r1 = ...7 */
2052 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
2053 strh r3, [r0, #0x05]
2054 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
2056 strb r1, [r0, #0x07]
2061 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2063 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2064 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2065 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
2067 mov ip, r2, lsr #8 /* ip = ...1 */
2068 orr ip, ip, r3, lsl #8 /* ip = 4321 */
2069 mov r2, r1, lsr #8 /* r2 = ...7 */
2070 strb r2, [r0, #0x07]
2071 mov r1, r1, lsl #8 /* r1 = .76. */
2072 orr r1, r1, r3, lsr #24 /* r1 = .765 */
2074 strh r1, [r0, #0x05]
2079 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2083 ldrh r3, [r1, #0x05]
2084 ldrb r1, [r1, #0x07]
2087 strh r3, [r0, #0x05]
2088 strb r1, [r0, #0x07]
2092 /******************************************************************************
2093 * Special case for 12 byte copies
/*
 * 12-byte (0xC) copy jump table: 16 alignment cases, each padded to
 * LMEMCPY_C_LOG2 (128 bytes) so the dispatch below can branch to
 * entry = table + (index << LMEMCPY_C_LOG2).  r0 = dst, r1 = src;
 * per-line comments trace byte flow (hex digits 0..B are source byte
 * indices, "." is zero, "x" is don't-care).
 * NOTE(review): fragmentary excerpt -- labels, padding, some stores
 * and the returns are elided between the visible lines.
 */
2095 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
2096 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2
/*
 * Dispatch: r2 apparently ends up as (dst & 3) << 2 | (src & 3)
 * -- TODO confirm, the src-bits computation is not visible here.
 */
2100 orr r2, r2, r0, lsl #2
2103 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2
2106 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2118 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2120 ldrb r2, [r1, #0xb] /* r2 = ...B */
2121 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
2122 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
2123 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2124 mov r2, r2, lsl #24 /* r2 = B... */
2125 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
2127 mov r2, ip, lsl #24 /* r2 = 7... */
2128 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
2129 mov r1, r1, lsr #8 /* r1 = .210 */
2130 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
2137 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2139 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2140 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2141 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
2142 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
2143 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
2145 mov r3, r3, lsr #16 /* r3 = ..54 */
2146 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
2147 mov r1, r1, lsl #16 /* r1 = BA.. */
2148 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
2155 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2157 ldrb r2, [r1] /* r2 = ...0 */
2158 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
2159 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
2160 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
2161 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
2163 mov r3, r3, lsr #24 /* r3 = ...4 */
2164 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
2165 mov r1, r1, lsl #8 /* r1 = BA9. */
2166 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
2173 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2175 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2176 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2177 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
2178 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
2179 strh r1, [r0, #0x01]
2181 mov r1, r2, lsr #24 /* r1 = ...3 */
2182 orr r2, r1, r3, lsl #8 /* r1 = 6543 */
2183 mov r1, r3, lsr #24 /* r1 = ...7 */
2184 orr r1, r1, ip, lsl #8 /* r1 = A987 */
2185 mov ip, ip, lsr #24 /* ip = ...B */
2188 strb ip, [r0, #0x0b]
2193 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2196 ldrh r3, [r1, #0x01]
2200 ldrb r1, [r1, #0x0b]
2201 strh r3, [r0, #0x01]
2204 strb r1, [r0, #0x0b]
2209 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2211 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
2212 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
2213 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
2214 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
2216 mov r2, r2, lsr #8 /* r2 = ...1 */
2217 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2218 strh r2, [r0, #0x01]
2219 mov r2, r3, lsr #8 /* r2 = .543 */
2220 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
2221 mov r2, ip, lsr #8 /* r2 = .987 */
2222 orr r2, r2, r1, lsl #24 /* r2 = A987 */
2223 mov r1, r1, lsr #8 /* r1 = ...B */
2226 strb r1, [r0, #0x0b]
2231 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2234 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
2235 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
2236 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
2238 strh r3, [r0, #0x01]
2239 mov r3, r3, lsr #16 /* r3 = ..43 */
2240 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
2241 mov ip, ip, lsr #16 /* ip = ..87 */
2242 orr ip, ip, r1, lsl #16 /* ip = A987 */
2243 mov r1, r1, lsr #16 /* r1 = ..xB */
2246 strb r1, [r0, #0x0b]
2251 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2253 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
2254 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
2255 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
2256 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
2258 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
2259 mov r3, r3, lsr #16 /* r3 = ..76 */
2260 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
2261 mov r2, r2, lsr #16 /* r2 = ..BA */
2264 strh r2, [r0, #0x0a]
2269 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2271 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
2272 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
2273 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */
2275 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
2276 ldrb r1, [r1, #0x0b] /* r1 = ...B */
2277 mov r2, r2, lsr #24 /* r2 = ...2 */
2278 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
2279 mov r3, r3, lsr #24 /* r3 = ...6 */
2280 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
2281 mov r1, r1, lsl #8 /* r1 = ..B. */
2282 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
2285 strh r1, [r0, #0x0a]
2290 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2295 ldrh r1, [r1, #0x0a]
2299 strh r1, [r0, #0x0a]
2304 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2306 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
2307 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
2308 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
2309 strh ip, [r0, #0x0a]
2310 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
2311 ldrb r1, [r1] /* r1 = ...0 */
2312 mov r2, r2, lsl #24 /* r2 = 9... */
2313 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
2314 mov r3, r3, lsl #24 /* r3 = 5... */
2315 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
2316 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
2324 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2326 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
2327 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
2328 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
2330 mov r3, r2, lsr #8 /* r3 = .321 */
2331 orr r3, r3, ip, lsl #24 /* r3 = 4321 */
2333 mov r3, ip, lsr #8 /* r3 = .765 */
2334 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
2336 mov r1, r1, lsr #8 /* r1 = .BA9 */
2337 strh r1, [r0, #0x09]
2338 mov r1, r1, lsr #16 /* r1 = ...B */
2339 strb r1, [r0, #0x0b]
2344 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2346 ldrb r2, [r1, #0x0b] /* r2 = ...B */
2347 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
2348 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2349 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2350 strb r2, [r0, #0x0b]
2351 mov r2, r3, lsr #16 /* r2 = ..A9 */
2352 strh r2, [r0, #0x09]
2353 mov r3, r3, lsl #16 /* r3 = 87.. */
2354 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
2355 mov ip, ip, lsl #16 /* ip = 43.. */
2356 orr ip, ip, r1, lsr #16 /* ip = 4321 */
2357 mov r1, r1, lsr #8 /* r1 = .210 */
2365 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2367 ldrh r2, [r1] /* r2 = ..10 */
2368 ldr r3, [r1, #0x02] /* r3 = 5432 */
2369 ldr ip, [r1, #0x06] /* ip = 9876 */
2370 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2372 mov r2, r2, lsr #8 /* r2 = ...1 */
2373 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2374 mov r3, r3, lsr #24 /* r3 = ...5 */
2375 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
2376 mov ip, ip, lsr #24 /* ip = ...9 */
2377 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
2378 mov r1, r1, lsr #8 /* r1 = ...B */
2381 strh ip, [r0, #0x09]
2382 strb r1, [r0, #0x0b]
2387 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2393 ldrh r2, [r1, #0x09]
2394 ldrb r1, [r1, #0x0b]
2397 strh r2, [r0, #0x09]
2398 strb r1, [r0, #0x0b]
2401 #endif /* _ARM_ARCH_5E */