2 * Copyright (c) 2004 Olivier Houchard
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed for the NetBSD Project by
43 * Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 * or promote products derived from this software without specific prior
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
70 * 1. Redistributions of source code must retain the above copyright
71 * notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 * notice, this list of conditions and the following disclaimer in the
74 * documentation and/or other materials provided with the distribution.
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
89 #include <machine/asm.h>
90 __FBSDID("$FreeBSD$");
97 * memset: Sets a block of memory to the specified value
102 * r2 - number of bytes to write
107 /* LINTSTUB: Func: void *memset(void *, int, size_t) */
/*
 * NOTE(review): this excerpt elides some original lines (the embedded
 * line numbers are non-contiguous), including the ENTRY() prologue and
 * the subs instructions that set the ge/lt flags consumed below.
 * From what is visible: ip = destination write cursor, r1 = fill value
 * on entry and then the remaining byte count, r3 = fill byte replicated
 * across the word, r2 = scratch / second word of the doubled pattern
 * for strd.  Confirm register roles against the elided entry code.
 */
109 and r3, r1, #0xff /* We deal with bytes */
112 cmp r1, #0x04 /* Do we have less than 4 bytes */
114 blt .Lmemset_lessthanfour
116 /* Ok first we will word align the address */
117 ands r2, ip, #0x03 /* Get the bottom two bits */
118 bne .Lmemset_wordunaligned /* The address is not word aligned */
120 /* We are now word aligned */
121 .Lmemset_wordaligned:
122 orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
123 tst ip, #0x04 /* Quad-align for armv5e */
124 orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
125 subne r1, r1, #0x04 /* Quad-align if necessary */
126 strne r3, [ip], #0x04
128 blt .Lmemset_loop4 /* If less than 16 then use words */
129 mov r2, r3 /* Duplicate data */
130 cmp r1, #0x80 /* If < 128 then skip the big loop */
133 /* Do 128 bytes at a time */
/*
 * 16 conditional doubleword stores = 128 bytes per iteration; r2:r3
 * hold the same replicated pattern so each strd writes 8 equal bytes.
 * The subs that decrements r1 and sets the ge condition is elided here.
 */
136 strdge r2, [ip], #0x08
137 strdge r2, [ip], #0x08
138 strdge r2, [ip], #0x08
139 strdge r2, [ip], #0x08
140 strdge r2, [ip], #0x08
141 strdge r2, [ip], #0x08
142 strdge r2, [ip], #0x08
143 strdge r2, [ip], #0x08
144 strdge r2, [ip], #0x08
145 strdge r2, [ip], #0x08
146 strdge r2, [ip], #0x08
147 strdge r2, [ip], #0x08
148 strdge r2, [ip], #0x08
149 strdge r2, [ip], #0x08
150 strdge r2, [ip], #0x08
151 strdge r2, [ip], #0x08
153 RETeq /* Zero length so just exit */
155 add r1, r1, #0x80 /* Adjust for extra sub */
157 /* Do 32 bytes at a time */
160 strdge r2, [ip], #0x08
161 strdge r2, [ip], #0x08
162 strdge r2, [ip], #0x08
163 strdge r2, [ip], #0x08
165 RETeq /* Zero length so just exit */
167 adds r1, r1, #0x10 /* Partially adjust for extra sub */
169 /* Deal with 16 bytes or more */
170 strdge r2, [ip], #0x08
171 strdge r2, [ip], #0x08
172 RETeq /* Zero length so just exit */
174 addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
176 /* We have at least 4 bytes so copy as words */
179 strge r3, [ip], #0x04
181 RETeq /* Zero length so just exit */
183 /* Compensate for 64-bit alignment check */
/* At most 3 trailing bytes remain; condition flags select how many. */
188 strb r3, [ip], #0x01 /* Set 1 byte */
189 strbge r3, [ip], #0x01 /* Set another byte */
190 strbgt r3, [ip] /* and a third */
/*
 * Destination was not word aligned: store 1-3 leading bytes until it
 * is, then re-enter the aligned path if at least 4 bytes remain.
 */
193 .Lmemset_wordunaligned:
195 strb r3, [ip], #0x01 /* Set 1 byte */
197 strbge r3, [ip], #0x01 /* Set another byte */
199 strbgt r3, [ip], #0x01 /* and a third */
200 cmp r1, #0x04 /* More than 4 bytes left? */
201 bge .Lmemset_wordaligned /* Yup */
/* Short fill (0-3 bytes total): byte stores only. */
203 .Lmemset_lessthanfour:
205 RETeq /* Zero length so exit */
206 strb r3, [ip], #0x01 /* Set 1 byte */
208 strbge r3, [ip], #0x01 /* Set another byte */
209 strbgt r3, [ip] /* and a third */
/*
 * memcmp (visible portion): alignment dispatch, word-at-a-time compare
 * loop, and a hand-scheduled fast path for 6-byte compares.
 * NOTE(review): the ENTRY() prologue, flag-setting instructions and the
 * dispatch-table slots are elided from this excerpt; register roles
 * below (ip/r1 = buffer cursors, r0 = running result) are taken from
 * the visible per-byte comments -- confirm against the full source.
 */
219 /* Are both addresses aligned the same way? */
222 RETeq /* len == 0, or same addresses! */
225 bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
227 /* Word-align the addresses, if necessary */
/*
 * r3 = r3 * 3, then pc += r3 << 3: a computed jump into fixed-size
 * code slots (presumably 24 bytes each, selected by the number of
 * leading bytes to compare -- the slots themselves are elided here).
 */
230 add r3, r3, r3, lsl #1
231 addne pc, pc, r3, lsl #3
234 /* Compare up to 3 bytes */
242 /* Compare up to 2 bytes */
258 /* Compare 4 bytes at a time, if possible */
260 bcc .Lmemcmp_bytewise
261 .Lmemcmp_word_aligned:
266 beq .Lmemcmp_word_aligned
269 /* Correct for extra subtraction, and check if done */
271 cmpeq r0, #0x00 /* If done, did all bytes match? */
272 RETeq /* Yup. Just return */
274 /* Re-do the final word byte-wise */
285 beq .Lmemcmp_bytewise2
290 * 6 byte compares are very common, thanks to the network stack.
291 * This code is hand-scheduled to reduce the number of stalls for
292 * load results. Everything else being equal, this will be ~32%
293 * faster than a byte-wise memcmp.
/*
 * Returns the difference of the first mismatching byte pair in r0
 * (memcmp contract); loads for byte N+1 are issued before the compare
 * of byte N resolves, hiding the load-use latency.
 */
297 ldrb r3, [r1, #0x00] /* r3 = b2#0 */
298 ldrb r0, [ip, #0x00] /* r0 = b1#0 */
299 ldrb r2, [r1, #0x01] /* r2 = b2#1 */
300 subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
301 ldrbeq r3, [ip, #0x01] /* r3 = b1#1 */
302 RETne /* Return if mismatch on #0 */
303 subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
304 ldrbeq r3, [r1, #0x02] /* r3 = b2#2 */
305 ldrbeq r0, [ip, #0x02] /* r0 = b1#2 */
306 RETne /* Return if mismatch on #1 */
307 ldrb r2, [r1, #0x03] /* r2 = b2#3 */
308 subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
309 ldrbeq r3, [ip, #0x03] /* r3 = b1#3 */
310 RETne /* Return if mismatch on #2 */
311 subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
312 ldrbeq r3, [r1, #0x04] /* r3 = b2#4 */
313 ldrbeq r0, [ip, #0x04] /* r0 = b1#4 */
314 RETne /* Return if mismatch on #3 */
315 ldrb r2, [r1, #0x05] /* r2 = b2#5 */
316 subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
317 ldrbeq r3, [ip, #0x05] /* r3 = b1#5 */
318 RETne /* Return if mismatch on #4 */
319 sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
/*
 * memmove (visible portion): if source and destination do not overlap,
 * tail-call memcpy; otherwise copy forwards (ascending) when it is safe,
 * or backwards (descending) when the destination lies above the source.
 * NOTE(review): many original lines are elided from this excerpt
 * (labels such as .Lmemmove_backwards, the subs instructions that set
 * ge/lt flags, and any big-endian variants), so the flag producers for
 * the conditional instructions below are not visible here.
 */
324 /* Do the buffers overlap? */
326 RETeq /* Bail now if src/dst are the same */
327 subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
328 subcs r3, r1, r0 /* if (src > dst) r3 = src - dst */
329 cmp r3, r2 /* if (r3 < len) we have an overlap */
330 bcc PIC_SYM(_C_LABEL(memcpy), PLT) /* no overlap: plain memcpy */
332 /* Determine copy direction */
334 bcc .Lmemmove_backwards
/* ---- Forward (ascending) copy ---- */
336 moveq r0, #0 /* Quick abort for len=0 */
339 stmdb sp!, {r0, lr} /* memmove() returns dest addr */
341 blt .Lmemmove_fl4 /* less than 4 bytes */
343 bne .Lmemmove_fdestul /* oh unaligned destination addr */
345 bne .Lmemmove_fsrcul /* oh unaligned source addr */
348 /* We have aligned source and destination */
350 blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
352 blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
353 stmdb sp!, {r4} /* borrow r4 */
355 /* blat 32 bytes at a time */
356 /* XXX for really big copies perhaps we should use more registers */
358 ldmia r1!, {r3, r4, r12, lr}
359 stmia r0!, {r3, r4, r12, lr}
360 ldmia r1!, {r3, r4, r12, lr}
361 stmia r0!, {r3, r4, r12, lr}
363 bge .Lmemmove_floop32
366 ldmiage r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
367 stmiage r0!, {r3, r4, r12, lr}
369 ldmia sp!, {r4} /* return r4 */
374 /* blat 12 bytes at a time */
376 ldmiage r1!, {r3, r12, lr}
377 stmiage r0!, {r3, r12, lr}
379 bge .Lmemmove_floop12
388 ldmiage r1!, {r3, r12}
389 stmiage r0!, {r3, r12}
393 /* less than 4 bytes to go */
395 ldmiaeq sp!, {r0, pc} /* done */
397 /* copy the crud byte at a time */
407 /* erg - unaligned destination */
412 /* align destination with byte copies */
420 blt .Lmemmove_fl4 /* less the 4 bytes */
423 beq .Lmemmove_ft8 /* we have an aligned source */
425 /* erg - unaligned source */
426 /* This is where it gets nasty ... */
/*
 * Forward unaligned-source paths (fsrcul1/2/3 = source 1/2/3 bytes off
 * word alignment): words are fetched aligned and spliced together with
 * lsr/lsl pairs before aligned stores.  The shift amounts visible here
 * correspond to little-endian byte order; a big-endian variant, if any,
 * is elided from this excerpt -- confirm.
 */
431 bgt .Lmemmove_fsrcul3
432 beq .Lmemmove_fsrcul2
434 blt .Lmemmove_fsrcul1loop4
438 .Lmemmove_fsrcul1loop16:
440 ldmia r1!, {r4, r5, r12, lr}
441 orr r3, r3, r4, lsl #24
443 orr r4, r4, r5, lsl #24
445 orr r5, r5, r12, lsl #24
447 orr r12, r12, lr, lsl #24
448 stmia r0!, {r3-r5, r12}
450 bge .Lmemmove_fsrcul1loop16
453 blt .Lmemmove_fsrcul1l4
455 .Lmemmove_fsrcul1loop4:
458 orr r12, r12, lr, lsl #24
461 bge .Lmemmove_fsrcul1loop4
469 blt .Lmemmove_fsrcul2loop4
473 .Lmemmove_fsrcul2loop16:
475 ldmia r1!, {r4, r5, r12, lr}
476 orr r3, r3, r4, lsl #16
478 orr r4, r4, r5, lsl #16
480 orr r5, r5, r12, lsl #16
481 mov r12, r12, lsr #16
482 orr r12, r12, lr, lsl #16
483 stmia r0!, {r3-r5, r12}
485 bge .Lmemmove_fsrcul2loop16
488 blt .Lmemmove_fsrcul2l4
490 .Lmemmove_fsrcul2loop4:
493 orr r12, r12, lr, lsl #16
496 bge .Lmemmove_fsrcul2loop4
504 blt .Lmemmove_fsrcul3loop4
508 .Lmemmove_fsrcul3loop16:
510 ldmia r1!, {r4, r5, r12, lr}
511 orr r3, r3, r4, lsl #8
513 orr r4, r4, r5, lsl #8
515 orr r5, r5, r12, lsl #8
516 mov r12, r12, lsr #24
517 orr r12, r12, lr, lsl #8
518 stmia r0!, {r3-r5, r12}
520 bge .Lmemmove_fsrcul3loop16
523 blt .Lmemmove_fsrcul3l4
525 .Lmemmove_fsrcul3loop4:
528 orr r12, r12, lr, lsl #8
531 bge .Lmemmove_fsrcul3loop4
/*
 * ---- Backward (descending) copy ----
 * Presumably reached via .Lmemmove_backwards (label elided) when the
 * destination overlaps above the source; cursors are pre-advanced past
 * the end and the ldmdb/stmdb and [rN, #-1]! forms walk downwards.
 */
541 blt .Lmemmove_bl4 /* less than 4 bytes */
543 bne .Lmemmove_bdestul /* oh unaligned destination addr */
545 bne .Lmemmove_bsrcul /* oh unaligned source addr */
548 /* We have aligned source and destination */
550 blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
552 subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
555 /* blat 32 bytes at a time */
556 /* XXX for really big copies perhaps we should use more registers */
558 ldmdb r1!, {r3, r4, r12, lr}
559 stmdb r0!, {r3, r4, r12, lr}
560 ldmdb r1!, {r3, r4, r12, lr}
561 stmdb r0!, {r3, r4, r12, lr}
563 bge .Lmemmove_bloop32
567 ldmdbge r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
568 stmdbge r0!, {r3, r4, r12, lr}
571 ldmdbge r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
572 stmdbge r0!, {r3, r12, lr}
582 ldmdbge r1!, {r3, r12}
583 stmdbge r0!, {r3, r12}
587 /* less than 4 bytes to go */
591 /* copy the crud byte at a time */
595 ldrbge r3, [r1, #-1]!
596 strbge r3, [r0, #-1]!
597 ldrbgt r3, [r1, #-1]!
598 strbgt r3, [r0, #-1]!
601 /* erg - unaligned destination */
605 /* align destination with byte copies */
608 ldrbge r3, [r1, #-1]!
609 strbge r3, [r0, #-1]!
610 ldrbgt r3, [r1, #-1]!
611 strbgt r3, [r0, #-1]!
613 blt .Lmemmove_bl4 /* less than 4 bytes to go */
615 beq .Lmemmove_bt8 /* we have an aligned source */
617 /* erg - unaligned source */
618 /* This is where it gets nasty ... */
/*
 * Backward unaligned-source paths: mirror of the forward fsrcul loops,
 * with the lsr/lsl splice directions reversed for descending order.
 */
623 blt .Lmemmove_bsrcul1
624 beq .Lmemmove_bsrcul2
626 blt .Lmemmove_bsrcul3loop4
628 stmdb sp!, {r4, r5, lr}
630 .Lmemmove_bsrcul3loop16:
632 ldmdb r1!, {r3-r5, r12}
633 orr lr, lr, r12, lsr #24
635 orr r12, r12, r5, lsr #24
637 orr r5, r5, r4, lsr #24
639 orr r4, r4, r3, lsr #24
640 stmdb r0!, {r4, r5, r12, lr}
642 bge .Lmemmove_bsrcul3loop16
643 ldmia sp!, {r4, r5, lr}
645 blt .Lmemmove_bsrcul3l4
647 .Lmemmove_bsrcul3loop4:
650 orr r12, r12, r3, lsr #24
653 bge .Lmemmove_bsrcul3loop4
661 blt .Lmemmove_bsrcul2loop4
663 stmdb sp!, {r4, r5, lr}
665 .Lmemmove_bsrcul2loop16:
667 ldmdb r1!, {r3-r5, r12}
668 orr lr, lr, r12, lsr #16
669 mov r12, r12, lsl #16
670 orr r12, r12, r5, lsr #16
672 orr r5, r5, r4, lsr #16
674 orr r4, r4, r3, lsr #16
675 stmdb r0!, {r4, r5, r12, lr}
677 bge .Lmemmove_bsrcul2loop16
678 ldmia sp!, {r4, r5, lr}
680 blt .Lmemmove_bsrcul2l4
682 .Lmemmove_bsrcul2loop4:
685 orr r12, r12, r3, lsr #16
688 bge .Lmemmove_bsrcul2loop4
696 blt .Lmemmove_bsrcul1loop4
698 stmdb sp!, {r4, r5, lr}
700 .Lmemmove_bsrcul1loop32:
702 ldmdb r1!, {r3-r5, r12}
703 orr lr, lr, r12, lsr #8
704 mov r12, r12, lsl #24
705 orr r12, r12, r5, lsr #8
707 orr r5, r5, r4, lsr #8
709 orr r4, r4, r3, lsr #8
710 stmdb r0!, {r4, r5, r12, lr}
712 bge .Lmemmove_bsrcul1loop32
713 ldmia sp!, {r4, r5, lr}
715 blt .Lmemmove_bsrcul1l4
717 .Lmemmove_bsrcul1loop4:
720 orr r12, r12, r3, lsr #8
723 bge .Lmemmove_bsrcul1loop4
730 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
734 ble .Lmemcpy_short /* <= 12 bytes */
736 #if FLASHADDR > PHYSADDR
746 mov r3, r0 /* We must not clobber r0 */
748 /* Word-align the destination buffer */
749 ands ip, r3, #0x03 /* Already word aligned? */
750 beq .Lmemcpy_wordaligned /* Yup */
755 ldrble ip, [r1], #0x01
757 strble ip, [r3], #0x01
758 ldrblt ip, [r1], #0x01
760 strblt ip, [r3], #0x01
762 /* Destination buffer is now word aligned */
763 .Lmemcpy_wordaligned:
764 ands ip, r1, #0x03 /* Is src also word-aligned? */
765 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
767 /* Quad-align the destination buffer */
768 tst r3, #0x07 /* Already quad aligned? */
769 ldrne ip, [r1], #0x04
770 stmfd sp!, {r4-r9} /* Free up some registers */
772 strne ip, [r3], #0x04
774 /* Destination buffer quad aligned, source is at least word aligned */
776 blt .Lmemcpy_w_lessthan128
778 /* Copy 128 bytes at a time */
780 ldr r4, [r1], #0x04 /* LD:00-03 */
781 ldr r5, [r1], #0x04 /* LD:04-07 */
782 pld [r1, #0x18] /* Prefetch 0x20 */
783 ldr r6, [r1], #0x04 /* LD:08-0b */
784 ldr r7, [r1], #0x04 /* LD:0c-0f */
785 ldr r8, [r1], #0x04 /* LD:10-13 */
786 ldr r9, [r1], #0x04 /* LD:14-17 */
787 strd r4, [r3], #0x08 /* ST:00-07 */
788 ldr r4, [r1], #0x04 /* LD:18-1b */
789 ldr r5, [r1], #0x04 /* LD:1c-1f */
790 strd r6, [r3], #0x08 /* ST:08-0f */
791 ldr r6, [r1], #0x04 /* LD:20-23 */
792 ldr r7, [r1], #0x04 /* LD:24-27 */
793 pld [r1, #0x18] /* Prefetch 0x40 */
794 strd r8, [r3], #0x08 /* ST:10-17 */
795 ldr r8, [r1], #0x04 /* LD:28-2b */
796 ldr r9, [r1], #0x04 /* LD:2c-2f */
797 strd r4, [r3], #0x08 /* ST:18-1f */
798 ldr r4, [r1], #0x04 /* LD:30-33 */
799 ldr r5, [r1], #0x04 /* LD:34-37 */
800 strd r6, [r3], #0x08 /* ST:20-27 */
801 ldr r6, [r1], #0x04 /* LD:38-3b */
802 ldr r7, [r1], #0x04 /* LD:3c-3f */
803 strd r8, [r3], #0x08 /* ST:28-2f */
804 ldr r8, [r1], #0x04 /* LD:40-43 */
805 ldr r9, [r1], #0x04 /* LD:44-47 */
806 pld [r1, #0x18] /* Prefetch 0x60 */
807 strd r4, [r3], #0x08 /* ST:30-37 */
808 ldr r4, [r1], #0x04 /* LD:48-4b */
809 ldr r5, [r1], #0x04 /* LD:4c-4f */
810 strd r6, [r3], #0x08 /* ST:38-3f */
811 ldr r6, [r1], #0x04 /* LD:50-53 */
812 ldr r7, [r1], #0x04 /* LD:54-57 */
813 strd r8, [r3], #0x08 /* ST:40-47 */
814 ldr r8, [r1], #0x04 /* LD:58-5b */
815 ldr r9, [r1], #0x04 /* LD:5c-5f */
816 strd r4, [r3], #0x08 /* ST:48-4f */
817 ldr r4, [r1], #0x04 /* LD:60-63 */
818 ldr r5, [r1], #0x04 /* LD:64-67 */
819 pld [r1, #0x18] /* Prefetch 0x80 */
820 strd r6, [r3], #0x08 /* ST:50-57 */
821 ldr r6, [r1], #0x04 /* LD:68-6b */
822 ldr r7, [r1], #0x04 /* LD:6c-6f */
823 strd r8, [r3], #0x08 /* ST:58-5f */
824 ldr r8, [r1], #0x04 /* LD:70-73 */
825 ldr r9, [r1], #0x04 /* LD:74-77 */
826 strd r4, [r3], #0x08 /* ST:60-67 */
827 ldr r4, [r1], #0x04 /* LD:78-7b */
828 ldr r5, [r1], #0x04 /* LD:7c-7f */
829 strd r6, [r3], #0x08 /* ST:68-6f */
830 strd r8, [r3], #0x08 /* ST:70-77 */
832 strd r4, [r3], #0x08 /* ST:78-7f */
833 bge .Lmemcpy_w_loop128
835 .Lmemcpy_w_lessthan128:
836 adds r2, r2, #0x80 /* Adjust for extra sub */
838 RETeq /* Return now if done */
840 blt .Lmemcpy_w_lessthan32
842 /* Copy 32 bytes at a time */
858 bge .Lmemcpy_w_loop32
860 .Lmemcpy_w_lessthan32:
861 adds r2, r2, #0x20 /* Adjust for extra sub */
863 RETeq /* Return now if done */
867 addne pc, pc, r4, lsl #1
870 /* At least 24 bytes remaining */
876 /* At least 16 bytes remaining */
882 /* At least 8 bytes remaining */
888 /* Less than 8 bytes remaining */
890 RETeq /* Return now if done */
892 ldrge ip, [r1], #0x04
893 strge ip, [r3], #0x04
894 RETeq /* Return now if done */
898 ldrbge r2, [r1], #0x01
901 strbge r2, [r3], #0x01
904 /* Place a literal pool here for the above ldr instructions to use */
909 * At this point, it has not been possible to word align both buffers.
910 * The destination buffer is word aligned, but the source buffer is not.
921 .Lmemcpy_bad1_loop16:
928 orr r4, r4, r5, lsl #24
930 orr r5, r5, r6, lsl #24
932 orr r6, r6, r7, lsl #24
934 orr r7, r7, ip, lsl #24
941 bge .Lmemcpy_bad1_loop16
945 RETeq /* Return now if done */
948 blt .Lmemcpy_bad_done
954 orr r4, r4, ip, lsl #24
956 bge .Lmemcpy_bad1_loop4
960 .Lmemcpy_bad2_loop16:
967 orr r4, r4, r5, lsl #16
969 orr r5, r5, r6, lsl #16
971 orr r6, r6, r7, lsl #16
973 orr r7, r7, ip, lsl #16
980 bge .Lmemcpy_bad2_loop16
984 RETeq /* Return now if done */
987 blt .Lmemcpy_bad_done
993 orr r4, r4, ip, lsl #16
995 bge .Lmemcpy_bad2_loop4
999 .Lmemcpy_bad3_loop16:
1006 orr r4, r4, r5, lsl #8
1008 orr r5, r5, r6, lsl #8
1010 orr r6, r6, r7, lsl #8
1012 orr r7, r7, ip, lsl #8
1019 bge .Lmemcpy_bad3_loop16
1022 ldmfdeq sp!, {r4-r7}
1023 RETeq /* Return now if done */
1026 blt .Lmemcpy_bad_done
1028 .Lmemcpy_bad3_loop4:
1032 orr r4, r4, ip, lsl #8
1034 bge .Lmemcpy_bad3_loop4
1041 ldrb ip, [r1], #0x01
1043 ldrbge r2, [r1], #0x01
1044 strb ip, [r3], #0x01
1046 strbge r2, [r3], #0x01
1052 * Handle short copies (less than 16 bytes), possibly misaligned.
1053 * Some of these are *very* common, thanks to the network stack,
1054 * and so are handled specially.
1057 add pc, pc, r2, lsl #2
1060 b .Lmemcpy_bytewise /* 0x01 */
1061 b .Lmemcpy_bytewise /* 0x02 */
1062 b .Lmemcpy_bytewise /* 0x03 */
1063 b .Lmemcpy_4 /* 0x04 */
1064 b .Lmemcpy_bytewise /* 0x05 */
1065 b .Lmemcpy_6 /* 0x06 */
1066 b .Lmemcpy_bytewise /* 0x07 */
1067 b .Lmemcpy_8 /* 0x08 */
1068 b .Lmemcpy_bytewise /* 0x09 */
1069 b .Lmemcpy_bytewise /* 0x0a */
1070 b .Lmemcpy_bytewise /* 0x0b */
1071 b .Lmemcpy_c /* 0x0c */
1073 mov r3, r0 /* We must not clobber r0 */
1074 ldrb ip, [r1], #0x01
1075 1: subs r2, r2, #0x01
1076 strb ip, [r3], #0x01
1077 ldrbne ip, [r1], #0x01
1081 /******************************************************************************
1082 * Special case for 4 byte copies
1084 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
1085 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
1089 orr r2, r2, r0, lsl #2
1092 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
1095 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1103 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1105 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1106 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
1107 mov r3, r3, lsr #8 /* r3 = .210 */
1108 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1114 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1116 ldrh r3, [r1, #0x02]
1118 orr r3, r2, r3, lsl #16
1124 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1126 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
1127 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
1128 mov r3, r3, lsr #24 /* r3 = ...0 */
1129 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1135 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1141 strb r1, [r0, #0x03]
1142 strh r3, [r0, #0x01]
1147 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1150 ldrh r3, [r1, #0x01]
1151 ldrb r1, [r1, #0x03]
1153 strh r3, [r0, #0x01]
1154 strb r1, [r0, #0x03]
1159 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1161 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1162 ldrh r3, [r1, #0x02] /* LE:r3 = ..23 LE:r3 = ..32 */
1164 mov r2, r2, lsr #8 /* r2 = ...1 */
1165 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1166 mov r3, r3, lsr #8 /* r3 = ...3 */
1167 strh r2, [r0, #0x01]
1168 strb r3, [r0, #0x03]
1173 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1176 ldrh r3, [r1, #0x01]
1177 ldrb r1, [r1, #0x03]
1179 strh r3, [r0, #0x01]
1180 strb r1, [r0, #0x03]
1185 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1190 strh r3, [r0, #0x02]
1195 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1197 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1198 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
1199 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1201 mov r2, r2, lsr #24 /* r2 = ...2 */
1202 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
1203 strh r2, [r0, #0x02]
1208 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1211 ldrh r3, [r1, #0x02]
1213 strh r3, [r0, #0x02]
1218 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1220 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
1221 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1222 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
1223 strh r1, [r0, #0x02]
1224 mov r3, r3, lsl #8 /* r3 = 321. */
1225 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
1231 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1233 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1237 strh r3, [r0, #0x01]
1238 strb r1, [r0, #0x03]
1243 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1246 ldrh r3, [r1, #0x01]
1247 ldrb r1, [r1, #0x03]
1249 strh r3, [r0, #0x01]
1250 strb r1, [r0, #0x03]
1255 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1257 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1258 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
1260 mov r2, r2, lsr #8 /* r2 = ...1 */
1261 orr r2, r2, r3, lsl #8 /* r2 = .321 */
1262 strh r2, [r0, #0x01]
1263 mov r3, r3, lsr #8 /* r3 = ...3 */
1264 strb r3, [r0, #0x03]
1269 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1272 ldrh r3, [r1, #0x01]
1273 ldrb r1, [r1, #0x03]
1275 strh r3, [r0, #0x01]
1276 strb r1, [r0, #0x03]
1281 /******************************************************************************
1282 * Special case for 6 byte copies
1284 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
1285 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
1289 orr r2, r2, r0, lsl #2
1292 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
1295 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1298 ldrh r3, [r1, #0x04]
1300 strh r3, [r0, #0x04]
1305 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1307 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1308 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
1309 mov r2, r2, lsr #8 /* r2 = .210 */
1310 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
1311 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
1313 strh r3, [r0, #0x04]
1318 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1320 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1321 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1322 mov r1, r3, lsr #16 /* r1 = ..54 */
1323 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1325 strh r1, [r0, #0x04]
1330 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1332 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
1333 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
1334 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r3 = xxx5 */
1335 mov r2, r2, lsr #24 /* r2 = ...0 */
1336 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1337 mov r1, r1, lsl #8 /* r1 = xx5. */
1338 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
1340 strh r1, [r0, #0x04]
1345 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1347 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1348 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
1349 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1350 strh r1, [r0, #0x01]
1352 mov r3, r3, lsr #24 /* r3 = ...3 */
1353 orr r3, r3, r2, lsl #8 /* r3 = .543 */
1354 mov r2, r2, lsr #8 /* r2 = ...5 */
1355 strh r3, [r0, #0x03]
1356 strb r2, [r0, #0x05]
1361 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1364 ldrh r3, [r1, #0x01]
1365 ldrh ip, [r1, #0x03]
1366 ldrb r1, [r1, #0x05]
1368 strh r3, [r0, #0x01]
1369 strh ip, [r0, #0x03]
1370 strb r1, [r0, #0x05]
1375 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1377 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1378 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1381 strb r3, [r0, #0x05]
1382 mov r3, r1, lsr #8 /* r3 = .543 */
1383 strh r3, [r0, #0x03]
1384 mov r3, r2, lsr #8 /* r3 = ...1 */
1385 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
1386 strh r3, [r0, #0x01]
1391 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1394 ldrh r3, [r1, #0x01]
1395 ldrh ip, [r1, #0x03]
1396 ldrb r1, [r1, #0x05]
1398 strh r3, [r0, #0x01]
1399 strh ip, [r0, #0x03]
1400 strb r1, [r0, #0x05]
1405 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1407 ldrh r2, [r1, #0x04] /* r2 = ..54 */
1408 ldr r3, [r1] /* r3 = 3210 */
1409 mov r2, r2, lsl #16 /* r2 = 54.. */
1410 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
1417 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1419 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1420 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
1421 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1422 mov r2, r2, lsl #8 /* r2 = 543. */
1423 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
1430 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1440 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1442 ldrb r3, [r1] /* r3 = ...0 */
1443 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1444 ldrb r1, [r1, #0x05] /* r1 = ...5 */
1445 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1446 mov r1, r1, lsl #24 /* r1 = 5... */
1447 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
1454 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1456 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1457 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
1459 mov r2, r2, lsr #8 /* r2 = .321 */
1460 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
1461 mov r1, r1, lsr #8 /* r1 = ...5 */
1463 strb r1, [r0, #0x05]
1468 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1471 ldrh r3, [r1, #0x01]
1472 ldrh ip, [r1, #0x03]
1473 ldrb r1, [r1, #0x05]
1475 strh r3, [r0, #0x01]
1476 strh ip, [r0, #0x03]
1477 strb r1, [r0, #0x05]
1482 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1484 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1485 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
1487 mov r2, r2, lsr #8 /* r2 = ...1 */
1488 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
1489 mov r1, r1, lsr #24 /* r1 = ...5 */
1491 strb r1, [r0, #0x05]
1496 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1500 ldrb r1, [r1, #0x05]
1503 strb r1, [r0, #0x05]
1508 /******************************************************************************
1509 * Special case for 8 byte copies
1511 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
1512 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
1516 orr r2, r2, r0, lsl #2
1519 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
1522 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1532 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1534 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1535 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
1536 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1537 mov r3, r3, lsr #8 /* r3 = .210 */
1538 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1539 mov r1, r1, lsl #24 /* r1 = 7... */
1540 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
1547 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1549 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1550 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1551 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1552 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1553 mov r3, r3, lsr #16 /* r3 = ..54 */
1554 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
1561 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1563 ldrb r3, [r1] /* r3 = ...0 */
1564 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1565 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
1566 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1567 mov r2, r2, lsr #24 /* r2 = ...4 */
/*
 * NOTE(review): interior of the 8-byte-copy (LMEMCPY_8) alignment cases.
 * Each case is selected by ((dst & 3) << 2) | (src & 3).  In the byte-lane
 * comments, hex digits 0-7 name the eight source bytes; in the "LE" column
 * the rightmost digit sits in the register's least-significant byte, and
 * '.' marks a don't-care lane.  Some load/store/branch lines present in the
 * upstream implementation are not visible in this excerpt -- confirm each
 * case against upstream before treating it as complete.
 */
1568 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
1575 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1577 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1578 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
1580 mov r1, r2, lsr #24 /* r1 = ...7 */
1581 strb r1, [r0, #0x07]
1582 mov r1, r3, lsr #8 /* r1 = .321 */
1583 mov r3, r3, lsr #24 /* r3 = ...3 */
1584 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
1585 strh r1, [r0, #0x01]
1591 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1594 ldrh r3, [r1, #0x01]
1596 ldrb r1, [r1, #0x07]
1598 strh r3, [r0, #0x01]
1600 strb r1, [r0, #0x07]
1605 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1607 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1608 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1609 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1610 strb r2, [r0] /* 0 */
1611 mov ip, r1, lsr #8 /* ip = ...7 */
1612 strb ip, [r0, #0x07] /* 7 */
1613 mov ip, r2, lsr #8 /* ip = ...1 */
1614 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1615 mov r3, r3, lsr #8 /* r3 = .543 */
1616 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
1617 strh ip, [r0, #0x01]
1623 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1625 ldrb r3, [r1] /* r3 = ...0 */
1626 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1627 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
1628 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1630 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
1631 strh ip, [r0, #0x01]
1632 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
1634 strb r1, [r0, #0x07]
1639 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1641 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1642 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1643 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1645 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
1646 mov r3, r3, lsr #16 /* r3 = ..76 */
1648 strh r3, [r0, #0x06]
1653 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1655 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1656 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1657 ldrb ip, [r1, #0x07] /* ip = ...7 */
1658 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1660 mov r1, r2, lsr #24 /* r1 = ...2 */
1661 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
1662 mov r3, r3, lsr #24 /* r3 = ...6 */
1663 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
1665 strh r3, [r0, #0x06]
1670 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1674 ldrh r3, [r1, #0x06]
1677 strh r3, [r0, #0x06]
1682 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1684 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
1685 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1686 ldrb ip, [r1] /* ip = ...0 */
1687 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
1688 strh r1, [r0, #0x06]
1689 mov r3, r3, lsl #24 /* r3 = 5... */
1690 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
1691 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
1698 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1700 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1701 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1702 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
1703 strh r1, [r0, #0x05]
1705 mov r1, r3, lsr #24 /* r1 = ...7 */
1706 strb r1, [r0, #0x07]
1707 mov r2, r2, lsr #8 /* r2 = .321 */
1708 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
1714 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1716 ldrb r3, [r1] /* r3 = ...0 */
1717 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
1718 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
1719 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1721 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
1722 strh r3, [r0, #0x05]
1723 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
1725 strb r1, [r0, #0x07]
1730 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1732 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1733 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1734 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1736 mov ip, r2, lsr #8 /* ip = ...1 */
1737 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1738 mov r2, r1, lsr #8 /* r2 = ...7 */
1739 strb r2, [r0, #0x07]
1740 mov r1, r1, lsl #8 /* r1 = .76. */
1741 orr r1, r1, r3, lsr #24 /* r1 = .765 */
1743 strh r1, [r0, #0x05]
1748 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1752 ldrh r3, [r1, #0x05]
1753 ldrb r1, [r1, #0x07]
1756 strh r3, [r0, #0x05]
1757 strb r1, [r0, #0x07]
1761 /******************************************************************************
1762 * Special case for 12 byte copies
/*
 * Each LMEMCPY_C alignment case below is padded to 2^LMEMCPY_C_LOG2 = 128
 * bytes so the dispatcher can jump to case N at (table base + N * 128).
 */
1764 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
1765 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2
/*
 * Computed goto: fold the destination's low two bits into bits [3:2] of the
 * case index (r2 presumably already holds src & 3 in bits [1:0] -- verify
 * against the preceding, non-visible lines), then branch into the table of
 * 128-byte-aligned case stubs.
 */
1769 orr r2, r2, r0, lsl #2
1772 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2
/*
 * NOTE(review): 12-byte-copy (LMEMCPY_C) alignment cases, one per
 * ((dst & 3) << 2) | (src & 3) combination.  Hex digits 0-B in the comments
 * name the twelve source bytes; in the "LE" column the rightmost digit is
 * the register's least-significant byte, '.' is a don't-care lane, and 'x'
 * is an out-of-range byte picked up by an unaligned-adjacent word load.
 * Some load/store/branch lines present in the upstream implementation are
 * not visible in this excerpt (e.g. case 0000 has no visible body) --
 * confirm each case against upstream before treating it as complete.
 */
1775 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1787 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1789 ldrb r2, [r1, #0xb] /* r2 = ...B */
1790 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1791 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1792 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
1793 mov r2, r2, lsl #24 /* r2 = B... */
1794 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
1796 mov r2, ip, lsl #24 /* r2 = 7... */
1797 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
1798 mov r1, r1, lsr #8 /* r1 = .210 */
1799 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
1806 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1808 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1809 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1810 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1811 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1812 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1814 mov r3, r3, lsr #16 /* r3 = ..54 */
1815 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
1816 mov r1, r1, lsl #16 /* r1 = BA.. */
1817 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
1824 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1826 ldrb r2, [r1] /* r2 = ...0 */
1827 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1828 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1829 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1830 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1832 mov r3, r3, lsr #24 /* r3 = ...4 */
1833 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
1834 mov r1, r1, lsl #8 /* r1 = BA9. */
1835 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
1842 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1844 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1845 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1846 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
1847 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1848 strh r1, [r0, #0x01]
1850 mov r1, r2, lsr #24 /* r1 = ...3 */
1851 orr r2, r1, r3, lsl #8 /* r2 = 6543 */
1852 mov r1, r3, lsr #24 /* r1 = ...7 */
1853 orr r1, r1, ip, lsl #8 /* r1 = A987 */
1854 mov ip, ip, lsr #24 /* ip = ...B */
1857 strb ip, [r0, #0x0b]
1862 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1865 ldrh r3, [r1, #0x01]
1869 ldrb r1, [r1, #0x0b]
1870 strh r3, [r0, #0x01]
1873 strb r1, [r0, #0x0b]
1878 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1880 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1881 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1882 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1883 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1885 mov r2, r2, lsr #8 /* r2 = ...1 */
1886 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
1887 strh r2, [r0, #0x01]
1888 mov r2, r3, lsr #8 /* r2 = .543 */
1889 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
1890 mov r2, ip, lsr #8 /* r2 = .987 */
1891 orr r2, r2, r1, lsl #24 /* r2 = A987 */
1892 mov r1, r1, lsr #8 /* r1 = ...B */
1895 strb r1, [r0, #0x0b]
1900 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1903 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1904 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1905 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1907 strh r3, [r0, #0x01]
1908 mov r3, r3, lsr #16 /* r3 = ..43 */
1909 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
1910 mov ip, ip, lsr #16 /* ip = ..87 */
1911 orr ip, ip, r1, lsl #16 /* ip = A987 */
1912 mov r1, r1, lsr #16 /* r1 = ..xB */
1915 strb r1, [r0, #0x0b]
1920 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1922 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
1923 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1924 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
1925 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1927 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
1928 mov r3, r3, lsr #16 /* r3 = ..76 */
1929 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
1930 mov r2, r2, lsr #16 /* r2 = ..BA */
1933 strh r2, [r0, #0x0a]
1938 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1940 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1941 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1942 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */
1944 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1945 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1946 mov r2, r2, lsr #24 /* r2 = ...2 */
1947 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
1948 mov r3, r3, lsr #24 /* r3 = ...6 */
1949 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
1950 mov r1, r1, lsl #8 /* r1 = ..B. */
1951 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
1954 strh r1, [r0, #0x0a]
1959 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1964 ldrh r1, [r1, #0x0a]
1968 strh r1, [r0, #0x0a]
1973 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1975 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
1976 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
1977 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
1978 strh ip, [r0, #0x0a]
1979 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1980 ldrb r1, [r1] /* r1 = ...0 */
1981 mov r2, r2, lsl #24 /* r2 = 9... */
1982 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
1983 mov r3, r3, lsl #24 /* r3 = 5... */
1984 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
1985 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
1993 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1995 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1996 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
1997 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
1999 mov r3, r2, lsr #8 /* r3 = .321 */
2000 orr r3, r3, ip, lsl #24 /* r3 = 4321 */
2002 mov r3, ip, lsr #8 /* r3 = .765 */
2003 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
2005 mov r1, r1, lsr #8 /* r1 = .BA9 */
2006 strh r1, [r0, #0x09]
2007 mov r1, r1, lsr #16 /* r1 = ...B */
2008 strb r1, [r0, #0x0b]
2013 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2015 ldrb r2, [r1, #0x0b] /* r2 = ...B */
2016 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
2017 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
2018 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
2019 strb r2, [r0, #0x0b]
2020 mov r2, r3, lsr #16 /* r2 = ..A9 */
2021 strh r2, [r0, #0x09]
2022 mov r3, r3, lsl #16 /* r3 = 87.. */
2023 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
2024 mov ip, ip, lsl #16 /* ip = 43.. */
2025 orr ip, ip, r1, lsr #16 /* ip = 4321 */
2026 mov r1, r1, lsr #8 /* r1 = .210 */
2034 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2036 ldrh r2, [r1] /* r2 = ..10 */
2037 ldr r3, [r1, #0x02] /* r3 = 5432 */
2038 ldr ip, [r1, #0x06] /* ip = 9876 */
2039 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
2041 mov r2, r2, lsr #8 /* r2 = ...1 */
2042 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
2043 mov r3, r3, lsr #24 /* r3 = ...5 */
2044 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
2045 mov ip, ip, lsr #24 /* ip = ...9 */
2046 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
2047 mov r1, r1, lsr #8 /* r1 = ...B */
2050 strh ip, [r0, #0x09]
2051 strb r1, [r0, #0x0b]
2056 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2062 ldrh r2, [r1, #0x09]
2063 ldrb r1, [r1, #0x0b]
2066 strh r2, [r0, #0x09]
2067 strb r1, [r0, #0x0b]