1 /* $NetBSD: memcpy_xscale.S,v 1.1 2003/10/14 07:51:45 scw Exp $ */
4 * Copyright 2003 Wasabi Systems, Inc.
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project by
20 * Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 * or promote products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
38 #include <machine/asm.h>
39 __FBSDID("$FreeBSD$");
41 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
45 ble .Lmemcpy_short /* <= 12 bytes */
46 mov r3, r0 /* We must not clobber r0 */
48 /* Word-align the destination buffer */
49 ands ip, r3, #0x03 /* Already word aligned? */
50 beq .Lmemcpy_wordaligned /* Yup */
55 ldrleb ip, [r1], #0x01
57 strleb ip, [r3], #0x01
58 ldrltb ip, [r1], #0x01
60 strltb ip, [r3], #0x01
62 /* Destination buffer is now word aligned */
64 ands ip, r1, #0x03 /* Is src also word-aligned? */
65 bne .Lmemcpy_bad_align /* Nope. Things just got bad */
67 /* Quad-align the destination buffer */
68 tst r3, #0x07 /* Already quad aligned? */
70 stmfd sp!, {r4-r9} /* Free up some registers */
74 /* Destination buffer quad aligned, source is at least word aligned */
76 blt .Lmemcpy_w_lessthan128
78 /* Copy 128 bytes at a time */
80 ldr r4, [r1], #0x04 /* LD:00-03 */
81 ldr r5, [r1], #0x04 /* LD:04-07 */
82 pld [r1, #0x18] /* Prefetch 0x20 */
83 ldr r6, [r1], #0x04 /* LD:08-0b */
84 ldr r7, [r1], #0x04 /* LD:0c-0f */
85 ldr r8, [r1], #0x04 /* LD:10-13 */
86 ldr r9, [r1], #0x04 /* LD:14-17 */
87 strd r4, [r3], #0x08 /* ST:00-07 */
88 ldr r4, [r1], #0x04 /* LD:18-1b */
89 ldr r5, [r1], #0x04 /* LD:1c-1f */
90 strd r6, [r3], #0x08 /* ST:08-0f */
91 ldr r6, [r1], #0x04 /* LD:20-23 */
92 ldr r7, [r1], #0x04 /* LD:24-27 */
93 pld [r1, #0x18] /* Prefetch 0x40 */
94 strd r8, [r3], #0x08 /* ST:10-17 */
95 ldr r8, [r1], #0x04 /* LD:28-2b */
96 ldr r9, [r1], #0x04 /* LD:2c-2f */
97 strd r4, [r3], #0x08 /* ST:18-1f */
98 ldr r4, [r1], #0x04 /* LD:30-33 */
99 ldr r5, [r1], #0x04 /* LD:34-37 */
100 strd r6, [r3], #0x08 /* ST:20-27 */
101 ldr r6, [r1], #0x04 /* LD:38-3b */
102 ldr r7, [r1], #0x04 /* LD:3c-3f */
103 strd r8, [r3], #0x08 /* ST:28-2f */
104 ldr r8, [r1], #0x04 /* LD:40-43 */
105 ldr r9, [r1], #0x04 /* LD:44-47 */
106 pld [r1, #0x18] /* Prefetch 0x60 */
107 strd r4, [r3], #0x08 /* ST:30-37 */
108 ldr r4, [r1], #0x04 /* LD:48-4b */
109 ldr r5, [r1], #0x04 /* LD:4c-4f */
110 strd r6, [r3], #0x08 /* ST:38-3f */
111 ldr r6, [r1], #0x04 /* LD:50-53 */
112 ldr r7, [r1], #0x04 /* LD:54-57 */
113 strd r8, [r3], #0x08 /* ST:40-47 */
114 ldr r8, [r1], #0x04 /* LD:58-5b */
115 ldr r9, [r1], #0x04 /* LD:5c-5f */
116 strd r4, [r3], #0x08 /* ST:48-4f */
117 ldr r4, [r1], #0x04 /* LD:60-63 */
118 ldr r5, [r1], #0x04 /* LD:64-67 */
119 pld [r1, #0x18] /* Prefetch 0x80 */
120 strd r6, [r3], #0x08 /* ST:50-57 */
121 ldr r6, [r1], #0x04 /* LD:68-6b */
122 ldr r7, [r1], #0x04 /* LD:6c-6f */
123 strd r8, [r3], #0x08 /* ST:58-5f */
124 ldr r8, [r1], #0x04 /* LD:70-73 */
125 ldr r9, [r1], #0x04 /* LD:74-77 */
126 strd r4, [r3], #0x08 /* ST:60-67 */
127 ldr r4, [r1], #0x04 /* LD:78-7b */
128 ldr r5, [r1], #0x04 /* LD:7c-7f */
129 strd r6, [r3], #0x08 /* ST:68-6f */
130 strd r8, [r3], #0x08 /* ST:70-77 */
132 strd r4, [r3], #0x08 /* ST:78-7f */
133 bge .Lmemcpy_w_loop128
135 .Lmemcpy_w_lessthan128:
136 adds r2, r2, #0x80 /* Adjust for extra sub */
138 bxeq lr /* Return now if done */
140 blt .Lmemcpy_w_lessthan32
142 /* Copy 32 bytes at a time */
158 bge .Lmemcpy_w_loop32
160 .Lmemcpy_w_lessthan32:
161 adds r2, r2, #0x20 /* Adjust for extra sub */
163 bxeq lr /* Return now if done */
167 addne pc, pc, r4, lsl #1
170 /* At least 24 bytes remaining */
176 /* At least 16 bytes remaining */
182 /* At least 8 bytes remaining */
188 /* Less than 8 bytes remaining */
190 bxeq lr /* Return now if done */
192 ldrge ip, [r1], #0x04
193 strge ip, [r3], #0x04
194 bxeq lr /* Return now if done */
198 ldrgeb r2, [r1], #0x01
201 strgeb r2, [r3], #0x01
207 * At this point, it has not been possible to word align both buffers.
208 * The destination buffer is word aligned, but the source buffer is not.
219 .Lmemcpy_bad1_loop16:
231 orr r4, r4, r5, lsr #24
233 orr r5, r5, r6, lsr #24
235 orr r6, r6, r7, lsr #24
237 orr r7, r7, ip, lsr #24
239 orr r4, r4, r5, lsl #24
241 orr r5, r5, r6, lsl #24
243 orr r6, r6, r7, lsl #24
245 orr r7, r7, ip, lsl #24
253 bge .Lmemcpy_bad1_loop16
257 bxeq lr /* Return now if done */
260 blt .Lmemcpy_bad_done
271 orr r4, r4, ip, lsr #24
273 orr r4, r4, ip, lsl #24
276 bge .Lmemcpy_bad1_loop4
280 .Lmemcpy_bad2_loop16:
292 orr r4, r4, r5, lsr #16
294 orr r5, r5, r6, lsr #16
296 orr r6, r6, r7, lsr #16
298 orr r7, r7, ip, lsr #16
300 orr r4, r4, r5, lsl #16
302 orr r5, r5, r6, lsl #16
304 orr r6, r6, r7, lsl #16
306 orr r7, r7, ip, lsl #16
314 bge .Lmemcpy_bad2_loop16
318 bxeq lr /* Return now if done */
321 blt .Lmemcpy_bad_done
332 orr r4, r4, ip, lsr #16
334 orr r4, r4, ip, lsl #16
337 bge .Lmemcpy_bad2_loop4
341 .Lmemcpy_bad3_loop16:
353 orr r4, r4, r5, lsr #8
355 orr r5, r5, r6, lsr #8
357 orr r6, r6, r7, lsr #8
359 orr r7, r7, ip, lsr #8
361 orr r4, r4, r5, lsl #8
363 orr r5, r5, r6, lsl #8
365 orr r6, r6, r7, lsl #8
367 orr r7, r7, ip, lsl #8
375 bge .Lmemcpy_bad3_loop16
379 bxeq lr /* Return now if done */
382 blt .Lmemcpy_bad_done
393 orr r4, r4, ip, lsr #8
395 orr r4, r4, ip, lsl #8
398 bge .Lmemcpy_bad3_loop4
407 ldrgeb r2, [r1], #0x01
410 strgeb r2, [r3], #0x01
416 * Handle short copies (less than 16 bytes), possibly misaligned.
417 * Some of these are *very* common, thanks to the network stack,
418 * and so are handled specially.
422 add pc, pc, r2, lsl #2
425 b .Lmemcpy_bytewise /* 0x01 */
426 b .Lmemcpy_bytewise /* 0x02 */
427 b .Lmemcpy_bytewise /* 0x03 */
428 b .Lmemcpy_4 /* 0x04 */
429 b .Lmemcpy_bytewise /* 0x05 */
430 b .Lmemcpy_6 /* 0x06 */
431 b .Lmemcpy_bytewise /* 0x07 */
432 b .Lmemcpy_8 /* 0x08 */
433 b .Lmemcpy_bytewise /* 0x09 */
434 b .Lmemcpy_bytewise /* 0x0a */
435 b .Lmemcpy_bytewise /* 0x0b */
436 b .Lmemcpy_c /* 0x0c */
439 mov r3, r0 /* We must not clobber r0 */
441 1: subs r2, r2, #0x01
443 ldrneb ip, [r1], #0x01
448 /******************************************************************************
449 * Special case for 4 byte copies
451 #define LMEMCPY_4_LOG2 6 /* 64 bytes */
452 #define LMEMCPY_4_PAD .align LMEMCPY_4_LOG2
456 orr r2, r2, r0, lsl #2
459 addne pc, r3, r2, lsl #LMEMCPY_4_LOG2
462 * 0000: dst is 32-bit aligned, src is 32-bit aligned
470 * 0001: dst is 32-bit aligned, src is 8-bit aligned
472 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
473 ldr r2, [r1, #3] /* BE:r2 = 3xxx LE:r2 = xxx3 */
475 mov r3, r3, lsl #8 /* r3 = 012. */
476 orr r3, r3, r2, lsr #24 /* r3 = 0123 */
478 mov r3, r3, lsr #8 /* r3 = .210 */
479 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
486 * 0010: dst is 32-bit aligned, src is 16-bit aligned
495 orr r3, r2, r3, lsl #16
501 * 0011: dst is 32-bit aligned, src is 8-bit aligned
503 ldr r3, [r1, #-3] /* BE:r3 = xxx0 LE:r3 = 0xxx */
504 ldr r2, [r1, #1] /* BE:r2 = 123x LE:r2 = x321 */
506 mov r3, r3, lsl #24 /* r3 = 0... */
507 orr r3, r3, r2, lsr #8 /* r3 = 0123 */
509 mov r3, r3, lsr #24 /* r3 = ...0 */
510 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
517 * 0100: dst is 8-bit aligned, src is 32-bit aligned
536 * 0101: dst is 8-bit aligned, src is 8-bit aligned
548 * 0110: dst is 8-bit aligned, src is 16-bit aligned
550 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
551 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
553 mov r1, r2, lsr #8 /* r1 = ...0 */
555 mov r2, r2, lsl #8 /* r2 = .01. */
556 orr r2, r2, r3, lsr #8 /* r2 = .012 */
559 mov r2, r2, lsr #8 /* r2 = ...1 */
560 orr r2, r2, r3, lsl #8 /* r2 = .321 */
561 mov r3, r3, lsr #8 /* r3 = ...3 */
569 * 0111: dst is 8-bit aligned, src is 8-bit aligned
581 * 1000: dst is 16-bit aligned, src is 32-bit aligned
597 * 1001: dst is 16-bit aligned, src is 8-bit aligned
599 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
600 ldr r3, [r1, #3] /* BE:r3 = 3xxx LE:r3 = xxx3 */
601 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
604 mov r2, r2, lsl #8 /* r2 = 012. */
605 orr r2, r2, r3, lsr #24 /* r2 = 0123 */
607 mov r2, r2, lsr #24 /* r2 = ...2 */
608 orr r2, r2, r3, lsl #8 /* r2 = xx32 */
615 * 1010: dst is 16-bit aligned, src is 16-bit aligned
625 * 1011: dst is 16-bit aligned, src is 8-bit aligned
627 ldr r3, [r1, #1] /* BE:r3 = 123x LE:r3 = x321 */
628 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
629 mov r1, r3, lsr #8 /* BE:r1 = .123 LE:r1 = .x32 */
632 mov r3, r3, lsr #24 /* r3 = ...1 */
633 orr r3, r3, r2, lsl #8 /* r3 = xx01 */
635 mov r3, r3, lsl #8 /* r3 = 321. */
636 orr r3, r3, r2, lsr #24 /* r3 = 3210 */
643 * 1100: dst is 8-bit aligned, src is 32-bit aligned
645 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
663 * 1101: dst is 8-bit aligned, src is 8-bit aligned
675 * 1110: dst is 8-bit aligned, src is 16-bit aligned
678 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
679 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
681 mov r3, r3, lsr #8 /* r3 = ...2 */
682 orr r3, r3, r2, lsl #8 /* r3 = ..12 */
684 mov r2, r2, lsr #8 /* r2 = ...0 */
687 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
688 ldrh r3, [r1, #0x02] /* BE:r3 = ..23 LE:r3 = ..32 */
690 mov r2, r2, lsr #8 /* r2 = ...1 */
691 orr r2, r2, r3, lsl #8 /* r2 = .321 */
693 mov r3, r3, lsr #8 /* r3 = ...3 */
700 * 1111: dst is 8-bit aligned, src is 8-bit aligned
712 /******************************************************************************
713 * Special case for 6 byte copies
715 #define LMEMCPY_6_LOG2 6 /* 64 bytes */
716 #define LMEMCPY_6_PAD .align LMEMCPY_6_LOG2
720 orr r2, r2, r0, lsl #2
723 addne pc, r3, r2, lsl #LMEMCPY_6_LOG2
726 * 0000: dst is 32-bit aligned, src is 32-bit aligned
736 * 0001: dst is 32-bit aligned, src is 8-bit aligned
738 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
739 ldr r3, [r1, #0x03] /* BE:r3 = 345x LE:r3 = x543 */
741 mov r2, r2, lsl #8 /* r2 = 012. */
742 orr r2, r2, r3, lsr #24 /* r2 = 0123 */
744 mov r2, r2, lsr #8 /* r2 = .210 */
745 orr r2, r2, r3, lsl #24 /* r2 = 3210 */
747 mov r3, r3, lsr #8 /* BE:r3 = .345 LE:r3 = .x54 */
754 * 0010: dst is 32-bit aligned, src is 16-bit aligned
756 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
757 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
759 mov r1, r3, lsr #16 /* r1 = ..23 */
760 orr r1, r1, r2, lsl #16 /* r1 = 0123 */
764 mov r1, r3, lsr #16 /* r1 = ..54 */
765 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
773 * 0011: dst is 32-bit aligned, src is 8-bit aligned
775 ldr r2, [r1, #-3] /* BE:r2 = xxx0 LE:r2 = 0xxx */
776 ldr r3, [r1, #1] /* BE:r3 = 1234 LE:r3 = 4321 */
777 ldr r1, [r1, #5] /* BE:r1 = 5xxx LE:r1 = xxx5 */
779 mov r2, r2, lsl #24 /* r2 = 0... */
780 orr r2, r2, r3, lsr #8 /* r2 = 0123 */
781 mov r3, r3, lsl #8 /* r3 = 234. */
782 orr r1, r3, r1, lsr #24 /* r1 = 2345 */
784 mov r2, r2, lsr #24 /* r2 = ...0 */
785 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
786 mov r1, r1, lsl #8 /* r1 = xx5. */
787 orr r1, r1, r3, lsr #24 /* r1 = xx54 */
795 * 0100: dst is 8-bit aligned, src is 32-bit aligned
797 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
798 ldrh r2, [r1, #0x04] /* BE:r2 = ..45 LE:r2 = ..54 */
799 mov r1, r3, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
802 mov r1, r3, lsr #24 /* r1 = ...0 */
804 mov r3, r3, lsl #8 /* r3 = 123. */
805 orr r3, r3, r2, lsr #8 /* r3 = 1234 */
808 mov r3, r3, lsr #24 /* r3 = ...3 */
809 orr r3, r3, r2, lsl #8 /* r3 = .543 */
810 mov r2, r2, lsr #8 /* r2 = ...5 */
818 * 0101: dst is 8-bit aligned, src is 8-bit aligned
832 * 0110: dst is 8-bit aligned, src is 16-bit aligned
834 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
835 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
837 mov r3, r2, lsr #8 /* r3 = ...0 */
840 mov r3, r1, lsr #8 /* r3 = .234 */
842 mov r3, r2, lsl #8 /* r3 = .01. */
843 orr r3, r3, r1, lsr #24 /* r3 = .012 */
849 mov r3, r1, lsr #8 /* r3 = .543 */
851 mov r3, r2, lsr #8 /* r3 = ...1 */
852 orr r3, r3, r1, lsl #8 /* r3 = 4321 */
859 * 0111: dst is 8-bit aligned, src is 8-bit aligned
873 * 1000: dst is 16-bit aligned, src is 32-bit aligned
876 ldr r2, [r1] /* r2 = 0123 */
877 ldrh r3, [r1, #0x04] /* r3 = ..45 */
878 mov r1, r2, lsr #16 /* r1 = ..01 */
879 orr r3, r3, r2, lsl#16 /* r3 = 2345 */
883 ldrh r2, [r1, #0x04] /* r2 = ..54 */
884 ldr r3, [r1] /* r3 = 3210 */
885 mov r2, r2, lsl #16 /* r2 = 54.. */
886 orr r2, r2, r3, lsr #16 /* r2 = 5432 */
894 * 1001: dst is 16-bit aligned, src is 8-bit aligned
896 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
897 ldr r2, [r1, #3] /* BE:r2 = 345x LE:r2 = x543 */
898 mov r1, r3, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
900 mov r2, r2, lsr #8 /* r2 = .345 */
901 orr r2, r2, r3, lsl #24 /* r2 = 2345 */
903 mov r2, r2, lsl #8 /* r2 = 543. */
904 orr r2, r2, r3, lsr #24 /* r2 = 5432 */
912 * 1010: dst is 16-bit aligned, src is 16-bit aligned
922 * 1011: dst is 16-bit aligned, src is 8-bit aligned
924 ldrb r3, [r1] /* r3 = ...0 */
925 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
926 ldrb r1, [r1, #0x05] /* r1 = ...5 */
928 mov r3, r3, lsl #8 /* r3 = ..0. */
929 orr r3, r3, r2, lsr #24 /* r3 = ..01 */
930 orr r1, r1, r2, lsl #8 /* r1 = 2345 */
932 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
933 mov r1, r1, lsl #24 /* r1 = 5... */
934 orr r1, r1, r2, lsr #8 /* r1 = 5432 */
942 * 1100: dst is 8-bit aligned, src is 32-bit aligned
944 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
945 ldrh r1, [r1, #0x04] /* BE:r1 = ..45 LE:r1 = ..54 */
947 mov r3, r2, lsr #24 /* r3 = ...0 */
949 mov r2, r2, lsl #8 /* r2 = 123. */
950 orr r2, r2, r1, lsr #8 /* r2 = 1234 */
953 mov r2, r2, lsr #8 /* r2 = .321 */
954 orr r2, r2, r1, lsl #24 /* r2 = 4321 */
955 mov r1, r1, lsr #8 /* r1 = ...5 */
963 * 1101: dst is 8-bit aligned, src is 8-bit aligned
977 * 1110: dst is 8-bit aligned, src is 16-bit aligned
979 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
980 ldr r1, [r1, #0x02] /* BE:r1 = 2345 LE:r1 = 5432 */
982 mov r3, r2, lsr #8 /* r3 = ...0 */
984 mov r2, r2, lsl #24 /* r2 = 1... */
985 orr r2, r2, r1, lsr #8 /* r2 = 1234 */
988 mov r2, r2, lsr #8 /* r2 = ...1 */
989 orr r2, r2, r1, lsl #8 /* r2 = 4321 */
990 mov r1, r1, lsr #24 /* r1 = ...5 */
998 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1002 ldrb r1, [r1, #0x05]
1005 strb r1, [r0, #0x05]
1010 /******************************************************************************
1011 * Special case for 8 byte copies
1013 #define LMEMCPY_8_LOG2 6 /* 64 bytes */
1014 #define LMEMCPY_8_PAD .align LMEMCPY_8_LOG2
1018 orr r2, r2, r0, lsl #2
1021 addne pc, r3, r2, lsl #LMEMCPY_8_LOG2
1024 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1034 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1036 ldr r3, [r1, #-1] /* BE:r3 = x012 LE:r3 = 210x */
1037 ldr r2, [r1, #0x03] /* BE:r2 = 3456 LE:r2 = 6543 */
1038 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1040 mov r3, r3, lsl #8 /* r3 = 012. */
1041 orr r3, r3, r2, lsr #24 /* r3 = 0123 */
1042 orr r2, r1, r2, lsl #8 /* r2 = 4567 */
1044 mov r3, r3, lsr #8 /* r3 = .210 */
1045 orr r3, r3, r2, lsl #24 /* r3 = 3210 */
1046 mov r1, r1, lsl #24 /* r1 = 7... */
1047 orr r2, r1, r2, lsr #8 /* r2 = 7654 */
1055 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1057 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1058 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1059 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1061 mov r2, r2, lsl #16 /* r2 = 01.. */
1062 orr r2, r2, r3, lsr #16 /* r2 = 0123 */
1063 orr r3, r1, r3, lsl #16 /* r3 = 4567 */
1065 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1066 mov r3, r3, lsr #16 /* r3 = ..54 */
1067 orr r3, r3, r1, lsl #16 /* r3 = 7654 */
1075 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1077 ldrb r3, [r1] /* r3 = ...0 */
1078 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1079 ldr r1, [r1, #0x05] /* BE:r1 = 567x LE:r1 = x765 */
1081 mov r3, r3, lsl #24 /* r3 = 0... */
1082 orr r3, r3, r2, lsr #8 /* r3 = 0123 */
1083 mov r2, r2, lsl #24 /* r2 = 4... */
1084 orr r2, r2, r1, lsr #8 /* r2 = 4567 */
1086 orr r3, r3, r2, lsl #8 /* r3 = 3210 */
1087 mov r2, r2, lsr #24 /* r2 = ...4 */
1088 orr r2, r2, r1, lsl #8 /* r2 = 7654 */
1096 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1098 ldr r3, [r1] /* BE:r3 = 0123 LE:r3 = 3210 */
1099 ldr r2, [r1, #0x04] /* BE:r2 = 4567 LE:r2 = 7654 */
1101 mov r1, r3, lsr #24 /* r1 = ...0 */
1103 mov r1, r3, lsr #8 /* r1 = .012 */
1104 strb r2, [r0, #0x07]
1105 mov r3, r3, lsl #24 /* r3 = 3... */
1106 orr r3, r3, r2, lsr #8 /* r3 = 3456 */
1109 mov r1, r2, lsr #24 /* r1 = ...7 */
1110 strb r1, [r0, #0x07]
1111 mov r1, r3, lsr #8 /* r1 = .321 */
1112 mov r3, r3, lsr #24 /* r3 = ...3 */
1113 orr r3, r3, r2, lsl #8 /* r3 = 6543 */
1115 strh r1, [r0, #0x01]
1121 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1124 ldrh r3, [r1, #0x01]
1126 ldrb r1, [r1, #0x07]
1128 strh r3, [r0, #0x01]
1130 strb r1, [r0, #0x07]
1135 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1137 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1138 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1139 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1141 mov ip, r2, lsr #8 /* ip = ...0 */
1143 mov ip, r2, lsl #8 /* ip = .01. */
1144 orr ip, ip, r3, lsr #24 /* ip = .012 */
1145 strb r1, [r0, #0x07]
1146 mov r3, r3, lsl #8 /* r3 = 345. */
1147 orr r3, r3, r1, lsr #8 /* r3 = 3456 */
1149 strb r2, [r0] /* 0 */
1150 mov ip, r1, lsr #8 /* ip = ...7 */
1151 strb ip, [r0, #0x07] /* 7 */
1152 mov ip, r2, lsr #8 /* ip = ...1 */
1153 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1154 mov r3, r3, lsr #8 /* r3 = .543 */
1155 orr r3, r3, r1, lsl #24 /* r3 = 6543 */
1157 strh ip, [r0, #0x01]
1163 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1165 ldrb r3, [r1] /* r3 = ...0 */
1166 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1167 ldrh r2, [r1, #0x05] /* BE:r2 = ..56 LE:r2 = ..65 */
1168 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1170 mov r3, ip, lsr #16 /* BE:r3 = ..12 LE:r3 = ..43 */
1172 strh r3, [r0, #0x01]
1173 orr r2, r2, ip, lsl #16 /* r2 = 3456 */
1175 strh ip, [r0, #0x01]
1176 orr r2, r3, r2, lsl #16 /* r2 = 6543 */
1179 strb r1, [r0, #0x07]
1184 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1186 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1187 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1188 mov r1, r2, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1191 mov r1, r3, lsr #16 /* r1 = ..45 */
1192 orr r2, r1 ,r2, lsl #16 /* r2 = 2345 */
1195 orr r2, r1, r3, lsl #16 /* r2 = 5432 */
1196 mov r3, r3, lsr #16 /* r3 = ..76 */
1199 strh r3, [r0, #0x06]
1204 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1206 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1207 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1208 ldrb ip, [r1, #0x07] /* ip = ...7 */
1209 mov r1, r2, lsr #8 /* BE:r1 = .x01 LE:r1 = .210 */
1212 mov r1, r2, lsl #24 /* r1 = 2... */
1213 orr r1, r1, r3, lsr #8 /* r1 = 2345 */
1214 orr r3, ip, r3, lsl #8 /* r3 = 4567 */
1216 mov r1, r2, lsr #24 /* r1 = ...2 */
1217 orr r1, r1, r3, lsl #8 /* r1 = 5432 */
1218 mov r3, r3, lsr #24 /* r3 = ...6 */
1219 orr r3, r3, ip, lsl #8 /* r3 = ..76 */
1222 strh r3, [r0, #0x06]
1227 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1231 ldrh r3, [r1, #0x06]
1234 strh r3, [r0, #0x06]
1239 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1241 ldr r3, [r1, #0x05] /* BE:r3 = 567x LE:r3 = x765 */
1242 ldr r2, [r1, #0x01] /* BE:r2 = 1234 LE:r2 = 4321 */
1243 ldrb ip, [r1] /* ip = ...0 */
1244 mov r1, r3, lsr #8 /* BE:r1 = .567 LE:r1 = .x76 */
1245 strh r1, [r0, #0x06]
1247 mov r3, r3, lsr #24 /* r3 = ...5 */
1248 orr r3, r3, r2, lsl #8 /* r3 = 2345 */
1249 mov r2, r2, lsr #24 /* r2 = ...1 */
1250 orr r2, r2, ip, lsl #8 /* r2 = ..01 */
1252 mov r3, r3, lsl #24 /* r3 = 5... */
1253 orr r3, r3, r2, lsr #8 /* r3 = 5432 */
1254 orr r2, ip, r2, lsl #8 /* r2 = 3210 */
1262 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1264 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1265 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1266 mov r1, r3, lsr #8 /* BE:r1 = .456 LE:r1 = .765 */
1267 strh r1, [r0, #0x05]
1269 strb r3, [r0, #0x07]
1270 mov r1, r2, lsr #24 /* r1 = ...0 */
1272 mov r2, r2, lsl #8 /* r2 = 123. */
1273 orr r2, r2, r3, lsr #24 /* r2 = 1234 */
1277 mov r1, r3, lsr #24 /* r1 = ...7 */
1278 strb r1, [r0, #0x07]
1279 mov r2, r2, lsr #8 /* r2 = .321 */
1280 orr r2, r2, r3, lsl #24 /* r2 = 4321 */
1287 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1289 ldrb r3, [r1] /* r3 = ...0 */
1290 ldrh r2, [r1, #0x01] /* BE:r2 = ..12 LE:r2 = ..21 */
1291 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
1292 ldrb r1, [r1, #0x07] /* r1 = ...7 */
1294 mov r3, ip, lsr #16 /* BE:r3 = ..34 LE:r3 = ..65 */
1296 strh ip, [r0, #0x05]
1297 orr r2, r3, r2, lsl #16 /* r2 = 1234 */
1299 strh r3, [r0, #0x05]
1300 orr r2, r2, ip, lsl #16 /* r2 = 4321 */
1303 strb r1, [r0, #0x07]
1308 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1310 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1311 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1312 ldrh r1, [r1, #0x06] /* BE:r1 = ..67 LE:r1 = ..76 */
1314 mov ip, r2, lsr #8 /* ip = ...0 */
1316 mov ip, r2, lsl #24 /* ip = 1... */
1317 orr ip, ip, r3, lsr #8 /* ip = 1234 */
1318 strb r1, [r0, #0x07]
1319 mov r1, r1, lsr #8 /* r1 = ...6 */
1320 orr r1, r1, r3, lsl #8 /* r1 = 3456 */
1323 mov ip, r2, lsr #8 /* ip = ...1 */
1324 orr ip, ip, r3, lsl #8 /* ip = 4321 */
1325 mov r2, r1, lsr #8 /* r2 = ...7 */
1326 strb r2, [r0, #0x07]
1327 mov r1, r1, lsl #8 /* r1 = .76. */
1328 orr r1, r1, r3, lsr #24 /* r1 = .765 */
1331 strh r1, [r0, #0x05]
1336 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1340 ldrh r3, [r1, #0x05]
1341 ldrb r1, [r1, #0x07]
1344 strh r3, [r0, #0x05]
1345 strb r1, [r0, #0x07]
1349 /******************************************************************************
1350 * Special case for 12 byte copies
1352 #define LMEMCPY_C_LOG2 7 /* 128 bytes */
1353 #define LMEMCPY_C_PAD .align LMEMCPY_C_LOG2
1357 orr r2, r2, r0, lsl #2
1360 addne pc, r3, r2, lsl #LMEMCPY_C_LOG2
1363 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1375 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1377 ldrb r2, [r1, #0xb] /* r2 = ...B */
1378 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1379 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1380 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
1382 orr r2, r2, ip, lsl #8 /* r2 = 89AB */
1384 mov r2, ip, lsr #24 /* r2 = ...7 */
1385 orr r2, r2, r3, lsl #8 /* r2 = 4567 */
1386 mov r1, r1, lsl #8 /* r1 = 012. */
1387 orr r1, r1, r3, lsr #24 /* r1 = 0123 */
1389 mov r2, r2, lsl #24 /* r2 = B... */
1390 orr r2, r2, ip, lsr #8 /* r2 = BA98 */
1392 mov r2, ip, lsl #24 /* r2 = 7... */
1393 orr r2, r2, r3, lsr #8 /* r2 = 7654 */
1394 mov r1, r1, lsr #8 /* r1 = .210 */
1395 orr r1, r1, r3, lsl #24 /* r1 = 3210 */
1403 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1405 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1406 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1407 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1408 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1410 mov r2, r2, lsl #16 /* r2 = 01.. */
1411 orr r2, r2, r3, lsr #16 /* r2 = 0123 */
1413 mov r3, r3, lsl #16 /* r3 = 45.. */
1414 orr r3, r3, ip, lsr #16 /* r3 = 4567 */
1415 orr r1, r1, ip, lsl #16 /* r1 = 89AB */
1417 orr r2, r2, r3, lsl #16 /* r2 = 3210 */
1419 mov r3, r3, lsr #16 /* r3 = ..54 */
1420 orr r3, r3, ip, lsl #16 /* r3 = 7654 */
1421 mov r1, r1, lsl #16 /* r1 = BA.. */
1422 orr r1, r1, ip, lsr #16 /* r1 = BA98 */
1430 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1432 ldrb r2, [r1] /* r2 = ...0 */
1433 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1434 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1435 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1437 mov r2, r2, lsl #24 /* r2 = 0... */
1438 orr r2, r2, r3, lsr #8 /* r2 = 0123 */
1440 mov r3, r3, lsl #24 /* r3 = 4... */
1441 orr r3, r3, ip, lsr #8 /* r3 = 4567 */
1442 mov r1, r1, lsr #8 /* r1 = .9AB */
1443 orr r1, r1, ip, lsl #24 /* r1 = 89AB */
1445 orr r2, r2, r3, lsl #8 /* r2 = 3210 */
1447 mov r3, r3, lsr #24 /* r3 = ...4 */
1448 orr r3, r3, ip, lsl #8 /* r3 = 7654 */
1449 mov r1, r1, lsl #8 /* r1 = BA9. */
1450 orr r1, r1, ip, lsr #24 /* r1 = BA98 */
1458 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1460 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1461 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1462 ldr ip, [r1, #0x08] /* BE:ip = 89AB LE:ip = BA98 */
1463 mov r1, r2, lsr #8 /* BE:r1 = .012 LE:r1 = .321 */
1464 strh r1, [r0, #0x01]
1466 mov r1, r2, lsr #24 /* r1 = ...0 */
1468 mov r1, r2, lsl #24 /* r1 = 3... */
1469 orr r2, r1, r3, lsr #8 /* r2 = 3456 */
1470 mov r1, r3, lsl #24 /* r1 = 7... */
1471 orr r1, r1, ip, lsr #8 /* r1 = 789A */
1474 mov r1, r2, lsr #24 /* r1 = ...3 */
1475 orr r2, r1, r3, lsl #8 /* r2 = 6543 */
1476 mov r1, r3, lsr #24 /* r1 = ...7 */
1477 orr r1, r1, ip, lsl #8 /* r1 = A987 */
1478 mov ip, ip, lsr #24 /* ip = ...B */
1482 strb ip, [r0, #0x0b]
1487 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1490 ldrh r3, [r1, #0x01]
1494 ldrb r1, [r1, #0x0b]
1495 strh r3, [r0, #0x01]
1498 strb r1, [r0, #0x0b]
1503 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1505 ldrh r2, [r1] /* BE:r2 = ..01 LE:r2 = ..10 */
1506 ldr r3, [r1, #0x02] /* BE:r3 = 2345 LE:r3 = 5432 */
1507 ldr ip, [r1, #0x06] /* BE:ip = 6789 LE:ip = 9876 */
1508 ldrh r1, [r1, #0x0a] /* BE:r1 = ..AB LE:r1 = ..BA */
1510 mov r2, r2, ror #8 /* r2 = 1..0 */
1512 mov r2, r2, lsr #16 /* r2 = ..1. */
1513 orr r2, r2, r3, lsr #24 /* r2 = ..12 */
1514 strh r2, [r0, #0x01]
1515 mov r2, r3, lsl #8 /* r2 = 345. */
1516 orr r3, r2, ip, lsr #24 /* r3 = 3456 */
1517 mov r2, ip, lsl #8 /* r2 = 789. */
1518 orr r2, r2, r1, lsr #8 /* r2 = 789A */
1521 mov r2, r2, lsr #8 /* r2 = ...1 */
1522 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
1523 strh r2, [r0, #0x01]
1524 mov r2, r3, lsr #8 /* r2 = .543 */
1525 orr r3, r2, ip, lsl #24 /* r3 = 6543 */
1526 mov r2, ip, lsr #8 /* r2 = .987 */
1527 orr r2, r2, r1, lsl #24 /* r2 = A987 */
1528 mov r1, r1, lsr #8 /* r1 = ...B */
1532 strb r1, [r0, #0x0b]
1537 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1540 ldr r3, [r1, #0x01] /* BE:r3 = 1234 LE:r3 = 4321 */
1541 ldr ip, [r1, #0x05] /* BE:ip = 5678 LE:ip = 8765 */
1542 ldr r1, [r1, #0x09] /* BE:r1 = 9ABx LE:r1 = xBA9 */
1545 mov r2, r3, lsr #16 /* r2 = ..12 */
1546 strh r2, [r0, #0x01]
1547 mov r3, r3, lsl #16 /* r3 = 34.. */
1548 orr r3, r3, ip, lsr #16 /* r3 = 3456 */
1549 mov ip, ip, lsl #16 /* ip = 78.. */
1550 orr ip, ip, r1, lsr #16 /* ip = 789A */
1551 mov r1, r1, lsr #8 /* r1 = .9AB */
1553 strh r3, [r0, #0x01]
1554 mov r3, r3, lsr #16 /* r3 = ..43 */
1555 orr r3, r3, ip, lsl #16 /* r3 = 6543 */
1556 mov ip, ip, lsr #16 /* ip = ..87 */
1557 orr ip, ip, r1, lsl #16 /* ip = A987 */
1558 mov r1, r1, lsr #16 /* r1 = ..xB */
1562 strb r1, [r0, #0x0b]
1567 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1569 ldr ip, [r1] /* BE:ip = 0123 LE:ip = 3210 */
1570 ldr r3, [r1, #0x04] /* BE:r3 = 4567 LE:r3 = 7654 */
1571 ldr r2, [r1, #0x08] /* BE:r2 = 89AB LE:r2 = BA98 */
1572 mov r1, ip, lsr #16 /* BE:r1 = ..01 LE:r1 = ..32 */
1575 mov r1, ip, lsl #16 /* r1 = 23.. */
1576 orr r1, r1, r3, lsr #16 /* r1 = 2345 */
1577 mov r3, r3, lsl #16 /* r3 = 67.. */
1578 orr r3, r3, r2, lsr #16 /* r3 = 6789 */
1581 orr r1, r1, r3, lsl #16 /* r1 = 5432 */
1582 mov r3, r3, lsr #16 /* r3 = ..76 */
1583 orr r3, r3, r2, lsl #16 /* r3 = 9876 */
1584 mov r2, r2, lsr #16 /* r2 = ..BA */
1588 strh r2, [r0, #0x0a]
1593 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1595 ldr r2, [r1, #-1] /* BE:r2 = x012 LE:r2 = 210x */
1596 ldr r3, [r1, #0x03] /* BE:r3 = 3456 LE:r3 = 6543 */
1597 mov ip, r2, lsr #8 /* BE:ip = .x01 LE:ip = .210 */
1599 ldr ip, [r1, #0x07] /* BE:ip = 789A LE:ip = A987 */
1600 ldrb r1, [r1, #0x0b] /* r1 = ...B */
1602 mov r2, r2, lsl #24 /* r2 = 2... */
1603 orr r2, r2, r3, lsr #8 /* r2 = 2345 */
1604 mov r3, r3, lsl #24 /* r3 = 6... */
1605 orr r3, r3, ip, lsr #8 /* r3 = 6789 */
1606 orr r1, r1, ip, lsl #8 /* r1 = 89AB */
1608 mov r2, r2, lsr #24 /* r2 = ...2 */
1609 orr r2, r2, r3, lsl #8 /* r2 = 5432 */
1610 mov r3, r3, lsr #24 /* r3 = ...6 */
1611 orr r3, r3, ip, lsl #8 /* r3 = 9876 */
1612 mov r1, r1, lsl #8 /* r1 = ..B. */
1613 orr r1, r1, ip, lsr #24 /* r1 = ..BA */
1617 strh r1, [r0, #0x0a]
1622 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1627 ldrh r1, [r1, #0x0a]
1631 strh r1, [r0, #0x0a]
1636 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1638 ldr r2, [r1, #0x09] /* BE:r2 = 9ABx LE:r2 = xBA9 */
1639 ldr r3, [r1, #0x05] /* BE:r3 = 5678 LE:r3 = 8765 */
1640 mov ip, r2, lsr #8 /* BE:ip = .9AB LE:ip = .xBA */
1641 strh ip, [r0, #0x0a]
1642 ldr ip, [r1, #0x01] /* BE:ip = 1234 LE:ip = 4321 */
1643 ldrb r1, [r1] /* r1 = ...0 */
1645 mov r2, r2, lsr #24 /* r2 = ...9 */
1646 orr r2, r2, r3, lsl #8 /* r2 = 6789 */
1647 mov r3, r3, lsr #24 /* r3 = ...5 */
1648 orr r3, r3, ip, lsl #8 /* r3 = 2345 */
1649 mov r1, r1, lsl #8 /* r1 = ..0. */
1650 orr r1, r1, ip, lsr #24 /* r1 = ..01 */
1652 mov r2, r2, lsl #24 /* r2 = 9... */
1653 orr r2, r2, r3, lsr #8 /* r2 = 9876 */
1654 mov r3, r3, lsl #24 /* r3 = 5... */
1655 orr r3, r3, ip, lsr #8 /* r3 = 5432 */
1656 orr r1, r1, ip, lsl #8 /* r1 = 3210 */
1665 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1667 ldr r2, [r1] /* BE:r2 = 0123 LE:r2 = 3210 */
1668 ldr ip, [r1, #0x04] /* BE:ip = 4567 LE:ip = 7654 */
1669 ldr r1, [r1, #0x08] /* BE:r1 = 89AB LE:r1 = BA98 */
1671 mov r3, r2, lsr #24 /* r3 = ...0 */
1673 mov r2, r2, lsl #8 /* r2 = 123. */
1674 orr r2, r2, ip, lsr #24 /* r2 = 1234 */
1676 mov r2, ip, lsl #8 /* r2 = 567. */
1677 orr r2, r2, r1, lsr #24 /* r2 = 5678 */
1679 mov r2, r1, lsr #8 /* r2 = ..9A */
1680 strh r2, [r0, #0x09]
1681 strb r1, [r0, #0x0b]
1684 mov r3, r2, lsr #8 /* r3 = .321 */
1685 orr r3, r3, ip, lsl #24 /* r3 = 4321 */
1687 mov r3, ip, lsr #8 /* r3 = .765 */
1688 orr r3, r3, r1, lsl #24 /* r3 = 8765 */
1690 mov r1, r1, lsr #8 /* r1 = .BA9 */
1691 strh r1, [r0, #0x09]
1692 mov r1, r1, lsr #16 /* r1 = ...B */
1693 strb r1, [r0, #0x0b]
1699 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
1701 ldrb r2, [r1, #0x0b] /* r2 = ...B */
1702 ldr r3, [r1, #0x07] /* BE:r3 = 789A LE:r3 = A987 */
1703 ldr ip, [r1, #0x03] /* BE:ip = 3456 LE:ip = 6543 */
1704 ldr r1, [r1, #-1] /* BE:r1 = x012 LE:r1 = 210x */
1705 strb r2, [r0, #0x0b]
1707 strh r3, [r0, #0x09]
1708 mov r3, r3, lsr #16 /* r3 = ..78 */
1709 orr r3, r3, ip, lsl #16 /* r3 = 5678 */
1710 mov ip, ip, lsr #16 /* ip = ..34 */
1711 orr ip, ip, r1, lsl #16 /* ip = 1234 */
1712 mov r1, r1, lsr #16 /* r1 = ..x0 */
1714 mov r2, r3, lsr #16 /* r2 = ..A9 */
1715 strh r2, [r0, #0x09]
1716 mov r3, r3, lsl #16 /* r3 = 87.. */
1717 orr r3, r3, ip, lsr #16 /* r3 = 8765 */
1718 mov ip, ip, lsl #16 /* ip = 43.. */
1719 orr ip, ip, r1, lsr #16 /* ip = 4321 */
1720 mov r1, r1, lsr #8 /* r1 = .210 */
1729 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
1732 ldrh r2, [r1, #0x0a] /* r2 = ..AB */
1733 ldr ip, [r1, #0x06] /* ip = 6789 */
1734 ldr r3, [r1, #0x02] /* r3 = 2345 */
1735 ldrh r1, [r1] /* r1 = ..01 */
1736 strb r2, [r0, #0x0b]
1737 mov r2, r2, lsr #8 /* r2 = ...A */
1738 orr r2, r2, ip, lsl #8 /* r2 = 789A */
1739 mov ip, ip, lsr #8 /* ip = .678 */
1740 orr ip, ip, r3, lsl #24 /* ip = 5678 */
1741 mov r3, r3, lsr #8 /* r3 = .234 */
1742 orr r3, r3, r1, lsl #24 /* r3 = 1234 */
1743 mov r1, r1, lsr #8 /* r1 = ...0 */
1747 strh r2, [r0, #0x09]
1749 ldrh r2, [r1] /* r2 = ..10 */
1750 ldr r3, [r1, #0x02] /* r3 = 5432 */
1751 ldr ip, [r1, #0x06] /* ip = 9876 */
1752 ldrh r1, [r1, #0x0a] /* r1 = ..BA */
1754 mov r2, r2, lsr #8 /* r2 = ...1 */
1755 orr r2, r2, r3, lsl #8 /* r2 = 4321 */
1756 mov r3, r3, lsr #24 /* r3 = ...5 */
1757 orr r3, r3, ip, lsl #8 /* r3 = 8765 */
1758 mov ip, ip, lsr #24 /* ip = ...9 */
1759 orr ip, ip, r1, lsl #8 /* ip = .BA9 */
1760 mov r1, r1, lsr #8 /* r1 = ...B */
1763 strh ip, [r0, #0x09]
1764 strb r1, [r0, #0x0b]
1770 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
1776 ldrh r2, [r1, #0x09]
1777 ldrb r1, [r1, #0x0b]
1780 strh r2, [r0, #0x09]
1781 strb r1, [r0, #0x0b]
1783 #endif /* !_STANDALONE */