2 * memcpy - copy memory area
4 * Copyright (c) 2012-2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
10 * ARMv8-a, AArch64, unaligned accesses.
14 #include <machine/asm.h>
45 /* This implementation handles overlaps and supports both memcpy and memmove
46 from a single entry point. It uses unaligned accesses and branchless
47 sequences to keep the code small and simple, and to improve performance.
49 Copies are split into 3 main cases: small copies of up to 32 bytes, medium
50 copies of up to 128 bytes, and large copies. The overhead of the overlap
51 check is negligible since it is only required for large copies.
53 Large copies use a software pipelined loop processing 64 bytes per iteration.
54 The destination pointer is 16-byte aligned to minimize unaligned accesses.
55 The loop tail is handled by always copying 64 bytes from the end.
/* NOTE(review): this is an elided listing of a combined AArch64
   memcpy/memmove -- the entry label, the register aliases
   (src, dstin, count, dst, tmp1, A_l..H_h, ...) and a number of
   instructions/labels are not visible here (the embedded line
   numbers are non-contiguous).  Comments below state only what the
   visible instructions establish; anything depending on elided
   code is hedged.  */
/* Compute one-past-the-end pointers; every tail copy below is
   addressed relative to these, which is what makes the
   "copy N bytes from the end" trick branch-free.  */
60 add srcend, src, count
61 add dstend, dstin, count
67 /* Small copies: 0..32 bytes. */
/* Tail half of the small-copy path: the last 16 bytes are copied
   relative to srcend/dstend.  The matching head copy from [src]
   to [dstin] is elided from this view; the two 16-byte chunks
   overlap when count < 32, which is harmless since all loads are
   presumably done before the overlapping stores -- TODO confirm
   against the full source.  */
71 ldp D_l, D_h, [srcend, -16]
73 stp D_l, D_h, [dstend, -16]
76 /* Copy 8-15 bytes. */
/* Bit 3 of count clear => count < 8 on this path; fall through
   handles 8..15 via overlapping head/tail 8-byte copies (the
   head load/store pair is elided).  */
78 tbz count, 3, L(copy8)
/* Bit 2 of count distinguishes 4..7 bytes from 0..3.  */
88 tbz count, 2, L(copy4)
/* 4..7 bytes: last word from the end (the matching head word
   from [src] is elided); the two word stores overlap to cover
   the whole range exactly.  */
90 ldr B_lw, [srcend, -4]
92 str B_lw, [dstend, -4]
95 /* Copy 0..3 bytes using a branchless sequence. */
/* tmp1 is presumably count >> 1 (its computation is elided --
   TODO confirm): storing the first byte, the middle byte and the
   last byte covers any length 1..3 without branching.  A guard
   for count == 0 must exist in elided code, since these accesses
   would otherwise underflow the buffers.  */
100 ldrb C_lw, [srcend, -1]
101 ldrb B_lw, [src, tmp1]
103 strb B_lw, [dstin, tmp1]
104 strb C_lw, [dstend, -1]
109 /* Medium copies: 33..128 bytes. */
/* 32 bytes from the start (load of A from [src] is elided) and
   32 bytes from the end; the chunks overlap when count < 64.
   All loads complete before any store, so an overlapping
   src/dst region is still copied correctly.  */
112 ldp B_l, B_h, [src, 16]
113 ldp C_l, C_h, [srcend, -32]
114 ldp D_l, D_h, [srcend, -16]
117 stp A_l, A_h, [dstin]
118 stp B_l, B_h, [dstin, 16]
119 stp C_l, C_h, [dstend, -32]
120 stp D_l, D_h, [dstend, -16]
124 /* Copy 65..128 bytes. */
/* 64 bytes from the start (A/B loads elided, E/F visible) plus
   64 bytes from the end (G/H here, C/D loaded above); again every
   load precedes the stores that could clobber source data.  */
126 ldp E_l, E_h, [src, 32]
127 ldp F_l, F_h, [src, 48]
130 ldp G_l, G_h, [srcend, -64]
131 ldp H_l, H_h, [srcend, -48]
132 stp G_l, G_h, [dstend, -64]
133 stp H_l, H_h, [dstend, -48]
135 stp A_l, A_h, [dstin]
136 stp B_l, B_h, [dstin, 16]
137 stp E_l, E_h, [dstin, 32]
138 stp F_l, F_h, [dstin, 48]
139 stp C_l, C_h, [dstend, -32]
140 stp D_l, D_h, [dstend, -16]
144 /* Copy more than 128 bytes. */
146 /* Use backwards copy if there is an overlap. */
/* The flag-setting compare (presumably dstin - src vs count) is
   elided; b.lo takes the backward path when dst lies inside the
   source range, so the forward loop would overwrite unread
   source bytes.  */
150 b.lo L(copy_long_backwards)
152 /* Copy 16 bytes and then align dst to 16-byte alignment. */
/* tmp1 holds the destination misalignment (computed in elided
   code, along with the head D load and the dst setup); count is
   biased by tmp1 so the loop's subs test below stays simple.  */
158 add count, count, tmp1 /* Count is now 16 too large. */
159 ldp A_l, A_h, [src, 16]
160 stp D_l, D_h, [dstin]
161 ldp B_l, B_h, [src, 32]
162 ldp C_l, C_h, [src, 48]
163 ldp D_l, D_h, [src, 64]! /* Pre-index writeback: src += 64.  */
164 subs count, count, 128 + 16 /* Test and readjust count. */
165 b.ls L(copy64_from_end)
/* Software-pipelined forward loop, 64 bytes per iteration: each
   trip stores the four pairs loaded on the previous trip, then
   reloads the next 64 bytes.  (The L(loop64) label and its
   closing b.hi are elided from this view.)  */
168 stp A_l, A_h, [dst, 16]
169 ldp A_l, A_h, [src, 16]
170 stp B_l, B_h, [dst, 32]
171 ldp B_l, B_h, [src, 32]
172 stp C_l, C_h, [dst, 48]
173 ldp C_l, C_h, [src, 48]
174 stp D_l, D_h, [dst, 64]! /* Pre-index writeback: dst += 64.  */
175 ldp D_l, D_h, [src, 64]!
176 subs count, count, 64
179 /* Write the last iteration and copy 64 bytes from the end. */
/* Drain the pipeline (A..D still pending from the last loads)
   while fetching the final 64 bytes relative to srcend, so the
   tail needs no residual-count arithmetic; the end-relative
   stores may overlap the dst-relative ones, which is safe since
   the loads were all issued first.  */
181 ldp E_l, E_h, [srcend, -64]
182 stp A_l, A_h, [dst, 16]
183 ldp A_l, A_h, [srcend, -48]
184 stp B_l, B_h, [dst, 32]
185 ldp B_l, B_h, [srcend, -32]
186 stp C_l, C_h, [dst, 48]
187 ldp C_l, C_h, [srcend, -16]
188 stp D_l, D_h, [dst, 64]
189 stp E_l, E_h, [dstend, -64]
190 stp A_l, A_h, [dstend, -48]
191 stp B_l, B_h, [dstend, -32]
192 stp C_l, C_h, [dstend, -16]
197 /* Large backwards copy for overlapping copies.
198 Copy 16 bytes and then align dst to 16-byte alignment. */
199 L(copy_long_backwards):
/* Mirror of the forward path, walking downward from the ends.
   tmp1 is presumably the low bits of dstend (its computation is
   elided -- TODO confirm); subtracting it aligns the store
   pointer to 16 bytes after the head 16-byte copy below.  */
200 ldp D_l, D_h, [srcend, -16]
202 sub srcend, srcend, tmp1
203 sub count, count, tmp1
204 ldp A_l, A_h, [srcend, -16]
205 stp D_l, D_h, [dstend, -16]
206 ldp B_l, B_h, [srcend, -32]
207 ldp C_l, C_h, [srcend, -48]
208 ldp D_l, D_h, [srcend, -64]! /* Pre-index writeback: srcend -= 64.  */
209 sub dstend, dstend, tmp1
210 subs count, count, 128
211 b.ls L(copy64_from_start)
/* Software-pipelined backward loop (the L(loop64_backwards)
   label line is elided): store the previous iteration's pairs
   below dstend, reload the next 64 bytes below srcend.  */
214 stp A_l, A_h, [dstend, -16]
215 ldp A_l, A_h, [srcend, -16]
216 stp B_l, B_h, [dstend, -32]
217 ldp B_l, B_h, [srcend, -32]
218 stp C_l, C_h, [dstend, -48]
219 ldp C_l, C_h, [srcend, -48]
220 stp D_l, D_h, [dstend, -64]! /* Pre-index writeback: dstend -= 64.  */
221 ldp D_l, D_h, [srcend, -64]!
222 subs count, count, 64
223 b.hi L(loop64_backwards)
225 /* Write the last iteration and copy 64 bytes from the start. */
226 L(copy64_from_start):
/* Drain the backward pipeline into dstend-relative slots, then
   store the first 64 source bytes at dstin (the loads of some
   head pairs are elided here); the start-relative stores may
   overlap the end-relative ones, which is safe as the loads all
   happen first.  */
227 ldp G_l, G_h, [src, 48]
228 stp A_l, A_h, [dstend, -16]
229 ldp A_l, A_h, [src, 32]
230 stp B_l, B_h, [dstend, -32]
231 ldp B_l, B_h, [src, 16]
232 stp C_l, C_h, [dstend, -48]
234 stp D_l, D_h, [dstend, -64]
235 stp G_l, G_h, [dstin, 48]
236 stp A_l, A_h, [dstin, 32]
237 stp B_l, B_h, [dstin, 16]
238 stp C_l, C_h, [dstin]