/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
/* This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)  */

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm
#ifdef __ARM_NEON__
        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_NEON
#elif !defined (__SOFTFP__)
        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP
#else
        .arch   armv6
# define FRAME_SIZE     32
#endif
/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4       /* Every instruction is 4 bytes in ARM state.  */
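/* The jump tables below are entered with "add pc, pc, tmp1".  In ARM
   state reading PC yields the address of the current instruction plus
   PC_OFFSET (8), and the first table entry sits one INSN_SIZE further
   on, which is where the (56 - PC_OFFSET + INSN_SIZE) style constants
   come from.  */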
/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10
/* For bulk copies using GP registers.  */
#define A_l     r2              /* Call-clobbered.  */
#define A_h     r3              /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5
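/* With 64-byte lines this keeps the read stream prefetch_lines * 64 =
   320 bytes ahead of the write stream; the refill load in cpy_line_vfp
   below uses the matching "prefetch_lines * 64 - 32" offset to keep
   that window topped up.  */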
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
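/* cpy_line_vfp is software-pipelined: every vstr drains a register that
   the previous invocation (or the priming loads) filled, and every vldr
   refills it for the next use, so loads and stores issue in alternating
   pairs.  The \vreg slot doubles as the look-ahead register, refilled
   from prefetch_lines * 64 - 32 bytes beyond the current position.  */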
        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
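/* cpy_tail_vfp is the drain variant of cpy_line_vfp: identical except
   that the far look-ahead reload of \vreg is dropped, so the final
   lines can be copied without reading past the end of the source.  */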
        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global \f
        .type \f, %function
\f:
        .endm

def_fn memcpy p2align=6
        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1
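        /* Worked example: tmp1 = count & 0x3c is 60 when 15 words remain.
           Each entry below is one LDR plus one STR, 8 bytes of code, so
           after the LSL #1 the jump lands (60 - tmp1) * 2 bytes into the
           table: at the first entry for 15 words, and just past the last
           entry for none.  PC_OFFSET/2 and INSN_SIZE/2 cancel the PC
           read-ahead under the same shift.  */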
        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr
.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1
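        /* Branch-free decode of the pre-copy count: RSBS leaves N set when
           four or more bytes are needed (the LDRMI/STRMI pair), and the
           LSLS #2 shifts the remaining two bits into C (copy a halfword)
           and Z (copy a byte).  */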
1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned
        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8
.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]
#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp1, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp1, [dst]

        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
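        /* Five lines are in flight at once: d3-d7 are primed with the
           first doubleword of each of the next five 64-byte lines, while
           d0-d2 rotate through the rest of the current line.  Each
           cpy_line_vfp retires one line and refills its register from
           prefetch_lines * 64 bytes ahead.  */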
        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
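        /* SMS = software modulo scheduling: each STRD writes out the value
           that the matching LDRD fetched on the previous iteration, so one
           full loop body of work hides the load latency.  B-D are
           callee-saved, so they are spilled into the frame pushed at
           .Lcpy_not_short, interleaved with the first loads.  */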
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align        6
2:
        pld     [src, #(5 * 64) - (32 - 8)]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
1:
        subs    tmp2, tmp2, #64
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bge     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
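        /* Same branch-free decode of the pre-copy count as the mutually
           aligned path above: MI from the RSBS selects the word copy, and
           the LSLS #2 moves the remaining two bits into C (halfword) and
           Z (byte).  */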
1:
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        blt     2f

1:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bge     1b

2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
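/* This NEON loop double-buffers 64 bytes in d0-d7: each pass stores the
   two 32-byte groups loaded on the previous pass while reloading them,
   so the stores never wait on their loads.  The ALIGN hint on the store
   side is safe because DST was brought to 64-bit alignment above.  */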
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        strd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #(5 * 64) - (32 - 4)]
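        /* The (32 - 4) term compensates for the pre-bias of SRC, so the
           prefetch stays a constant prefetch_lines * 64 bytes, five cache
           lines, ahead of the line currently being copied.  */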
        strd    A_l, A_h, [dst, #40]
        strd    B_l, B_h, [dst, #48]
        strd    C_l, C_h, [dst, #56]
        strd    D_l, D_h, [dst, #64]!
        strd    A_l, A_h, [dst, #8]
        strd    B_l, B_h, [dst, #16]
        strd    C_l, C_h, [dst, #24]
        strd    D_l, D_h, [dst, #32]
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr

        .size   memcpy, . - memcpy