contrib/cortex-strings/src/arm/memcpy.S

   1 /* Copyright (c) 2013, Linaro Limited
   2    All rights reserved.
   3
   4    Redistribution and use in source and binary forms, with or without
   5    modification, are permitted provided that the following conditions
   6    are met:
   7
   8       * Redistributions of source code must retain the above copyright
   9       notice, this list of conditions and the following disclaimer.
  10
  11       * Redistributions in binary form must reproduce the above copyright
  12       notice, this list of conditions and the following disclaimer in the
  13       documentation and/or other materials provided with the distribution.
  14
  15       * Neither the name of Linaro Limited nor the names of its
  16       contributors may be used to endorse or promote products derived
  17       from this software without specific prior written permission.
  18
  19    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33    This memcpy routine is optimised for Cortex-A15 cores and takes advantage
  34    of VFP or NEON when built with the appropriate flags.
  35
  36    Assumptions:
  37
  38     ARMv6 (ARMv7-a if using Neon)
  39     ARM state
  40     Unaligned accesses
  41
  42  */
  43
   44         .syntax unified
   45         /* This implementation requires ARM state.  */
   46         .arm
   47
   48 #ifdef __ARM_NEON__
   49
   50         .fpu    neon
   51         .arch   armv7-a
   52 # define FRAME_SIZE     4       /* Only tmp2 (r10) is spilled.  */
   53 # define USE_VFP
   54 # define USE_NEON
   55
   56 #elif !defined (__SOFTFP__)
   57
   58         .arch   armv6
   59         .fpu    vfpv2
   60 # define FRAME_SIZE     32      /* tmp2 at [sp]; B, C, D pairs at [sp, #8], [sp, #16], [sp, #24].  */
   61 # define USE_VFP
   62
   63 #else
   64         .arch   armv6
   65 # define FRAME_SIZE    32       /* Same frame layout as the VFP-only build.  */
   66
   67 #endif
   68
   69 /* Old versions of GAS incorrectly implement the NEON align semantics.  */
   70 #ifdef BROKEN_ASM_NEON_ALIGN
   71 #define ALIGN(addr, align) addr,:align
   72 #else
   73 #define ALIGN(addr, align) addr:align
   74 #endif
   75
   76 #define PC_OFFSET       8       /* PC pipeline compensation.  */
   77 #define INSN_SIZE       4       /* Bytes per instruction; used by the computed jumps.  */
   78
   79 /* Call parameters.  */
   80 #define dstin   r0              /* Never written: memcpy returns it unchanged.  */
   81 #define src     r1
   82 #define count   r2
   83
   84 /* Locals.  */
   85 #define tmp1    r3
   86 #define dst     ip              /* Working copy of dstin.  */
   87 #define tmp2    r10             /* Saved in the stack frame before use.  */
   88
   89 #ifndef USE_NEON
   90 /* For bulk copies using GP registers.  */
   91 #define A_l     r2              /* Call-clobbered.  Same register as count.  */
   92 #define A_h     r3              /* Call-clobbered.  Same register as tmp1.  */
   93 #define B_l     r4              /* B, C and D are saved in the stack frame before use.  */
   94 #define B_h     r5
   95 #define C_l     r6
   96 #define C_h     r7
   97 #define D_l     r8
   98 #define D_h     r9
   99 #endif
  100
  101 /* Number of lines ahead to pre-fetch data.  If you change this the code
  102    below will need adjustment to compensate.  */
  103
  104 #define prefetch_lines  5
 105
 106 #ifdef USE_VFP
  107         .macro  cpy_line_vfp vreg, base
              /* Write one 64-byte line to dst + base from the values already
                 held in vreg, d0, d1 and d2.  Each register is reloaded from
                 src immediately after it has been stored, so the loads for
                 the following stores are always in flight.  The second
                 reload of vreg reads prefetch_lines * 64 - 32 bytes beyond
                 base: vreg leaves the macro holding read-ahead data, which
                 takes the place of a PLD.  */
  108         vstr    \vreg, [dst, #\base]
  109         vldr    \vreg, [src, #\base]
  110         vstr    d0, [dst, #\base + 8]
  111         vldr    d0, [src, #\base + 8]
  112         vstr    d1, [dst, #\base + 16]
  113         vldr    d1, [src, #\base + 16]
  114         vstr    d2, [dst, #\base + 24]
  115         vldr    d2, [src, #\base + 24]
  116         vstr    \vreg, [dst, #\base + 32]
  117         vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]  /* Read ahead.  */
  118         vstr    d0, [dst, #\base + 40]
  119         vldr    d0, [src, #\base + 40]
  120         vstr    d1, [dst, #\base + 48]
  121         vldr    d1, [src, #\base + 48]
  122         vstr    d2, [dst, #\base + 56]
  123         vldr    d2, [src, #\base + 56]
  124         .endm
 125
  126         .macro  cpy_tail_vfp vreg, base
              /* Same store/reload sequence as cpy_line_vfp, writing one
                 64-byte line to dst + base, except that vreg is not reloaded
                 after its second store: no further read-ahead is issued.
                 d0, d1 and d2 are still reloaded from src after each of
                 their stores.  */
  127         vstr    \vreg, [dst, #\base]
  128         vldr    \vreg, [src, #\base]
  129         vstr    d0, [dst, #\base + 8]
  130         vldr    d0, [src, #\base + 8]
  131         vstr    d1, [dst, #\base + 16]
  132         vldr    d1, [src, #\base + 16]
  133         vstr    d2, [dst, #\base + 24]
  134         vldr    d2, [src, #\base + 24]
  135         vstr    \vreg, [dst, #\base + 32]       /* No reload of vreg here.  */
  136         vstr    d0, [dst, #\base + 40]
  137         vldr    d0, [src, #\base + 40]
  138         vstr    d1, [dst, #\base + 48]
  139         vldr    d1, [src, #\base + 48]
  140         vstr    d2, [dst, #\base + 56]
  141         vldr    d2, [src, #\base + 56]
  142         .endm
 143 #endif
 144
 145         .macro def_fn f p2align=0
 146         .text
 147         .p2align \p2align
 148         .global \f
 149         .type \f, %function
 150 \f:
 151         .endm
 152
 153 def_fn memcpy p2align=6
 154
 155         mov     dst, dstin      /* Preserve dstin, we need to return it.  */
 156         cmp     count, #64
 157         bge     .Lcpy_not_short
 158         /* Deal with small copies quickly by dropping straight into the
 159            exit block.  */
 160
 161 .Ltail63unaligned:
 162 #ifdef USE_NEON
 163         and     tmp1, count, #0x38
 164         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 165         add     pc, pc, tmp1
 166         vld1.8  {d0}, [src]!    /* 14 words to go.  */
 167         vst1.8  {d0}, [dst]!
 168         vld1.8  {d0}, [src]!    /* 12 words to go.  */
 169         vst1.8  {d0}, [dst]!
 170         vld1.8  {d0}, [src]!    /* 10 words to go.  */
 171         vst1.8  {d0}, [dst]!
 172         vld1.8  {d0}, [src]!    /* 8 words to go.  */
 173         vst1.8  {d0}, [dst]!
 174         vld1.8  {d0}, [src]!    /* 6 words to go.  */
 175         vst1.8  {d0}, [dst]!
 176         vld1.8  {d0}, [src]!    /* 4 words to go.  */
 177         vst1.8  {d0}, [dst]!
 178         vld1.8  {d0}, [src]!    /* 2 words to go.  */
 179         vst1.8  {d0}, [dst]!
 180
 181         tst     count, #4
 182         ldrne   tmp1, [src], #4
 183         strne   tmp1, [dst], #4
 184 #else
 185         /* Copy up to 15 full words of data.  May not be aligned.  */
 186         /* Cannot use VFP for unaligned data.  */
 187         and     tmp1, count, #0x3c
 188         add     dst, dst, tmp1
 189         add     src, src, tmp1
 190         rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
 191         /* Jump directly into the sequence below at the correct offset.  */
 192         add     pc, pc, tmp1, lsl #1
 193
 194         ldr     tmp1, [src, #-60]       /* 15 words to go.  */
 195         str     tmp1, [dst, #-60]
 196
 197         ldr     tmp1, [src, #-56]       /* 14 words to go.  */
 198         str     tmp1, [dst, #-56]
 199         ldr     tmp1, [src, #-52]
 200         str     tmp1, [dst, #-52]
 201
 202         ldr     tmp1, [src, #-48]       /* 12 words to go.  */
 203         str     tmp1, [dst, #-48]
 204         ldr     tmp1, [src, #-44]
 205         str     tmp1, [dst, #-44]
 206
 207         ldr     tmp1, [src, #-40]       /* 10 words to go.  */
 208         str     tmp1, [dst, #-40]
 209         ldr     tmp1, [src, #-36]
 210         str     tmp1, [dst, #-36]
 211
 212         ldr     tmp1, [src, #-32]       /* 8 words to go.  */
 213         str     tmp1, [dst, #-32]
 214         ldr     tmp1, [src, #-28]
 215         str     tmp1, [dst, #-28]
 216
 217         ldr     tmp1, [src, #-24]       /* 6 words to go.  */
 218         str     tmp1, [dst, #-24]
 219         ldr     tmp1, [src, #-20]
 220         str     tmp1, [dst, #-20]
 221
 222         ldr     tmp1, [src, #-16]       /* 4 words to go.  */
 223         str     tmp1, [dst, #-16]
 224         ldr     tmp1, [src, #-12]
 225         str     tmp1, [dst, #-12]
 226
 227         ldr     tmp1, [src, #-8]        /* 2 words to go.  */
 228         str     tmp1, [dst, #-8]
 229         ldr     tmp1, [src, #-4]
 230         str     tmp1, [dst, #-4]
 231 #endif
 232
 233         lsls    count, count, #31
 234         ldrhcs  tmp1, [src], #2
 235         ldrbne  src, [src]              /* Src is dead, use as a scratch.  */
 236         strhcs  tmp1, [dst], #2
 237         strbne  src, [dst]
 238         bx      lr
 239
 240 .Lcpy_not_short:
 241         /* At least 64 bytes to copy, but don't know the alignment yet.  */
 242         str     tmp2, [sp, #-FRAME_SIZE]!
 243         and     tmp2, src, #7
 244         and     tmp1, dst, #7
 245         cmp     tmp1, tmp2
 246         bne     .Lcpy_notaligned
 247
 248 #ifdef USE_VFP
 249         /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
 250            that the FP pipeline is much better at streaming loads and
 251            stores.  This is outside the critical loop.  */
 252         vmov.f32        s0, s0
 253 #endif
 254
 255         /* SRC and DST have the same mutual 64-bit alignment, but we may
 256            still need to pre-copy some bytes to get to natural alignment.
 257            We bring SRC and DST into full 64-bit alignment.  */
 258         lsls    tmp2, dst, #29
 259         beq     1f
 260         rsbs    tmp2, tmp2, #0
 261         sub     count, count, tmp2, lsr #29
 262         ldrmi   tmp1, [src], #4
 263         strmi   tmp1, [dst], #4
 264         lsls    tmp2, tmp2, #2
 265         ldrhcs  tmp1, [src], #2
 266         ldrbne  tmp2, [src], #1
 267         strhcs  tmp1, [dst], #2
 268         strbne  tmp2, [dst], #1
 269
 270 1:
 271         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 272         blt     .Ltail63aligned
 273
 274         cmp     tmp2, #512
 275         bge     .Lcpy_body_long
 276
 277 .Lcpy_body_medium:                      /* Count in tmp2.  */
 278 #ifdef USE_VFP
 279 1:
 280         vldr    d0, [src, #0]
 281         subs    tmp2, tmp2, #64
 282         vldr    d1, [src, #8]
 283         vstr    d0, [dst, #0]
 284         vldr    d0, [src, #16]
 285         vstr    d1, [dst, #8]
 286         vldr    d1, [src, #24]
 287         vstr    d0, [dst, #16]
 288         vldr    d0, [src, #32]
 289         vstr    d1, [dst, #24]
 290         vldr    d1, [src, #40]
 291         vstr    d0, [dst, #32]
 292         vldr    d0, [src, #48]
 293         vstr    d1, [dst, #40]
 294         vldr    d1, [src, #56]
 295         vstr    d0, [dst, #48]
 296         add     src, src, #64
 297         vstr    d1, [dst, #56]
 298         add     dst, dst, #64
 299         bge     1b
 300         tst     tmp2, #0x3f
 301         beq     .Ldone
 302
 303 .Ltail63aligned:                        /* Count in tmp2.  */
 304         and     tmp1, tmp2, #0x38
 305         add     dst, dst, tmp1
 306         add     src, src, tmp1
 307         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 308         add     pc, pc, tmp1
 309
 310         vldr    d0, [src, #-56] /* 14 words to go.  */
 311         vstr    d0, [dst, #-56]
 312         vldr    d0, [src, #-48] /* 12 words to go.  */
 313         vstr    d0, [dst, #-48]
 314         vldr    d0, [src, #-40] /* 10 words to go.  */
 315         vstr    d0, [dst, #-40]
 316         vldr    d0, [src, #-32] /* 8 words to go.  */
 317         vstr    d0, [dst, #-32]
 318         vldr    d0, [src, #-24] /* 6 words to go.  */
 319         vstr    d0, [dst, #-24]
 320         vldr    d0, [src, #-16] /* 4 words to go.  */
 321         vstr    d0, [dst, #-16]
 322         vldr    d0, [src, #-8]  /* 2 words to go.  */
 323         vstr    d0, [dst, #-8]
 324 #else
 325         sub     src, src, #8
 326         sub     dst, dst, #8
 327 1:
 328         ldrd    A_l, A_h, [src, #8]
 329         strd    A_l, A_h, [dst, #8]
 330         ldrd    A_l, A_h, [src, #16]
 331         strd    A_l, A_h, [dst, #16]
 332         ldrd    A_l, A_h, [src, #24]
 333         strd    A_l, A_h, [dst, #24]
 334         ldrd    A_l, A_h, [src, #32]
 335         strd    A_l, A_h, [dst, #32]
 336         ldrd    A_l, A_h, [src, #40]
 337         strd    A_l, A_h, [dst, #40]
 338         ldrd    A_l, A_h, [src, #48]
 339         strd    A_l, A_h, [dst, #48]
 340         ldrd    A_l, A_h, [src, #56]
 341         strd    A_l, A_h, [dst, #56]
 342         ldrd    A_l, A_h, [src, #64]!
 343         strd    A_l, A_h, [dst, #64]!
 344         subs    tmp2, tmp2, #64
 345         bge     1b
 346         tst     tmp2, #0x3f
 347         bne     1f
 348         ldr     tmp2,[sp], #FRAME_SIZE
 349         bx      lr
 350 1:
 351         add     src, src, #8
 352         add     dst, dst, #8
 353
 354 .Ltail63aligned:                        /* Count in tmp2.  */
 355         /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
 356            we know that the src and dest are 64-bit aligned so we can use
 357            LDRD/STRD to improve efficiency.  */
 358         /* TMP2 is now negative, but we don't care about that.  The bottom
 359            six bits still tell us how many bytes are left to copy.  */
 360
 361         and     tmp1, tmp2, #0x38
 362         add     dst, dst, tmp1
 363         add     src, src, tmp1
 364         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 365         add     pc, pc, tmp1
 366         ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
 367         strd    A_l, A_h, [dst, #-56]
 368         ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
 369         strd    A_l, A_h, [dst, #-48]
 370         ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
 371         strd    A_l, A_h, [dst, #-40]
 372         ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
 373         strd    A_l, A_h, [dst, #-32]
 374         ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
 375         strd    A_l, A_h, [dst, #-24]
 376         ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
 377         strd    A_l, A_h, [dst, #-16]
 378         ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
 379         strd    A_l, A_h, [dst, #-8]
 380
 381 #endif
 382         tst     tmp2, #4
 383         ldrne   tmp1, [src], #4
 384         strne   tmp1, [dst], #4
 385         lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
 386         ldrhcs  tmp1, [src], #2
 387         ldrbne  tmp2, [src]
 388         strhcs  tmp1, [dst], #2
 389         strbne  tmp2, [dst]
 390
 391 .Ldone:
 392         ldr     tmp2, [sp], #FRAME_SIZE
 393         bx      lr
 394
 395 .Lcpy_body_long:                        /* Count in tmp2.  */
 396
 397         /* Long copy.  We know that there's at least (prefetch_lines * 64)
 398            bytes to go.  */
 399 #ifdef USE_VFP
 400         /* Don't use PLD.  Instead, read some data in advance of the current
 401            copy position into a register.  This should act like a PLD
 402            operation but we won't have to repeat the transfer.  */
 403
 404         vldr    d3, [src, #0]
 405         vldr    d4, [src, #64]
 406         vldr    d5, [src, #128]
 407         vldr    d6, [src, #192]
 408         vldr    d7, [src, #256]
 409
 410         vldr    d0, [src, #8]
 411         vldr    d1, [src, #16]
 412         vldr    d2, [src, #24]
 413         add     src, src, #32
 414
 415         subs    tmp2, tmp2, #prefetch_lines * 64 * 2
 416         blt     2f
 417 1:
 418         cpy_line_vfp    d3, 0
 419         cpy_line_vfp    d4, 64
 420         cpy_line_vfp    d5, 128
 421         add     dst, dst, #3 * 64
 422         add     src, src, #3 * 64
 423         cpy_line_vfp    d6, 0
 424         cpy_line_vfp    d7, 64
 425         add     dst, dst, #2 * 64
 426         add     src, src, #2 * 64
 427         subs    tmp2, tmp2, #prefetch_lines * 64
 428         bge     1b
 429
 430 2:
 431         cpy_tail_vfp    d3, 0
 432         cpy_tail_vfp    d4, 64
 433         cpy_tail_vfp    d5, 128
 434         add     src, src, #3 * 64
 435         add     dst, dst, #3 * 64
 436         cpy_tail_vfp    d6, 0
 437         vstr    d7, [dst, #64]
 438         vldr    d7, [src, #64]
 439         vstr    d0, [dst, #64 + 8]
 440         vldr    d0, [src, #64 + 8]
 441         vstr    d1, [dst, #64 + 16]
 442         vldr    d1, [src, #64 + 16]
 443         vstr    d2, [dst, #64 + 24]
 444         vldr    d2, [src, #64 + 24]
 445         vstr    d7, [dst, #64 + 32]
 446         add     src, src, #96
 447         vstr    d0, [dst, #64 + 40]
 448         vstr    d1, [dst, #64 + 48]
 449         vstr    d2, [dst, #64 + 56]
 450         add     dst, dst, #128
 451         add     tmp2, tmp2, #prefetch_lines * 64
 452         b       .Lcpy_body_medium
 453 #else
 454         /* Long copy.  Use an SMS style loop to maximize the I/O
 455            bandwidth of the core.  We don't have enough spare registers
 456            to synthesise prefetching, so use PLD operations.  */
 457         /* Pre-bias src and dst.  */
 458         sub     src, src, #8
 459         sub     dst, dst, #8
 460         pld     [src, #8]
 461         pld     [src, #72]
 462         subs    tmp2, tmp2, #64
 463         pld     [src, #136]
 464         ldrd    A_l, A_h, [src, #8]
 465         strd    B_l, B_h, [sp, #8]
 466         ldrd    B_l, B_h, [src, #16]
 467         strd    C_l, C_h, [sp, #16]
 468         ldrd    C_l, C_h, [src, #24]
 469         strd    D_l, D_h, [sp, #24]
 470         pld     [src, #200]
 471         ldrd    D_l, D_h, [src, #32]!
 472         b       1f
 473         .p2align        6
 474 2:
 475         pld     [src, #232]
 476         strd    A_l, A_h, [dst, #40]
 477         ldrd    A_l, A_h, [src, #40]
 478         strd    B_l, B_h, [dst, #48]
 479         ldrd    B_l, B_h, [src, #48]
 480         strd    C_l, C_h, [dst, #56]
 481         ldrd    C_l, C_h, [src, #56]
 482         strd    D_l, D_h, [dst, #64]!
 483         ldrd    D_l, D_h, [src, #64]!
 484         subs    tmp2, tmp2, #64
 485 1:
 486         strd    A_l, A_h, [dst, #8]
 487         ldrd    A_l, A_h, [src, #8]
 488         strd    B_l, B_h, [dst, #16]
 489         ldrd    B_l, B_h, [src, #16]
 490         strd    C_l, C_h, [dst, #24]
 491         ldrd    C_l, C_h, [src, #24]
 492         strd    D_l, D_h, [dst, #32]
 493         ldrd    D_l, D_h, [src, #32]
 494         bcs     2b
 495         /* Save the remaining bytes and restore the callee-saved regs.  */
 496         strd    A_l, A_h, [dst, #40]
 497         add     src, src, #40
 498         strd    B_l, B_h, [dst, #48]
 499         ldrd    B_l, B_h, [sp, #8]
 500         strd    C_l, C_h, [dst, #56]
 501         ldrd    C_l, C_h, [sp, #16]
 502         strd    D_l, D_h, [dst, #64]
 503         ldrd    D_l, D_h, [sp, #24]
 504         add     dst, dst, #72
 505         tst     tmp2, #0x3f
 506         bne     .Ltail63aligned
 507         ldr     tmp2, [sp], #FRAME_SIZE
 508         bx      lr
 509 #endif
 510
 511 .Lcpy_notaligned:
 512         pld     [src]
 513         pld     [src, #64]
 514         /* There's at least 64 bytes to copy, but there is no mutual
 515            alignment.  */
 516         /* Bring DST to 64-bit alignment.  */
 517         lsls    tmp2, dst, #29
 518         pld     [src, #(2 * 64)]
 519         beq     1f
 520         rsbs    tmp2, tmp2, #0
 521         sub     count, count, tmp2, lsr #29
 522         ldrmi   tmp1, [src], #4
 523         strmi   tmp1, [dst], #4
 524         lsls    tmp2, tmp2, #2
 525         ldrbne  tmp1, [src], #1
 526         ldrhcs  tmp2, [src], #2
 527         strbne  tmp1, [dst], #1
 528         strhcs  tmp2, [dst], #2
 529 1:
 530         pld     [src, #(3 * 64)]
 531         subs    count, count, #64
 532         ldrmi   tmp2, [sp], #FRAME_SIZE
 533         bmi     .Ltail63unaligned
 534         pld     [src, #(4 * 64)]
 535
 536 #ifdef USE_NEON
 537         vld1.8  {d0-d3}, [src]!
 538         vld1.8  {d4-d7}, [src]!
 539         subs    count, count, #64
 540         bmi     2f
 541 1:
 542         pld     [src, #(4 * 64)]
 543         vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
 544         vld1.8  {d0-d3}, [src]!
 545         vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
 546         vld1.8  {d4-d7}, [src]!
 547         subs    count, count, #64
 548         bpl     1b
 549 2:
 550         vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
 551         vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
 552         ands    count, count, #0x3f
 553 #else
 554         /* Use an SMS style loop to maximize the I/O bandwidth.  */
 555         sub     src, src, #4
 556         sub     dst, dst, #8
 557         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 558         ldr     A_l, [src, #4]
 559         ldr     A_h, [src, #8]
 560         strd    B_l, B_h, [sp, #8]
 561         ldr     B_l, [src, #12]
 562         ldr     B_h, [src, #16]
 563         strd    C_l, C_h, [sp, #16]
 564         ldr     C_l, [src, #20]
 565         ldr     C_h, [src, #24]
 566         strd    D_l, D_h, [sp, #24]
 567         ldr     D_l, [src, #28]
 568         ldr     D_h, [src, #32]!
 569         b       1f
 570         .p2align        6
 571 2:
 572         pld     [src, #(5 * 64) - (32 - 4)]
 573         strd    A_l, A_h, [dst, #40]
 574         ldr     A_l, [src, #36]
 575         ldr     A_h, [src, #40]
 576         strd    B_l, B_h, [dst, #48]
 577         ldr     B_l, [src, #44]
 578         ldr     B_h, [src, #48]
 579         strd    C_l, C_h, [dst, #56]
 580         ldr     C_l, [src, #52]
 581         ldr     C_h, [src, #56]
 582         strd    D_l, D_h, [dst, #64]!
 583         ldr     D_l, [src, #60]
 584         ldr     D_h, [src, #64]!
 585         subs    tmp2, tmp2, #64
 586 1:
 587         strd    A_l, A_h, [dst, #8]
 588         ldr     A_l, [src, #4]
 589         ldr     A_h, [src, #8]
 590         strd    B_l, B_h, [dst, #16]
 591         ldr     B_l, [src, #12]
 592         ldr     B_h, [src, #16]
 593         strd    C_l, C_h, [dst, #24]
 594         ldr     C_l, [src, #20]
 595         ldr     C_h, [src, #24]
 596         strd    D_l, D_h, [dst, #32]
 597         ldr     D_l, [src, #28]
 598         ldr     D_h, [src, #32]
 599         bcs     2b
 600
 601         /* Save the remaining bytes and restore the callee-saved regs.  */
 602         strd    A_l, A_h, [dst, #40]
 603         add     src, src, #36
 604         strd    B_l, B_h, [dst, #48]
 605         ldrd    B_l, B_h, [sp, #8]
 606         strd    C_l, C_h, [dst, #56]
 607         ldrd    C_l, C_h, [sp, #16]
 608         strd    D_l, D_h, [dst, #64]
 609         ldrd    D_l, D_h, [sp, #24]
 610         add     dst, dst, #72
 611         ands    count, tmp2, #0x3f
 612 #endif
 613         ldr     tmp2, [sp], #FRAME_SIZE
 614         bne     .Ltail63unaligned
 615         bx      lr
 616
 617         .size   memcpy, . - memcpy