/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"
#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif
#ifndef L
# define L(label)	.L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif
#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)
#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)
#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax
/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
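/* A sketch of this dispatch in C (illustrative only; the names are ours):

     if (dst == src) return dst;       // nothing to move
     if (dst >  src) copy_backward();  // start at the end of the buffers
     else            copy_forward();   // start at the beginning

   Copying away from the overlapped region guarantees that every source
   byte is read before the store that could clobber it.  */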
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)
/* Copy [16..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)
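/* This is the pattern used by all of the short paths below: load one block
   from each end of the region, (%rsi) and -16(%rsi, %rdx), before storing
   either one. For any length in [16..32] the two blocks overlap in the
   middle but together cover every byte, so no further branching on the
   exact length is needed.  */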
L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [32..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [64..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_len_128_or_more_forward):
/* Align the destination address.  */
/* Save the first unaligned 64 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8 /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */
	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)
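/* Invariants established by the setup above: %r8 is the first 64-byte-
   aligned address past the start of dst (everything before it is already
   copied), %rbx is the last 64-byte boundary inside dst, and since %rsi
   now holds src - dst, (%r8, %rsi) always addresses the source bytes
   destined for %r8.  */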
	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)
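/* Copies of at least half the shared cache (per SHARED_CACHE_SIZE_HALF,
   assumed here to come from the cache.h include above) would evict most
   of the working set, so they take the non-temporal movntdq loop near
   the end of the file instead.  */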
	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)
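/* Only the destination was aligned, so the loads keep the unaligned form
   (movdqu) while the stores can use the aligned movdqa/movaps forms; the
   prefetcht0 above runs two 64-byte lines ahead of the loop to hide load
   latency.  */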
L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* We copied everything up to the %r8 position in the dst.
	%rdx now holds how many bytes are left to copy.
	%r9 is the source address matching %r8.  */
	lea	(%r8, %rsi), %r9
L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)
L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)
L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)
L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)
L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)
L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	je	L(mm_return)
	testb	$2, %dl
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)
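/* With %rdx known to be in [0..16], the dispatch above tests length bits
   instead of comparing: len & 24 is nonzero exactly when len >= 8, and
   len & 4 is then nonzero exactly when 4 <= len <= 7, so each size class
   is reached with a single testb.  */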
L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)
L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)
L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
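/* Here %rbx holds the 64-byte boundary down to which the backward path
   has already copied, so %rbx - %rdi is exactly the head of dst still
   left to do; control falls through to the short backward cases below.  */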
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)
/* Copy [16..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [32..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [64..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)
L(mm_len_128_or_more_backward):
/* Align the destination address. We must save the last 64 bytes of
	the source first so that the aligned stores cannot overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */
	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)
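/* Mirror image of the forward setup: %r9 walks down through 64-byte-
   aligned dst addresses, %rbx is the first 64-byte boundary past the
   start of dst, and since %r8 holds src - dst, (%r9, %r8) always
   addresses the source bytes destined for %r9.  */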
	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)
	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)
/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	je	L(mm_return)
	testb	$2, %dl
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)
L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)
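/* Rather than duplicating the small cases, this path copies the top 8
   bytes, shrinks %rdx by 8, and re-enters the [0..16] dispatch to finish
   the remaining 1..8 bytes.  */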
L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN
/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)
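/* movntdq writes around the cache, so a huge copy does not flush the
   working set; because non-temporal stores are weakly ordered, the sfence
   above is required to make them globally visible before any later
   stores.  */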
/* Big length copy backward part.  */

	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)
ALIAS_SYMBOL(memcpy, MEMMOVE)
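/* Aliasing memcpy to the memmove implementation is a deliberate choice:
   a memmove that handles overlapping buffers is also a valid memcpy, and
   the extra cost to memcpy callers is a single compare-and-branch on
   entry.  */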