From 78e0dea2e21225f0b2b7f2e0618b65be338f6bc3 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 20 Nov 2018 17:10:44 +0000 Subject: [PATCH] MFC r339531,r339579,r340252,r340463,r340464,340472,r340587 amd64: tidy up memset to have rax set earlier for small sizes amd64: finish the tail in memset with an overlapping store amd64: align memset buffers to 16 bytes before using rep stos amd64: convert libc bzero to a C func to avoid future bloat amd64: sync up libc memset with the kernel version amd64: handle small memset buffers with overlapping stores Fix -DNO_CLEAN amd64 build after r340463 --- Makefile.inc1 | 7 ++ lib/libc/amd64/string/Makefile.inc | 1 - lib/libc/amd64/string/bzero.S | 7 -- lib/libc/amd64/string/bzero.c | 15 +++ lib/libc/amd64/string/memset.S | 141 ++++++++++++++++------------- sys/amd64/amd64/support.S | 124 +++++++++++++++---------- 6 files changed, 175 insertions(+), 120 deletions(-) delete mode 100644 lib/libc/amd64/string/bzero.S create mode 100644 lib/libc/amd64/string/bzero.c diff --git a/Makefile.inc1 b/Makefile.inc1 index f0632112797..085dc8206d6 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -948,6 +948,13 @@ _cleanobj_fast_depend_hack: .PHONY ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \ fi .endfor +# 20181115 r340463 bzero reimplemented as .c + @if [ -e "${OBJTOP}/lib/libc/.depend.bzero.o" ] && \ + egrep -qw 'bzero\.[sS]' ${OBJTOP}/lib/libc/.depend.bzero.o; then \ + echo "Removing stale dependencies for bzero"; \ + rm -f ${OBJTOP}/lib/libc/.depend.bzero.* \ + ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.bzero.*}; \ + fi # 20181009 track migration from ntp's embedded libevent to updated one @if [ -e "${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o" ] && \ egrep -q 'contrib/ntp/sntp/libevent/bufferevent_openssl.c' \ diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index a8a07a1eb58..db88ac72353 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -2,7 +2,6 @@ MDSRCS+= \ bcmp.S \ - bzero.S \ memcmp.S \ memcpy.S \ memmove.S \ diff --git a/lib/libc/amd64/string/bzero.S b/lib/libc/amd64/string/bzero.S deleted file mode 100644 index 123d9c4a7ef..00000000000 --- a/lib/libc/amd64/string/bzero.S +++ /dev/null @@ -1,7 +0,0 @@ -/* $FreeBSD */ - -#include -__FBSDID("$FreeBSD$"); - -#define BZERO -#include "memset.S" diff --git a/lib/libc/amd64/string/bzero.c b/lib/libc/amd64/string/bzero.c new file mode 100644 index 00000000000..1ab391076b0 --- /dev/null +++ b/lib/libc/amd64/string/bzero.c @@ -0,0 +1,15 @@ +/*- + * Public domain. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +void +bzero(void *b, size_t len) +{ + + memset(b, 0, len); +} diff --git a/lib/libc/amd64/string/memset.S b/lib/libc/amd64/string/memset.S index b2a871b0038..67f21714b3d 100644 --- a/lib/libc/amd64/string/memset.S +++ b/lib/libc/amd64/string/memset.S @@ -31,101 +31,112 @@ #include __FBSDID("$FreeBSD$"); -.macro MEMSET bzero erms -.if \bzero == 1 - movq %rsi,%rcx - movq %rsi,%rdx - xorl %eax,%eax -.else - movq %rdi,%r9 +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ + +.macro MEMSET erms + movq %rdi,%rax movq %rdx,%rcx movzbq %sil,%r8 - movabs $0x0101010101010101,%rax - imulq %r8,%rax -.endif + movabs $0x0101010101010101,%r10 + imulq %r8,%r10 cmpq $32,%rcx - jb 1016f + jbe 101632f cmpq $256,%rcx ja 1256f -1032: - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) +103200: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %r10,16(%rdi) + movq %r10,24(%rdi) leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx - jae 1032b - cmpb $0,%cl - je 1000f -1016: + ja 103200b cmpb $16,%cl - jl 1008f - movq %rax,(%rdi) - movq %rax,8(%rdi) - subb $16,%cl - jz 1000f - leaq 16(%rdi),%rdi -1008: + ja 201632f + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +101632: + cmpb $16,%cl + jl 100816f +201632: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +100816: cmpb $8,%cl - jl 1004f - movq %rax,(%rdi) - subb $8,%cl - jz 1000f - leaq 8(%rdi),%rdi -1004: + jl 100408f + movq %r10,(%rdi) + movq %r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +100408: cmpb $4,%cl - jl 1002f - movl %eax,(%rdi) - subb $4,%cl - jz 1000f - leaq 4(%rdi),%rdi -1002: + jl 100204f + movl %r10d,(%rdi) + movl %r10d,-4(%rdi,%rcx) + ret + ALIGN_TEXT +100204: cmpb $2,%cl - jl 1001f - movw %ax,(%rdi) - subb $2,%cl - jz 1000f - leaq 2(%rdi),%rdi -1001: - cmpb $1,%cl - jl 1000f - movb %al,(%rdi) -1000: -.if \bzero == 0 - movq %r9,%rax -.endif + jl 100001f + movw %r10w,(%rdi) + movw %r10w,-2(%rdi,%rcx) ret - + ALIGN_TEXT +100001: + cmpb $0,%cl + je 100000f + movb %r10b,(%rdi) +100000: + ret + ALIGN_TEXT 1256: + movq %rdi,%r9 + movq %r10,%rax + testl $15,%edi + jnz 3f +1: .if \erms == 1 rep stosb + movq %r9,%rax .else + movq %rcx,%rdx shrq $3,%rcx rep stosq - movq %rdx,%rcx - andb $7,%cl - jne 1004b -.endif -.if \bzero == 0 movq %r9,%rax + andl $7,%edx + jnz 2f + ret +2: + movq %r10,-8(%rdi,%rdx) .endif ret + ALIGN_TEXT +3: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %rdi,%r8 + andq $15,%r8 + leaq -16(%rcx,%r8),%rcx + neg %r8 + leaq 16(%rdi,%r8),%rdi + jmp 1b .endm -#ifndef BZERO + ENTRY(memset) - MEMSET bzero=0 erms=0 + MEMSET erms=0 END(memset) -#else -ENTRY(bzero) - MEMSET bzero=1 erms=0 -END(bzero) -#endif .section .note.GNU-stack,"",%progbits diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 356f20cbfbf..1bb82ef43b9 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -452,82 +452,112 @@ END(memcpy_erms) */ .macro MEMSET erms PUSH_FRAME_POINTER - movq %rdi,%r9 + movq %rdi,%rax movq %rdx,%rcx movzbq %sil,%r8 - movabs $0x0101010101010101,%rax - imulq %r8,%rax + movabs $0x0101010101010101,%r10 + imulq %r8,%r10 cmpq $32,%rcx - jb 1016f + jbe 101632f cmpq $256,%rcx ja 1256f -1032: - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) +103200: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %r10,16(%rdi) + movq %r10,24(%rdi) leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx - jae 1032b - cmpb $0,%cl - je 1000f -1016: + ja 103200b cmpb $16,%cl - jl 1008f - movq %rax,(%rdi) - movq %rax,8(%rdi) - subb $16,%cl - jz 1000f - leaq 16(%rdi),%rdi -1008: + ja 201632f + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +101632: + cmpb $16,%cl + jl 100816f +201632: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100816: cmpb $8,%cl - jl 1004f - movq %rax,(%rdi) - subb $8,%cl - jz 1000f - leaq 8(%rdi),%rdi -1004: + jl 100408f + movq %r10,(%rdi) + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100408: cmpb $4,%cl - jl 1002f - movl %eax,(%rdi) - subb $4,%cl - jz 1000f - leaq 4(%rdi),%rdi -1002: + jl 100204f + movl %r10d,(%rdi) + movl %r10d,-4(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100204: cmpb $2,%cl - jl 1001f - movw %ax,(%rdi) - subb $2,%cl - jz 1000f - leaq 2(%rdi),%rdi -1001: - cmpb $1,%cl - jl 1000f - movb %al,(%rdi) -1000: - movq %r9,%rax + jl 100001f + movw %r10w,(%rdi) + movw %r10w,-2(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100001: + cmpb $0,%cl + je 100000f + movb %r10b,(%rdi) +100000: POP_FRAME_POINTER ret ALIGN_TEXT 1256: + movq %rdi,%r9 + movq %r10,%rax + testl $15,%edi + jnz 3f +1: .if \erms == 1 rep stosb + movq %r9,%rax .else + movq %rcx,%rdx shrq $3,%rcx rep stosq - movq %rdx,%rcx - andb $7,%cl - jne 1004b -.endif movq %r9,%rax + andl $7,%edx + jnz 2f POP_FRAME_POINTER ret +2: + movq %r10,-8(%rdi,%rdx) +.endif + POP_FRAME_POINTER + ret + ALIGN_TEXT +3: + movq %r10,(%rdi) + movq %r10,8(%rdi) + movq %rdi,%r8 + andq $15,%r8 + leaq -16(%rcx,%r8),%rcx + neg %r8 + leaq 16(%rdi,%r8),%rdi + jmp 1b .endm ENTRY(memset_std) -- 2.45.0