From 088ac3ef4b7c315e5669a38197fd04f76a20b8f1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 16 Nov 2018 00:44:22 +0000 Subject: [PATCH] amd64: handle small memset buffers with overlapping stores Instead of jumping to locations which store the exact number of bytes, use displacement to move the destination. In particular the following fills a region of 8 to 16 bytes (inclusive) branch-free: movq %r10,(%rdi) movq %r10,-8(%rdi,%rcx) For instance, for rcx of 10 the second store starts at rdi + 10 - 8 = rdi + 2. Writing 8 bytes starting at that offset overlaps with 6 bytes written previously and writes 2 new, giving 10 in total. Provides a nice win for smaller stores. Results for other sizes are erratic depending on the microarchitecture. General idea taken from NetBSD (restricted use of the trick) and bionic string functions (use for various ranges like in this patch). Reviewed by: kib (previous version) Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D17660 --- lib/libc/amd64/string/memset.S | 65 ++++++++++++++++++-------------- sys/amd64/amd64/support.S | 69 ++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 58 deletions(-) diff --git a/lib/libc/amd64/string/memset.S b/lib/libc/amd64/string/memset.S index a68d478b7dc..67f21714b3d 100644 --- a/lib/libc/amd64/string/memset.S +++ b/lib/libc/amd64/string/memset.S @@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$"); imulq %r8,%r10 cmpq $32,%rcx - jb 1016f + jbe 101632f cmpq $256,%rcx ja 1256f -1032: +103200: movq %r10,(%rdi) movq %r10,8(%rdi) movq %r10,16(%rdi) @@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$"); leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx - jae 1032b - cmpb $0,%cl - je 1000f -1016: + ja 103200b + cmpb $16,%cl + ja 201632f + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +101632: cmpb $16,%cl - jl 1008f + jl 100816f +201632: movq %r10,(%rdi) movq %r10,8(%rdi) - subb $16,%cl - jz 1000f - leaq 16(%rdi),%rdi -1008: + movq %r10,-16(%rdi,%rcx) + movq 
%r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +100816: cmpb $8,%cl - jl 1004f + jl 100408f movq %r10,(%rdi) - subb $8,%cl - jz 1000f - leaq 8(%rdi),%rdi -1004: + movq %r10,-8(%rdi,%rcx) + ret + ALIGN_TEXT +100408: cmpb $4,%cl - jl 1002f + jl 100204f movl %r10d,(%rdi) - subb $4,%cl - jz 1000f - leaq 4(%rdi),%rdi -1002: + movl %r10d,-4(%rdi,%rcx) + ret + ALIGN_TEXT +100204: cmpb $2,%cl - jl 1001f + jl 100001f movw %r10w,(%rdi) - subb $2,%cl - jz 1000f - leaq 2(%rdi),%rdi -1001: - cmpb $1,%cl - jl 1000f + movw %r10w,-2(%rdi,%rcx) + ret + ALIGN_TEXT +100001: + cmpb $0,%cl + je 100000f movb %r10b,(%rdi) -1000: +100000: ret ALIGN_TEXT 1256: @@ -128,6 +134,7 @@ __FBSDID("$FreeBSD$"); jmp 1b .endm + ENTRY(memset) MEMSET erms=0 END(memset) diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 73e00bae8aa..1bb82ef43b9 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -459,12 +459,12 @@ END(memcpy_erms) imulq %r8,%r10 cmpq $32,%rcx - jb 1016f + jbe 101632f cmpq $256,%rcx ja 1256f -1032: +103200: movq %r10,(%rdi) movq %r10,8(%rdi) movq %r10,16(%rdi) @@ -472,43 +472,54 @@ END(memcpy_erms) leaq 32(%rdi),%rdi subq $32,%rcx cmpq $32,%rcx - jae 1032b - cmpb $0,%cl - je 1000f -1016: + ja 103200b cmpb $16,%cl - jl 1008f + ja 201632f + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +101632: + cmpb $16,%cl + jl 100816f +201632: movq %r10,(%rdi) movq %r10,8(%rdi) - subb $16,%cl - jz 1000f - leaq 16(%rdi),%rdi -1008: + movq %r10,-16(%rdi,%rcx) + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100816: cmpb $8,%cl - jl 1004f + jl 100408f movq %r10,(%rdi) - subb $8,%cl - jz 1000f - leaq 8(%rdi),%rdi -1004: + movq %r10,-8(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100408: cmpb $4,%cl - jl 1002f + jl 100204f movl %r10d,(%rdi) - subb $4,%cl - jz 1000f - leaq 4(%rdi),%rdi -1002: + movl %r10d,-4(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100204: cmpb $2,%cl - jl 1001f + jl 100001f movw 
%r10w,(%rdi) - subb $2,%cl - jz 1000f - leaq 2(%rdi),%rdi -1001: - cmpb $1,%cl - jl 1000f + movw %r10w,-2(%rdi,%rcx) + POP_FRAME_POINTER + ret + ALIGN_TEXT +100001: + cmpb $0,%cl + je 100000f movb %r10b,(%rdi) -1000: +100000: POP_FRAME_POINTER ret ALIGN_TEXT -- 2.45.0