/* $NetBSD: bcopyinout.S,v 1.11 2003/10/13 21:22:40 scw Exp $ */ /*- * Copyright (c) 2002 Wasabi Systems, Inc. * All rights reserved. * * Written by Allen Briggs for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "assym.s" #include .L_arm_memcpy: .word _C_LABEL(_arm_memcpy) .L_min_memcpy_size: .word _C_LABEL(_min_memcpy_size) __FBSDID("$FreeBSD$"); #ifdef _ARM_ARCH_5E #include #else .text .align 0 #ifdef MULTIPROCESSOR .Lcpu_info: .word _C_LABEL(cpu_info) #else .Lcurpcb: .word _C_LABEL(__pcpu) + PC_CURPCB #endif #define SAVE_REGS stmfd sp!, {r4-r11} #define RESTORE_REGS ldmfd sp!, {r4-r11} #if defined(_ARM_ARCH_5E) #define HELLOCPP # #define PREFETCH(rx,o) pld [ rx , HELLOCPP (o) ] #else #define PREFETCH(rx,o) #endif /* * r0 = user space address * r1 = kernel space address * r2 = length * * Copies bytes from user space to kernel space * * We save/restore r4-r11: * r4-r11 are scratch */ ENTRY(copyin) /* Quick exit if length is zero */ teq r2, #0 moveq r0, #0 RETeq ldr r3, .L_arm_memcpy ldr r3, [r3] cmp r3, #0 beq .Lnormal ldr r3, .L_min_memcpy_size ldr r3, [r3] cmp r2, r3 blt .Lnormal stmfd sp!, {r0-r2, r4, lr} mov r3, r0 mov r0, r1 mov r1, r3 mov r3, #2 /* SRC_IS_USER */ ldr r4, .L_arm_memcpy mov lr, pc ldr pc, [r4] cmp r0, #0 ldmfd sp!, {r0-r2, r4, lr} moveq r0, #0 RETeq .Lnormal: SAVE_REGS #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r2, r14} bl _C_LABEL(cpu_number) ldr r4, .Lcpu_info ldr r4, [r4, r0, lsl #2] ldr r4, [r4, #CI_CURPCB] ldmfd sp!, {r0-r2, r14} #else ldr r4, .Lcurpcb ldr r4, [r4] #endif ldr r5, [r4, #PCB_ONFAULT] adr r3, .Lcopyfault str r3, [r4, #PCB_ONFAULT] PREFETCH(r0, 0) PREFETCH(r1, 0) /* * If not too many bytes, take the slow path. */ cmp r2, #0x08 blt .Licleanup /* * Align destination to word boundary. */ and r6, r1, #0x3 ldr pc, [pc, r6, lsl #2] b .Lialend .word .Lialend .word .Lial3 .word .Lial2 .word .Lial1 .Lial3: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lial2: ldrbt r7, [r0], #1 sub r2, r2, #1 strb r7, [r1], #1 .Lial1: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lialend: /* * If few bytes left, finish slow. */ cmp r2, #0x08 blt .Licleanup /* * If source is not aligned, finish slow. */ ands r3, r0, #0x03 bne .Licleanup cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */ blt .Licleanup8 /* * Align destination to cacheline boundary. * If source and destination are nicely aligned, this can be a big * win. If not, it's still cheaper to copy in groups of 32 even if * we don't get the nice cacheline alignment. */ and r6, r1, #0x1f ldr pc, [pc, r6] b .Licaligned .word .Licaligned .word .Lical28 .word .Lical24 .word .Lical20 .word .Lical16 .word .Lical12 .word .Lical8 .word .Lical4 .Lical28:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical24:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical20:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical16:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical12:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 .Lical8:ldrt r7, [r0], #4 sub r2, r2, #4 str r7, [r1], #4 .Lical4:ldrt r6, [r0], #4 sub r2, r2, #4 str r6, [r1], #4 /* * We start with > 0x40 bytes to copy (>= 0x60 got us into this * part of the code, and we may have knocked that down by as much * as 0x1c getting aligned). * * This loop basically works out to: * do { * prefetch-next-cacheline(s) * bytes -= 0x20; * copy cacheline * } while (bytes >= 0x40); * bytes -= 0x20; * copy cacheline */ .Licaligned: PREFETCH(r0, 32) PREFETCH(r1, 32) sub r2, r2, #0x20 /* Copy a cacheline */ ldrt r10, [r0], #4 ldrt r11, [r0], #4 ldrt r6, [r0], #4 ldrt r7, [r0], #4 ldrt r8, [r0], #4 ldrt r9, [r0], #4 stmia r1!, {r10-r11} ldrt r10, [r0], #4 ldrt r11, [r0], #4 stmia r1!, {r6-r11} cmp r2, #0x40 bge .Licaligned sub r2, r2, #0x20 /* Copy a cacheline */ ldrt r10, [r0], #4 ldrt r11, [r0], #4 ldrt r6, [r0], #4 ldrt r7, [r0], #4 ldrt r8, [r0], #4 ldrt r9, [r0], #4 stmia r1!, {r10-r11} ldrt r10, [r0], #4 ldrt r11, [r0], #4 stmia r1!, {r6-r11} cmp r2, #0x08 blt .Liprecleanup .Licleanup8: ldrt r8, [r0], #4 ldrt r9, [r0], #4 sub r2, r2, #8 stmia r1!, {r8, r9} cmp r2, #8 bge .Licleanup8 .Liprecleanup: /* * If we're done, bail. */ cmp r2, #0 beq .Lout .Licleanup: and r6, r2, #0x3 ldr pc, [pc, r6, lsl #2] b .Licend .word .Lic4 .word .Lic1 .word .Lic2 .word .Lic3 .Lic4: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lic3: ldrbt r7, [r0], #1 sub r2, r2, #1 strb r7, [r1], #1 .Lic2: ldrbt r6, [r0], #1 sub r2, r2, #1 strb r6, [r1], #1 .Lic1: ldrbt r7, [r0], #1 subs r2, r2, #1 strb r7, [r1], #1 .Licend: bne .Licleanup .Liout: mov r0, #0 str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET .Lcopyfault: mov r0, #14 /* EFAULT */ str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET /* * r0 = kernel space address * r1 = user space address * r2 = length * * Copies bytes from kernel space to user space * * We save/restore r4-r11: * r4-r11 are scratch */ ENTRY(copyout) /* Quick exit if length is zero */ teq r2, #0 moveq r0, #0 RETeq ldr r3, .L_arm_memcpy ldr r3, [r3] cmp r3, #0 beq .Lnormale ldr r3, .L_min_memcpy_size ldr r3, [r3] cmp r2, r3 blt .Lnormale stmfd sp!, {r0-r2, r4, lr} mov r3, r0 mov r0, r1 mov r1, r3 mov r3, #1 /* DST_IS_USER */ ldr r4, .L_arm_memcpy mov lr, pc ldr pc, [r4] cmp r0, #0 ldmfd sp!, {r0-r2, r4, lr} moveq r0, #0 RETeq .Lnormale: SAVE_REGS #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r2, r14} bl _C_LABEL(cpu_number) ldr r4, .Lcpu_info ldr r4, [r4, r0, lsl #2] ldr r4, [r4, #CI_CURPCB] ldmfd sp!, {r0-r2, r14} #else ldr r4, .Lcurpcb ldr r4, [r4] #endif ldr r5, [r4, #PCB_ONFAULT] adr r3, .Lcopyfault str r3, [r4, #PCB_ONFAULT] PREFETCH(r0, 0) PREFETCH(r1, 0) /* * If not too many bytes, take the slow path. */ cmp r2, #0x08 blt .Lcleanup /* * Align destination to word boundary. */ and r6, r1, #0x3 ldr pc, [pc, r6, lsl #2] b .Lalend .word .Lalend .word .Lal3 .word .Lal2 .word .Lal1 .Lal3: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lal2: ldrb r7, [r0], #1 sub r2, r2, #1 strbt r7, [r1], #1 .Lal1: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lalend: /* * If few bytes left, finish slow. */ cmp r2, #0x08 blt .Lcleanup /* * If source is not aligned, finish slow. */ ands r3, r0, #0x03 bne .Lcleanup cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */ blt .Lcleanup8 /* * Align source & destination to cacheline boundary. */ and r6, r1, #0x1f ldr pc, [pc, r6] b .Lcaligned .word .Lcaligned .word .Lcal28 .word .Lcal24 .word .Lcal20 .word .Lcal16 .word .Lcal12 .word .Lcal8 .word .Lcal4 .Lcal28:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal24:ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal20:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal16:ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal12:ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 .Lcal8: ldr r7, [r0], #4 sub r2, r2, #4 strt r7, [r1], #4 .Lcal4: ldr r6, [r0], #4 sub r2, r2, #4 strt r6, [r1], #4 /* * We start with > 0x40 bytes to copy (>= 0x60 got us into this * part of the code, and we may have knocked that down by as much * as 0x1c getting aligned). * * This loop basically works out to: * do { * prefetch-next-cacheline(s) * bytes -= 0x20; * copy cacheline * } while (bytes >= 0x40); * bytes -= 0x20; * copy cacheline */ .Lcaligned: PREFETCH(r0, 32) PREFETCH(r1, 32) sub r2, r2, #0x20 /* Copy a cacheline */ ldmia r0!, {r6-r11} strt r6, [r1], #4 strt r7, [r1], #4 ldmia r0!, {r6-r7} strt r8, [r1], #4 strt r9, [r1], #4 strt r10, [r1], #4 strt r11, [r1], #4 strt r6, [r1], #4 strt r7, [r1], #4 cmp r2, #0x40 bge .Lcaligned sub r2, r2, #0x20 /* Copy a cacheline */ ldmia r0!, {r6-r11} strt r6, [r1], #4 strt r7, [r1], #4 ldmia r0!, {r6-r7} strt r8, [r1], #4 strt r9, [r1], #4 strt r10, [r1], #4 strt r11, [r1], #4 strt r6, [r1], #4 strt r7, [r1], #4 cmp r2, #0x08 blt .Lprecleanup .Lcleanup8: ldmia r0!, {r8-r9} sub r2, r2, #8 strt r8, [r1], #4 strt r9, [r1], #4 cmp r2, #8 bge .Lcleanup8 .Lprecleanup: /* * If we're done, bail. */ cmp r2, #0 beq .Lout .Lcleanup: and r6, r2, #0x3 ldr pc, [pc, r6, lsl #2] b .Lcend .word .Lc4 .word .Lc1 .word .Lc2 .word .Lc3 .Lc4: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lc3: ldrb r7, [r0], #1 sub r2, r2, #1 strbt r7, [r1], #1 .Lc2: ldrb r6, [r0], #1 sub r2, r2, #1 strbt r6, [r1], #1 .Lc1: ldrb r7, [r0], #1 subs r2, r2, #1 strbt r7, [r1], #1 .Lcend: bne .Lcleanup .Lout: mov r0, #0 str r5, [r4, #PCB_ONFAULT] RESTORE_REGS RET #endif /* * int badaddr_read_1(const uint8_t *src, uint8_t *dest) * * Copies a single 8-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. */ ENTRY(badaddr_read_1) #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r1, r14} bl _C_LABEL(cpu_number) ldr r2, .Lcpu_info ldr r2, [r2, r0, lsl #2] ldr r2, [r2, #CI_CURPCB] ldmfd sp!, {r0-r1, r14} #else ldr r2, .Lcurpcb ldr r2, [r2] #endif ldr ip, [r2, #PCB_ONFAULT] adr r3, 1f str r3, [r2, #PCB_ONFAULT] nop nop nop ldrb r3, [r0] nop nop nop strb r3, [r1] mov r0, #0 /* No fault */ 1: str ip, [r2, #PCB_ONFAULT] RET /* * int badaddr_read_2(const uint16_t *src, uint16_t *dest) * * Copies a single 16-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. */ ENTRY(badaddr_read_2) #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r1, r14} bl _C_LABEL(cpu_number) ldr r2, .Lcpu_info ldr r2, [r2, r0, lsl #2] ldr r2, [r2, #CI_CURPCB] ldmfd sp!, {r0-r1, r14} #else ldr r2, .Lcurpcb ldr r2, [r2] #endif ldr ip, [r2, #PCB_ONFAULT] adr r3, 1f str r3, [r2, #PCB_ONFAULT] nop nop nop ldrh r3, [r0] nop nop nop strh r3, [r1] mov r0, #0 /* No fault */ 1: str ip, [r2, #PCB_ONFAULT] RET /* * int badaddr_read_4(const uint32_t *src, uint32_t *dest) * * Copies a single 32-bit value from src to dest, returning 0 on success, * else EFAULT if a page fault occurred. */ ENTRY(badaddr_read_4) #ifdef MULTIPROCESSOR /* XXX Probably not appropriate for non-Hydra SMPs */ stmfd sp!, {r0-r1, r14} bl _C_LABEL(cpu_number) ldr r2, .Lcpu_info ldr r2, [r2, r0, lsl #2] ldr r2, [r2, #CI_CURPCB] ldmfd sp!, {r0-r1, r14} #else ldr r2, .Lcurpcb ldr r2, [r2] #endif ldr ip, [r2, #PCB_ONFAULT] adr r3, 1f str r3, [r2, #PCB_ONFAULT] nop nop nop ldr r3, [r0] nop nop nop str r3, [r1] mov r0, #0 /* No fault */ 1: str ip, [r2, #PCB_ONFAULT] RET