2 * Copyright (c) 1982, 1986, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
49 #include <sys/mutex.h>
51 #include <sys/malloc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/sched.h>
54 #include <sys/sysctl.h>
55 #include <sys/vnode.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_map.h>
60 #ifdef ZERO_COPY_SOCKETS
61 #include <vm/vm_param.h>
63 #if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
64 #include <vm/vm_object.h>
67 SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
68 "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
/*
 * Forward declaration of the zero-copy/ioopt transfer helper used by
 * uiomoveco() below.
 * NOTE(review): this extract has dropped lines (the fused original
 * line numbers below are non-contiguous); the #endif that closes the
 * first conditional is not visible here -- confirm against the full
 * source.
 */
70 #if defined(ZERO_COPY_SOCKETS) || defined(ENABLE_VFS_IOOPT)
71 static int userspaceco(caddr_t cp, u_int cnt, struct uio *uio,
72 struct vm_object *obj, int disposable);
75 #ifdef ZERO_COPY_SOCKETS
/* Tunable controlling the page-trading receive path; see userspaceco(). */
76 /* Declared in uipc_socket.c */
77 extern int so_zero_copy_receive;
/*
 * vm_pgmoveco -- "move" a kernel page into a user mapping by renaming
 * the backing vm_page into the user's VM object rather than copying
 * the data (zero-copy socket receive support).
 *
 * NOTE(review): this extract has dropped interior lines (the fused
 * original line numbers are non-contiguous), so the prototype's
 * continuation, several declarations, braces and error paths are
 * missing from this fragment -- confirm against the full source.
 */
79 static int vm_pgmoveco(vm_map_t mapa, vm_offset_t srcobj, vm_offset_t kaddr,
83 vm_pgmoveco(mapa, srcobj, kaddr, uaddr)
86 vm_offset_t kaddr, uaddr;
89 vm_page_t kern_pg, user_pg;
92 vm_pindex_t upindex, kpindex;
97 * First lookup the kernel page.
/* Translate the kernel virtual address to its backing vm_page. */
99 kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
/* Find the destination VM object/pindex behind the user address. */
101 if ((vm_map_lookup(&map, uaddr,
102 VM_PROT_READ, &entry, &uobject,
103 &upindex, &prot, &wired)) != KERN_SUCCESS) {
/*
 * If a page already exists at the user address, unmap it and free it
 * so the kernel page can take its place.
 */
106 if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
107 vm_page_lock_queues();
108 if (!vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"))
109 vm_page_unlock_queues();
110 pmap_remove(map->pmap, uaddr, uaddr+PAGE_SIZE);
111 vm_page_lock_queues();
112 vm_page_busy(user_pg);
113 vm_page_free(user_pg);
114 vm_page_unlock_queues();
/* Sanity check: never rename a busy, held, PG_BUSY, or free page. */
117 if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
118 (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
119 printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
120 "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
121 kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
122 kern_pg->hold_count, (u_long)kern_pg->phys_addr);
123 if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
124 panic("vm_pgmoveco: renaming free page");
126 panic("vm_pgmoveco: renaming busy page");
/* Rename the kernel page into the user object and mark it fully valid. */
128 kpindex = kern_pg->pindex;
129 vm_page_busy(kern_pg);
130 vm_page_rename(kern_pg, uobject, upindex);
131 vm_page_flag_clear(kern_pg, PG_BUSY);
132 kern_pg->valid = VM_PAGE_BITS_ALL;
/* Drop the lookup reference taken by vm_map_lookup() above. */
134 vm_map_lookup_done(map, entry);
135 return(KERN_SUCCESS);
137 #endif /* ZERO_COPY_SOCKETS */
/*
 * Body of uiomove(): copy data between a kernel buffer (cp, n bytes)
 * and the iovec segments described by a struct uio, in the direction
 * given by uio_rw, for user- or system-space segments.
 *
 * NOTE(review): the function header and many interior lines (case
 * labels, per-iovec bookkeeping, returns) were dropped from this
 * extract -- the fused original line numbers are non-contiguous.
 */
143 register struct uio *uio;
145 struct thread *td = curthread;
146 register struct iovec *iov;
/* Contract checks: direction must be READ/WRITE; a user-space uio
 * must be processed by its owning thread. */
151 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
153 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
/* Set TDF_DEADLKTREAT for the duration, remembering its prior state. */
157 mtx_lock_spin(&sched_lock);
158 save = td->td_flags & TDF_DEADLKTREAT;
159 td->td_flags |= TDF_DEADLKTREAT;
160 mtx_unlock_spin(&sched_lock);
163 while (n > 0 && uio->uio_resid) {
174 switch (uio->uio_segflg) {
/* User-space segment: yield if we have hogged the CPU, then
 * copyout()/copyin() depending on direction. */
177 if (ticks - PCPU_GET(switchticks) >= hogticks)
179 if (uio->uio_rw == UIO_READ)
180 error = copyout(cp, iov->iov_base, cnt);
182 error = copyin(iov->iov_base, cp, cnt);
/* System-space segment: plain bcopy() in the requested direction. */
188 if (uio->uio_rw == UIO_READ)
189 bcopy(cp, iov->iov_base, cnt);
191 bcopy(iov->iov_base, cp, cnt);
/* Advance the iovec cursor and the uio residual/offset by cnt. */
196 iov->iov_base = (char *)iov->iov_base + cnt;
198 uio->uio_resid -= cnt;
199 uio->uio_offset += cnt;
/* Defensive check that curthread did not change across copyin/copyout. */
204 if (td != curthread) printf("uiomove: IT CHANGED!");
205 td = curthread; /* Might things have changed in copyin/copyout? */
/* Restore the saved TDF_DEADLKTREAT state. */
207 mtx_lock_spin(&sched_lock);
208 td->td_flags = (td->td_flags & ~TDF_DEADLKTREAT) | save;
209 mtx_unlock_spin(&sched_lock);
214 #if defined(ENABLE_VFS_IOOPT) || defined(ZERO_COPY_SOCKETS)
216 * Experimental support for zero-copy I/O
/*
 * userspaceco -- move cnt bytes at cp out to (or in from) the current
 * user iovec, using zero-copy page tricks when all the alignment and
 * configuration preconditions hold, otherwise falling back to plain
 * copyout()/copyin().
 *
 * NOTE(review): fragment -- the fused original line numbers are
 * non-contiguous; parameter declarations, braces and else-branches
 * are missing from this extract.
 */
219 userspaceco(cp, cnt, uio, obj, disposable)
223 struct vm_object *obj;
231 #ifdef ZERO_COPY_SOCKETS
233 if (uio->uio_rw == UIO_READ) {
/* Page-trading path: requires the feature enabled, a whole number of
 * pages, page-aligned source/destination/offset, a default object,
 * and a disposable buffer. */
234 if ((so_zero_copy_receive != 0)
236 && ((cnt & PAGE_MASK) == 0)
237 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
238 && ((uio->uio_offset & PAGE_MASK) == 0)
239 && ((((intptr_t) cp) & PAGE_MASK) == 0)
240 && (obj->type == OBJT_DEFAULT)
241 && (disposable != 0)) {
242 /* SOCKET: use page-trading */
244 * We only want to call vm_pgmoveco() on
245 * disposeable pages, since it gives the
246 * kernel page to the userland process.
248 error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
249 obj, (vm_offset_t)cp,
250 (vm_offset_t)iov->iov_base);
/* On vm_pgmoveco() failure, fall back to an ordinary copyout(). */
253 * If we get an error back, attempt
254 * to use copyout() instead. The
255 * disposable page should be freed
256 * automatically if we weren't able to move
260 error = copyout(cp, iov->iov_base, cnt);
261 #ifdef ENABLE_VFS_IOOPT
/* VFS ioopt path: same page-alignment preconditions, moved by
 * vm_uiomove() instead of page trading. */
262 } else if ((vfs_ioopt != 0)
263 && ((cnt & PAGE_MASK) == 0)
264 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
265 && ((uio->uio_offset & PAGE_MASK) == 0)
266 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
267 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
268 uio->uio_offset, cnt,
269 (vm_offset_t) iov->iov_base, NULL);
270 #endif /* ENABLE_VFS_IOOPT */
/* Default read path: ordinary copyout(). */
272 error = copyout(cp, iov->iov_base, cnt);
/* Write direction: always an ordinary copyin(). */
275 error = copyin(iov->iov_base, cp, cnt);
/* Variant compiled when ZERO_COPY_SOCKETS is not defined: only the
 * ioopt path plus the plain copyout()/copyin() fallbacks. */
277 #else /* ZERO_COPY_SOCKETS */
278 if (uio->uio_rw == UIO_READ) {
279 #ifdef ENABLE_VFS_IOOPT
281 && ((cnt & PAGE_MASK) == 0)
282 && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
283 && ((uio->uio_offset & PAGE_MASK) == 0)
284 && ((((intptr_t) cp) & PAGE_MASK) == 0)) {
285 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
286 uio->uio_offset, cnt,
287 (vm_offset_t) iov->iov_base, NULL);
289 #endif /* ENABLE_VFS_IOOPT */
291 error = copyout(cp, iov->iov_base, cnt);
294 error = copyin(iov->iov_base, cp, cnt);
296 #endif /* ZERO_COPY_SOCKETS */
/*
 * uiomoveco -- like uiomove(), but user-space segments go through
 * userspaceco() so eligible transfers can use the zero-copy paths.
 *
 * NOTE(review): fragment -- return type, parameter declarations,
 * case labels and braces were dropped from this extract (fused line
 * numbers are non-contiguous).
 */
302 uiomoveco(cp, n, uio, obj, disposable)
306 struct vm_object *obj;
/* Same contract checks as uiomove(). */
313 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
314 ("uiomoveco: mode"));
315 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
318 while (n > 0 && uio->uio_resid) {
329 switch (uio->uio_segflg) {
/* User-space segment: yield if hogging, then the zero-copy helper. */
332 if (ticks - PCPU_GET(switchticks) >= hogticks)
335 error = userspaceco(cp, cnt, uio, obj, disposable);
/* System-space segment: plain bcopy() in the requested direction. */
342 if (uio->uio_rw == UIO_READ)
343 bcopy(cp, iov->iov_base, cnt);
345 bcopy(iov->iov_base, cp, cnt);
/* Advance the iovec cursor and uio bookkeeping by cnt. */
350 iov->iov_base = (char *)iov->iov_base + cnt;
352 uio->uio_resid -= cnt;
353 uio->uio_offset += cnt;
359 #endif /* ENABLE_VFS_IOOPT || ZERO_COPY_SOCKETS */
361 #ifdef ENABLE_VFS_IOOPT
364 * Experimental support for zero-copy I/O
/*
 * uioread -- satisfy up to n bytes of a read by moving whole pages
 * from obj into the user's address space with vm_uiomove(); *nread
 * presumably accumulates the bytes moved (declaration not visible
 * here -- confirm against the full source).
 *
 * NOTE(review): fragment -- fused original line numbers are
 * non-contiguous; declarations, braces and the fallback path are
 * missing from this extract.
 */
367 uioread(n, uio, obj, nread)
370 struct vm_object *obj;
384 while (n > 0 && uio->uio_resid) {
/* Zero-copy is attempted only for page-aligned user-space iovecs at a
 * page-aligned uio offset. */
395 if ((uio->uio_segflg == UIO_USERSPACE) &&
396 ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
397 ((uio->uio_offset & PAGE_MASK) == 0) ) {
404 if (ticks - PCPU_GET(switchticks) >= hogticks)
406 error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
407 uio->uio_offset, cnt,
408 (vm_offset_t) iov->iov_base, &npagesmoved);
410 if (npagesmoved == 0)
/* Convert the pages actually moved back into a byte count. */
413 tcnt = npagesmoved * PAGE_SIZE;
/* Advance the iovec cursor and uio bookkeeping by cnt. */
419 iov->iov_base = (char *)iov->iov_base + cnt;
421 uio->uio_resid -= cnt;
422 uio->uio_offset += cnt;
431 #endif /* ENABLE_VFS_IOOPT */
434 * Give next character to user as result of read.
/*
 * NOTE(review): fragment -- the function header was dropped from this
 * extract; from the comment above and the body this appears to be
 * ureadc(c, uio), which stores one character into the uio's current
 * iovec.  Confirm against the full source.
 */
439 register struct uio *uio;
441 register struct iovec *iov;
442 register char *iov_base;
/* Nothing left to fill: no iovecs or no residual space. */
445 if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
/* Skip over an exhausted iovec. */
448 if (iov->iov_len == 0) {
453 switch (uio->uio_segflg) {
/* User-space segment: store the byte with subyte(), checking failure. */
456 if (subyte(iov->iov_base, c) < 0)
/* System-space segment: store through a local copy of the pointer. */
461 iov_base = iov->iov_base;
463 iov->iov_base = iov_base;
/* Advance past the single character just written. */
469 iov->iov_base = (char *)iov->iov_base + 1;
477 * General routine to allocate a hash table.
/*
 * hashinit -- allocate a power-of-two-sized table of LIST_HEADs large
 * enough for `elements' entries; stores size-1 through *hashmask for
 * use as a hash mask, and returns the table (return statement not
 * visible in this extract).
 *
 * NOTE(review): fragment -- return type, remaining parameter
 * declarations and braces were dropped (fused line numbers are
 * non-contiguous).
 */
480 hashinit(elements, type, hashmask)
482 struct malloc_type *type;
486 LIST_HEAD(generic, generic) *hashtbl;
490 panic("hashinit: bad elements");
/* Round up to the smallest power of two strictly greater than elements. */
491 for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
494 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
/* Initialize every bucket to an empty list. */
495 for (i = 0; i < hashsize; i++)
496 LIST_INIT(&hashtbl[i]);
/* Power-of-two size makes (hash & *hashmask) a valid bucket index. */
497 *hashmask = hashsize - 1;
/*
 * hashdestroy -- free a table allocated by hashinit(), panicking if
 * any bucket is still non-empty.
 *
 * NOTE(review): fragment -- return type, parameter declarations, the
 * emptiness test and the free() call were dropped from this extract.
 */
502 hashdestroy(vhashtbl, type, hashmask)
504 struct malloc_type *type;
507 LIST_HEAD(generic, generic) *hashtbl, *hp;
/* Walk every bucket; a non-empty one means callers leaked entries. */
510 for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
512 panic("hashdestroy: hash not empty");
/*
 * Candidate hash-table sizes for phashinit(): small primes spanning
 * 13..32749 (the leading 1 covers element counts below primes[1]).
 */
static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
/* Number of entries in the primes[] table. */
#define NPRIMES (sizeof(primes) / sizeof(primes[0]))
522 * General routine to allocate a prime number sized hash table.
/*
 * phashinit -- like hashinit(), but sizes the table using the largest
 * entry of primes[] not exceeding `elements'; stores the chosen size
 * through *nentries and returns the table (return statement not
 * visible in this extract).
 *
 * NOTE(review): fragment -- return type, remaining declarations, loop
 * body and braces were dropped (fused line numbers are
 * non-contiguous).
 */
525 phashinit(elements, type, nentries)
527 struct malloc_type *type;
531 LIST_HEAD(generic, generic) *hashtbl;
535 panic("phashinit: bad elements");
/* Scan primes[] upward while the candidate size is still too small. */
536 for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
540 hashsize = primes[i];
/* Back off to the previous prime once we overshoot. */
542 hashsize = primes[i - 1];
543 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
/* Initialize every bucket to an empty list. */
544 for (i = 0; i < hashsize; i++)
545 LIST_INIT(&hashtbl[i]);
546 *nentries = hashsize;
/*
 * NOTE(review): headless fragment -- the enclosing function's header
 * was dropped from this extract.  Under sched_lock it restores the
 * thread's user priority and charges an involuntary context switch;
 * this matches the body of uio_yield(), and a mi_switch() call is
 * presumably on one of the dropped lines -- confirm against the full
 * source.
 */
556 mtx_lock_spin(&sched_lock);
558 sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
559 td->td_proc->p_stats->p_ru.ru_nivcsw++;
561 mtx_unlock_spin(&sched_lock);
/*
 * copyinfrom -- copy len bytes from src to dst, where src lives in
 * the address space selected by seg: copyin() for user space, bcopy()
 * for system space; any other seg value panics.
 *
 * NOTE(review): fragment -- the return type, switch/case labels,
 * braces and return statement were dropped from this extract.
 */
566 copyinfrom(const void *src, void *dst, size_t len, int seg)
572 error = copyin(src, dst, len);
575 bcopy(src, dst, len);
578 panic("copyinfrom: bad seg %d\n", seg);
584 copyinstrfrom(const void *src, void *dst, size_t len, size_t *copied, int seg)
590 error = copyinstr(src, dst, len, copied);
593 error = copystr(src, dst, len, copied);
596 panic("copyinstrfrom: bad seg %d\n", seg);