2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
67 #include <sys/condvar.h>
69 #include <sys/ktrace.h>
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76 static int pollscan(struct thread *, struct pollfd *, u_int);
77 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
78 static int dofileread(struct thread *, int, struct file *, struct uio *,
80 static int dofilewrite(struct thread *, int, struct file *, struct uio *,
82 static void doselwakeup(struct selinfo *, int);
84 #ifndef _SYS_SYSPROTO_H_
94 struct read_args *uap;
100 if (uap->nbyte > INT_MAX)
102 aiov.iov_base = uap->buf;
103 aiov.iov_len = uap->nbyte;
104 auio.uio_iov = &aiov;
106 auio.uio_resid = uap->nbyte;
107 auio.uio_segflg = UIO_USERSPACE;
108 error = kern_readv(td, uap->fd, &auio);
113 * Positioned read system call.
115 #ifndef _SYS_SYSPROTO_H_
127 struct pread_args *uap;
133 if (uap->nbyte > INT_MAX)
135 aiov.iov_base = uap->buf;
136 aiov.iov_len = uap->nbyte;
137 auio.uio_iov = &aiov;
139 auio.uio_resid = uap->nbyte;
140 auio.uio_segflg = UIO_USERSPACE;
141 error = kern_preadv(td, uap->fd, &auio, uap->offset);
146 freebsd6_pread(td, uap)
148 struct freebsd6_pread_args *uap;
150 struct pread_args oargs;
153 oargs.buf = uap->buf;
154 oargs.nbyte = uap->nbyte;
155 oargs.offset = uap->offset;
156 return (pread(td, &oargs));
160 * Scatter read system call.
162 #ifndef _SYS_SYSPROTO_H_
170 readv(struct thread *td, struct readv_args *uap)
175 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
178 error = kern_readv(td, uap->fd, auio);
184 kern_readv(struct thread *td, int fd, struct uio *auio)
189 error = fget_read(td, fd, &fp);
192 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
198 * Scatter positioned read system call.
200 #ifndef _SYS_SYSPROTO_H_
209 preadv(struct thread *td, struct preadv_args *uap)
214 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
217 error = kern_preadv(td, uap->fd, auio, uap->offset);
223 kern_preadv(td, fd, auio, offset)
232 error = fget_read(td, fd, &fp);
235 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
237 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
240 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
246 * Common code for readv and preadv that reads data in
247 * from a file using the passed in uio, offset, and flags.
250 dofileread(td, fd, fp, auio, offset, flags)
261 struct uio *ktruio = NULL;
264 /* Finish zero length reads right here */
265 if (auio->uio_resid == 0) {
266 td->td_retval[0] = 0;
269 auio->uio_rw = UIO_READ;
270 auio->uio_offset = offset;
273 if (KTRPOINT(td, KTR_GENIO))
274 ktruio = cloneuio(auio);
276 cnt = auio->uio_resid;
277 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
278 if (auio->uio_resid != cnt && (error == ERESTART ||
279 error == EINTR || error == EWOULDBLOCK))
282 cnt -= auio->uio_resid;
284 if (ktruio != NULL) {
285 ktruio->uio_resid = cnt;
286 ktrgenio(fd, UIO_READ, ktruio, error);
289 td->td_retval[0] = cnt;
293 #ifndef _SYS_SYSPROTO_H_
303 struct write_args *uap;
309 if (uap->nbyte > INT_MAX)
311 aiov.iov_base = (void *)(uintptr_t)uap->buf;
312 aiov.iov_len = uap->nbyte;
313 auio.uio_iov = &aiov;
315 auio.uio_resid = uap->nbyte;
316 auio.uio_segflg = UIO_USERSPACE;
317 error = kern_writev(td, uap->fd, &auio);
322 * Positioned write system call.
324 #ifndef _SYS_SYSPROTO_H_
336 struct pwrite_args *uap;
342 if (uap->nbyte > INT_MAX)
344 aiov.iov_base = (void *)(uintptr_t)uap->buf;
345 aiov.iov_len = uap->nbyte;
346 auio.uio_iov = &aiov;
348 auio.uio_resid = uap->nbyte;
349 auio.uio_segflg = UIO_USERSPACE;
350 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
355 freebsd6_pwrite(td, uap)
357 struct freebsd6_pwrite_args *uap;
359 struct pwrite_args oargs;
362 oargs.buf = uap->buf;
363 oargs.nbyte = uap->nbyte;
364 oargs.offset = uap->offset;
365 return (pwrite(td, &oargs));
369 * Gather write system call.
371 #ifndef _SYS_SYSPROTO_H_
379 writev(struct thread *td, struct writev_args *uap)
384 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
387 error = kern_writev(td, uap->fd, auio);
393 kern_writev(struct thread *td, int fd, struct uio *auio)
398 error = fget_write(td, fd, &fp);
401 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
407 * Gather positioned write system call.
409 #ifndef _SYS_SYSPROTO_H_
410 struct pwritev_args {
418 pwritev(struct thread *td, struct pwritev_args *uap)
423 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
426 error = kern_pwritev(td, uap->fd, auio, uap->offset);
432 kern_pwritev(td, fd, auio, offset)
441 error = fget_write(td, fd, &fp);
444 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
446 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
449 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
455 * Common code for writev and pwritev that writes data to
456 * a file using the passed in uio, offset, and flags.
459 dofilewrite(td, fd, fp, auio, offset, flags)
470 struct uio *ktruio = NULL;
473 auio->uio_rw = UIO_WRITE;
475 auio->uio_offset = offset;
477 if (KTRPOINT(td, KTR_GENIO))
478 ktruio = cloneuio(auio);
480 cnt = auio->uio_resid;
481 if (fp->f_type == DTYPE_VNODE)
483 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
484 if (auio->uio_resid != cnt && (error == ERESTART ||
485 error == EINTR || error == EWOULDBLOCK))
487 /* Socket layer is responsible for issuing SIGPIPE. */
488 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
489 PROC_LOCK(td->td_proc);
490 psignal(td->td_proc, SIGPIPE);
491 PROC_UNLOCK(td->td_proc);
494 cnt -= auio->uio_resid;
496 if (ktruio != NULL) {
497 ktruio->uio_resid = cnt;
498 ktrgenio(fd, UIO_WRITE, ktruio, error);
501 td->td_retval[0] = cnt;
505 #ifndef _SYS_SYSPROTO_H_
514 ioctl(struct thread *td, struct ioctl_args *uap)
521 if (uap->com > 0xffffffff) {
523 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
524 td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
525 uap->com &= 0xffffffff;
530 * Interpret high order word to find amount of data to be
531 * copied to/from the user's address space.
533 size = IOCPARM_LEN(com);
534 if ((size > IOCPARM_MAX) ||
535 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
536 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
537 ((com & IOC_OUT) && size == 0) ||
539 ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
541 ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
545 if (!(com & IOC_VOID))
546 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
548 /* Integer argument. */
549 arg = (intptr_t)uap->data;
554 data = (void *)&uap->data;
556 error = copyin(uap->data, data, (u_int)size);
559 free(data, M_IOCTLOPS);
562 } else if (com & IOC_OUT) {
564 * Zero the buffer so the user always
565 * gets back something deterministic.
570 error = kern_ioctl(td, uap->fd, com, data);
572 if (error == 0 && (com & IOC_OUT))
573 error = copyout(data, uap->data, (u_int)size);
576 free(data, M_IOCTLOPS);
581 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
584 struct filedesc *fdp;
588 if ((error = fget(td, fd, &fp)) != 0)
590 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
594 fdp = td->td_proc->p_fd;
598 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
599 FILEDESC_XUNLOCK(fdp);
603 fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
604 FILEDESC_XUNLOCK(fdp);
608 if ((tmp = *(int *)data))
609 fp->f_flag |= FNONBLOCK;
611 fp->f_flag &= ~FNONBLOCK;
617 if ((tmp = *(int *)data))
618 fp->f_flag |= FASYNC;
620 fp->f_flag &= ~FASYNC;
626 error = fo_ioctl(fp, com, data, td->td_ucred, td);
633 * sellock and selwait are initialized in selectinit() via SYSINIT.
637 u_int nselcoll; /* Select collisions since boot */
638 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
640 #ifndef _SYS_SYSPROTO_H_
643 fd_set *in, *ou, *ex;
649 register struct thread *td;
650 register struct select_args *uap;
652 struct timeval tv, *tvp;
655 if (uap->tv != NULL) {
656 error = copyin(uap->tv, &tv, sizeof(tv));
663 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
667 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
668 fd_set *fd_ex, struct timeval *tvp)
670 struct filedesc *fdp;
672 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
673 * infds with the new FD_SETSIZE of 1024, and more than enough for
674 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
677 fd_mask s_selbits[howmany(2048, NFDBITS)];
678 fd_mask *ibits[3], *obits[3], *selbits, *sbp;
679 struct timeval atv, rtv, ttv;
681 u_int ncoll, nbufbytes, ncpbytes, nfdbits;
685 fdp = td->td_proc->p_fd;
688 if (nd > td->td_proc->p_fd->fd_nfiles)
689 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
690 FILEDESC_SUNLOCK(fdp);
693 * Allocate just enough bits for the non-null fd_sets. Use the
694 * preallocated auto buffer if possible.
696 nfdbits = roundup(nd, NFDBITS);
697 ncpbytes = nfdbits / NBBY;
700 nbufbytes += 2 * ncpbytes;
702 nbufbytes += 2 * ncpbytes;
704 nbufbytes += 2 * ncpbytes;
705 if (nbufbytes <= sizeof s_selbits)
706 selbits = &s_selbits[0];
708 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
711 * Assign pointers into the bit buffers and fetch the input bits.
712 * Put the output buffers together so that they can be bzeroed
716 #define getbits(name, x) \
721 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
723 sbp += ncpbytes / sizeof *sbp; \
724 error = copyin(name, ibits[x], ncpbytes); \
726 goto done_nosellock; \
734 bzero(selbits, nbufbytes / 2);
738 if (itimerfix(&atv)) {
742 getmicrouptime(&rtv);
743 timevaladd(&atv, &rtv);
749 TAILQ_INIT(&td->td_selq);
754 td->td_flags |= TDF_SELECT;
756 mtx_unlock(&sellock);
758 error = selscan(td, ibits, obits, nd);
760 if (error || td->td_retval[0])
762 if (atv.tv_sec || atv.tv_usec) {
763 getmicrouptime(&rtv);
764 if (timevalcmp(&rtv, &atv, >=))
767 timevalsub(&ttv, &rtv);
768 timo = ttv.tv_sec > 24 * 60 * 60 ?
769 24 * 60 * 60 * hz : tvtohz(&ttv);
773 * An event of interest may occur while we do not hold
774 * sellock, so check TDF_SELECT and the number of
775 * collisions and rescan the file descriptors if
779 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
786 error = cv_timedwait_sig(&selwait, &sellock, timo);
788 error = cv_wait_sig(&selwait, &sellock);
794 clear_selinfo_list(td);
796 td->td_flags &= ~TDF_SELECT;
798 mtx_unlock(&sellock);
801 /* select is not restarted after signals... */
802 if (error == ERESTART)
804 if (error == EWOULDBLOCK)
806 #define putbits(name, x) \
807 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
817 if (selbits != &s_selbits[0])
818 free(selbits, M_SELECT);
824 selscan(td, ibits, obits, nfd)
826 fd_mask **ibits, **obits;
833 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
834 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
835 struct filedesc *fdp = td->td_proc->p_fd;
838 for (msk = 0; msk < 3; msk++) {
839 if (ibits[msk] == NULL)
841 for (i = 0; i < nfd; i += NFDBITS) {
842 bits = ibits[msk][i/NFDBITS];
843 /* ffs(int mask) not portable, fd_mask is long */
844 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
847 if ((fp = fget_locked(fdp, fd)) == NULL) {
848 FILEDESC_SUNLOCK(fdp);
851 if (fo_poll(fp, flag[msk], td->td_ucred,
853 obits[msk][(fd)/NFDBITS] |=
854 ((fd_mask)1 << ((fd) % NFDBITS));
860 FILEDESC_SUNLOCK(fdp);
861 td->td_retval[0] = n;
865 #ifndef _SYS_SYSPROTO_H_
875 struct poll_args *uap;
878 struct pollfd smallbits[32];
879 struct timeval atv, rtv, ttv;
887 * This is kinda bogus. We have fd limits, but that is not
888 * really related to the size of the pollfd array. Make sure
889 * we let the process use at least FD_SETSIZE entries and at
890 * least enough for the current limits. We want to be reasonably
891 * safe, but not overly restrictive.
893 PROC_LOCK(td->td_proc);
894 if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
895 (nfds > FD_SETSIZE)) {
896 PROC_UNLOCK(td->td_proc);
900 PROC_UNLOCK(td->td_proc);
901 ni = nfds * sizeof(struct pollfd);
902 if (ni > sizeof(smallbits))
903 bits = malloc(ni, M_TEMP, M_WAITOK);
906 error = copyin(uap->fds, bits, ni);
909 if (uap->timeout != INFTIM) {
910 atv.tv_sec = uap->timeout / 1000;
911 atv.tv_usec = (uap->timeout % 1000) * 1000;
912 if (itimerfix(&atv)) {
916 getmicrouptime(&rtv);
917 timevaladd(&atv, &rtv);
923 TAILQ_INIT(&td->td_selq);
928 td->td_flags |= TDF_SELECT;
930 mtx_unlock(&sellock);
932 error = pollscan(td, bits, nfds);
934 if (error || td->td_retval[0])
936 if (atv.tv_sec || atv.tv_usec) {
937 getmicrouptime(&rtv);
938 if (timevalcmp(&rtv, &atv, >=))
941 timevalsub(&ttv, &rtv);
942 timo = ttv.tv_sec > 24 * 60 * 60 ?
943 24 * 60 * 60 * hz : tvtohz(&ttv);
946 * An event of interest may occur while we do not hold
947 * sellock, so check TDF_SELECT and the number of collisions
948 * and rescan the file descriptors if necessary.
951 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
958 error = cv_timedwait_sig(&selwait, &sellock, timo);
960 error = cv_wait_sig(&selwait, &sellock);
966 clear_selinfo_list(td);
968 td->td_flags &= ~TDF_SELECT;
970 mtx_unlock(&sellock);
973 /* poll is not restarted after signals... */
974 if (error == ERESTART)
976 if (error == EWOULDBLOCK)
979 error = copyout(bits, uap->fds, ni);
984 if (ni > sizeof(smallbits))
991 pollscan(td, fds, nfd)
996 register struct filedesc *fdp = td->td_proc->p_fd;
1001 FILEDESC_SLOCK(fdp);
1002 for (i = 0; i < nfd; i++, fds++) {
1003 if (fds->fd >= fdp->fd_nfiles) {
1004 fds->revents = POLLNVAL;
1006 } else if (fds->fd < 0) {
1009 fp = fdp->fd_ofiles[fds->fd];
1011 fds->revents = POLLNVAL;
1015 * Note: backend also returns POLLHUP and
1016 * POLLERR if appropriate.
1018 fds->revents = fo_poll(fp, fds->events,
1020 if (fds->revents != 0)
1025 FILEDESC_SUNLOCK(fdp);
1026 td->td_retval[0] = n;
1031 * OpenBSD poll system call.
1033 * XXX this isn't quite a true representation; OpenBSD uses select ops.
1035 #ifndef _SYS_SYSPROTO_H_
1036 struct openbsd_poll_args {
1043 openbsd_poll(td, uap)
1044 register struct thread *td;
1045 register struct openbsd_poll_args *uap;
1047 return (poll(td, (struct poll_args *)uap));
1051 * Remove the references to the thread from all of the objects we were
1054 * This code assumes that the underlying owner of the selinfo structure will
1055 * hold sellock before it changes it, and that it will unlink itself from our
1056 * list if it goes away.
1059 clear_selinfo_list(td)
1064 mtx_assert(&sellock, MA_OWNED);
1065 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1066 si->si_thread = NULL;
1067 TAILQ_INIT(&td->td_selq);
1071 * Record a select request.
1074 selrecord(selector, sip)
1075 struct thread *selector;
1076 struct selinfo *sip;
1081 * If the selinfo's thread pointer is NULL then take ownership of it.
1083 * If the thread pointer is not NULL and it points to another
1084 * thread, then we have a collision.
1086 * If the thread pointer is not NULL and points back to us then leave
1087 * it alone as we've already pointed it at us and added it to
1090 if (sip->si_thread == NULL) {
1091 sip->si_thread = selector;
1092 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1093 } else if (sip->si_thread != selector) {
1094 sip->si_flags |= SI_COLL;
1097 mtx_unlock(&sellock);
1100 /* Wake up a selecting thread. */
1103 struct selinfo *sip;
1105 doselwakeup(sip, -1);
1108 /* Wake up a selecting thread, and set its priority. */
1110 selwakeuppri(sip, pri)
1111 struct selinfo *sip;
1114 doselwakeup(sip, pri);
1118 * Do a wakeup when a selectable event occurs.
1121 doselwakeup(sip, pri)
1122 struct selinfo *sip;
1128 td = sip->si_thread;
1129 if ((sip->si_flags & SI_COLL) != 0) {
1131 sip->si_flags &= ~SI_COLL;
1132 cv_broadcastpri(&selwait, pri);
1135 mtx_unlock(&sellock);
1138 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1139 sip->si_thread = NULL;
1141 td->td_flags &= ~TDF_SELECT;
1143 sleepq_remove(td, &selwait);
1144 mtx_unlock(&sellock);
1147 static void selectinit(void *);
1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1155 cv_init(&selwait, "select");
1156 mtx_init(&sellock, "sellck", NULL, MTX_DEF);