2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
51 #include <sys/signalvar.h>
52 #include <sys/socketvar.h>
54 #include <sys/kernel.h>
55 #include <sys/limits.h>
56 #include <sys/malloc.h>
58 #include <sys/resourcevar.h>
59 #include <sys/selinfo.h>
60 #include <sys/sleepqueue.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #include <sys/vnode.h>
67 #include <sys/condvar.h>
69 #include <sys/ktrace.h>
72 #include <vm/vm_page.h>
74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
76 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
78 static int pollscan(struct thread *, struct pollfd *, u_int);
79 static int selscan(struct thread *, fd_mask **, fd_mask **, int);
80 static int dofileread(struct thread *, int, struct file *, struct uio *,
82 static int dofilewrite(struct thread *, int, struct file *, struct uio *,
84 static void doselwakeup(struct selinfo *, int);
89 #ifndef _SYS_SYSPROTO_H_
/*
 * Fragment of the handler taking struct read_args (lines are missing from
 * this listing).  The visible statements describe the single caller buffer
 * (uap->buf, uap->nbyte) with one iovec, mark the uio as UIO_USERSPACE and
 * pass it to kern_readv() together with uap->fd.
 * NOTE(review): the statement guarded by the INT_MAX check is not visible
 * here -- presumably it rejects oversized requests; confirm.
 */
102 struct read_args *uap;
108 if (uap->nbyte > INT_MAX)
110 aiov.iov_base = uap->buf;
111 aiov.iov_len = uap->nbyte;
112 auio.uio_iov = &aiov;
114 auio.uio_resid = uap->nbyte;
115 auio.uio_segflg = UIO_USERSPACE;
116 error = kern_readv(td, uap->fd, &auio);
121 * Positioned read system call
123 #ifndef _SYS_SYSPROTO_H_
/*
 * Fragment of the handler taking struct pread_args (lines are missing from
 * this listing).  Same single-iovec setup as the plain read path, but the
 * request is issued through kern_preadv() with the explicit uap->offset.
 * NOTE(review): the statement guarded by the INT_MAX check is not visible
 * here -- presumably it rejects oversized requests; confirm.
 */
138 struct pread_args *uap;
144 if (uap->nbyte > INT_MAX)
146 aiov.iov_base = uap->buf;
147 aiov.iov_len = uap->nbyte;
148 auio.uio_iov = &aiov;
150 auio.uio_resid = uap->nbyte;
151 auio.uio_segflg = UIO_USERSPACE;
152 error = kern_preadv(td, uap->fd, &auio, uap->offset);
157 * Scatter read system call.
159 #ifndef _SYS_SYSPROTO_H_
/*
 * readv(): builds a uio from the caller-supplied iovec array
 * (uap->iovp, uap->iovcnt) with copyinuio() and issues the read on uap->fd
 * through kern_readv().
 * NOTE(review): the error check between the two calls and the release of
 * auio are not visible in this listing.
 */
170 readv(struct thread *td, struct readv_args *uap)
175 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
178 error = kern_readv(td, uap->fd, auio);
/*
 * kern_readv(): looks up fd with fget_read() and performs the read through
 * dofileread(), passing (off_t)-1 as the offset and 0 as the flags.
 * NOTE(review): error handling after fget_read() and the release of fp are
 * not visible in this listing.
 */
184 kern_readv(struct thread *td, int fd, struct uio *auio)
189 error = fget_read(td, fd, &fp);
192 error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
198 * Scatter positioned read system call.
200 #ifndef _SYS_SYSPROTO_H_
/*
 * preadv(): builds a uio from the caller-supplied iovec array with
 * copyinuio() and issues a positioned read at uap->offset through
 * kern_preadv().
 * NOTE(review): the error check between the two calls and the release of
 * auio are not visible in this listing.
 */
212 preadv(struct thread *td, struct preadv_args *uap)
217 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
220 error = kern_preadv(td, uap->fd, auio, uap->offset);
/*
 * kern_preadv(): looks up fd with fget_read(), then tests two conditions
 * before reading: the file ops must carry DFLAG_SEEKABLE, and a negative
 * offset is only tolerated when the vnode type is VCHR.  The read itself
 * goes through dofileread() with FOF_OFFSET.
 * NOTE(review): the statements executed when either test fails are not
 * visible here -- presumably error codes are assigned; confirm.
 */
226 kern_preadv(td, fd, auio, offset)
235 error = fget_read(td, fd, &fp);
238 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
240 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
243 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
249 * Common code for readv and preadv that reads data in
250 * from a file using the passed in uio, offset, and flags.
/*
 * dofileread(): common read path.  Marks the uio as UIO_READ at the given
 * offset, calls fo_read() on fp and stores the number of bytes transferred
 * in td->td_retval[0].  When KTR_GENIO tracing is active for td, a clone of
 * the uio is handed to ktrgenio() after the read.
 * NOTE(review): several lines (declarations, closing braces, returns) are
 * missing from this listing.
 */
253 dofileread(td, fd, fp, auio, offset, flags)
264 struct uio *ktruio = NULL;
267 /* Finish zero length reads right here */
268 if (auio->uio_resid == 0) {
269 td->td_retval[0] = 0;
272 auio->uio_rw = UIO_READ;
273 auio->uio_offset = offset;
/* Take a copy of the uio for ktrace before the read is issued. */
276 if (KTRPOINT(td, KTR_GENIO))
277 ktruio = cloneuio(auio);
/* Remember the requested size; bytes moved = cnt - remaining resid. */
279 cnt = auio->uio_resid;
280 if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
/*
 * Some data moved before ERESTART/EINTR/EWOULDBLOCK was returned.
 * NOTE(review): the action taken is not visible here -- presumably the
 * error is cleared so the partial count is reported; confirm.
 */
281 if (auio->uio_resid != cnt && (error == ERESTART ||
282 error == EINTR || error == EWOULDBLOCK))
285 cnt -= auio->uio_resid;
/* Trim the traced copy to the bytes actually transferred. */
287 if (ktruio != NULL) {
288 ktruio->uio_resid = cnt;
289 ktrgenio(fd, UIO_READ, ktruio, error);
292 td->td_retval[0] = cnt;
299 #ifndef _SYS_SYSPROTO_H_
/*
 * Fragment of the handler taking struct write_args (lines are missing from
 * this listing).  The visible statements describe the single caller buffer
 * (uap->buf, uap->nbyte) with one iovec, mark the uio as UIO_USERSPACE and
 * pass it to kern_writev() together with uap->fd.
 * NOTE(review): the statement guarded by the INT_MAX check is not visible
 * here -- presumably it rejects oversized requests; confirm.
 */
312 struct write_args *uap;
318 if (uap->nbyte > INT_MAX)
320 aiov.iov_base = (void *)(uintptr_t)uap->buf;
321 aiov.iov_len = uap->nbyte;
322 auio.uio_iov = &aiov;
324 auio.uio_resid = uap->nbyte;
325 auio.uio_segflg = UIO_USERSPACE;
326 error = kern_writev(td, uap->fd, &auio);
331 * Positioned write system call
333 #ifndef _SYS_SYSPROTO_H_
/*
 * Fragment of the handler taking struct pwrite_args (lines are missing from
 * this listing).  Same single-iovec setup as the plain write path, but the
 * request is issued through kern_pwritev() with the explicit uap->offset.
 * NOTE(review): the statement guarded by the INT_MAX check is not visible
 * here -- presumably it rejects oversized requests; confirm.
 */
348 struct pwrite_args *uap;
354 if (uap->nbyte > INT_MAX)
356 aiov.iov_base = (void *)(uintptr_t)uap->buf;
357 aiov.iov_len = uap->nbyte;
358 auio.uio_iov = &aiov;
360 auio.uio_resid = uap->nbyte;
361 auio.uio_segflg = UIO_USERSPACE;
362 error = kern_pwritev(td, uap->fd, &auio, uap->offset);
367 * Gather write system call
369 #ifndef _SYS_SYSPROTO_H_
/*
 * writev(): builds a uio from the caller-supplied iovec array
 * (uap->iovp, uap->iovcnt) with copyinuio() and issues the write on uap->fd
 * through kern_writev().
 * NOTE(review): the error check between the two calls and the release of
 * auio are not visible in this listing.
 */
380 writev(struct thread *td, struct writev_args *uap)
385 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
388 error = kern_writev(td, uap->fd, auio);
/*
 * kern_writev(): looks up fd with fget_write() and performs the write
 * through dofilewrite(), passing (off_t)-1 as the offset and 0 as the flags.
 * NOTE(review): error handling after fget_write() and the release of fp are
 * not visible in this listing.
 */
394 kern_writev(struct thread *td, int fd, struct uio *auio)
399 error = fget_write(td, fd, &fp);
402 error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
408 * Gather positioned write system call
410 #ifndef _SYS_SYSPROTO_H_
411 struct pwritev_args {
/*
 * pwritev(): builds a uio from the caller-supplied iovec array with
 * copyinuio() and issues a positioned write at uap->offset through
 * kern_pwritev().
 * NOTE(review): the error check between the two calls and the release of
 * auio are not visible in this listing.
 */
422 pwritev(struct thread *td, struct pwritev_args *uap)
427 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
430 error = kern_pwritev(td, uap->fd, auio, uap->offset);
/*
 * kern_pwritev(): looks up fd with fget_write(), then tests two conditions
 * before writing: the file ops must carry DFLAG_SEEKABLE, and a negative
 * offset is only tolerated when the vnode type is VCHR.  The write itself
 * goes through dofilewrite() with FOF_OFFSET.
 * NOTE(review): the statements executed when either test fails are not
 * visible here -- presumably error codes are assigned; confirm.
 */
436 kern_pwritev(td, fd, auio, offset)
445 error = fget_write(td, fd, &fp);
448 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
450 else if (offset < 0 && fp->f_vnode->v_type != VCHR)
453 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
459 * Common code for writev and pwritev that writes data to
460 * a file using the passed in uio, offset, and flags.
/*
 * dofilewrite(): common write path.  Marks the uio as UIO_WRITE at the given
 * offset, calls fo_write() on fp, raises SIGPIPE on the calling process when
 * the write fails with EPIPE, and stores the number of bytes transferred in
 * td->td_retval[0].  When KTR_GENIO tracing is active for td, a clone of the
 * uio is handed to ktrgenio() after the write.
 * NOTE(review): several lines (declarations, closing braces, returns, and
 * the statement guarded by the DTYPE_VNODE test) are missing from this
 * listing.
 */
463 dofilewrite(td, fd, fp, auio, offset, flags)
474 struct uio *ktruio = NULL;
477 auio->uio_rw = UIO_WRITE;
479 auio->uio_offset = offset;
/* Take a copy of the uio for ktrace before the write is issued. */
481 if (KTRPOINT(td, KTR_GENIO))
482 ktruio = cloneuio(auio);
/* Remember the requested size; bytes moved = cnt - remaining resid. */
484 cnt = auio->uio_resid;
485 if (fp->f_type == DTYPE_VNODE)
487 if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
/*
 * Some data moved before ERESTART/EINTR/EWOULDBLOCK was returned.
 * NOTE(review): the action taken is not visible here -- presumably the
 * error is cleared so the partial count is reported; confirm.
 */
488 if (auio->uio_resid != cnt && (error == ERESTART ||
489 error == EINTR || error == EWOULDBLOCK))
/*
 * NOTE(review): the comment below says sockets raise SIGPIPE themselves,
 * but no file-type check is visible on the EPIPE test -- confirm that
 * socket writes do not end up signalled twice.
 */
491 /* Socket layer is responsible for issuing SIGPIPE. */
492 if (error == EPIPE) {
493 PROC_LOCK(td->td_proc);
494 psignal(td->td_proc, SIGPIPE);
495 PROC_UNLOCK(td->td_proc);
498 cnt -= auio->uio_resid;
/* Trim the traced copy to the bytes actually transferred. */
500 if (ktruio != NULL) {
501 ktruio->uio_resid = cnt;
502 ktrgenio(fd, UIO_WRITE, ktruio, error);
505 td->td_retval[0] = cnt;
512 #ifndef _SYS_SYSPROTO_H_
/*
 * ioctl(): user-facing ioctl entry point.  Visible steps:
 *  - if uap->com has bits set above the low 32, log a sign-extension
 *    warning and mask the command down to 32 bits;
 *  - take the argument size from IOCPARM_LEN(com) and validate it against
 *    IOCPARM_MAX and the IOC_VOID / IOC_IN / IOC_OUT direction bits (the
 *    zero-size rule differs when a COMPAT_* option is defined);
 *  - for commands without IOC_VOID, allocate a buffer from M_IOCTLOPS;
 *  - copy the argument in with copyin() (buffer freed on the failure path);
 *  - call kern_ioctl();
 *  - on success with IOC_OUT set, copy the buffer back with copyout();
 *  - free the buffer.
 * NOTE(review): many lines are missing from this listing, including the
 * conditions that choose between the allocated buffer, the integer
 * argument and the &uap->data paths, and the error values returned.
 */
524 ioctl(struct thread *td, struct ioctl_args *uap)
531 if (uap->com > 0xffffffff) {
533 "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
534 td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
535 uap->com &= 0xffffffff;
540 * Interpret high order word to find amount of data to be
541 * copied to/from the user's address space.
543 size = IOCPARM_LEN(com);
544 if ((size > IOCPARM_MAX) ||
545 ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
546 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
547 ((com & IOC_OUT) && size == 0) ||
549 ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
551 ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
555 if (!(com & IOC_VOID))
556 data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
558 /* Integer argument. */
559 arg = (intptr_t)uap->data;
564 data = (void *)&uap->data;
566 error = copyin(uap->data, data, (u_int)size);
569 free(data, M_IOCTLOPS);
572 } else if (com & IOC_OUT) {
574 * Zero the buffer so the user always
575 * gets back something deterministic.
580 error = kern_ioctl(td, uap->fd, com, data);
582 if (error == 0 && (com & IOC_OUT))
583 error = copyout(data, uap->data, (u_int)size);
586 free(data, M_IOCTLOPS);
/*
 * kern_ioctl(): looks up fd with fget(), checks that the file is open for
 * reading or writing, handles a few commands directly on the descriptor or
 * file flags, and calls fo_ioctl() on the file.
 * NOTE(review): the case labels selecting each of the flag updates below,
 * the error paths and the release of fp are not visible in this listing.
 */
591 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
594 struct filedesc *fdp;
598 if ((error = fget(td, fd, &fp)) != 0)
600 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
604 fdp = td->td_proc->p_fd;
/* Clear UF_EXCLOSE on this descriptor under the filedesc lock. */
607 FILEDESC_LOCK_FAST(fdp);
608 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
609 FILEDESC_UNLOCK_FAST(fdp);
/* Set UF_EXCLOSE on this descriptor under the filedesc lock. */
612 FILEDESC_LOCK_FAST(fdp);
613 fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
614 FILEDESC_UNLOCK_FAST(fdp);
/* The int pointed to by data turns FNONBLOCK on (non-zero) or off (zero). */
618 if ((tmp = *(int *)data))
619 fp->f_flag |= FNONBLOCK;
621 fp->f_flag &= ~FNONBLOCK;
/* The int pointed to by data turns FASYNC on (non-zero) or off (zero). */
627 if ((tmp = *(int *)data))
628 fp->f_flag |= FASYNC;
630 fp->f_flag &= ~FASYNC;
636 error = fo_ioctl(fp, com, data, td->td_ucred, td);
643 * sellock and selwait are initialized in selectinit() via SYSINIT.
647 u_int nselcoll; /* Select collisions since boot */
648 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
651 * Select system call.
653 #ifndef _SYS_SYSPROTO_H_
656 fd_set *in, *ou, *ex;
/*
 * Fragment of the handler taking struct select_args (lines are missing from
 * this listing).  When uap->tv is non-NULL the timeout is copied in from
 * the caller into tv; the request is then forwarded to kern_select() with
 * tvp as the timeout pointer.
 * NOTE(review): the assignments to tvp and the copyin() error check are not
 * visible here.
 */
665 register struct thread *td;
666 register struct select_args *uap;
668 struct timeval tv, *tvp;
671 if (uap->tv != NULL) {
672 error = copyin(uap->tv, &tv, sizeof(tv));
679 return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
/*
 * kern_select(): implementation behind select.  Visible steps:
 *  - clamp nd to the size of the descriptor table;
 *  - size a bit buffer for the fd sets, using the on-stack s_selbits array
 *    when it is big enough and an M_SELECT allocation otherwise;
 *  - copy the input sets in (getbits) and zero the first half of the buffer;
 *  - turn the timeout into a deadline based on getmicrouptime();
 *  - set TDF_SELECT, scan the descriptors with selscan(), and sleep on the
 *    selwait condition variable until something is ready, the deadline
 *    passes, or the sleep returns an error;
 *  - clear the selinfo records and TDF_SELECT, copy the output sets back
 *    (putbits), and free the buffer if it was heap-allocated.
 * NOTE(review): many lines are missing from this listing (labels, gotos,
 * several conditions, and parts of the getbits/putbits macros).
 */
683 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
684 fd_set *fd_ex, struct timeval *tvp)
686 struct filedesc *fdp;
688 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
689 * infds with the new FD_SETSIZE of 1024, and more than enough for
690 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
693 fd_mask s_selbits[howmany(2048, NFDBITS)];
694 fd_mask *ibits[3], *obits[3], *selbits, *sbp;
695 struct timeval atv, rtv, ttv;
697 u_int ncoll, nbufbytes, ncpbytes, nfdbits;
701 fdp = td->td_proc->p_fd;
/* Clamp nd to fd_nfiles while holding the filedesc lock. */
703 FILEDESC_LOCK_FAST(fdp);
705 if (nd > td->td_proc->p_fd->fd_nfiles)
706 nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
707 FILEDESC_UNLOCK_FAST(fdp);
710 * Allocate just enough bits for the non-null fd_sets. Use the
711 * preallocated auto buffer if possible.
/* ncpbytes: bytes needed for nd bits, rounded up to a multiple of NFDBITS. */
713 nfdbits = roundup(nd, NFDBITS);
714 ncpbytes = nfdbits / NBBY;
/*
 * Each of the three additions reserves room for two copies of one set.
 * NOTE(review): the conditions guarding them are not visible here.
 */
717 nbufbytes += 2 * ncpbytes;
719 nbufbytes += 2 * ncpbytes;
721 nbufbytes += 2 * ncpbytes;
722 if (nbufbytes <= sizeof s_selbits)
723 selbits = &s_selbits[0];
725 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
728 * Assign pointers into the bit buffers and fetch the input bits.
729 * Put the output buffers together so that they can be bzeroed
733 #define getbits(name, x) \
738 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
740 sbp += ncpbytes / sizeof *sbp; \
741 error = copyin(name, ibits[x], ncpbytes); \
743 goto done_nosellock; \
751 bzero(selbits, nbufbytes / 2);
755 if (itimerfix(&atv)) {
/*
 * Add the current uptime to atv (assumed to hold the requested interval
 * at this point -- TODO confirm) so that it becomes an absolute deadline.
 */
759 getmicrouptime(&rtv);
760 timevaladd(&atv, &rtv);
766 TAILQ_INIT(&td->td_selq);
770 mtx_lock_spin(&sched_lock);
771 td->td_flags |= TDF_SELECT;
772 mtx_unlock_spin(&sched_lock);
773 mtx_unlock(&sellock);
775 error = selscan(td, ibits, obits, nd);
777 if (error || td->td_retval[0])
779 if (atv.tv_sec || atv.tv_usec) {
780 getmicrouptime(&rtv);
781 if (timevalcmp(&rtv, &atv, >=))
784 timevalsub(&ttv, &rtv);
/* Cap a single sleep at 24 hours worth of ticks. */
785 timo = ttv.tv_sec > 24 * 60 * 60 ?
786 24 * 60 * 60 * hz : tvtohz(&ttv);
790 * An event of interest may occur while we do not hold
791 * sellock, so check TDF_SELECT and the number of
792 * collisions and rescan the file descriptors if
795 mtx_lock_spin(&sched_lock);
796 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
797 mtx_unlock_spin(&sched_lock);
800 mtx_unlock_spin(&sched_lock);
/*
 * Sleep on selwait with or without a timeout.
 * NOTE(review): the condition choosing between the two calls is not
 * visible here.
 */
803 error = cv_timedwait_sig(&selwait, &sellock, timo);
805 error = cv_wait_sig(&selwait, &sellock);
/* Drop the selinfo records for td and clear TDF_SELECT. */
811 clear_selinfo_list(td);
812 mtx_lock_spin(&sched_lock);
813 td->td_flags &= ~TDF_SELECT;
814 mtx_unlock_spin(&sched_lock);
815 mtx_unlock(&sellock);
818 /* select is not restarted after signals... */
819 if (error == ERESTART)
821 if (error == EWOULDBLOCK)
823 #define putbits(name, x) \
824 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
834 if (selbits != &s_selbits[0])
835 free(selbits, M_SELECT);
/*
 * selscan(): walks the three input bit sets (a NULL set is tested for and
 * handled by a statement not visible here).  For every set bit below nfd it
 * looks up the file with fget_locked() and polls it with the event mask for
 * that set (POLLRDNORM, POLLWRNORM, POLLRDBAND); when the poll reports
 * something, the matching bit is set in obits.  The value n is stored in
 * td->td_retval[0] at the end.
 * NOTE(review): the lines that lock fdp, test the low bit of bits, update n
 * and return on a bad descriptor are not visible in this listing.
 */
841 selscan(td, ibits, obits, nfd)
843 fd_mask **ibits, **obits;
850 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
851 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
852 struct filedesc *fdp = td->td_proc->p_fd;
855 for (msk = 0; msk < 3; msk++) {
856 if (ibits[msk] == NULL)
/* Process the set one fd_mask word (NFDBITS descriptors) at a time. */
858 for (i = 0; i < nfd; i += NFDBITS) {
859 bits = ibits[msk][i/NFDBITS];
860 /* ffs(int mask) not portable, fd_mask is long */
/* Shift through the word; stop early once no set bits remain. */
861 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
864 if ((fp = fget_locked(fdp, fd)) == NULL) {
865 FILEDESC_UNLOCK(fdp);
868 if (fo_poll(fp, flag[msk], td->td_ucred,
870 obits[msk][(fd)/NFDBITS] |=
871 ((fd_mask)1 << ((fd) % NFDBITS));
877 FILEDESC_UNLOCK(fdp);
878 td->td_retval[0] = n;
885 #ifndef _SYS_SYSPROTO_H_
/*
 * Fragment of the handler taking struct poll_args (lines are missing from
 * this listing).  Visible steps:
 *  - refuse nfds only when it exceeds both the RLIMIT_NOFILE soft limit and
 *    FD_SETSIZE (the refusal itself is not visible);
 *  - copy the pollfd array in, allocating from M_TEMP when it does not fit
 *    in the on-stack smallbits array;
 *  - turn uap->timeout into a deadline based on getmicrouptime();
 *  - set TDF_SELECT, scan with pollscan(), and sleep on the selwait
 *    condition variable until something is ready, the deadline passes, or
 *    the sleep returns an error;
 *  - clear the selinfo records and TDF_SELECT and copy the array back out.
 * NOTE(review): labels, gotos, several conditions and the final free of the
 * allocated array are not visible here.
 */
898 struct poll_args *uap;
901 struct pollfd smallbits[32];
902 struct timeval atv, rtv, ttv;
910 * This is kinda bogus. We have fd limits, but that is not
911 * really related to the size of the pollfd array. Make sure
912 * we let the process use at least FD_SETSIZE entries and at
913 * least enough for the current limits. We want to be reasonably
914 * safe, but not overly restrictive.
916 PROC_LOCK(td->td_proc);
917 if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
918 (nfds > FD_SETSIZE)) {
919 PROC_UNLOCK(td->td_proc);
923 PROC_UNLOCK(td->td_proc);
/* ni: size of the caller's pollfd array in bytes. */
924 ni = nfds * sizeof(struct pollfd);
925 if (ni > sizeof(smallbits))
926 bits = malloc(ni, M_TEMP, M_WAITOK);
929 error = copyin(uap->fds, bits, ni);
932 if (uap->timeout != INFTIM) {
/* uap->timeout is split as milliseconds into seconds + microseconds. */
933 atv.tv_sec = uap->timeout / 1000;
934 atv.tv_usec = (uap->timeout % 1000) * 1000;
935 if (itimerfix(&atv)) {
/* Add the current uptime so atv becomes an absolute deadline. */
939 getmicrouptime(&rtv);
940 timevaladd(&atv, &rtv);
946 TAILQ_INIT(&td->td_selq);
950 mtx_lock_spin(&sched_lock);
951 td->td_flags |= TDF_SELECT;
952 mtx_unlock_spin(&sched_lock);
953 mtx_unlock(&sellock);
955 error = pollscan(td, bits, nfds);
957 if (error || td->td_retval[0])
959 if (atv.tv_sec || atv.tv_usec) {
960 getmicrouptime(&rtv);
961 if (timevalcmp(&rtv, &atv, >=))
964 timevalsub(&ttv, &rtv);
/* Cap a single sleep at 24 hours worth of ticks. */
965 timo = ttv.tv_sec > 24 * 60 * 60 ?
966 24 * 60 * 60 * hz : tvtohz(&ttv);
969 * An event of interest may occur while we do not hold
970 * sellock, so check TDF_SELECT and the number of collisions
971 * and rescan the file descriptors if necessary.
973 mtx_lock_spin(&sched_lock);
974 if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
975 mtx_unlock_spin(&sched_lock);
978 mtx_unlock_spin(&sched_lock);
/*
 * Sleep on selwait with or without a timeout.
 * NOTE(review): the condition choosing between the two calls is not
 * visible here.
 */
981 error = cv_timedwait_sig(&selwait, &sellock, timo);
983 error = cv_wait_sig(&selwait, &sellock);
/* Drop the selinfo records for td and clear TDF_SELECT. */
989 clear_selinfo_list(td);
990 mtx_lock_spin(&sched_lock);
991 td->td_flags &= ~TDF_SELECT;
992 mtx_unlock_spin(&sched_lock);
993 mtx_unlock(&sellock);
996 /* poll is not restarted after signals... */
997 if (error == ERESTART)
999 if (error == EWOULDBLOCK)
/* Copy the pollfd array back to the caller. */
1002 error = copyout(bits, uap->fds, ni);
1007 if (ni > sizeof(smallbits))
/*
 * pollscan(): walks the nfd entries of the fds array and fills in revents
 * for each one.  The value n is stored in td->td_retval[0] at the end.
 * NOTE(review): the lines that lock fdp, test fp for NULL, handle negative
 * descriptors and update n are not visible in this listing.
 */
1014 pollscan(td, fds, nfd)
1019 register struct filedesc *fdp = td->td_proc->p_fd;
1025 for (i = 0; i < nfd; i++, fds++) {
/* Descriptors at or beyond fd_nfiles are reported as POLLNVAL. */
1026 if (fds->fd >= fdp->fd_nfiles) {
1027 fds->revents = POLLNVAL;
1029 } else if (fds->fd < 0) {
1032 fp = fdp->fd_ofiles[fds->fd];
1034 fds->revents = POLLNVAL;
1038 * Note: backend also returns POLLHUP and
1039 * POLLERR if appropriate.
/* Ask the file for the requested events. */
1041 fds->revents = fo_poll(fp, fds->events,
1043 if (fds->revents != 0)
1048 FILEDESC_UNLOCK(fdp);
1049 td->td_retval[0] = n;
1054 * OpenBSD poll system call.
1055 * XXX this isn't quite a true representation; OpenBSD uses select ops.
1057 #ifndef _SYS_SYSPROTO_H_
1058 struct openbsd_poll_args {
/*
 * openbsd_poll(): forwards directly to poll(), reinterpreting uap as a
 * struct poll_args pointer.
 * NOTE(review): this relies on struct openbsd_poll_args and struct
 * poll_args sharing the same layout -- neither definition is visible here;
 * confirm.
 */
1068 openbsd_poll(td, uap)
1069 register struct thread *td;
1070 register struct openbsd_poll_args *uap;
1072 return (poll(td, (struct poll_args *)uap));
1076 * Remove the references to the thread from all of the objects
1079 * This code assumes that the underlying owner of the selinfo
1080 * structure will hold sellock before it changes it, and that
1081 * it will unlink itself from our list if it goes away.
/*
 * clear_selinfo_list(): asserts that sellock is owned, clears si_thread on
 * every selinfo linked on td->td_selq, and then resets the queue to empty.
 */
1084 clear_selinfo_list(td)
1089 mtx_assert(&sellock, MA_OWNED);
1090 TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1091 si->si_thread = NULL;
1092 TAILQ_INIT(&td->td_selq);
1096 * Record a select request.
1099 selrecord(selector, sip)
1100 struct thread *selector;
1101 struct selinfo *sip;
1106 * If the selinfo's thread pointer is NULL then take ownership of it.
1108 * If the thread pointer is not NULL and it points to another
1109 * thread, then we have a collision.
1111 * If the thread pointer is not NULL and points back to us then leave
1112 * it alone as we've already pointed it at us and added it to
/*
 * Body of selrecord: an unowned selinfo is claimed for the selecting thread
 * and appended to its td_selq; a selinfo owned by a different thread is
 * flagged with SI_COLL; one already owned by this thread is left untouched.
 * NOTE(review): the matching mtx_lock(&sellock) is not visible in this
 * listing.
 */
1115 if (sip->si_thread == NULL) {
1116 sip->si_thread = selector;
1117 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1118 } else if (sip->si_thread != selector) {
1119 sip->si_flags |= SI_COLL;
1122 mtx_unlock(&sellock);
1125 /* Wake up a selecting thread. */
/*
 * Wake-up fragment (the function header is not visible in this listing):
 * forwards sip to doselwakeup() with -1 as the pri argument.
 */
1128 struct selinfo *sip;
1130 doselwakeup(sip, -1);
1133 /* Wake up a selecting thread, and set its priority. */
/*
 * selwakeuppri(): forwards sip and the caller-supplied pri to
 * doselwakeup().
 */
1135 selwakeuppri(sip, pri)
1136 struct selinfo *sip;
1139 doselwakeup(sip, pri);
1143 * Do a wakeup when a selectable event occurs.
/*
 * doselwakeup(): common wake-up path for a selinfo.
 *  - If SI_COLL is set on sip, the flag is cleared and every waiter on
 *    selwait is woken with cv_broadcastpri() at priority pri.
 *  - The thread recorded in sip->si_thread is unlinked from sip, has
 *    TDF_SELECT cleared, and is removed from the selwait sleep queue.
 * NOTE(review): the mtx_lock(&sellock), the test that leads to the first
 * mtx_unlock() and the remaining lines of the collision branch are not
 * visible in this listing.
 */
1146 doselwakeup(sip, pri)
1147 struct selinfo *sip;
1153 td = sip->si_thread;
1154 if ((sip->si_flags & SI_COLL) != 0) {
1156 sip->si_flags &= ~SI_COLL;
1157 cv_broadcastpri(&selwait, pri);
1160 mtx_unlock(&sellock);
/* Detach sip from its recorded thread and wake that thread. */
1163 TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1164 sip->si_thread = NULL;
1165 mtx_lock_spin(&sched_lock);
1166 td->td_flags &= ~TDF_SELECT;
1167 mtx_unlock_spin(&sched_lock);
1168 sleepq_remove(td, &selwait);
1169 mtx_unlock(&sellock);
/*
 * selectinit is registered through SYSINIT at SI_SUB_LOCK / SI_ORDER_FIRST.
 * The visible part of its body initializes the selwait condition variable
 * and the sellock mutex (MTX_DEF).
 * NOTE(review): the function header line is not visible in this listing.
 */
1172 static void selectinit(void *);
1173 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1180 cv_init(&selwait, "select");
1181 mtx_init(&sellock, "sellck", NULL, MTX_DEF);