2 * Copyright (c) 1989, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
34 * $Id: spec_vnops.c,v 1.79 1999/01/21 08:29:07 dillon Exp $
37 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
43 #include <sys/mount.h>
44 #include <sys/vnode.h>
46 #include <sys/fcntl.h>
47 #include <sys/disklabel.h>
48 #include <sys/vmmeter.h>
51 #include <vm/vm_prot.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/vm_pager.h>
55 #include <vm/vnode_pager.h>
56 #include <vm/vm_extern.h>
58 #include <miscfs/specfs/specdev.h>
/*
 * Forward declarations of the file-local vnode operation handlers for
 * special files.  Every handler declared here is referenced from the
 * spec_vnodeop_entries table in this file.  Each takes a pointer to the
 * matching vop_*_args structure, except spec_badop, which takes no
 * arguments.
 */
60 static int spec_advlock __P((struct vop_advlock_args *));
61 static int spec_badop __P((void));
62 static int spec_bmap __P((struct vop_bmap_args *));
63 static int spec_close __P((struct vop_close_args *));
64 static int spec_freeblks __P((struct vop_freeblks_args *));
65 static int spec_fsync __P((struct vop_fsync_args *));
66 static int spec_getattr __P((struct vop_getattr_args *));
67 static int spec_getpages __P((struct vop_getpages_args *));
68 static int spec_inactive __P((struct vop_inactive_args *));
69 static int spec_ioctl __P((struct vop_ioctl_args *));
70 static int spec_lookup __P((struct vop_lookup_args *));
71 static int spec_open __P((struct vop_open_args *));
72 static int spec_poll __P((struct vop_poll_args *));
73 static int spec_print __P((struct vop_print_args *));
74 static int spec_read __P((struct vop_read_args *));
75 static int spec_strategy __P((struct vop_strategy_args *));
76 static int spec_write __P((struct vop_write_args *));
/*
 * Array of SPECHSZ vnode pointers.
 * NOTE(review): presumably the heads of the hash chains of special-device
 * vnodes -- confirm against specdev.h.
 */
78 struct vnode *speclisth[SPECHSZ];
/*
 * Operations vector for special files.  Its address is recorded in
 * spec_vnodeop_opv_desc, and it is the vector passed to VOCALL when an
 * operation is dispatched through this file.
 */
79 vop_t **spec_vnodeop_p;
/*
 * Table pairing each vnode operation descriptor with the handler used for
 * special files:
 *   - the default entry goes to vop_defaultop;
 *   - access and setattr go to vop_ebadf;
 *   - lease and reclaim go to vop_null;
 *   - pathconf goes to vop_stdpathconf;
 *   - create, link, mkdir, mknod, readdir, readlink, reallocblks, remove,
 *     rename, rmdir and symlink go to spec_badop;
 *   - every other listed operation goes to its spec_* handler.
 */
80 static struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
81 { &vop_default_desc, (vop_t *) vop_defaultop },
82 { &vop_access_desc, (vop_t *) vop_ebadf },
83 { &vop_advlock_desc, (vop_t *) spec_advlock },
84 { &vop_bmap_desc, (vop_t *) spec_bmap },
85 { &vop_close_desc, (vop_t *) spec_close },
86 { &vop_create_desc, (vop_t *) spec_badop },
87 { &vop_freeblks_desc, (vop_t *) spec_freeblks },
88 { &vop_fsync_desc, (vop_t *) spec_fsync },
89 { &vop_getattr_desc, (vop_t *) spec_getattr },
90 { &vop_getpages_desc, (vop_t *) spec_getpages },
91 { &vop_inactive_desc, (vop_t *) spec_inactive },
92 { &vop_ioctl_desc, (vop_t *) spec_ioctl },
93 { &vop_lease_desc, (vop_t *) vop_null },
94 { &vop_link_desc, (vop_t *) spec_badop },
95 { &vop_lookup_desc, (vop_t *) spec_lookup },
96 { &vop_mkdir_desc, (vop_t *) spec_badop },
97 { &vop_mknod_desc, (vop_t *) spec_badop },
98 { &vop_open_desc, (vop_t *) spec_open },
99 { &vop_pathconf_desc, (vop_t *) vop_stdpathconf },
100 { &vop_poll_desc, (vop_t *) spec_poll },
101 { &vop_print_desc, (vop_t *) spec_print },
102 { &vop_read_desc, (vop_t *) spec_read },
103 { &vop_readdir_desc, (vop_t *) spec_badop },
104 { &vop_readlink_desc, (vop_t *) spec_badop },
105 { &vop_reallocblks_desc, (vop_t *) spec_badop },
106 { &vop_reclaim_desc, (vop_t *) vop_null },
107 { &vop_remove_desc, (vop_t *) spec_badop },
108 { &vop_rename_desc, (vop_t *) spec_badop },
109 { &vop_rmdir_desc, (vop_t *) spec_badop },
110 { &vop_setattr_desc, (vop_t *) vop_ebadf },
111 { &vop_strategy_desc, (vop_t *) spec_strategy },
112 { &vop_symlink_desc, (vop_t *) spec_badop },
113 { &vop_write_desc, (vop_t *) spec_write },
/*
 * Descriptor pairing the address of the operations vector pointer
 * (spec_vnodeop_p) with the spec_vnodeop_entries table.
 * NOTE(review): VNODEOP_SET presumably registers this descriptor with the
 * VFS layer so that the vector gets built -- confirm against the macro's
 * definition.
 */
116 static struct vnodeopv_desc spec_vnodeop_opv_desc =
117 { &spec_vnodeop_p, spec_vnodeop_entries };
119 VNODEOP_SET(spec_vnodeop_opv_desc);
124 struct vop_generic_args /* {
125 struct vnodeop_desc *a_desc;
126 <other random data follows, presumably>
129 return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap));
/*
 * Buffer completion callback; spec_getpages installs it as b_iodone on the
 * buffer header it builds for the read.
 */
132 static void spec_getpages_iodone __P((struct buf *bp));
135 * Trivial lookup routine that always fails.
139 struct vop_lookup_args /* {
141 struct vnode **a_vpp;
142 struct componentname *a_cnp;
151 * Open a special file.
156 struct vop_open_args /* {
159 struct ucred *a_cred;
163 struct proc *p = ap->a_p;
164 struct vnode *bvp, *vp = ap->a_vp;
165 dev_t bdev, dev = (dev_t)vp->v_rdev;
166 int maj = major(dev);
170 * Don't allow open if fs is mounted -nodev.
172 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
175 switch (vp->v_type) {
178 if ((u_int)maj >= nchrdev)
180 if ( (cdevsw[maj] == NULL) || (cdevsw[maj]->d_open == NULL))
182 if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) {
184 * When running in very secure mode, do not allow
185 * opens for writing of any disk character devices.
188 && cdevsw[maj]->d_bmaj != -1
189 && (cdevsw[maj]->d_flags & D_TYPEMASK) == D_DISK)
192 * When running in secure mode, do not allow opens
193 * for writing of /dev/mem, /dev/kmem, or character
194 * devices whose corresponding block devices are currently mounted.
197 if (securelevel >= 1) {
198 if ((bdev = chrtoblk(dev)) != NODEV &&
199 vfinddev(bdev, VBLK, &bvp) &&
200 bvp->v_usecount > 0 &&
201 (error = vfs_mountedon(bvp)))
207 if ((cdevsw[maj]->d_flags & D_TYPEMASK) == D_TTY)
208 vp->v_flag |= VISTTY;
209 VOP_UNLOCK(vp, 0, p);
210 error = (*cdevsw[maj]->d_open)(dev, ap->a_mode, S_IFCHR, p);
211 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
215 if ((u_int)maj >= nblkdev)
217 if ( (bdevsw[maj] == NULL) || (bdevsw[maj]->d_open == NULL))
220 * When running in very secure mode, do not allow
221 * opens for writing of any disk block devices.
223 if (securelevel >= 2 && ap->a_cred != FSCRED &&
224 (ap->a_mode & FWRITE) &&
225 (bdevsw[maj]->d_flags & D_TYPEMASK) == D_DISK)
229 * Do not allow opens of block devices that are currently mounted.
232 error = vfs_mountedon(vp);
235 return ((*bdevsw[maj]->d_open)(dev, ap->a_mode, S_IFBLK, p));
249 struct vop_read_args /* {
253 struct ucred *a_cred;
256 register struct vnode *vp = ap->a_vp;
257 register struct uio *uio = ap->a_uio;
258 struct proc *p = uio->uio_procp;
262 struct partinfo dpart;
269 if (uio->uio_rw != UIO_READ)
270 panic("spec_read mode");
271 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
272 panic("spec_read proc");
274 if (uio->uio_resid == 0)
277 switch (vp->v_type) {
280 VOP_UNLOCK(vp, 0, p);
281 error = (*cdevsw[major(vp->v_rdev)]->d_read)
282 (vp->v_rdev, uio, ap->a_ioflag);
283 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
287 if (uio->uio_offset < 0)
289 bsize = BLKDEV_IOSIZE;
291 if ((ioctl = bdevsw[major(dev)]->d_ioctl) != NULL &&
292 (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 &&
293 dpart.part->p_fstype == FS_BSDFFS &&
294 dpart.part->p_frag != 0 && dpart.part->p_fsize != 0)
295 bsize = dpart.part->p_frag * dpart.part->p_fsize;
296 bscale = btodb(bsize);
298 bn = btodb(uio->uio_offset) & ~(bscale - 1);
299 on = uio->uio_offset % bsize;
300 n = min((unsigned)(bsize - on), uio->uio_resid);
301 if (vp->v_lastr + bscale == bn) {
302 nextbn = bn + bscale;
303 error = breadn(vp, bn, (int)bsize, &nextbn,
304 (int *)&bsize, 1, NOCRED, &bp);
306 error = bread(vp, bn, (int)bsize, NOCRED, &bp);
308 n = min(n, bsize - bp->b_resid);
313 error = uiomove((char *)bp->b_data + on, n, uio);
315 } while (error == 0 && uio->uio_resid > 0 && n != 0);
319 panic("spec_read type");
330 struct vop_write_args /* {
334 struct ucred *a_cred;
337 register struct vnode *vp = ap->a_vp;
338 register struct uio *uio = ap->a_uio;
339 struct proc *p = uio->uio_procp;
343 struct partinfo dpart;
348 if (uio->uio_rw != UIO_WRITE)
349 panic("spec_write mode");
350 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
351 panic("spec_write proc");
354 switch (vp->v_type) {
357 VOP_UNLOCK(vp, 0, p);
358 error = (*cdevsw[major(vp->v_rdev)]->d_write)
359 (vp->v_rdev, uio, ap->a_ioflag);
360 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
364 if (uio->uio_resid == 0)
366 if (uio->uio_offset < 0)
368 bsize = BLKDEV_IOSIZE;
369 if ((*bdevsw[major(vp->v_rdev)]->d_ioctl)(vp->v_rdev, DIOCGPART,
370 (caddr_t)&dpart, FREAD, p) == 0) {
371 if (dpart.part->p_fstype == FS_BSDFFS &&
372 dpart.part->p_frag != 0 && dpart.part->p_fsize != 0)
373 bsize = dpart.part->p_frag *
376 blkmask = btodb(bsize) - 1;
378 bn = btodb(uio->uio_offset) & ~blkmask;
379 on = uio->uio_offset % bsize;
380 n = min((unsigned)(bsize - on), uio->uio_resid);
382 bp = getblk(vp, bn, bsize, 0, 0);
384 error = bread(vp, bn, bsize, NOCRED, &bp);
389 n = min(n, bsize - bp->b_resid);
390 error = uiomove((char *)bp->b_data + on, n, uio);
395 } while (error == 0 && uio->uio_resid > 0 && n != 0);
399 panic("spec_write type");
405 * Device ioctl operation.
410 struct vop_ioctl_args /* {
415 struct ucred *a_cred;
419 dev_t dev = ap->a_vp->v_rdev;
421 switch (ap->a_vp->v_type) {
424 return ((*cdevsw[major(dev)]->d_ioctl)(dev, ap->a_command,
425 ap->a_data, ap->a_fflag, ap->a_p));
427 return ((*bdevsw[major(dev)]->d_ioctl)(dev, ap->a_command,
428 ap->a_data, ap->a_fflag, ap->a_p));
438 struct vop_poll_args /* {
441 struct ucred *a_cred;
447 switch (ap->a_vp->v_type) {
450 dev = ap->a_vp->v_rdev;
451 return (*cdevsw[major(dev)]->d_poll)(dev, ap->a_events, ap->a_p);
453 return (vop_defaultop((struct vop_generic_args *)ap));
458 * Synch buffers associated with a block device
463 struct vop_fsync_args /* {
465 struct ucred *a_cred;
470 register struct vnode *vp = ap->a_vp;
471 register struct buf *bp;
475 if (vp->v_type == VCHR)
478 * Flush all dirty buffers associated with a block device.
482 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
483 nbp = TAILQ_NEXT(bp, b_vnbufs);
484 if ((bp->b_flags & B_BUSY))
486 if ((bp->b_flags & B_DELWRI) == 0)
487 panic("spec_fsync: not dirty");
488 if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
493 bp->b_flags |= B_BUSY;
499 if (ap->a_waitfor == MNT_WAIT) {
500 while (vp->v_numoutput) {
501 vp->v_flag |= VBWAIT;
502 (void) tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "spfsyn", 0);
505 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
506 vprint("spec_fsync: dirty", vp);
518 struct vop_inactive_args /* {
524 VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
529 * Just call the device strategy routine
533 struct vop_strategy_args /* {
540 if (((bp->b_flags & B_READ) == 0) &&
541 (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start)
542 (*bioops.io_start)(bp);
543 (*bdevsw[major(bp->b_dev)]->d_strategy)(bp);
549 struct vop_freeblks_args /* {
558 bsw = bdevsw[major(ap->a_vp->v_rdev)];
559 if ((bsw->d_flags & D_CANFREE) == 0)
561 bp = geteblk(ap->a_length);
562 bp->b_flags |= B_FREEBUF | B_BUSY;
563 bp->b_dev = ap->a_vp->v_rdev;
564 bp->b_blkno = ap->a_addr;
565 bp->b_offset = dbtob(ap->a_addr);
566 bp->b_bcount = ap->a_length;
567 (*bsw->d_strategy)(bp);
572 * This is a noop, simply returning what one has been given.
576 struct vop_bmap_args /* {
579 struct vnode **a_vpp;
586 if (ap->a_vpp != NULL)
587 *ap->a_vpp = ap->a_vp;
588 if (ap->a_bnp != NULL)
589 *ap->a_bnp = ap->a_bn;
590 if (ap->a_runp != NULL)
592 if (ap->a_runb != NULL)
598 * Device close routine
603 struct vop_close_args /* {
606 struct ucred *a_cred;
610 register struct vnode *vp = ap->a_vp;
611 dev_t dev = vp->v_rdev;
615 switch (vp->v_type) {
619 * Hack: a tty device that is a controlling terminal
620 * has a reference from the session structure.
621 * We cannot easily tell that a character device is
622 * a controlling terminal, unless it is the closing
623 * process' controlling terminal. In that case,
624 * if the reference count is 2 (this last descriptor
625 * plus the session), release the reference from the session.
627 if (vcount(vp) == 2 && ap->a_p &&
628 (vp->v_flag & VXLOCK) == 0 &&
629 vp == ap->a_p->p_session->s_ttyvp) {
631 ap->a_p->p_session->s_ttyvp = NULL;
634 * If the vnode is locked, then we are in the midst
635 * of forcibly closing the device, otherwise we only
636 * close on last reference.
638 if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0)
640 devclose = cdevsw[major(dev)]->d_close;
646 * On last close of a block device (that isn't mounted)
647 * we must invalidate any in core blocks, so that
648 * we can, for instance, change floppy disks.
650 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
651 error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
652 VOP_UNLOCK(vp, 0, ap->a_p);
657 * We do not want to really close the device if it
658 * is still in use unless we are trying to close it
659 * forcibly. Since every use (buffer, vnode, swap, cmap)
660 * holds a reference to the vnode, and because we mark
661 * any other vnodes that alias this device, when the
662 * sum of the reference counts on all the aliased
663 * vnodes descends to one, we are on last close.
665 if ((vcount(vp) > 1) && (vp->v_flag & VXLOCK) == 0)
668 devclose = bdevsw[major(dev)]->d_close;
673 panic("spec_close: not special");
676 return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p));
680 * Print out the contents of a special device vnode.
684 struct vop_print_args /* {
689 printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev),
690 minor(ap->a_vp->v_rdev));
695 * Special device advisory byte-level locks.
700 struct vop_advlock_args /* {
709 return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL);
713 * Special device bad operation
719 panic("spec_badop called");
724 spec_getpages_iodone(bp)
728 bp->b_flags |= B_DONE;
734 struct vop_getpages_args *ap;
738 int i, pcount, size, s;
743 int toff, nextoff, nread;
744 struct vnode *vp = ap->a_vp;
749 pcount = round_page(ap->a_count) / PAGE_SIZE;
752 * Calculate the offset of the transfer.
754 offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;
756 /* XXX sanity check before we go into details. */
757 /* XXX limits should be defined elsewhere. */
/* Bit width assumed for a device block number in the offset range check. */
758 #define DADDR_T_BIT 32
/*
 * Upper bound for the byte-offset range check: 2^(DADDR_T_BIT + DEV_BSHIFT) - 1.
 * NOTE(review): presumably the largest offset whose device block number
 * still fits in DADDR_T_BIT bits -- confirm against btodb()/daddr_t.
 */
759 #define OFFSET_MAX ((1LL << (DADDR_T_BIT + DEV_BSHIFT)) - 1)
760 if (offset < 0 || offset > OFFSET_MAX) {
761 /* XXX still no %q in kernel. */
762 printf("spec_getpages: preposterous offset 0x%x%08x\n",
763 (u_int)((u_quad_t)offset >> 32),
764 (u_int)(offset & 0xffffffff));
765 return (VM_PAGER_ERROR);
768 blkno = btodb(offset);
771 * Round up physical size for real devices, use the
772 * fundamental blocksize of the fs if possible.
774 if (vp && vp->v_mount) {
775 if (vp->v_type != VBLK) {
776 vprint("Non VBLK", vp);
778 blksiz = vp->v_mount->mnt_stat.f_bsize;
779 if (blksiz < DEV_BSIZE) {
785 size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
788 kva = (vm_offset_t)bp->b_data;
791 * Map the pages to be read into the kva.
793 pmap_qenter(kva, ap->a_m, pcount);
795 /* Build a minimal buffer header. */
796 bp->b_flags = B_BUSY | B_READ | B_CALL;
797 bp->b_iodone = spec_getpages_iodone;
799 /* B_PHYS is not set, but it is nice to fill this in. */
800 bp->b_proc = curproc;
801 bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred;
802 if (bp->b_rcred != NOCRED)
804 if (bp->b_wcred != NOCRED)
807 bp->b_lblkno = blkno;
808 pbgetvp(ap->a_vp, bp);
810 bp->b_bufsize = size;
814 cnt.v_vnodepgsin += pcount;
817 VOP_STRATEGY(bp->b_vp, bp);
821 /* We definitely need to be at splbio here. */
822 while ((bp->b_flags & B_DONE) == 0)
823 tsleep(bp, PVM, "spread", 0);
827 if ((bp->b_flags & B_ERROR) != 0) {
834 nread = size - bp->b_resid;
836 if (nread < ap->a_count) {
837 bzero((caddr_t)kva + nread,
838 ap->a_count - nread);
840 pmap_qremove(kva, pcount);
844 for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
845 nextoff = toff + PAGE_SIZE;
848 m->flags &= ~PG_ZERO;
850 if (nextoff <= nread) {
851 m->valid = VM_PAGE_BITS_ALL;
853 } else if (toff < nread) {
854 int nvalid = ((nread + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1);
855 vm_page_set_validclean(m, 0, nvalid);
861 if (i != ap->a_reqpage) {
863 * Just in case someone was asking for this page we
864 * now tell them that it is ok to use.
866 if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
868 if (m->flags & PG_WANTED) {
871 vm_page_deactivate(m);
880 } else if (m->valid) {
885 m = ap->a_m[ap->a_reqpage];
888 "spec_getpages: I/O read failure: (error code=%d)\n",
891 " size: %d, resid: %ld, a_count: %d, valid: 0x%x\n",
892 size, bp->b_resid, ap->a_count, m->valid);
894 " nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
895 nread, ap->a_reqpage, (u_long)m->pindex, pcount);
898 * Free the buffer header back to the swap buffer pool.
901 return VM_PAGER_ERROR;
904 * Free the buffer header back to the swap buffer pool.
913 struct vop_getattr_args /* {
916 struct ucred *a_cred;
920 register struct vnode *vp = ap->a_vp;
921 register struct vattr *vap = ap->a_vap;
922 struct partinfo dpart;
924 bzero(vap, sizeof (*vap));
926 if (vp->v_type == VBLK)
927 vap->va_blocksize = BLKDEV_IOSIZE;
928 else if (vp->v_type == VCHR)
929 vap->va_blocksize = MAXBSIZE;
931 if ((*bdevsw[major(vp->v_rdev)]->d_ioctl)(vp->v_rdev, DIOCGPART,
932 (caddr_t)&dpart, FREAD, ap->a_p) == 0) {
933 vap->va_bytes = dbtob(dpart.disklab->d_partitions
934 [minor(vp->v_rdev)].p_size);
935 vap->va_size = vap->va_bytes;