 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]

/* Portions Copyright 2007 Shivakumar GN */

 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/dirent.h>
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/vnode.h>
 * Generic pseudo-filesystem routines.
 *
 * There are significant similarities between the implementation of certain file
 * system entry points across different filesystems. While one could attempt to
 * "choke up on the bat" and incorporate common functionality into a VOP
 * preamble or postamble, such an approach is limited in the benefit it can
 * provide. In this file we instead define a toolkit of routines which can be
 * called from a filesystem (with in-kernel pseudo-filesystems being the focus
 * of the exercise) in a more component-like fashion.
 * There are three basic classes of routines:
 *
 * 1) Low-level support routines
 *
 *    These routines are designed to play a support role for existing
 *    pseudo-filesystems (such as procfs). They simplify common tasks,
 *    without forcing the filesystem to hand over management to GFS. The
 *    routines covered are:
 *
 *        gfs_readdir_init()
 *        gfs_readdir_emit()
 *        gfs_readdir_emit_int()
 *        gfs_readdir_pred()
 *        gfs_readdir_fini()
 *        gfs_lookup_dot()
 * 2) Complete GFS management
 *
 *    These routines take a more active role in management of the
 *    pseudo-filesystem. They handle the relationship between vnode private
 *    data and VFS data, as well as the relationship between vnodes in the
 *    directory hierarchy.
 *
 *    In order to use these interfaces, the first member of every private
 *    v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control
 *    of those vnodes to the GFS routines.
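 *
 *    For example, a hypothetical consumer (the my_* names below are
 *    illustrative and not part of this file) would lay out its private data
 *    with the gfs_dir_t as the first member:
 *
 *        typedef struct my_node {
 *                gfs_dir_t       my_dir;
 *                uint64_t        my_object;
 *        } my_node_t;
 *
 *    GFS relies only on the leading gfs_dir_t (or gfs_file_t); the remaining
 *    members are private to the consumer.
 *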
 * 3) Single File pseudo-filesystems
 *
 *    This routine creates a rooted file to be overlaid on top of another
 *    file in the physical filespace.
 *
 *    Note that the parent is NULL (actually the vfs), but there is nothing
 *    technically keeping such a file from utilizing the "Complete GFS
 *    management" set of routines.
 *
 *        gfs_root_create_file()
 * Low-level directory routines
 *
 * These routines provide some simple abstractions for reading directories.
 * They are designed to be used by existing pseudo-filesystems (namely procfs)
 * that already have a complicated management infrastructure.
 * gfs_readdir_init: initiate a generic readdir
 *   st       - a pointer to an uninitialized gfs_readdir_state_t structure
 *   name_max - the directory's maximum file name length
 *   ureclen  - the exported file-space record length (1 for non-legacy FSs)
 *   uiop     - the uiop passed to readdir
 *   parent   - the parent directory's inode
 *   self     - this directory's inode
 *
 * Returns 0 or a non-zero errno.
 * Typical VOP_READDIR usage of gfs_readdir_*:
 *
 *        if ((error = gfs_readdir_init(...)) != 0)
 *                return (error);
 *        eof = 0;
 *        while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *                if (!consumer_entry_at(voffset))
 *                        voffset = consumer_next_entry(voffset);
 *                if (consumer_eof(voffset)) {
 *                        eof = 1;
 *                        break;
 *                }
 *                if ((error = gfs_readdir_emit(..., voffset,
 *                    consumer_ino(voffset), consumer_name(voffset))) != 0)
 *                        break;
 *        }
 *        return (gfs_readdir_fini(..., error, eofp, eof));
 *
 * As you can see, a zero result from gfs_readdir_pred() or
 * gfs_readdir_emit() indicates that processing should continue,
 * whereas a non-zero result indicates that the loop should terminate.
 * Most consumers need do nothing more than let gfs_readdir_fini()
 * determine what the cause of failure was and return the appropriate
 * value.
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    uio_t *uiop, ino64_t parent, ino64_t self)
        if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
            (uiop->uio_loffset % ureclen) != 0)
                return (EINVAL);

        st->grd_ureclen = ureclen;
        st->grd_oresid = uiop->uio_resid;
        st->grd_namlen = name_max;
        st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
        st->grd_parent = parent;
 * gfs_readdir_emit_int: internal routine to emit directory entry
 *
 *   st   - the current readdir state, which must have d_ino and d_name set
 *   uiop - caller-supplied uio pointer
 *   next - the offset of the next entry
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
    int *ncookies, u_long **cookies)
        namlen = strlen(st->grd_dirent->d_name);
        reclen = DIRENT64_RECLEN(namlen);

        if (reclen > uiop->uio_resid) {
                /*
                 * Error if no entries were returned yet
                 */
                if (uiop->uio_resid == st->grd_oresid)
                        return (EINVAL);
                return (-1);
        }

        /* XXX: This can change in the future. */
        st->grd_dirent->d_type = DT_DIR;
        st->grd_dirent->d_reclen = (ushort_t)reclen;
        st->grd_dirent->d_namlen = namlen;

        if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
                return (EFAULT);

        uiop->uio_loffset = next;
        if (*cookies != NULL) {
                KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
 * gfs_readdir_emit: emit a directory entry
 *   voff - the virtual offset (obtained from gfs_readdir_pred)
 *   ino  - the entry's inode
 *   name - the entry's name
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate. A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, const char *name, int *ncookies, u_long **cookies)
        offset_t off = (voff + 2) * st->grd_ureclen;

        st->grd_dirent->d_ino = ino;
        (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);

        /*
         * Inter-entry offsets are invalid, so we assume a record size of
         * grd_ureclen and explicitly set the offset appropriately.
         */
        return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies,
            cookies));
 * gfs_readdir_pred: readdir loop predicate
 *   voffp - a pointer in which the next virtual offset should be stored
 *
 * Returns 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate. A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp,
    int *ncookies, u_long **cookies)
        if (uiop->uio_resid <= 0)
                return (-1);

        off = uiop->uio_loffset / st->grd_ureclen;

        if (off == 0) {
                if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
                    ".", ncookies, cookies)) == 0)
        } else if (off == 1) {
                if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
                    "..", ncookies, cookies)) == 0)
 * gfs_readdir_fini: generic readdir cleanup
 *   error - if positive, an error to return
 *   eofp  - the eofp passed to readdir
 *   eof   - the eof value
 *
 * Returns 0 on success, a non-zero errno on failure. This result
 * should be returned from readdir.
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
        kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
 * Performs a basic check for "." and ".." directory entries.
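 *
 * A hypothetical VOP_LOOKUP for a pseudo-filesystem that manages its own
 * directory entries might use it as a preamble (sketch only; my_lookup_impl
 * is illustrative):
 *
 *        if (gfs_lookup_dot(vpp, dvp, pvp, nm) == 0)
 *                return (0);
 *        return (my_lookup_impl(dvp, nm, vpp));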
gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
        if (*nm == '\0' || strcmp(nm, ".") == 0) {
        } else if (strcmp(nm, "..") == 0) {
                ASSERT(dvp->v_flag & VROOT);
                vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
 * gfs_file_create(): create a new GFS file
 *
 *   size - size of private data structure (v_data)
 *   pvp  - parent vnode (GFS directory)
 *   ops  - vnode operations vector
 *
 * In order to use this interface, the parent vnode must have been created by
 * gfs_dir_create(), and the private data stored in v_data must have a
 * 'gfs_file_t' as its first field.
 *
 * Given these constraints, this routine will automatically:
 *
 *        - Allocate v_data for the vnode
 *        - Initialize necessary fields in the vnode
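 *
 * A minimal, hypothetical call (my_file_t and my_vnodeops are illustrative,
 * not defined here) might look like:
 *
 *        vp = gfs_file_create(sizeof (my_file_t), pvp, vfsp, &my_vnodeops);
 *        ((my_file_t *)vp->v_data)->my_field = 0;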
gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
        /*
         * Allocate vnode and internal data structure
         */
        fp = kmem_zalloc(size, KM_SLEEP);
        error = getnewvnode("zfs", vfsp, ops, &vp);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
        vp->v_data = (caddr_t)fp;

        /*
         * Set up various pointers
         */
        fp->gfs_parent = pvp;
        fp->gfs_type = GFS_FILE;

        vp->v_vflag |= VV_FORCEINSMQ;
        error = insmntque(vp, vfsp);
        vp->v_vflag &= ~VV_FORCEINSMQ;
        KASSERT(error == 0, ("insmntque() failed: error %d", error));

        /*
         * Initialize vnode and hold parent.
         */
 * gfs_dir_create: creates a new directory in the parent
 *
 *   size       - size of private data structure (v_data)
 *   pvp        - parent vnode (GFS directory)
 *   ops        - vnode operations vector
 *   entries    - NULL-terminated list of static entries (if any)
 *   maxlen     - maximum length of a directory entry
 *   readdir_cb - readdir callback (see gfs_dir_readdir)
 *   inode_cb   - inode callback (see gfs_dir_readdir)
 *   lookup_cb  - lookup callback (see gfs_dir_lookup)
 *
 * In order to use this function, the first member of the private vnode
 * structure (v_data) must be a gfs_dir_t. For each directory, there are
 * static entries, defined when the structure is initialized, and dynamic
 * entries, retrieved through callbacks.
 *
 * If a directory has static entries, then it must supply an inode callback,
 * which will compute the inode number based on the parent and the index.
 * For a directory with dynamic entries, the caller must supply a readdir
 * callback and a lookup callback. If a static lookup fails, we fall back to
 * the supplied lookup callback, if any.
 *
 * This function also performs the same initialization as gfs_file_create().
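 *
 * A hypothetical directory with two static entries (all my_* names are
 * illustrative) might be created as:
 *
 *        static gfs_dirent_t my_entries[] = {
 *                { "status", my_status_ctor, GFS_CACHE_VNODE },
 *                { "ctl", my_ctl_ctor, 0 },
 *                { NULL }
 *        };
 *
 *        vp = gfs_dir_create(sizeof (my_dir_t), pvp, vfsp, &my_dir_vnodeops,
 *            my_entries, my_inode_cb, MAXNAMELEN, NULL, NULL);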
gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
        vp = gfs_file_create(struct_size, pvp, vfsp, ops);

        dp->gfsd_file.gfs_type = GFS_DIR;
        dp->gfsd_maxlen = maxlen;

        if (entries != NULL) {
                for (de = entries; de->gfse_name != NULL; de++)
                        dp->gfsd_nstatic++;

                dp->gfsd_static = kmem_alloc(
                    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
                bcopy(entries, dp->gfsd_static,
                    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
        }

        dp->gfsd_readdir = readdir_cb;
        dp->gfsd_lookup = lookup_cb;
        dp->gfsd_inode = inode_cb;

        mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
 * gfs_root_create(): create a root vnode for a GFS filesystem
 *
 * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The
 * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
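 *
 * A hypothetical mount path (my_* names are illustrative) might create its
 * root directory as:
 *
 *        vp = gfs_root_create(sizeof (my_root_t), vfsp, &my_dir_vnodeops,
 *            MY_ROOT_INO, my_entries, my_inode_cb, MAXNAMELEN, NULL, NULL);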
gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
        vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb,
            maxlen, readdir_cb, lookup_cb);
        /* Manually set the inode */
        ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
 * gfs_file_inactive()
 *
 * Called from the VOP_INACTIVE() routine. If necessary, this routine will
 * remove the given vnode from the parent directory and clean up any
 * references.
 *
 * If the vnode was not removed (due to a race with vget), then NULL is
 * returned. Otherwise, a pointer to the private data is returned.
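 *
 * A consumer's inactive routine might (hypothetically) use it as:
 *
 *        void *data;
 *
 *        if ((data = gfs_file_inactive(vp)) != NULL)
 *                kmem_free(data, sizeof (my_file_t));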
gfs_file_inactive(vnode_t *vp)
        gfs_dirent_t *ge = NULL;
        gfs_file_t *fp = vp->v_data;
        gfs_dir_t *dp = NULL;

        if (fp->gfs_parent == NULL)

        dp = fp->gfs_parent->v_data;

        /*
         * First, see if this vnode is cached in the parent.
         */

        /*
         * Find it in the set of static entries.
         */
        for (i = 0; i < dp->gfsd_nstatic; i++) {
                ge = &dp->gfsd_static[i];

                if (ge->gfse_vnode == vp)

        /*
         * If 'ge' is NULL, then it is a dynamic entry.
         */

        ASSERT(vp->v_count < 2);

        /*
         * Really remove this vnode
         */

        /*
         * If this was a statically cached entry, simply set the
         * cached vnode to NULL.
         */
        ge->gfse_vnode = NULL;

        if (vp->v_count == 1) {

        /*
         * Free vnode and release parent
         */
        if (fp->gfs_parent) {
                VI_LOCK(fp->gfs_parent);
                fp->gfs_parent->v_usecount--;
                VI_UNLOCK(fp->gfs_parent);

        ASSERT(vp->v_vfsp != NULL);
        VFS_RELE(vp->v_vfsp);
 * Same as above, but for directories.
gfs_dir_inactive(vnode_t *vp)
        ASSERT(vp->v_type == VDIR);

        if ((dp = gfs_file_inactive(vp)) != NULL) {
                mutex_destroy(&dp->gfsd_lock);
                if (dp->gfsd_nstatic)
                        kmem_free(dp->gfsd_static,
                            dp->gfsd_nstatic * sizeof (gfs_dirent_t));
 * Looks up the given name in the directory and returns the corresponding
 * vnode, if found.
 *
 * First, we search statically defined entries, if any. If a match is found,
 * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
 * existing vnode. Otherwise, we call the static entry's callback routine,
 * caching the result if necessary.
 *
 * If no static entry is found, we invoke the lookup callback, if any. The
 * arguments to this callback are:
 *
 *        int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *            ino64_t *inop);
 *
 *        pvp  - parent vnode
 *        nm   - name of the entry to look up
 *        vpp  - pointer to resulting vnode
 *        inop - pointer in which to return the entry's inode number
 *
 * Returns 0 on success, non-zero on error.
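 *
 * A hypothetical lookup callback (all my_* names are illustrative) could be:
 *
 *        static int
 *        my_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *            ino64_t *inop)
 *        {
 *                if (!my_name_exists(pvp, nm))
 *                        return (ENOENT);
 *                *vpp = my_construct_vnode(pvp, nm);
 *                *inop = my_name_to_ino(nm);
 *                return (0);
 *        }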
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
        gfs_dir_t *dp = dvp->v_data;

        ASSERT(dvp->v_type == VDIR);

        if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
                return (0);

        /*
         * Search static entries.
         */
        for (i = 0; i < dp->gfsd_nstatic; i++) {
                ge = &dp->gfsd_static[i];

                if (strcmp(ge->gfse_name, nm) == 0) {
                        if (ge->gfse_vnode) {
                                ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);

                        /*
                         * We drop the directory lock, as the constructor will
                         * need to do KM_SLEEP allocations. If we return from
                         * the constructor only to find that a parallel
                         * operation has completed, and GFS_CACHE_VNODE is set
                         * for this entry, we discard the result in favor of
                         * the cached vnode.
                         */
                        vp = ge->gfse_ctor(dvp);

                        ((gfs_file_t *)vp->v_data)->gfs_index = i;

                        /* Set the inode according to the callback. */
                        ((gfs_file_t *)vp->v_data)->gfs_ino =
                            dp->gfsd_inode(dvp, i);

                        if (ge->gfse_flags & GFS_CACHE_VNODE) {
                                if (ge->gfse_vnode == NULL) {
                                        /*
                                         * A parallel constructor beat us to it;
                                         * return existing vnode. We have to be
                                         * careful because we can't release the
                                         * current vnode while holding the
                                         * directory lock; its inactive routine
                                         * will try to lock this directory.
                                         */

        /*
         * See if there is a dynamic constructor.
         */
        if (dp->gfsd_lookup) {

                /*
                 * Once again, drop the directory lock, as the lookup routine
                 * will need to allocate memory, or otherwise deadlock on this
                 * directory.
                 */
                ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);

                fp = (gfs_file_t *)vp->v_data;
        } else {
                /*
                 * No static entry found, and there is no lookup callback, so
                 * return ENOENT.
                 */
 * gfs_dir_readdir: does a readdir() on the given directory
 *
 *   dvp  - directory vnode
 *   uiop - uio structure
 *   data - arbitrary data passed to readdir callback
 *
 * This routine does all the readdir() dirty work. Even so, the caller must
 * supply two callbacks in order to get full compatibility.
 *
 * If the directory contains static entries, an inode callback must be
 * specified. This avoids having to create every vnode and call VOP_GETATTR()
 * when reading the directory. This function has the following arguments:
 *
 *        ino64_t gfs_inode_cb(vnode_t *vp, int index);
 *
 *        vp    - vnode for the directory
 *        index - index in original gfs_dirent_t array
 *
 * Returns the inode number for the given entry.
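 *
 * A hypothetical inode callback for a directory whose static entries map to
 * consecutive inode numbers (MY_INO_BASE is illustrative) could be:
 *
 *        static ino64_t
 *        my_inode_cb(vnode_t *vp, int index)
 *        {
 *                return (MY_INO_BASE + index);
 *        }
 *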
 * For directories with dynamic entries, a readdir callback must be provided.
 * This is significantly more complex, thanks to the particulars of
 * VOP_READDIR.
 *
 *        int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
 *            offset_t *off, offset_t *nextoff, void *data)
 *
 *        vp      - directory vnode
 *        dp      - directory entry, sized according to maxlen given to
 *                  gfs_dir_create(). Callback must fill in d_name and
 *                  d_ino.
 *        eofp    - callback must set to 1 when EOF has been reached
 *        off     - on entry, the last offset read from the directory. Callback
 *                  must set to the offset of the current entry, typically left
 *                  untouched.
 *        nextoff - callback must set to offset of next entry. Typically this
 *                  is off + 1.
 *        data    - caller-supplied data
 *
 * Returns 0 on success, or an error on failure.
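 *
 * A hypothetical readdir callback (the my_* names are illustrative) might
 * look like:
 *
 *        static int
 *        my_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
 *            offset_t *off, offset_t *nextoff, void *data)
 *        {
 *                if (*off >= my_entry_count(vp)) {
 *                        *eofp = 1;
 *                        return (0);
 *                }
 *                (void) strlcpy(dp->d_name, my_entry_name(vp, *off),
 *                    MAXNAMELEN);
 *                dp->d_ino = my_entry_ino(vp, *off);
 *                *nextoff = *off + 1;
 *                return (0);
 *        }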
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
    u_long **cookies, void *data)
        gfs_readdir_state_t gstate;
        gfs_dir_t *dp = dvp->v_data;

        ino = dp->gfsd_file.gfs_ino;

        if (dp->gfsd_file.gfs_parent == NULL)
                pino = ino;             /* root of filesystem */
        else
                pino = ((gfs_file_t *)
                    (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;

        if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
            pino, ino)) != 0)
                return (error);

        while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
            cookies)) == 0 && !eof) {

                if (off >= 0 && off < dp->gfsd_nstatic) {
                        ino = dp->gfsd_inode(dvp, off);

                        if ((error = gfs_readdir_emit(&gstate, uiop,
                            off, ino, dp->gfsd_static[off].gfse_name, ncookies,
                            cookies)) != 0)
                                break;

                } else if (dp->gfsd_readdir) {
                        off -= dp->gfsd_nstatic;

                        if ((error = dp->gfsd_readdir(dvp,
                            gstate.grd_dirent, &eof, &off, &next,
                            data)) != 0 || eof)
                                break;

                        off += dp->gfsd_nstatic + 2;
                        next += dp->gfsd_nstatic + 2;

                        if ((error = gfs_readdir_emit_int(&gstate, uiop,
                            next, ncookies, cookies)) != 0)
                                break;
                } else {
                        /*
                         * Offset is beyond the end of the static entries, and
                         * we have no dynamic entries. Set EOF.
                         */
                        eof = 1;
                }
        }

        return (gfs_readdir_fini(&gstate, error, eofp, eof));
 * gfs_vop_readdir: VOP_READDIR() entry point
 *
 * For use directly in vnode ops table. Given a GFS directory, calls
 * gfs_dir_readdir() as necessary.
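 *
 * A hypothetical FreeBSD vnode operations vector for a GFS directory could
 * reference it directly (my_dir_vnodeops is illustrative):
 *
 *        static struct vop_vector my_dir_vnodeops = {
 *                .vop_default    = &default_vnodeops,
 *                .vop_readdir    = gfs_vop_readdir,
 *                .vop_inactive   = gfs_vop_inactive,
 *                ...
 *        };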
        struct vop_readdir_args /* {
                struct ucred *a_cred;
        vnode_t *vp = ap->a_vp;
        uio_t *uiop = ap->a_uio;
        int *eofp = ap->a_eofflag;
        u_long *cookies = NULL;

        if (ap->a_ncookies) {
                /*
                 * Minimum entry size is dirent size and 1 byte for a file
                 * name.
                 */
                ncookies = uiop->uio_resid / (sizeof(struct dirent) -
                    sizeof(((struct dirent *)NULL)->d_name) + 1);
                cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
                *ap->a_cookies = cookies;
                *ap->a_ncookies = ncookies;
        }

        error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);

        if (error == 0) {
                /* Subtract unused cookies */
                *ap->a_ncookies -= ncookies;
        } else if (ap->a_ncookies) {
                free(*ap->a_cookies, M_TEMP);
                *ap->a_cookies = NULL;
 * gfs_vop_inactive: VOP_INACTIVE() entry point
 *
 * Given a vnode that is a GFS file or directory, calls gfs_file_inactive() or
 * gfs_dir_inactive() as necessary, and kmem_free()s the associated private
 * data.
        struct vop_inactive_args /* {
        vnode_t *vp = ap->a_vp;
        gfs_file_t *fp = vp->v_data;

        if (fp->gfs_type == GFS_DIR)
                data = gfs_dir_inactive(vp);
        else
                data = gfs_file_inactive(vp);

        kmem_free(data, fp->gfs_size);