2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
41 #include "opt_ktrace.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
68 #include <sys/ktrace.h>
71 #include <sys/capsicum.h>
73 #include <security/audit/audit.h>
74 #include <security/mac/mac_framework.h>
82 SDT_PROVIDER_DECLARE(vfs);
83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
85 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
87 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
89 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
91 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
92 "struct namecache *", "int", "int");
93 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
94 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
95 "char *", "struct vnode *");
96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
98 "struct vnode *", "char *");
99 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
102 "struct vnode *", "char *");
103 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
106 "struct componentname *");
107 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
108 "struct componentname *");
109 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
110 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
111 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
112 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
114 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
116 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
119 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
120 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
121 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
124 * This structure describes the elements in the cache of recent
125 * names looked up by namei.
130 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
131 "the state must fit in a union with a pointer without growing it");
134 LIST_ENTRY(namecache) nc_src; /* source vnode list */
135 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
136 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
137 struct vnode *nc_dvp; /* vnode of parent of name */
139 struct vnode *nu_vp; /* vnode the name refers to */
140 struct negstate nu_neg;/* negative entry state */
142 u_char nc_flag; /* flag bits */
143 u_char nc_nlen; /* length of name */
144 char nc_name[0]; /* segment name + nul */
148 * struct namecache_ts repeats struct namecache layout up to the
150 * struct namecache_ts is used in place of struct namecache when time(s) need
151 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
152 * both a non-dotdot directory name plus dotdot for the directory's
155 * See below for alignment requirement.
157 struct namecache_ts {
158 struct timespec nc_time; /* timespec provided by fs */
159 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
160 int nc_ticks; /* ticks value when entry was added */
161 struct namecache nc_nc;
165 * At least mips n32 performs 64-bit accesses to timespec as found
166 * in namecache_ts and requires them to be aligned. Since others
167 * may be in the same spot suffer a little bit and enforce the
168 * alignment for everyone. Note this is a nop for 64-bit platforms.
170 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
171 #define CACHE_PATH_CUTOFF 39
173 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
174 #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
175 #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1)
176 #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1)
178 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
179 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
180 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
181 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
183 #define nc_vp n_un.nu_vp
184 #define nc_neg n_un.nu_neg
/*
 * Flags in namecache.nc_flag
 *
 * NCF_TS/NCF_DTS/NCF_WIP were dropped by the extraction but are used by
 * code below (cache_free, cache_ncp_canuse, cache_lookup_dotdot); values
 * restored to fill the gaps in the visible bit sequence.
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define NCF_DVDROP	0x10
#define NCF_NEGATIVE	0x20
#define NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01
204 * Mark an entry as invalid.
206 * This is called before it starts getting deconstructed.
209 cache_ncp_invalidate(struct namecache *ncp)
212 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
213 ("%s: entry %p already invalid", __func__, ncp));
214 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
215 atomic_thread_fence_rel();
219 * Check whether the entry can be safely used.
221 * All places which elide locks are supposed to call this after they are
222 * done with reading from an entry.
225 cache_ncp_canuse(struct namecache *ncp)
228 atomic_thread_fence_acq();
229 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
233 * Name caching works as follows:
235 * Names found by directory scans are retained in a cache
236 * for future reference. It is managed LRU, so frequently
237 * used names will hang around. Cache is indexed by hash value
238 * obtained from (dvp, name) where dvp refers to the directory
241 * If it is a "negative" entry, (i.e. for a name that is known NOT to
242 * exist) the vnode pointer will be NULL.
244 * Upon reaching the last segment of a path, if the reference
245 * is for DELETE, or NOCACHE is set (rewrite), and the
246 * name is located in the cache, it will be dropped.
248 * These locks are used (in the order in which they can be taken):
250 * vnodelock mtx vnode lists and v_cache_dd field protection
251 * bucketlock mtx for access to given set of hash buckets
252 * neglist mtx negative entry LRU management
254 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
255 * shrinking the LRU list.
257 * It is legal to take multiple vnodelock and bucketlock locks. The locking
258 * order is lower address first. Both are recursive.
260 * "." lookups are lockless.
262 * ".." and vnode -> name lookups require vnodelock.
264 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
266 * Insertions and removals of entries require involved vnodes and bucketlocks
267 * to be locked to provide safe operation against other threads modifying the
270 * Some lookups result in removal of the found entry (e.g. getting rid of a
271 * negative entry with the intent to create a positive one), which poses a
272 * problem when multiple threads reach the state. Similarly, two different
273 * threads can purge two different vnodes and try to remove the same name.
275 * If the already held vnode lock is lower than the second required lock, we
276 * can just take the other lock. However, in the opposite case, this could
277 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
278 * the first node, locking everything in order and revalidating the state.
284 * Structures associated with name caching.
286 #define NCHHASH(hash) \
287 (&nchashtbl[(hash) & nchash])
288 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
289 static u_long __read_mostly nchash; /* size of hash table */
290 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
291 "Size of namecache hash table");
292 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
293 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
294 "Ratio of negative namecache entries");
295 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
296 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
297 u_int ncsizefactor = 2;
298 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
299 "Size factor for namecache");
300 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
302 struct nchstats nchstats; /* cache effectiveness statistics */
304 static bool __read_frequently cache_fast_revlookup = true;
305 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
306 &cache_fast_revlookup, 0, "");
308 static struct mtx __exclusive_cache_line ncneg_shrink_lock;
311 #define numneglists (ncneghash + 1)
315 TAILQ_HEAD(, namecache) nl_list;
316 TAILQ_HEAD(, namecache) nl_hotlist;
318 } __aligned(CACHE_LINE_SIZE);
320 static struct neglist neglists[numneglists];
322 static inline struct neglist *
323 NCP2NEGLIST(struct namecache *ncp)
326 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
329 static inline struct negstate *
330 NCP2NEGSTATE(struct namecache *ncp)
333 MPASS(ncp->nc_flag & NCF_NEGATIVE);
334 return (&ncp->nc_neg);
337 #define numbucketlocks (ncbuckethash + 1)
338 static u_int __read_mostly ncbuckethash;
339 static struct mtx_padalign __read_mostly *bucketlocks;
340 #define HASH2BUCKETLOCK(hash) \
341 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
343 #define numvnodelocks (ncvnodehash + 1)
344 static u_int __read_mostly ncvnodehash;
345 static struct mtx __read_mostly *vnodelocks;
346 static inline struct mtx *
347 VP2VNODELOCK(struct vnode *vp)
350 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
354 * UMA zones for the VFS cache.
356 * The small cache is used for entries with short names, which are the
357 * most common. The large cache is used for entries which are too big to
358 * fit in the small cache.
360 static uma_zone_t __read_mostly cache_zone_small;
361 static uma_zone_t __read_mostly cache_zone_small_ts;
362 static uma_zone_t __read_mostly cache_zone_large;
363 static uma_zone_t __read_mostly cache_zone_large_ts;
365 static struct namecache *
366 cache_alloc(int len, int ts)
368 struct namecache_ts *ncp_ts;
369 struct namecache *ncp;
371 if (__predict_false(ts)) {
372 if (len <= CACHE_PATH_CUTOFF)
373 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
375 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
376 ncp = &ncp_ts->nc_nc;
378 if (len <= CACHE_PATH_CUTOFF)
379 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
381 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
387 cache_free(struct namecache *ncp)
389 struct namecache_ts *ncp_ts;
392 if ((ncp->nc_flag & NCF_DVDROP) != 0)
394 if (__predict_false(ncp->nc_flag & NCF_TS)) {
395 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
396 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
397 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
399 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
401 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
402 uma_zfree_smr(cache_zone_small, ncp);
404 uma_zfree_smr(cache_zone_large, ncp);
409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
411 struct namecache_ts *ncp_ts;
413 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
414 (tsp == NULL && ticksp == NULL),
420 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
421 *tsp = ncp_ts->nc_time;
422 *ticksp = ncp_ts->nc_ticks;
426 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
427 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
428 "VFS namecache enabled");
431 /* Export size information to userland */
432 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
433 sizeof(struct namecache), "sizeof(struct namecache)");
436 * The new name cache statistics
438 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
439 "Name cache statistics");
440 #define STATNODE_ULONG(name, descr) \
441 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
442 #define STATNODE_COUNTER(name, descr) \
443 static COUNTER_U64_DEFINE_EARLY(name); \
444 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
446 STATNODE_ULONG(numneg, "Number of negative cache entries");
447 STATNODE_ULONG(numcache, "Number of cache entries");
448 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
449 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
450 STATNODE_COUNTER(dothits, "Number of '.' hits");
451 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
452 STATNODE_COUNTER(nummiss, "Number of cache misses");
453 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
454 STATNODE_COUNTER(numposzaps,
455 "Number of cache hits (positive) we do not want to cache");
456 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
457 STATNODE_COUNTER(numnegzaps,
458 "Number of cache hits (negative) we do not want to cache");
459 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
460 /* These count for vn_getcwd(), too. */
461 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
462 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
463 STATNODE_COUNTER(numfullpathfail2,
464 "Number of fullpath search errors (VOP_VPTOCNP failures)");
465 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
466 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
467 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
468 "Number of successful removals after relocking");
469 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
470 "Number of times zap_and_exit failed to lock");
471 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
472 "Number of times zap_and_exit failed to lock");
473 static long cache_lock_vnodes_cel_3_failures;
474 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
475 "Number of times 3-way vnode locking failed");
476 STATNODE_COUNTER(numneg_evicted,
477 "Number of negative entries evicted when adding a new entry");
478 STATNODE_COUNTER(shrinking_skipped,
479 "Number of times shrinking was already in progress");
481 static void cache_zap_locked(struct namecache *ncp);
482 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
483 char **freebuf, size_t *buflen);
484 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
485 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
486 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
487 char **retbuf, size_t *buflen);
488 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
489 char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
491 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
/*
 * Lock-assertion helpers.  NOTE(review): upstream wraps these in
 * #ifdef DEBUG_VFS_LOCKS with no-op macros in the #else branch; the
 * conditional lines were dropped by the extraction and are restored here.
 */
#ifdef DEBUG_VFS_LOCKS
static void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}
#else
#define	cache_assert_vlp_locked(x) do { } while (0)
#define	cache_assert_vnode_locked(x) do { } while (0)
#endif
511 * TODO: With the value stored we can do better than computing the hash based
512 * on the address. The choice of FNV should also be revisited.
515 cache_prehash(struct vnode *vp)
518 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
522 cache_get_hash(char *name, u_char len, struct vnode *dvp)
525 return (fnv_32_buf(name, len, dvp->v_nchash));
528 static inline struct nchashhead *
529 NCP2BUCKET(struct namecache *ncp)
533 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
534 return (NCHHASH(hash));
537 static inline struct mtx *
538 NCP2BUCKETLOCK(struct namecache *ncp)
542 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
543 return (HASH2BUCKETLOCK(hash));
/*
 * Bucket lock assertions.  NOTE(review): the #ifdef DEBUG_VFS_LOCKS
 * opening line was dropped by the extraction; the visible #else no-op
 * macros imply the restored conditional structure.
 */
#ifdef DEBUG_VFS_LOCKS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif
569 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
571 _cache_sort_vnodes(void **p1, void **p2)
575 MPASS(*p1 != NULL || *p2 != NULL);
585 cache_lock_all_buckets(void)
589 for (i = 0; i < numbucketlocks; i++)
590 mtx_lock(&bucketlocks[i]);
594 cache_unlock_all_buckets(void)
598 for (i = 0; i < numbucketlocks; i++)
599 mtx_unlock(&bucketlocks[i]);
603 cache_lock_all_vnodes(void)
607 for (i = 0; i < numvnodelocks; i++)
608 mtx_lock(&vnodelocks[i]);
612 cache_unlock_all_vnodes(void)
616 for (i = 0; i < numvnodelocks; i++)
617 mtx_unlock(&vnodelocks[i]);
621 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
624 cache_sort_vnodes(&vlp1, &vlp2);
627 if (!mtx_trylock(vlp1))
630 if (!mtx_trylock(vlp2)) {
640 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
643 MPASS(vlp1 != NULL || vlp2 != NULL);
653 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
656 MPASS(vlp1 != NULL || vlp2 != NULL);
665 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
667 struct nchstats snap;
669 if (req->oldptr == NULL)
670 return (SYSCTL_OUT(req, 0, sizeof(snap)));
673 snap.ncs_goodhits = counter_u64_fetch(numposhits);
674 snap.ncs_neghits = counter_u64_fetch(numneghits);
675 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
676 counter_u64_fetch(numnegzaps);
677 snap.ncs_miss = counter_u64_fetch(nummisszap) +
678 counter_u64_fetch(nummiss);
680 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
682 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
683 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
684 "VFS cache effectiveness statistics");
687 sysctl_hotnum(SYSCTL_HANDLER_ARGS)
692 for (i = 0; i < numneglists; i++)
693 out += neglists[i].nl_hotnum;
695 return (SYSCTL_OUT(req, &out, sizeof(out)));
697 SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD |
698 CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I",
699 "Number of hot negative entries");
703 * Grab an atomic snapshot of the name cache hash chain lengths
705 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
706 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
712 struct nchashhead *ncpp;
713 struct namecache *ncp;
714 int i, error, n_nchash, *cntbuf;
717 n_nchash = nchash + 1; /* nchash is max index, not count */
718 if (req->oldptr == NULL)
719 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
720 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
721 cache_lock_all_buckets();
722 if (n_nchash != nchash + 1) {
723 cache_unlock_all_buckets();
724 free(cntbuf, M_TEMP);
727 /* Scan hash tables counting entries */
728 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
729 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
731 cache_unlock_all_buckets();
732 for (error = 0, i = 0; i < n_nchash; i++)
733 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
735 free(cntbuf, M_TEMP);
738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
739 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
740 "nchash chain lengths");
743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
746 struct nchashhead *ncpp;
747 struct namecache *ncp;
749 int count, maxlength, used, pct;
752 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
754 cache_lock_all_buckets();
755 n_nchash = nchash + 1; /* nchash is max index, not count */
759 /* Scan hash tables for applicable entries */
760 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
762 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
767 if (maxlength < count)
770 n_nchash = nchash + 1;
771 cache_unlock_all_buckets();
772 pct = (used * 100) / (n_nchash / 100);
773 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
776 error = SYSCTL_OUT(req, &used, sizeof(used));
779 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
782 error = SYSCTL_OUT(req, &pct, sizeof(pct));
787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
788 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
789 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
793 * Negative entries management
795 * A variation of LRU scheme is used. New entries are hashed into one of
796 * numneglists cold lists. Entries get promoted to the hot list on first hit.
798 * The shrinker will demote hot list head and evict from the cold list in a
799 * round-robin manner.
802 cache_negative_init(struct namecache *ncp)
806 ncp->nc_flag |= NCF_NEGATIVE;
807 ns = NCP2NEGSTATE(ncp);
812 * Move a negative entry to the hot list.
815 cache_negative_promote(struct namecache *ncp)
820 ns = NCP2NEGSTATE(ncp);
821 nl = NCP2NEGLIST(ncp);
822 mtx_assert(&nl->nl_lock, MA_OWNED);
823 if ((ns->neg_flag & NEG_HOT) == 0) {
824 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
825 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
827 ns->neg_flag |= NEG_HOT;
832 * Move a negative entry to the hot list if it matches the lookup.
834 * We have to take locks, but they may be contended and in the worst
835 * case we may need to go off CPU. We don't want to spin within the
836 * smr section and we can't block with it. Exiting the section means
837 * the found entry could have been evicted. We are going to look it
841 cache_negative_promote_cond(struct vnode *dvp, struct componentname *cnp,
842 struct namecache *oncp, uint32_t hash)
844 struct namecache *ncp;
848 nl = NCP2NEGLIST(oncp);
850 mtx_lock(&nl->nl_lock);
852 * For hash iteration.
857 * Avoid all surprises by only succeeding if we got the same entry and
858 * bailing completely otherwise.
859 * XXX There are no provisions to keep the vnode around, meaning we may
860 * end up promoting a negative entry for a *new* vnode and returning
861 * ENOENT on its account. This is the error we want to return anyway
862 * and promotion is harmless.
864 * In particular at this point there can be a new ncp which matches the
865 * search but hashes to a different neglist.
867 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
873 * No match to begin with.
875 if (__predict_false(ncp == NULL)) {
880 * The newly found entry may be something different...
882 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
883 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
888 * ... and not even negative.
890 nc_flag = atomic_load_char(&ncp->nc_flag);
891 if ((nc_flag & NCF_NEGATIVE) == 0) {
895 if (__predict_false(!cache_ncp_canuse(ncp))) {
899 cache_negative_promote(ncp);
901 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
902 counter_u64_add(numneghits, 1);
904 mtx_unlock(&nl->nl_lock);
908 mtx_unlock(&nl->nl_lock);
913 cache_negative_hit(struct namecache *ncp)
918 ns = NCP2NEGSTATE(ncp);
919 if ((ns->neg_flag & NEG_HOT) != 0)
921 nl = NCP2NEGLIST(ncp);
922 mtx_lock(&nl->nl_lock);
923 cache_negative_promote(ncp);
924 mtx_unlock(&nl->nl_lock);
928 cache_negative_insert(struct namecache *ncp)
932 MPASS(ncp->nc_flag & NCF_NEGATIVE);
933 cache_assert_bucket_locked(ncp);
934 nl = NCP2NEGLIST(ncp);
935 mtx_lock(&nl->nl_lock);
936 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
937 mtx_unlock(&nl->nl_lock);
938 atomic_add_long(&numneg, 1);
942 cache_negative_remove(struct namecache *ncp)
947 cache_assert_bucket_locked(ncp);
948 nl = NCP2NEGLIST(ncp);
949 ns = NCP2NEGSTATE(ncp);
950 mtx_lock(&nl->nl_lock);
951 if ((ns->neg_flag & NEG_HOT) != 0) {
952 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
955 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
957 mtx_unlock(&nl->nl_lock);
958 atomic_subtract_long(&numneg, 1);
961 static struct neglist *
962 cache_negative_shrink_select(void)
969 for (i = 0; i < numneglists; i++) {
970 nl = &neglists[(cycle + i) % numneglists];
971 if (TAILQ_FIRST(&nl->nl_list) == NULL &&
972 TAILQ_FIRST(&nl->nl_hotlist) == NULL)
974 mtx_lock(&nl->nl_lock);
975 if (TAILQ_FIRST(&nl->nl_list) != NULL ||
976 TAILQ_FIRST(&nl->nl_hotlist) != NULL)
978 mtx_unlock(&nl->nl_lock);
985 cache_negative_zap_one(void)
987 struct namecache *ncp, *ncp2;
993 if (mtx_owner(&ncneg_shrink_lock) != NULL ||
994 !mtx_trylock(&ncneg_shrink_lock)) {
995 counter_u64_add(shrinking_skipped, 1);
999 nl = cache_negative_shrink_select();
1000 mtx_unlock(&ncneg_shrink_lock);
1005 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1007 ns = NCP2NEGSTATE(ncp);
1008 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1009 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1011 ns->neg_flag &= ~NEG_HOT;
1013 ncp = TAILQ_FIRST(&nl->nl_list);
1015 ns = NCP2NEGSTATE(ncp);
1016 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1017 blp = NCP2BUCKETLOCK(ncp);
1018 mtx_unlock(&nl->nl_lock);
1022 * Enter SMR to safely check the negative list.
1023 * Even if the found pointer matches, the entry may now be reallocated
1024 * and used by a different vnode.
1027 ncp2 = TAILQ_FIRST(&nl->nl_list);
1028 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
1029 blp != NCP2BUCKETLOCK(ncp2)) {
1034 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
1036 cache_zap_locked(ncp);
1037 counter_u64_add(numneg_evicted, 1);
1046 * cache_zap_locked():
1048 * Removes a namecache entry from cache, whether it contains an actual
1049 * pointer to a vnode or if it is just a negative cache entry.
1052 cache_zap_locked(struct namecache *ncp)
1054 struct nchashhead *ncpp;
1056 if (!(ncp->nc_flag & NCF_NEGATIVE))
1057 cache_assert_vnode_locked(ncp->nc_vp);
1058 cache_assert_vnode_locked(ncp->nc_dvp);
1059 cache_assert_bucket_locked(ncp);
1061 cache_ncp_invalidate(ncp);
1063 ncpp = NCP2BUCKET(ncp);
1064 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1065 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1066 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1067 ncp->nc_name, ncp->nc_vp);
1068 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1069 if (ncp == ncp->nc_vp->v_cache_dd) {
1070 vn_seqc_write_begin_unheld(ncp->nc_vp);
1071 ncp->nc_vp->v_cache_dd = NULL;
1072 vn_seqc_write_end(ncp->nc_vp);
1075 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1077 cache_negative_remove(ncp);
1079 if (ncp->nc_flag & NCF_ISDOTDOT) {
1080 if (ncp == ncp->nc_dvp->v_cache_dd) {
1081 vn_seqc_write_begin_unheld(ncp->nc_dvp);
1082 ncp->nc_dvp->v_cache_dd = NULL;
1083 vn_seqc_write_end(ncp->nc_dvp);
1086 LIST_REMOVE(ncp, nc_src);
1087 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1088 ncp->nc_flag |= NCF_DVDROP;
1089 counter_u64_add(numcachehv, -1);
1092 atomic_subtract_long(&numcache, 1);
1096 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1100 MPASS(ncp->nc_dvp == vp);
1101 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1102 cache_assert_vnode_locked(vp);
1104 blp = NCP2BUCKETLOCK(ncp);
1106 cache_zap_locked(ncp);
1111 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1114 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1117 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1118 cache_assert_vnode_locked(vp);
1120 if (ncp->nc_flag & NCF_NEGATIVE) {
1121 if (*vlpp != NULL) {
1125 cache_zap_negative_locked_vnode_kl(ncp, vp);
1129 pvlp = VP2VNODELOCK(vp);
1130 blp = NCP2BUCKETLOCK(ncp);
1131 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1132 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1134 if (*vlpp == vlp1 || *vlpp == vlp2) {
1138 if (*vlpp != NULL) {
1142 cache_sort_vnodes(&vlp1, &vlp2);
1147 if (!mtx_trylock(vlp1))
1153 cache_zap_locked(ncp);
1155 if (to_unlock != NULL)
1156 mtx_unlock(to_unlock);
1163 MPASS(*vlpp == NULL);
1169 * If trylocking failed we can get here. We know enough to take all needed locks
1170 * in the right order and re-lookup the entry.
1173 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1174 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1177 struct namecache *rncp;
1179 cache_assert_bucket_unlocked(ncp);
1181 cache_sort_vnodes(&dvlp, &vlp);
1182 cache_lock_vnodes(dvlp, vlp);
1184 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1185 if (rncp == ncp && rncp->nc_dvp == dvp &&
1186 rncp->nc_nlen == cnp->cn_namelen &&
1187 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1191 cache_zap_locked(rncp);
1193 cache_unlock_vnodes(dvlp, vlp);
1194 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1199 cache_unlock_vnodes(dvlp, vlp);
1203 static int __noinline
1204 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1205 uint32_t hash, struct mtx *blp)
1207 struct mtx *dvlp, *vlp;
1210 cache_assert_bucket_locked(ncp);
1212 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1214 if (!(ncp->nc_flag & NCF_NEGATIVE))
1215 vlp = VP2VNODELOCK(ncp->nc_vp);
1216 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1217 cache_zap_locked(ncp);
1219 cache_unlock_vnodes(dvlp, vlp);
1225 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
/*
 * Remove the namecache entry (if any) matching the name in cnp under
 * directory dvp.  Used when a lookup is performed without MAKEENTRY,
 * i.e. the caller wants the entry gone.  ".." gets special handling
 * via the per-vnode v_cache_dd pointer; other names go through the
 * hash-bucket walk.
 */
1228 static __noinline int
1229 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1231 struct namecache *ncp;
1233 struct mtx *dvlp, *dvlp2;
/* Special case: ".." is tracked via dvp->v_cache_dd, not the hash. */
1237 if (cnp->cn_namelen == 2 &&
1238 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1239 dvlp = VP2VNODELOCK(dvp);
1243 ncp = dvp->v_cache_dd;
1248 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1251 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1252 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1254 MPASS(dvp->v_cache_dd == NULL);
/* Publish the cleared v_cache_dd under a seqc write section. */
1260 vn_seqc_write_begin(dvp);
1261 dvp->v_cache_dd = NULL;
1262 vn_seqc_write_end(dvp);
1267 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
/* Regular name: locate the entry in its hash bucket. */
1271 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1272 blp = HASH2BUCKETLOCK(hash);
1274 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1279 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1280 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1281 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1290 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1291 if (__predict_false(error != 0)) {
1292 zap_and_exit_bucket_fail++;
1295 counter_u64_add(numposzaps, 1);
1296 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1300 counter_u64_add(nummisszap, 1);
1301 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
/*
 * Handle a lookup of ".": the answer is always dvp itself, so no cache
 * walk is required.  The only real work is adjusting the lock type on
 * *vpp to match what the caller asked for in cn_lkflags.
 */
1305 static int __noinline
1306 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1307 struct timespec *tsp, int *ticksp)
1312 counter_u64_add(dothits, 1);
1313 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1320 * When we lookup "." we still can be asked to lock it
1323 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1324 if (ltype != VOP_ISLOCKED(*vpp)) {
1325 if (ltype == LK_EXCLUSIVE) {
1326 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
/* The upgrade may have transiently dropped the lock; re-check doom. */
1327 if (VN_IS_DOOMED((*vpp))) {
1328 /* forced unmount */
1334 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
/*
 * Handle a lookup of "..": the parent is tracked via dvp->v_cache_dd
 * rather than the hash table.  Supports both positive and negative
 * hits, timestamp export (NCF_DTS carries a distinct dotdot time), and
 * relocking dvp around vget of the found vnode.
 */
1339 static int __noinline
1340 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1341 struct timespec *tsp, int *ticksp)
1343 struct namecache_ts *ncp_ts;
1344 struct namecache *ncp;
1350 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
/* Caller asked for removal instead of lookup. */
1352 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1353 cache_remove_cnp(dvp, cnp);
1357 counter_u64_add(dotdothits, 1);
1359 dvlp = VP2VNODELOCK(dvp);
1361 ncp = dvp->v_cache_dd;
1363 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1367 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1368 if (ncp->nc_flag & NCF_NEGATIVE)
1375 goto negative_success;
1376 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1377 cache_out_ts(ncp, tsp, ticksp);
/* A non-dotdot entry with NCF_DTS stores the dotdot timestamp separately. */
1378 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1379 NCF_DTS && tsp != NULL) {
1380 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1381 *tsp = ncp_ts->nc_dotdottime;
/* Drop and retake dvp's lock around acquiring the target vnode. */
1385 ltype = VOP_ISLOCKED(dvp);
1387 vs = vget_prep(*vpp);
1389 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1390 vn_lock(dvp, ltype | LK_RETRY);
1391 if (VN_IS_DOOMED(dvp)) {
/* Negative hit: a CREATE of the last component invalidates the entry. */
1403 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1404 if (cnp->cn_flags & ISLASTCN) {
1405 counter_u64_add(numnegzaps, 1);
1406 cache_zap_negative_locked_vnode_kl(ncp, dvp);
1413 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1414 cache_out_ts(ncp, tsp, ticksp);
1415 counter_u64_add(numneghits, 1);
1416 whiteout = (ncp->nc_flag & NCF_WHITE);
1417 cache_negative_hit(ncp);
1420 cnp->cn_flags |= ISWHITEOUT;
1425 * Lookup a name in the name cache
1429 * - dvp: Parent directory in which to search.
1430 * - vpp: Return argument. Will contain desired vnode on cache hit.
1431 * - cnp: Parameters of the name search. The most interesting bits of
1432 * the cn_flags field have the following meanings:
1433 * - MAKEENTRY: If clear, free an entry from the cache rather than look
1435 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1436 * - tsp: Return storage for cache timestamp. On a successful (positive
1437 * or negative) lookup, tsp will be filled with any timespec that
1438 * was stored when this cache entry was created. However, it will
1439 * be clear for "." entries.
1440 * - ticks: Return storage for alternate cache timestamp. On a successful
1441 * (positive or negative) lookup, it will contain the ticks value
1442 * that was current when the cache entry was created, unless cnp
1445 * Either both tsp and ticks have to be provided or neither of them.
1449 * - -1: A positive cache hit. vpp will contain the desired vnode.
1450 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1451 * to a forced unmount. vpp will not be modified. If the entry
1452 * is a whiteout, then the ISWHITEOUT flag will be set in
1454 * - 0: A cache miss. vpp will not be modified.
1458 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1459 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
1460 * lock is not recursively acquired.
/*
 * Locked-path name lookup, used when the lockless (SMR) attempt in
 * cache_lookup() could not complete.  Walks the hash bucket under its
 * lock and handles positive hits, negative hits, and CREATE-induced
 * zapping of negative entries.
 */
1462 static int __noinline
1463 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1464 struct timespec *tsp, int *ticksp)
1466 struct namecache *ncp;
/* "." / ".." and !MAKEENTRY were already filtered out by the caller. */
1473 MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1476 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1477 blp = HASH2BUCKETLOCK(hash);
1480 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1481 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1482 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1486 if (__predict_false(ncp == NULL)) {
1488 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1490 counter_u64_add(nummiss, 1);
1494 if (ncp->nc_flag & NCF_NEGATIVE)
1495 goto negative_success;
1497 counter_u64_add(numposhits, 1);
1499 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1500 cache_out_ts(ncp, tsp, ticksp);
1502 vs = vget_prep(*vpp);
1504 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
/* Negative hit: CREATE of the last component removes the entry. */
1511 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1512 if (cnp->cn_flags & ISLASTCN) {
1513 counter_u64_add(numnegzaps, 1);
1514 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1515 if (__predict_false(error != 0)) {
1516 zap_and_exit_bucket_fail2++;
1524 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1525 cache_out_ts(ncp, tsp, ticksp);
1526 counter_u64_add(numneghits, 1);
1527 whiteout = (ncp->nc_flag & NCF_WHITE);
1528 cache_negative_hit(ncp);
1531 cnp->cn_flags |= ISWHITEOUT;
/*
 * Main namecache lookup entry point (see the contract comment above).
 * Attempts a lockless lookup under vfs_smr: entries are validated via
 * NCF_WIP/canuse checks and seqc-free atomics; on any inconsistency we
 * retry via the locked cache_lookup_fallback() path.
 */
1536 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1537 struct timespec *tsp, int *ticksp)
1539 struct namecache *ncp;
1540 struct negstate *ns;
1544 bool whiteout, neg_hot;
/* tsp and ticksp must be provided together or not at all. */
1547 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1550 if (__predict_false(!doingcache)) {
1551 cnp->cn_flags &= ~MAKEENTRY;
/* "." and ".." take dedicated paths. */
1556 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1557 if (cnp->cn_namelen == 1)
1558 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1559 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1560 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1563 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1565 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1566 cache_remove_cnp(dvp, cnp);
1570 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
/* Lockless bucket walk; list membership is protected by SMR. */
1573 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1574 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1575 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1579 if (__predict_false(ncp == NULL)) {
1581 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1583 counter_u64_add(nummiss, 1);
1587 nc_flag = atomic_load_char(&ncp->nc_flag);
1588 if (nc_flag & NCF_NEGATIVE)
1589 goto negative_success;
1591 counter_u64_add(numposhits, 1);
1593 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1594 cache_out_ts(ncp, tsp, ticksp);
/* Entry may be under construction/destruction; bail to locked path. */
1596 if (!cache_ncp_canuse(ncp)) {
1601 vs = vget_prep_smr(*vpp);
1603 if (__predict_false(vs == VGET_NONE)) {
1607 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1614 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1615 if (cnp->cn_flags & ISLASTCN) {
1621 cache_out_ts(ncp, tsp, ticksp);
1622 whiteout = (ncp->nc_flag & NCF_WHITE);
1623 ns = NCP2NEGSTATE(ncp);
1624 neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
/* Re-validate after reading flags; entry may have been zapped. */
1625 if (__predict_false(!cache_ncp_canuse(ncp))) {
1631 if (!cache_negative_promote_cond(dvp, cnp, ncp, hash))
1634 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1635 counter_u64_add(numneghits, 1);
1639 cnp->cn_flags |= ISWHITEOUT;
/* Lockless attempt failed; retry with locks held. */
1642 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
/*
 * Lock state for cache_enter_lock()/cache_enter_unlock(): holds up to
 * three sorted vnode locks and two bucket locks, per the CTASSERTs
 * below.  NOTE(review): the member declarations are elided in this
 * extract — confirm against the full source.
 */
1645 struct celockstate {
1649 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1650 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
/*
 * Zero-initialize a celockstate; all lock slots start out NULL.
 */
1653 cache_celockstate_init(struct celockstate *cel)
1656 bzero(cel, sizeof(*cel));
/*
 * Lock the vnode locks for vp and dvp, sorted by address to preserve
 * the global lock order, recording them in cel->vlp[].  At least one
 * of the two vnodes must be non-NULL.
 */
1660 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1663 struct mtx *vlp1, *vlp2;
1665 MPASS(cel->vlp[0] == NULL);
1666 MPASS(cel->vlp[1] == NULL);
1667 MPASS(cel->vlp[2] == NULL);
1669 MPASS(vp != NULL || dvp != NULL);
1671 vlp1 = VP2VNODELOCK(vp);
1672 vlp2 = VP2VNODELOCK(dvp);
/* Smaller address is locked first. */
1673 cache_sort_vnodes(&vlp1, &vlp2);
/*
 * Release whichever of the (up to three) vnode locks in cel are held;
 * NULL slots were never taken.
 */
1684 cache_unlock_vnodes_cel(struct celockstate *cel)
1687 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1689 if (cel->vlp[0] != NULL)
1690 mtx_unlock(cel->vlp[0]);
1691 if (cel->vlp[1] != NULL)
1692 mtx_unlock(cel->vlp[1]);
1693 if (cel->vlp[2] != NULL)
1694 mtx_unlock(cel->vlp[2]);
/*
 * Acquire a third vnode lock while already holding the two sorted
 * locks in cel->vlp[0..1].  If the new lock sorts after vlp[1] it is
 * trylocked; on failure everything is dropped and all three are
 * reacquired in address order (the caller then re-validates state).
 */
1698 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1703 cache_assert_vlp_locked(cel->vlp[0]);
1704 cache_assert_vlp_locked(cel->vlp[1]);
1705 MPASS(cel->vlp[2] == NULL);
1708 vlp = VP2VNODELOCK(vp);
1711 if (vlp >= cel->vlp[1]) {
1714 if (mtx_trylock(vlp))
/* trylock failed: back out and relock in sorted order. */
1716 cache_lock_vnodes_cel_3_failures++;
1717 cache_unlock_vnodes_cel(cel);
1718 if (vlp < cel->vlp[0]) {
1720 mtx_lock(cel->vlp[0]);
1721 mtx_lock(cel->vlp[1]);
1723 if (cel->vlp[0] != NULL)
1724 mtx_lock(cel->vlp[0]);
1726 mtx_lock(cel->vlp[1]);
/*
 * Lock up to two hash-bucket locks in address order, recording them in
 * cel->blp[].  Reuses cache_sort_vnodes() since it just sorts mutex
 * pointers.
 */
1736 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1740 MPASS(cel->blp[0] == NULL);
1741 MPASS(cel->blp[1] == NULL);
1743 cache_sort_vnodes(&blp1, &blp2);
/*
 * Release the bucket locks recorded in cel; blp[0] may be NULL when
 * only one bucket was involved.
 */
1754 cache_unlock_buckets_cel(struct celockstate *cel)
1757 if (cel->blp[0] != NULL)
1758 mtx_unlock(cel->blp[0]);
1759 mtx_unlock(cel->blp[1]);
1763 * Lock part of the cache affected by the insertion.
1765 * This means vnodelocks for dvp, vp and the relevant bucketlock.
1766 * However, insertion can result in removal of an old entry. In this
1767 * case we have an additional vnode and bucketlock pair to lock.
1769 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1770 * preserving the locking order (smaller address first).
/*
 * Lock everything an insertion may touch: vnode locks for dvp and vp
 * plus the relevant bucket lock(s).  If vp is a directory whose
 * v_cache_dd points at a ".." entry, that entry will be displaced, so
 * its vnode lock and bucket lock are acquired as well; the state is
 * re-validated after any lock juggling and the whole sequence restarts
 * on a mismatch.
 */
1773 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1776 struct namecache *ncp;
1777 struct mtx *blps[2];
1779 blps[0] = HASH2BUCKETLOCK(hash);
1782 cache_lock_vnodes_cel(cel, dvp, vp);
1783 if (vp == NULL || vp->v_type != VDIR)
1785 ncp = vp->v_cache_dd;
1788 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1790 MPASS(ncp->nc_dvp == vp);
1791 blps[1] = NCP2BUCKETLOCK(ncp);
1792 if (ncp->nc_flag & NCF_NEGATIVE)
1794 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1797 * All vnodes got re-locked. Re-validate the state and if
1798 * nothing changed we are done. Otherwise restart.
1800 if (ncp == vp->v_cache_dd &&
1801 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1802 blps[1] == NCP2BUCKETLOCK(ncp) &&
1803 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1805 cache_unlock_vnodes_cel(cel);
1810 cache_lock_buckets_cel(cel, blps[0], blps[1]);
/*
 * Variant of cache_enter_lock() for inserting a ".." entry: here it is
 * dvp's own v_cache_dd (not vp's) that may be displaced, so dvp's
 * existing dotdot entry determines the extra vnode/bucket locks.
 */
1814 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1817 struct namecache *ncp;
1818 struct mtx *blps[2];
1820 blps[0] = HASH2BUCKETLOCK(hash);
1823 cache_lock_vnodes_cel(cel, dvp, vp);
1824 ncp = dvp->v_cache_dd;
1827 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1829 MPASS(ncp->nc_dvp == dvp);
1830 blps[1] = NCP2BUCKETLOCK(ncp);
1831 if (ncp->nc_flag & NCF_NEGATIVE)
1833 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
/* Re-validate after relocking; restart the loop on any change. */
1835 if (ncp == dvp->v_cache_dd &&
1836 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1837 blps[1] == NCP2BUCKETLOCK(ncp) &&
1838 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1840 cache_unlock_vnodes_cel(cel);
1845 cache_lock_buckets_cel(cel, blps[0], blps[1]);
/*
 * Undo cache_enter_lock()/cache_enter_lock_dd(): buckets first, then
 * vnode locks.
 */
1849 cache_enter_unlock(struct celockstate *cel)
1852 cache_unlock_buckets_cel(cel);
1853 cache_unlock_vnodes_cel(cel);
/*
 * Prepare dvp for insertion of a new ".." entry: zap any existing
 * dotdot entry hanging off dvp->v_cache_dd and clear the pointer,
 * publishing the change under a seqc write section so lockless readers
 * notice.  No-op when v_cache_dd is already NULL.
 */
1856 static void __noinline
1857 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1858 struct componentname *cnp)
1860 struct celockstate cel;
1861 struct namecache *ncp;
1865 if (dvp->v_cache_dd == NULL)
1867 len = cnp->cn_namelen;
1868 cache_celockstate_init(&cel);
1869 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1870 cache_enter_lock_dd(&cel, dvp, vp, hash);
1871 vn_seqc_write_begin(dvp);
/* Re-read under the locks; the pointer may have changed meanwhile. */
1872 ncp = dvp->v_cache_dd;
1873 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1874 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1875 cache_zap_locked(ncp);
1879 dvp->v_cache_dd = NULL;
1880 vn_seqc_write_end(dvp);
1881 cache_enter_unlock(&cel);
1887 * Add an entry to the cache.
/*
 * Creates a namecache entry mapping (dvp, name) -> vp; vp == NULL adds
 * a negative entry.  tsp/dtsp, when non-NULL, attach creation/dotdot
 * timestamps.  Duplicate entries (from concurrent lookups) are detected
 * and the new allocation is discarded; the entry only becomes visible
 * to lockless readers once NCF_WIP is cleared after a release fence.
 */
1890 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1891 struct timespec *tsp, struct timespec *dtsp)
1893 struct celockstate cel;
1894 struct namecache *ncp, *n2, *ndd;
1895 struct namecache_ts *ncp_ts;
1896 struct nchashhead *ncpp;
1902 VNPASS(!VN_IS_DOOMED(dvp), dvp);
1903 VNPASS(dvp->v_type != VNON, dvp);
1905 VNPASS(!VN_IS_DOOMED(vp), vp);
1906 VNPASS(vp->v_type != VNON, vp);
1910 if (__predict_false(!doingcache))
/* "." is never cached; ".." needs the dotdot prep dance. */
1915 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1916 if (cnp->cn_namelen == 1)
1918 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1919 cache_enter_dotdot_prep(dvp, vp, cnp);
1920 flag = NCF_ISDOTDOT;
1925 * Avoid blowout in namecache entries.
1928 * 1. filesystems may end up trying to add an already existing entry
1929 * (for example this can happen after a cache miss during concurrent
1930 * lookup), in which case we will call cache_negative_zap_one despite
1931 * not adding anything.
1932 * 2. the routine may fail to free anything and no provisions are made
1933 * to make it try harder (see the inside for failure modes)
1934 * 3. it only ever looks at negative entries.
1936 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1937 if (numneg * ncnegfactor > lnumcache) {
1938 cache_negative_zap_one();
1939 lnumcache = atomic_load_long(&numcache);
/* Hard cap: refuse the insertion rather than exceed ncsize. */
1941 if (__predict_false(lnumcache >= ncsize)) {
1942 atomic_subtract_long(&numcache, 1);
1943 counter_u64_add(numdrops, 1);
1947 cache_celockstate_init(&cel);
1952 * Calculate the hash key and setup as much of the new
1953 * namecache entry as possible before acquiring the lock.
1955 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1956 ncp->nc_flag = flag | NCF_WIP;
1959 cache_negative_init(ncp);
1962 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1963 ncp_ts->nc_time = *tsp;
1964 ncp_ts->nc_ticks = ticks;
1965 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1967 ncp_ts->nc_dotdottime = *dtsp;
1968 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1971 len = ncp->nc_nlen = cnp->cn_namelen;
1972 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1973 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1974 ncp->nc_name[len] = '\0';
1975 cache_enter_lock(&cel, dvp, vp, hash);
1978 * See if this vnode or negative entry is already in the cache
1979 * with this name. This can happen with concurrent lookups of
1980 * the same path name.
1982 ncpp = NCHHASH(hash);
1983 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
1984 if (n2->nc_dvp == dvp &&
1985 n2->nc_nlen == cnp->cn_namelen &&
1986 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1987 MPASS(cache_ncp_canuse(n2));
1988 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
1990 ("%s: found entry pointing to a different vnode (%p != %p)",
1991 __func__, NULL, vp));
1993 KASSERT(n2->nc_vp == vp,
1994 ("%s: found entry pointing to a different vnode (%p != %p)",
1995 __func__, n2->nc_vp, vp));
1997 * Entries are supposed to be immutable unless in the
1998 * process of getting destroyed. Accommodating for
1999 * changing timestamps is possible but not worth it.
2000 * This should be harmless in terms of correctness, in
2001 * the worst case resulting in an earlier expiration.
2002 * Alternatively, the found entry can be replaced
2005 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2008 KASSERT((n2->nc_flag & NCF_TS) != 0,
2010 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2011 n2_ts->nc_time = ncp_ts->nc_time;
2012 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2014 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2015 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2019 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2021 goto out_unlock_free;
2025 if (flag == NCF_ISDOTDOT) {
2027 * See if we are trying to add .. entry, but some other lookup
2028 * has populated v_cache_dd pointer already.
2030 if (dvp->v_cache_dd != NULL)
2031 goto out_unlock_free;
2032 KASSERT(vp == NULL || vp->v_type == VDIR,
2033 ("wrong vnode type %p", vp));
2034 vn_seqc_write_begin(dvp);
2035 dvp->v_cache_dd = ncp;
2036 vn_seqc_write_end(dvp);
2040 if (flag != NCF_ISDOTDOT) {
2042 * For this case, the cache entry maps both the
2043 * directory name in it and the name ".." for the
2044 * directory's parent.
2046 vn_seqc_write_begin(vp);
2047 if ((ndd = vp->v_cache_dd) != NULL) {
2048 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2049 cache_zap_locked(ndd);
2053 vp->v_cache_dd = ncp;
2054 vn_seqc_write_end(vp);
2055 } else if (vp->v_type != VDIR) {
2056 if (vp->v_cache_dd != NULL) {
2057 vn_seqc_write_begin(vp);
2058 vp->v_cache_dd = NULL;
2059 vn_seqc_write_end(vp);
/* Link into the source directory's per-vnode list. */
2064 if (flag != NCF_ISDOTDOT) {
2065 if (LIST_EMPTY(&dvp->v_cache_src)) {
2067 counter_u64_add(numcachehv, 1);
2069 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2073 * If the entry is "negative", we place it into the
2074 * "negative" cache queue, otherwise, we place it into the
2075 * destination vnode's cache entries queue.
2078 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2079 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2082 if (cnp->cn_flags & ISWHITEOUT)
2083 ncp->nc_flag |= NCF_WHITE;
2084 cache_negative_insert(ncp);
2085 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2090 * Insert the new namecache entry into the appropriate chain
2091 * within the cache entries table.
2093 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
/* Release fence: all initialization is visible before WIP clears. */
2095 atomic_thread_fence_rel();
2097 * Mark the entry as fully constructed.
2098 * It is immutable past this point until its removal.
2100 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2102 cache_enter_unlock(&cel);
2107 cache_enter_unlock(&cel);
2108 atomic_subtract_long(&numcache, 1);
/*
 * Round up to a power of two: the visible loop doubles res while
 * res <= val.  NOTE(review): the return statement is elided in this
 * extract — presumably it returns res; confirm against the full source.
 */
2114 cache_roundup_2(u_int val)
2118 for (res = 1; res <= val; res <<= 1)
/*
 * Allocate and initialize a name-cache hash table sized for the given
 * element count (power of two).  Returns the table and stores the
 * index mask in *hashmask.  Caller frees with ncfreetbl().
 */
2124 static struct nchashhead *
2125 nchinittbl(u_long elements, u_long *hashmask)
2127 struct nchashhead *hashtbl;
2130 hashsize = cache_roundup_2(elements) / 2;
2132 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2133 for (i = 0; i < hashsize; i++)
2134 CK_SLIST_INIT(&hashtbl[i]);
2135 *hashmask = hashsize - 1;
/*
 * Free a hash table allocated by nchinittbl().
 */
2140 ncfreetbl(struct nchashhead *hashtbl)
2143 free(hashtbl, M_VFSCACHE);
2147 * Name cache initialization, from vfs_init() when we are booting
/*
 * Boot-time setup: UMA zones (small/large, with and without
 * timestamps) registered for SMR, the main hash table, the
 * bucket/vnode lock arrays, and the negative-entry lists.
 */
2150 nchinit(void *dummy __unused)
2154 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2155 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2156 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2157 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2158 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2159 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2160 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2161 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2163 VFS_SMR_ZONE_SET(cache_zone_small);
2164 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2165 VFS_SMR_ZONE_SET(cache_zone_large);
2166 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2168 ncsize = desiredvnodes * ncsizefactor;
2169 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
/* Bucket lock count scales with ncpus^2, clamped to the table size. */
2170 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2171 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2173 if (ncbuckethash > nchash)
2174 ncbuckethash = nchash;
2175 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2177 for (i = 0; i < numbucketlocks; i++)
2178 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2179 ncvnodehash = ncbuckethash;
2180 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2182 for (i = 0; i < numvnodelocks; i++)
2183 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2185 for (i = 0; i < numneglists; i++) {
2186 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2187 TAILQ_INIT(&neglists[i].nl_list);
2188 TAILQ_INIT(&neglists[i].nl_hotlist);
2191 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2193 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
/*
 * Initialize the namecache fields of a freshly allocated vnode.
 */
2196 cache_vnode_init(struct vnode *vp)
2199 LIST_INIT(&vp->v_cache_src);
2200 TAILQ_INIT(&vp->v_cache_dst);
2201 vp->v_cache_dd = NULL;
/*
 * Resize the namecache hash table when the vnode limit changes.  With
 * every vnode and bucket lock held no entry can appear or disappear,
 * so all entries are simply rehashed from the old table into the new
 * one, after which the old table is freed.
 */
2206 cache_changesize(u_long newmaxvnodes)
2208 struct nchashhead *new_nchashtbl, *old_nchashtbl;
2209 u_long new_nchash, old_nchash;
2210 struct namecache *ncp;
2215 newncsize = newmaxvnodes * ncsizefactor;
2216 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2217 if (newmaxvnodes < numbucketlocks)
2218 newmaxvnodes = numbucketlocks;
2220 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2221 /* If same hash table size, nothing to do */
2222 if (nchash == new_nchash) {
2223 ncfreetbl(new_nchashtbl);
2227 * Move everything from the old hash table to the new table.
2228 * None of the namecache entries in the table can be removed
2229 * because to do so, they have to be removed from the hash table.
2231 cache_lock_all_vnodes();
2232 cache_lock_all_buckets();
2233 old_nchashtbl = nchashtbl;
2234 old_nchash = nchash;
2235 nchashtbl = new_nchashtbl;
2236 nchash = new_nchash;
2237 for (i = 0; i <= old_nchash; i++) {
2238 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2239 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2241 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2242 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2246 cache_unlock_all_buckets();
2247 cache_unlock_all_vnodes();
2248 ncfreetbl(old_nchashtbl);
2252 * Invalidate all entries from and to a particular vnode.
/*
 * Zaps every entry on vp's source list, destination list, and its
 * v_cache_dd dotdot entry, collecting them on a local queue for
 * deferred freeing after the locks are dropped.
 */
2255 cache_purge_impl(struct vnode *vp)
2257 TAILQ_HEAD(, namecache) ncps;
2258 struct namecache *ncp, *nnp;
2259 struct mtx *vlp, *vlp2;
2262 vlp = VP2VNODELOCK(vp);
2266 while (!LIST_EMPTY(&vp->v_cache_src)) {
2267 ncp = LIST_FIRST(&vp->v_cache_src);
2268 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2270 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2272 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2273 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2274 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2276 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2278 ncp = vp->v_cache_dd;
2280 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2281 ("lost dotdot link"));
2282 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2284 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2286 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
/* Free the collected entries outside the vnode locks. */
2290 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2296 * Opportunistic check to see if there is anything to do.
/* Unlocked peek at all three per-vnode namecache anchors. */
2299 cache_has_entries(struct vnode *vp)
2302 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2303 vp->v_cache_dd == NULL)
/*
 * Public purge entry point: skip the locked work entirely when the
 * vnode has no cache entries.
 */
2309 cache_purge(struct vnode *vp)
2312 SDT_PROBE1(vfs, namecache, purge, done, vp);
2313 if (!cache_has_entries(vp))
2315 cache_purge_impl(vp);
2319 * Only to be used by vgone.
/*
 * Purge for a doomed vnode.  After the first (possibly skipped) purge,
 * wait for any concurrent cache_purge() holder of the vnode lock to
 * drain, then re-check — entries observed afterwards indicate a racing
 * purge that must be completed here.
 */
2322 cache_purge_vgone(struct vnode *vp)
2326 VNPASS(VN_IS_DOOMED(vp), vp);
2327 if (cache_has_entries(vp)) {
2328 cache_purge_impl(vp);
2333 * Serialize against a potential thread doing cache_purge.
2335 vlp = VP2VNODELOCK(vp);
2336 mtx_wait_unlocked(vlp);
2337 if (cache_has_entries(vp)) {
2338 cache_purge_impl(vp);
2345 * Invalidate all negative entries for a particular directory vnode.
/*
 * Walks vp's source list under its vnode lock, zapping only entries
 * flagged NCF_NEGATIVE; freed outside the lock via a local queue.
 */
2348 cache_purge_negative(struct vnode *vp)
2350 TAILQ_HEAD(, namecache) ncps;
2351 struct namecache *ncp, *nnp;
2354 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2355 if (LIST_EMPTY(&vp->v_cache_src))
2358 vlp = VP2VNODELOCK(vp);
2360 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2361 if (!(ncp->nc_flag & NCF_NEGATIVE))
2363 cache_zap_negative_locked_vnode_kl(ncp, vp);
2364 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2367 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
/*
 * Namecache maintenance for a rename: all involved vnodes must already
 * be inside seqc write sections (asserted).  Removes the stale entry
 * for the target name from tdvp; the KASSERT variant documents that no
 * negative entry is expected to linger there.
 */
2373 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2374 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2377 ASSERT_VOP_IN_SEQC(fdvp);
2378 ASSERT_VOP_IN_SEQC(fvp);
2379 ASSERT_VOP_IN_SEQC(tdvp);
2381 ASSERT_VOP_IN_SEQC(tvp);
2386 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2387 ("%s: lingering negative entry", __func__));
2389 cache_remove_cnp(tdvp, tcnp);
2394 * Flush all entries referencing a particular filesystem.
/*
 * Iterates every vnode on the mount, skipping those with no cache
 * entries; acknowledged in the comment below as wasteful.
 */
2397 cache_purgevfs(struct mount *mp)
2399 struct vnode *vp, *mvp;
2401 SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2403 * Somewhat wasteful iteration over all vnodes. Would be better to
2404 * support filtering and avoid the interlock to begin with.
2406 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2407 if (!cache_has_entries(vp)) {
2419 * Perform canonical checks and cache lookup and pass on to filesystem
2420 * through the vop_cachedlookup only if needed.
/*
 * Generic VOP_LOOKUP implementation: validates the directory (type,
 * read-only checks for DELETE/RENAME on the last component, exec
 * permission), then consults the cache; only a miss reaches the
 * filesystem's VOP_CACHEDLOOKUP.
 */
2424 vfs_cache_lookup(struct vop_lookup_args *ap)
2428 struct vnode **vpp = ap->a_vpp;
2429 struct componentname *cnp = ap->a_cnp;
2430 int flags = cnp->cn_flags;
2435 if (dvp->v_type != VDIR)
2438 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2439 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2442 error = vn_dir_check_exec(dvp, cnp);
2446 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2448 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2454 /* Implementation of the getcwd syscall. */
/*
 * Clamps the user-supplied buffer length to [2, MAXPATHLEN], resolves
 * the cwd into a temporary namei buffer via vn_getcwd(), and copies
 * the result out to userspace.
 */
2456 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2462 buflen = uap->buflen;
2463 if (__predict_false(buflen < 2))
2465 if (buflen > MAXPATHLEN)
2466 buflen = MAXPATHLEN;
2468 buf = uma_zalloc(namei_zone, M_WAITOK);
2469 error = vn_getcwd(buf, &retbuf, &buflen);
2471 error = copyout(retbuf, uap->buf, buflen);
2472 uma_zfree(namei_zone, buf);
/*
 * Resolve the calling thread's cwd to a path relative to its root.
 * Tries the lockless SMR reverse lookup first, falling back to the
 * locked vn_fullpath_any(); optionally records the result for ktrace.
 */
2477 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2483 pwd = pwd_get_smr();
2484 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2486 VFS_SMR_ASSERT_NOT_ENTERED();
2488 pwd = pwd_hold(curthread);
2489 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2495 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
/*
 * Back end of realpathat(2): namei-resolves the path (following
 * symlinks, keeping the parent for hardlink handling), then rebuilds
 * the canonical path with vn_fullpath_hardlink() and copies it out.
 */
2502 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2503 size_t size, int flags, enum uio_seg pathseg)
2505 struct nameidata nd;
2506 char *retbuf, *freebuf;
2511 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2512 pathseg, path, fd, &cap_fstat_rights, td);
2513 if ((error = namei(&nd)) != 0)
2515 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2517 error = copyout(retbuf, buf, size);
2518 free(freebuf, M_TEMP);
/*
 * Syscall wrapper: forwards to kern___realpathat() with a userspace
 * path segment.
 */
2525 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2528 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2529 uap->flags, UIO_USERSPACE));
2533 * Retrieve the full filesystem path that corresponds to a vnode from the name
2534 * cache (if available)
/*
 * Resolves vp relative to the calling thread's root directory.  Tries
 * the lockless SMR path first, then the locked fallback.  On success
 * *freebuf holds the allocation the caller must free (M_TEMP).
 */
2537 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2544 if (__predict_false(vp == NULL))
2547 buflen = MAXPATHLEN;
2548 buf = malloc(buflen, M_TEMP, M_WAITOK);
2550 pwd = pwd_get_smr();
2551 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2552 VFS_SMR_ASSERT_NOT_ENTERED();
2554 pwd = pwd_hold(curthread);
2555 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2566 * This function is similar to vn_fullpath, but it attempts to lookup the
2567 * pathname relative to the global root mount point. This is required for the
2568 * auditing sub-system, as audited pathnames must be absolute, relative to the
2569 * global root mount point.
/* Same SMR-first/locked-fallback structure, rooted at rootvnode. */
2572 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2578 if (__predict_false(vp == NULL))
2580 buflen = MAXPATHLEN;
2581 buf = malloc(buflen, M_TEMP, M_WAITOK);
2583 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2584 VFS_SMR_ASSERT_NOT_ENTERED();
2586 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
/*
 * Find a non-dotdot entry on vp's destination list, i.e. an entry that
 * names vp in some parent directory — usable for reverse lookup.
 */
2595 static struct namecache *
2596 vn_dd_from_dst(struct vnode *vp)
2598 struct namecache *ncp;
2600 cache_assert_vnode_locked(vp);
2601 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2602 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
/*
 * One step of reverse path resolution: find *vp's name in its parent,
 * prepend it into buf (filled right-to-left, *buflen is the write
 * offset), and advance *vp to the parent.  Uses the namecache when
 * possible, otherwise falls back to VOP_VPTOCNP().
 */
2609 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2612 struct namecache *ncp;
2616 vlp = VP2VNODELOCK(*vp);
/* Prefer v_cache_dd when it holds a regular (non-dotdot) entry. */
2618 ncp = (*vp)->v_cache_dd;
2619 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2620 KASSERT(ncp == vn_dd_from_dst(*vp),
2621 ("%s: mismatch for dd entry (%p != %p)", __func__,
2622 ncp, vn_dd_from_dst(*vp)));
2624 ncp = vn_dd_from_dst(*vp);
2627 if (*buflen < ncp->nc_nlen) {
2630 counter_u64_add(numfullpathfail4, 1);
2632 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2636 *buflen -= ncp->nc_nlen;
2637 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2638 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
/* Cache miss: ask the filesystem for the name and parent. */
2647 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2650 vn_lock(*vp, LK_SHARED | LK_RETRY);
2651 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2654 counter_u64_add(numfullpathfail2, 1);
2655 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2660 if (VN_IS_DOOMED(dvp)) {
2661 /* forced unmount */
2664 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2668 * *vp has its use count incremented still.
2675 * Resolve a directory to a pathname.
2677 * The name of the directory can always be found in the namecache or fetched
2678 * from the filesystem. There is also guaranteed to be only one parent, meaning
2679 * we can just follow vnodes up until we find the root.
2681 * The vnode must be referenced.
/*
 * Walks vp upwards, prepending each component (via vn_vptocnp) and a
 * '/' separator into buf, crossing mount points at VV_ROOT vnodes,
 * until rdir or the global root is reached.  On success *retbuf points
 * into buf at the start of the assembled path.
 */
2684 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2685 size_t *len, bool slash_prefixed, size_t addend)
2687 #ifdef KDTRACE_HOOKS
2688 struct vnode *startvp = vp;
2694 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2695 VNPASS(vp->v_usecount > 0, vp);
2699 if (!slash_prefixed) {
2707 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2708 counter_u64_add(numfullpathcalls, 1);
2709 while (vp != rdir && vp != rootvnode) {
2711 * The vp vnode must be already fully constructed,
2712 * since it is either found in namecache or obtained
2713 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
2714 * without obtaining the vnode lock.
2716 if ((vp->v_vflag & VV_ROOT) != 0) {
2717 vn_lock(vp, LK_RETRY | LK_SHARED);
2720 * With the vnode locked, check for races with
2721 * unmount, forced or not. Note that we
2722 * already verified that vp is not equal to
2723 * the root vnode, which means that
2724 * mnt_vnodecovered can be NULL only for the
2727 if (VN_IS_DOOMED(vp) ||
2728 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2729 vp1->v_mountedhere != vp->v_mount) {
2732 SDT_PROBE3(vfs, namecache, fullpath, return,
/* Only directories can appear on the ancestor chain. */
2742 if (vp->v_type != VDIR) {
2744 counter_u64_add(numfullpathfail1, 1);
2746 SDT_PROBE3(vfs, namecache, fullpath, return,
2750 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2756 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2760 buf[--buflen] = '/';
2761 slash_prefixed = true;
/* Ensure the result starts with '/', failing with ENOMEM if full. */
2765 if (!slash_prefixed) {
2768 counter_u64_add(numfullpathfail4, 1);
2769 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2773 buf[--buflen] = '/';
2775 counter_u64_add(numfullpathfound, 1);
2778 *retbuf = buf + buflen;
2779 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2786 * Resolve an arbitrary vnode to a pathname.
2789 * - hardlinks are not tracked, thus if the vnode is not a directory this can
2790 * resolve to a different path than the one used to find it
2791 * - namecache is not mandatory, meaning names are not guaranteed to be added
2792 * (in which case resolving fails)
/*
 * Failure-reason bookkeeping for the SMR reverse lookup; the macro
 * records the source line of the first failure for the SDT miss probe.
 */
2794 static void __inline
2795 cache_rev_failed_impl(int *reason, int line)
2800 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
/*
 * Lockless (SMR) reverse path resolution.  Walks v_cache_dd links
 * upward, validating each vnode with seqc reads before and after use;
 * any inconsistency records a reason and returns with the miss probe,
 * letting the caller fall back to the locked path.  Requires the
 * vfs_smr section to be entered and cache_fast_revlookup enabled.
 */
2803 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2804 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2806 #ifdef KDTRACE_HOOKS
2807 struct vnode *startvp = vp;
2811 struct namecache *ncp;
2815 #ifdef KDTRACE_HOOKS
2818 seqc_t vp_seqc, tvp_seqc;
2821 VFS_SMR_ASSERT_ENTERED();
2823 if (!cache_fast_revlookup) {
2828 orig_buflen = *buflen;
2830 if (!slash_prefixed) {
2831 MPASS(*buflen >= 2);
2833 buf[*buflen] = '\0';
2836 if (vp == rdir || vp == rootvnode) {
2837 if (!slash_prefixed) {
2844 #ifdef KDTRACE_HOOKS
2848 ncp = NULL; /* for sdt probe down below */
2849 vp_seqc = vn_seqc_read_any(vp);
2850 if (seqc_in_modify(vp_seqc)) {
2851 cache_rev_failed(&reason);
2856 #ifdef KDTRACE_HOOKS
/* Mount-point crossing: hop to the covered vnode. */
2859 if ((vp->v_vflag & VV_ROOT) != 0) {
2860 mp = atomic_load_ptr(&vp->v_mount);
2862 cache_rev_failed(&reason);
2865 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2866 tvp_seqc = vn_seqc_read_any(tvp);
2867 if (seqc_in_modify(tvp_seqc)) {
2868 cache_rev_failed(&reason);
2871 if (!vn_seqc_consistent(vp, vp_seqc)) {
2872 cache_rev_failed(&reason);
2879 ncp = atomic_load_ptr(&vp->v_cache_dd);
2881 cache_rev_failed(&reason);
2884 nc_flag = atomic_load_char(&ncp->nc_flag);
2885 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2886 cache_rev_failed(&reason);
2889 if (!cache_ncp_canuse(ncp)) {
2890 cache_rev_failed(&reason);
2893 if (ncp->nc_nlen >= *buflen) {
2894 cache_rev_failed(&reason);
/* Prepend the component; buf is filled right-to-left. */
2898 *buflen -= ncp->nc_nlen;
2899 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2903 tvp_seqc = vn_seqc_read_any(tvp);
2904 if (seqc_in_modify(tvp_seqc)) {
2905 cache_rev_failed(&reason);
/* Confirm vp did not change while we copied from its entry. */
2908 if (!vn_seqc_consistent(vp, vp_seqc)) {
2909 cache_rev_failed(&reason);
2914 if (vp == rdir || vp == rootvnode)
2919 *retbuf = buf + *buflen;
2920 *buflen = orig_buflen - *buflen + addend;
2921 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2925 *buflen = orig_buflen;
2926 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
/*
 * Locked-path variant of "resolve an arbitrary vnode to a path".  A
 * non-directory vnode is first translated to its parent with
 * vn_vptocnp() (which also copies its name into buf), after which the
 * directory walk is delegated to vn_fullpath_dir().
 * NOTE(review): several original lines are elided from this listing.
 */
2932 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2936 bool slash_prefixed;
2942 orig_buflen = *buflen;
2945 slash_prefixed = false;
2946 if (vp->v_type != VDIR) {
2948 buf[*buflen] = '\0';
2949 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2958 slash_prefixed = true;
/* addend accounts for the bytes already consumed above. */
2961 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
2962 orig_buflen - *buflen));
2966 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2968 * Since the namecache does not track hardlinks, the caller is expected to first
2969 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2971 * Then we have 2 cases:
2972 * - if the found vnode is a directory, the path can be constructed just by
2973 * following names up the chain
2974 * - otherwise we populate the buffer with the saved name and start resolving
/*
 * Resolve the nameidata's target to a path.  For a non-directory target
 * the final component name saved by namei is seeded into the end of the
 * buffer first (addend = namelen + separator + NUL), then the directory
 * chain is walked.  Tries the SMR fast path and falls back to the
 * locked variant.  NOTE(review): many original lines are elided from
 * this listing.
 */
2978 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
2983 struct componentname *cnp;
2987 bool slash_prefixed;
2992 if (*buflen > MAXPATHLEN)
2993 *buflen = MAXPATHLEN;
2995 slash_prefixed = false;
2997 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3002 * Check for VBAD to work around the vp_crossmp bug in lookup().
3004 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3005 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3006 * If the type is VDIR (like in this very case) we can skip looking
3007 * at ni_dvp in the first place. However, since vnodes get passed here
3008 * unlocked the target may transition to doomed state (type == VBAD)
3009 * before we get to evaluate the condition. If this happens, we will
3010 * populate part of the buffer and descend to vn_fullpath_dir with
3011 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3013 * This should be atomic_load(&vp->v_type) but it is illegal to take
3014 * an address of a bit field, even if said field is sized to char.
3015 * Work around the problem by reading the value into a full-sized enum
3016 * and then re-reading it with atomic_load which will still prevent
3017 * the compiler from re-reading down the road.
3020 type = atomic_load_int(&type);
/* Seed "<name>\0" (with separator) at the end of the buffer. */
3027 addend = cnp->cn_namelen + 2;
3028 if (*buflen < addend) {
3033 tmpbuf = buf + *buflen;
3035 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3036 tmpbuf[addend - 1] = '\0';
3037 slash_prefixed = true;
/* First attempt: lockless walk under SMR. */
3042 pwd = pwd_get_smr();
3043 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3044 slash_prefixed, addend);
3045 VFS_SMR_ASSERT_NOT_ENTERED();
/* Fallback: locked walk with a held pwd reference. */
3047 pwd = pwd_hold(curthread);
3049 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3050 slash_prefixed, addend);
/*
 * Find the parent ("..") vnode of the locked directory vp via its
 * namecache entries and acquire it shared/NOWAIT.
 * NOTE(review): the ddvp assignment and return statements are elided
 * from this listing.
 */
3065 vn_dir_dd_ino(struct vnode *vp)
3067 struct namecache *ncp;
3072 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3073 vlp = VP2VNODELOCK(vp);
3075 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3076 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3079 vs = vget_prep(ddvp);
3081 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
/*
 * Copy one cached (non-"..") name of vp into buf, limited to at most
 * buflen - 1 bytes.  NOTE(review): lock handling and the return paths
 * are elided from this listing.
 */
3090 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3092 struct namecache *ncp;
3096 vlp = VP2VNODELOCK(vp);
3098 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3099 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3105 l = min(ncp->nc_nlen, buflen - 1);
3106 memcpy(buf, ncp->nc_name, l);
3113 * This function updates path string to vnode's full global path
3114 * and checks the size of the new path string against the pathlen argument.
3116 * Requires a locked, referenced vnode.
3117 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3119 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3120 * because it falls back to the ".." lookup if the namecache lookup fails.
/* See the contract in the block comment above this function. */
3123 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3126 struct nameidata nd;
3131 ASSERT_VOP_ELOCKED(vp, __func__);
3133 /* Construct global filesystem path from vp. */
3135 error = vn_fullpath_global(vp, &rpath, &fbuf);
/* The rewritten path must fit in the caller-supplied buffer. */
3142 if (strlen(rpath) >= pathlen) {
3144 error = ENAMETOOLONG;
3149 * Re-lookup the vnode by path to detect a possible rename.
3150 * As a side effect, the vnode is relocked.
3151 * If vnode was renamed, return ENOENT.
3153 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3154 UIO_SYSSPACE, path, td);
3160 NDFREE(&nd, NDF_ONLY_PNBUF);
/* Commit: overwrite the caller's path with the global one. */
3164 strcpy(path, rpath);
/*
 * DDB helper: walk from vp towards the root, printing each vnode's
 * address and either a cached name or a mount-point marker.
 * NOTE(review): parts of the loop body are elided from this listing.
 */
3177 db_print_vpath(struct vnode *vp)
3180 while (vp != NULL) {
3181 db_printf("%p: ", vp);
3182 if (vp == rootvnode) {
3186 if (vp->v_vflag & VV_ROOT) {
3187 db_printf("<mount point>");
3188 vp = vp->v_mount->mnt_vnodecovered;
3190 struct namecache *ncp;
3194 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3197 for (i = 0; i < ncp->nc_nlen; i++)
3198 db_printf("%c", *ncn++);
/*
 * DDB "show vpath <struct vnode *>" command; presumably delegates to
 * db_print_vpath() (call site elided from this listing).
 */
3211 DB_SHOW_COMMAND(vpath, db_show_vpath)
3216 db_printf("usage: show vpath <struct vnode *>\n");
3220 vp = (struct vnode *)addr;
3226 static bool __read_frequently cache_fast_lookup = true;
3227 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3228 &cache_fast_lookup, 0, "");
3230 #define CACHE_FPL_FAILED -2020
/*
 * Release the pathname buffer and poison the componentname pointers so
 * any stale use faults on NULL instead of reading freed memory.
 */
3233 cache_fpl_cleanup_cnp(struct componentname *cnp)
3236 uma_zfree(namei_zone, cnp->cn_pnbuf);
3238 cnp->cn_pnbuf = NULL;
3239 cnp->cn_nameptr = NULL;
/*
 * Absolute path: skip all leading slashes and start the walk at the
 * process root directory.
 */
3244 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3246 struct componentname *cnp;
3249 while (*(cnp->cn_nameptr) == '/') {
3254 *dpp = ndp->ni_rootdir;
3258 * Components of nameidata (or objects it can point to) which may
3259 * need restoring in case fast path lookup fails.
3261 struct nameidata_saved {
3269 struct nameidata *ndp;
3270 struct componentname *cnp;
3276 struct nameidata_saved snd;
3278 enum cache_fpl_status status:8;
/*
 * Snapshot the mutable nameidata fields so that a failed fast-path
 * attempt can be replayed by the regular lookup (undone with
 * cache_fpl_restore()).
 */
3284 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3287 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3288 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3289 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3290 snd->ni_pathlen = fpl->ndp->ni_pathlen;
/*
 * Undo fast-path mutations of nameidata using a snapshot taken by
 * cache_fpl_checkpoint(), so the slow path sees pristine state.
 */
3294 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3297 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3298 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3299 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3300 fpl->ndp->ni_pathlen = snd->ni_pathlen;
3304 #define cache_fpl_smr_assert_entered(fpl) ({ \
3305 struct cache_fpl *_fpl = (fpl); \
3306 MPASS(_fpl->in_smr == true); \
3307 VFS_SMR_ASSERT_ENTERED(); \
3309 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
3310 struct cache_fpl *_fpl = (fpl); \
3311 MPASS(_fpl->in_smr == false); \
3312 VFS_SMR_ASSERT_NOT_ENTERED(); \
3315 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3316 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3319 #define cache_fpl_smr_enter_initial(fpl) ({ \
3320 struct cache_fpl *_fpl = (fpl); \
3322 _fpl->in_smr = true; \
3325 #define cache_fpl_smr_enter(fpl) ({ \
3326 struct cache_fpl *_fpl = (fpl); \
3327 MPASS(_fpl->in_smr == false); \
3329 _fpl->in_smr = true; \
3332 #define cache_fpl_smr_exit(fpl) ({ \
3333 struct cache_fpl *_fpl = (fpl); \
3334 MPASS(_fpl->in_smr == true); \
3336 _fpl->in_smr = false; \
/*
 * Mark the fast lookup as aborted (the regular lookup will redo it from
 * scratch).  Only UNSET or PARTIAL may convert to ABORTED.  Returns
 * CACHE_FPL_FAILED so call sites can "return (cache_fpl_aborted(fpl))".
 */
3340 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3343 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3344 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3345 ("%s: converting to abort from %d at %d, set at %d\n",
3346 __func__, fpl->status, line, fpl->line));
3348 fpl->status = CACHE_FPL_STATUS_ABORTED;
3350 return (CACHE_FPL_FAILED);
3353 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
/*
 * Mark the lookup as partially completed: the slow path will resume
 * from the checkpointed state rather than from the beginning.  Must be
 * called while still inside SMR.  Returns CACHE_FPL_FAILED.
 */
3356 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3359 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3360 ("%s: setting to partial at %d, but already set to %d at %d\n",
3361 __func__, line, fpl->status, fpl->line));
3362 cache_fpl_smr_assert_entered(fpl);
3363 fpl->status = CACHE_FPL_STATUS_PARTIAL;
3365 return (CACHE_FPL_FAILED);
3368 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
/*
 * Mark the lookup as fully handled with the given errno; the fast path
 * answered the request and no fallback will run.  Must be called after
 * leaving SMR.
 */
3371 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3374 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3375 ("%s: setting to handled at %d, but already set to %d at %d\n",
3376 __func__, line, fpl->status, fpl->line));
3377 cache_fpl_smr_assert_not_entered(fpl);
3378 MPASS(error != CACHE_FPL_FAILED);
3379 fpl->status = CACHE_FPL_STATUS_HANDLED;
3384 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3386 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3387 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3388 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3390 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3391 (ISDOTDOT | MAKEENTRY | ISLASTCN)
3393 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3394 "supported and internal flags overlap");
/* True when the current component is the last one in the path. */
3397 cache_fpl_islastcn(struct nameidata *ndp)
3400 return (*ndp->ni_next == 0);
/* True when the current component is a literal "..". */
3404 cache_fpl_isdotdot(struct componentname *cnp)
3407 if (cnp->cn_namelen == 2 &&
3408 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
/*
 * Gate for the lockless lookup: abort up front when it is disabled via
 * sysctl, MAC lookup hooks are active, unsupported cn_flags are
 * requested, the thread is in capability mode or being audited, or a
 * start directory was supplied.  NOTE(review): the return statements
 * are elided from this listing.
 */
3414 cache_can_fplookup(struct cache_fpl *fpl)
3416 struct nameidata *ndp;
3417 struct componentname *cnp;
3422 td = cnp->cn_thread;
3424 if (!cache_fast_lookup) {
3425 cache_fpl_aborted(fpl);
3429 if (mac_vnode_check_lookup_enabled()) {
3430 cache_fpl_aborted(fpl);
3434 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3435 cache_fpl_aborted(fpl);
3438 if (IN_CAPABILITY_MODE(td)) {
3439 cache_fpl_aborted(fpl);
3442 if (AUDITING_TD(td)) {
3443 cache_fpl_aborted(fpl);
3446 if (ndp->ni_startdir != NULL) {
3447 cache_fpl_aborted(fpl);
/*
 * Translate ni_dirfd into a starting vnode under SMR; failure to do so
 * exits SMR and aborts the fast path.  Also latches whether the fd was
 * opened with O_SEARCH semantics (fsearch).
 */
3454 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3456 struct nameidata *ndp;
3461 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3462 if (__predict_false(error != 0)) {
3463 cache_fpl_smr_exit(fpl);
3464 return (cache_fpl_aborted(fpl));
3466 fpl->fsearch = fsearch;
/* The fast path handles every vnode type except symlinks. */
3471 cache_fplookup_vnode_supported(struct vnode *vp)
3474 return (vp->v_type != VLNK);
/*
 * A usable cold negative entry was found: leave SMR and try to promote
 * it to the hot list.  On success the lookup is answered with ENOENT;
 * otherwise the fast path is aborted.
 */
3477 static int __noinline
3478 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3481 struct componentname *cnp;
3487 cache_fpl_smr_exit(fpl);
3488 if (cache_negative_promote_cond(dvp, cnp, oncp, hash))
3489 return (cache_fpl_handled(fpl, ENOENT));
3491 return (cache_fpl_aborted(fpl));
3495 * The target vnode is not supported, prepare for the slow path to take over.
/*
 * Hand the lookup off to the regular code mid-path: pin pwd and dvp,
 * leave SMR, re-verify dvp's sequence counter, restore the checkpointed
 * nameidata state, and prime ni_startdir plus the internal cn_flags
 * (MAKEENTRY/ISLASTCN/ISDOTDOT) the slow path expects.
 */
3497 static int __noinline
3498 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3500 struct nameidata *ndp;
3501 struct componentname *cnp;
3511 dvp_seqc = fpl->dvp_seqc;
3513 if (!pwd_hold_smr(pwd)) {
3514 cache_fpl_smr_exit(fpl);
3515 return (cache_fpl_aborted(fpl));
3518 dvs = vget_prep_smr(dvp);
3519 cache_fpl_smr_exit(fpl);
3520 if (__predict_false(dvs == VGET_NONE)) {
3522 return (cache_fpl_aborted(fpl));
3525 vget_finish_ref(dvp, dvs);
3526 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3529 return (cache_fpl_aborted(fpl));
3532 cache_fpl_restore(fpl, &fpl->snd);
3534 ndp->ni_startdir = dvp;
3535 cnp->cn_flags |= MAKEENTRY;
3536 if (cache_fpl_islastcn(ndp))
3537 cnp->cn_flags |= ISLASTCN;
3538 if (cache_fpl_isdotdot(cnp))
3539 cnp->cn_flags |= ISDOTDOT;
/*
 * Secure the target vnode to finish the lookup: lock it per
 * LOCKLEAF/LOCKSHARED (or just take a reference), then re-verify its
 * sequence counter before declaring the lookup handled.
 */
3545 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3547 struct componentname *cnp;
3554 tvp_seqc = fpl->tvp_seqc;
3556 if ((cnp->cn_flags & LOCKLEAF) != 0) {
3557 lkflags = LK_SHARED;
3558 if ((cnp->cn_flags & LOCKSHARED) == 0)
3559 lkflags = LK_EXCLUSIVE;
3560 error = vget_finish(tvp, lkflags, tvs);
3561 if (__predict_false(error != 0)) {
3562 return (cache_fpl_aborted(fpl));
3565 vget_finish_ref(tvp, tvs);
/* Final consistency check; undo the lock on failure. */
3568 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3569 if ((cnp->cn_flags & LOCKLEAF) != 0)
3573 return (cache_fpl_aborted(fpl));
3576 return (cache_fpl_handled(fpl, 0));
3580 * They want to possibly modify the state of the namecache.
3582 * Don't try to match the API contract, just leave.
3583 * TODO: this leaves scalability on the table
/* Non-LOOKUP (modifying) nameiops always punt to the slow path. */
3586 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3588 struct componentname *cnp;
3591 MPASS(cnp->cn_nameiop != LOOKUP);
3592 return (cache_fpl_partial(fpl));
/*
 * Final step when the caller also wants the parent (LOCKPARENT or
 * WANTPARENT): secure both dvp and tvp, leave SMR, lock/ref dvp as
 * requested, re-validate dvp's sequence counter, then finish via
 * cache_fplookup_final_child().  All failures unwind both vnodes.
 */
3595 static int __noinline
3596 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3598 struct componentname *cnp;
3599 enum vgetstate dvs, tvs;
3600 struct vnode *dvp, *tvp;
3606 dvp_seqc = fpl->dvp_seqc;
3609 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3612 * This is less efficient than it can be for simplicity.
3614 dvs = vget_prep_smr(dvp);
3615 if (__predict_false(dvs == VGET_NONE)) {
3616 return (cache_fpl_aborted(fpl));
3618 tvs = vget_prep_smr(tvp);
3619 if (__predict_false(tvs == VGET_NONE)) {
3620 cache_fpl_smr_exit(fpl);
3621 vget_abort(dvp, dvs);
3622 return (cache_fpl_aborted(fpl));
3625 cache_fpl_smr_exit(fpl);
3627 if ((cnp->cn_flags & LOCKPARENT) != 0) {
3628 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3629 if (__predict_false(error != 0)) {
3630 vget_abort(tvp, tvs);
3631 return (cache_fpl_aborted(fpl));
3634 vget_finish_ref(dvp, dvs);
3637 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3638 vget_abort(tvp, tvs);
3639 if ((cnp->cn_flags & LOCKPARENT) != 0)
3643 return (cache_fpl_aborted(fpl));
3646 error = cache_fplookup_final_child(fpl, tvs);
3647 if (__predict_false(error != 0)) {
3648 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3649 if ((cnp->cn_flags & LOCKPARENT) != 0)
3656 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
/*
 * Last path component: dispatch modifying ops and parent-wanting
 * lookups to their helpers; otherwise prepare tvp, re-check dvp's
 * sequence counter, leave SMR and finish via
 * cache_fplookup_final_child().
 */
3661 cache_fplookup_final(struct cache_fpl *fpl)
3663 struct componentname *cnp;
3665 struct vnode *dvp, *tvp;
3670 dvp_seqc = fpl->dvp_seqc;
3673 VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3675 if (cnp->cn_nameiop != LOOKUP) {
3676 return (cache_fplookup_final_modifying(fpl));
3679 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3680 return (cache_fplookup_final_withparent(fpl));
3682 tvs = vget_prep_smr(tvp);
3683 if (__predict_false(tvs == VGET_NONE)) {
3684 return (cache_fpl_partial(fpl));
3687 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3688 cache_fpl_smr_exit(fpl);
3689 vget_abort(tvp, tvs);
3690 return (cache_fpl_aborted(fpl));
3693 cache_fpl_smr_exit(fpl);
3694 return (cache_fplookup_final_child(fpl, tvs));
/*
 * "." resolves to dvp itself; only the target sequence counter needs
 * to be (re)read before continuing.
 */
3697 static int __noinline
3698 cache_fplookup_dot(struct cache_fpl *fpl)
3705 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3706 if (seqc_in_modify(fpl->tvp_seqc)) {
3707 return (cache_fpl_aborted(fpl));
3710 counter_u64_add(dothits, 1);
3711 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
/*
 * Handle a ".." component.  Degenerate cases (jail root, ni_rootdir,
 * ni_topdir) stay at dvp.  A filesystem root would require the reverse
 * of the mount climb, so it aborts.  Otherwise the cached v_cache_dd
 * entry supplies the parent: nc_vp for a dotdot entry, nc_dvp for a
 * regular one.  NOTE(review): some lines are elided from this listing.
 */
3716 static int __noinline
3717 cache_fplookup_dotdot(struct cache_fpl *fpl)
3719 struct nameidata *ndp;
3720 struct componentname *cnp;
3721 struct namecache *ncp;
3731 * XXX this is racy the same way regular lookup is
3733 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3735 if (dvp == pr->pr_root)
3738 if (dvp == ndp->ni_rootdir ||
3739 dvp == ndp->ni_topdir ||
3743 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3744 if (seqc_in_modify(fpl->tvp_seqc)) {
3745 return (cache_fpl_aborted(fpl));
3750 if ((dvp->v_vflag & VV_ROOT) != 0) {
3753 * The opposite of climb mount is needed here.
3755 return (cache_fpl_aborted(fpl));
3758 ncp = atomic_load_ptr(&dvp->v_cache_dd);
3760 return (cache_fpl_aborted(fpl));
3763 nc_flag = atomic_load_char(&ncp->nc_flag);
3764 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3765 if ((nc_flag & NCF_NEGATIVE) != 0)
3766 return (cache_fpl_aborted(fpl));
3767 fpl->tvp = ncp->nc_vp;
3769 fpl->tvp = ncp->nc_dvp;
3772 if (__predict_false(!cache_ncp_canuse(ncp))) {
3773 return (cache_fpl_aborted(fpl));
3776 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3777 if (seqc_in_modify(fpl->tvp_seqc)) {
3778 return (cache_fpl_partial(fpl));
3781 counter_u64_add(dotdothits, 1);
/*
 * Core per-component step: hash the name and scan the nchash chain for
 * a (dvp, name) entry.  A missing entry or any unstable state punts to
 * the slow path.  Usable negative entries answer ENOENT (cold ones are
 * first promoted); positive entries become the new tvp after sequence
 * counter and vnode-type checks.
 * NOTE(review): some lines are elided from this listing.
 */
3786 cache_fplookup_next(struct cache_fpl *fpl)
3788 struct componentname *cnp;
3789 struct namecache *ncp;
3790 struct negstate *ns;
3791 struct vnode *dvp, *tvp;
3799 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3800 return (cache_fplookup_dot(fpl));
3803 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3805 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3806 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3807 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3812 * If there is no entry we have to punt to the slow path to perform
3813 * actual lookup. Should there be nothing with this name a negative
3814 * entry will be created.
3816 if (__predict_false(ncp == NULL)) {
3817 return (cache_fpl_partial(fpl));
3820 tvp = atomic_load_ptr(&ncp->nc_vp);
3821 nc_flag = atomic_load_char(&ncp->nc_flag);
3822 if ((nc_flag & NCF_NEGATIVE) != 0) {
3824 * If they want to create an entry we need to replace this one.
3826 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3827 return (cache_fpl_partial(fpl));
3829 ns = NCP2NEGSTATE(ncp);
3830 neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
3831 if (__predict_false(!cache_ncp_canuse(ncp))) {
3832 return (cache_fpl_partial(fpl));
3834 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3835 return (cache_fpl_partial(fpl));
3838 return (cache_fplookup_negative_promote(fpl, ncp, hash));
3840 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3842 counter_u64_add(numneghits, 1);
3843 cache_fpl_smr_exit(fpl);
3844 return (cache_fpl_handled(fpl, ENOENT));
3847 if (__predict_false(!cache_ncp_canuse(ncp))) {
3848 return (cache_fpl_partial(fpl));
3852 fpl->tvp_seqc = vn_seqc_read_any(tvp);
3853 if (seqc_in_modify(fpl->tvp_seqc)) {
3854 return (cache_fpl_partial(fpl));
3857 if (!cache_fplookup_vnode_supported(tvp)) {
3858 return (cache_fpl_partial(fpl));
3861 counter_u64_add(numposhits, 1);
3862 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
/* Filesystems opt in to lockless lookup by setting MNTK_FPLOOKUP. */
3867 cache_fplookup_mp_supported(struct mount *mp)
3872 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3878 * Walk up the mount stack (if any).
3880 * Correctness is provided in the following ways:
3881 * - all vnodes are protected from freeing with SMR
3882 * - struct mount objects are type stable making them always safe to access
3883 * - stability of the particular mount is provided by busying it
3884 * - relationship between the vnode which is mounted on and the mount is
3885 * verified with the vnode sequence counter after busying
3886 * - association between root vnode of the mount and the mount is protected
3889 * From that point on we can read the sequence counter of the root vnode
3890 * and get the next mount on the stack (if any) using the same protection.
3892 * By the end of successful walk we are guaranteed the reached state was
3893 * indeed present at least at some point which matches the regular lookup.
/*
 * See the walk-up commentary above: for each mount stacked on tvp,
 * enter the mount in crit mode, re-check the covered vnode's sequence
 * counter and the mount's fast-lookup support, then move to its root
 * vnode — repeating while that root is itself mounted on.  Any
 * instability punts to the slow path.
 */
3895 static int __noinline
3896 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3898 struct mount *mp, *prev_mp;
3903 vp_seqc = fpl->tvp_seqc;
3905 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3906 mp = atomic_load_ptr(&vp->v_mountedhere);
3912 if (!vfs_op_thread_enter_crit(mp)) {
3913 if (prev_mp != NULL)
3914 vfs_op_thread_exit_crit(prev_mp);
3915 return (cache_fpl_partial(fpl));
3917 if (prev_mp != NULL)
3918 vfs_op_thread_exit_crit(prev_mp);
3919 if (!vn_seqc_consistent(vp, vp_seqc)) {
3920 vfs_op_thread_exit_crit(mp);
3921 return (cache_fpl_partial(fpl));
3923 if (!cache_fplookup_mp_supported(mp)) {
3924 vfs_op_thread_exit_crit(mp);
3925 return (cache_fpl_partial(fpl));
3927 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3928 if (vp == NULL || VN_IS_DOOMED(vp)) {
3929 vfs_op_thread_exit_crit(mp);
3930 return (cache_fpl_partial(fpl));
3932 vp_seqc = vn_seqc_read_any(vp);
3933 if (seqc_in_modify(vp_seqc)) {
3934 vfs_op_thread_exit_crit(mp);
3935 return (cache_fpl_partial(fpl));
3938 mp = atomic_load_ptr(&vp->v_mountedhere);
3943 vfs_op_thread_exit_crit(prev_mp);
3945 fpl->tvp_seqc = vp_seqc;
/*
 * Cheap test for whether something is mounted on tvp; v_mountedhere is
 * usually NULL, so the common case avoids the climb entirely.
 */
3950 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3958 * Hack: while this is a union, the pointer tends to be NULL so save on
3961 mp = atomic_load_ptr(&vp->v_mountedhere);
3964 if (vp->v_type == VDIR)
3972 * The code was originally copy-pasted from regular lookup and despite
3973 * clean ups leaves performance on the table. Any modifications here
3974 * must take into account that in case of fallback the resulting
3975 * nameidata state has to be compatible with the original.
/*
 * Extract the next component from the path buffer; mirrors the regular
 * lookup parser (see the comment above) and must stay compatible with
 * it for fallback.  Trailing-slash and degenerate-name corner cases
 * punt to the slow path rather than mutating userspace-derived data.
 */
3978 cache_fplookup_parse(struct cache_fpl *fpl)
3980 struct nameidata *ndp;
3981 struct componentname *cnp;
3988 * Search a new directory.
3990 * The last component of the filename is left accessible via
3991 * cnp->cn_nameptr for callers that need the name. Callers needing
3992 * the name set the SAVENAME flag. When done, they assume
3993 * responsibility for freeing the pathname buffer.
3995 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3997 cnp->cn_namelen = cp - cnp->cn_nameptr;
3998 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
3999 cache_fpl_smr_exit(fpl);
4000 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4002 ndp->ni_pathlen -= cnp->cn_namelen;
4003 KASSERT(ndp->ni_pathlen <= PATH_MAX,
4004 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4008 * Replace multiple slashes by a single slash and trailing slashes
4009 * by a null. This must be done before VOP_LOOKUP() because some
4010 * fs's don't know about trailing slashes. Remember if there were
4011 * trailing slashes to handle symlinks, existing non-directories
4012 * and non-existing files that won't be directories specially later.
4014 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4020 * Regular lookup performs the following:
4021 * *ndp->ni_next = '\0';
4022 * cnp->cn_flags |= TRAILINGSLASH;
4024 * Which is problematic since it modifies data read
4025 * from userspace. Then if fast path lookup was to
4026 * abort we would have to either restore it or convey
4027 * the flag. Since this is a corner case just ignore
4028 * it for simplicity.
4030 return (cache_fpl_partial(fpl));
4036 * Check for degenerate name (e.g. / or "")
4037 * which is a way of talking about a directory,
4038 * e.g. like "/." or ".".
4041 * Another corner case handled by the regular lookup
4043 if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4044 return (cache_fpl_partial(fpl));
/*
 * Move cn_nameptr to the next component, skipping any separating
 * slashes.
 */
4050 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4052 struct nameidata *ndp;
4053 struct componentname *cnp;
4058 cnp->cn_nameptr = ndp->ni_next;
4059 while (*cnp->cn_nameptr == '/') {
4066 * See the API contract for VOP_FPLOOKUP_VEXEC.
/*
 * Classify a VOP_FPLOOKUP_VEXEC failure: the non-directory case is
 * turned into ENOTDIR, O_SEARCH descriptors abort to the slow path,
 * and otherwise the error is either handed back as handled or the
 * lookup is aborted/made partial depending on dvp's sequence counter.
 * NOTE(review): some branch bodies are elided from this listing.
 */
4068 static int __noinline
4069 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4071 struct componentname *cnp;
4077 dvp_seqc = fpl->dvp_seqc;
4080 * Hack: they may be looking up foo/bar, where foo is a
4081 * regular file. In such a case we need to turn ENOTDIR,
4082 * but we may happen to get here with a different error.
4084 if (dvp->v_type != VDIR) {
4086 * The check here is predominantly to catch
4087 * EOPNOTSUPP from dead_vnodeops. If the vnode
4088 * gets doomed past this point it is going to
4089 * fail seqc verification.
4091 if (VN_IS_DOOMED(dvp)) {
4092 return (cache_fpl_aborted(fpl));
4098 * Hack: handle O_SEARCH.
4100 * Open Group Base Specifications Issue 7, 2018 edition states:
4101 * If the access mode of the open file description associated with the
4102 * file descriptor is not O_SEARCH, the function shall check whether
4103 * directory searches are permitted using the current permissions of
4104 * the directory underlying the file descriptor. If the access mode is
4105 * O_SEARCH, the function shall not perform the check.
4107 * Regular lookup tests for the NOEXECCHECK flag for every path
4108 * component to decide whether to do the permission check. However,
4109 * since most lookups never have the flag (and when they do it is only
4110 * present for the first path component), lockless lookup only acts on
4111 * it if there is a permission problem. Here the flag is represented
4112 * with a boolean so that we don't have to clear it on the way out.
4114 * For simplicity this always aborts.
4115 * TODO: check if this is the first lookup and ignore the permission
4116 * problem. Note the flag has to survive fallback (if it happens to be
4120 return (cache_fpl_aborted(fpl));
4125 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4126 error = cache_fpl_aborted(fpl);
4128 cache_fpl_partial(fpl);
4132 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4133 error = cache_fpl_aborted(fpl);
4135 cache_fpl_smr_exit(fpl);
4136 cache_fpl_handled(fpl, error);
/*
 * Main fast-path loop.  After validating dvp and its mount, each
 * iteration parses a component, runs the lockless VEXEC check,
 * resolves the component (dotdot helper or hash lookup), climbs any
 * mount stacked on the result, and either finishes on the last
 * component or advances dvp = tvp after re-checking dvp's sequence
 * counter.  The tail switch translates fpl->status into the return
 * protocol shared with cache_fplookup().
 * NOTE(review): some lines are elided from this listing.
 */
4144 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4146 struct nameidata *ndp;
4147 struct componentname *cnp;
4151 error = CACHE_FPL_FAILED;
4155 cache_fpl_checkpoint(fpl, &fpl->snd);
4158 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4159 if (seqc_in_modify(fpl->dvp_seqc)) {
4160 cache_fpl_aborted(fpl);
4163 mp = atomic_load_ptr(&fpl->dvp->v_mount);
4164 if (!cache_fplookup_mp_supported(mp)) {
4165 cache_fpl_aborted(fpl);
4169 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4172 error = cache_fplookup_parse(fpl);
4173 if (__predict_false(error != 0)) {
4177 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4179 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4180 if (__predict_false(error != 0)) {
4181 error = cache_fplookup_failed_vexec(fpl, error);
4185 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4186 error = cache_fplookup_dotdot(fpl);
4187 if (__predict_false(error != 0)) {
4191 error = cache_fplookup_next(fpl);
4192 if (__predict_false(error != 0)) {
4196 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4198 if (cache_fplookup_need_climb_mount(fpl)) {
4199 error = cache_fplookup_climb_mount(fpl);
4200 if (__predict_false(error != 0)) {
4206 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4208 if (cache_fpl_islastcn(ndp)) {
4209 error = cache_fplookup_final(fpl);
/* Advance: the target becomes the directory for the next component. */
4213 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4214 error = cache_fpl_aborted(fpl);
4218 fpl->dvp = fpl->tvp;
4219 fpl->dvp_seqc = fpl->tvp_seqc;
4221 cache_fplookup_parse_advance(fpl);
4222 cache_fpl_checkpoint(fpl, &fpl->snd);
4225 switch (fpl->status) {
4226 case CACHE_FPL_STATUS_UNSET:
4227 __assert_unreachable();
4229 case CACHE_FPL_STATUS_PARTIAL:
4230 cache_fpl_smr_assert_entered(fpl);
4231 return (cache_fplookup_partial_setup(fpl));
4232 case CACHE_FPL_STATUS_ABORTED:
4234 cache_fpl_smr_exit(fpl);
4235 return (CACHE_FPL_FAILED);
4236 case CACHE_FPL_STATUS_HANDLED:
4237 MPASS(error != CACHE_FPL_FAILED);
4238 cache_fpl_smr_assert_not_entered(fpl);
4239 if (__predict_false(error != 0)) {
4242 cache_fpl_cleanup_cnp(cnp);
4245 ndp->ni_dvp = fpl->dvp;
4246 ndp->ni_vp = fpl->tvp;
4247 if (cnp->cn_flags & SAVENAME)
4248 cnp->cn_flags |= HASBUF;
4250 cache_fpl_cleanup_cnp(cnp);
4256 * Fast path lookup protected with SMR and sequence counters.
4258 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4260 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4263 * Traditional vnode lookup conceptually looks like this:
4269 * vn_unlock(current);
4276 * Each jump to the next vnode is safe memory-wise and atomic with respect to
4277 * any modifications thanks to holding respective locks.
4279 * The same guarantee can be provided with a combination of safe memory
4280 * reclamation and sequence counters instead. If all operations which affect
4281 * the relationship between the current vnode and the one we are looking for
4282 * also modify the counter, we can verify whether all the conditions held as
4283 * we made the jump. This includes things like permissions, mount points etc.
4284 * Counter modification is provided by enclosing relevant places in
4285 * vn_seqc_write_begin()/end() calls.
4287 * Thus this translates to:
4290 * dvp_seqc = seqc_read_any(dvp);
4291 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4295 * tvp_seqc = seqc_read_any(tvp);
4296 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4298 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4300 * dvp = tvp; // we know nothing of importance has changed
4301 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4305 * vget(); // secure the vnode
4306 * if (!seqc_consistent(tvp, tvp_seqc) // final check
4308 * // at this point we know nothing has changed for any parent<->child pair
4309 * // as they were crossed during the lookup, meaning we matched the guarantee
4310 * // of the locked variant
4313 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4314 * - they are called while within vfs_smr protection which they must never exit
4315 * - EAGAIN can be returned to denote checking could not be performed, it is
4316 * always valid to return it
4317 * - if the sequence counter has not changed the result must be valid
4318 * - if the sequence counter has changed both false positives and false negatives
4319 * are permitted (since the result will be rejected later)
4320 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4322 * Caveats to watch out for:
4323 * - vnodes are passed unlocked and unreferenced with nothing stopping
4324 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4325 * to use atomic_load_ptr to fetch it.
4326 * - the aforementioned object can also get freed, meaning absent other means it
4327 * should be protected with vfs_smr
4328 * - either safely checking permissions as they are modified or guaranteeing
4329 * their stability is left to the routine
4332 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4335 struct cache_fpl fpl;
4338 struct componentname *cnp;
4339 struct nameidata_saved orig;
4342 MPASS(ndp->ni_lcf == 0);
4344 fpl.status = CACHE_FPL_STATUS_UNSET;
4346 fpl.cnp = &ndp->ni_cnd;
4347 MPASS(curthread == fpl.cnp->cn_thread);
4349 if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4350 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4352 if (!cache_can_fplookup(&fpl)) {
4353 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4354 *status = fpl.status;
4355 return (EOPNOTSUPP);
4358 cache_fpl_checkpoint(&fpl, &orig);
4360 cache_fpl_smr_enter_initial(&fpl);
4361 fpl.fsearch = false;
4362 pwd = pwd_get_smr();
4364 ndp->ni_rootdir = pwd->pwd_rdir;
4365 ndp->ni_topdir = pwd->pwd_jdir;
4368 cnp->cn_nameptr = cnp->cn_pnbuf;
4369 if (cnp->cn_pnbuf[0] == '/') {
4370 cache_fpl_handle_root(ndp, &dvp);
4372 if (ndp->ni_dirfd == AT_FDCWD) {
4373 dvp = pwd->pwd_cdir;
4375 error = cache_fplookup_dirfd(&fpl, &dvp);
4376 if (__predict_false(error != 0)) {
4382 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4384 error = cache_fplookup_impl(dvp, &fpl);
4386 cache_fpl_smr_assert_not_entered(&fpl);
4387 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4389 *status = fpl.status;
4390 switch (fpl.status) {
4391 case CACHE_FPL_STATUS_UNSET:
4392 __assert_unreachable();
4394 case CACHE_FPL_STATUS_HANDLED:
4395 SDT_PROBE3(vfs, namei, lookup, return, error,
4396 (error == 0 ? ndp->ni_vp : NULL), true);
4398 case CACHE_FPL_STATUS_PARTIAL:
4401 * Status restored by cache_fplookup_partial_setup.
4404 case CACHE_FPL_STATUS_ABORTED:
4405 cache_fpl_restore(&fpl, &orig);