2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
41 #include "opt_ktrace.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
58 #include <sys/rwlock.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/sysproto.h>
66 #include <sys/vnode.h>
69 #include <sys/ktrace.h>
72 #include <sys/capsicum.h>
74 #include <security/audit/audit.h>
75 #include <security/mac/mac_framework.h>
83 SDT_PROVIDER_DECLARE(vfs);
84 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
86 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
88 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
89 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
90 "char *", "struct vnode *");
91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
93 "struct vnode *", "char *");
94 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
96 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
97 "struct vnode *", "char *");
98 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
100 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
101 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
102 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
103 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
105 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
107 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
110 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
111 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
112 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
115 * This structure describes the elements in the cache of recent
116 * names looked up by namei.
121 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
122 "the state must fit in a union with a pointer without growing it");
125 LIST_ENTRY(namecache) nc_src; /* source vnode list */
126 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
127 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
128 struct vnode *nc_dvp; /* vnode of parent of name */
130 struct vnode *nu_vp; /* vnode the name refers to */
131 struct negstate nu_neg;/* negative entry state */
133 u_char nc_flag; /* flag bits */
134 u_char nc_nlen; /* length of name */
135 char nc_name[0]; /* segment name + nul */
139 * struct namecache_ts repeats struct namecache layout up to the
141 * struct namecache_ts is used in place of struct namecache when time(s) need
142 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
143 * both a non-dotdot directory name and dotdot for the directory's
146 * See below for alignment requirement.
148 struct namecache_ts {
149 struct timespec nc_time; /* timespec provided by fs */
150 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
151 int nc_ticks; /* ticks value when entry was added */
152 struct namecache nc_nc;
156 * At least mips n32 performs 64-bit accesses to timespec as found
157 * in namecache_ts and requires them to be aligned. Since others
158 * may be in the same spot, suffer a little bit and enforce the
159 * alignment for everyone. Note this is a nop for 64-bit platforms.
161 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
163 #define nc_vp n_un.nu_vp
164 #define nc_neg n_un.nu_neg
167 * Flags in namecache.nc_flag
169 #define NCF_WHITE 0x01
170 #define NCF_ISDOTDOT 0x02
173 #define NCF_DVDROP 0x10
174 #define NCF_NEGATIVE 0x20
175 #define NCF_INVALID 0x40
179 * Flags in negstate.neg_flag
184 * Mark an entry as invalid.
186 * This is called before it starts getting deconstructed.
189 cache_ncp_invalidate(struct namecache *ncp)
192 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
193 ("%s: entry %p already invalid", __func__, ncp));
194 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
195 atomic_thread_fence_rel();
199 * Check whether the entry can be safely used.
201 * All places which elide locks are supposed to call this after they are
202 * done with reading from an entry.
205 cache_ncp_canuse(struct namecache *ncp)
208 atomic_thread_fence_acq();
209 return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
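/*
 * Illustrative sketch of the expected lockless reader (an assumption of this
 * note, not part of the original code): reads of the entry happen first and
 * the acquire fence in cache_ncp_canuse() pairs with the release fence in
 * cache_ncp_invalidate(), so a positive answer means the fields read were not
 * being torn down mid-read.
 *
 *	vfs_smr_enter();
 *	ncp = <find a candidate entry in the hash chain>;
 *	vp = ncp->nc_vp;			read whatever is needed
 *	if (!cache_ncp_canuse(ncp))		validate only after reading
 *		<fall back to the locked lookup>;
 *	vfs_smr_exit();
 */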
213 * Name caching works as follows:
215 * Names found by directory scans are retained in a cache
216 * for future reference. It is managed LRU, so frequently
217 * used names will hang around. The cache is indexed by a hash value
218 * obtained from (dvp, name) where dvp refers to the directory
221 * If it is a "negative" entry, (i.e. for a name that is known NOT to
222 * exist) the vnode pointer will be NULL.
224 * Upon reaching the last segment of a path, if the reference
225 * is for DELETE, or NOCACHE is set (rewrite), and the
226 * name is located in the cache, it will be dropped.
228 * These locks are used (in the order in which they can be taken):
230 * vnodelock mtx vnode lists and v_cache_dd field protection
231 * bucketlock rwlock for access to given set of hash buckets
232 * neglist mtx negative entry LRU management
234 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
235 * shrinking the LRU list.
237 * It is legal to take multiple vnodelock and bucketlock locks. The locking
238 * order is lower address first. Both are recursive.
240 * "." lookups are lockless.
242 * ".." and vnode -> name lookups require vnodelock.
244 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
246 * Insertions and removals of entries require involved vnodes and bucketlocks
247 * to be write-locked to prevent other threads from seeing the entry.
249 * Some lookups result in removal of the found entry (e.g. getting rid of a
250 * negative entry with the intent to create a positive one), which poses a
251 * problem when multiple threads reach the same state. Similarly, two different
252 * threads can purge two different vnodes and try to remove the same name.
254 * If the already held vnode lock is lower than the second required lock, we
255 * can just take the other lock. However, in the opposite case, this could
256 * deadlock. As such, this is resolved by trylocking and, if that fails, unlocking
257 * the first node, locking everything in order and revalidating the state.
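/*
 * A minimal sketch of the resolution described above (illustrative only; the
 * real handling lives in the cache_zap_* helpers below).  "held" is the vnode
 * lock already owned, "other" is the one still needed:
 *
 *	if (held < other) {
 *		mtx_lock(other);
 *	} else if (!mtx_trylock(other)) {
 *		mtx_unlock(held);
 *		mtx_lock(other);		lock in address order
 *		mtx_lock(held);
 *		<revalidate; the entry may be gone by now>
 *	}
 */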
263 * Structures associated with name caching.
265 #define NCHHASH(hash) \
266 (&nchashtbl[(hash) & nchash])
267 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
268 static u_long __read_mostly nchash; /* size of hash table */
269 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
270 "Size of namecache hash table");
271 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
272 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
273 "Ratio of negative namecache entries");
274 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
275 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
276 u_int ncsizefactor = 2;
277 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
278 "Size factor for namecache");
279 static u_int __read_mostly ncpurgeminvnodes;
280 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
281 "Number of vnodes below which purgevfs ignores the request");
282 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
284 struct nchstats nchstats; /* cache effectiveness statistics */
286 static struct mtx __exclusive_cache_line ncneg_shrink_lock;
290 TAILQ_HEAD(, namecache) nl_list;
291 } __aligned(CACHE_LINE_SIZE);
293 static struct neglist __read_mostly *neglists;
294 static struct neglist ncneg_hot;
295 static u_long numhotneg;
298 #define numneglists (ncneghash + 1)
299 static inline struct neglist *
300 NCP2NEGLIST(struct namecache *ncp)
303 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
306 static inline struct negstate *
307 NCP2NEGSTATE(struct namecache *ncp)
310 MPASS(ncp->nc_flag & NCF_NEGATIVE);
311 return (&ncp->nc_neg);
314 #define numbucketlocks (ncbuckethash + 1)
315 static u_int __read_mostly ncbuckethash;
316 static struct rwlock_padalign __read_mostly *bucketlocks;
317 #define HASH2BUCKETLOCK(hash) \
318 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
320 #define numvnodelocks (ncvnodehash + 1)
321 static u_int __read_mostly ncvnodehash;
322 static struct mtx __read_mostly *vnodelocks;
323 static inline struct mtx *
324 VP2VNODELOCK(struct vnode *vp)
327 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
331 * UMA zones for the VFS cache.
333 * The small cache is used for entries with short names, which are the
334 * most common. The large cache is used for entries which are too big to
335 * fit in the small cache.
337 static uma_zone_t __read_mostly cache_zone_small;
338 static uma_zone_t __read_mostly cache_zone_small_ts;
339 static uma_zone_t __read_mostly cache_zone_large;
340 static uma_zone_t __read_mostly cache_zone_large_ts;
342 #define CACHE_PATH_CUTOFF 35
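/*
 * For illustration (derived from nchinit() below, not additional policy):
 * names of up to CACHE_PATH_CUTOFF bytes are served from the small zones and
 * longer ones from the large zones, i.e. the allocation sizes are
 *
 *	small:		sizeof(struct namecache)    + CACHE_PATH_CUTOFF + 1
 *	small_ts:	sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1
 *	large:		sizeof(struct namecache)    + NAME_MAX + 1
 *	large_ts:	sizeof(struct namecache_ts) + NAME_MAX + 1
 */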
344 static struct namecache *
345 cache_alloc(int len, int ts)
347 struct namecache_ts *ncp_ts;
348 struct namecache *ncp;
350 if (__predict_false(ts)) {
351 if (len <= CACHE_PATH_CUTOFF)
352 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
354 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
355 ncp = &ncp_ts->nc_nc;
357 if (len <= CACHE_PATH_CUTOFF)
358 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
360 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
366 cache_free(struct namecache *ncp)
368 struct namecache_ts *ncp_ts;
372 if ((ncp->nc_flag & NCF_DVDROP) != 0)
374 if (__predict_false(ncp->nc_flag & NCF_TS)) {
375 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
376 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
377 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
379 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
381 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
382 uma_zfree_smr(cache_zone_small, ncp);
384 uma_zfree_smr(cache_zone_large, ncp);
389 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
391 struct namecache_ts *ncp_ts;
393 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
394 (tsp == NULL && ticksp == NULL),
397 if (tsp == NULL && ticksp == NULL)
400 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
402 *tsp = ncp_ts->nc_time;
404 *ticksp = ncp_ts->nc_ticks;
408 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
409 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
410 "VFS namecache enabled");
413 /* Export size information to userland */
414 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
415 sizeof(struct namecache), "sizeof(struct namecache)");
418 * The new name cache statistics
420 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
421 "Name cache statistics");
422 #define STATNODE_ULONG(name, descr) \
423 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
424 #define STATNODE_COUNTER(name, descr) \
425 static COUNTER_U64_DEFINE_EARLY(name); \
426 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
428 STATNODE_ULONG(numneg, "Number of negative cache entries");
429 STATNODE_ULONG(numcache, "Number of cache entries");
430 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
431 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
432 STATNODE_COUNTER(dothits, "Number of '.' hits");
433 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
434 STATNODE_COUNTER(nummiss, "Number of cache misses");
435 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
436 STATNODE_COUNTER(numposzaps,
437 "Number of cache hits (positive) we do not want to cache");
438 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
439 STATNODE_COUNTER(numnegzaps,
440 "Number of cache hits (negative) we do not want to cache");
441 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
442 /* These count for vn_getcwd(), too. */
443 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
444 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
445 STATNODE_COUNTER(numfullpathfail2,
446 "Number of fullpath search errors (VOP_VPTOCNP failures)");
447 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
448 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
449 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
450 "Number of successful removals after relocking");
451 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
452 "Number of times zap_and_exit failed to lock");
453 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
454 "Number of times zap_and_exit failed to lock");
455 static long cache_lock_vnodes_cel_3_failures;
456 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
457 "Number of times 3-way vnode locking failed");
458 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
459 STATNODE_COUNTER(numneg_evicted,
460 "Number of negative entries evicted when adding a new entry");
461 STATNODE_COUNTER(shrinking_skipped,
462 "Number of times shrinking was already in progress");
464 static void cache_zap_locked(struct namecache *ncp);
465 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
466 char **freebuf, size_t *buflen);
467 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
468 char *buf, char **retbuf, size_t *buflen);
469 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
470 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
472 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
474 static int cache_yield;
475 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
476 "Number of times cache called yield");
478 static void __noinline
479 cache_maybe_yield(void)
482 if (should_yield()) {
484 kern_yield(PRI_USER);
489 cache_assert_vlp_locked(struct mtx *vlp)
493 mtx_assert(vlp, MA_OWNED);
497 cache_assert_vnode_locked(struct vnode *vp)
501 vlp = VP2VNODELOCK(vp);
502 cache_assert_vlp_locked(vlp);
506 * TODO: With the value stored we can do better than computing the hash based
507 * on the address, and the choice of FNV should also be revisited.
510 cache_prehash(struct vnode *vp)
513 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
517 cache_get_hash(char *name, u_char len, struct vnode *dvp)
520 return (fnv_32_buf(name, len, dvp->v_nchash));
523 static inline struct nchashhead *
524 NCP2BUCKET(struct namecache *ncp)
528 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
529 return (NCHHASH(hash));
532 static inline struct rwlock *
533 NCP2BUCKETLOCK(struct namecache *ncp)
537 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
538 return (HASH2BUCKETLOCK(hash));
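/*
 * Illustrative walk from a (directory, name) pair to its bucket, tying the
 * helpers above together (sketch only; cache_lookup() below is the real
 * consumer and also handles zapping, SMR and vnode references):
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	blp = HASH2BUCKETLOCK(hash);
 *	rw_rlock(blp);
 *	CK_SLIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
 *		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 *		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0)
 *			break;
 *	}
 *	rw_runlock(blp);
 */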
543 cache_assert_bucket_locked(struct namecache *ncp, int mode)
547 blp = NCP2BUCKETLOCK(ncp);
548 rw_assert(blp, mode);
551 #define cache_assert_bucket_locked(x, y) do { } while (0)
554 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
556 _cache_sort_vnodes(void **p1, void **p2)
560 MPASS(*p1 != NULL || *p2 != NULL);
570 cache_lock_all_buckets(void)
574 for (i = 0; i < numbucketlocks; i++)
575 rw_wlock(&bucketlocks[i]);
579 cache_unlock_all_buckets(void)
583 for (i = 0; i < numbucketlocks; i++)
584 rw_wunlock(&bucketlocks[i]);
588 cache_lock_all_vnodes(void)
592 for (i = 0; i < numvnodelocks; i++)
593 mtx_lock(&vnodelocks[i]);
597 cache_unlock_all_vnodes(void)
601 for (i = 0; i < numvnodelocks; i++)
602 mtx_unlock(&vnodelocks[i]);
606 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
609 cache_sort_vnodes(&vlp1, &vlp2);
612 if (!mtx_trylock(vlp1))
615 if (!mtx_trylock(vlp2)) {
625 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
628 MPASS(vlp1 != NULL || vlp2 != NULL);
638 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
641 MPASS(vlp1 != NULL || vlp2 != NULL);
650 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
652 struct nchstats snap;
654 if (req->oldptr == NULL)
655 return (SYSCTL_OUT(req, 0, sizeof(snap)));
658 snap.ncs_goodhits = counter_u64_fetch(numposhits);
659 snap.ncs_neghits = counter_u64_fetch(numneghits);
660 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
661 counter_u64_fetch(numnegzaps);
662 snap.ncs_miss = counter_u64_fetch(nummisszap) +
663 counter_u64_fetch(nummiss);
665 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
667 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
668 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
669 "VFS cache effectiveness statistics");
673 * Grab an atomic snapshot of the name cache hash chain lengths
675 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
676 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
680 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
682 struct nchashhead *ncpp;
683 struct namecache *ncp;
684 int i, error, n_nchash, *cntbuf;
687 n_nchash = nchash + 1; /* nchash is max index, not count */
688 if (req->oldptr == NULL)
689 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
690 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
691 cache_lock_all_buckets();
692 if (n_nchash != nchash + 1) {
693 cache_unlock_all_buckets();
694 free(cntbuf, M_TEMP);
697 /* Scan hash tables counting entries */
698 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
699 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
701 cache_unlock_all_buckets();
702 for (error = 0, i = 0; i < n_nchash; i++)
703 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
705 free(cntbuf, M_TEMP);
708 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
709 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
710 "nchash chain lengths");
713 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
716 struct nchashhead *ncpp;
717 struct namecache *ncp;
719 int count, maxlength, used, pct;
722 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
724 cache_lock_all_buckets();
725 n_nchash = nchash + 1; /* nchash is max index, not count */
729 /* Scan hash tables for applicable entries */
730 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
732 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
737 if (maxlength < count)
740 n_nchash = nchash + 1;
741 cache_unlock_all_buckets();
742 pct = (used * 100) / (n_nchash / 100);
743 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
746 error = SYSCTL_OUT(req, &used, sizeof(used));
749 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
752 error = SYSCTL_OUT(req, &pct, sizeof(pct));
757 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
758 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
759 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
763 * Negative entries management
765 * A variation of the LRU scheme is used. New entries are hashed into one of
766 * numneglists cold lists. Entries get promoted to the hot list on first hit.
768 * The shrinker will demote the hot list head and evict from the cold list in a
769 * round-robin manner.
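/*
 * Life cycle sketch of a negative entry under the scheme above (the actual
 * transitions are implemented by the functions that follow):
 *
 *	cache_negative_insert()		appended to a cold list, NEG_HOT clear
 *	cache_negative_hit()		moved to ncneg_hot, NEG_HOT set
 *	cache_negative_zap_one()	demotes one hot entry back to its cold
 *					list, then evicts a cold list head
 */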
772 cache_negative_init(struct namecache *ncp)
774 struct negstate *negstate;
776 ncp->nc_flag |= NCF_NEGATIVE;
777 negstate = NCP2NEGSTATE(ncp);
778 negstate->neg_flag = 0;
782 cache_negative_hit(struct namecache *ncp)
784 struct neglist *neglist;
785 struct negstate *negstate;
787 negstate = NCP2NEGSTATE(ncp);
788 if ((negstate->neg_flag & NEG_HOT) != 0)
790 neglist = NCP2NEGLIST(ncp);
791 mtx_lock(&ncneg_hot.nl_lock);
792 mtx_lock(&neglist->nl_lock);
793 if ((negstate->neg_flag & NEG_HOT) == 0) {
795 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
796 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
797 negstate->neg_flag |= NEG_HOT;
799 mtx_unlock(&neglist->nl_lock);
800 mtx_unlock(&ncneg_hot.nl_lock);
804 cache_negative_insert(struct namecache *ncp)
806 struct neglist *neglist;
808 MPASS(ncp->nc_flag & NCF_NEGATIVE);
809 cache_assert_bucket_locked(ncp, RA_WLOCKED);
810 neglist = NCP2NEGLIST(ncp);
811 mtx_lock(&neglist->nl_lock);
812 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
813 mtx_unlock(&neglist->nl_lock);
814 atomic_add_rel_long(&numneg, 1);
818 cache_negative_remove(struct namecache *ncp)
820 struct neglist *neglist;
821 struct negstate *negstate;
822 bool hot_locked = false;
823 bool list_locked = false;
825 cache_assert_bucket_locked(ncp, RA_WLOCKED);
826 neglist = NCP2NEGLIST(ncp);
827 negstate = NCP2NEGSTATE(ncp);
828 if ((negstate->neg_flag & NEG_HOT) != 0) {
830 mtx_lock(&ncneg_hot.nl_lock);
831 if ((negstate->neg_flag & NEG_HOT) == 0) {
833 mtx_lock(&neglist->nl_lock);
837 mtx_lock(&neglist->nl_lock);
839 * We may be racing against promotion in lockless lookup.
841 if ((negstate->neg_flag & NEG_HOT) != 0) {
842 mtx_unlock(&neglist->nl_lock);
844 mtx_lock(&ncneg_hot.nl_lock);
845 mtx_lock(&neglist->nl_lock);
848 if ((negstate->neg_flag & NEG_HOT) != 0) {
849 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
850 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
853 mtx_assert(&neglist->nl_lock, MA_OWNED);
854 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
857 mtx_unlock(&neglist->nl_lock);
859 mtx_unlock(&ncneg_hot.nl_lock);
860 atomic_subtract_rel_long(&numneg, 1);
864 cache_negative_shrink_select(struct namecache **ncpp,
865 struct neglist **neglistpp)
867 struct neglist *neglist;
868 struct namecache *ncp;
874 for (i = 0; i < numneglists; i++) {
875 neglist = &neglists[(cycle + i) % numneglists];
876 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
878 mtx_lock(&neglist->nl_lock);
879 ncp = TAILQ_FIRST(&neglist->nl_list);
882 mtx_unlock(&neglist->nl_lock);
885 *neglistpp = neglist;
891 cache_negative_zap_one(void)
893 struct namecache *ncp, *ncp2;
894 struct neglist *neglist;
895 struct negstate *negstate;
899 if (mtx_owner(&ncneg_shrink_lock) != NULL ||
900 !mtx_trylock(&ncneg_shrink_lock)) {
901 counter_u64_add(shrinking_skipped, 1);
905 mtx_lock(&ncneg_hot.nl_lock);
906 ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
908 neglist = NCP2NEGLIST(ncp);
909 negstate = NCP2NEGSTATE(ncp);
910 mtx_lock(&neglist->nl_lock);
911 MPASS((negstate->neg_flag & NEG_HOT) != 0);
912 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
913 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
914 negstate->neg_flag &= ~NEG_HOT;
916 mtx_unlock(&neglist->nl_lock);
918 mtx_unlock(&ncneg_hot.nl_lock);
920 cache_negative_shrink_select(&ncp, &neglist);
922 mtx_unlock(&ncneg_shrink_lock);
926 MPASS(ncp->nc_flag & NCF_NEGATIVE);
927 dvlp = VP2VNODELOCK(ncp->nc_dvp);
928 blp = NCP2BUCKETLOCK(ncp);
929 mtx_unlock(&neglist->nl_lock);
933 * Enter SMR to safely check the negative list.
934 * Even if the found pointer matches, the entry may now be reallocated
935 * and used by a different vnode.
938 ncp2 = TAILQ_FIRST(&neglist->nl_list);
939 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
940 blp != NCP2BUCKETLOCK(ncp2)) {
945 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
947 cache_zap_locked(ncp);
948 counter_u64_add(numneg_evicted, 1);
956 * cache_zap_locked():
958 * Removes a namecache entry from cache, whether it contains an actual
959 * pointer to a vnode or if it is just a negative cache entry.
962 cache_zap_locked(struct namecache *ncp)
964 struct nchashhead *ncpp;
966 if (!(ncp->nc_flag & NCF_NEGATIVE))
967 cache_assert_vnode_locked(ncp->nc_vp);
968 cache_assert_vnode_locked(ncp->nc_dvp);
969 cache_assert_bucket_locked(ncp, RA_WLOCKED);
971 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
972 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
974 cache_ncp_invalidate(ncp);
976 ncpp = NCP2BUCKET(ncp);
977 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
978 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
979 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
980 ncp->nc_name, ncp->nc_vp);
981 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
982 if (ncp == ncp->nc_vp->v_cache_dd) {
983 vn_seqc_write_begin_unheld(ncp->nc_vp);
984 ncp->nc_vp->v_cache_dd = NULL;
985 vn_seqc_write_end(ncp->nc_vp);
988 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
990 cache_negative_remove(ncp);
992 if (ncp->nc_flag & NCF_ISDOTDOT) {
993 if (ncp == ncp->nc_dvp->v_cache_dd) {
994 vn_seqc_write_begin_unheld(ncp->nc_dvp);
995 ncp->nc_dvp->v_cache_dd = NULL;
996 vn_seqc_write_end(ncp->nc_dvp);
999 LIST_REMOVE(ncp, nc_src);
1000 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1001 ncp->nc_flag |= NCF_DVDROP;
1002 counter_u64_add(numcachehv, -1);
1005 atomic_subtract_rel_long(&numcache, 1);
1009 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1013 MPASS(ncp->nc_dvp == vp);
1014 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1015 cache_assert_vnode_locked(vp);
1017 blp = NCP2BUCKETLOCK(ncp);
1019 cache_zap_locked(ncp);
1024 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1027 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1030 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1031 cache_assert_vnode_locked(vp);
1033 if (ncp->nc_flag & NCF_NEGATIVE) {
1034 if (*vlpp != NULL) {
1038 cache_zap_negative_locked_vnode_kl(ncp, vp);
1042 pvlp = VP2VNODELOCK(vp);
1043 blp = NCP2BUCKETLOCK(ncp);
1044 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1045 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1047 if (*vlpp == vlp1 || *vlpp == vlp2) {
1051 if (*vlpp != NULL) {
1055 cache_sort_vnodes(&vlp1, &vlp2);
1060 if (!mtx_trylock(vlp1))
1066 cache_zap_locked(ncp);
1068 if (to_unlock != NULL)
1069 mtx_unlock(to_unlock);
1076 MPASS(*vlpp == NULL);
1081 static int __noinline
1082 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1084 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1088 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1089 cache_assert_vnode_locked(vp);
1091 pvlp = VP2VNODELOCK(vp);
1092 if (ncp->nc_flag & NCF_NEGATIVE) {
1093 cache_zap_negative_locked_vnode_kl(ncp, vp);
1097 blp = NCP2BUCKETLOCK(ncp);
1098 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1099 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1100 cache_sort_vnodes(&vlp1, &vlp2);
1105 if (!mtx_trylock(vlp1)) {
1112 cache_zap_locked(ncp);
1114 mtx_unlock(to_unlock);
1121 * If trylocking failed we can get here. We know enough to take all needed locks
1122 * in the right order and re-lookup the entry.
1125 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1126 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1129 struct namecache *rncp;
1131 cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1133 cache_sort_vnodes(&dvlp, &vlp);
1134 cache_lock_vnodes(dvlp, vlp);
1136 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1137 if (rncp == ncp && rncp->nc_dvp == dvp &&
1138 rncp->nc_nlen == cnp->cn_namelen &&
1139 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1143 cache_zap_locked(rncp);
1145 cache_unlock_vnodes(dvlp, vlp);
1146 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1151 cache_unlock_vnodes(dvlp, vlp);
1155 static int __noinline
1156 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1157 uint32_t hash, struct rwlock *blp)
1159 struct mtx *dvlp, *vlp;
1162 cache_assert_bucket_locked(ncp, RA_WLOCKED);
1164 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1166 if (!(ncp->nc_flag & NCF_NEGATIVE))
1167 vlp = VP2VNODELOCK(ncp->nc_vp);
1168 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1169 cache_zap_locked(ncp);
1171 cache_unlock_vnodes(dvlp, vlp);
1177 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1180 static int __noinline
1181 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1182 uint32_t hash, struct rwlock *blp)
1184 struct mtx *dvlp, *vlp;
1187 cache_assert_bucket_locked(ncp, RA_RLOCKED);
1189 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1191 if (!(ncp->nc_flag & NCF_NEGATIVE))
1192 vlp = VP2VNODELOCK(ncp->nc_vp);
1193 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1196 cache_zap_locked(ncp);
1198 cache_unlock_vnodes(dvlp, vlp);
1204 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1208 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1209 struct mtx **vlpp1, struct mtx **vlpp2)
1211 struct mtx *dvlp, *vlp;
1213 cache_assert_bucket_locked(ncp, RA_WLOCKED);
1215 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1217 if (!(ncp->nc_flag & NCF_NEGATIVE))
1218 vlp = VP2VNODELOCK(ncp->nc_vp);
1219 cache_sort_vnodes(&dvlp, &vlp);
1221 if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1222 cache_zap_locked(ncp);
1223 cache_unlock_vnodes(dvlp, vlp);
1236 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1237 cache_zap_locked(ncp);
1238 cache_unlock_vnodes(dvlp, vlp);
1253 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1263 static int __noinline
1264 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1265 struct timespec *tsp, int *ticksp)
1270 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1271 dvp, cnp->cn_nameptr);
1272 counter_u64_add(dothits, 1);
1273 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1280 * When we lookup "." we still can be asked to lock it
1283 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1284 if (ltype != VOP_ISLOCKED(*vpp)) {
1285 if (ltype == LK_EXCLUSIVE) {
1286 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1287 if (VN_IS_DOOMED((*vpp))) {
1288 /* forced unmount */
1294 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1299 static __noinline int
1300 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1301 struct componentname *cnp, struct timespec *tsp, int *ticksp)
1303 struct namecache *ncp;
1305 struct mtx *dvlp, *dvlp2;
1309 if (cnp->cn_namelen == 2 &&
1310 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1311 counter_u64_add(dotdothits, 1);
1312 dvlp = VP2VNODELOCK(dvp);
1316 ncp = dvp->v_cache_dd;
1318 SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1325 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1326 if (ncp->nc_dvp != dvp)
1327 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1328 if (!cache_zap_locked_vnode_kl2(ncp,
1331 MPASS(dvp->v_cache_dd == NULL);
1337 vn_seqc_write_begin(dvp);
1338 dvp->v_cache_dd = NULL;
1339 vn_seqc_write_end(dvp);
1347 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1348 blp = HASH2BUCKETLOCK(hash);
1350 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1355 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1356 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1357 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1361 /* We failed to find an entry */
1367 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1368 if (__predict_false(error != 0)) {
1369 zap_and_exit_bucket_fail++;
1370 cache_maybe_yield();
1373 counter_u64_add(numposzaps, 1);
1377 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1378 counter_u64_add(nummisszap, 1);
1383 * Lookup a name in the name cache
1387 * - dvp: Parent directory in which to search.
1388 * - vpp: Return argument. Will contain desired vnode on cache hit.
1389 * - cnp: Parameters of the name search. The most interesting bits of
1390 * the cn_flags field have the following meanings:
1391 * - MAKEENTRY: If clear, free an entry from the cache rather than look
1393 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1394 * - tsp: Return storage for cache timestamp. On a successful (positive
1395 * or negative) lookup, tsp will be filled with any timespec that
1396 * was stored when this cache entry was created. However, it will
1397 * be clear for "." entries.
1398 * - ticks: Return storage for alternate cache timestamp. On a successful
1399 * (positive or negative) lookup, it will contain the ticks value
1400 * that was current when the cache entry was created, unless cnp
1405 * - -1: A positive cache hit. vpp will contain the desired vnode.
1406 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1407 * to a forced unmount. vpp will not be modified. If the entry
1408 * is a whiteout, then the ISWHITEOUT flag will be set in
1410 * - 0: A cache miss. vpp will not be modified.
1414 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1415 * .., dvp is unlocked. If we're looking up ., an extra ref is taken, but the
1416 * lock is not recursively acquired.
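/*
 * For reference, a caller is expected to consume the return values documented
 * above roughly the way vfs_cache_lookup() later in this file does (sketch):
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)			cache miss: ask the filesystem
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 *	if (error == -1)		positive hit: *vpp is locked and ref'd
 *		return (0);
 *	return (error);			negative hit (or doomed dvp): ENOENT
 */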
1419 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1420 struct timespec *tsp, int *ticksp)
1422 struct namecache_ts *ncp_ts;
1423 struct namecache *ncp;
1424 struct negstate *negstate;
1430 bool try_smr, doing_smr, whiteout;
1433 if (__predict_false(!doingcache)) {
1434 cnp->cn_flags &= ~MAKEENTRY;
1439 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1440 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1442 if ((cnp->cn_flags & MAKEENTRY) == 0)
1443 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1446 if (cnp->cn_nameiop == CREATE)
1453 if (cnp->cn_namelen == 2 &&
1454 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1455 counter_u64_add(dotdothits, 1);
1456 dvlp = VP2VNODELOCK(dvp);
1458 ncp = dvp->v_cache_dd;
1460 SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1465 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1466 if (ncp->nc_flag & NCF_NEGATIVE)
1472 /* Return failure if negative entry was found. */
1474 goto negative_success;
1475 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1476 dvp, cnp->cn_nameptr, *vpp);
1477 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1479 cache_out_ts(ncp, tsp, ticksp);
1480 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1481 NCF_DTS && tsp != NULL) {
1482 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1483 *tsp = ncp_ts->nc_dotdottime;
1488 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1495 blp = HASH2BUCKETLOCK(hash);
1499 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1500 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1501 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1505 /* We failed to find an entry */
1506 if (__predict_false(ncp == NULL)) {
1511 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1513 counter_u64_add(nummiss, 1);
1517 if (ncp->nc_flag & NCF_NEGATIVE)
1518 goto negative_success;
1520 /* We found a "positive" match, return the vnode */
1521 counter_u64_add(numposhits, 1);
1523 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1524 dvp, cnp->cn_nameptr, *vpp, ncp);
1525 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1527 cache_out_ts(ncp, tsp, ticksp);
1530 * On success we return a locked and ref'd vnode as per the lookup
1534 ltype = 0; /* silence gcc warning */
1535 if (cnp->cn_flags & ISDOTDOT) {
1536 ltype = VOP_ISLOCKED(dvp);
1540 if (!cache_ncp_canuse(ncp)) {
1545 vs = vget_prep_smr(*vpp);
1547 if (__predict_false(vs == VGET_NONE)) {
1552 vs = vget_prep(*vpp);
1553 cache_lookup_unlock(blp, dvlp);
1555 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1556 if (cnp->cn_flags & ISDOTDOT) {
1557 vn_lock(dvp, ltype | LK_RETRY);
1558 if (VN_IS_DOOMED(dvp)) {
1569 if ((cnp->cn_flags & ISLASTCN) &&
1570 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1571 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1576 /* We found a negative match, and want to create it, so purge */
1577 if (cnp->cn_nameiop == CREATE) {
1579 counter_u64_add(numnegzaps, 1);
1583 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1584 cache_out_ts(ncp, tsp, ticksp);
1585 counter_u64_add(numneghits, 1);
1586 whiteout = (ncp->nc_flag & NCF_WHITE);
1590 * We need to take locks to promote an entry.
1592 negstate = NCP2NEGSTATE(ncp);
1593 if ((negstate->neg_flag & NEG_HOT) == 0 ||
1594 !cache_ncp_canuse(ncp)) {
1601 cache_negative_hit(ncp);
1602 cache_lookup_unlock(blp, dvlp);
1605 cnp->cn_flags |= ISWHITEOUT;
1611 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1613 error = cache_zap_locked_vnode(ncp, dvp);
1614 if (__predict_false(error != 0)) {
1615 zap_and_exit_bucket_fail2++;
1616 cache_maybe_yield();
1623 struct celockstate {
1625 struct rwlock *blp[2];
1627 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1628 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1631 cache_celockstate_init(struct celockstate *cel)
1634 bzero(cel, sizeof(*cel));
1638 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1641 struct mtx *vlp1, *vlp2;
1643 MPASS(cel->vlp[0] == NULL);
1644 MPASS(cel->vlp[1] == NULL);
1645 MPASS(cel->vlp[2] == NULL);
1647 MPASS(vp != NULL || dvp != NULL);
1649 vlp1 = VP2VNODELOCK(vp);
1650 vlp2 = VP2VNODELOCK(dvp);
1651 cache_sort_vnodes(&vlp1, &vlp2);
1662 cache_unlock_vnodes_cel(struct celockstate *cel)
1665 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1667 if (cel->vlp[0] != NULL)
1668 mtx_unlock(cel->vlp[0]);
1669 if (cel->vlp[1] != NULL)
1670 mtx_unlock(cel->vlp[1]);
1671 if (cel->vlp[2] != NULL)
1672 mtx_unlock(cel->vlp[2]);
1676 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1681 cache_assert_vlp_locked(cel->vlp[0]);
1682 cache_assert_vlp_locked(cel->vlp[1]);
1683 MPASS(cel->vlp[2] == NULL);
1686 vlp = VP2VNODELOCK(vp);
1689 if (vlp >= cel->vlp[1]) {
1692 if (mtx_trylock(vlp))
1694 cache_lock_vnodes_cel_3_failures++;
1695 cache_unlock_vnodes_cel(cel);
1696 if (vlp < cel->vlp[0]) {
1698 mtx_lock(cel->vlp[0]);
1699 mtx_lock(cel->vlp[1]);
1701 if (cel->vlp[0] != NULL)
1702 mtx_lock(cel->vlp[0]);
1704 mtx_lock(cel->vlp[1]);
1714 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1715 struct rwlock *blp2)
1718 MPASS(cel->blp[0] == NULL);
1719 MPASS(cel->blp[1] == NULL);
1721 cache_sort_vnodes(&blp1, &blp2);
1732 cache_unlock_buckets_cel(struct celockstate *cel)
1735 if (cel->blp[0] != NULL)
1736 rw_wunlock(cel->blp[0]);
1737 rw_wunlock(cel->blp[1]);
1741 * Lock part of the cache affected by the insertion.
1743 * This means vnodelocks for dvp, vp and the relevant bucketlock.
1744 * However, insertion can result in removal of an old entry. In this
1745 * case we have an additional vnode and bucketlock pair to lock. If the
1746 * entry is negative, ncelock is locked instead of the vnode.
1748 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1749 * preserving the locking order (smaller address first).
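/*
 * Worst-case lock set acquired here, in the order preserved by the sorting
 * helpers (illustrative summary of the comment above; cel->vlp[] and
 * cel->blp[] end up holding exactly these):
 *
 *	vnode locks:	dvp, vp, and the vnode of the ".." entry being evicted
 *	bucket locks:	the bucket of the new entry and the bucket of the old
 *			".." entry
 */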
1752 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1755 struct namecache *ncp;
1756 struct rwlock *blps[2];
1758 blps[0] = HASH2BUCKETLOCK(hash);
1761 cache_lock_vnodes_cel(cel, dvp, vp);
1762 if (vp == NULL || vp->v_type != VDIR)
1764 ncp = vp->v_cache_dd;
1767 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1769 MPASS(ncp->nc_dvp == vp);
1770 blps[1] = NCP2BUCKETLOCK(ncp);
1771 if (ncp->nc_flag & NCF_NEGATIVE)
1773 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1776 * All vnodes got re-locked. Re-validate the state and if
1777 * nothing changed we are done. Otherwise restart.
1779 if (ncp == vp->v_cache_dd &&
1780 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1781 blps[1] == NCP2BUCKETLOCK(ncp) &&
1782 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1784 cache_unlock_vnodes_cel(cel);
1789 cache_lock_buckets_cel(cel, blps[0], blps[1]);
1793 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1796 struct namecache *ncp;
1797 struct rwlock *blps[2];
1799 blps[0] = HASH2BUCKETLOCK(hash);
1802 cache_lock_vnodes_cel(cel, dvp, vp);
1803 ncp = dvp->v_cache_dd;
1806 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1808 MPASS(ncp->nc_dvp == dvp);
1809 blps[1] = NCP2BUCKETLOCK(ncp);
1810 if (ncp->nc_flag & NCF_NEGATIVE)
1812 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1814 if (ncp == dvp->v_cache_dd &&
1815 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1816 blps[1] == NCP2BUCKETLOCK(ncp) &&
1817 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1819 cache_unlock_vnodes_cel(cel);
1824 cache_lock_buckets_cel(cel, blps[0], blps[1]);
1828 cache_enter_unlock(struct celockstate *cel)
1831 cache_unlock_buckets_cel(cel);
1832 cache_unlock_vnodes_cel(cel);
1835 static void __noinline
1836 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1837 struct componentname *cnp)
1839 struct celockstate cel;
1840 struct namecache *ncp;
1844 if (dvp->v_cache_dd == NULL)
1846 len = cnp->cn_namelen;
1847 cache_celockstate_init(&cel);
1848 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1849 cache_enter_lock_dd(&cel, dvp, vp, hash);
1850 vn_seqc_write_begin(dvp);
1851 ncp = dvp->v_cache_dd;
1852 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1853 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1854 cache_zap_locked(ncp);
1858 dvp->v_cache_dd = NULL;
1859 vn_seqc_write_end(dvp);
1860 cache_enter_unlock(&cel);
1865 * Add an entry to the cache.
1868 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1869 struct timespec *tsp, struct timespec *dtsp)
1871 struct celockstate cel;
1872 struct namecache *ncp, *n2, *ndd;
1873 struct namecache_ts *ncp_ts, *n2_ts;
1874 struct nchashhead *ncpp;
1880 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1881 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1882 ("cache_enter: Adding a doomed vnode"));
1883 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1884 ("cache_enter: Doomed vnode used as src"));
1887 if (__predict_false(!doingcache))
1892 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1893 if (cnp->cn_namelen == 1)
1895 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1896 cache_enter_dotdot_prep(dvp, vp, cnp);
1897 flag = NCF_ISDOTDOT;
1902 * Avoid blowout in namecache entries.
1904 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1905 if (__predict_false(lnumcache >= ncsize)) {
1906 atomic_add_long(&numcache, -1);
1907 counter_u64_add(numdrops, 1);
1911 cache_celockstate_init(&cel);
1916 * Calculate the hash key and setup as much of the new
1917 * namecache entry as possible before acquiring the lock.
1919 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1920 ncp->nc_flag = flag | NCF_WIP;
1923 cache_negative_init(ncp);
1926 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1927 ncp_ts->nc_time = *tsp;
1928 ncp_ts->nc_ticks = ticks;
1929 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1931 ncp_ts->nc_dotdottime = *dtsp;
1932 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1935 len = ncp->nc_nlen = cnp->cn_namelen;
1936 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1937 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1938 ncp->nc_name[len] = '\0';
1939 cache_enter_lock(&cel, dvp, vp, hash);
1942 * See if this vnode or negative entry is already in the cache
1943 * with this name. This can happen with concurrent lookups of
1944 * the same path name.
1946 ncpp = NCHHASH(hash);
1947 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
1948 if (n2->nc_dvp == dvp &&
1949 n2->nc_nlen == cnp->cn_namelen &&
1950 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1952 KASSERT((n2->nc_flag & NCF_TS) != 0,
1954 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1955 n2_ts->nc_time = ncp_ts->nc_time;
1956 n2_ts->nc_ticks = ncp_ts->nc_ticks;
1958 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1959 n2_ts->nc_nc.nc_flag |= NCF_DTS;
1962 goto out_unlock_free;
1966 if (flag == NCF_ISDOTDOT) {
1968 * See if we are trying to add a .. entry, but some other lookup
1969 * has populated the v_cache_dd pointer already.
1971 if (dvp->v_cache_dd != NULL)
1972 goto out_unlock_free;
1973 KASSERT(vp == NULL || vp->v_type == VDIR,
1974 ("wrong vnode type %p", vp));
1975 vn_seqc_write_begin(dvp);
1976 dvp->v_cache_dd = ncp;
1977 vn_seqc_write_end(dvp);
1981 if (vp->v_type == VDIR) {
1982 if (flag != NCF_ISDOTDOT) {
1984 * For this case, the cache entry maps both the
1985 * directory name in it and the name ".." for the
1986 * directory's parent.
1988 vn_seqc_write_begin(vp);
1989 if ((ndd = vp->v_cache_dd) != NULL) {
1990 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1991 cache_zap_locked(ndd);
1995 vp->v_cache_dd = ncp;
1996 vn_seqc_write_end(vp);
1999 if (vp->v_cache_dd != NULL) {
2000 vn_seqc_write_begin(vp);
2001 vp->v_cache_dd = NULL;
2002 vn_seqc_write_end(vp);
2007 if (flag != NCF_ISDOTDOT) {
2008 if (LIST_EMPTY(&dvp->v_cache_src)) {
2010 counter_u64_add(numcachehv, 1);
2012 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2016 * If the entry is "negative", we place it into the
2017 * "negative" cache queue, otherwise, we place it into the
2018 * destination vnode's cache entries queue.
2021 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2022 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2025 if (cnp->cn_flags & ISWHITEOUT)
2026 ncp->nc_flag |= NCF_WHITE;
2027 cache_negative_insert(ncp);
2028 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2033 * Insert the new namecache entry into the appropriate chain
2034 * within the cache entries table.
2036 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2038 atomic_thread_fence_rel();
2040 * Mark the entry as fully constructed.
2041 * It is immutable past this point until its removal.
2043 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2045 cache_enter_unlock(&cel);
2046 if (numneg * ncnegfactor > lnumcache)
2047 cache_negative_zap_one();
2051 cache_enter_unlock(&cel);
2052 atomic_add_long(&numcache, -1);
2058 cache_roundup_2(u_int val)
2062 for (res = 1; res <= val; res <<= 1)
2068 static struct nchashhead *
2069 nchinittbl(u_long elements, u_long *hashmask)
2071 struct nchashhead *hashtbl;
2074 hashsize = cache_roundup_2(elements) / 2;
2076 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2077 for (i = 0; i < hashsize; i++)
2078 CK_SLIST_INIT(&hashtbl[i]);
2079 *hashmask = hashsize - 1;
2084 ncfreetbl(struct nchashhead *hashtbl)
2087 free(hashtbl, M_VFSCACHE);
2091 * Name cache initialization, from vfs_init() when we are booting
2094 nchinit(void *dummy __unused)
2098 cache_zone_small = uma_zcreate("S VFS Cache",
2099 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2100 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2102 cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2103 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2104 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2106 cache_zone_large = uma_zcreate("L VFS Cache",
2107 sizeof(struct namecache) + NAME_MAX + 1,
2108 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2110 cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2111 sizeof(struct namecache_ts) + NAME_MAX + 1,
2112 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT,
2115 VFS_SMR_ZONE_SET(cache_zone_small);
2116 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2117 VFS_SMR_ZONE_SET(cache_zone_large);
2118 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2120 ncsize = desiredvnodes * ncsizefactor;
2121 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2122 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2123 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2125 if (ncbuckethash > nchash)
2126 ncbuckethash = nchash;
2127 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2129 for (i = 0; i < numbucketlocks; i++)
2130 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2131 ncvnodehash = ncbuckethash;
2132 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2134 for (i = 0; i < numvnodelocks; i++)
2135 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2136 ncpurgeminvnodes = numbucketlocks * 2;
2138 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2140 for (i = 0; i < numneglists; i++) {
2141 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2142 TAILQ_INIT(&neglists[i].nl_list);
2144 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2145 TAILQ_INIT(&ncneg_hot.nl_list);
2147 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2149 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2152 cache_vnode_init(struct vnode *vp)
2155 LIST_INIT(&vp->v_cache_src);
2156 TAILQ_INIT(&vp->v_cache_dst);
2157 vp->v_cache_dd = NULL;
2162 cache_changesize(u_long newmaxvnodes)
2164 struct nchashhead *new_nchashtbl, *old_nchashtbl;
2165 u_long new_nchash, old_nchash;
2166 struct namecache *ncp;
2171 newncsize = newmaxvnodes * ncsizefactor;
2172 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2173 if (newmaxvnodes < numbucketlocks)
2174 newmaxvnodes = numbucketlocks;
2176 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2177 /* If same hash table size, nothing to do */
2178 if (nchash == new_nchash) {
2179 ncfreetbl(new_nchashtbl);
2183 * Move everything from the old hash table to the new table.
2184 * None of the namecache entries in the table can be removed
2185 * because to do so, they have to be removed from the hash table.
2187 cache_lock_all_vnodes();
2188 cache_lock_all_buckets();
2189 old_nchashtbl = nchashtbl;
2190 old_nchash = nchash;
2191 nchashtbl = new_nchashtbl;
2192 nchash = new_nchash;
2193 for (i = 0; i <= old_nchash; i++) {
2194 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2195 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2197 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2198 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2202 cache_unlock_all_buckets();
2203 cache_unlock_all_vnodes();
2204 ncfreetbl(old_nchashtbl);
2208 * Invalidate all entries from and to a particular vnode.
2211 cache_purge_impl(struct vnode *vp)
2213 TAILQ_HEAD(, namecache) ncps;
2214 struct namecache *ncp, *nnp;
2215 struct mtx *vlp, *vlp2;
2218 vlp = VP2VNODELOCK(vp);
2220 mtx_assert(vlp, MA_OWNED);
2222 while (!LIST_EMPTY(&vp->v_cache_src)) {
2223 ncp = LIST_FIRST(&vp->v_cache_src);
2224 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2226 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2228 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2229 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2230 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2232 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2234 ncp = vp->v_cache_dd;
2236 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2237 ("lost dotdot link"));
2238 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2240 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2242 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2246 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2252 cache_purge(struct vnode *vp)
2256 SDT_PROBE1(vfs, namecache, purge, done, vp);
2257 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2258 vp->v_cache_dd == NULL)
2260 vlp = VP2VNODELOCK(vp);
2262 cache_purge_impl(vp);
2266 * Only to be used by vgone.
2269 cache_purge_vgone(struct vnode *vp)
2273 VNPASS(VN_IS_DOOMED(vp), vp);
2274 vlp = VP2VNODELOCK(vp);
2275 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2276 vp->v_cache_dd == NULL)) {
2278 cache_purge_impl(vp);
2279 mtx_assert(vlp, MA_NOTOWNED);
2284 * All the NULL pointer state we found above may be transient.
2285 * Serialize against a possible thread doing cache_purge.
2287 mtx_wait_unlocked(vlp);
2288 if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2289 vp->v_cache_dd == NULL)) {
2291 cache_purge_impl(vp);
2292 mtx_assert(vlp, MA_NOTOWNED);
2299 * Invalidate all negative entries for a particular directory vnode.
2302 cache_purge_negative(struct vnode *vp)
2304 TAILQ_HEAD(, namecache) ncps;
2305 struct namecache *ncp, *nnp;
2308 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2309 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2310 if (LIST_EMPTY(&vp->v_cache_src))
2313 vlp = VP2VNODELOCK(vp);
2315 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2316 if (!(ncp->nc_flag & NCF_NEGATIVE))
2318 cache_zap_negative_locked_vnode_kl(ncp, vp);
2319 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2322 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2328 * Flush all entries referencing a particular filesystem.
2331 cache_purgevfs(struct mount *mp, bool force)
2333 TAILQ_HEAD(, namecache) ncps;
2334 struct mtx *vlp1, *vlp2;
2336 struct nchashhead *bucket;
2337 struct namecache *ncp, *nnp;
2338 u_long i, j, n_nchash;
2341 /* Scan hash tables for applicable entries */
2342 SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2343 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2346 n_nchash = nchash + 1;
2348 for (i = 0; i < numbucketlocks; i++) {
2349 blp = (struct rwlock *)&bucketlocks[i];
2351 for (j = i; j < n_nchash; j += numbucketlocks) {
2353 bucket = &nchashtbl[j];
2354 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2355 cache_assert_bucket_locked(ncp, RA_WLOCKED);
2356 if (ncp->nc_dvp->v_mount != mp)
2358 error = cache_zap_wlocked_bucket_kl(ncp, blp,
2362 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2366 if (vlp1 == NULL && vlp2 == NULL)
2367 cache_maybe_yield();
2374 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2380 * Perform canonical checks and a cache lookup, then pass on to the
2381 * filesystem through vop_cachedlookup only if needed.
2385 vfs_cache_lookup(struct vop_lookup_args *ap)
2389 struct vnode **vpp = ap->a_vpp;
2390 struct componentname *cnp = ap->a_cnp;
2391 int flags = cnp->cn_flags;
2396 if (dvp->v_type != VDIR)
2399 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2400 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2403 error = vn_dir_check_exec(dvp, cnp);
2407 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2409 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2415 /* Implementation of the getcwd syscall. */
2417 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2423 buflen = uap->buflen;
2424 if (__predict_false(buflen < 2))
2426 if (buflen > MAXPATHLEN)
2427 buflen = MAXPATHLEN;
2429 buf = malloc(buflen, M_TEMP, M_WAITOK);
2430 error = vn_getcwd(td, buf, &retbuf, &buflen);
2432 error = copyout(retbuf, uap->buf, buflen);
2438 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2444 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2448 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2455 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2456 size_t size, int flags, enum uio_seg pathseg)
2458 struct nameidata nd;
2459 char *retbuf, *freebuf;
2464 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2465 pathseg, path, fd, &cap_fstat_rights, td);
2466 if ((error = namei(&nd)) != 0)
2468 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2470 error = copyout(retbuf, buf, size);
2471 free(freebuf, M_TEMP);
2478 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2481 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2482 uap->flags, UIO_USERSPACE));
2486 * Retrieve the full filesystem path that corresponds to a vnode from the name
2487 * cache (if available)
2490 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2497 if (__predict_false(vn == NULL))
2500 buflen = MAXPATHLEN;
2501 buf = malloc(buflen, M_TEMP, M_WAITOK);
2503 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2514 * This function is similar to vn_fullpath, but it attempts to look up the
2515 * pathname relative to the global root mount point. This is required for the
2516 * auditing sub-system, as audited pathnames must be absolute, relative to the
2517 * global root mount point.
2520 vn_fullpath_global(struct thread *td, struct vnode *vn,
2521 char **retbuf, char **freebuf)
2527 if (__predict_false(vn == NULL))
2529 buflen = MAXPATHLEN;
2530 buf = malloc(buflen, M_TEMP, M_WAITOK);
2531 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2540 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2543 struct namecache *ncp;
2547 vlp = VP2VNODELOCK(*vp);
2549 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2550 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2554 if (*buflen < ncp->nc_nlen) {
2557 counter_u64_add(numfullpathfail4, 1);
2559 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2563 *buflen -= ncp->nc_nlen;
2564 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2565 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2574 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2577 vn_lock(*vp, LK_SHARED | LK_RETRY);
2578 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2581 counter_u64_add(numfullpathfail2, 1);
2582 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2587 if (VN_IS_DOOMED(dvp)) {
2588 /* forced unmount */
2591 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2595 * *vp still has its use count incremented.
2602 * Resolve a directory to a pathname.
2604 * The name of the directory can always be found in the namecache or fetched
2605 * from the filesystem. There is also guaranteed to be only one parent, meaning
2606 * we can just follow vnodes up until we find the root.
2608 * The vnode must be referenced.
2611 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2612 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2614 #ifdef KDTRACE_HOOKS
2615 struct vnode *startvp = vp;
2621 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2622 VNPASS(vp->v_usecount > 0, vp);
2626 if (!slash_prefixed) {
2634 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2635 counter_u64_add(numfullpathcalls, 1);
2636 while (vp != rdir && vp != rootvnode) {
2638 * The vp vnode must be already fully constructed,
2639 * since it is either found in the namecache or obtained
2640 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
2641 * without obtaining the vnode lock.
2643 if ((vp->v_vflag & VV_ROOT) != 0) {
2644 vn_lock(vp, LK_RETRY | LK_SHARED);
2647 * With the vnode locked, check for races with
2648 * unmount, forced or not. Note that we
2649 * already verified that vp is not equal to
2650 * the root vnode, which means that
2651 * mnt_vnodecovered can be NULL only for the
2654 if (VN_IS_DOOMED(vp) ||
2655 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2656 vp1->v_mountedhere != vp->v_mount) {
2659 SDT_PROBE3(vfs, namecache, fullpath, return,
2669 if (vp->v_type != VDIR) {
2671 counter_u64_add(numfullpathfail1, 1);
2673 SDT_PROBE3(vfs, namecache, fullpath, return,
2677 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2683 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2687 buf[--buflen] = '/';
2688 slash_prefixed = true;
2692 if (!slash_prefixed) {
2695 counter_u64_add(numfullpathfail4, 1);
2696 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2700 buf[--buflen] = '/';
2702 counter_u64_add(numfullpathfound, 1);
2705 *retbuf = buf + buflen;
2706 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2713 * Resolve an arbitrary vnode to a pathname.
2716 * - hardlinks are not tracked, thus if the vnode is not a directory this can
2717 * resolve to a different path than the one used to find it
2718 * - namecache is not mandatory, meaning names are not guaranteed to be added
2719 * (in which case resolving fails)
2722 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2723 char *buf, char **retbuf, size_t *buflen)
2726 bool slash_prefixed;
2732 orig_buflen = *buflen;
2735 slash_prefixed = false;
2736 if (vp->v_type != VDIR) {
2738 buf[*buflen] = '\0';
2739 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2748 slash_prefixed = true;
2751 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2752 orig_buflen - *buflen));
2756 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2758 * Since the namecache does not track hardlinks, the caller is expected to first
2759 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
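 *
 * A minimal usage sketch (compare kern___realpathat() above; error handling
 * omitted, names as used in this file):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT, UIO_SYSSPACE,
 *	    path, td);
 *	error = namei(&nd);
 *	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &buflen);
 *	free(freebuf, M_TEMP);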
2761 * Then we have 2 cases:
2762 * - if the found vnode is a directory, the path can be constructed just by
2763 *   following names up the chain
2764 * - otherwise we populate the buffer with the saved name and start resolving
2768 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2769 char **freebuf, size_t *buflen)
2773 struct componentname *cnp;
2777 bool slash_prefixed;
2781 if (*buflen > MAXPATHLEN)
2782 *buflen = MAXPATHLEN;
2784 slash_prefixed = false;
2786 buf = malloc(*buflen, M_TEMP, M_WAITOK);
2791 if (vp->v_type != VDIR) {
2793 addend = cnp->cn_namelen + 2;
2794 if (*buflen < addend) {
2799 tmpbuf = buf + *buflen;
2801 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2802 tmpbuf[addend - 1] = '\0';
2803 slash_prefixed = true;
2808 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2809 slash_prefixed, addend);
2824 vn_dir_dd_ino(struct vnode *vp)
2826 struct namecache *ncp;
2831 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2832 vlp = VP2VNODELOCK(vp);
2834 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2835 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2838 vs = vget_prep(ddvp);
2840 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2849 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2851 struct namecache *ncp;
2855 vlp = VP2VNODELOCK(vp);
2857 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2858 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2864 l = min(ncp->nc_nlen, buflen - 1);
2865 memcpy(buf, ncp->nc_name, l);
2872 * This function updates the path string to the vnode's full global path
2873 * and checks the size of the new path string against the pathlen argument.
2875 * Requires a locked, referenced vnode.
2876 * The vnode is re-locked on success or ENODEV, otherwise unlocked.
2878 * If vp is a directory, the call to vn_fullpath_global() always succeeds
2879 * because it falls back to the ".." lookup if the namecache lookup fails.
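 *
 * A hedged usage sketch (error handling omitted; "pathbuf" is an illustrative
 * MAXPATHLEN-sized buffer initially holding the vnode's current path):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = vn_path_to_global_path(td, vp, pathbuf, MAXPATHLEN);
 *
 * On success pathbuf holds the global path and vp remains locked.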
2882 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2885 struct nameidata nd;
2890 ASSERT_VOP_ELOCKED(vp, __func__);
2892 /* Construct global filesystem path from vp. */
2894 error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2901 if (strlen(rpath) >= pathlen) {
2903 error = ENAMETOOLONG;
2908 * Re-lookup the vnode by path to detect a possible rename.
2909 * As a side effect, the vnode is relocked.
2910 * If the vnode was renamed, return ENOENT.
2912 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2913 UIO_SYSSPACE, path, td);
2919 NDFREE(&nd, NDF_ONLY_PNBUF);
2923 strcpy(path, rpath);
2936 db_print_vpath(struct vnode *vp)
2939 while (vp != NULL) {
2940 db_printf("%p: ", vp);
2941 if (vp == rootvnode) {
2945 if (vp->v_vflag & VV_ROOT) {
2946 db_printf("<mount point>");
2947 vp = vp->v_mount->mnt_vnodecovered;
2949 struct namecache *ncp;
2953 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2956 for (i = 0; i < ncp->nc_nlen; i++)
2957 db_printf("%c", *ncn++);
2970 DB_SHOW_COMMAND(vpath, db_show_vpath)
2975 db_printf("usage: show vpath <struct vnode *>\n");
2979 vp = (struct vnode *)addr;
2985 extern uma_zone_t namei_zone;
2987 static bool __read_frequently cache_fast_lookup = true;
2988 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2989 &cache_fast_lookup, 0, "");
2991 #define CACHE_FPL_FAILED -2020
2994 cache_fpl_cleanup_cnp(struct componentname *cnp)
2997 uma_zfree(namei_zone, cnp->cn_pnbuf);
2999 cnp->cn_pnbuf = NULL;
3000 cnp->cn_nameptr = NULL;
3005 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3007 struct componentname *cnp;
3010 while (*(cnp->cn_nameptr) == '/') {
3015 *dpp = ndp->ni_rootdir;
3019 * Components of nameidata (or objects it can point to) which may
3020 * need restoring in case fast path lookup fails.
3022 struct nameidata_saved {
3030 struct nameidata *ndp;
3031 struct componentname *cnp;
3037 struct nameidata_saved snd;
3039 enum cache_fpl_status status:8;
3044 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3047 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3048 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3049 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3050 snd->ni_pathlen = fpl->ndp->ni_pathlen;
3054 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3057 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3058 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3059 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3060 fpl->ndp->ni_pathlen = snd->ni_pathlen;
3064 #define cache_fpl_smr_assert_entered(fpl) ({ \
3065 struct cache_fpl *_fpl = (fpl); \
3066 MPASS(_fpl->in_smr == true); \
3067 VFS_SMR_ASSERT_ENTERED(); \
3069 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
3070 struct cache_fpl *_fpl = (fpl); \
3071 MPASS(_fpl->in_smr == false); \
3072 VFS_SMR_ASSERT_NOT_ENTERED(); \
3075 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3076 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3079 #define cache_fpl_smr_enter_initial(fpl) ({ \
3080 struct cache_fpl *_fpl = (fpl); \
3082 _fpl->in_smr = true; \
3085 #define cache_fpl_smr_enter(fpl) ({ \
3086 struct cache_fpl *_fpl = (fpl); \
3087 MPASS(_fpl->in_smr == false); \
3089 _fpl->in_smr = true; \
3092 #define cache_fpl_smr_exit(fpl) ({ \
3093 struct cache_fpl *_fpl = (fpl); \
3094 MPASS(_fpl->in_smr == true); \
3096 _fpl->in_smr = false; \
3100 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3103 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3104 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3105 ("%s: converting to abort from %d at %d, set at %d\n",
3106 __func__, fpl->status, line, fpl->line));
3108 fpl->status = CACHE_FPL_STATUS_ABORTED;
3110 return (CACHE_FPL_FAILED);
3113 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
3116 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3119 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3120 ("%s: setting to partial at %d, but already set to %d at %d\n",
3121 __func__, line, fpl->status, fpl->line));
3122 cache_fpl_smr_assert_entered(fpl);
3123 fpl->status = CACHE_FPL_STATUS_PARTIAL;
3125 return (CACHE_FPL_FAILED);
3128 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
3131 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3134 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3135 ("%s: setting to handled at %d, but already set to %d at %d\n",
3136 __func__, line, fpl->status, fpl->line));
3137 cache_fpl_smr_assert_not_entered(fpl);
3138 MPASS(error != CACHE_FPL_FAILED);
3139 fpl->status = CACHE_FPL_STATUS_HANDLED;
3144 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3146 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3147 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3148 WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
3150 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3151 (ISDOTDOT | MAKEENTRY | ISLASTCN)
3153 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3154 "supported and internal flags overlap");
3157 cache_fpl_islastcn(struct nameidata *ndp)
3160 return (*ndp->ni_next == 0);
3164 cache_fpl_isdotdot(struct componentname *cnp)
3167 if (cnp->cn_namelen == 2 &&
3168 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3174 cache_can_fplookup(struct cache_fpl *fpl)
3176 struct nameidata *ndp;
3177 struct componentname *cnp;
3182 td = cnp->cn_thread;
3184 if (!cache_fast_lookup) {
3185 cache_fpl_aborted(fpl);
3189 if (mac_vnode_check_lookup_enabled()) {
3190 cache_fpl_aborted(fpl);
3194 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3195 cache_fpl_aborted(fpl);
3198 if (ndp->ni_dirfd != AT_FDCWD) {
3199 cache_fpl_aborted(fpl);
3202 if (IN_CAPABILITY_MODE(td)) {
3203 cache_fpl_aborted(fpl);
3206 if (AUDITING_TD(td)) {
3207 cache_fpl_aborted(fpl);
3210 if (ndp->ni_startdir != NULL) {
3211 cache_fpl_aborted(fpl);
3218 cache_fplookup_vnode_supported(struct vnode *vp)
3221 return (vp->v_type != VLNK);
3225 * Move a negative entry to the hot list.
3227 * We have to take locks, but they may be contended and in the worst
3228 * case we may need to go off CPU. We don't want to spin within the
3229 * smr section and we cannot block while holding it. Instead we are going to
3230 * look up the entry again.
3232 static int __noinline
3233 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3236 struct componentname *cnp;
3237 struct namecache *ncp;
3238 struct neglist *neglist;
3239 struct negstate *negstate;
3246 if (!vhold_smr(dvp))
3247 return (cache_fpl_aborted(fpl));
3249 neglist = NCP2NEGLIST(oncp);
3250 cache_fpl_smr_exit(fpl);
3252 mtx_lock(&ncneg_hot.nl_lock);
3253 mtx_lock(&neglist->nl_lock);
3255 * For hash iteration.
3257 cache_fpl_smr_enter(fpl);
3260 * Avoid all surprises by only succeeding if we got the same entry and
3261 * bailing completely otherwise.
3263 * In particular, at this point there can be a new ncp which matches the
3264 * search but hashes to a different neglist.
3266 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3272 * No match to begin with.
3274 if (__predict_false(ncp == NULL)) {
3279 * The newly found entry may be something different...
3281 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3282 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3287 * ... and not even negative.
3289 nc_flag = atomic_load_char(&ncp->nc_flag);
3290 if ((nc_flag & NCF_NEGATIVE) == 0) {
3294 if (__predict_false(!cache_ncp_canuse(ncp))) {
3298 negstate = NCP2NEGSTATE(ncp);
3299 if ((negstate->neg_flag & NEG_HOT) == 0) {
3301 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3302 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3303 negstate->neg_flag |= NEG_HOT;
3306 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3307 counter_u64_add(numneghits, 1);
3308 cache_fpl_smr_exit(fpl);
3309 mtx_unlock(&neglist->nl_lock);
3310 mtx_unlock(&ncneg_hot.nl_lock);
3312 return (cache_fpl_handled(fpl, ENOENT));
3314 cache_fpl_smr_exit(fpl);
3315 mtx_unlock(&neglist->nl_lock);
3316 mtx_unlock(&ncneg_hot.nl_lock);
3318 return (cache_fpl_aborted(fpl));
3322 * The target vnode is not supported; prepare for the slow path to take over.
3324 static int __noinline
3325 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3327 struct nameidata *ndp;
3328 struct componentname *cnp;
3337 dvp_seqc = fpl->dvp_seqc;
3339 dvs = vget_prep_smr(dvp);
3340 if (__predict_false(dvs == VGET_NONE)) {
3341 cache_fpl_smr_exit(fpl);
3342 return (cache_fpl_aborted(fpl));
3345 cache_fpl_smr_exit(fpl);
3347 vget_finish_ref(dvp, dvs);
3348 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3350 return (cache_fpl_aborted(fpl));
3353 pwd = pwd_hold(curthread);
3354 if (fpl->pwd != pwd) {
3357 return (cache_fpl_aborted(fpl));
3360 cache_fpl_restore(fpl, &fpl->snd);
3362 ndp->ni_startdir = dvp;
3363 cnp->cn_flags |= MAKEENTRY;
3364 if (cache_fpl_islastcn(ndp))
3365 cnp->cn_flags |= ISLASTCN;
3366 if (cache_fpl_isdotdot(cnp))
3367 cnp->cn_flags |= ISDOTDOT;
3373 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3375 struct componentname *cnp;
3382 tvp_seqc = fpl->tvp_seqc;
3384 if ((cnp->cn_flags & LOCKLEAF) != 0) {
3385 lkflags = LK_SHARED;
3386 if ((cnp->cn_flags & LOCKSHARED) == 0)
3387 lkflags = LK_EXCLUSIVE;
3388 error = vget_finish(tvp, lkflags, tvs);
3389 if (__predict_false(error != 0)) {
3390 return (cache_fpl_aborted(fpl));
3393 vget_finish_ref(tvp, tvs);
3396 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3397 if ((cnp->cn_flags & LOCKLEAF) != 0)
3401 return (cache_fpl_aborted(fpl));
3404 return (cache_fpl_handled(fpl, 0));
3408 * The caller may want to modify the state of the namecache.
3410 * Don't try to match the API contract; just leave.
3411 * TODO: this leaves scalability on the table
3414 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3416 struct componentname *cnp;
3419 MPASS(cnp->cn_nameiop != LOOKUP);
3420 return (cache_fpl_partial(fpl));
3423 static int __noinline
3424 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3426 struct componentname *cnp;
3427 enum vgetstate dvs, tvs;
3428 struct vnode *dvp, *tvp;
3429 seqc_t dvp_seqc, tvp_seqc;
3434 dvp_seqc = fpl->dvp_seqc;
3436 tvp_seqc = fpl->tvp_seqc;
3438 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3441 * For simplicity, this is less efficient than it could be.
3443 dvs = vget_prep_smr(dvp);
3444 if (__predict_false(dvs == VGET_NONE)) {
3445 return (cache_fpl_aborted(fpl));
3447 tvs = vget_prep_smr(tvp);
3448 if (__predict_false(tvs == VGET_NONE)) {
3449 cache_fpl_smr_exit(fpl);
3450 vget_abort(dvp, dvs);
3451 return (cache_fpl_aborted(fpl));
3454 cache_fpl_smr_exit(fpl);
3456 if ((cnp->cn_flags & LOCKPARENT) != 0) {
3457 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3458 if (__predict_false(error != 0)) {
3459 vget_abort(tvp, tvs);
3460 return (cache_fpl_aborted(fpl));
3463 vget_finish_ref(dvp, dvs);
3466 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3467 vget_abort(tvp, tvs);
3468 if ((cnp->cn_flags & LOCKPARENT) != 0)
3472 return (cache_fpl_aborted(fpl));
3475 error = cache_fplookup_final_child(fpl, tvs);
3476 if (__predict_false(error != 0)) {
3477 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3478 if ((cnp->cn_flags & LOCKPARENT) != 0)
3485 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3490 cache_fplookup_final(struct cache_fpl *fpl)
3492 struct componentname *cnp;
3494 struct vnode *dvp, *tvp;
3495 seqc_t dvp_seqc, tvp_seqc;
3499 dvp_seqc = fpl->dvp_seqc;
3501 tvp_seqc = fpl->tvp_seqc;
3503 VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3505 if (cnp->cn_nameiop != LOOKUP) {
3506 return (cache_fplookup_final_modifying(fpl));
3509 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3510 return (cache_fplookup_final_withparent(fpl));
3512 tvs = vget_prep_smr(tvp);
3513 if (__predict_false(tvs == VGET_NONE)) {
3514 return (cache_fpl_partial(fpl));
3517 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3518 cache_fpl_smr_exit(fpl);
3519 vget_abort(tvp, tvs);
3520 return (cache_fpl_aborted(fpl));
3523 cache_fpl_smr_exit(fpl);
3524 return (cache_fplookup_final_child(fpl, tvs));
3527 static int __noinline
3528 cache_fplookup_dot(struct cache_fpl *fpl)
3535 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3536 if (seqc_in_modify(fpl->tvp_seqc)) {
3537 return (cache_fpl_aborted(fpl));
3540 counter_u64_add(dothits, 1);
3541 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3546 static int __noinline
3547 cache_fplookup_dotdot(struct cache_fpl *fpl)
3549 struct nameidata *ndp;
3550 struct componentname *cnp;
3551 struct namecache *ncp;
3561 * XXX this is racy the same way regular lookup is
3563 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3565 if (dvp == pr->pr_root)
3568 if (dvp == ndp->ni_rootdir ||
3569 dvp == ndp->ni_topdir ||
3573 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3574 if (seqc_in_modify(fpl->tvp_seqc)) {
3575 return (cache_fpl_aborted(fpl));
3580 if ((dvp->v_vflag & VV_ROOT) != 0) {
3583 * The opposite of climb mount is needed here.
3585 return (cache_fpl_aborted(fpl));
3588 ncp = atomic_load_ptr(&dvp->v_cache_dd);
3590 return (cache_fpl_aborted(fpl));
3593 nc_flag = atomic_load_char(&ncp->nc_flag);
3594 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3595 if ((nc_flag & NCF_NEGATIVE) != 0)
3596 return (cache_fpl_aborted(fpl));
3597 fpl->tvp = ncp->nc_vp;
3599 fpl->tvp = ncp->nc_dvp;
3602 if (__predict_false(!cache_ncp_canuse(ncp))) {
3603 return (cache_fpl_aborted(fpl));
3606 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3607 if (seqc_in_modify(fpl->tvp_seqc)) {
3608 return (cache_fpl_partial(fpl));
3611 counter_u64_add(dotdothits, 1);
3616 cache_fplookup_next(struct cache_fpl *fpl)
3618 struct componentname *cnp;
3619 struct namecache *ncp;
3620 struct negstate *negstate;
3621 struct vnode *dvp, *tvp;
3629 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3630 return (cache_fplookup_dot(fpl));
3633 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3635 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3636 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3637 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3642 * If there is no entry, we have to punt to the slow path to perform
3643 * the actual lookup. Should there be nothing with this name, a negative
3644 * entry will be created.
3646 if (__predict_false(ncp == NULL)) {
3647 return (cache_fpl_partial(fpl));
3650 tvp = atomic_load_ptr(&ncp->nc_vp);
3651 nc_flag = atomic_load_char(&ncp->nc_flag);
3652 if ((nc_flag & NCF_NEGATIVE) != 0) {
3654 * If the caller wants to create an entry, we need to replace this one.
3656 if (__predict_false(fpl->cnp->cn_nameiop == CREATE)) {
3657 return (cache_fpl_partial(fpl));
3659 negstate = NCP2NEGSTATE(ncp);
3660 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3661 if (__predict_false(!cache_ncp_canuse(ncp))) {
3662 return (cache_fpl_partial(fpl));
3664 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3665 return (cache_fpl_partial(fpl));
3668 return (cache_fplookup_negative_promote(fpl, ncp, hash));
3670 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3672 counter_u64_add(numneghits, 1);
3673 cache_fpl_smr_exit(fpl);
3674 return (cache_fpl_handled(fpl, ENOENT));
3677 if (__predict_false(!cache_ncp_canuse(ncp))) {
3678 return (cache_fpl_partial(fpl));
3682 fpl->tvp_seqc = vn_seqc_read_any(tvp);
3683 if (seqc_in_modify(fpl->tvp_seqc)) {
3684 return (cache_fpl_partial(fpl));
3687 if (!cache_fplookup_vnode_supported(tvp)) {
3688 return (cache_fpl_partial(fpl));
3691 counter_u64_add(numposhits, 1);
3692 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3697 cache_fplookup_mp_supported(struct mount *mp)
3702 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3708 * Walk up the mount stack (if any).
3710 * Correctness is provided in the following ways:
3711 * - all vnodes are protected from freeing with SMR
3712 * - struct mount objects are type stable, making them always safe to access
3713 * - stability of the particular mount is provided by busying it
3714 * - relationship between the vnode which is mounted on and the mount is
3715 * verified with the vnode sequence counter after busying
3716 * - association between root vnode of the mount and the mount is protected
3719 * From that point on we can read the sequence counter of the root vnode
3720 * and get the next mount on the stack (if any) using the same protection.
3722 * By the end of a successful walk we are guaranteed the reached state was
3723 * indeed present at some point, matching the guarantee of the regular lookup.
3725 static int __noinline
3726 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3728 struct mount *mp, *prev_mp;
3733 vp_seqc = fpl->tvp_seqc;
3735 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3736 mp = atomic_load_ptr(&vp->v_mountedhere);
3742 if (!vfs_op_thread_enter_crit(mp)) {
3743 if (prev_mp != NULL)
3744 vfs_op_thread_exit_crit(prev_mp);
3745 return (cache_fpl_partial(fpl));
3747 if (prev_mp != NULL)
3748 vfs_op_thread_exit_crit(prev_mp);
3749 if (!vn_seqc_consistent(vp, vp_seqc)) {
3750 vfs_op_thread_exit_crit(mp);
3751 return (cache_fpl_partial(fpl));
3753 if (!cache_fplookup_mp_supported(mp)) {
3754 vfs_op_thread_exit_crit(mp);
3755 return (cache_fpl_partial(fpl));
3757 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3758 if (vp == NULL || VN_IS_DOOMED(vp)) {
3759 vfs_op_thread_exit_crit(mp);
3760 return (cache_fpl_partial(fpl));
3762 vp_seqc = vn_seqc_read_any(vp);
3763 if (seqc_in_modify(vp_seqc)) {
3764 vfs_op_thread_exit_crit(mp);
3765 return (cache_fpl_partial(fpl));
3768 mp = atomic_load_ptr(&vp->v_mountedhere);
3773 vfs_op_thread_exit_crit(prev_mp);
3775 fpl->tvp_seqc = vp_seqc;
3780 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3788 * Hack: while this is a union, the pointer tends to be NULL so save on
3791 mp = atomic_load_ptr(&vp->v_mountedhere);
3794 if (vp->v_type == VDIR)
3802 * The code is mostly copy-pasted from regular lookup, see lookup().
3803 * The structure is maintained along with comments for easier maintenance.
3804 * Deduplicating the code will become feasible after fast path lookup
3805 * becomes more feature-complete.
3808 cache_fplookup_parse(struct cache_fpl *fpl)
3810 struct nameidata *ndp;
3811 struct componentname *cnp;
3813 char *prev_ni_next; /* saved ndp->ni_next */
3814 size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
3820 * Search a new directory.
3822 * The last component of the filename is left accessible via
3823 * cnp->cn_nameptr for callers that need the name. Callers needing
3824 * the name set the SAVENAME flag. When done, they assume
3825 * responsibility for freeing the pathname buffer.
3827 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3829 cnp->cn_namelen = cp - cnp->cn_nameptr;
3830 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
3831 cache_fpl_smr_exit(fpl);
3832 return (cache_fpl_handled(fpl, ENAMETOOLONG));
3834 prev_ni_pathlen = ndp->ni_pathlen;
3835 ndp->ni_pathlen -= cnp->cn_namelen;
3836 KASSERT(ndp->ni_pathlen <= PATH_MAX,
3837 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3838 prev_ni_next = ndp->ni_next;
3842 * Replace multiple slashes by a single slash and trailing slashes
3843 * by a null. This must be done before VOP_LOOKUP() because some
3844 * fs's don't know about trailing slashes. Remember if there were
3845 * trailing slashes to handle symlinks, existing non-directories
3846 * and non-existing files that won't be directories specially later.
3848 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3854 * Regular lookup performs the following:
3855 * *ndp->ni_next = '\0';
3856 * cnp->cn_flags |= TRAILINGSLASH;
3858 * This is problematic since it modifies data read
3859 * from userspace. Then if fast path lookup were to
3860 * abort, we would have to either restore it or convey
3861 * the flag. Since this is a corner case, just ignore
3862 * it for simplicity.
3864 return (cache_fpl_partial(fpl));
3870 * Check for degenerate name (e.g. / or "")
3871 * which is a way of talking about a directory,
3872 * e.g. like "/." or ".".
3875 * Another corner case handled by the regular lookup
3877 if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3878 return (cache_fpl_partial(fpl));
3884 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3886 struct nameidata *ndp;
3887 struct componentname *cnp;
3892 cnp->cn_nameptr = ndp->ni_next;
3893 while (*cnp->cn_nameptr == '/') {
3899 static int __noinline
3900 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
3906 * Can happen when racing against vgone.
3909 cache_fpl_partial(fpl);
3913 * See the API contract for VOP_FPLOOKUP_VEXEC.
3915 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3916 error = cache_fpl_aborted(fpl);
3918 cache_fpl_smr_exit(fpl);
3919 cache_fpl_handled(fpl, error);
3927 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3929 struct nameidata *ndp;
3930 struct componentname *cnp;
3934 error = CACHE_FPL_FAILED;
3938 cache_fpl_checkpoint(fpl, &fpl->snd);
3941 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3942 if (seqc_in_modify(fpl->dvp_seqc)) {
3943 cache_fpl_aborted(fpl);
3946 mp = atomic_load_ptr(&fpl->dvp->v_mount);
3947 if (!cache_fplookup_mp_supported(mp)) {
3948 cache_fpl_aborted(fpl);
3952 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3955 error = cache_fplookup_parse(fpl);
3956 if (__predict_false(error != 0)) {
3960 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3962 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
3963 if (__predict_false(error != 0)) {
3964 error = cache_fplookup_failed_vexec(fpl, error);
3968 if (__predict_false(cache_fpl_isdotdot(cnp))) {
3969 error = cache_fplookup_dotdot(fpl);
3970 if (__predict_false(error != 0)) {
3974 error = cache_fplookup_next(fpl);
3975 if (__predict_false(error != 0)) {
3979 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3981 if (cache_fplookup_need_climb_mount(fpl)) {
3982 error = cache_fplookup_climb_mount(fpl);
3983 if (__predict_false(error != 0)) {
3989 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3991 if (cache_fpl_islastcn(ndp)) {
3992 error = cache_fplookup_final(fpl);
3996 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3997 error = cache_fpl_aborted(fpl);
4001 fpl->dvp = fpl->tvp;
4002 fpl->dvp_seqc = fpl->tvp_seqc;
4004 cache_fplookup_parse_advance(fpl);
4005 cache_fpl_checkpoint(fpl, &fpl->snd);
4008 switch (fpl->status) {
4009 case CACHE_FPL_STATUS_UNSET:
4010 __assert_unreachable();
4012 case CACHE_FPL_STATUS_PARTIAL:
4013 cache_fpl_smr_assert_entered(fpl);
4014 return (cache_fplookup_partial_setup(fpl));
4015 case CACHE_FPL_STATUS_ABORTED:
4017 cache_fpl_smr_exit(fpl);
4018 return (CACHE_FPL_FAILED);
4019 case CACHE_FPL_STATUS_HANDLED:
4020 MPASS(error != CACHE_FPL_FAILED);
4021 cache_fpl_smr_assert_not_entered(fpl);
4022 if (__predict_false(error != 0)) {
4025 cache_fpl_cleanup_cnp(cnp);
4028 ndp->ni_dvp = fpl->dvp;
4029 ndp->ni_vp = fpl->tvp;
4030 if (cnp->cn_flags & SAVENAME)
4031 cnp->cn_flags |= HASBUF;
4033 cache_fpl_cleanup_cnp(cnp);
4039 * Fast path lookup protected with SMR and sequence counters.
4041 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4043 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4046 * Traditional vnode lookup conceptually looks like this:
4052 * vn_unlock(current);
4059 * Each jump to the next vnode is safe memory-wise and atomic with respect to
4060 * any modifications thanks to holding respective locks.
4062 * The same guarantee can be provided with a combination of safe memory
4063 * reclamation and sequence counters instead. If all operations which affect
4064 * the relationship between the current vnode and the one we are looking for
4065 * also modify the counter, we can verify whether all the conditions held as
4066 * we made the jump. This includes things like permissions, mount points etc.
4067 * Counter modification is provided by enclosing relevant places in
4068 * vn_seqc_write_begin()/end() calls.
4070 * Thus this translates to:
4073 * dvp_seqc = seqc_read_any(dvp);
4074 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4078 * tvp_seqc = seqc_read_any(tvp);
4079 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4081 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4083 * dvp = tvp; // we know nothing of importance has changed
4084 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4088 * vget(); // secure the vnode
4089 * if (!seqc_consistent(tvp, tvp_seqc) // final check
4091 * // at this point we know nothing has changed for any parent<->child pair
4092 * // as they were crossed during the lookup, meaning we matched the guarantee
4093 * // of the locked variant
4096 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4097 * - they are called while within vfs_smr protection which they must never exit
4098 * - EAGAIN can be returned to denote checking could not be performed, it is
4099 * always valid to return it
4100 * - if the sequence counter has not changed the result must be valid
4101 * - if the sequence counter has changed both false positives and false negatives
4102 * are permitted (since the result will be rejected later)
4103 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4105 * Caveats to watch out for:
4106 * - vnodes are passed unlocked and unreferenced with nothing stopping
4107 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4108 * to use atomic_load_ptr to fetch it.
4109 * - the aforementioned object can also get freed, meaning absent other means it
4110 * should be protected with vfs_smr
4111 * - either safely checking permissions as they are modified or guaranteeing
4112 * their stability is left to the routine
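 *
 * For the simple unix permission case mentioned above, a filesystem's
 * VOP_FPLOOKUP_VEXEC can be a thin wrapper around vaccess_vexec_smr(). A
 * hedged sketch (the "xxx_" names and the field layout are illustrative,
 * not from this file):
 *
 *	static int
 *	xxx_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
 *	{
 *		struct xxx_node *np;
 *
 *		np = atomic_load_ptr(&ap->a_vp->v_data);
 *		if (__predict_false(np == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(np->n_mode, np->n_uid, np->n_gid,
 *		    ap->a_cred));
 *	}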
4115 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4118 struct cache_fpl fpl;
4121 struct componentname *cnp;
4122 struct nameidata_saved orig;
4125 MPASS(ndp->ni_lcf == 0);
4127 fpl.status = CACHE_FPL_STATUS_UNSET;
4129 fpl.cnp = &ndp->ni_cnd;
4130 MPASS(curthread == fpl.cnp->cn_thread);
4132 if (!cache_can_fplookup(&fpl)) {
4133 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4134 *status = fpl.status;
4135 return (EOPNOTSUPP);
4138 cache_fpl_checkpoint(&fpl, &orig);
4140 cache_fpl_smr_enter_initial(&fpl);
4141 pwd = pwd_get_smr();
4143 ndp->ni_rootdir = pwd->pwd_rdir;
4144 ndp->ni_topdir = pwd->pwd_jdir;
4147 cnp->cn_nameptr = cnp->cn_pnbuf;
4148 if (cnp->cn_pnbuf[0] == '/') {
4149 cache_fpl_handle_root(ndp, &dvp);
4151 MPASS(ndp->ni_dirfd == AT_FDCWD);
4152 dvp = pwd->pwd_cdir;
4155 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4157 error = cache_fplookup_impl(dvp, &fpl);
4158 cache_fpl_smr_assert_not_entered(&fpl);
4159 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4161 *status = fpl.status;
4162 switch (fpl.status) {
4163 case CACHE_FPL_STATUS_UNSET:
4164 __assert_unreachable();
4166 case CACHE_FPL_STATUS_HANDLED:
4167 SDT_PROBE3(vfs, namei, lookup, return, error,
4168 (error == 0 ? ndp->ni_vp : NULL), true);
4170 case CACHE_FPL_STATUS_PARTIAL:
4173 * Status restored by cache_fplookup_partial_setup.
4176 case CACHE_FPL_STATUS_ABORTED:
4177 cache_fpl_restore(&fpl, &orig);