/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/counter.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/smr.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/capsicum.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/uma.h>
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct negstate {
	u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

struct	namecache {
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
	struct	vnode *nc_dvp;		/* vnode of parent of name */
	union {
		struct	vnode *nu_vp;	/* vnode the name refers to */
		struct	negstate nu_neg;/* negative entry state */
	} n_un;
	u_char	nc_flag;		/* flag bits */
	u_char	nc_nlen;		/* length of name */
	char	nc_name[0];		/* segment name + nul */
};

/*
 * struct namecache_ts repeats struct namecache layout up to the
 * nc_nlen member.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct	namecache_ts {
	struct	timespec nc_time;	/* timespec provided by fs */
	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
	int	nc_ticks;		/* ticks value when entry was added */
	struct namecache nc_nc;
};

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
#define	CACHE_PATH_CUTOFF	39

#define CACHE_ZONE_SMALL_SIZE		(sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE	(sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE		(sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE	(sizeof(struct namecache_ts) + NAME_MAX + 1)

_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");

#define	nc_vp		n_un.nu_vp
#define	nc_neg		n_un.nu_neg
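
/*
 * Illustrative sketch (not compiled in): nc_vp and nc_neg alias the same
 * union, discriminated by the NCF_NEGATIVE flag defined below.  The accessor
 * is hypothetical and only demonstrates the intended use of the union.
 */
#if 0
static struct vnode *
cache_ncp_vp_example(struct namecache *ncp)
{

	if (ncp->nc_flag & NCF_NEGATIVE)
		return (NULL);		/* negative entry: nu_neg is live */
	return (ncp->nc_vp);		/* positive entry: nu_vp is live */
}
#endif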
/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	0x01
#define NCF_ISDOTDOT	0x02
#define	NCF_TS		0x04
#define	NCF_DTS		0x08
#define	NCF_DVDROP	0x10
#define	NCF_NEGATIVE	0x20
#define	NCF_INVALID	0x40
#define	NCF_WIP		0x80

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT		0x01
/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
	    ("%s: entry %p already invalid", __func__, ncp));
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
	atomic_thread_fence_rel();
}

/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 */
static bool
cache_ncp_canuse(struct namecache *ncp)
{

	atomic_thread_fence_acq();
	return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
}
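
/*
 * Illustrative sketch (not compiled in): a lockless consumer pairs the
 * release fence in cache_ncp_invalidate() with the acquire fence in
 * cache_ncp_canuse().  Fields are read first, then validated; the helper
 * name is hypothetical.
 */
#if 0
static bool
cache_read_entry_example(struct namecache *ncp, struct vnode **vpp)
{
	struct vnode *vp;

	vp = atomic_load_ptr(&ncp->nc_vp);	/* speculative read */
	if (!cache_ncp_canuse(ncp))		/* acquire fence + flag check */
		return (false);			/* unstable entry, caller retries */
	*vpp = vp;
	return (true);
}
#endif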
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * If it is a "negative" entry, (i.e. for a name that is known NOT to
 * exist) the vnode pointer will be NULL.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 *
 * These locks are used (in the order in which they can be taken):
 * NAME		TYPE	ROLE
 * vnodelock	mtx	vnode lists and v_cache_dd field protection
 * bucketlock	mtx	for access to given set of hash buckets
 * neglist	mtx	negative entry LRU management
 *
 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 * shrinking the LRU list.
 *
 * It is legal to take multiple vnodelock and bucketlock locks.  The locking
 * order is lower address first.  Both are recursive.
 *
 * "." lookups are lockless.
 *
 * ".." and vnode -> name lookups require vnodelock.
 *
 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 *
 * Insertions and removals of entries require involved vnodes and bucketlocks
 * to be locked to provide safe operation against other threads modifying the
 * same chains.
 *
 * Some lookups result in removal of the found entry (e.g. getting rid of a
 * negative entry with the intent to create a positive one), which poses a
 * problem when multiple threads reach the state.  Similarly, two different
 * threads can purge two different vnodes and try to remove the same name.
 *
 * If the already held vnode lock is lower than the second required lock, we
 * can just take the other lock.  However, in the opposite case, this could
 * deadlock.  As such, this is resolved by trylocking and if that fails unlocking
 * the first node, locking everything in order and revalidating the state.
 */
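
/*
 * Illustrative sketch (not compiled in) of the trylock-then-relock dance
 * described above, assuming "first" is already held and address order
 * decides the locking order.  The function is hypothetical; the real
 * variants are cache_trylock_vnodes() and cache_zap_unlocked_bucket().
 */
#if 0
static void
cache_lock_second_example(struct mtx *first, struct mtx *second)
{

	if (second > first) {		/* correct order, take it outright */
		mtx_lock(second);
		return;
	}
	if (mtx_trylock(second))	/* opportunistic attempt */
		return;
	mtx_unlock(first);		/* back off ... */
	mtx_lock(second);		/* ... relock in address order ... */
	mtx_lock(first);		/* ... and revalidate in the caller */
}
#endif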
/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly	ncnegfactor = 5;	/* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly	ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */

struct nchstats	nchstats;		/* cache effectiveness statistics */

static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static struct mtx __exclusive_cache_line	ncneg_shrink_lock;

struct neglist {
	struct mtx		nl_lock;
	TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly	*neglists;
static struct neglist ncneg_hot;
static u_long numhotneg;

static u_int __read_mostly	ncneghash;
#define	numneglists (ncneghash + 1)
static inline struct neglist *
NCP2NEGLIST(struct namecache *ncp)
{

	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
}

static inline struct negstate *
NCP2NEGSTATE(struct namecache *ncp)
{

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	return (&ncp->nc_neg);
}

#define	numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define	HASH2BUCKETLOCK(hash) \
	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

#define	numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
static inline struct mtx *
VP2VNODELOCK(struct vnode *vp)
{

	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
}
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;

static struct namecache *
cache_alloc(int len, int ts)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;

	if (__predict_false(ts)) {
		if (len <= CACHE_PATH_CUTOFF)
			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
		else
			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
		ncp = &ncp_ts->nc_nc;
	} else {
		if (len <= CACHE_PATH_CUTOFF)
			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
		else
			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
	}
	return (ncp);
}
static void
cache_free(struct namecache *ncp)
{
	struct namecache_ts *ncp_ts;

	MPASS(ncp != NULL);
	if ((ncp->nc_flag & NCF_DVDROP) != 0)
		vdrop(ncp->nc_dvp);
	if (__predict_false(ncp->nc_flag & NCF_TS)) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
		else
			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
	} else {
		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
			uma_zfree_smr(cache_zone_small, ncp);
		else
			uma_zfree_smr(cache_zone_large, ncp);
	}
}

static void
cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;

	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
	    (tsp == NULL && ticksp == NULL),
	    ("No NCF_TS"));

	if (tsp == NULL)
		return;

	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
	*tsp = ncp_ts->nc_time;
	*ticksp = ncp_ts->nc_ticks;
}

#ifdef DEBUG_CACHE
static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");

/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)					\
	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)					\
	static COUNTER_U64_DEFINE_EARLY(name);				\
	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
	    descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");

static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");
static void __noinline
cache_maybe_yield(void)
{

	if (should_yield()) {
		cache_yield++;
		kern_yield(PRI_USER);
	}
}

static void
cache_assert_vlp_locked(struct mtx *vlp)
{

	if (vlp != NULL)
		mtx_assert(vlp, MA_OWNED);
}

static void
cache_assert_vnode_locked(struct vnode *vp)
{
	struct mtx *vlp;

	vlp = VP2VNODELOCK(vp);
	cache_assert_vlp_locked(vlp);
}
/*
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.  The choice of FNV should also be revisited.
 */
static void
cache_prehash(struct vnode *vp)
{

	vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}

static uint32_t
cache_get_hash(char *name, u_char len, struct vnode *dvp)
{

	return (fnv_32_buf(name, len, dvp->v_nchash));
}

static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (NCHHASH(hash));
}

static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
	uint32_t hash;

	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
	return (HASH2BUCKETLOCK(hash));
}

#ifdef INVARIANTS
static void
cache_assert_bucket_locked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_OWNED);
}

static void
cache_assert_bucket_unlocked(struct namecache *ncp)
{
	struct mtx *blp;

	blp = NCP2BUCKETLOCK(ncp);
	mtx_assert(blp, MA_NOTOWNED);
}
#else
#define cache_assert_bucket_locked(x) do { } while (0)
#define cache_assert_bucket_unlocked(x) do { } while (0)
#endif
#define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
static void
_cache_sort_vnodes(void **p1, void **p2)
{
	void *tmp;

	MPASS(*p1 != NULL || *p2 != NULL);

	if (*p1 > *p2) {
		tmp = *p2;
		*p2 = *p1;
		*p1 = tmp;
	}
}

static void
cache_lock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_lock(&bucketlocks[i]);
}

static void
cache_unlock_all_buckets(void)
{
	u_int i;

	for (i = 0; i < numbucketlocks; i++)
		mtx_unlock(&bucketlocks[i]);
}

static void
cache_lock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_lock(&vnodelocks[i]);
}

static void
cache_unlock_all_vnodes(void)
{
	u_int i;

	for (i = 0; i < numvnodelocks; i++)
		mtx_unlock(&vnodelocks[i]);
}

static int
cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		if (!mtx_trylock(vlp1))
			return (EAGAIN);
	}
	if (!mtx_trylock(vlp2)) {
		if (vlp1 != NULL)
			mtx_unlock(vlp1);
		return (EAGAIN);
	}

	return (0);
}

static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);
	MPASS(vlp1 <= vlp2);

	if (vlp1 != NULL)
		mtx_lock(vlp1);
	if (vlp2 != NULL)
		mtx_lock(vlp2);
}

static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

	MPASS(vlp1 != NULL || vlp2 != NULL);

	if (vlp1 != NULL)
		mtx_unlock(vlp1);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
}

static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct nchstats snap;

	if (req->oldptr == NULL)
		return (SYSCTL_OUT(req, 0, sizeof(snap)));

	snap = nchstats;
	snap.ncs_goodhits = counter_u64_fetch(numposhits);
	snap.ncs_neghits = counter_u64_fetch(numneghits);
	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
	    counter_u64_fetch(numnegzaps);
	snap.ncs_miss = counter_u64_fetch(nummisszap) +
	    counter_u64_fetch(nummiss);

	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");

static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int i, error, n_nchash, *cntbuf;

retry:
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	if (req->oldptr == NULL)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
	cache_lock_all_buckets();
	if (n_nchash != nchash + 1) {
		cache_unlock_all_buckets();
		free(cntbuf, M_TEMP);
		goto retry;
	}
	/* Scan hash tables counting entries */
	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
			cntbuf[i]++;
	cache_unlock_all_buckets();
	for (error = 0, i = 0; i < n_nchash; i++)
		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
			break;
	free(cntbuf, M_TEMP);
	return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");

static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int n_nchash;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int error;
	int count, maxlength, used, pct;

	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	cache_lock_all_buckets();
	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;
	cache_unlock_all_buckets();
	pct = (used * 100) / (n_nchash / 100);
	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, &pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");

/*
 * Negative entries management
 *
 * A variation of LRU scheme is used.  New entries are hashed into one of
 * numneglists cold lists.  Entries get promoted to the hot list on first hit.
 *
 * The shrinker will demote hot list head and evict from the cold list in a
 * round-robin manner.
 */
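
/*
 * Illustrative sketch (not compiled in): the promotion step reduced to its
 * list manipulation.  The real code in cache_negative_hit() must hold both
 * the hot lock and the cold list lock and re-check NEG_HOT after blocking.
 */
#if 0
static void
cache_negative_promote_example(struct namecache *ncp, struct neglist *nl)
{

	TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);		/* off the cold list */
	TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);	/* onto the hot list */
	NCP2NEGSTATE(ncp)->neg_flag |= NEG_HOT;
	numhotneg++;
}
#endif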
static void
cache_negative_init(struct namecache *ncp)
{
	struct negstate *negstate;

	ncp->nc_flag |= NCF_NEGATIVE;
	negstate = NCP2NEGSTATE(ncp);
	negstate->neg_flag = 0;
}

static void
cache_negative_hit(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0)
		return;
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&ncneg_hot.nl_lock);
	mtx_lock(&neglist->nl_lock);
	if ((negstate->neg_flag & NEG_HOT) == 0) {
		numhotneg++;
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
		negstate->neg_flag |= NEG_HOT;
	}
	mtx_unlock(&neglist->nl_lock);
	mtx_unlock(&ncneg_hot.nl_lock);
}

static void
cache_negative_insert(struct namecache *ncp)
{
	struct neglist *neglist;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	mtx_lock(&neglist->nl_lock);
	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
	mtx_unlock(&neglist->nl_lock);
	atomic_add_rel_long(&numneg, 1);
}

static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}

static void
cache_negative_shrink_select(struct namecache **ncpp,
    struct neglist **neglistpp)
{
	struct neglist *neglist;
	struct namecache *ncp;
	static u_int cycle;
	u_int i;

	*ncpp = ncp = NULL;

	for (i = 0; i < numneglists; i++) {
		neglist = &neglists[(cycle + i) % numneglists];
		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
			continue;
		mtx_lock(&neglist->nl_lock);
		ncp = TAILQ_FIRST(&neglist->nl_list);
		if (ncp != NULL)
			break;
		mtx_unlock(&neglist->nl_lock);
	}

	*neglistpp = neglist;
	*ncpp = ncp;
	cycle++;
}

static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct mtx *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	mtx_lock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	mtx_unlock(blp);
	mtx_unlock(dvlp);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}

static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *blp;

	MPASS(ncp->nc_dvp == vp);
	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	cache_assert_vnode_locked(vp);

	blp = NCP2BUCKETLOCK(ncp);
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
}

static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}

static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct mtx *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	mtx_lock(blp);
	cache_zap_locked(ncp);
	mtx_unlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}

/*
 * If trylocking failed we can get here. We know enough to take all needed locks
 * in the right order and re-lookup the entry.
 */
static int
cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
    struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
    struct mtx *blp)
{
	struct namecache *rncp;

	cache_assert_bucket_unlocked(ncp);

	cache_sort_vnodes(&dvlp, &vlp);
	cache_lock_vnodes(dvlp, vlp);
	mtx_lock(blp);
	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
		if (rncp == ncp && rncp->nc_dvp == dvp &&
		    rncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
			break;
	}
	if (rncp != NULL) {
		cache_zap_locked(rncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
		return (0);
	}

	mtx_unlock(blp);
	cache_unlock_vnodes(dvlp, vlp);
	return (EAGAIN);
}

static int __noinline
cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
    uint32_t hash, struct mtx *blp)
{
	struct mtx *dvlp, *vlp;
	struct vnode *dvp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		mtx_unlock(blp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	dvp = ncp->nc_dvp;
	mtx_unlock(blp);
	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
}

static int
cache_zap_locked_bucket_kl(struct namecache *ncp, struct mtx *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	mtx_unlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	mtx_lock(blp);
	return (EAGAIN);
}

static int __noinline
cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	int ltype;

	*vpp = dvp;
	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
	    dvp, cnp->cn_nameptr);
	counter_u64_add(dothits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
	if (tsp != NULL)
		timespecclear(tsp);
	if (ticksp != NULL)
		*ticksp = ticks;
	vrefact(*vpp);
	/*
	 * When we lookup "." we still can be asked to lock it
	 * differently...
	 */
	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
	if (ltype != VOP_ISLOCKED(*vpp)) {
		if (ltype == LK_EXCLUSIVE) {
			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
			if (VN_IS_DOOMED((*vpp))) {
				/* forced unmount */
				vrele(*vpp);
				*vpp = NULL;
				return (ENOENT);
			}
		} else
			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
	}
	return (-1);
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp);

static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct mtx *dvlp;
	enum vgetstate vs;
	int error, ltype;
	bool whiteout;

	MPASS((cnp->cn_flags & ISDOTDOT) != 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	counter_u64_add(dotdothits, 1);
retry:
	dvlp = VP2VNODELOCK(dvp);
	mtx_lock(dvlp);
	ncp = dvp->v_cache_dd;
	if (ncp == NULL) {
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
		    "..", NULL);
		mtx_unlock(dvlp);
		return (0);
	}
	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
		if (ncp->nc_flag & NCF_NEGATIVE)
			*vpp = NULL;
		else
			*vpp = ncp->nc_vp;
	} else
		*vpp = ncp->nc_dvp;
	/* Return failure if negative entry was found. */
	if (*vpp == NULL)
		goto negative_success;
	CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
	    dvp, cnp->cn_nameptr, *vpp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
	    NCF_DTS && tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		*tsp = ncp_ts->nc_dotdottime;
	}

	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	ltype = VOP_ISLOCKED(dvp);
	VOP_UNLOCK(dvp);
	vs = vget_prep(*vpp);
	mtx_unlock(dvlp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	vn_lock(dvp, ltype | LK_RETRY);
	if (VN_IS_DOOMED(dvp)) {
		if (error == 0)
			vput(*vpp);
		*vpp = NULL;
		return (ENOENT);
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_vnode(ncp, dvp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				cache_maybe_yield();
				goto retry;
			}
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(dvlp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		dvlp = VP2VNODELOCK(dvp);
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
		return (1);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		mtx_unlock(blp);
		goto out_no_entry;
	}

	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
	return (1);
out_no_entry:
	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
	counter_u64_add(nummisszap, 1);
	return (0);
}

/*
 * Lookup a name in the name cache
 *
 * Arguments:
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *		the cn_flags field have the following meanings:
 *	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *			it up.
 *	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *		or negative) lookup, tsp will be filled with any timespec that
 *		was stored when this cache entry was created.  However, it will
 *		be clear for "." entries.
 * - ticks:	Return storage for alternate cache timestamp.  On a successful
 *		(positive or negative) lookup, it will contain the ticks value
 *		that was current when the cache entry was created, unless cnp
 *		was ".".
 *
 * Returns:
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *		to a forced unmount.  vpp will not be modified.  If the entry
 *		is a whiteout, then the ISWHITEOUT flag will be set in
 *		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * Locking:
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
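
/*
 * Illustrative sketch (not compiled in): how a caller decodes the return
 * contract documented above.  The function is hypothetical; the real
 * consumer is the lookup code in vfs_lookup.c.
 */
#if 0
static int
example_consult_cache(struct vnode *dvp, struct componentname *cnp)
{
	struct vnode *vp;
	int error;

	error = cache_lookup(dvp, &vp, cnp, NULL, NULL);
	if (error == -1)	/* positive hit: vp is locked and referenced */
		return (0);
	if (error == ENOENT)	/* negative hit: name known to not exist */
		return (ENOENT);
	/* error == 0: a miss; hypothetical marker meaning "do the real lookup" */
	return (EJUSTRETURN);
}
#endif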
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;

	MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);

retry:
	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		mtx_unlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	vs = vget_prep(*vpp);
	mtx_unlock(blp);
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			counter_u64_add(numnegzaps, 1);
			error = cache_zap_locked_vnode(ncp, dvp);
			if (__predict_false(error != 0)) {
				zap_and_exit_bucket_fail2++;
				cache_maybe_yield();
				goto retry;
			}
			cache_free(ncp);
			return (0);
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	cache_negative_hit(ncp);
	mtx_unlock(blp);
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct negstate *negstate;
	uint32_t hash;
	enum vgetstate vs;
	int error;
	bool whiteout;
	u_char nc_flag;

	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
	}

	MPASS((cnp->cn_flags & ISDOTDOT) == 0);

	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		cache_remove_cnp(dvp, cnp);
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	vfs_smr_enter();

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		vfs_smr_exit();
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	if (!cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		*vpp = NULL;
		goto out_fallback;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (__predict_false(vs == VGET_NONE)) {
		*vpp = NULL;
		goto out_fallback;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (error) {
		*vpp = NULL;
		goto out_fallback;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	if (__predict_false(cnp->cn_nameiop == CREATE)) {
		if (cnp->cn_flags & ISLASTCN) {
			vfs_smr_exit();
			goto out_fallback;
		}
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	whiteout = (ncp->nc_flag & NCF_WHITE);
	/*
	 * We need to take locks to promote an entry.
	 */
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0 ||
	    !cache_ncp_canuse(ncp)) {
		vfs_smr_exit();
		goto out_fallback;
	}
	vfs_smr_exit();
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
out_fallback:
	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}

struct celockstate {
	struct mtx *vlp[3];
	struct mtx *blp[2];
};
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));

static inline void
cache_celockstate_init(struct celockstate *cel)
{

	bzero(cel, sizeof(*cel));
}

static void
cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
    struct vnode *dvp)
{
	struct mtx *vlp1, *vlp2;

	MPASS(cel->vlp[0] == NULL);
	MPASS(cel->vlp[1] == NULL);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL || dvp != NULL);

	vlp1 = VP2VNODELOCK(vp);
	vlp2 = VP2VNODELOCK(dvp);
	cache_sort_vnodes(&vlp1, &vlp2);

	if (vlp1 != NULL) {
		mtx_lock(vlp1);
		cel->vlp[0] = vlp1;
	}
	mtx_lock(vlp2);
	cel->vlp[1] = vlp2;
}

static void
cache_unlock_vnodes_cel(struct celockstate *cel)
{

	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);

	if (cel->vlp[0] != NULL)
		mtx_unlock(cel->vlp[0]);
	if (cel->vlp[1] != NULL)
		mtx_unlock(cel->vlp[1]);
	if (cel->vlp[2] != NULL)
		mtx_unlock(cel->vlp[2]);
}

static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		mtx_lock(vlp);
	} else {
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}

static void
cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
    struct mtx *blp2)
{

	MPASS(cel->blp[0] == NULL);
	MPASS(cel->blp[1] == NULL);

	cache_sort_vnodes(&blp1, &blp2);

	if (blp1 != NULL) {
		mtx_lock(blp1);
		cel->blp[0] = blp1;
	}
	mtx_lock(blp2);
	cel->blp[1] = blp2;
}

static void
cache_unlock_buckets_cel(struct celockstate *cel)
{

	if (cel->blp[0] != NULL)
		mtx_unlock(cel->blp[0]);
	mtx_unlock(cel->blp[1]);
}

/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry.  In this
 * case we have an additional vnode and bucketlock pair to lock.  If the
 * entry is negative, ncelock is locked instead of the vnode.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		if (vp == NULL || vp->v_type != VDIR)
			break;
		ncp = vp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == vp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		/*
		 * All vnodes got re-locked.  Re-validate the state and if
		 * nothing changed we are done.  Otherwise restart.
		 */
		if (ncp == vp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
	struct namecache *ncp;
	struct mtx *blps[2];

	blps[0] = HASH2BUCKETLOCK(hash);
	for (;;) {
		blps[1] = NULL;
		cache_lock_vnodes_cel(cel, dvp, vp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL)
			break;
		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
			break;
		MPASS(ncp->nc_dvp == dvp);
		blps[1] = NCP2BUCKETLOCK(ncp);
		if (ncp->nc_flag & NCF_NEGATIVE)
			break;
		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
			break;
		if (ncp == dvp->v_cache_dd &&
		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
		    blps[1] == NCP2BUCKETLOCK(ncp) &&
		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
			break;
		cache_unlock_vnodes_cel(cel);
		cel->vlp[0] = NULL;
		cel->vlp[1] = NULL;
		cel->vlp[2] = NULL;
	}
	cache_lock_buckets_cel(cel, blps[0], blps[1]);
}

static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}

static void __noinline
cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp)
{
	struct celockstate cel;
	struct namecache *ncp;
	uint32_t hash;
	int len;

	if (dvp->v_cache_dd == NULL)
		return;
	len = cnp->cn_namelen;
	cache_celockstate_init(&cel);
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	cache_enter_lock_dd(&cel, dvp, vp, hash);
	vn_seqc_write_begin(dvp);
	ncp = dvp->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
		cache_zap_locked(ncp);
	} else {
		ncp = NULL;
	}
	dvp->v_cache_dd = NULL;
	vn_seqc_write_end(dvp);
	cache_enter_unlock(&cel);
	if (ncp != NULL)
		cache_free(ncp);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
	struct celockstate cel;
	struct namecache *ncp, *n2, *ndd;
	struct namecache_ts *ncp_ts, *n2_ts;
	struct nchashhead *ncpp;
	uint32_t hash;
	int flag;
	int len;
	u_long lnumcache;

	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
	VNPASS(!VN_IS_DOOMED(dvp), dvp);
	VNPASS(dvp->v_type != VNON, dvp);
	if (vp != NULL) {
		VNPASS(!VN_IS_DOOMED(vp), vp);
		VNPASS(vp->v_type != VNON, vp);
	}

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache))
		return;
#endif

	flag = 0;
	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
		if (cnp->cn_namelen == 1)
			return;
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			cache_enter_dotdot_prep(dvp, vp, cnp);
			flag = NCF_ISDOTDOT;
		}
	}

	/*
	 * Avoid blowout in namecache entries.
	 */
	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
	if (__predict_false(lnumcache >= ncsize)) {
		atomic_add_long(&numcache, -1);
		counter_u64_add(numdrops, 1);
		return;
	}

	cache_celockstate_init(&cel);
	ndd = NULL;
	ncp_ts = NULL;

	/*
	 * Calculate the hash key and setup as much of the new
	 * namecache entry as possible before acquiring the lock.
	 */
	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
	ncp->nc_flag = flag | NCF_WIP;
	ncp->nc_vp = vp;
	if (vp == NULL)
		cache_negative_init(ncp);
	ncp->nc_dvp = dvp;
	if (tsp != NULL) {
		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
		ncp_ts->nc_time = *tsp;
		ncp_ts->nc_ticks = ticks;
		ncp_ts->nc_nc.nc_flag |= NCF_TS;
		if (dtsp != NULL) {
			ncp_ts->nc_dotdottime = *dtsp;
			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
		}
	}
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
	ncp->nc_name[len] = '\0';
	cache_enter_lock(&cel, dvp, vp, hash);

	/*
	 * See if this vnode or negative entry is already in the cache
	 * with this name.  This can happen with concurrent lookups of
	 * the same path name.
	 */
	ncpp = NCHHASH(hash);
	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
		if (n2->nc_dvp == dvp &&
		    n2->nc_nlen == cnp->cn_namelen &&
		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
			MPASS(cache_ncp_canuse(n2));
			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
				KASSERT(vp == NULL,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, NULL, vp));
			else
				KASSERT(n2->nc_vp == vp,
				    ("%s: found entry pointing to a different vnode (%p != %p)",
				    __func__, n2->nc_vp, vp));
			if (tsp != NULL) {
				KASSERT((n2->nc_flag & NCF_TS) != 0,
				    ("no NCF_TS"));
				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
				n2_ts->nc_time = ncp_ts->nc_time;
				n2_ts->nc_ticks = ncp_ts->nc_ticks;
				if (dtsp != NULL) {
					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
					n2_ts->nc_nc.nc_flag |= NCF_DTS;
				}
			}
			goto out_unlock_free;
		}
	}

	if (flag == NCF_ISDOTDOT) {
		/*
		 * See if we are trying to add .. entry, but some other lookup
		 * has populated v_cache_dd pointer already.
		 */
		if (dvp->v_cache_dd != NULL)
			goto out_unlock_free;
		KASSERT(vp == NULL || vp->v_type == VDIR,
		    ("wrong vnode type %p", vp));
		vn_seqc_write_begin(dvp);
		dvp->v_cache_dd = ncp;
		vn_seqc_write_end(dvp);
	}

	if (vp != NULL) {
		if (flag != NCF_ISDOTDOT) {
			/*
			 * For this case, the cache entry maps both the
			 * directory name in it and the name ".." for the
			 * directory's parent.
			 */
			vn_seqc_write_begin(vp);
			if ((ndd = vp->v_cache_dd) != NULL) {
				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
					cache_zap_locked(ndd);
				else
					ndd = NULL;
			}
			vp->v_cache_dd = ncp;
			vn_seqc_write_end(vp);
		} else if (vp->v_type != VDIR) {
			if (vp->v_cache_dd != NULL) {
				vn_seqc_write_begin(vp);
				vp->v_cache_dd = NULL;
				vn_seqc_write_end(vp);
			}
		}
	}

	if (flag != NCF_ISDOTDOT) {
		if (LIST_EMPTY(&dvp->v_cache_src)) {
			vhold(dvp);
			counter_u64_add(numcachehv, 1);
		}
		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	}

	/*
	 * If the entry is "negative", we place it into the
	 * "negative" cache queue, otherwise, we place it into the
	 * destination vnode's cache entries queue.
	 */
	if (vp != NULL) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
		    vp);
	} else {
		if (cnp->cn_flags & ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITE;
		cache_negative_insert(ncp);
		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
		    ncp->nc_name);
	}

	/*
	 * Insert the new namecache entry into the appropriate chain
	 * within the cache entries table.
	 */
	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

	atomic_thread_fence_rel();
	/*
	 * Mark the entry as fully constructed.
	 * It is immutable past this point until its removal.
	 */
	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

	cache_enter_unlock(&cel);
	if (numneg * ncnegfactor > lnumcache)
		cache_negative_zap_one();
	return;
out_unlock_free:
	cache_enter_unlock(&cel);
	atomic_add_long(&numcache, -1);
	cache_free(ncp);
}
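
/*
 * Illustrative sketch (not compiled in): filesystems typically feed lookup
 * results back through cache_enter(), the wrapper around cache_enter_time()
 * declared in sys/vnode.h.  The function below is hypothetical.
 */
#if 0
static void
example_lookup_done(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, int error)
{

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return;
	if (error == 0)
		cache_enter(dvp, vp, cnp);	/* positive entry */
	else if (error == ENOENT)
		cache_enter(dvp, NULL, cnp);	/* negative entry */
}
#endif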
static u_int
cache_roundup_2(u_int val)
{
	u_int res;

	for (res = 1; res <= val; res <<= 1)
		continue;

	return (res);
}

static struct nchashhead *
nchinittbl(u_long elements, u_long *hashmask)
{
	struct nchashhead *hashtbl;
	u_long hashsize, i;

	hashsize = cache_roundup_2(elements) / 2;

	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		CK_SLIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}
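
/*
 * Illustrative sketch (not compiled in): the table size is always a power
 * of two, so NCHHASH() can select a chain with a mask instead of a modulo.
 * For example, elements = 1000 gives cache_roundup_2(1000) / 2 = 512
 * buckets and a hashmask of 0x1ff.
 */
#if 0
static struct nchashhead *
example_pick_chain(struct nchashhead *tbl, u_long mask, uint32_t hash)
{

	return (&tbl[hash & mask]);	/* same masking NCHHASH() performs */
}
#endif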
static void
ncfreetbl(struct nchashhead *hashtbl)
{

	free(hashtbl, M_VFSCACHE);
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{
	u_int i;

	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);

	VFS_SMR_ZONE_SET(cache_zone_small);
	VFS_SMR_ZONE_SET(cache_zone_small_ts);
	VFS_SMR_ZONE_SET(cache_zone_large);
	VFS_SMR_ZONE_SET(cache_zone_large_ts);

	ncsize = desiredvnodes * ncsizefactor;
	nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
		ncbuckethash = 7;
	if (ncbuckethash > nchash)
		ncbuckethash = nchash;
	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numbucketlocks; i++)
		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
	ncvnodehash = ncbuckethash;
	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numvnodelocks; i++)
		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
	ncpurgeminvnodes = numbucketlocks * 2;

	ncneghash = 3;
	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
	    M_WAITOK | M_ZERO);
	for (i = 0; i < numneglists; i++) {
		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
		TAILQ_INIT(&neglists[i].nl_list);
	}
	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
	TAILQ_INIT(&ncneg_hot.nl_list);

	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);

void
cache_vnode_init(struct vnode *vp)
{

	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	vp->v_cache_dd = NULL;
	cache_prehash(vp);
}

void
cache_changesize(u_long newmaxvnodes)
{
	struct nchashhead *new_nchashtbl, *old_nchashtbl;
	u_long new_nchash, old_nchash;
	struct namecache *ncp;
	uint32_t hash;
	u_long newncsize;
	u_long i;

	newncsize = newmaxvnodes * ncsizefactor;
	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
	if (newmaxvnodes < numbucketlocks)
		newmaxvnodes = numbucketlocks;

	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
	/* If same hash table size, nothing to do */
	if (nchash == new_nchash) {
		ncfreetbl(new_nchashtbl);
		return;
	}
	/*
	 * Move everything from the old hash table to the new table.
	 * None of the namecache entries in the table can be removed
	 * because to do so, they have to be removed from the hash table.
	 */
	cache_lock_all_vnodes();
	cache_lock_all_buckets();
	old_nchashtbl = nchashtbl;
	old_nchash = nchash;
	nchashtbl = new_nchashtbl;
	nchash = new_nchash;
	for (i = 0; i <= old_nchash; i++) {
		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
			    ncp->nc_dvp);
			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
			CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
		}
	}
	ncsize = newncsize;
	cache_unlock_all_buckets();
	cache_unlock_all_vnodes();
	ncfreetbl(old_nchashtbl);
}

/*
 * Invalidate all entries from and to a particular vnode.
 */
static void
cache_purge_impl(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp, *vlp2;

	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	vlp2 = NULL;
	mtx_assert(vlp, MA_OWNED);
retry:
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		    ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

void
cache_purge(struct vnode *vp)
{
	struct mtx *vlp;

	SDT_PROBE1(vfs, namecache, purge, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)
		return;
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	cache_purge_impl(vp);
}

/*
 * Only to be used by vgone.
 */
void
cache_purge_vgone(struct vnode *vp)
{
	struct mtx *vlp;

	VNPASS(VN_IS_DOOMED(vp), vp);
	vlp = VP2VNODELOCK(vp);
	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)) {
		mtx_lock(vlp);
		cache_purge_impl(vp);
		mtx_assert(vlp, MA_NOTOWNED);
		return;
	}

	/*
	 * All the NULL pointer state we found above may be transient.
	 * Serialize against a possible thread doing cache_purge.
	 */
	mtx_wait_unlocked(vlp);
	if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
	    vp->v_cache_dd == NULL)) {
		mtx_lock(vlp);
		cache_purge_impl(vp);
		mtx_assert(vlp, MA_NOTOWNED);
		return;
	}
}

/*
 * Invalidate all negative entries for a particular directory vnode.
 */
void
cache_purge_negative(struct vnode *vp)
{
	TAILQ_HEAD(, namecache) ncps;
	struct namecache *ncp, *nnp;
	struct mtx *vlp;

	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
	if (LIST_EMPTY(&vp->v_cache_src))
		return;
	TAILQ_INIT(&ncps);
	vlp = VP2VNODELOCK(vp);
	mtx_lock(vlp);
	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
		if (!(ncp->nc_flag & NCF_NEGATIVE))
			continue;
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
	}
	mtx_unlock(vlp);
	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
		cache_free(ncp);
	}
}

void
2433 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2434 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2437 ASSERT_VOP_IN_SEQC(fdvp);
2438 ASSERT_VOP_IN_SEQC(fvp);
2439 ASSERT_VOP_IN_SEQC(tdvp);
2441 ASSERT_VOP_IN_SEQC(tvp);
2446 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2447 ("%s: lingering negative entry", __func__));
2449 cache_remove_cnp(tdvp, tcnp);
2454 * Flush all entries referencing a particular filesystem.
2457 cache_purgevfs(struct mount *mp, bool force)
2459 TAILQ_HEAD(, namecache) ncps;
2460 struct mtx *vlp1, *vlp2;
2462 struct nchashhead *bucket;
2463 struct namecache *ncp, *nnp;
2464 u_long i, j, n_nchash;
2467 /* Scan hash tables for applicable entries */
2468 SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2469 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2472 n_nchash = nchash + 1;
2474 for (i = 0; i < numbucketlocks; i++) {
2475 blp = (struct mtx *)&bucketlocks[i];
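		/*
		 * Each bucket lock covers exactly the buckets whose index is
		 * congruent to i modulo numbucketlocks, hence the stride below.
		 */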
2477 for (j = i; j < n_nchash; j += numbucketlocks) {
2479 bucket = &nchashtbl[j];
2480 CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2481 cache_assert_bucket_locked(ncp);
2482 if (ncp->nc_dvp->v_mount != mp)
2484 error = cache_zap_locked_bucket_kl(ncp, blp,
2488 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2492 if (vlp1 == NULL && vlp2 == NULL)
2493 cache_maybe_yield();
2500 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 * Perform canonical checks and cache lookup, passing the request on to the
 * filesystem through VOP_CACHEDLOOKUP only if needed.
2511 vfs_cache_lookup(struct vop_lookup_args *ap)
2515 struct vnode **vpp = ap->a_vpp;
2516 struct componentname *cnp = ap->a_cnp;
2517 int flags = cnp->cn_flags;
2522 if (dvp->v_type != VDIR)
2525 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2526 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2529 error = vn_dir_check_exec(dvp, cnp);
2533 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2535 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
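
/*
 * A filesystem opts into this treatment by routing lookups through the cache
 * and providing its real lookup routine as vop_cachedlookup. A sketch of the
 * wiring, following the pattern used by e.g. UFS ("foo" is a placeholder):
 *
 *	struct vop_vector foo_vnodeops = {
 *		...
 *		.vop_lookup =		vfs_cache_lookup,
 *		.vop_cachedlookup =	foo_lookup,
 *		...
 *	};
 */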
2541 /* Implementation of the getcwd syscall. */
2543 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2549 buflen = uap->buflen;
2550 if (__predict_false(buflen < 2))
2552 if (buflen > MAXPATHLEN)
2553 buflen = MAXPATHLEN;
2555 buf = uma_zalloc(namei_zone, M_WAITOK);
2556 error = vn_getcwd(buf, &retbuf, &buflen);
2558 error = copyout(retbuf, uap->buf, buflen);
2559 uma_zfree(namei_zone, buf);
2564 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2570 pwd = pwd_get_smr();
2571 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2573 VFS_SMR_ASSERT_NOT_ENTERED();
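	/*
	 * Should the lockless resolution fail, it is redone below with the
	 * pwd referenced so that cdir and rdir cannot go away mid-walk.
	 */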
2575 pwd = pwd_hold(curthread);
2576 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2582 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2589 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2590 size_t size, int flags, enum uio_seg pathseg)
2592 struct nameidata nd;
2593 char *retbuf, *freebuf;
2598 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2599 pathseg, path, fd, &cap_fstat_rights, td);
2600 if ((error = namei(&nd)) != 0)
2602 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2604 error = copyout(retbuf, buf, size);
2605 free(freebuf, M_TEMP);
2612 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2615 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2616 uap->flags, UIO_USERSPACE));
 * Retrieve the full filesystem path that corresponds to a vnode from the name
 * cache (if available).
2624 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2631 if (__predict_false(vp == NULL))
2634 buflen = MAXPATHLEN;
2635 buf = malloc(buflen, M_TEMP, M_WAITOK);
2637 pwd = pwd_get_smr();
2638 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2639 VFS_SMR_ASSERT_NOT_ENTERED();
2641 pwd = pwd_hold(curthread);
2642 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
 * This function is similar to vn_fullpath, but it attempts to look up the
2654 * pathname relative to the global root mount point. This is required for the
2655 * auditing sub-system, as audited pathnames must be absolute, relative to the
2656 * global root mount point.
2659 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2665 if (__predict_false(vp == NULL))
2667 buflen = MAXPATHLEN;
2668 buf = malloc(buflen, M_TEMP, M_WAITOK);
2670 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2671 VFS_SMR_ASSERT_NOT_ENTERED();
2673 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2682 static struct namecache *
2683 vn_dd_from_dst(struct vnode *vp)
2685 struct namecache *ncp;
2687 cache_assert_vnode_locked(vp);
2688 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2689 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2696 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2699 struct namecache *ncp;
2703 vlp = VP2VNODELOCK(*vp);
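	/*
	 * Prefer the cached v_cache_dd entry if it directly names this vnode
	 * in its parent; otherwise scan the destination list for any entry
	 * which is not a ".." link.
	 */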
2705 ncp = (*vp)->v_cache_dd;
2706 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2707 KASSERT(ncp == vn_dd_from_dst(*vp),
2708 ("%s: mismatch for dd entry (%p != %p)", __func__,
2709 ncp, vn_dd_from_dst(*vp)));
2711 ncp = vn_dd_from_dst(*vp);
2714 if (*buflen < ncp->nc_nlen) {
2717 counter_u64_add(numfullpathfail4, 1);
2719 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2723 *buflen -= ncp->nc_nlen;
2724 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2725 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2734 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2737 vn_lock(*vp, LK_SHARED | LK_RETRY);
2738 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2741 counter_u64_add(numfullpathfail2, 1);
2742 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2747 if (VN_IS_DOOMED(dvp)) {
2748 /* forced unmount */
2751 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2755 * *vp has its use count incremented still.
2762 * Resolve a directory to a pathname.
2764 * The name of the directory can always be found in the namecache or fetched
2765 * from the filesystem. There is also guaranteed to be only one parent, meaning
2766 * we can just follow vnodes up until we find the root.
2768 * The vnode must be referenced.
2771 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2772 size_t *len, bool slash_prefixed, size_t addend)
2774 #ifdef KDTRACE_HOOKS
2775 struct vnode *startvp = vp;
2781 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2782 VNPASS(vp->v_usecount > 0, vp);
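	/*
	 * The path is assembled backwards, from the end of the buffer towards
	 * its beginning; on success *retbuf points at the first character.
	 */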
2786 if (!slash_prefixed) {
2794 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2795 counter_u64_add(numfullpathcalls, 1);
2796 while (vp != rdir && vp != rootvnode) {
2798 * The vp vnode must be already fully constructed,
2799 * since it is either found in namecache or obtained
2800 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
2801 * without obtaining the vnode lock.
2803 if ((vp->v_vflag & VV_ROOT) != 0) {
2804 vn_lock(vp, LK_RETRY | LK_SHARED);
2807 * With the vnode locked, check for races with
2808 * unmount, forced or not. Note that we
2809 * already verified that vp is not equal to
2810 * the root vnode, which means that
 * mnt_vnodecovered can be NULL only for the
 * case of unmount.
2814 if (VN_IS_DOOMED(vp) ||
2815 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2816 vp1->v_mountedhere != vp->v_mount) {
2819 SDT_PROBE3(vfs, namecache, fullpath, return,
2829 if (vp->v_type != VDIR) {
2831 counter_u64_add(numfullpathfail1, 1);
2833 SDT_PROBE3(vfs, namecache, fullpath, return,
2837 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2843 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2847 buf[--buflen] = '/';
2848 slash_prefixed = true;
2852 if (!slash_prefixed) {
2855 counter_u64_add(numfullpathfail4, 1);
2856 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2860 buf[--buflen] = '/';
2862 counter_u64_add(numfullpathfound, 1);
2865 *retbuf = buf + buflen;
2866 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2873 * Resolve an arbitrary vnode to a pathname.
2876 * - hardlinks are not tracked, thus if the vnode is not a directory this can
2877 * resolve to a different path than the one used to find it
2878 * - namecache is not mandatory, meaning names are not guaranteed to be added
2879 * (in which case resolving fails)
2881 static void __inline
2882 cache_rev_failed_impl(int *reason, int line)
2887 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
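
/*
 * On failure the line number of the bailing check is recorded and later
 * handed to the fullpath_smr miss probe, making the exact reason for
 * falling back recoverable with dtrace.
 */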
2890 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2891 char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2893 #ifdef KDTRACE_HOOKS
2894 struct vnode *startvp = vp;
2898 struct namecache *ncp;
2902 #ifdef KDTRACE_HOOKS
2905 seqc_t vp_seqc, tvp_seqc;
2908 VFS_SMR_ASSERT_ENTERED();
2910 if (!cache_fast_revlookup) {
2915 orig_buflen = *buflen;
2917 if (!slash_prefixed) {
2918 MPASS(*buflen >= 2);
2920 buf[*buflen] = '\0';
2923 if (vp == rdir || vp == rootvnode) {
2924 if (!slash_prefixed) {
2931 #ifdef KDTRACE_HOOKS
2935 vp_seqc = vn_seqc_read_any(vp);
2936 if (seqc_in_modify(vp_seqc)) {
2937 cache_rev_failed(&reason);
2942 #ifdef KDTRACE_HOOKS
2945 if ((vp->v_vflag & VV_ROOT) != 0) {
2946 mp = atomic_load_ptr(&vp->v_mount);
2948 cache_rev_failed(&reason);
2951 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2952 tvp_seqc = vn_seqc_read_any(tvp);
2953 if (seqc_in_modify(tvp_seqc)) {
2954 cache_rev_failed(&reason);
2957 if (!vn_seqc_consistent(vp, vp_seqc)) {
2958 cache_rev_failed(&reason);
2965 ncp = atomic_load_ptr(&vp->v_cache_dd);
2967 cache_rev_failed(&reason);
2970 nc_flag = atomic_load_char(&ncp->nc_flag);
2971 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2972 cache_rev_failed(&reason);
2975 if (!cache_ncp_canuse(ncp)) {
2976 cache_rev_failed(&reason);
2979 if (ncp->nc_nlen >= *buflen) {
2980 cache_rev_failed(&reason);
2984 *buflen -= ncp->nc_nlen;
2985 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2989 tvp_seqc = vn_seqc_read_any(tvp);
2990 if (seqc_in_modify(tvp_seqc)) {
2991 cache_rev_failed(&reason);
2994 if (!vn_seqc_consistent(vp, vp_seqc)) {
2995 cache_rev_failed(&reason);
3000 if (vp == rdir || vp == rootvnode)
3005 *retbuf = buf + *buflen;
3006 *buflen = orig_buflen - *buflen + addend;
3007 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3011 *buflen = orig_buflen;
3012 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3018 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3022 bool slash_prefixed;
3028 orig_buflen = *buflen;
3031 slash_prefixed = false;
3032 if (vp->v_type != VDIR) {
3034 buf[*buflen] = '\0';
3035 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
3044 slash_prefixed = true;
3047 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
3048 orig_buflen - *buflen));
3052 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 * Since the namecache does not track hardlinks, the caller is expected to first
3055 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3057 * Then we have 2 cases:
3058 * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
3060 * - otherwise we populate the buffer with the saved name and start resolving
3064 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3069 struct componentname *cnp;
3073 bool slash_prefixed;
3078 if (*buflen > MAXPATHLEN)
3079 *buflen = MAXPATHLEN;
3081 slash_prefixed = false;
3083 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3088 * Check for VBAD to work around the vp_crossmp bug in lookup().
3090 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3091 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3092 * If the type is VDIR (like in this very case) we can skip looking
3093 * at ni_dvp in the first place. However, since vnodes get passed here
3094 * unlocked the target may transition to doomed state (type == VBAD)
3095 * before we get to evaluate the condition. If this happens, we will
3096 * populate part of the buffer and descend to vn_fullpath_dir with
3097 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
 * This should be atomic_load(&vp->v_type) but it is illegal to take
3100 * an address of a bit field, even if said field is sized to char.
3101 * Work around the problem by reading the value into a full-sized enum
3102 * and then re-reading it with atomic_load which will still prevent
3103 * the compiler from re-reading down the road.
3106 type = atomic_load_int(&type);
3113 addend = cnp->cn_namelen + 2;
3114 if (*buflen < addend) {
3119 tmpbuf = buf + *buflen;
3121 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3122 tmpbuf[addend - 1] = '\0';
3123 slash_prefixed = true;
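	/*
	 * The tail of the buffer now holds "/<name>\0"; the resolvers below
	 * keep prepending parent components in front of it.
	 */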
3128 pwd = pwd_get_smr();
3129 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3130 slash_prefixed, addend);
3131 VFS_SMR_ASSERT_NOT_ENTERED();
3133 pwd = pwd_hold(curthread);
3135 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3136 slash_prefixed, addend);
3151 vn_dir_dd_ino(struct vnode *vp)
3153 struct namecache *ncp;
3158 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3159 vlp = VP2VNODELOCK(vp);
3161 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3162 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3165 vs = vget_prep(ddvp);
3167 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3176 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3178 struct namecache *ncp;
3182 vlp = VP2VNODELOCK(vp);
3184 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3185 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3191 l = min(ncp->nc_nlen, buflen - 1);
3192 memcpy(buf, ncp->nc_name, l);
 * This function updates the path string to the vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
3202 * Requires a locked, referenced vnode.
3203 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3205 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3206 * because it falls back to the ".." lookup if the namecache lookup fails.
3209 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3212 struct nameidata nd;
3217 ASSERT_VOP_ELOCKED(vp, __func__);
3219 /* Construct global filesystem path from vp. */
3221 error = vn_fullpath_global(vp, &rpath, &fbuf);
3228 if (strlen(rpath) >= pathlen) {
3230 error = ENAMETOOLONG;
3235 * Re-lookup the vnode by path to detect a possible rename.
3236 * As a side effect, the vnode is relocked.
3237 * If vnode was renamed, return ENOENT.
3239 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3240 UIO_SYSSPACE, path, td);
3246 NDFREE(&nd, NDF_ONLY_PNBUF);
3250 strcpy(path, rpath);
3263 db_print_vpath(struct vnode *vp)
3266 while (vp != NULL) {
3267 db_printf("%p: ", vp);
3268 if (vp == rootvnode) {
3272 if (vp->v_vflag & VV_ROOT) {
3273 db_printf("<mount point>");
3274 vp = vp->v_mount->mnt_vnodecovered;
3276 struct namecache *ncp;
3280 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3283 for (i = 0; i < ncp->nc_nlen; i++)
3284 db_printf("%c", *ncn++);
3297 DB_SHOW_COMMAND(vpath, db_show_vpath)
3302 db_printf("usage: show vpath <struct vnode *>\n");
3306 vp = (struct vnode *)addr;
3312 static bool __read_frequently cache_fast_lookup = true;
3313 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3314 &cache_fast_lookup, 0, "");
3316 #define CACHE_FPL_FAILED -2020
3319 cache_fpl_cleanup_cnp(struct componentname *cnp)
3322 uma_zfree(namei_zone, cnp->cn_pnbuf);
3324 cnp->cn_pnbuf = NULL;
3325 cnp->cn_nameptr = NULL;
3330 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3332 struct componentname *cnp;
3335 while (*(cnp->cn_nameptr) == '/') {
3340 *dpp = ndp->ni_rootdir;
3344 * Components of nameidata (or objects it can point to) which may
3345 * need restoring in case fast path lookup fails.
3347 struct nameidata_saved {
3355 struct nameidata *ndp;
3356 struct componentname *cnp;
3362 struct nameidata_saved snd;
3364 enum cache_fpl_status status:8;
3369 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3372 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3373 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3374 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3375 snd->ni_pathlen = fpl->ndp->ni_pathlen;
3379 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3382 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3383 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3384 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3385 fpl->ndp->ni_pathlen = snd->ni_pathlen;
3389 #define cache_fpl_smr_assert_entered(fpl) ({ \
3390 struct cache_fpl *_fpl = (fpl); \
3391 MPASS(_fpl->in_smr == true); \
3392 VFS_SMR_ASSERT_ENTERED(); \
3394 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
3395 struct cache_fpl *_fpl = (fpl); \
3396 MPASS(_fpl->in_smr == false); \
3397 VFS_SMR_ASSERT_NOT_ENTERED(); \
3400 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3401 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3404 #define cache_fpl_smr_enter_initial(fpl) ({ \
3405 struct cache_fpl *_fpl = (fpl); \
3407 _fpl->in_smr = true; \
3410 #define cache_fpl_smr_enter(fpl) ({ \
3411 struct cache_fpl *_fpl = (fpl); \
3412 MPASS(_fpl->in_smr == false); \
3414 _fpl->in_smr = true; \
3417 #define cache_fpl_smr_exit(fpl) ({ \
3418 struct cache_fpl *_fpl = (fpl); \
3419 MPASS(_fpl->in_smr == true); \
3421 _fpl->in_smr = false; \
3425 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3428 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3429 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3430 ("%s: converting to abort from %d at %d, set at %d\n",
3431 __func__, fpl->status, line, fpl->line));
3433 fpl->status = CACHE_FPL_STATUS_ABORTED;
3435 return (CACHE_FPL_FAILED);
3438 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
3441 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3444 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3445 ("%s: setting to partial at %d, but already set to %d at %d\n",
3446 __func__, line, fpl->status, fpl->line));
3447 cache_fpl_smr_assert_entered(fpl);
3448 fpl->status = CACHE_FPL_STATUS_PARTIAL;
3450 return (CACHE_FPL_FAILED);
3453 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
3456 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3459 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3460 ("%s: setting to handled at %d, but already set to %d at %d\n",
3461 __func__, line, fpl->status, fpl->line));
3462 cache_fpl_smr_assert_not_entered(fpl);
3463 MPASS(error != CACHE_FPL_FAILED);
3464 fpl->status = CACHE_FPL_STATUS_HANDLED;
3469 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3471 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3472 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3473 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3475 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3476 (ISDOTDOT | MAKEENTRY | ISLASTCN)
3478 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3479 "supported and internal flags overlap");
3482 cache_fpl_islastcn(struct nameidata *ndp)
3485 return (*ndp->ni_next == 0);
3489 cache_fpl_isdotdot(struct componentname *cnp)
3492 if (cnp->cn_namelen == 2 &&
3493 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3499 cache_can_fplookup(struct cache_fpl *fpl)
3501 struct nameidata *ndp;
3502 struct componentname *cnp;
3507 td = cnp->cn_thread;
3509 if (!cache_fast_lookup) {
3510 cache_fpl_aborted(fpl);
3514 if (mac_vnode_check_lookup_enabled()) {
3515 cache_fpl_aborted(fpl);
3519 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3520 cache_fpl_aborted(fpl);
3523 if (ndp->ni_dirfd != AT_FDCWD) {
3524 cache_fpl_aborted(fpl);
3527 if (IN_CAPABILITY_MODE(td)) {
3528 cache_fpl_aborted(fpl);
3531 if (AUDITING_TD(td)) {
3532 cache_fpl_aborted(fpl);
3535 if (ndp->ni_startdir != NULL) {
3536 cache_fpl_aborted(fpl);
3543 cache_fplookup_vnode_supported(struct vnode *vp)
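
	/* Symlinks are the only vnode type the lockless lookup does not handle. */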
3546 return (vp->v_type != VLNK);
3550 * Move a negative entry to the hot list.
3552 * We have to take locks, but they may be contended and in the worst
3553 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block within it. Instead we are going to
3555 * look up the entry again.
3557 static int __noinline
3558 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3561 struct componentname *cnp;
3562 struct namecache *ncp;
3563 struct neglist *neglist;
3564 struct negstate *negstate;
3571 if (!vhold_smr(dvp))
3572 return (cache_fpl_aborted(fpl));
3574 neglist = NCP2NEGLIST(oncp);
3575 cache_fpl_smr_exit(fpl);
3577 mtx_lock(&ncneg_hot.nl_lock);
3578 mtx_lock(&neglist->nl_lock);
3580 * For hash iteration.
3582 cache_fpl_smr_enter(fpl);
3585 * Avoid all surprises by only succeeding if we got the same entry and
3586 * bailing completely otherwise.
3588 * In particular at this point there can be a new ncp which matches the
3589 * search but hashes to a different neglist.
3591 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3597 * No match to begin with.
3599 if (__predict_false(ncp == NULL)) {
3604 * The newly found entry may be something different...
3606 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3607 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3612 * ... and not even negative.
3614 nc_flag = atomic_load_char(&ncp->nc_flag);
3615 if ((nc_flag & NCF_NEGATIVE) == 0) {
3619 if (__predict_false(!cache_ncp_canuse(ncp))) {
3623 negstate = NCP2NEGSTATE(ncp);
3624 if ((negstate->neg_flag & NEG_HOT) == 0) {
3626 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3627 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3628 negstate->neg_flag |= NEG_HOT;
3631 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3632 counter_u64_add(numneghits, 1);
3633 cache_fpl_smr_exit(fpl);
3634 mtx_unlock(&neglist->nl_lock);
3635 mtx_unlock(&ncneg_hot.nl_lock);
3637 return (cache_fpl_handled(fpl, ENOENT));
3639 cache_fpl_smr_exit(fpl);
3640 mtx_unlock(&neglist->nl_lock);
3641 mtx_unlock(&ncneg_hot.nl_lock);
3643 return (cache_fpl_aborted(fpl));
 * The target vnode is not supported; prepare for the slow path to take over.
3649 static int __noinline
3650 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3652 struct nameidata *ndp;
3653 struct componentname *cnp;
3662 dvp_seqc = fpl->dvp_seqc;
3664 dvs = vget_prep_smr(dvp);
3665 if (__predict_false(dvs == VGET_NONE)) {
3666 cache_fpl_smr_exit(fpl);
3667 return (cache_fpl_aborted(fpl));
3670 cache_fpl_smr_exit(fpl);
3672 vget_finish_ref(dvp, dvs);
3673 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3675 return (cache_fpl_aborted(fpl));
3678 pwd = pwd_hold(curthread);
3679 if (fpl->pwd != pwd) {
3682 return (cache_fpl_aborted(fpl));
3685 cache_fpl_restore(fpl, &fpl->snd);
3687 ndp->ni_startdir = dvp;
3688 cnp->cn_flags |= MAKEENTRY;
3689 if (cache_fpl_islastcn(ndp))
3690 cnp->cn_flags |= ISLASTCN;
3691 if (cache_fpl_isdotdot(cnp))
3692 cnp->cn_flags |= ISDOTDOT;
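
	/*
	 * The nameidata now looks the way regular lookup would expect it to
	 * at this component, allowing it to take over seamlessly.
	 */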
3698 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3700 struct componentname *cnp;
3707 tvp_seqc = fpl->tvp_seqc;
3709 if ((cnp->cn_flags & LOCKLEAF) != 0) {
3710 lkflags = LK_SHARED;
3711 if ((cnp->cn_flags & LOCKSHARED) == 0)
3712 lkflags = LK_EXCLUSIVE;
3713 error = vget_finish(tvp, lkflags, tvs);
3714 if (__predict_false(error != 0)) {
3715 return (cache_fpl_aborted(fpl));
3718 vget_finish_ref(tvp, tvs);
3721 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3722 if ((cnp->cn_flags & LOCKLEAF) != 0)
3726 return (cache_fpl_aborted(fpl));
3729 return (cache_fpl_handled(fpl, 0));
3733 * They want to possibly modify the state of the namecache.
3735 * Don't try to match the API contract, just leave.
3736 * TODO: this leaves scalability on the table
3739 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3741 struct componentname *cnp;
3744 MPASS(cnp->cn_nameiop != LOOKUP);
3745 return (cache_fpl_partial(fpl));
3748 static int __noinline
3749 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3751 struct componentname *cnp;
3752 enum vgetstate dvs, tvs;
3753 struct vnode *dvp, *tvp;
3754 seqc_t dvp_seqc, tvp_seqc;
3759 dvp_seqc = fpl->dvp_seqc;
3761 tvp_seqc = fpl->tvp_seqc;
3763 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3766 * This is less efficient than it can be for simplicity.
3768 dvs = vget_prep_smr(dvp);
3769 if (__predict_false(dvs == VGET_NONE)) {
3770 return (cache_fpl_aborted(fpl));
3772 tvs = vget_prep_smr(tvp);
3773 if (__predict_false(tvs == VGET_NONE)) {
3774 cache_fpl_smr_exit(fpl);
3775 vget_abort(dvp, dvs);
3776 return (cache_fpl_aborted(fpl));
3779 cache_fpl_smr_exit(fpl);
3781 if ((cnp->cn_flags & LOCKPARENT) != 0) {
3782 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3783 if (__predict_false(error != 0)) {
3784 vget_abort(tvp, tvs);
3785 return (cache_fpl_aborted(fpl));
3788 vget_finish_ref(dvp, dvs);
3791 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3792 vget_abort(tvp, tvs);
3793 if ((cnp->cn_flags & LOCKPARENT) != 0)
3797 return (cache_fpl_aborted(fpl));
3800 error = cache_fplookup_final_child(fpl, tvs);
3801 if (__predict_false(error != 0)) {
3802 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3803 if ((cnp->cn_flags & LOCKPARENT) != 0)
3810 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3815 cache_fplookup_final(struct cache_fpl *fpl)
3817 struct componentname *cnp;
3819 struct vnode *dvp, *tvp;
3820 seqc_t dvp_seqc, tvp_seqc;
3824 dvp_seqc = fpl->dvp_seqc;
3826 tvp_seqc = fpl->tvp_seqc;
3828 VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3830 if (cnp->cn_nameiop != LOOKUP) {
3831 return (cache_fplookup_final_modifying(fpl));
3834 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3835 return (cache_fplookup_final_withparent(fpl));
3837 tvs = vget_prep_smr(tvp);
3838 if (__predict_false(tvs == VGET_NONE)) {
3839 return (cache_fpl_partial(fpl));
3842 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3843 cache_fpl_smr_exit(fpl);
3844 vget_abort(tvp, tvs);
3845 return (cache_fpl_aborted(fpl));
3848 cache_fpl_smr_exit(fpl);
3849 return (cache_fplookup_final_child(fpl, tvs));
3852 static int __noinline
3853 cache_fplookup_dot(struct cache_fpl *fpl)
3860 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3861 if (seqc_in_modify(fpl->tvp_seqc)) {
3862 return (cache_fpl_aborted(fpl));
3865 counter_u64_add(dothits, 1);
3866 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3871 static int __noinline
3872 cache_fplookup_dotdot(struct cache_fpl *fpl)
3874 struct nameidata *ndp;
3875 struct componentname *cnp;
3876 struct namecache *ncp;
3886 * XXX this is racy the same way regular lookup is
3888 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3890 if (dvp == pr->pr_root)
3893 if (dvp == ndp->ni_rootdir ||
3894 dvp == ndp->ni_topdir ||
3898 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3899 if (seqc_in_modify(fpl->tvp_seqc)) {
3900 return (cache_fpl_aborted(fpl));
3905 if ((dvp->v_vflag & VV_ROOT) != 0) {
3908 * The opposite of climb mount is needed here.
3910 return (cache_fpl_aborted(fpl));
3913 ncp = atomic_load_ptr(&dvp->v_cache_dd);
3915 return (cache_fpl_aborted(fpl));
3918 nc_flag = atomic_load_char(&ncp->nc_flag);
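
	/*
	 * v_cache_dd is either a ".." entry hanging off this directory, whose
	 * target is the parent, or a regular entry naming this directory, in
	 * which case nc_dvp is the parent we are after.
	 */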
3919 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3920 if ((nc_flag & NCF_NEGATIVE) != 0)
3921 return (cache_fpl_aborted(fpl));
3922 fpl->tvp = ncp->nc_vp;
3924 fpl->tvp = ncp->nc_dvp;
3927 if (__predict_false(!cache_ncp_canuse(ncp))) {
3928 return (cache_fpl_aborted(fpl));
3931 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3932 if (seqc_in_modify(fpl->tvp_seqc)) {
3933 return (cache_fpl_partial(fpl));
3936 counter_u64_add(dotdothits, 1);
3941 cache_fplookup_next(struct cache_fpl *fpl)
3943 struct componentname *cnp;
3944 struct namecache *ncp;
3945 struct negstate *negstate;
3946 struct vnode *dvp, *tvp;
3954 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3955 return (cache_fplookup_dot(fpl));
3958 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
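
	/*
	 * The bucket is scanned lockless under SMR; a match is only acted
	 * upon once cache_ncp_canuse confirms it was not invalidated.
	 */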
3960 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3961 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3962 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3967 * If there is no entry we have to punt to the slow path to perform
3968 * actual lookup. Should there be nothing with this name a negative
3969 * entry will be created.
3971 if (__predict_false(ncp == NULL)) {
3972 return (cache_fpl_partial(fpl));
3975 tvp = atomic_load_ptr(&ncp->nc_vp);
3976 nc_flag = atomic_load_char(&ncp->nc_flag);
3977 if ((nc_flag & NCF_NEGATIVE) != 0) {
3979 * If they want to create an entry we need to replace this one.
3981 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3982 return (cache_fpl_partial(fpl));
3984 negstate = NCP2NEGSTATE(ncp);
3985 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3986 if (__predict_false(!cache_ncp_canuse(ncp))) {
3987 return (cache_fpl_partial(fpl));
3989 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3990 return (cache_fpl_partial(fpl));
3993 return (cache_fplookup_negative_promote(fpl, ncp, hash));
3995 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3997 counter_u64_add(numneghits, 1);
3998 cache_fpl_smr_exit(fpl);
3999 return (cache_fpl_handled(fpl, ENOENT));
4002 if (__predict_false(!cache_ncp_canuse(ncp))) {
4003 return (cache_fpl_partial(fpl));
4007 fpl->tvp_seqc = vn_seqc_read_any(tvp);
4008 if (seqc_in_modify(fpl->tvp_seqc)) {
4009 return (cache_fpl_partial(fpl));
4012 if (!cache_fplookup_vnode_supported(tvp)) {
4013 return (cache_fpl_partial(fpl));
4016 counter_u64_add(numposhits, 1);
4017 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4022 cache_fplookup_mp_supported(struct mount *mp)
4027 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4033 * Walk up the mount stack (if any).
4035 * Correctness is provided in the following ways:
4036 * - all vnodes are protected from freeing with SMR
4037 * - struct mount objects are type stable making them always safe to access
4038 * - stability of the particular mount is provided by busying it
4039 * - relationship between the vnode which is mounted on and the mount is
4040 * verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busying
 *
4044 * From that point on we can read the sequence counter of the root vnode
4045 * and get the next mount on the stack (if any) using the same protection.
4047 * By the end of successful walk we are guaranteed the reached state was
4048 * indeed present at least at some point which matches the regular lookup.
4050 static int __noinline
4051 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4053 struct mount *mp, *prev_mp;
4058 vp_seqc = fpl->tvp_seqc;
4060 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4061 mp = atomic_load_ptr(&vp->v_mountedhere);
4067 if (!vfs_op_thread_enter_crit(mp)) {
4068 if (prev_mp != NULL)
4069 vfs_op_thread_exit_crit(prev_mp);
4070 return (cache_fpl_partial(fpl));
4072 if (prev_mp != NULL)
4073 vfs_op_thread_exit_crit(prev_mp);
4074 if (!vn_seqc_consistent(vp, vp_seqc)) {
4075 vfs_op_thread_exit_crit(mp);
4076 return (cache_fpl_partial(fpl));
4078 if (!cache_fplookup_mp_supported(mp)) {
4079 vfs_op_thread_exit_crit(mp);
4080 return (cache_fpl_partial(fpl));
4082 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4083 if (vp == NULL || VN_IS_DOOMED(vp)) {
4084 vfs_op_thread_exit_crit(mp);
4085 return (cache_fpl_partial(fpl));
4087 vp_seqc = vn_seqc_read_any(vp);
4088 if (seqc_in_modify(vp_seqc)) {
4089 vfs_op_thread_exit_crit(mp);
4090 return (cache_fpl_partial(fpl));
4093 mp = atomic_load_ptr(&vp->v_mountedhere);
4098 vfs_op_thread_exit_crit(prev_mp);
4100 fpl->tvp_seqc = vp_seqc;
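
	/*
	 * The walk ended on the root vnode of the topmost mount, with its
	 * sequence counter captured; the lookup continues from there.
	 */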
4105 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
 * Hack: while this is a union, the pointer tends to be NULL so save on
 * a branch.
4116 mp = atomic_load_ptr(&vp->v_mountedhere);
4119 if (vp->v_type == VDIR)
4127 * The code is mostly copy-pasted from regular lookup, see lookup().
4128 * The structure is maintained along with comments for easier maintenance.
4129 * Deduplicating the code will become feasible after fast path lookup
4130 * becomes more feature-complete.
4133 cache_fplookup_parse(struct cache_fpl *fpl)
4135 struct nameidata *ndp;
4136 struct componentname *cnp;
4143 * Search a new directory.
4145 * The last component of the filename is left accessible via
4146 * cnp->cn_nameptr for callers that need the name. Callers needing
4147 * the name set the SAVENAME flag. When done, they assume
4148 * responsibility for freeing the pathname buffer.
4150 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4152 cnp->cn_namelen = cp - cnp->cn_nameptr;
4153 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4154 cache_fpl_smr_exit(fpl);
4155 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4157 ndp->ni_pathlen -= cnp->cn_namelen;
4158 KASSERT(ndp->ni_pathlen <= PATH_MAX,
4159 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4163 * Replace multiple slashes by a single slash and trailing slashes
4164 * by a null. This must be done before VOP_LOOKUP() because some
4165 * fs's don't know about trailing slashes. Remember if there were
4166 * trailing slashes to handle symlinks, existing non-directories
4167 * and non-existing files that won't be directories specially later.
4169 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4175 * Regular lookup performs the following:
4176 * *ndp->ni_next = '\0';
4177 * cnp->cn_flags |= TRAILINGSLASH;
4179 * Which is problematic since it modifies data read
4180 * from userspace. Then if fast path lookup was to
4181 * abort we would have to either restore it or convey
4182 * the flag. Since this is a corner case just ignore
4183 * it for simplicity.
4185 return (cache_fpl_partial(fpl));
4191 * Check for degenerate name (e.g. / or "")
4192 * which is a way of talking about a directory,
4193 * e.g. like "/." or ".".
4196 * Another corner case handled by the regular lookup
4198 if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4199 return (cache_fpl_partial(fpl));
4205 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4207 struct nameidata *ndp;
4208 struct componentname *cnp;
4213 cnp->cn_nameptr = ndp->ni_next;
4214 while (*cnp->cn_nameptr == '/') {
4220 static int __noinline
4221 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4227 * Can happen when racing against vgone.
4230 cache_fpl_partial(fpl);
4234 * See the API contract for VOP_FPLOOKUP_VEXEC.
4236 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4237 error = cache_fpl_aborted(fpl);
4239 cache_fpl_smr_exit(fpl);
4240 cache_fpl_handled(fpl, error);
4248 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4250 struct nameidata *ndp;
4251 struct componentname *cnp;
4255 error = CACHE_FPL_FAILED;
4259 cache_fpl_checkpoint(fpl, &fpl->snd);
4262 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4263 if (seqc_in_modify(fpl->dvp_seqc)) {
4264 cache_fpl_aborted(fpl);
4267 mp = atomic_load_ptr(&fpl->dvp->v_mount);
4268 if (!cache_fplookup_mp_supported(mp)) {
4269 cache_fpl_aborted(fpl);
4273 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4276 error = cache_fplookup_parse(fpl);
4277 if (__predict_false(error != 0)) {
4281 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4283 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4284 if (__predict_false(error != 0)) {
4285 error = cache_fplookup_failed_vexec(fpl, error);
4289 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4290 error = cache_fplookup_dotdot(fpl);
4291 if (__predict_false(error != 0)) {
4295 error = cache_fplookup_next(fpl);
4296 if (__predict_false(error != 0)) {
4300 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4302 if (cache_fplookup_need_climb_mount(fpl)) {
4303 error = cache_fplookup_climb_mount(fpl);
4304 if (__predict_false(error != 0)) {
4310 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4312 if (cache_fpl_islastcn(ndp)) {
4313 error = cache_fplookup_final(fpl);
4317 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4318 error = cache_fpl_aborted(fpl);
4322 fpl->dvp = fpl->tvp;
4323 fpl->dvp_seqc = fpl->tvp_seqc;
4325 cache_fplookup_parse_advance(fpl);
4326 cache_fpl_checkpoint(fpl, &fpl->snd);
4329 switch (fpl->status) {
4330 case CACHE_FPL_STATUS_UNSET:
4331 __assert_unreachable();
4333 case CACHE_FPL_STATUS_PARTIAL:
4334 cache_fpl_smr_assert_entered(fpl);
4335 return (cache_fplookup_partial_setup(fpl));
4336 case CACHE_FPL_STATUS_ABORTED:
4338 cache_fpl_smr_exit(fpl);
4339 return (CACHE_FPL_FAILED);
4340 case CACHE_FPL_STATUS_HANDLED:
4341 MPASS(error != CACHE_FPL_FAILED);
4342 cache_fpl_smr_assert_not_entered(fpl);
4343 if (__predict_false(error != 0)) {
4346 cache_fpl_cleanup_cnp(cnp);
4349 ndp->ni_dvp = fpl->dvp;
4350 ndp->ni_vp = fpl->tvp;
4351 if (cnp->cn_flags & SAVENAME)
4352 cnp->cn_flags |= HASBUF;
4354 cache_fpl_cleanup_cnp(cnp);
4360 * Fast path lookup protected with SMR and sequence counters.
4362 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
4367 * Traditional vnode lookup conceptually looks like this:
 *	vn_lock(current);
 *	for (;;) {
 *		next = find();
 *		vn_lock(next);
 *		vn_unlock(current);
 *		current = next;
 *		if (last)
 *			break;
 *	}
 *	return (current);
 *
4380 * Each jump to the next vnode is safe memory-wise and atomic with respect to
4381 * any modifications thanks to holding respective locks.
4383 * The same guarantee can be provided with a combination of safe memory
4384 * reclamation and sequence counters instead. If all operations which affect
4385 * the relationship between the current vnode and the one we are looking for
4386 * also modify the counter, we can verify whether all the conditions held as
4387 * we made the jump. This includes things like permissions, mount points etc.
4388 * Counter modification is provided by enclosing relevant places in
4389 * vn_seqc_write_begin()/end() calls.
4391 * Thus this translates to:
 *	vfs_smr_enter();
 *	dvp_seqc = seqc_read_any(dvp);
 *	if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *		abort();
 *	for (;;) {
 *		tvp = find();
 *		tvp_seqc = seqc_read_any(tvp);
 *		if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 *			abort();
 *		if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
 *			abort();
 *		dvp = tvp; // we know nothing of importance has changed
 *		dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 *		if (last)
 *			break;
 *	}
 *	vget(); // secure the vnode
 *	if (!seqc_consistent(tvp, tvp_seqc)) // final check
 *		abort();
 *	// at this point we know nothing has changed for any parent<->child pair
 *	// as they were crossed during the lookup, meaning we matched the guarantee
 *	// of the locked variant
 *	return (tvp);
 *
4417 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4418 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote that checking could not be performed; it
 *   is always valid to return it
4421 * - if the sequence counter has not changed the result must be valid
4422 * - if the sequence counter has changed both false positives and false negatives
4423 * are permitted (since the result will be rejected later)
4424 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4426 * Caveats to watch out for:
4427 * - vnodes are passed unlocked and unreferenced with nothing stopping
4428 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4429 * to use atomic_load_ptr to fetch it.
4430 * - the aforementioned object can also get freed, meaning absent other means it
4431 * should be protected with vfs_smr
4432 * - either safely checking permissions as they are modified or guaranteeing
4433 * their stability is left to the routine
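 *
 * For a filesystem which keeps permission data in its per-vnode private
 * structure, a minimal conforming routine can thus be sketched as follows
 * ("foo" is a placeholder, not an existing filesystem):
 *
 *	static int
 *	foo_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct foo_node *ip;
 *
 *		ip = atomic_load_ptr(&v->a_vp->v_data);
 *		if (__predict_false(ip == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(ip->fn_mode, ip->fn_uid, ip->fn_gid,
 *		    v->a_cred));
 *	}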
4436 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4439 struct cache_fpl fpl;
4442 struct componentname *cnp;
4443 struct nameidata_saved orig;
4446 MPASS(ndp->ni_lcf == 0);
4448 fpl.status = CACHE_FPL_STATUS_UNSET;
4450 fpl.cnp = &ndp->ni_cnd;
4451 MPASS(curthread == fpl.cnp->cn_thread);
4453 if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4454 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4456 if (!cache_can_fplookup(&fpl)) {
4457 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4458 *status = fpl.status;
4459 return (EOPNOTSUPP);
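
	/*
	 * Snapshot the nameidata state so it can be restored should the fast
	 * path have to be abandoned.
	 */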
4462 cache_fpl_checkpoint(&fpl, &orig);
4464 cache_fpl_smr_enter_initial(&fpl);
4465 pwd = pwd_get_smr();
4467 ndp->ni_rootdir = pwd->pwd_rdir;
4468 ndp->ni_topdir = pwd->pwd_jdir;
4471 cnp->cn_nameptr = cnp->cn_pnbuf;
4472 if (cnp->cn_pnbuf[0] == '/') {
4473 cache_fpl_handle_root(ndp, &dvp);
4475 MPASS(ndp->ni_dirfd == AT_FDCWD);
4476 dvp = pwd->pwd_cdir;
4479 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4481 error = cache_fplookup_impl(dvp, &fpl);
4482 cache_fpl_smr_assert_not_entered(&fpl);
4483 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4485 *status = fpl.status;
4486 switch (fpl.status) {
4487 case CACHE_FPL_STATUS_UNSET:
4488 __assert_unreachable();
4490 case CACHE_FPL_STATUS_HANDLED:
4491 SDT_PROBE3(vfs, namei, lookup, return, error,
4492 (error == 0 ? ndp->ni_vp : NULL), true);
4494 case CACHE_FPL_STATUS_PARTIAL:
4497 * Status restored by cache_fplookup_partial_setup.
4500 case CACHE_FPL_STATUS_ABORTED:
4501 cache_fpl_restore(&fpl, &orig);