2 * SPDX-License-Identifier: BSD-3-Clause
4 * Copyright (c) 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Poul-Henning Kamp of the FreeBSD Project.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
41 #include "opt_ktrace.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
55 #include <sys/mount.h>
56 #include <sys/namei.h>
62 #include <sys/syscallsubr.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysproto.h>
65 #include <sys/vnode.h>
68 #include <sys/ktrace.h>
73 #include <security/audit/audit.h>
74 #include <security/mac/mac_framework.h>
82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
85 SDT_PROVIDER_DECLARE(vfs);
86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
95 "struct namecache *", "int", "int");
96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
98 "char *", "struct vnode *");
99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
101 "struct vnode *", "char *");
102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
105 "struct vnode *", "char *");
106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
109 "struct componentname *");
110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
111 "struct componentname *");
112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
113 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
114 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
115 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
117 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
119 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
122 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
123 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
124 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
127 * This structure describes the elements in the cache of recent
128 * names looked up by namei.
134 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
135 "the state must fit in a union with a pointer without growing it");
138 LIST_ENTRY(namecache) nc_src; /* source vnode list */
139 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
140 CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
141 struct vnode *nc_dvp; /* vnode of parent of name */
143 struct vnode *nu_vp; /* vnode the name refers to */
144 struct negstate nu_neg;/* negative entry state */
146 u_char nc_flag; /* flag bits */
147 u_char nc_nlen; /* length of name */
148 char nc_name[0]; /* segment name + nul */
152 * struct namecache_ts repeats struct namecache layout up to the nc_nlen member.
154 * struct namecache_ts is used in place of struct namecache when time(s) need
155 * to be stored. The nc_dotdottime field is used when a cache entry is mapping
156 * both a non-dotdot directory name plus dotdot for the directory's parent.
159 * See below for alignment requirement.
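/*
 * Illustrative sketch (added for clarity, not part of the original layout
 * comment): because struct namecache is embedded at the tail, code which
 * knows NCF_TS is set can recover the timestamped variant with
 * __containerof(), exactly as cache_out_ts() below does:
 *
 *	struct namecache_ts *ncp_ts;
 *
 *	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 *	*tsp = ncp_ts->nc_time;
 *	*ticksp = ncp_ts->nc_ticks;
 */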
161 struct namecache_ts {
162 struct timespec nc_time; /* timespec provided by fs */
163 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
164 int nc_ticks; /* ticks value when entry was added */
166 struct namecache nc_nc;
170 * At least mips n32 performs 64-bit accesses to timespec as found
171 * in namecache_ts and requires them to be aligned. Since other platforms
172 * may be in the same situation, suffer a little bit and enforce the
173 * alignment for everyone. Note this is a nop for 64-bit platforms.
175 #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t)
178 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
179 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
180 * smaller and the value was bumped to retain the total size, but it
181 * was never re-evaluated for suitability. A simple test counting
182 * lengths during package building shows that the value of 45 covers
183 * about 86% of all added entries, reaching 99% at 65.
185 * Regardless of the above, use of dedicated zones instead of malloc may be
186 * inducing additional waste. This may be hard to address as said zones are
187 * tied to VFS SMR. Even if retaining them, the current split should be re-evaluated.
191 #define CACHE_PATH_CUTOFF 45
192 #define CACHE_LARGE_PAD 6
194 #define CACHE_PATH_CUTOFF 41
195 #define CACHE_LARGE_PAD 2
198 #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
199 #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
200 #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
201 #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
203 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
204 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
205 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
206 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
208 #define nc_vp n_un.nu_vp
209 #define nc_neg n_un.nu_neg
212 * Flags in namecache.nc_flag
214 #define NCF_WHITE 0x01
215 #define NCF_ISDOTDOT 0x02
218 #define NCF_DVDROP 0x10
219 #define NCF_NEGATIVE 0x20
220 #define NCF_INVALID 0x40
224 * Flags in negstate.neg_flag
228 static bool cache_neg_evict_cond(u_long lnumcache);
231 * Mark an entry as invalid.
233 * This is called before it starts getting deconstructed.
236 cache_ncp_invalidate(struct namecache *ncp)
239 KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
240 ("%s: entry %p already invalid", __func__, ncp));
241 atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
242 atomic_thread_fence_rel();
246 * Check whether the entry can be safely used.
248 * All places which elide locks are supposed to call this after they are
249 * done with reading from an entry.
251 #define cache_ncp_canuse(ncp) ({ \
252 struct namecache *_ncp = (ncp); \
255 atomic_thread_fence_acq(); \
256 _nc_flag = atomic_load_char(&_ncp->nc_flag); \
257 __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \
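/*
 * Hedged usage sketch: lockless consumers (e.g. cache_lookup() below) read
 * whatever they need from the entry first and only then validate it, so that
 * the acquire fence above pairs with the release fence issued by
 * cache_ncp_invalidate(). Roughly:
 *
 *	vfs_smr_enter();
 *	(find ncp on the hash chain, copy out nc_vp, nc_flag, timestamps)
 *	if (!cache_ncp_canuse(ncp)) {
 *		vfs_smr_exit();
 *		(fall back to the locked lookup)
 *	}
 */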
261 * Name caching works as follows:
263 * Names found by directory scans are retained in a cache
264 * for future reference. It is managed LRU, so frequently
265 * used names will hang around. Cache is indexed by hash value
266 * obtained from (dvp, name) where dvp refers to the directory
269 * If it is a "negative" entry, (i.e. for a name that is known NOT to
270 * exist) the vnode pointer will be NULL.
272 * Upon reaching the last segment of a path, if the reference
273 * is for DELETE, or NOCACHE is set (rewrite), and the
274 * name is located in the cache, it will be dropped.
276 * These locks are used (in the order in which they can be taken):
278 * vnodelock mtx vnode lists and v_cache_dd field protection
279 * bucketlock mtx for access to given set of hash buckets
280 * neglist mtx negative entry LRU management
282 * It is legal to take multiple vnodelock and bucketlock locks. The locking
283 * order is lower address first. Both are recursive.
285 * "." lookups are lockless.
287 * ".." and vnode -> name lookups require vnodelock.
289 * name -> vnode lookup requires the relevant bucketlock to be held for reading.
291 * Insertions and removals of entries require involved vnodes and bucketlocks
292 * to be locked to provide safe operation against other threads modifying the cache.
295 * Some lookups result in removal of the found entry (e.g. getting rid of a
296 * negative entry with the intent to create a positive one), which poses a
297 * problem when multiple threads reach the state. Similarly, two different
298 * threads can purge two different vnodes and try to remove the same name.
300 * If the already held vnode lock is lower than the second required lock, we
301 * can just take the other lock. However, in the opposite case, this could
302 * deadlock. As such, this is resolved by trylocking and if that fails unlocking
303 * the first node, locking everything in order and revalidating the state.
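/*
 * Hedged sketch of the trylock dance described above; the authoritative
 * code is cache_zap_locked_bucket() and cache_zap_unlocked_bucket() below:
 *
 *	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
 *		cache_zap_locked(ncp);			(got both locks)
 *	} else {
 *		(drop the held locks)
 *		cache_sort_vnodes(&dvlp, &vlp);
 *		cache_lock_vnodes(dvlp, vlp);
 *		(re-find the entry and only then zap it)
 *	}
 */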
308 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
309 "Name cache parameters");
311 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */
312 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
313 "Total namecache capacity");
315 u_int ncsizefactor = 2;
316 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
317 "Size factor for namecache");
319 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */
320 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
321 "Ratio of negative namecache entries");
324 * Negative entry % of namecache capacity above which automatic eviction is allowed.
326 * Check cache_neg_evict_cond for details.
328 static u_int ncnegminpct = 3;
330 static u_int __read_mostly neg_min; /* the above recomputed against ncsize */
331 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
332 "Negative entry count above which automatic eviction is allowed");
335 * Structures associated with name caching.
337 #define NCHHASH(hash) \
338 (&nchashtbl[(hash) & nchash])
339 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
340 static u_long __read_mostly nchash; /* size of hash table */
341 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
342 "Size of namecache hash table");
343 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
344 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
346 struct nchstats nchstats; /* cache effectiveness statistics */
348 static bool __read_frequently cache_fast_revlookup = true;
349 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
350 &cache_fast_revlookup, 0, "");
352 static u_int __exclusive_cache_line neg_cycle;
355 #define numneglists (ncneghash + 1)
358 struct mtx nl_evict_lock;
359 struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
360 TAILQ_HEAD(, namecache) nl_list;
361 TAILQ_HEAD(, namecache) nl_hotlist;
363 } __aligned(CACHE_LINE_SIZE);
365 static struct neglist neglists[numneglists];
367 static inline struct neglist *
368 NCP2NEGLIST(struct namecache *ncp)
371 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
374 static inline struct negstate *
375 NCP2NEGSTATE(struct namecache *ncp)
378 MPASS(ncp->nc_flag & NCF_NEGATIVE);
379 return (&ncp->nc_neg);
382 #define numbucketlocks (ncbuckethash + 1)
383 static u_int __read_mostly ncbuckethash;
384 static struct mtx_padalign __read_mostly *bucketlocks;
385 #define HASH2BUCKETLOCK(hash) \
386 ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
388 #define numvnodelocks (ncvnodehash + 1)
389 static u_int __read_mostly ncvnodehash;
390 static struct mtx __read_mostly *vnodelocks;
391 static inline struct mtx *
392 VP2VNODELOCK(struct vnode *vp)
395 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
399 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
401 struct namecache_ts *ncp_ts;
403 KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
404 (tsp == NULL && ticksp == NULL),
410 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
411 *tsp = ncp_ts->nc_time;
412 *ticksp = ncp_ts->nc_ticks;
416 static int __read_mostly doingcache = 1; /* 1 => enable the cache */
417 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
418 "VFS namecache enabled");
421 /* Export size information to userland */
422 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
423 sizeof(struct namecache), "sizeof(struct namecache)");
426 * The new name cache statistics
428 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
429 "Name cache statistics");
431 #define STATNODE_ULONG(name, varname, descr) \
432 SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
433 #define STATNODE_COUNTER(name, varname, descr) \
434 static COUNTER_U64_DEFINE_EARLY(varname); \
435 SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
437 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
438 STATNODE_ULONG(count, numcache, "Number of cache entries");
439 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
440 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
441 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
442 STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
443 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
444 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
445 STATNODE_COUNTER(posszaps, numposzaps,
446 "Number of cache hits (positive) we do not want to cache");
447 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
448 STATNODE_COUNTER(negzaps, numnegzaps,
449 "Number of cache hits (negative) we do not want to cache");
450 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
451 /* These count for vn_getcwd(), too. */
452 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
453 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
454 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
455 "Number of fullpath search errors (VOP_VPTOCNP failures)");
456 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
457 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
460 * Debug or developer statistics.
462 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
463 "Name cache debugging");
464 #define DEBUGNODE_ULONG(name, varname, descr) \
465 SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
466 #define DEBUGNODE_COUNTER(name, varname, descr) \
467 static COUNTER_U64_DEFINE_EARLY(varname); \
468 SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
470 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
471 "Number of successful removals after relocking");
472 static long zap_bucket_fail;
473 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
474 static long zap_bucket_fail2;
475 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
476 static long cache_lock_vnodes_cel_3_failures;
477 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
478 "Number of times 3-way vnode locking failed");
480 static void cache_zap_locked(struct namecache *ncp);
481 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
482 char **freebuf, size_t *buflen);
483 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
484 char **retbuf, size_t *buflen, size_t addend);
485 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
486 char **retbuf, size_t *buflen);
487 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
488 char **retbuf, size_t *len, size_t addend);
490 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
493 cache_assert_vlp_locked(struct mtx *vlp)
497 mtx_assert(vlp, MA_OWNED);
501 cache_assert_vnode_locked(struct vnode *vp)
505 vlp = VP2VNODELOCK(vp);
506 cache_assert_vlp_locked(vlp);
510 * Directory vnodes with entries are held for two reasons:
511 * 1. make them less of a target for reclamation in vnlru
512 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
514 * Note this preferably would not be done and it is a holdover. It will be
515 * feasible to eliminate altogether if all filesystems start supporting lockless lookup.
519 cache_hold_vnode(struct vnode *vp)
522 cache_assert_vnode_locked(vp);
523 VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
525 counter_u64_add(numcachehv, 1);
529 cache_drop_vnode(struct vnode *vp)
533 * Called after all locks are dropped, meaning we can't assert
534 * on the state of v_cache_src.
537 counter_u64_add(numcachehv, -1);
543 static uma_zone_t __read_mostly cache_zone_small;
544 static uma_zone_t __read_mostly cache_zone_small_ts;
545 static uma_zone_t __read_mostly cache_zone_large;
546 static uma_zone_t __read_mostly cache_zone_large_ts;
548 static struct namecache *
549 cache_alloc_uma(int len, bool ts)
551 struct namecache_ts *ncp_ts;
552 struct namecache *ncp;
554 if (__predict_false(ts)) {
555 if (len <= CACHE_PATH_CUTOFF)
556 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
558 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
559 ncp = &ncp_ts->nc_nc;
561 if (len <= CACHE_PATH_CUTOFF)
562 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
564 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
570 cache_free_uma(struct namecache *ncp)
572 struct namecache_ts *ncp_ts;
574 if (__predict_false(ncp->nc_flag & NCF_TS)) {
575 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
576 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
577 uma_zfree_smr(cache_zone_small_ts, ncp_ts);
579 uma_zfree_smr(cache_zone_large_ts, ncp_ts);
581 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
582 uma_zfree_smr(cache_zone_small, ncp);
584 uma_zfree_smr(cache_zone_large, ncp);
588 static struct namecache *
589 cache_alloc(int len, bool ts)
594 * Avoid blowout in namecache entries.
597 * 1. filesystems may end up trying to add an already existing entry
598 * (for example this can happen after a cache miss during concurrent
599 * lookup), in which case we will call cache_neg_evict despite not adding anything.
601 * 2. the routine may fail to free anything and no provisions are made
602 * to make it try harder (see the inside for failure modes)
603 * 3. it only ever looks at negative entries.
605 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
606 if (cache_neg_evict_cond(lnumcache)) {
607 lnumcache = atomic_load_long(&numcache);
609 if (__predict_false(lnumcache >= ncsize)) {
610 atomic_subtract_long(&numcache, 1);
611 counter_u64_add(numdrops, 1);
614 return (cache_alloc_uma(len, ts));
618 cache_free(struct namecache *ncp)
622 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
623 cache_drop_vnode(ncp->nc_dvp);
626 atomic_subtract_long(&numcache, 1);
630 * TODO: With the value stored we can do better than computing the hash based
631 * on the address. The choice of FNV should also be revisited.
634 cache_prehash(struct vnode *vp)
637 vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
641 cache_get_hash(char *name, u_char len, struct vnode *dvp)
644 return (fnv_32_buf(name, len, dvp->v_nchash));
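/*
 * Illustrative sketch: a (directory vnode, name) pair maps to its hash chain
 * and bucket lock as follows; this is the pattern used by the lookup and
 * removal paths below:
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	ncpp = NCHHASH(hash);
 *	blp = HASH2BUCKETLOCK(hash);
 */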
647 static inline struct nchashhead *
648 NCP2BUCKET(struct namecache *ncp)
652 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
653 return (NCHHASH(hash));
656 static inline struct mtx *
657 NCP2BUCKETLOCK(struct namecache *ncp)
661 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
662 return (HASH2BUCKETLOCK(hash));
667 cache_assert_bucket_locked(struct namecache *ncp)
671 blp = NCP2BUCKETLOCK(ncp);
672 mtx_assert(blp, MA_OWNED);
676 cache_assert_bucket_unlocked(struct namecache *ncp)
680 blp = NCP2BUCKETLOCK(ncp);
681 mtx_assert(blp, MA_NOTOWNED);
684 #define cache_assert_bucket_locked(x) do { } while (0)
685 #define cache_assert_bucket_unlocked(x) do { } while (0)
688 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
690 _cache_sort_vnodes(void **p1, void **p2)
694 MPASS(*p1 != NULL || *p2 != NULL);
704 cache_lock_all_buckets(void)
708 for (i = 0; i < numbucketlocks; i++)
709 mtx_lock(&bucketlocks[i]);
713 cache_unlock_all_buckets(void)
717 for (i = 0; i < numbucketlocks; i++)
718 mtx_unlock(&bucketlocks[i]);
722 cache_lock_all_vnodes(void)
726 for (i = 0; i < numvnodelocks; i++)
727 mtx_lock(&vnodelocks[i]);
731 cache_unlock_all_vnodes(void)
735 for (i = 0; i < numvnodelocks; i++)
736 mtx_unlock(&vnodelocks[i]);
740 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
743 cache_sort_vnodes(&vlp1, &vlp2);
746 if (!mtx_trylock(vlp1))
749 if (!mtx_trylock(vlp2)) {
759 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
762 MPASS(vlp1 != NULL || vlp2 != NULL);
772 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
775 MPASS(vlp1 != NULL || vlp2 != NULL);
784 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
786 struct nchstats snap;
788 if (req->oldptr == NULL)
789 return (SYSCTL_OUT(req, 0, sizeof(snap)));
792 snap.ncs_goodhits = counter_u64_fetch(numposhits);
793 snap.ncs_neghits = counter_u64_fetch(numneghits);
794 snap.ncs_badhits = counter_u64_fetch(numposzaps) +
795 counter_u64_fetch(numnegzaps);
796 snap.ncs_miss = counter_u64_fetch(nummisszap) +
797 counter_u64_fetch(nummiss);
799 return (SYSCTL_OUT(req, &snap, sizeof(snap)));
801 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
802 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
803 "VFS cache effectiveness statistics");
806 cache_recalc_neg_min(u_int val)
809 neg_min = (ncsize * val) / 100;
813 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
819 error = sysctl_handle_int(oidp, &val, 0, req);
820 if (error != 0 || req->newptr == NULL)
823 if (val == ncnegminpct)
825 if (val < 0 || val > 99)
828 cache_recalc_neg_min(val);
832 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
833 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
834 "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
838 * Grab an atomic snapshot of the name cache hash chain lengths
840 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
841 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
845 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
847 struct nchashhead *ncpp;
848 struct namecache *ncp;
849 int i, error, n_nchash, *cntbuf;
852 n_nchash = nchash + 1; /* nchash is max index, not count */
853 if (req->oldptr == NULL)
854 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
855 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
856 cache_lock_all_buckets();
857 if (n_nchash != nchash + 1) {
858 cache_unlock_all_buckets();
859 free(cntbuf, M_TEMP);
862 /* Scan hash tables counting entries */
863 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
864 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
866 cache_unlock_all_buckets();
867 for (error = 0, i = 0; i < n_nchash; i++)
868 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
870 free(cntbuf, M_TEMP);
873 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
874 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
875 "nchash chain lengths");
878 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
881 struct nchashhead *ncpp;
882 struct namecache *ncp;
884 int count, maxlength, used, pct;
887 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
889 cache_lock_all_buckets();
890 n_nchash = nchash + 1; /* nchash is max index, not count */
894 /* Scan hash tables for applicable entries */
895 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
897 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
902 if (maxlength < count)
905 n_nchash = nchash + 1;
906 cache_unlock_all_buckets();
907 pct = (used * 100) / (n_nchash / 100);
908 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
911 error = SYSCTL_OUT(req, &used, sizeof(used));
914 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
917 error = SYSCTL_OUT(req, &pct, sizeof(pct));
922 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
923 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
924 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
928 * Negative entries management
930 * Various workloads create plenty of negative entries and barely use them
931 * afterwards. Moreover malicious users can keep performing bogus lookups
932 * adding even more entries. For example "make tinderbox" as of writing this
933 * comment ends up with 2.6M namecache entries in total, 1.2M of which are negative.
936 * As such, a rather aggressive eviction method is needed. The currently
937 * employed method is a placeholder.
939 * Entries are split over numneglists separate lists, each of which is further
940 * split into hot and cold entries. Entries get promoted after getting a hit.
941 * Eviction happens on addition of new entry.
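/*
 * Hedged sketch of a negative hit in the lockless lookup (see cache_lookup()
 * below): the per-entry hit counter is bumped first and promotion to the hot
 * list only happens once it reaches CACHE_NEG_PROMOTION_THRESH:
 *
 *	neg_promote = cache_neg_hit_prep(ncp);
 *	if (neg_promote)
 *		cache_neg_promote_cond(dvp, cnp, ncp, hash);
 *	else
 *		cache_neg_hit_finish(ncp);
 */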
943 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
944 "Name cache negative entry statistics");
946 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
947 "Number of negative cache entries");
949 static COUNTER_U64_DEFINE_EARLY(neg_created);
950 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
951 "Number of created negative entries");
953 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
954 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
955 "Number of evicted negative entries");
957 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
958 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
959 &neg_evict_skipped_empty,
960 "Number of times evicting failed due to lack of entries");
962 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
963 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
964 &neg_evict_skipped_missed,
965 "Number of times evicting failed due to target entry disappearing");
967 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
968 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
969 &neg_evict_skipped_contended,
970 "Number of times evicting failed due to contention");
972 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
973 "Number of cache hits (negative)");
976 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
981 for (i = 0; i < numneglists; i++)
982 out += neglists[i].nl_hotnum;
984 return (SYSCTL_OUT(req, &out, sizeof(out)));
986 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
987 CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
988 "Number of hot negative entries");
991 cache_neg_init(struct namecache *ncp)
995 ncp->nc_flag |= NCF_NEGATIVE;
996 ns = NCP2NEGSTATE(ncp);
999 counter_u64_add(neg_created, 1);
1002 #define CACHE_NEG_PROMOTION_THRESH 2
1005 cache_neg_hit_prep(struct namecache *ncp)
1007 struct negstate *ns;
1010 ns = NCP2NEGSTATE(ncp);
1011 n = atomic_load_char(&ns->neg_hit);
1013 if (n >= CACHE_NEG_PROMOTION_THRESH)
1015 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1018 return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1022 * Nothing to do here but it is provided for completeness as some
1023 * cache_neg_hit_prep callers may end up returning without even
1024 * trying to promote.
1026 #define cache_neg_hit_abort(ncp) do { } while (0)
1029 cache_neg_hit_finish(struct namecache *ncp)
1032 SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1033 counter_u64_add(numneghits, 1);
1037 * Move a negative entry to the hot list.
1040 cache_neg_promote_locked(struct namecache *ncp)
1043 struct negstate *ns;
1045 ns = NCP2NEGSTATE(ncp);
1046 nl = NCP2NEGLIST(ncp);
1047 mtx_assert(&nl->nl_lock, MA_OWNED);
1048 if ((ns->neg_flag & NEG_HOT) == 0) {
1049 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1050 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1052 ns->neg_flag |= NEG_HOT;
1057 * Move a hot negative entry to the cold list.
1060 cache_neg_demote_locked(struct namecache *ncp)
1063 struct negstate *ns;
1065 ns = NCP2NEGSTATE(ncp);
1066 nl = NCP2NEGLIST(ncp);
1067 mtx_assert(&nl->nl_lock, MA_OWNED);
1068 MPASS(ns->neg_flag & NEG_HOT);
1069 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1070 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1072 ns->neg_flag &= ~NEG_HOT;
1073 atomic_store_char(&ns->neg_hit, 0);
1077 * Move a negative entry to the hot list if it matches the lookup.
1079 * We have to take locks, but they may be contended and in the worst
1080 * case we may need to go off CPU. We don't want to spin within the
1081 * smr section and we can't block with it. Exiting the section means
1082 * the found entry could have been evicted. We are going to look it up again.
1086 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1087 struct namecache *oncp, uint32_t hash)
1089 struct namecache *ncp;
1093 nl = NCP2NEGLIST(oncp);
1095 mtx_lock(&nl->nl_lock);
1097 * For hash iteration.
1102 * Avoid all surprises by only succeeding if we got the same entry and
1103 * bailing completely otherwise.
1104 * XXX There are no provisions to keep the vnode around, meaning we may
1105 * end up promoting a negative entry for a *new* vnode and returning
1106 * ENOENT on its account. This is the error we want to return anyway
1107 * and promotion is harmless.
1109 * In particular at this point there can be a new ncp which matches the
1110 * search but hashes to a different neglist.
1112 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1118 * No match to begin with.
1120 if (__predict_false(ncp == NULL)) {
1125 * The newly found entry may be something different...
1127 if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1128 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1133 * ... and not even negative.
1135 nc_flag = atomic_load_char(&ncp->nc_flag);
1136 if ((nc_flag & NCF_NEGATIVE) == 0) {
1140 if (!cache_ncp_canuse(ncp)) {
1144 cache_neg_promote_locked(ncp);
1145 cache_neg_hit_finish(ncp);
1147 mtx_unlock(&nl->nl_lock);
1151 mtx_unlock(&nl->nl_lock);
1156 cache_neg_promote(struct namecache *ncp)
1160 nl = NCP2NEGLIST(ncp);
1161 mtx_lock(&nl->nl_lock);
1162 cache_neg_promote_locked(ncp);
1163 mtx_unlock(&nl->nl_lock);
1167 cache_neg_insert(struct namecache *ncp)
1171 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1172 cache_assert_bucket_locked(ncp);
1173 nl = NCP2NEGLIST(ncp);
1174 mtx_lock(&nl->nl_lock);
1175 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1176 mtx_unlock(&nl->nl_lock);
1177 atomic_add_long(&numneg, 1);
1181 cache_neg_remove(struct namecache *ncp)
1184 struct negstate *ns;
1186 cache_assert_bucket_locked(ncp);
1187 nl = NCP2NEGLIST(ncp);
1188 ns = NCP2NEGSTATE(ncp);
1189 mtx_lock(&nl->nl_lock);
1190 if ((ns->neg_flag & NEG_HOT) != 0) {
1191 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1194 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1196 mtx_unlock(&nl->nl_lock);
1197 atomic_subtract_long(&numneg, 1);
1200 static struct neglist *
1201 cache_neg_evict_select_list(void)
1206 c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1207 nl = &neglists[c % numneglists];
1208 if (!mtx_trylock(&nl->nl_evict_lock)) {
1209 counter_u64_add(neg_evict_skipped_contended, 1);
1215 static struct namecache *
1216 cache_neg_evict_select_entry(struct neglist *nl)
1218 struct namecache *ncp, *lncp;
1219 struct negstate *ns, *lns;
1222 mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1223 mtx_assert(&nl->nl_lock, MA_OWNED);
1224 ncp = TAILQ_FIRST(&nl->nl_list);
1228 lns = NCP2NEGSTATE(lncp);
1229 for (i = 1; i < 4; i++) {
1230 ncp = TAILQ_NEXT(ncp, nc_dst);
1233 ns = NCP2NEGSTATE(ncp);
1234 if (ns->neg_hit < lns->neg_hit) {
1243 cache_neg_evict(void)
1245 struct namecache *ncp, *ncp2;
1254 nl = cache_neg_evict_select_list();
1259 mtx_lock(&nl->nl_lock);
1260 ncp = TAILQ_FIRST(&nl->nl_hotlist);
1262 cache_neg_demote_locked(ncp);
1264 ncp = cache_neg_evict_select_entry(nl);
1266 counter_u64_add(neg_evict_skipped_empty, 1);
1267 mtx_unlock(&nl->nl_lock);
1268 mtx_unlock(&nl->nl_evict_lock);
1271 nlen = ncp->nc_nlen;
1273 hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1274 dvlp = VP2VNODELOCK(dvp);
1275 blp = HASH2BUCKETLOCK(hash);
1276 mtx_unlock(&nl->nl_lock);
1277 mtx_unlock(&nl->nl_evict_lock);
1281 * Note that since all locks were dropped above, the entry may be
1282 * gone or reallocated to be something else.
1284 CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1285 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1286 ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1290 counter_u64_add(neg_evict_skipped_missed, 1);
1294 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1295 MPASS(blp == NCP2BUCKETLOCK(ncp));
1296 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1298 cache_zap_locked(ncp);
1299 counter_u64_add(neg_evicted, 1);
1310 * Maybe evict a negative entry to create more room.
1312 * The ncnegfactor parameter limits what fraction of the total count
1313 * can comprise of negative entries. However, if the cache is just
1314 * warming up this leads to excessive evictions. As such, ncnegminpct
1315 * (recomputed to neg_min) dictates whether the above should be applied.
1318 * Try evicting if the cache is close to full capacity regardless of
1319 * other considerations.
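/*
 * Worked example with purely illustrative numbers: with ncsize = 200000,
 * ncnegminpct = 3 (so neg_min = 6000) and ncnegfactor = 5, automatic
 * eviction is attempted once there are at least 6000 negative entries and
 * negative entries make up at least 1/5 of the total count, or
 * unconditionally once the cache is within 1000 entries of ncsize.
 */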
1322 cache_neg_evict_cond(u_long lnumcache)
1326 if (ncsize - 1000 < lnumcache)
1328 lnumneg = atomic_load_long(&numneg);
1329 if (lnumneg < neg_min)
1331 if (lnumneg * ncnegfactor < lnumcache)
1334 return (cache_neg_evict());
1338 * cache_zap_locked():
1340 * Removes a namecache entry from cache, whether it contains an actual
1341 * pointer to a vnode or if it is just a negative cache entry.
1344 cache_zap_locked(struct namecache *ncp)
1346 struct nchashhead *ncpp;
1348 if (!(ncp->nc_flag & NCF_NEGATIVE))
1349 cache_assert_vnode_locked(ncp->nc_vp);
1350 cache_assert_vnode_locked(ncp->nc_dvp);
1351 cache_assert_bucket_locked(ncp);
1353 cache_ncp_invalidate(ncp);
1355 ncpp = NCP2BUCKET(ncp);
1356 CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1357 if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1358 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1359 ncp->nc_name, ncp->nc_vp);
1360 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1361 if (ncp == ncp->nc_vp->v_cache_dd) {
1362 vn_seqc_write_begin_unheld(ncp->nc_vp);
1363 ncp->nc_vp->v_cache_dd = NULL;
1364 vn_seqc_write_end(ncp->nc_vp);
1367 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1369 cache_neg_remove(ncp);
1371 if (ncp->nc_flag & NCF_ISDOTDOT) {
1372 if (ncp == ncp->nc_dvp->v_cache_dd) {
1373 vn_seqc_write_begin_unheld(ncp->nc_dvp);
1374 ncp->nc_dvp->v_cache_dd = NULL;
1375 vn_seqc_write_end(ncp->nc_dvp);
1378 LIST_REMOVE(ncp, nc_src);
1379 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1380 ncp->nc_flag |= NCF_DVDROP;
1386 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1390 MPASS(ncp->nc_dvp == vp);
1391 MPASS(ncp->nc_flag & NCF_NEGATIVE);
1392 cache_assert_vnode_locked(vp);
1394 blp = NCP2BUCKETLOCK(ncp);
1396 cache_zap_locked(ncp);
1401 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1404 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1407 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1408 cache_assert_vnode_locked(vp);
1410 if (ncp->nc_flag & NCF_NEGATIVE) {
1411 if (*vlpp != NULL) {
1415 cache_zap_negative_locked_vnode_kl(ncp, vp);
1419 pvlp = VP2VNODELOCK(vp);
1420 blp = NCP2BUCKETLOCK(ncp);
1421 vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1422 vlp2 = VP2VNODELOCK(ncp->nc_vp);
1424 if (*vlpp == vlp1 || *vlpp == vlp2) {
1428 if (*vlpp != NULL) {
1432 cache_sort_vnodes(&vlp1, &vlp2);
1437 if (!mtx_trylock(vlp1))
1443 cache_zap_locked(ncp);
1445 if (to_unlock != NULL)
1446 mtx_unlock(to_unlock);
1453 MPASS(*vlpp == NULL);
1459 * If trylocking failed we can get here. We know enough to take all needed locks
1460 * in the right order and re-lookup the entry.
1463 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1464 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1467 struct namecache *rncp;
1469 cache_assert_bucket_unlocked(ncp);
1471 cache_sort_vnodes(&dvlp, &vlp);
1472 cache_lock_vnodes(dvlp, vlp);
1474 CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1475 if (rncp == ncp && rncp->nc_dvp == dvp &&
1476 rncp->nc_nlen == cnp->cn_namelen &&
1477 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1481 cache_zap_locked(rncp);
1483 cache_unlock_vnodes(dvlp, vlp);
1484 counter_u64_add(zap_bucket_relock_success, 1);
1489 cache_unlock_vnodes(dvlp, vlp);
1493 static int __noinline
1494 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1495 uint32_t hash, struct mtx *blp)
1497 struct mtx *dvlp, *vlp;
1500 cache_assert_bucket_locked(ncp);
1502 dvlp = VP2VNODELOCK(ncp->nc_dvp);
1504 if (!(ncp->nc_flag & NCF_NEGATIVE))
1505 vlp = VP2VNODELOCK(ncp->nc_vp);
1506 if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1507 cache_zap_locked(ncp);
1509 cache_unlock_vnodes(dvlp, vlp);
1515 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1518 static __noinline int
1519 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1521 struct namecache *ncp;
1523 struct mtx *dvlp, *dvlp2;
1527 if (cnp->cn_namelen == 2 &&
1528 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1529 dvlp = VP2VNODELOCK(dvp);
1533 ncp = dvp->v_cache_dd;
1538 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1541 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1542 if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1544 MPASS(dvp->v_cache_dd == NULL);
1550 vn_seqc_write_begin(dvp);
1551 dvp->v_cache_dd = NULL;
1552 vn_seqc_write_end(dvp);
1557 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1561 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1562 blp = HASH2BUCKETLOCK(hash);
1564 if (CK_SLIST_EMPTY(NCHHASH(hash)))
1569 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1570 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1571 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1580 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1581 if (__predict_false(error != 0)) {
1585 counter_u64_add(numposzaps, 1);
1586 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1590 counter_u64_add(nummisszap, 1);
1591 SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1595 static int __noinline
1596 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1597 struct timespec *tsp, int *ticksp)
1602 counter_u64_add(dothits, 1);
1603 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1610 * When we lookup "." we still can be asked to lock it differently.
1613 ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1614 if (ltype != VOP_ISLOCKED(*vpp)) {
1615 if (ltype == LK_EXCLUSIVE) {
1616 vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1617 if (VN_IS_DOOMED((*vpp))) {
1618 /* forced unmount */
1624 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1629 static int __noinline
1630 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1631 struct timespec *tsp, int *ticksp)
1633 struct namecache_ts *ncp_ts;
1634 struct namecache *ncp;
1640 MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1642 if ((cnp->cn_flags & MAKEENTRY) == 0) {
1643 cache_remove_cnp(dvp, cnp);
1647 counter_u64_add(dotdothits, 1);
1649 dvlp = VP2VNODELOCK(dvp);
1651 ncp = dvp->v_cache_dd;
1653 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1657 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1658 if (ncp->nc_flag & NCF_NEGATIVE)
1665 goto negative_success;
1666 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1667 cache_out_ts(ncp, tsp, ticksp);
1668 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1669 NCF_DTS && tsp != NULL) {
1670 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1671 *tsp = ncp_ts->nc_dotdottime;
1675 ltype = VOP_ISLOCKED(dvp);
1677 vs = vget_prep(*vpp);
1679 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1680 vn_lock(dvp, ltype | LK_RETRY);
1681 if (VN_IS_DOOMED(dvp)) {
1693 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1694 if (cnp->cn_flags & ISLASTCN) {
1695 counter_u64_add(numnegzaps, 1);
1696 cache_zap_negative_locked_vnode_kl(ncp, dvp);
1703 whiteout = (ncp->nc_flag & NCF_WHITE);
1704 cache_out_ts(ncp, tsp, ticksp);
1705 if (cache_neg_hit_prep(ncp))
1706 cache_neg_promote(ncp);
1708 cache_neg_hit_finish(ncp);
1711 cnp->cn_flags |= ISWHITEOUT;
1716 * Lookup a name in the name cache
1720 * - dvp: Parent directory in which to search.
1721 * - vpp: Return argument. Will contain desired vnode on cache hit.
1722 * - cnp: Parameters of the name search. The most interesting bits of
1723 * the cn_flags field have the following meanings:
1724 * - MAKEENTRY: If clear, free an entry from the cache rather than look for it.
1726 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
1727 * - tsp: Return storage for cache timestamp. On a successful (positive
1728 * or negative) lookup, tsp will be filled with any timespec that
1729 * was stored when this cache entry was created. However, it will
1730 * be clear for "." entries.
1731 * - ticks: Return storage for alternate cache timestamp. On a successful
1732 * (positive or negative) lookup, it will contain the ticks value
1733 * that was current when the cache entry was created, unless cnp was ".".
1736 * Either both tsp and ticks have to be provided or neither of them.
1740 * - -1: A positive cache hit. vpp will contain the desired vnode.
1741 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
1742 * to a forced unmount. vpp will not be modified. If the entry
1743 * is a whiteout, then the ISWHITEOUT flag will be set in cnp->cn_flags.
1745 * - 0: A cache miss. vpp will not be modified.
1749 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
1750 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
1751 * lock is not recursively acquired.
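/*
 * Hedged usage sketch, modeled on vfs_cache_lookup(); a real filesystem's
 * error handling may differ:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	if (error == 0)
 *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));	(miss)
 *	if (error == -1)
 *		return (0);					(positive hit)
 *	return (error);						(e.g. ENOENT)
 */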
1753 static int __noinline
1754 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1755 struct timespec *tsp, int *ticksp)
1757 struct namecache *ncp;
1764 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1765 MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1768 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1769 blp = HASH2BUCKETLOCK(hash);
1772 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1773 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1774 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1778 if (__predict_false(ncp == NULL)) {
1780 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1782 counter_u64_add(nummiss, 1);
1786 if (ncp->nc_flag & NCF_NEGATIVE)
1787 goto negative_success;
1789 counter_u64_add(numposhits, 1);
1791 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1792 cache_out_ts(ncp, tsp, ticksp);
1794 vs = vget_prep(*vpp);
1796 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1803 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1804 if (cnp->cn_flags & ISLASTCN) {
1805 counter_u64_add(numnegzaps, 1);
1806 error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1807 if (__predict_false(error != 0)) {
1816 whiteout = (ncp->nc_flag & NCF_WHITE);
1817 cache_out_ts(ncp, tsp, ticksp);
1818 if (cache_neg_hit_prep(ncp))
1819 cache_neg_promote(ncp);
1821 cache_neg_hit_finish(ncp);
1824 cnp->cn_flags |= ISWHITEOUT;
1829 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1830 struct timespec *tsp, int *ticksp)
1832 struct namecache *ncp;
1836 bool whiteout, neg_promote;
1839 MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1842 if (__predict_false(!doingcache)) {
1843 cnp->cn_flags &= ~MAKEENTRY;
1848 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1849 if (cnp->cn_namelen == 1)
1850 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1851 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1852 return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1855 MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1857 if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1858 cache_remove_cnp(dvp, cnp);
1862 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1865 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1866 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1867 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1871 if (__predict_false(ncp == NULL)) {
1873 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1875 counter_u64_add(nummiss, 1);
1879 nc_flag = atomic_load_char(&ncp->nc_flag);
1880 if (nc_flag & NCF_NEGATIVE)
1881 goto negative_success;
1883 counter_u64_add(numposhits, 1);
1885 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1886 cache_out_ts(ncp, tsp, ticksp);
1888 if (!cache_ncp_canuse(ncp)) {
1893 vs = vget_prep_smr(*vpp);
1895 if (__predict_false(vs == VGET_NONE)) {
1899 error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1906 if (__predict_false(cnp->cn_nameiop == CREATE)) {
1907 if (cnp->cn_flags & ISLASTCN) {
1913 cache_out_ts(ncp, tsp, ticksp);
1914 whiteout = (ncp->nc_flag & NCF_WHITE);
1915 neg_promote = cache_neg_hit_prep(ncp);
1916 if (!cache_ncp_canuse(ncp)) {
1917 cache_neg_hit_abort(ncp);
1923 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1926 cache_neg_hit_finish(ncp);
1930 cnp->cn_flags |= ISWHITEOUT;
1933 return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1936 struct celockstate {
1940 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1941 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1944 cache_celockstate_init(struct celockstate *cel)
1947 bzero(cel, sizeof(*cel));
1951 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1954 struct mtx *vlp1, *vlp2;
1956 MPASS(cel->vlp[0] == NULL);
1957 MPASS(cel->vlp[1] == NULL);
1958 MPASS(cel->vlp[2] == NULL);
1960 MPASS(vp != NULL || dvp != NULL);
1962 vlp1 = VP2VNODELOCK(vp);
1963 vlp2 = VP2VNODELOCK(dvp);
1964 cache_sort_vnodes(&vlp1, &vlp2);
1975 cache_unlock_vnodes_cel(struct celockstate *cel)
1978 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1980 if (cel->vlp[0] != NULL)
1981 mtx_unlock(cel->vlp[0]);
1982 if (cel->vlp[1] != NULL)
1983 mtx_unlock(cel->vlp[1]);
1984 if (cel->vlp[2] != NULL)
1985 mtx_unlock(cel->vlp[2]);
1989 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1994 cache_assert_vlp_locked(cel->vlp[0]);
1995 cache_assert_vlp_locked(cel->vlp[1]);
1996 MPASS(cel->vlp[2] == NULL);
1999 vlp = VP2VNODELOCK(vp);
2002 if (vlp >= cel->vlp[1]) {
2005 if (mtx_trylock(vlp))
2007 cache_lock_vnodes_cel_3_failures++;
2008 cache_unlock_vnodes_cel(cel);
2009 if (vlp < cel->vlp[0]) {
2011 mtx_lock(cel->vlp[0]);
2012 mtx_lock(cel->vlp[1]);
2014 if (cel->vlp[0] != NULL)
2015 mtx_lock(cel->vlp[0]);
2017 mtx_lock(cel->vlp[1]);
2027 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2031 MPASS(cel->blp[0] == NULL);
2032 MPASS(cel->blp[1] == NULL);
2034 cache_sort_vnodes(&blp1, &blp2);
2045 cache_unlock_buckets_cel(struct celockstate *cel)
2048 if (cel->blp[0] != NULL)
2049 mtx_unlock(cel->blp[0]);
2050 mtx_unlock(cel->blp[1]);
2054 * Lock part of the cache affected by the insertion.
2056 * This means vnodelocks for dvp, vp and the relevant bucketlock.
2057 * However, insertion can result in removal of an old entry. In this
2058 * case we have an additional vnode and bucketlock pair to lock.
2060 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2061 * preserving the locking order (smaller address first).
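/*
 * Hedged sketch of how the insertion path uses this (see cache_enter_time()
 * below):
 *
 *	cache_celockstate_init(&cel);
 *	cache_enter_lock(&cel, dvp, vp, hash);
 *	(look for a duplicate entry, otherwise link the new one in)
 *	cache_enter_unlock(&cel);
 */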
2064 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2067 struct namecache *ncp;
2068 struct mtx *blps[2];
2070 blps[0] = HASH2BUCKETLOCK(hash);
2073 cache_lock_vnodes_cel(cel, dvp, vp);
2074 if (vp == NULL || vp->v_type != VDIR)
2076 ncp = vp->v_cache_dd;
2079 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2081 MPASS(ncp->nc_dvp == vp);
2082 blps[1] = NCP2BUCKETLOCK(ncp);
2083 if (ncp->nc_flag & NCF_NEGATIVE)
2085 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2088 * All vnodes got re-locked. Re-validate the state and if
2089 * nothing changed we are done. Otherwise restart.
2091 if (ncp == vp->v_cache_dd &&
2092 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2093 blps[1] == NCP2BUCKETLOCK(ncp) &&
2094 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2096 cache_unlock_vnodes_cel(cel);
2101 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2105 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2108 struct namecache *ncp;
2109 struct mtx *blps[2];
2111 blps[0] = HASH2BUCKETLOCK(hash);
2114 cache_lock_vnodes_cel(cel, dvp, vp);
2115 ncp = dvp->v_cache_dd;
2118 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2120 MPASS(ncp->nc_dvp == dvp);
2121 blps[1] = NCP2BUCKETLOCK(ncp);
2122 if (ncp->nc_flag & NCF_NEGATIVE)
2124 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2126 if (ncp == dvp->v_cache_dd &&
2127 (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2128 blps[1] == NCP2BUCKETLOCK(ncp) &&
2129 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2131 cache_unlock_vnodes_cel(cel);
2136 cache_lock_buckets_cel(cel, blps[0], blps[1]);
2140 cache_enter_unlock(struct celockstate *cel)
2143 cache_unlock_buckets_cel(cel);
2144 cache_unlock_vnodes_cel(cel);
2147 static void __noinline
2148 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2149 struct componentname *cnp)
2151 struct celockstate cel;
2152 struct namecache *ncp;
2156 if (dvp->v_cache_dd == NULL)
2158 len = cnp->cn_namelen;
2159 cache_celockstate_init(&cel);
2160 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2161 cache_enter_lock_dd(&cel, dvp, vp, hash);
2162 vn_seqc_write_begin(dvp);
2163 ncp = dvp->v_cache_dd;
2164 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2165 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2166 cache_zap_locked(ncp);
2170 dvp->v_cache_dd = NULL;
2171 vn_seqc_write_end(dvp);
2172 cache_enter_unlock(&cel);
2178 * Add an entry to the cache.
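/*
 * Hedged usage sketch: filesystems normally reach this through the
 * cache_enter() wrapper from sys/vnode.h (which passes NULL timestamps) at
 * the end of a successful lookup, or with vp == NULL to record a negative
 * entry:
 *
 *	if ((cnp->cn_flags & MAKEENTRY) != 0)
 *		cache_enter(dvp, *vpp, cnp);
 */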
2181 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2182 struct timespec *tsp, struct timespec *dtsp)
2184 struct celockstate cel;
2185 struct namecache *ncp, *n2, *ndd;
2186 struct namecache_ts *ncp_ts;
2187 struct nchashhead *ncpp;
2192 VNPASS(dvp != vp, dvp);
2193 VNPASS(!VN_IS_DOOMED(dvp), dvp);
2194 VNPASS(dvp->v_type != VNON, dvp);
2196 VNPASS(!VN_IS_DOOMED(vp), vp);
2197 VNPASS(vp->v_type != VNON, vp);
2201 if (__predict_false(!doingcache))
2206 if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2207 if (cnp->cn_namelen == 1)
2209 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2210 cache_enter_dotdot_prep(dvp, vp, cnp);
2211 flag = NCF_ISDOTDOT;
2215 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2219 cache_celockstate_init(&cel);
2224 * Calculate the hash key and setup as much of the new
2225 * namecache entry as possible before acquiring the lock.
2227 ncp->nc_flag = flag | NCF_WIP;
2230 cache_neg_init(ncp);
2233 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2234 ncp_ts->nc_time = *tsp;
2235 ncp_ts->nc_ticks = ticks;
2236 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2238 ncp_ts->nc_dotdottime = *dtsp;
2239 ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2242 len = ncp->nc_nlen = cnp->cn_namelen;
2243 hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2244 memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2245 ncp->nc_name[len] = '\0';
2246 cache_enter_lock(&cel, dvp, vp, hash);
2249 * See if this vnode or negative entry is already in the cache
2250 * with this name. This can happen with concurrent lookups of
2251 * the same path name.
2253 ncpp = NCHHASH(hash);
2254 CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2255 if (n2->nc_dvp == dvp &&
2256 n2->nc_nlen == cnp->cn_namelen &&
2257 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2258 MPASS(cache_ncp_canuse(n2));
2259 if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2261 ("%s: found entry pointing to a different vnode (%p != %p)",
2262 __func__, NULL, vp));
2264 KASSERT(n2->nc_vp == vp,
2265 ("%s: found entry pointing to a different vnode (%p != %p)",
2266 __func__, n2->nc_vp, vp));
2268 * Entries are supposed to be immutable unless in the
2269 * process of getting destroyed. Accommodating for
2270 * changing timestamps is possible but not worth it.
2271 * This should be harmless in terms of correctness, in
2272 * the worst case resulting in an earlier expiration.
2273 * Alternatively, the found entry can be replaced altogether.
2276 MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2279 KASSERT((n2->nc_flag & NCF_TS) != 0,
2281 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2282 n2_ts->nc_time = ncp_ts->nc_time;
2283 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2285 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2286 n2_ts->nc_nc.nc_flag |= NCF_DTS;
2290 SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2292 goto out_unlock_free;
2296 if (flag == NCF_ISDOTDOT) {
2298 * See if we are trying to add .. entry, but some other lookup
2299 * has populated v_cache_dd pointer already.
2301 if (dvp->v_cache_dd != NULL)
2302 goto out_unlock_free;
2303 KASSERT(vp == NULL || vp->v_type == VDIR,
2304 ("wrong vnode type %p", vp));
2305 vn_seqc_write_begin(dvp);
2306 dvp->v_cache_dd = ncp;
2307 vn_seqc_write_end(dvp);
2311 if (flag != NCF_ISDOTDOT) {
2313 * For this case, the cache entry maps both the
2314 * directory name in it and the name ".." for the
2315 * directory's parent.
2317 vn_seqc_write_begin(vp);
2318 if ((ndd = vp->v_cache_dd) != NULL) {
2319 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2320 cache_zap_locked(ndd);
2324 vp->v_cache_dd = ncp;
2325 vn_seqc_write_end(vp);
2326 } else if (vp->v_type != VDIR) {
2327 if (vp->v_cache_dd != NULL) {
2328 vn_seqc_write_begin(vp);
2329 vp->v_cache_dd = NULL;
2330 vn_seqc_write_end(vp);
2335 if (flag != NCF_ISDOTDOT) {
2336 if (LIST_EMPTY(&dvp->v_cache_src)) {
2337 cache_hold_vnode(dvp);
2339 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2343 * If the entry is "negative", we place it into the
2344 * "negative" cache queue, otherwise, we place it into the
2345 * destination vnode's cache entries queue.
2348 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2349 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2352 if (cnp->cn_flags & ISWHITEOUT)
2353 ncp->nc_flag |= NCF_WHITE;
2354 cache_neg_insert(ncp);
2355 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2360 * Insert the new namecache entry into the appropriate chain
2361 * within the cache entries table.
2363 CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2365 atomic_thread_fence_rel();
2367 * Mark the entry as fully constructed.
2368 * It is immutable past this point until its removal.
2370 atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2372 cache_enter_unlock(&cel);
2377 cache_enter_unlock(&cel);
2383 cache_roundup_2(u_int val)
2387 for (res = 1; res <= val; res <<= 1)
2393 static struct nchashhead *
2394 nchinittbl(u_long elements, u_long *hashmask)
2396 struct nchashhead *hashtbl;
2399 hashsize = cache_roundup_2(elements) / 2;
2401 hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2402 for (i = 0; i < hashsize; i++)
2403 CK_SLIST_INIT(&hashtbl[i]);
2404 *hashmask = hashsize - 1;
2409 ncfreetbl(struct nchashhead *hashtbl)
2412 free(hashtbl, M_VFSCACHE);
2416 * Name cache initialization, from vfs_init() when we are booting
2419 nchinit(void *dummy __unused)
2423 cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2424 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2425 cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2426 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2427 cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2428 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2429 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2430 NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2432 VFS_SMR_ZONE_SET(cache_zone_small);
2433 VFS_SMR_ZONE_SET(cache_zone_small_ts);
2434 VFS_SMR_ZONE_SET(cache_zone_large);
2435 VFS_SMR_ZONE_SET(cache_zone_large_ts);
2437 ncsize = desiredvnodes * ncsizefactor;
2438 cache_recalc_neg_min(ncnegminpct);
2439 nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2440 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2441 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2443 if (ncbuckethash > nchash)
2444 ncbuckethash = nchash;
2445 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2447 for (i = 0; i < numbucketlocks; i++)
2448 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2449 ncvnodehash = ncbuckethash;
2450 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2452 for (i = 0; i < numvnodelocks; i++)
2453 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2455 for (i = 0; i < numneglists; i++) {
2456 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2457 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2458 TAILQ_INIT(&neglists[i].nl_list);
2459 TAILQ_INIT(&neglists[i].nl_hotlist);
2462 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
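/*
 * A worked example of the lock sizing above, assuming an 8-CPU machine (the
 * numbers are illustrative only):
 *
 *	ncbuckethash = cache_roundup_2(8 * 8) - 1 = 128 - 1 = 127
 *
 * i.e. a power-of-two mask, clamped so that there is always more than one
 * bucket lock and never more locks than hash chains. The bucket and vnode
 * lock arrays only shard contention; their exact sizes do not affect
 * correctness, which is why the clamps can be as arbitrary as the comment
 * above admits.
 */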
2465 cache_vnode_init(struct vnode *vp)
2468 LIST_INIT(&vp->v_cache_src);
2469 TAILQ_INIT(&vp->v_cache_dst);
2470 vp->v_cache_dd = NULL;
2475 cache_changesize(u_long newmaxvnodes)
2477 struct nchashhead *new_nchashtbl, *old_nchashtbl;
2478 u_long new_nchash, old_nchash;
2479 struct namecache *ncp;
2484 newncsize = newmaxvnodes * ncsizefactor;
2485 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2486 if (newmaxvnodes < numbucketlocks)
2487 newmaxvnodes = numbucketlocks;
2489 new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2490 /* If same hash table size, nothing to do */
2491 if (nchash == new_nchash) {
2492 ncfreetbl(new_nchashtbl);
2496 * Move everything from the old hash table to the new table.
2497 * None of the namecache entries in the table can be removed
2498 * because to do so, they have to be removed from the hash table.
2500 cache_lock_all_vnodes();
2501 cache_lock_all_buckets();
2502 old_nchashtbl = nchashtbl;
2503 old_nchash = nchash;
2504 nchashtbl = new_nchashtbl;
2505 nchash = new_nchash;
2506 for (i = 0; i <= old_nchash; i++) {
2507 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2508 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2510 CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2511 CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2515 cache_recalc_neg_min(ncnegminpct);
2516 cache_unlock_all_buckets();
2517 cache_unlock_all_vnodes();
2518 ncfreetbl(old_nchashtbl);
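/*
 * Why the loop above has to re-insert every entry: the chain an entry hangs
 * off is selected by masking its hash with nchash (NCHHASH(hash) expands,
 * roughly, to &nchashtbl[(hash) & nchash]), so a new table size means a new
 * mask and therefore a new chain for most entries. Holding all vnode and
 * bucket locks keeps the set of entries stable for the duration.
 */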
2522 * Invalidate all entries from and to a particular vnode.
2525 cache_purge_impl(struct vnode *vp)
2527 TAILQ_HEAD(, namecache) ncps;
2528 struct namecache *ncp, *nnp;
2529 struct mtx *vlp, *vlp2;
2532 vlp = VP2VNODELOCK(vp);
2536 while (!LIST_EMPTY(&vp->v_cache_src)) {
2537 ncp = LIST_FIRST(&vp->v_cache_src);
2538 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2540 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2542 while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2543 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2544 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2546 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2548 ncp = vp->v_cache_dd;
2550 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2551 ("lost dotdot link"));
2552 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2554 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2556 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2560 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2566 * Opportunistic check to see if there is anything to do.
2569 cache_has_entries(struct vnode *vp)
2572 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2573 vp->v_cache_dd == NULL)
2579 cache_purge(struct vnode *vp)
2582 SDT_PROBE1(vfs, namecache, purge, done, vp);
2583 if (!cache_has_entries(vp))
2585 cache_purge_impl(vp);
2589 * Only to be used by vgone.
2592 cache_purge_vgone(struct vnode *vp)
2596 VNPASS(VN_IS_DOOMED(vp), vp);
2597 if (cache_has_entries(vp)) {
2598 cache_purge_impl(vp);
2603 * Serialize against a potential thread doing cache_purge.
2605 vlp = VP2VNODELOCK(vp);
2606 mtx_wait_unlocked(vlp);
2607 if (cache_has_entries(vp)) {
2608 cache_purge_impl(vp);
2615 * Invalidate all negative entries for a particular directory vnode.
2618 cache_purge_negative(struct vnode *vp)
2620 TAILQ_HEAD(, namecache) ncps;
2621 struct namecache *ncp, *nnp;
2624 SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2625 if (LIST_EMPTY(&vp->v_cache_src))
2628 vlp = VP2VNODELOCK(vp);
2630 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2631 if (!(ncp->nc_flag & NCF_NEGATIVE))
2633 cache_zap_negative_locked_vnode_kl(ncp, vp);
2634 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2637 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2643 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2644 struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2647 ASSERT_VOP_IN_SEQC(fdvp);
2648 ASSERT_VOP_IN_SEQC(fvp);
2649 ASSERT_VOP_IN_SEQC(tdvp);
2651 ASSERT_VOP_IN_SEQC(tvp);
2656 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2657 ("%s: lingering negative entry", __func__));
2659 cache_remove_cnp(tdvp, tcnp);
2665 * Validate that if an entry exists it matches.
2668 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2670 struct namecache *ncp;
2674 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2675 if (CK_SLIST_EMPTY(NCHHASH(hash)))
2677 blp = HASH2BUCKETLOCK(hash);
2679 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2680 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2681 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2682 if (ncp->nc_vp != vp)
2683 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
2684 __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
2693 * Flush all entries referencing a particular filesystem.
2696 cache_purgevfs(struct mount *mp)
2698 struct vnode *vp, *mvp;
2700 SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2702 * Somewhat wasteful iteration over all vnodes. Would be better to
2703 * support filtering and avoid the interlock to begin with.
2705 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2706 if (!cache_has_entries(vp)) {
2718 * Perform canonical checks and cache lookup and pass on to the filesystem
2719 * through the vop_cachedlookup only if needed.
2723 vfs_cache_lookup(struct vop_lookup_args *ap)
2727 struct vnode **vpp = ap->a_vpp;
2728 struct componentname *cnp = ap->a_cnp;
2729 int flags = cnp->cn_flags;
2734 if (dvp->v_type != VDIR)
2737 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2738 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2741 error = vn_dir_check_exec(dvp, cnp);
2745 error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2747 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
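/*
 * A note on the convention relied upon above: cache_lookup() returns -1 on a
 * positive hit (with *vpp set), 0 on a cache miss and an errno such as ENOENT
 * when a negative entry matches. Only the miss case reaches
 * VOP_CACHEDLOOKUP(), which performs the real directory scan and typically
 * re-enters the result into the cache; a positive hit is reported to the
 * VOP_LOOKUP() caller as success.
 */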
2753 /* Implementation of the getcwd syscall. */
2755 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2761 buflen = uap->buflen;
2762 if (__predict_false(buflen < 2))
2764 if (buflen > MAXPATHLEN)
2765 buflen = MAXPATHLEN;
2767 buf = uma_zalloc(namei_zone, M_WAITOK);
2768 error = vn_getcwd(buf, &retbuf, &buflen);
2770 error = copyout(retbuf, uap->buf, buflen);
2771 uma_zfree(namei_zone, buf);
2776 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2782 pwd = pwd_get_smr();
2783 error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2785 VFS_SMR_ASSERT_NOT_ENTERED();
2787 pwd = pwd_hold(curthread);
2788 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2794 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2801 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2802 size_t size, int flags, enum uio_seg pathseg)
2804 struct nameidata nd;
2805 char *retbuf, *freebuf;
2810 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2811 pathseg, path, fd, &cap_fstat_rights, td);
2812 if ((error = namei(&nd)) != 0)
2814 error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2816 error = copyout(retbuf, buf, size);
2817 free(freebuf, M_TEMP);
2824 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2827 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2828 uap->flags, UIO_USERSPACE));
2832 * Retrieve the full filesystem path that corresponds to a vnode from the name
2833 * cache (if available)
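/*
 * A minimal usage sketch for vn_fullpath() below (hypothetical caller; the
 * firm part of the contract is that *retbuf points into *freebuf, which must
 * be released with free(..., M_TEMP) once the path is no longer needed):
 *
 *	char *fullpath, *freepath;
 *
 *	if (vn_fullpath(vp, &fullpath, &freepath) == 0) {
 *		printf("path: %s\n", fullpath);
 *		free(freepath, M_TEMP);
 *	}
 */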
2836 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2843 if (__predict_false(vp == NULL))
2846 buflen = MAXPATHLEN;
2847 buf = malloc(buflen, M_TEMP, M_WAITOK);
2849 pwd = pwd_get_smr();
2850 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2851 VFS_SMR_ASSERT_NOT_ENTERED();
2853 pwd = pwd_hold(curthread);
2854 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2865 * This function is similar to vn_fullpath, but it attempts to look up the
2866 * pathname relative to the global root mount point. This is required for the
2867 * auditing sub-system, as audited pathnames must be absolute, relative to the
2868 * global root mount point.
2871 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2877 if (__predict_false(vp == NULL))
2879 buflen = MAXPATHLEN;
2880 buf = malloc(buflen, M_TEMP, M_WAITOK);
2882 error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2883 VFS_SMR_ASSERT_NOT_ENTERED();
2885 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2894 static struct namecache *
2895 vn_dd_from_dst(struct vnode *vp)
2897 struct namecache *ncp;
2899 cache_assert_vnode_locked(vp);
2900 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2901 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2908 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2911 struct namecache *ncp;
2915 vlp = VP2VNODELOCK(*vp);
2917 ncp = (*vp)->v_cache_dd;
2918 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2919 KASSERT(ncp == vn_dd_from_dst(*vp),
2920 ("%s: mismatch for dd entry (%p != %p)", __func__,
2921 ncp, vn_dd_from_dst(*vp)));
2923 ncp = vn_dd_from_dst(*vp);
2926 if (*buflen < ncp->nc_nlen) {
2929 counter_u64_add(numfullpathfail4, 1);
2931 SDT_PROBE3(vfs, namecache, fullpath, return, error,
2935 *buflen -= ncp->nc_nlen;
2936 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2937 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2946 SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2949 vn_lock(*vp, LK_SHARED | LK_RETRY);
2950 error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2953 counter_u64_add(numfullpathfail2, 1);
2954 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2959 if (VN_IS_DOOMED(dvp)) {
2960 /* forced unmount */
2963 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2967 * *vp has its use count incremented still.
2974 * Resolve a directory to a pathname.
2976 * The name of the directory can always be found in the namecache or fetched
2977 * from the filesystem. There is also guaranteed to be only one parent, meaning
2978 * we can just follow vnodes up until we find the root.
2980 * The vnode must be referenced.
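/*
 * The walk below constructs the path backwards: each component is prepended
 * at the tail of the caller-supplied buffer and buflen shrinks as the walk
 * moves towards the root. The prepend step is essentially (mirroring
 * vn_vptocnp() above, shown here only as an illustration):
 *
 *	buflen -= namelen;
 *	memcpy(buf + buflen, name, namelen);
 *	buf[--buflen] = '/';
 *
 * so when the loop terminates, buf + buflen is the start of the finished
 * string and everything before that offset is unused scratch space.
 */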
2983 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2984 size_t *len, size_t addend)
2986 #ifdef KDTRACE_HOOKS
2987 struct vnode *startvp = vp;
2992 bool slash_prefixed;
2994 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2995 VNPASS(vp->v_usecount > 0, vp);
2999 slash_prefixed = true;
3004 slash_prefixed = false;
3009 SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3010 counter_u64_add(numfullpathcalls, 1);
3011 while (vp != rdir && vp != rootvnode) {
3013 * The vp vnode must be already fully constructed,
3014 * since it is either found in namecache or obtained
3015 * from VOP_VPTOCNP(). We may test for VV_ROOT safely
3016 * without obtaining the vnode lock.
3018 if ((vp->v_vflag & VV_ROOT) != 0) {
3019 vn_lock(vp, LK_RETRY | LK_SHARED);
3022 * With the vnode locked, check for races with
3023 * unmount, forced or not. Note that we
3024 * already verified that vp is not equal to
3025 * the root vnode, which means that
3026 * mnt_vnodecovered can be NULL only for the
3029 if (VN_IS_DOOMED(vp) ||
3030 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3031 vp1->v_mountedhere != vp->v_mount) {
3034 SDT_PROBE3(vfs, namecache, fullpath, return,
3044 if (vp->v_type != VDIR) {
3046 counter_u64_add(numfullpathfail1, 1);
3048 SDT_PROBE3(vfs, namecache, fullpath, return,
3052 error = vn_vptocnp(&vp, buf, &buflen);
3058 SDT_PROBE3(vfs, namecache, fullpath, return, error,
3062 buf[--buflen] = '/';
3063 slash_prefixed = true;
3067 if (!slash_prefixed) {
3070 counter_u64_add(numfullpathfail4, 1);
3071 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3075 buf[--buflen] = '/';
3077 counter_u64_add(numfullpathfound, 1);
3080 *retbuf = buf + buflen;
3081 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3088 * Resolve an arbitrary vnode to a pathname.
3091 * - hardlinks are not tracked, thus if the vnode is not a directory this can
3092 * resolve to a different path than the one used to find it
3093 * - namecache is not mandatory, meaning names are not guaranteed to be added
3094 * (in which case resolving fails)
3096 static void __inline
3097 cache_rev_failed_impl(int *reason, int line)
3102 #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__)
3105 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3106 char **retbuf, size_t *buflen, size_t addend)
3108 #ifdef KDTRACE_HOOKS
3109 struct vnode *startvp = vp;
3113 struct namecache *ncp;
3117 #ifdef KDTRACE_HOOKS
3120 seqc_t vp_seqc, tvp_seqc;
3123 VFS_SMR_ASSERT_ENTERED();
3125 if (!cache_fast_revlookup) {
3130 orig_buflen = *buflen;
3133 MPASS(*buflen >= 2);
3135 buf[*buflen] = '\0';
3138 if (vp == rdir || vp == rootvnode) {
3146 #ifdef KDTRACE_HOOKS
3150 ncp = NULL; /* for sdt probe down below */
3151 vp_seqc = vn_seqc_read_any(vp);
3152 if (seqc_in_modify(vp_seqc)) {
3153 cache_rev_failed(&reason);
3158 #ifdef KDTRACE_HOOKS
3161 if ((vp->v_vflag & VV_ROOT) != 0) {
3162 mp = atomic_load_ptr(&vp->v_mount);
3164 cache_rev_failed(&reason);
3167 tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3168 tvp_seqc = vn_seqc_read_any(tvp);
3169 if (seqc_in_modify(tvp_seqc)) {
3170 cache_rev_failed(&reason);
3173 if (!vn_seqc_consistent(vp, vp_seqc)) {
3174 cache_rev_failed(&reason);
3181 ncp = atomic_load_ptr(&vp->v_cache_dd);
3183 cache_rev_failed(&reason);
3186 nc_flag = atomic_load_char(&ncp->nc_flag);
3187 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3188 cache_rev_failed(&reason);
3191 if (!cache_ncp_canuse(ncp)) {
3192 cache_rev_failed(&reason);
3195 if (ncp->nc_nlen >= *buflen) {
3196 cache_rev_failed(&reason);
3200 *buflen -= ncp->nc_nlen;
3201 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3205 tvp_seqc = vn_seqc_read_any(tvp);
3206 if (seqc_in_modify(tvp_seqc)) {
3207 cache_rev_failed(&reason);
3210 if (!vn_seqc_consistent(vp, vp_seqc)) {
3211 cache_rev_failed(&reason);
3216 if (vp == rdir || vp == rootvnode)
3221 *retbuf = buf + *buflen;
3222 *buflen = orig_buflen - *buflen + addend;
3223 SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3227 *buflen = orig_buflen;
3228 SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3234 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3237 size_t orig_buflen, addend;
3243 orig_buflen = *buflen;
3247 if (vp->v_type != VDIR) {
3249 buf[*buflen] = '\0';
3250 error = vn_vptocnp(&vp, buf, buflen);
3259 addend = orig_buflen - *buflen;
3262 return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
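/*
 * A note on the addend argument used above: when the vnode being resolved is
 * not a directory, vn_vptocnp() first writes its last component at the very
 * end of the buffer and steps to the parent vnode; addend records how many
 * bytes were consumed that way. vn_fullpath_dir() then prepends the directory
 * portion in front of those bytes and adds addend back when reporting the
 * final length, just like the analogous *buflen arithmetic in
 * vn_fullpath_any_smr() above.
 */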
3266 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3268 * Since the namecache does not track hardlinks, the caller is expected to first
3269 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3271 * Then we have 2 cases:
3272 * - if the found vnode is a directory, the path can be constructed just by
3273 * following names up the chain
3274 * - otherwise we populate the buffer with the saved name and start resolving from the parent
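/*
 * A usage sketch (a simplified variant of what kern___realpathat() above
 * does; error handling and the release of the nameidata references are
 * omitted). The lookup must carry SAVENAME | WANTPARENT so that the final
 * component and its parent survive in the nameidata handed to this function:
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT, UIO_SYSSPACE,
 *	    path, td);
 *	if (namei(&nd) == 0) {
 *		error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
 *		...
 *		free(freebuf, M_TEMP);
 *	}
 */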
3278 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3283 struct componentname *cnp;
3291 if (*buflen > MAXPATHLEN)
3292 *buflen = MAXPATHLEN;
3294 buf = malloc(*buflen, M_TEMP, M_WAITOK);
3299 * Check for VBAD to work around the vp_crossmp bug in lookup().
3301 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3302 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3303 * If the type is VDIR (like in this very case) we can skip looking
3304 * at ni_dvp in the first place. However, since vnodes get passed here
3305 * unlocked the target may transition to doomed state (type == VBAD)
3306 * before we get to evaluate the condition. If this happens, we will
3307 * populate part of the buffer and descend to vn_fullpath_dir with
3308 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3310 * This should be atomic_load(&vp->v_type) but it is illegal to take
3311 * an address of a bit field, even if said field is sized to char.
3312 * Work around the problem by reading the value into a full-sized enum
3313 * and then re-reading it with atomic_load which will still prevent
3314 * the compiler from re-reading down the road.
3317 type = atomic_load_int(&type);
3324 addend = cnp->cn_namelen + 2;
3325 if (*buflen < addend) {
3330 tmpbuf = buf + *buflen;
3332 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3333 tmpbuf[addend - 1] = '\0';
3338 pwd = pwd_get_smr();
3339 error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3341 VFS_SMR_ASSERT_NOT_ENTERED();
3343 pwd = pwd_hold(curthread);
3345 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3361 vn_dir_dd_ino(struct vnode *vp)
3363 struct namecache *ncp;
3368 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3369 vlp = VP2VNODELOCK(vp);
3371 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3372 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3375 vs = vget_prep(ddvp);
3377 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3386 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3388 struct namecache *ncp;
3392 vlp = VP2VNODELOCK(vp);
3394 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3395 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3401 l = min(ncp->nc_nlen, buflen - 1);
3402 memcpy(buf, ncp->nc_name, l);
3409 * This function updates the path string to the vnode's full global path
3410 * and checks the size of the new path string against the pathlen argument.
3412 * Requires a locked, referenced vnode.
3413 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3415 * If vp is a directory, the call to vn_fullpath_global() always succeeds
3416 * because it falls back to the ".." lookup if the namecache lookup fails.
3419 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3422 struct nameidata nd;
3427 ASSERT_VOP_ELOCKED(vp, __func__);
3429 /* Construct global filesystem path from vp. */
3431 error = vn_fullpath_global(vp, &rpath, &fbuf);
3438 if (strlen(rpath) >= pathlen) {
3440 error = ENAMETOOLONG;
3445 * Re-lookup the vnode by path to detect a possible rename.
3446 * As a side effect, the vnode is relocked.
3447 * If vnode was renamed, return ENOENT.
3449 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3450 UIO_SYSSPACE, path, td);
3456 NDFREE(&nd, NDF_ONLY_PNBUF);
3460 strcpy(path, rpath);
3473 db_print_vpath(struct vnode *vp)
3476 while (vp != NULL) {
3477 db_printf("%p: ", vp);
3478 if (vp == rootvnode) {
3482 if (vp->v_vflag & VV_ROOT) {
3483 db_printf("<mount point>");
3484 vp = vp->v_mount->mnt_vnodecovered;
3486 struct namecache *ncp;
3490 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3493 for (i = 0; i < ncp->nc_nlen; i++)
3494 db_printf("%c", *ncn++);
3507 DB_SHOW_COMMAND(vpath, db_show_vpath)
3512 db_printf("usage: show vpath <struct vnode *>\n");
3516 vp = (struct vnode *)addr;
3522 static bool __read_frequently cache_fast_lookup = true;
3523 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3524 &cache_fast_lookup, 0, "");
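/*
 * The knob above is writable at runtime; with the standard sysctl naming
 * derived from the declaration, the lockless path can be turned off for
 * debugging with:
 *
 *	sysctl vfs.cache_fast_lookup=0
 */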
3526 #define CACHE_FPL_FAILED -2020
3529 cache_fpl_cleanup_cnp(struct componentname *cnp)
3532 uma_zfree(namei_zone, cnp->cn_pnbuf);
3534 cnp->cn_pnbuf = NULL;
3535 cnp->cn_nameptr = NULL;
3540 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3542 struct componentname *cnp;
3545 while (*(cnp->cn_nameptr) == '/') {
3550 *dpp = ndp->ni_rootdir;
3554 * Components of nameidata (or objects it can point to) which may
3555 * need restoring in case fast path lookup fails.
3557 struct nameidata_saved {
3565 struct nameidata *ndp;
3566 struct componentname *cnp;
3572 struct nameidata_saved snd;
3574 enum cache_fpl_status status:8;
3580 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3583 snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3584 snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3585 snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3586 snd->ni_pathlen = fpl->ndp->ni_pathlen;
3590 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3593 fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3594 fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3595 fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3596 fpl->ndp->ni_pathlen = snd->ni_pathlen;
3600 #define cache_fpl_smr_assert_entered(fpl) ({ \
3601 struct cache_fpl *_fpl = (fpl); \
3602 MPASS(_fpl->in_smr == true); \
3603 VFS_SMR_ASSERT_ENTERED(); \
3605 #define cache_fpl_smr_assert_not_entered(fpl) ({ \
3606 struct cache_fpl *_fpl = (fpl); \
3607 MPASS(_fpl->in_smr == false); \
3608 VFS_SMR_ASSERT_NOT_ENTERED(); \
3611 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3612 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3615 #define cache_fpl_smr_enter_initial(fpl) ({ \
3616 struct cache_fpl *_fpl = (fpl); \
3618 _fpl->in_smr = true; \
3621 #define cache_fpl_smr_enter(fpl) ({ \
3622 struct cache_fpl *_fpl = (fpl); \
3623 MPASS(_fpl->in_smr == false); \
3625 _fpl->in_smr = true; \
3628 #define cache_fpl_smr_exit(fpl) ({ \
3629 struct cache_fpl *_fpl = (fpl); \
3630 MPASS(_fpl->in_smr == true); \
3632 _fpl->in_smr = false; \
3636 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3639 if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3640 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3641 ("%s: converting to abort from %d at %d, set at %d\n",
3642 __func__, fpl->status, line, fpl->line));
3644 fpl->status = CACHE_FPL_STATUS_ABORTED;
3646 return (CACHE_FPL_FAILED);
3649 #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__)
3652 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3655 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3656 ("%s: setting to partial at %d, but already set to %d at %d\n",
3657 __func__, line, fpl->status, fpl->line));
3658 cache_fpl_smr_assert_entered(fpl);
3659 fpl->status = CACHE_FPL_STATUS_PARTIAL;
3661 return (CACHE_FPL_FAILED);
3664 #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__)
3667 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3670 KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3671 ("%s: setting to handled at %d, but already set to %d at %d\n",
3672 __func__, line, fpl->status, fpl->line));
3673 cache_fpl_smr_assert_not_entered(fpl);
3674 MPASS(error != CACHE_FPL_FAILED);
3675 fpl->status = CACHE_FPL_STATUS_HANDLED;
3680 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3682 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3683 (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3684 SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3686 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3687 (ISDOTDOT | MAKEENTRY | ISLASTCN)
3689 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3690 "supported and internal flags overlap");
3693 cache_fpl_islastcn(struct nameidata *ndp)
3696 return (*ndp->ni_next == 0);
3700 cache_fpl_isdotdot(struct componentname *cnp)
3703 if (cnp->cn_namelen == 2 &&
3704 cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3710 cache_can_fplookup(struct cache_fpl *fpl)
3712 struct nameidata *ndp;
3713 struct componentname *cnp;
3718 td = cnp->cn_thread;
3720 if (!cache_fast_lookup) {
3721 cache_fpl_aborted(fpl);
3725 if (mac_vnode_check_lookup_enabled()) {
3726 cache_fpl_aborted(fpl);
3730 if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3731 cache_fpl_aborted(fpl);
3734 if (IN_CAPABILITY_MODE(td)) {
3735 cache_fpl_aborted(fpl);
3738 if (AUDITING_TD(td)) {
3739 cache_fpl_aborted(fpl);
3742 if (ndp->ni_startdir != NULL) {
3743 cache_fpl_aborted(fpl);
3750 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3752 struct nameidata *ndp;
3757 error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3758 if (__predict_false(error != 0)) {
3759 cache_fpl_smr_exit(fpl);
3760 return (cache_fpl_aborted(fpl));
3762 fpl->fsearch = fsearch;
3767 cache_fplookup_vnode_supported(struct vnode *vp)
3770 return (vp->v_type != VLNK);
3773 static int __noinline
3774 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3777 struct componentname *cnp;
3783 cache_fpl_smr_exit(fpl);
3784 if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3785 return (cache_fpl_handled(fpl, ENOENT));
3787 return (cache_fpl_aborted(fpl));
3791 * The target vnode is not supported, prepare for the slow path to take over.
3793 static int __noinline
3794 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3796 struct nameidata *ndp;
3797 struct componentname *cnp;
3807 dvp_seqc = fpl->dvp_seqc;
3809 if (!pwd_hold_smr(pwd)) {
3810 cache_fpl_smr_exit(fpl);
3811 return (cache_fpl_aborted(fpl));
3814 dvs = vget_prep_smr(dvp);
3815 cache_fpl_smr_exit(fpl);
3816 if (__predict_false(dvs == VGET_NONE)) {
3818 return (cache_fpl_aborted(fpl));
3821 vget_finish_ref(dvp, dvs);
3822 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3825 return (cache_fpl_aborted(fpl));
3828 cache_fpl_restore(fpl, &fpl->snd);
3830 ndp->ni_startdir = dvp;
3831 cnp->cn_flags |= MAKEENTRY;
3832 if (cache_fpl_islastcn(ndp))
3833 cnp->cn_flags |= ISLASTCN;
3834 if (cache_fpl_isdotdot(cnp))
3835 cnp->cn_flags |= ISDOTDOT;
3841 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3843 struct componentname *cnp;
3850 tvp_seqc = fpl->tvp_seqc;
3852 if ((cnp->cn_flags & LOCKLEAF) != 0) {
3853 lkflags = LK_SHARED;
3854 if ((cnp->cn_flags & LOCKSHARED) == 0)
3855 lkflags = LK_EXCLUSIVE;
3856 error = vget_finish(tvp, lkflags, tvs);
3857 if (__predict_false(error != 0)) {
3858 return (cache_fpl_aborted(fpl));
3861 vget_finish_ref(tvp, tvs);
3864 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3865 if ((cnp->cn_flags & LOCKLEAF) != 0)
3869 return (cache_fpl_aborted(fpl));
3872 return (cache_fpl_handled(fpl, 0));
3876 * They want to possibly modify the state of the namecache.
3878 * Don't try to match the API contract, just leave.
3879 * TODO: this leaves scalability on the table
3882 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3884 struct componentname *cnp;
3887 MPASS(cnp->cn_nameiop != LOOKUP);
3888 return (cache_fpl_partial(fpl));
3891 static int __noinline
3892 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3894 struct componentname *cnp;
3895 enum vgetstate dvs, tvs;
3896 struct vnode *dvp, *tvp;
3902 dvp_seqc = fpl->dvp_seqc;
3905 MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3908 * This is less efficient than it can be for simplicity.
3910 dvs = vget_prep_smr(dvp);
3911 if (__predict_false(dvs == VGET_NONE)) {
3912 return (cache_fpl_aborted(fpl));
3914 tvs = vget_prep_smr(tvp);
3915 if (__predict_false(tvs == VGET_NONE)) {
3916 cache_fpl_smr_exit(fpl);
3917 vget_abort(dvp, dvs);
3918 return (cache_fpl_aborted(fpl));
3921 cache_fpl_smr_exit(fpl);
3923 if ((cnp->cn_flags & LOCKPARENT) != 0) {
3924 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3925 if (__predict_false(error != 0)) {
3926 vget_abort(tvp, tvs);
3927 return (cache_fpl_aborted(fpl));
3930 vget_finish_ref(dvp, dvs);
3933 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3934 vget_abort(tvp, tvs);
3935 if ((cnp->cn_flags & LOCKPARENT) != 0)
3939 return (cache_fpl_aborted(fpl));
3942 error = cache_fplookup_final_child(fpl, tvs);
3943 if (__predict_false(error != 0)) {
3944 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3945 if ((cnp->cn_flags & LOCKPARENT) != 0)
3952 MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3957 cache_fplookup_final(struct cache_fpl *fpl)
3959 struct componentname *cnp;
3961 struct vnode *dvp, *tvp;
3966 dvp_seqc = fpl->dvp_seqc;
3969 VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3971 if (cnp->cn_nameiop != LOOKUP) {
3972 return (cache_fplookup_final_modifying(fpl));
3975 if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3976 return (cache_fplookup_final_withparent(fpl));
3978 tvs = vget_prep_smr(tvp);
3979 if (__predict_false(tvs == VGET_NONE)) {
3980 return (cache_fpl_partial(fpl));
3983 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3984 cache_fpl_smr_exit(fpl);
3985 vget_abort(tvp, tvs);
3986 return (cache_fpl_aborted(fpl));
3989 cache_fpl_smr_exit(fpl);
3990 return (cache_fplookup_final_child(fpl, tvs));
3993 static int __noinline
3994 cache_fplookup_dot(struct cache_fpl *fpl)
4001 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4002 if (seqc_in_modify(fpl->tvp_seqc)) {
4003 return (cache_fpl_aborted(fpl));
4006 counter_u64_add(dothits, 1);
4007 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
4012 static int __noinline
4013 cache_fplookup_dotdot(struct cache_fpl *fpl)
4015 struct nameidata *ndp;
4016 struct componentname *cnp;
4017 struct namecache *ncp;
4027 * XXX this is racy the same way regular lookup is
4029 for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4031 if (dvp == pr->pr_root)
4034 if (dvp == ndp->ni_rootdir ||
4035 dvp == ndp->ni_topdir ||
4039 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4040 if (seqc_in_modify(fpl->tvp_seqc)) {
4041 return (cache_fpl_aborted(fpl));
4046 if ((dvp->v_vflag & VV_ROOT) != 0) {
4049 * The opposite of climb mount is needed here.
4051 return (cache_fpl_aborted(fpl));
4054 ncp = atomic_load_ptr(&dvp->v_cache_dd);
4056 return (cache_fpl_aborted(fpl));
4059 nc_flag = atomic_load_char(&ncp->nc_flag);
4060 if ((nc_flag & NCF_ISDOTDOT) != 0) {
4061 if ((nc_flag & NCF_NEGATIVE) != 0)
4062 return (cache_fpl_aborted(fpl));
4063 fpl->tvp = ncp->nc_vp;
4065 fpl->tvp = ncp->nc_dvp;
4068 if (!cache_ncp_canuse(ncp)) {
4069 return (cache_fpl_aborted(fpl));
4072 fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4073 if (seqc_in_modify(fpl->tvp_seqc)) {
4074 return (cache_fpl_partial(fpl));
4077 counter_u64_add(dotdothits, 1);
4081 static int __noinline
4082 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4087 nc_flag = atomic_load_char(&ncp->nc_flag);
4088 MPASS((nc_flag & NCF_NEGATIVE) != 0);
4090 * If they want to create an entry we need to replace this one.
4092 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4095 * This should call something similar to
4096 * cache_fplookup_final_modifying.
4098 return (cache_fpl_partial(fpl));
4100 neg_promote = cache_neg_hit_prep(ncp);
4101 if (!cache_ncp_canuse(ncp)) {
4102 cache_neg_hit_abort(ncp);
4103 return (cache_fpl_partial(fpl));
4105 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4106 cache_neg_hit_abort(ncp);
4107 return (cache_fpl_partial(fpl));
4110 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4112 cache_neg_hit_finish(ncp);
4113 cache_fpl_smr_exit(fpl);
4114 return (cache_fpl_handled(fpl, ENOENT));
4118 cache_fplookup_next(struct cache_fpl *fpl)
4120 struct componentname *cnp;
4121 struct namecache *ncp;
4122 struct vnode *dvp, *tvp;
4129 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4130 return (cache_fplookup_dot(fpl));
4133 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4135 CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4136 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4137 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4142 * If there is no entry we have to punt to the slow path to perform
4143 * actual lookup. Should there be nothing with this name a negative
4144 * entry will be created.
4146 if (__predict_false(ncp == NULL)) {
4147 return (cache_fpl_partial(fpl));
4150 tvp = atomic_load_ptr(&ncp->nc_vp);
4151 nc_flag = atomic_load_char(&ncp->nc_flag);
4152 if ((nc_flag & NCF_NEGATIVE) != 0) {
4153 return (cache_fplookup_neg(fpl, ncp, hash));
4156 if (!cache_ncp_canuse(ncp)) {
4157 return (cache_fpl_partial(fpl));
4161 fpl->tvp_seqc = vn_seqc_read_any(tvp);
4162 if (seqc_in_modify(fpl->tvp_seqc)) {
4163 return (cache_fpl_partial(fpl));
4166 if (!cache_fplookup_vnode_supported(tvp)) {
4167 return (cache_fpl_partial(fpl));
4170 counter_u64_add(numposhits, 1);
4171 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4176 cache_fplookup_mp_supported(struct mount *mp)
4181 if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4187 * Walk up the mount stack (if any).
4189 * Correctness is provided in the following ways:
4190 * - all vnodes are protected from freeing with SMR
4191 * - struct mount objects are type stable making them always safe to access
4192 * - stability of the particular mount is provided by busying it
4193 * - relationship between the vnode which is mounted on and the mount is
4194 * verified with the vnode sequence counter after busying
4195 * - association between root vnode of the mount and the mount is protected
4198 * From that point on we can read the sequence counter of the root vnode
4199 * and get the next mount on the stack (if any) using the same protection.
4201 * By the end of a successful walk we are guaranteed the reached state was
4202 * indeed present at least at some point which matches the regular lookup.
4204 static int __noinline
4205 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4207 struct mount *mp, *prev_mp;
4212 vp_seqc = fpl->tvp_seqc;
4214 VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4215 mp = atomic_load_ptr(&vp->v_mountedhere);
4221 if (!vfs_op_thread_enter_crit(mp)) {
4222 if (prev_mp != NULL)
4223 vfs_op_thread_exit_crit(prev_mp);
4224 return (cache_fpl_partial(fpl));
4226 if (prev_mp != NULL)
4227 vfs_op_thread_exit_crit(prev_mp);
4228 if (!vn_seqc_consistent(vp, vp_seqc)) {
4229 vfs_op_thread_exit_crit(mp);
4230 return (cache_fpl_partial(fpl));
4232 if (!cache_fplookup_mp_supported(mp)) {
4233 vfs_op_thread_exit_crit(mp);
4234 return (cache_fpl_partial(fpl));
4236 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4237 if (vp == NULL || VN_IS_DOOMED(vp)) {
4238 vfs_op_thread_exit_crit(mp);
4239 return (cache_fpl_partial(fpl));
4241 vp_seqc = vn_seqc_read_any(vp);
4242 if (seqc_in_modify(vp_seqc)) {
4243 vfs_op_thread_exit_crit(mp);
4244 return (cache_fpl_partial(fpl));
4247 mp = atomic_load_ptr(&vp->v_mountedhere);
4252 vfs_op_thread_exit_crit(prev_mp);
4254 fpl->tvp_seqc = vp_seqc;
4259 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4267 * Hack: while this is a union, the pointer tends to be NULL so save on a branch.
4270 mp = atomic_load_ptr(&vp->v_mountedhere);
4273 if (vp->v_type == VDIR)
4281 * The code was originally copy-pasted from regular lookup and despite
4282 * cleanups leaves performance on the table. Any modifications here
4283 * must take into account that in case of fallback the resulting
4284 * nameidata state has to be compatible with the original.
4287 cache_fplookup_parse(struct cache_fpl *fpl)
4289 struct nameidata *ndp;
4290 struct componentname *cnp;
4297 * Search a new directory.
4299 * The last component of the filename is left accessible via
4300 * cnp->cn_nameptr for callers that need the name. Callers needing
4301 * the name set the SAVENAME flag. When done, they assume
4302 * responsibility for freeing the pathname buffer.
4304 for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4306 cnp->cn_namelen = cp - cnp->cn_nameptr;
4307 if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4308 cache_fpl_smr_exit(fpl);
4309 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4311 ndp->ni_pathlen -= cnp->cn_namelen;
4312 KASSERT(ndp->ni_pathlen <= PATH_MAX,
4313 ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4317 * Replace multiple slashes by a single slash and trailing slashes
4318 * by a null. This must be done before VOP_LOOKUP() because some
4319 * fs's don't know about trailing slashes. Remember if there were
4320 * trailing slashes to handle symlinks, existing non-directories
4321 * and non-existing files that won't be directories specially later.
4323 while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4329 * Regular lookup performs the following:
4330 * *ndp->ni_next = '\0';
4331 * cnp->cn_flags |= TRAILINGSLASH;
4333 * Which is problematic since it modifies data read
4334 * from userspace. Then if fast path lookup was to
4335 * abort we would have to either restore it or convey
4336 * the flag. Since this is a corner case just ignore
4337 * it for simplicity.
4339 return (cache_fpl_partial(fpl));
4345 * Check for degenerate name (e.g. / or "")
4346 * which is a way of talking about a directory,
4347 * e.g. like "/." or ".".
4350 * Another corner case handled by the regular lookup
4352 if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4353 return (cache_fpl_partial(fpl));
4359 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4361 struct nameidata *ndp;
4362 struct componentname *cnp;
4367 cnp->cn_nameptr = ndp->ni_next;
4368 while (*cnp->cn_nameptr == '/') {
4375 * See the API contract for VOP_FPLOOKUP_VEXEC.
4377 static int __noinline
4378 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4384 dvp_seqc = fpl->dvp_seqc;
4387 * Hack: they may be looking up foo/bar, where foo is a
4388 * regular file. In such a case we need to return ENOTDIR,
4389 * but we may happen to get here with a different error.
4391 if (dvp->v_type != VDIR) {
4393 * The check here is predominantly to catch
4394 * EOPNOTSUPP from dead_vnodeops. If the vnode
4395 * gets doomed past this point it is going to
4396 * fail seqc verification.
4398 if (VN_IS_DOOMED(dvp)) {
4399 return (cache_fpl_aborted(fpl));
4405 * Hack: handle O_SEARCH.
4407 * Open Group Base Specifications Issue 7, 2018 edition states:
4408 * If the access mode of the open file description associated with the
4409 * file descriptor is not O_SEARCH, the function shall check whether
4410 * directory searches are permitted using the current permissions of
4411 * the directory underlying the file descriptor. If the access mode is
4412 * O_SEARCH, the function shall not perform the check.
4414 * Regular lookup tests for the NOEXECCHECK flag for every path
4415 * component to decide whether to do the permission check. However,
4416 * since most lookups never have the flag (and when they do it is only
4417 * present for the first path component), lockless lookup only acts on
4418 * it if there is a permission problem. Here the flag is represented
4419 * with a boolean so that we don't have to clear it on the way out.
4421 * For simplicity this always aborts.
4422 * TODO: check if this is the first lookup and ignore the permission
4423 * problem. Note the flag has to survive fallback (if it happens to be
4427 return (cache_fpl_aborted(fpl));
4432 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4433 error = cache_fpl_aborted(fpl);
4435 cache_fpl_partial(fpl);
4439 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4440 error = cache_fpl_aborted(fpl);
4442 cache_fpl_smr_exit(fpl);
4443 cache_fpl_handled(fpl, error);
4451 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4453 struct nameidata *ndp;
4454 struct componentname *cnp;
4458 error = CACHE_FPL_FAILED;
4462 cache_fpl_checkpoint(fpl, &fpl->snd);
4465 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4466 if (seqc_in_modify(fpl->dvp_seqc)) {
4467 cache_fpl_aborted(fpl);
4470 mp = atomic_load_ptr(&fpl->dvp->v_mount);
4471 if (!cache_fplookup_mp_supported(mp)) {
4472 cache_fpl_aborted(fpl);
4476 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4479 error = cache_fplookup_parse(fpl);
4480 if (__predict_false(error != 0)) {
4484 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4486 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4487 if (__predict_false(error != 0)) {
4488 error = cache_fplookup_failed_vexec(fpl, error);
4492 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4493 error = cache_fplookup_dotdot(fpl);
4494 if (__predict_false(error != 0)) {
4498 error = cache_fplookup_next(fpl);
4499 if (__predict_false(error != 0)) {
4503 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4505 if (cache_fplookup_need_climb_mount(fpl)) {
4506 error = cache_fplookup_climb_mount(fpl);
4507 if (__predict_false(error != 0)) {
4513 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4515 if (cache_fpl_islastcn(ndp)) {
4516 error = cache_fplookup_final(fpl);
4520 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4521 error = cache_fpl_aborted(fpl);
4525 fpl->dvp = fpl->tvp;
4526 fpl->dvp_seqc = fpl->tvp_seqc;
4528 cache_fplookup_parse_advance(fpl);
4529 cache_fpl_checkpoint(fpl, &fpl->snd);
4532 switch (fpl->status) {
4533 case CACHE_FPL_STATUS_UNSET:
4534 __assert_unreachable();
4536 case CACHE_FPL_STATUS_PARTIAL:
4537 cache_fpl_smr_assert_entered(fpl);
4538 return (cache_fplookup_partial_setup(fpl));
4539 case CACHE_FPL_STATUS_ABORTED:
4541 cache_fpl_smr_exit(fpl);
4542 return (CACHE_FPL_FAILED);
4543 case CACHE_FPL_STATUS_HANDLED:
4544 MPASS(error != CACHE_FPL_FAILED);
4545 cache_fpl_smr_assert_not_entered(fpl);
4546 if (__predict_false(error != 0)) {
4549 cache_fpl_cleanup_cnp(cnp);
4552 ndp->ni_dvp = fpl->dvp;
4553 ndp->ni_vp = fpl->tvp;
4554 if (cnp->cn_flags & SAVENAME)
4555 cnp->cn_flags |= HASBUF;
4557 cache_fpl_cleanup_cnp(cnp);
4563 * Fast path lookup protected with SMR and sequence counters.
4565 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4567 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4570 * Traditional vnode lookup conceptually looks like this:
4576 * vn_unlock(current);
4583 * Each jump to the next vnode is safe memory-wise and atomic with respect to
4584 * any modifications thanks to holding respective locks.
4586 * The same guarantee can be provided with a combination of safe memory
4587 * reclamation and sequence counters instead. If all operations which affect
4588 * the relationship between the current vnode and the one we are looking for
4589 * also modify the counter, we can verify whether all the conditions held as
4590 * we made the jump. This includes things like permissions, mount points etc.
4591 * Counter modification is provided by enclosing relevant places in
4592 * vn_seqc_write_begin()/end() calls.
4594 * Thus this translates to:
4597 * dvp_seqc = seqc_read_any(dvp);
4598 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4602 * tvp_seqc = seqc_read_any(tvp);
4603 * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4605 * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4607 * dvp = tvp; // we know nothing of importance has changed
4608 * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4612 * vget(); // secure the vnode
4613 * if (!seqc_consistent(tvp, tvp_seqc) // final check
4615 * // at this point we know nothing has changed for any parent<->child pair
4616 * // as they were crossed during the lookup, meaning we matched the guarantee
4617 * // of the locked variant
4620 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4621 * - they are called while within vfs_smr protection which they must never exit
4622 * - EAGAIN can be returned to denote checking could not be performed; it is
4623 * always valid to return it
4624 * - if the sequence counter has not changed the result must be valid
4625 * - if the sequence counter has changed both false positives and false negatives
4626 * are permitted (since the result will be rejected later)
4627 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4629 * Caveats to watch out for:
4630 * - vnodes are passed unlocked and unreferenced with nothing stopping
4631 * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4632 * to use atomic_load_ptr to fetch it.
4633 * - the aforementioned object can also get freed, meaning absent other means it
4634 * should be protected with vfs_smr
4635 * - either safely checking permissions as they are modified or guaranteeing
4636 * their stability is left to the routine
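/*
 * A minimal sketch of a filesystem-side VOP_FPLOOKUP_VEXEC implementation
 * satisfying the contract above. The "foo" filesystem and its node layout are
 * hypothetical; vaccess_vexec_smr() is the helper referred to in the list
 * above:
 *
 *	static int
 *	foo_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 *	{
 *		struct vnode *vp = v->a_vp;
 *		struct foo_node *node;
 *
 *		node = atomic_load_ptr(&vp->v_data);
 *		if (__predict_false(node == NULL))
 *			return (EAGAIN);
 *		return (vaccess_vexec_smr(node->foo_mode, node->foo_uid,
 *		    node->foo_gid, v->a_cred));
 *	}
 *
 * The NULL check covers a concurrent VOP_RECLAIM clearing v_data; returning
 * EAGAIN merely punts to the locked lookup, which the contract always allows.
 */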
4639 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4642 struct cache_fpl fpl;
4645 struct componentname *cnp;
4646 struct nameidata_saved orig;
4649 MPASS(ndp->ni_lcf == 0);
4651 fpl.status = CACHE_FPL_STATUS_UNSET;
4653 fpl.cnp = &ndp->ni_cnd;
4654 MPASS(curthread == fpl.cnp->cn_thread);
4656 if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4657 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4659 if (!cache_can_fplookup(&fpl)) {
4660 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4661 *status = fpl.status;
4662 return (EOPNOTSUPP);
4665 cache_fpl_checkpoint(&fpl, &orig);
4667 cache_fpl_smr_enter_initial(&fpl);
4668 fpl.fsearch = false;
4669 pwd = pwd_get_smr();
4671 ndp->ni_rootdir = pwd->pwd_rdir;
4672 ndp->ni_topdir = pwd->pwd_jdir;
4675 cnp->cn_nameptr = cnp->cn_pnbuf;
4676 if (cnp->cn_pnbuf[0] == '/') {
4677 cache_fpl_handle_root(ndp, &dvp);
4679 if (ndp->ni_dirfd == AT_FDCWD) {
4680 dvp = pwd->pwd_cdir;
4682 error = cache_fplookup_dirfd(&fpl, &dvp);
4683 if (__predict_false(error != 0)) {
4689 SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4691 error = cache_fplookup_impl(dvp, &fpl);
4693 cache_fpl_smr_assert_not_entered(&fpl);
4694 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4696 *status = fpl.status;
4697 switch (fpl.status) {
4698 case CACHE_FPL_STATUS_UNSET:
4699 __assert_unreachable();
4701 case CACHE_FPL_STATUS_HANDLED:
4702 SDT_PROBE3(vfs, namei, lookup, return, error,
4703 (error == 0 ? ndp->ni_vp : NULL), true);
4705 case CACHE_FPL_STATUS_PARTIAL:
4708 * Status restored by cache_fplookup_partial_setup.
4711 case CACHE_FPL_STATUS_ABORTED:
4712 cache_fpl_restore(&fpl, &orig);