sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70 #ifdef INVARIANTS
  71 #include <machine/_inttypes.h>
  72 #endif
  73
  74 #include <sys/capsicum.h>
  75
  76 #include <security/audit/audit.h>
  77 #include <security/mac/mac_framework.h>
  78
  79 #ifdef DDB
  80 #include <ddb/ddb.h>
  81 #endif
  82
  83 #include <vm/uma.h>
  84
  85 /*
  86  * High level overview of name caching in the VFS layer.
  87  *
  88  * Originally caching was implemented as part of UFS, later extracted to allow
  89  * use by other filesystems. A decision was made to make it optional and
  90  * completely detached from the rest of the kernel, which comes with limitations
  91  * outlined near the end of this comment block.
  92  *
  93  * This fundamental choice needs to be revisited. In the meantime, the current
  94  * state is described below. Significance of all notable routines is explained
  95  * in comments placed above their implementation. Scattered thoroughout the
  96  * file are TODO comments indicating shortcomings which can be fixed without
  97  * reworking everything (most of the fixes will likely be reusable). Various
  98  * details are omitted from this explanation to not clutter the overview, they
  99  * have to be checked by reading the code and associated commentary.
 100  *
 101  * Keep in mind that it's individual path components which are cached, not full
 102  * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
 103  * one for each name.
 104  *
 105  * I. Data organization
 106  *
 107  * Entries are described by "struct namecache" objects and stored in a hash
 108  * table. See cache_get_hash for more information.
 109  *
 110  * "struct vnode" contains pointers to source entries (names which can be found
 111  * when traversing through said vnode), destination entries (names of that
 112  * vnode (see "Limitations" for a breakdown on the subject) and a pointer to
 113  * the parent vnode.
 114  *
 115  * The (directory vnode; name) tuple reliably determines the target entry if
 116  * it exists.
 117  *
 118  * Since there are no small locks at this time (all are 32 bytes in size on
 119  * LP64), the code works around the problem by introducing lock arrays to
 120  * protect hash buckets and vnode lists.
 121  *
 122  * II. Filesystem integration
 123  *
 124  * Filesystems participating in name caching do the following:
 125  * - set vop_lookup routine to vfs_cache_lookup
 126  * - set vop_cachedlookup to whatever can perform the lookup if the above fails
 127  * - if they support lockless lookup (see below), vop_fplookup_vexec and
 128  *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
 129  *   mount point
 130  * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 131  *   applicable
 132  * - call cache_enter to add entries depending on the MAKEENTRY flag
 133  *
 134  * With the above in mind, there are 2 entry points when doing lookups:
 135  * - ... -> namei -> cache_fplookup -- this is the default
 136  * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
 137  *   should the above fail
 138  *
 139  * Example code flow how an entry is added:
 140  * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 141  * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
 142  *
 143  * III. Performance considerations
 144  *
 145  * For lockless case forward lookup avoids any writes to shared areas apart
 146  * from the terminal path component. In other words non-modifying lookups of
 147  * different files don't suffer any scalability problems in the namecache.
 148  * Looking up the same file is limited by VFS and goes beyond the scope of this
 149  * file.
 150  *
 151  * At least on amd64 the single-threaded bottleneck for long paths is hashing
 152  * (see cache_get_hash). There are cases where the code issues acquire fence
 153  * multiple times, they can be combined on architectures which suffer from it.
 154  *
 155  * For locked case each encountered vnode has to be referenced and locked in
 156  * order to be handed out to the caller (normally that's namei). This
 157  * introduces significant hit single-threaded and serialization multi-threaded.
 158  *
 159  * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 160  * avoids any writes to shared areas to any components.
 161  *
 162  * Unrelated insertions are partially serialized on updating the global entry
 163  * counter and possibly serialized on colliding bucket or vnode locks.
 164  *
 165  * IV. Observability
 166  *
 167  * Note not everything has an explicit dtrace probe nor it should have, thus
 168  * some of the one-liners below depend on implementation details.
 169  *
 170  * Examples:
 171  *
 172  * # Check what lookups failed to be handled in a lockless manner. Column 1 is
 173  * # line number, column 2 is status code (see cache_fpl_status)
 174  * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 175  *
 176  * # Lengths of names added by binary name
 177  * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 178  *
 179  * # Same as above but only those which exceed 64 characters
 180  * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
 181  *
 182  * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
 183  * # path is it
 184  * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
 185  *
 186  * V. Limitations and implementation defects
 187  *
 188  * - since it is possible there is no entry for an open file, tools like
 189  *   "procstat" may fail to resolve fd -> vnode -> path to anything
 190  * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
 191  *   shortage) in which case the above problem applies
 192  * - hardlinks are not tracked, thus if a vnode is reachable in more than one
 193  *   way, resolving a name may return a different path than the one used to
 194  *   open it (even if said path is still valid)
 195  * - by default entries are not added for newly created files
 196  * - adding an entry may need to evict negative entry first, which happens in 2
 197  *   distinct places (evicting on lookup, adding in a later VOP) making it
 198  *   impossible to simply reuse it
 199  * - there is a simple scheme to evict negative entries as the cache is approaching
 200  *   its capacity, but it is very unclear if doing so is a good idea to begin with
 201  * - vnodes are subject to being recycled even if target inode is left in memory,
 202  *   which loses the name cache entries when it perhaps should not. in case of tmpfs
 203  *   names get duplicated -- kept by filesystem itself and namecache separately
 204  * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
 205  *   now hard to replace with malloc due to dependence on SMR.
 206  * - lack of better integration with the kernel also turns nullfs into a layered
 207  *   filesystem instead of something which can take advantage of caching
 208  */
 209
 210 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 211     "Name cache");
 212
 213 SDT_PROVIDER_DECLARE(vfs);
 214 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
 215     "struct vnode *");
 216 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
 217     "struct vnode *");
 218 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
 219     "char *");
 220 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
 221     "const char *");
 222 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
 223     "struct namecache *", "int", "int");
 224 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 225 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
 226     "char *", "struct vnode *");
 227 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 228 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 229     "struct vnode *", "char *");
 230 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 231     "struct vnode *");
 232 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 233     "struct vnode *", "char *");
 234 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 235     "char *");
 236 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 237     "struct componentname *");
 238 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 239     "struct componentname *");
 240 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
 241 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
 242 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 243 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 244 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 245     "struct vnode *");
 246 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 247     "char *");
 248 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 249     "char *");
 250 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
 251
 252 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 253 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 254 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 255
 256 static char __read_frequently cache_fast_lookup_enabled = true;
 257
 258 /*
 259  * This structure describes the elements in the cache of recent
 260  * names looked up by namei.
 261  */
 262 struct negstate {
 263         u_char neg_flag;
 264         u_char neg_hit;
 265 };
 266 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 267     "the state must fit in a union with a pointer without growing it");
 268
 269 struct  namecache {
 270         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 271         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 272         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 273         struct  vnode *nc_dvp;          /* vnode of parent of name */
 274         union {
 275                 struct  vnode *nu_vp;   /* vnode the name refers to */
 276                 struct  negstate nu_neg;/* negative entry state */
 277         } n_un;
 278         u_char  nc_flag;                /* flag bits */
 279         u_char  nc_nlen;                /* length of name */
 280         char    nc_name[0];             /* segment name + nul */
 281 };
 282
 283 /*
 284  * struct namecache_ts repeats struct namecache layout up to the
 285  * nc_nlen member.
 286  * struct namecache_ts is used in place of struct namecache when time(s) need
 287  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 288  * both a non-dotdot directory name plus dotdot for the directory's
 289  * parent.
 290  *
 291  * See below for alignment requirement.
 292  */
 293 struct  namecache_ts {
 294         struct  timespec nc_time;       /* timespec provided by fs */
 295         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 296         int     nc_ticks;               /* ticks value when entry was added */
 297         int     nc_pad;
 298         struct namecache nc_nc;
 299 };
 300
 301 TAILQ_HEAD(cache_freebatch, namecache);
 302
 303 /*
 304  * At least mips n32 performs 64-bit accesses to timespec as found
 305  * in namecache_ts and requires them to be aligned. Since others
 306  * may be in the same spot suffer a little bit and enforce the
 307  * alignment for everyone. Note this is a nop for 64-bit platforms.
 308  */
 309 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 310
 311 /*
 312  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 313  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 314  * smaller and the value was bumped to retain the total size, but it
 315  * was never re-evaluated for suitability. A simple test counting
 316  * lengths during package building shows that the value of 45 covers
 317  * about 86% of all added entries, reaching 99% at 65.
 318  *
 319  * Regardless of the above, use of dedicated zones instead of malloc may be
 320  * inducing additional waste. This may be hard to address as said zones are
 321  * tied to VFS SMR. Even if retaining them, the current split should be
 322  * re-evaluated.
 323  */
 324 #ifdef __LP64__
 325 #define CACHE_PATH_CUTOFF       45
 326 #define CACHE_LARGE_PAD         6
 327 #else
 328 #define CACHE_PATH_CUTOFF       41
 329 #define CACHE_LARGE_PAD         2
 330 #endif
 331
 332 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 333 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 334 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 335 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 336
 337 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 338 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 339 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 340 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 341
 342 #define nc_vp           n_un.nu_vp
 343 #define nc_neg          n_un.nu_neg
 344
 345 /*
 346  * Flags in namecache.nc_flag
 347  */
 348 #define NCF_WHITE       0x01
 349 #define NCF_ISDOTDOT    0x02
 350 #define NCF_TS          0x04
 351 #define NCF_DTS         0x08
 352 #define NCF_DVDROP      0x10
 353 #define NCF_NEGATIVE    0x20
 354 #define NCF_INVALID     0x40
 355 #define NCF_WIP         0x80
 356
 357 /*
 358  * Flags in negstate.neg_flag
 359  */
 360 #define NEG_HOT         0x01
 361
 362 static bool     cache_neg_evict_cond(u_long lnumcache);
 363
 364 /*
 365  * Mark an entry as invalid.
 366  *
 367  * This is called before it starts getting deconstructed.
 368  */
 369 static void
 370 cache_ncp_invalidate(struct namecache *ncp)
 371 {
 372
 373         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 374             ("%s: entry %p already invalid", __func__, ncp));
 375         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 376         atomic_thread_fence_rel();
 377 }
 378
 379 /*
 380  * Check whether the entry can be safely used.
 381  *
 382  * All places which elide locks are supposed to call this after they are
 383  * done with reading from an entry.
 384  */
 385 #define cache_ncp_canuse(ncp)   ({                                      \
 386         struct namecache *_ncp = (ncp);                                 \
 387         u_char _nc_flag;                                                \
 388                                                                         \
 389         atomic_thread_fence_acq();                                      \
 390         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 391         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
 392 })
 393
 394 /*
 395  * Like the above but also checks NCF_WHITE.
 396  */
 397 #define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
 398         struct namecache *_ncp = (ncp);                                 \
 399         u_char _nc_flag;                                                \
 400                                                                         \
 401         atomic_thread_fence_acq();                                      \
 402         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 403         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
 404 })
 405
 406 VFS_SMR_DECLARE;
 407
 408 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 409     "Name cache parameters");
 410
 411 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 412 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 413     "Total namecache capacity");
 414
 415 u_int ncsizefactor = 2;
 416 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 417     "Size factor for namecache");
 418
 419 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 420 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 421     "Ratio of negative namecache entries");
 422
 423 /*
 424  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 425  *
 426  * Check cache_neg_evict_cond for details.
 427  */
 428 static u_int ncnegminpct = 3;
 429
 430 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 431 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 432     "Negative entry count above which automatic eviction is allowed");
 433
 434 /*
 435  * Structures associated with name caching.
 436  */
 437 #define NCHHASH(hash) \
 438         (&nchashtbl[(hash) & nchash])
 439 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 440 static u_long __read_mostly     nchash;                 /* size of hash table */
 441 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 442     "Size of namecache hash table");
 443 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 444 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 445
 446 struct nchstats nchstats;               /* cache effectiveness statistics */
 447
 448 static bool __read_mostly cache_rename_add = true;
 449 SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
 450     &cache_rename_add, 0, "");
 451
 452 static u_int __exclusive_cache_line neg_cycle;
 453
 454 #define ncneghash       3
 455 #define numneglists     (ncneghash + 1)
 456
 457 struct neglist {
 458         struct mtx              nl_evict_lock;
 459         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 460         TAILQ_HEAD(, namecache) nl_list;
 461         TAILQ_HEAD(, namecache) nl_hotlist;
 462         u_long                  nl_hotnum;
 463 } __aligned(CACHE_LINE_SIZE);
 464
 465 static struct neglist neglists[numneglists];
 466
 467 static inline struct neglist *
 468 NCP2NEGLIST(struct namecache *ncp)
 469 {
 470
 471         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 472 }
 473
 474 static inline struct negstate *
 475 NCP2NEGSTATE(struct namecache *ncp)
 476 {
 477
 478         MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
 479         return (&ncp->nc_neg);
 480 }
 481
 482 #define numbucketlocks (ncbuckethash + 1)
 483 static u_int __read_mostly  ncbuckethash;
 484 static struct mtx_padalign __read_mostly  *bucketlocks;
 485 #define HASH2BUCKETLOCK(hash) \
 486         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 487
 488 #define numvnodelocks (ncvnodehash + 1)
 489 static u_int __read_mostly  ncvnodehash;
 490 static struct mtx __read_mostly *vnodelocks;
 491 static inline struct mtx *
 492 VP2VNODELOCK(struct vnode *vp)
 493 {
 494
 495         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 496 }
 497
 498 static void
 499 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 500 {
 501         struct namecache_ts *ncp_ts;
 502
 503         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 504             (tsp == NULL && ticksp == NULL),
 505             ("No NCF_TS"));
 506
 507         if (tsp == NULL)
 508                 return;
 509
 510         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 511         *tsp = ncp_ts->nc_time;
 512         *ticksp = ncp_ts->nc_ticks;
 513 }
 514
 515 #ifdef DEBUG_CACHE
 516 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 517 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 518     "VFS namecache enabled");
 519 #endif
 520
 521 /* Export size information to userland */
 522 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 523     sizeof(struct namecache), "sizeof(struct namecache)");
 524
 525 /*
 526  * The new name cache statistics
 527  */
 528 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 529     "Name cache statistics");
 530
 531 #define STATNODE_ULONG(name, varname, descr)                                    \
 532         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 533 #define STATNODE_COUNTER(name, varname, descr)                                  \
 534         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 535         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 536             descr);
 537 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 538 STATNODE_ULONG(count, numcache, "Number of cache entries");
 539 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 540 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 541 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
 542 STATNODE_COUNTER(dotdothits, dotdothits, "Number of '..' hits");
 543 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 544 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 545 STATNODE_COUNTER(poszaps, numposzaps,
 546     "Number of cache hits (positive) we do not want to cache");
 547 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 548 STATNODE_COUNTER(negzaps, numnegzaps,
 549     "Number of cache hits (negative) we do not want to cache");
 550 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 551 /* These count for vn_getcwd(), too. */
 552 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 553 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 554 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 555     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 556 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 557 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 558 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
 559
 560 /*
 561  * Debug or developer statistics.
 562  */
 563 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 564     "Name cache debugging");
 565 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 566         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 567 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 568         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 569         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 570             descr);
 571 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 572     "Number of successful removals after relocking");
 573 static long zap_bucket_fail;
 574 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 575 static long zap_bucket_fail2;
 576 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 577 static long cache_lock_vnodes_cel_3_failures;
 578 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 579     "Number of times 3-way vnode locking failed");
 580
 581 static void cache_zap_locked(struct namecache *ncp);
 582 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 583     char **retbuf, size_t *buflen, size_t addend);
 584 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 585     char **retbuf, size_t *buflen);
 586 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 587     char **retbuf, size_t *len, size_t addend);
 588
 589 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 590
 591 static inline void
 592 cache_assert_vlp_locked(struct mtx *vlp)
 593 {
 594
 595         if (vlp != NULL)
 596                 mtx_assert(vlp, MA_OWNED);
 597 }
 598
 599 static inline void
 600 cache_assert_vnode_locked(struct vnode *vp)
 601 {
 602         struct mtx *vlp;
 603
 604         vlp = VP2VNODELOCK(vp);
 605         cache_assert_vlp_locked(vlp);
 606 }
 607
 608 /*
 609  * Directory vnodes with entries are held for two reasons:
 610  * 1. make them less of a target for reclamation in vnlru
 611  * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided
 612  *
 613  * It will be feasible to stop doing it altogether if all filesystems start
 614  * supporting lockless lookup.
 615  */
 616 static void
 617 cache_hold_vnode(struct vnode *vp)
 618 {
 619
 620         cache_assert_vnode_locked(vp);
 621         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 622         vhold(vp);
 623         counter_u64_add(numcachehv, 1);
 624 }
 625
 626 static void
 627 cache_drop_vnode(struct vnode *vp)
 628 {
 629
 630         /*
 631          * Called after all locks are dropped, meaning we can't assert
 632          * on the state of v_cache_src.
 633          */
 634         vdrop(vp);
 635         counter_u64_add(numcachehv, -1);
 636 }
 637
 638 /*
 639  * UMA zones.
 640  */
 641 static uma_zone_t __read_mostly cache_zone_small;
 642 static uma_zone_t __read_mostly cache_zone_small_ts;
 643 static uma_zone_t __read_mostly cache_zone_large;
 644 static uma_zone_t __read_mostly cache_zone_large_ts;
 645
 646 char *
 647 cache_symlink_alloc(size_t size, int flags)
 648 {
 649
 650         if (size < CACHE_ZONE_SMALL_SIZE) {
 651                 return (uma_zalloc_smr(cache_zone_small, flags));
 652         }
 653         if (size < CACHE_ZONE_LARGE_SIZE) {
 654                 return (uma_zalloc_smr(cache_zone_large, flags));
 655         }
 656         counter_u64_add(symlinktoobig, 1);
 657         SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
 658         return (NULL);
 659 }
 660
 661 void
 662 cache_symlink_free(char *string, size_t size)
 663 {
 664
 665         MPASS(string != NULL);
 666         KASSERT(size < CACHE_ZONE_LARGE_SIZE,
 667             ("%s: size %zu too big", __func__, size));
 668
 669         if (size < CACHE_ZONE_SMALL_SIZE) {
 670                 uma_zfree_smr(cache_zone_small, string);
 671                 return;
 672         }
 673         if (size < CACHE_ZONE_LARGE_SIZE) {
 674                 uma_zfree_smr(cache_zone_large, string);
 675                 return;
 676         }
 677         __assert_unreachable();
 678 }
 679
 680 static struct namecache *
 681 cache_alloc_uma(int len, bool ts)
 682 {
 683         struct namecache_ts *ncp_ts;
 684         struct namecache *ncp;
 685
 686         if (__predict_false(ts)) {
 687                 if (len <= CACHE_PATH_CUTOFF)
 688                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 689                 else
 690                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 691                 ncp = &ncp_ts->nc_nc;
 692         } else {
 693                 if (len <= CACHE_PATH_CUTOFF)
 694                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 695                 else
 696                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 697         }
 698         return (ncp);
 699 }
 700
 701 static void
 702 cache_free_uma(struct namecache *ncp)
 703 {
 704         struct namecache_ts *ncp_ts;
 705
 706         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 707                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 708                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 709                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 710                 else
 711                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 712         } else {
 713                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 714                         uma_zfree_smr(cache_zone_small, ncp);
 715                 else
 716                         uma_zfree_smr(cache_zone_large, ncp);
 717         }
 718 }
 719
 720 static struct namecache *
 721 cache_alloc(int len, bool ts)
 722 {
 723         u_long lnumcache;
 724
 725         /*
 726          * Avoid blowout in namecache entries.
 727          *
 728          * Bugs:
 729          * 1. filesystems may end up trying to add an already existing entry
 730          * (for example this can happen after a cache miss during concurrent
 731          * lookup), in which case we will call cache_neg_evict despite not
 732          * adding anything.
 733          * 2. the routine may fail to free anything and no provisions are made
 734          * to make it try harder (see the inside for failure modes)
 735          * 3. it only ever looks at negative entries.
 736          */
 737         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 738         if (cache_neg_evict_cond(lnumcache)) {
 739                 lnumcache = atomic_load_long(&numcache);
 740         }
 741         if (__predict_false(lnumcache >= ncsize)) {
 742                 atomic_subtract_long(&numcache, 1);
 743                 counter_u64_add(numdrops, 1);
 744                 return (NULL);
 745         }
 746         return (cache_alloc_uma(len, ts));
 747 }
 748
 749 static void
 750 cache_free(struct namecache *ncp)
 751 {
 752
 753         MPASS(ncp != NULL);
 754         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 755                 cache_drop_vnode(ncp->nc_dvp);
 756         }
 757         cache_free_uma(ncp);
 758         atomic_subtract_long(&numcache, 1);
 759 }
 760
 761 static void
 762 cache_free_batch(struct cache_freebatch *batch)
 763 {
 764         struct namecache *ncp, *nnp;
 765         int i;
 766
 767         i = 0;
 768         if (TAILQ_EMPTY(batch))
 769                 goto out;
 770         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 771                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 772                         cache_drop_vnode(ncp->nc_dvp);
 773                 }
 774                 cache_free_uma(ncp);
 775                 i++;
 776         }
 777         atomic_subtract_long(&numcache, i);
 778 out:
 779         SDT_PROBE1(vfs, namecache, purge, batch, i);
 780 }
 781
 782 /*
 783  * Hashing.
 784  *
 785  * The code was made to use FNV in 2001 and this choice needs to be revisited.
 786  *
 787  * Short summary of the difficulty:
 788  * The longest name which can be inserted is NAME_MAX characters in length (or
 789  * 255 at the time of writing this comment), while majority of names used in
 790  * practice are significantly shorter (mostly below 10). More importantly
 791  * majority of lookups performed find names are even shorter than that.
 792  *
 793  * This poses a problem where hashes which do better than FNV past word size
 794  * (or so) tend to come with additional overhead when finalizing the result,
 795  * making them noticeably slower for the most commonly used range.
 796  *
 797  * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
 798  *
 799  * When looking it up the most time consuming part by a large margin (at least
 800  * on amd64) is hashing.  Replacing FNV with something which pessimizes short
 801  * input would make the slowest part stand out even more.
 802  */
 803
 804 /*
 805  * TODO: With the value stored we can do better than computing the hash based
 806  * on the address.
 807  */
 808 static void
 809 cache_prehash(struct vnode *vp)
 810 {
 811
 812         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 813 }
 814
 815 static uint32_t
 816 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 817 {
 818
 819         return (fnv_32_buf(name, len, dvp->v_nchash));
 820 }
 821
 822 static uint32_t
 823 cache_get_hash_iter_start(struct vnode *dvp)
 824 {
 825
 826         return (dvp->v_nchash);
 827 }
 828
 829 static uint32_t
 830 cache_get_hash_iter(char c, uint32_t hash)
 831 {
 832
 833         return (fnv_32_buf(&c, 1, hash));
 834 }
 835
 836 static uint32_t
 837 cache_get_hash_iter_finish(uint32_t hash)
 838 {
 839
 840         return (hash);
 841 }
 842
 843 static inline struct nchashhead *
 844 NCP2BUCKET(struct namecache *ncp)
 845 {
 846         uint32_t hash;
 847
 848         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 849         return (NCHHASH(hash));
 850 }
 851
 852 static inline struct mtx *
 853 NCP2BUCKETLOCK(struct namecache *ncp)
 854 {
 855         uint32_t hash;
 856
 857         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 858         return (HASH2BUCKETLOCK(hash));
 859 }
 860
 861 #ifdef INVARIANTS
 862 static void
 863 cache_assert_bucket_locked(struct namecache *ncp)
 864 {
 865         struct mtx *blp;
 866
 867         blp = NCP2BUCKETLOCK(ncp);
 868         mtx_assert(blp, MA_OWNED);
 869 }
 870
 871 static void
 872 cache_assert_bucket_unlocked(struct namecache *ncp)
 873 {
 874         struct mtx *blp;
 875
 876         blp = NCP2BUCKETLOCK(ncp);
 877         mtx_assert(blp, MA_NOTOWNED);
 878 }
 879 #else
 880 #define cache_assert_bucket_locked(x) do { } while (0)
 881 #define cache_assert_bucket_unlocked(x) do { } while (0)
 882 #endif
 883
 884 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 885 static void
 886 _cache_sort_vnodes(void **p1, void **p2)
 887 {
 888         void *tmp;
 889
 890         MPASS(*p1 != NULL || *p2 != NULL);
 891
 892         if (*p1 > *p2) {
 893                 tmp = *p2;
 894                 *p2 = *p1;
 895                 *p1 = tmp;
 896         }
 897 }
 898
 899 static void
 900 cache_lock_all_buckets(void)
 901 {
 902         u_int i;
 903
 904         for (i = 0; i < numbucketlocks; i++)
 905                 mtx_lock(&bucketlocks[i]);
 906 }
 907
 908 static void
 909 cache_unlock_all_buckets(void)
 910 {
 911         u_int i;
 912
 913         for (i = 0; i < numbucketlocks; i++)
 914                 mtx_unlock(&bucketlocks[i]);
 915 }
 916
 917 static void
 918 cache_lock_all_vnodes(void)
 919 {
 920         u_int i;
 921
 922         for (i = 0; i < numvnodelocks; i++)
 923                 mtx_lock(&vnodelocks[i]);
 924 }
 925
 926 static void
 927 cache_unlock_all_vnodes(void)
 928 {
 929         u_int i;
 930
 931         for (i = 0; i < numvnodelocks; i++)
 932                 mtx_unlock(&vnodelocks[i]);
 933 }
 934
 935 static int
 936 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 937 {
 938
 939         cache_sort_vnodes(&vlp1, &vlp2);
 940
 941         if (vlp1 != NULL) {
 942                 if (!mtx_trylock(vlp1))
 943                         return (EAGAIN);
 944         }
 945         if (!mtx_trylock(vlp2)) {
 946                 if (vlp1 != NULL)
 947                         mtx_unlock(vlp1);
 948                 return (EAGAIN);
 949         }
 950
 951         return (0);
 952 }
 953
 954 static void
 955 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 956 {
 957
 958         MPASS(vlp1 != NULL || vlp2 != NULL);
 959         MPASS(vlp1 <= vlp2);
 960
 961         if (vlp1 != NULL)
 962                 mtx_lock(vlp1);
 963         if (vlp2 != NULL)
 964                 mtx_lock(vlp2);
 965 }
 966
 967 static void
 968 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 969 {
 970
 971         MPASS(vlp1 != NULL || vlp2 != NULL);
 972
 973         if (vlp1 != NULL)
 974                 mtx_unlock(vlp1);
 975         if (vlp2 != NULL)
 976                 mtx_unlock(vlp2);
 977 }
 978
 979 static int
 980 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 981 {
 982         struct nchstats snap;
 983
 984         if (req->oldptr == NULL)
 985                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 986
 987         snap = nchstats;
 988         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 989         snap.ncs_neghits = counter_u64_fetch(numneghits);
 990         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 991             counter_u64_fetch(numnegzaps);
 992         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 993             counter_u64_fetch(nummiss);
 994
 995         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 996 }
 997 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 998     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 999     "VFS cache effectiveness statistics");
1000
1001 static void
1002 cache_recalc_neg_min(u_int val)
1003 {
1004
1005         neg_min = (ncsize * val) / 100;
1006 }
1007
1008 static int
1009 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1010 {
1011         u_int val;
1012         int error;
1013
1014         val = ncnegminpct;
1015         error = sysctl_handle_int(oidp, &val, 0, req);
1016         if (error != 0 || req->newptr == NULL)
1017                 return (error);
1018
1019         if (val == ncnegminpct)
1020                 return (0);
1021         if (val < 0 || val > 99)
1022                 return (EINVAL);
1023         ncnegminpct = val;
1024         cache_recalc_neg_min(val);
1025         return (0);
1026 }
1027
1028 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1029     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1030     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1031
1032 #ifdef DEBUG_CACHE
1033 /*
1034  * Grab an atomic snapshot of the name cache hash chain lengths
1035  */
1036 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
1037     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
1038     "hash table stats");
1039
1040 static int
1041 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
1042 {
1043         struct nchashhead *ncpp;
1044         struct namecache *ncp;
1045         int i, error, n_nchash, *cntbuf;
1046
1047 retry:
1048         n_nchash = nchash + 1;  /* nchash is max index, not count */
1049         if (req->oldptr == NULL)
1050                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
1051         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
1052         cache_lock_all_buckets();
1053         if (n_nchash != nchash + 1) {
1054                 cache_unlock_all_buckets();
1055                 free(cntbuf, M_TEMP);
1056                 goto retry;
1057         }
1058         /* Scan hash tables counting entries */
1059         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
1060                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
1061                         cntbuf[i]++;
1062         cache_unlock_all_buckets();
1063         for (error = 0, i = 0; i < n_nchash; i++)
1064                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
1065                         break;
1066         free(cntbuf, M_TEMP);
1067         return (error);
1068 }
1069 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
1070     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
1071     "nchash chain lengths");
1072
1073 static int
1074 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1075 {
1076         int error;
1077         struct nchashhead *ncpp;
1078         struct namecache *ncp;
1079         int n_nchash;
1080         int count, maxlength, used, pct;
1081
1082         if (!req->oldptr)
1083                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1084
1085         cache_lock_all_buckets();
1086         n_nchash = nchash + 1;  /* nchash is max index, not count */
1087         used = 0;
1088         maxlength = 0;
1089
1090         /* Scan hash tables for applicable entries */
1091         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1092                 count = 0;
1093                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1094                         count++;
1095                 }
1096                 if (count)
1097                         used++;
1098                 if (maxlength < count)
1099                         maxlength = count;
1100         }
1101         n_nchash = nchash + 1;
1102         cache_unlock_all_buckets();
1103         pct = (used * 100) / (n_nchash / 100);
1104         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1105         if (error)
1106                 return (error);
1107         error = SYSCTL_OUT(req, &used, sizeof(used));
1108         if (error)
1109                 return (error);
1110         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1111         if (error)
1112                 return (error);
1113         error = SYSCTL_OUT(req, &pct, sizeof(pct));
1114         if (error)
1115                 return (error);
1116         return (0);
1117 }
1118 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1119     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1120     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1121 #endif
1122
1123 /*
1124  * Negative entries management
1125  *
1126  * Various workloads create plenty of negative entries and barely use them
1127  * afterwards. Moreover malicious users can keep performing bogus lookups
1128  * adding even more entries. For example "make tinderbox" as of writing this
1129  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1130  * negative.
1131  *
1132  * As such, a rather aggressive eviction method is needed. The currently
1133  * employed method is a placeholder.
1134  *
1135  * Entries are split over numneglists separate lists, each of which is further
1136  * split into hot and cold entries. Entries get promoted after getting a hit.
1137  * Eviction happens on addition of new entry.
1138  */
1139 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1140     "Name cache negative entry statistics");
1141
1142 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
1143     "Number of negative cache entries");
1144
1145 static COUNTER_U64_DEFINE_EARLY(neg_created);
1146 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
1147     "Number of created negative entries");
1148
1149 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
1150 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
1151     "Number of evicted negative entries");
1152
1153 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
1154 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
1155     &neg_evict_skipped_empty,
1156     "Number of times evicting failed due to lack of entries");
1157
1158 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
1159 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
1160     &neg_evict_skipped_missed,
1161     "Number of times evicting failed due to target entry disappearing");
1162
1163 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
1164 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
1165     &neg_evict_skipped_contended,
1166     "Number of times evicting failed due to contention");
1167
1168 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
1169     "Number of cache hits (negative)");
1170
1171 static int
1172 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1173 {
1174         int i, out;
1175
1176         out = 0;
1177         for (i = 0; i < numneglists; i++)
1178                 out += neglists[i].nl_hotnum;
1179
1180         return (SYSCTL_OUT(req, &out, sizeof(out)));
1181 }
1182 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1183     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1184     "Number of hot negative entries");
1185
1186 static void
1187 cache_neg_init(struct namecache *ncp)
1188 {
1189         struct negstate *ns;
1190
1191         ncp->nc_flag |= NCF_NEGATIVE;
1192         ns = NCP2NEGSTATE(ncp);
1193         ns->neg_flag = 0;
1194         ns->neg_hit = 0;
1195         counter_u64_add(neg_created, 1);
1196 }
1197
1198 #define CACHE_NEG_PROMOTION_THRESH 2
1199
1200 static bool
1201 cache_neg_hit_prep(struct namecache *ncp)
1202 {
1203         struct negstate *ns;
1204         u_char n;
1205
1206         ns = NCP2NEGSTATE(ncp);
1207         n = atomic_load_char(&ns->neg_hit);
1208         for (;;) {
1209                 if (n >= CACHE_NEG_PROMOTION_THRESH)
1210                         return (false);
1211                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1212                         break;
1213         }
1214         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1215 }
1216
1217 /*
1218  * Nothing to do here but it is provided for completeness as some
1219  * cache_neg_hit_prep callers may end up returning without even
1220  * trying to promote.
1221  */
1222 #define cache_neg_hit_abort(ncp)        do { } while (0)
1223
1224 static void
1225 cache_neg_hit_finish(struct namecache *ncp)
1226 {
1227
1228         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1229         counter_u64_add(numneghits, 1);
1230 }
1231
1232 /*
1233  * Move a negative entry to the hot list.
1234  */
1235 static void
1236 cache_neg_promote_locked(struct namecache *ncp)
1237 {
1238         struct neglist *nl;
1239         struct negstate *ns;
1240
1241         ns = NCP2NEGSTATE(ncp);
1242         nl = NCP2NEGLIST(ncp);
1243         mtx_assert(&nl->nl_lock, MA_OWNED);
1244         if ((ns->neg_flag & NEG_HOT) == 0) {
1245                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1246                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1247                 nl->nl_hotnum++;
1248                 ns->neg_flag |= NEG_HOT;
1249         }
1250 }
1251
1252 /*
1253  * Move a hot negative entry to the cold list.
1254  */
1255 static void
1256 cache_neg_demote_locked(struct namecache *ncp)
1257 {
1258         struct neglist *nl;
1259         struct negstate *ns;
1260
1261         ns = NCP2NEGSTATE(ncp);
1262         nl = NCP2NEGLIST(ncp);
1263         mtx_assert(&nl->nl_lock, MA_OWNED);
1264         MPASS(ns->neg_flag & NEG_HOT);
1265         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1266         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1267         nl->nl_hotnum--;
1268         ns->neg_flag &= ~NEG_HOT;
1269         atomic_store_char(&ns->neg_hit, 0);
1270 }
1271
1272 /*
1273  * Move a negative entry to the hot list if it matches the lookup.
1274  *
1275  * We have to take locks, but they may be contended and in the worst
1276  * case we may need to go off CPU. We don't want to spin within the
1277  * smr section and we can't block with it. Exiting the section means
1278  * the found entry could have been evicted. We are going to look it
1279  * up again.
1280  */
1281 static bool
1282 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1283     struct namecache *oncp, uint32_t hash)
1284 {
1285         struct namecache *ncp;
1286         struct neglist *nl;
1287         u_char nc_flag;
1288
1289         nl = NCP2NEGLIST(oncp);
1290
1291         mtx_lock(&nl->nl_lock);
1292         /*
1293          * For hash iteration.
1294          */
1295         vfs_smr_enter();
1296
1297         /*
1298          * Avoid all surprises by only succeeding if we got the same entry and
1299          * bailing completely otherwise.
1300          * XXX There are no provisions to keep the vnode around, meaning we may
1301          * end up promoting a negative entry for a *new* vnode and returning
1302          * ENOENT on its account. This is the error we want to return anyway
1303          * and promotion is harmless.
1304          *
1305          * In particular at this point there can be a new ncp which matches the
1306          * search but hashes to a different neglist.
1307          */
1308         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1309                 if (ncp == oncp)
1310                         break;
1311         }
1312
1313         /*
1314          * No match to begin with.
1315          */
1316         if (__predict_false(ncp == NULL)) {
1317                 goto out_abort;
1318         }
1319
1320         /*
1321          * The newly found entry may be something different...
1322          */
1323         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1324             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1325                 goto out_abort;
1326         }
1327
1328         /*
1329          * ... and not even negative.
1330          */
1331         nc_flag = atomic_load_char(&ncp->nc_flag);
1332         if ((nc_flag & NCF_NEGATIVE) == 0) {
1333                 goto out_abort;
1334         }
1335
1336         if (!cache_ncp_canuse(ncp)) {
1337                 goto out_abort;
1338         }
1339
1340         cache_neg_promote_locked(ncp);
1341         cache_neg_hit_finish(ncp);
1342         vfs_smr_exit();
1343         mtx_unlock(&nl->nl_lock);
1344         return (true);
1345 out_abort:
1346         vfs_smr_exit();
1347         mtx_unlock(&nl->nl_lock);
1348         return (false);
1349 }
1350
1351 static void
1352 cache_neg_promote(struct namecache *ncp)
1353 {
1354         struct neglist *nl;
1355
1356         nl = NCP2NEGLIST(ncp);
1357         mtx_lock(&nl->nl_lock);
1358         cache_neg_promote_locked(ncp);
1359         mtx_unlock(&nl->nl_lock);
1360 }
1361
1362 static void
1363 cache_neg_insert(struct namecache *ncp)
1364 {
1365         struct neglist *nl;
1366
1367         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1368         cache_assert_bucket_locked(ncp);
1369         nl = NCP2NEGLIST(ncp);
1370         mtx_lock(&nl->nl_lock);
1371         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1372         mtx_unlock(&nl->nl_lock);
1373         atomic_add_long(&numneg, 1);
1374 }
1375
1376 static void
1377 cache_neg_remove(struct namecache *ncp)
1378 {
1379         struct neglist *nl;
1380         struct negstate *ns;
1381
1382         cache_assert_bucket_locked(ncp);
1383         nl = NCP2NEGLIST(ncp);
1384         ns = NCP2NEGSTATE(ncp);
1385         mtx_lock(&nl->nl_lock);
1386         if ((ns->neg_flag & NEG_HOT) != 0) {
1387                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1388                 nl->nl_hotnum--;
1389         } else {
1390                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1391         }
1392         mtx_unlock(&nl->nl_lock);
1393         atomic_subtract_long(&numneg, 1);
1394 }
1395
1396 static struct neglist *
1397 cache_neg_evict_select_list(void)
1398 {
1399         struct neglist *nl;
1400         u_int c;
1401
1402         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1403         nl = &neglists[c % numneglists];
1404         if (!mtx_trylock(&nl->nl_evict_lock)) {
1405                 counter_u64_add(neg_evict_skipped_contended, 1);
1406                 return (NULL);
1407         }
1408         return (nl);
1409 }
1410
1411 static struct namecache *
1412 cache_neg_evict_select_entry(struct neglist *nl)
1413 {
1414         struct namecache *ncp, *lncp;
1415         struct negstate *ns, *lns;
1416         int i;
1417
1418         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1419         mtx_assert(&nl->nl_lock, MA_OWNED);
1420         ncp = TAILQ_FIRST(&nl->nl_list);
1421         if (ncp == NULL)
1422                 return (NULL);
1423         lncp = ncp;
1424         lns = NCP2NEGSTATE(lncp);
1425         for (i = 1; i < 4; i++) {
1426                 ncp = TAILQ_NEXT(ncp, nc_dst);
1427                 if (ncp == NULL)
1428                         break;
1429                 ns = NCP2NEGSTATE(ncp);
1430                 if (ns->neg_hit < lns->neg_hit) {
1431                         lncp = ncp;
1432                         lns = ns;
1433                 }
1434         }
1435         return (lncp);
1436 }
1437
1438 static bool
1439 cache_neg_evict(void)
1440 {
1441         struct namecache *ncp, *ncp2;
1442         struct neglist *nl;
1443         struct vnode *dvp;
1444         struct mtx *dvlp;
1445         struct mtx *blp;
1446         uint32_t hash;
1447         u_char nlen;
1448         bool evicted;
1449
1450         nl = cache_neg_evict_select_list();
1451         if (nl == NULL) {
1452                 return (false);
1453         }
1454
1455         mtx_lock(&nl->nl_lock);
1456         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1457         if (ncp != NULL) {
1458                 cache_neg_demote_locked(ncp);
1459         }
1460         ncp = cache_neg_evict_select_entry(nl);
1461         if (ncp == NULL) {
1462                 counter_u64_add(neg_evict_skipped_empty, 1);
1463                 mtx_unlock(&nl->nl_lock);
1464                 mtx_unlock(&nl->nl_evict_lock);
1465                 return (false);
1466         }
1467         nlen = ncp->nc_nlen;
1468         dvp = ncp->nc_dvp;
1469         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1470         dvlp = VP2VNODELOCK(dvp);
1471         blp = HASH2BUCKETLOCK(hash);
1472         mtx_unlock(&nl->nl_lock);
1473         mtx_unlock(&nl->nl_evict_lock);
1474         mtx_lock(dvlp);
1475         mtx_lock(blp);
1476         /*
1477          * Note that since all locks were dropped above, the entry may be
1478          * gone or reallocated to be something else.
1479          */
1480         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1481                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1482                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1483                         break;
1484         }
1485         if (ncp2 == NULL) {
1486                 counter_u64_add(neg_evict_skipped_missed, 1);
1487                 ncp = NULL;
1488                 evicted = false;
1489         } else {
1490                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1491                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1492                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1493                     ncp->nc_name);
1494                 cache_zap_locked(ncp);
1495                 counter_u64_add(neg_evicted, 1);
1496                 evicted = true;
1497         }
1498         mtx_unlock(blp);
1499         mtx_unlock(dvlp);
1500         if (ncp != NULL)
1501                 cache_free(ncp);
1502         return (evicted);
1503 }
1504
1505 /*
1506  * Maybe evict a negative entry to create more room.
1507  *
1508  * The ncnegfactor parameter limits what fraction of the total count
1509  * can comprise of negative entries. However, if the cache is just
1510  * warming up this leads to excessive evictions.  As such, ncnegminpct
1511  * (recomputed to neg_min) dictates whether the above should be
1512  * applied.
1513  *
1514  * Try evicting if the cache is close to full capacity regardless of
1515  * other considerations.
1516  */
1517 static bool
1518 cache_neg_evict_cond(u_long lnumcache)
1519 {
1520         u_long lnumneg;
1521
1522         if (ncsize - 1000 < lnumcache)
1523                 goto out_evict;
1524         lnumneg = atomic_load_long(&numneg);
1525         if (lnumneg < neg_min)
1526                 return (false);
1527         if (lnumneg * ncnegfactor < lnumcache)
1528                 return (false);
1529 out_evict:
1530         return (cache_neg_evict());
1531 }
1532
1533 /*
1534  * cache_zap_locked():
1535  *
1536  *   Removes a namecache entry from cache, whether it contains an actual
1537  *   pointer to a vnode or if it is just a negative cache entry.
1538  */
1539 static void
1540 cache_zap_locked(struct namecache *ncp)
1541 {
1542         struct nchashhead *ncpp;
1543         struct vnode *dvp, *vp;
1544
1545         dvp = ncp->nc_dvp;
1546         vp = ncp->nc_vp;
1547
1548         if (!(ncp->nc_flag & NCF_NEGATIVE))
1549                 cache_assert_vnode_locked(vp);
1550         cache_assert_vnode_locked(dvp);
1551         cache_assert_bucket_locked(ncp);
1552
1553         cache_ncp_invalidate(ncp);
1554
1555         ncpp = NCP2BUCKET(ncp);
1556         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1557         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1558                 SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
1559                 TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
1560                 if (ncp == vp->v_cache_dd) {
1561                         atomic_store_ptr(&vp->v_cache_dd, NULL);
1562                 }
1563         } else {
1564                 SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
1565                 cache_neg_remove(ncp);
1566         }
1567         if (ncp->nc_flag & NCF_ISDOTDOT) {
1568                 if (ncp == dvp->v_cache_dd) {
1569                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
1570                 }
1571         } else {
1572                 LIST_REMOVE(ncp, nc_src);
1573                 if (LIST_EMPTY(&dvp->v_cache_src)) {
1574                         ncp->nc_flag |= NCF_DVDROP;
1575                 }
1576         }
1577 }
1578
1579 static void
1580 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1581 {
1582         struct mtx *blp;
1583
1584         MPASS(ncp->nc_dvp == vp);
1585         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1586         cache_assert_vnode_locked(vp);
1587
1588         blp = NCP2BUCKETLOCK(ncp);
1589         mtx_lock(blp);
1590         cache_zap_locked(ncp);
1591         mtx_unlock(blp);
1592 }
1593
1594 static bool
1595 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1596     struct mtx **vlpp)
1597 {
1598         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1599         struct mtx *blp;
1600
1601         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1602         cache_assert_vnode_locked(vp);
1603
1604         if (ncp->nc_flag & NCF_NEGATIVE) {
1605                 if (*vlpp != NULL) {
1606                         mtx_unlock(*vlpp);
1607                         *vlpp = NULL;
1608                 }
1609                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1610                 return (true);
1611         }
1612
1613         pvlp = VP2VNODELOCK(vp);
1614         blp = NCP2BUCKETLOCK(ncp);
1615         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1616         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1617
1618         if (*vlpp == vlp1 || *vlpp == vlp2) {
1619                 to_unlock = *vlpp;
1620                 *vlpp = NULL;
1621         } else {
1622                 if (*vlpp != NULL) {
1623                         mtx_unlock(*vlpp);
1624                         *vlpp = NULL;
1625                 }
1626                 cache_sort_vnodes(&vlp1, &vlp2);
1627                 if (vlp1 == pvlp) {
1628                         mtx_lock(vlp2);
1629                         to_unlock = vlp2;
1630                 } else {
1631                         if (!mtx_trylock(vlp1))
1632                                 goto out_relock;
1633                         to_unlock = vlp1;
1634                 }
1635         }
1636         mtx_lock(blp);
1637         cache_zap_locked(ncp);
1638         mtx_unlock(blp);
1639         if (to_unlock != NULL)
1640                 mtx_unlock(to_unlock);
1641         return (true);
1642
1643 out_relock:
1644         mtx_unlock(vlp2);
1645         mtx_lock(vlp1);
1646         mtx_lock(vlp2);
1647         MPASS(*vlpp == NULL);
1648         *vlpp = vlp1;
1649         return (false);
1650 }
1651
1652 /*
1653  * If trylocking failed we can get here. We know enough to take all needed locks
1654  * in the right order and re-lookup the entry.
1655  */
1656 static int
1657 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1658     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1659     struct mtx *blp)
1660 {
1661         struct namecache *rncp;
1662
1663         cache_assert_bucket_unlocked(ncp);
1664
1665         cache_sort_vnodes(&dvlp, &vlp);
1666         cache_lock_vnodes(dvlp, vlp);
1667         mtx_lock(blp);
1668         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1669                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1670                     rncp->nc_nlen == cnp->cn_namelen &&
1671                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1672                         break;
1673         }
1674         if (rncp != NULL) {
1675                 cache_zap_locked(rncp);
1676                 mtx_unlock(blp);
1677                 cache_unlock_vnodes(dvlp, vlp);
1678                 counter_u64_add(zap_bucket_relock_success, 1);
1679                 return (0);
1680         }
1681
1682         mtx_unlock(blp);
1683         cache_unlock_vnodes(dvlp, vlp);
1684         return (EAGAIN);
1685 }
1686
1687 static int __noinline
1688 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1689     uint32_t hash, struct mtx *blp)
1690 {
1691         struct mtx *dvlp, *vlp;
1692         struct vnode *dvp;
1693
1694         cache_assert_bucket_locked(ncp);
1695
1696         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1697         vlp = NULL;
1698         if (!(ncp->nc_flag & NCF_NEGATIVE))
1699                 vlp = VP2VNODELOCK(ncp->nc_vp);
1700         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1701                 cache_zap_locked(ncp);
1702                 mtx_unlock(blp);
1703                 cache_unlock_vnodes(dvlp, vlp);
1704                 return (0);
1705         }
1706
1707         dvp = ncp->nc_dvp;
1708         mtx_unlock(blp);
1709         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1710 }
1711
1712 static __noinline int
1713 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1714 {
1715         struct namecache *ncp;
1716         struct mtx *blp;
1717         struct mtx *dvlp, *dvlp2;
1718         uint32_t hash;
1719         int error;
1720
1721         if (cnp->cn_namelen == 2 &&
1722             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1723                 dvlp = VP2VNODELOCK(dvp);
1724                 dvlp2 = NULL;
1725                 mtx_lock(dvlp);
1726 retry_dotdot:
1727                 ncp = dvp->v_cache_dd;
1728                 if (ncp == NULL) {
1729                         mtx_unlock(dvlp);
1730                         if (dvlp2 != NULL)
1731                                 mtx_unlock(dvlp2);
1732                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1733                         return (0);
1734                 }
1735                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1736                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1737                                 goto retry_dotdot;
1738                         MPASS(dvp->v_cache_dd == NULL);
1739                         mtx_unlock(dvlp);
1740                         if (dvlp2 != NULL)
1741                                 mtx_unlock(dvlp2);
1742                         cache_free(ncp);
1743                 } else {
1744                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
1745                         mtx_unlock(dvlp);
1746                         if (dvlp2 != NULL)
1747                                 mtx_unlock(dvlp2);
1748                 }
1749                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1750                 return (1);
1751         }
1752
1753         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1754         blp = HASH2BUCKETLOCK(hash);
1755 retry:
1756         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1757                 goto out_no_entry;
1758
1759         mtx_lock(blp);
1760
1761         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1762                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1763                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1764                         break;
1765         }
1766
1767         if (ncp == NULL) {
1768                 mtx_unlock(blp);
1769                 goto out_no_entry;
1770         }
1771
1772         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1773         if (__predict_false(error != 0)) {
1774                 zap_bucket_fail++;
1775                 goto retry;
1776         }
1777         counter_u64_add(numposzaps, 1);
1778         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1779         cache_free(ncp);
1780         return (1);
1781 out_no_entry:
1782         counter_u64_add(nummisszap, 1);
1783         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1784         return (0);
1785 }
1786
1787 static int __noinline
1788 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1789     struct timespec *tsp, int *ticksp)
1790 {
1791         int ltype;
1792
1793         *vpp = dvp;
1794         counter_u64_add(dothits, 1);
1795         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1796         if (tsp != NULL)
1797                 timespecclear(tsp);
1798         if (ticksp != NULL)
1799                 *ticksp = ticks;
1800         vrefact(*vpp);
1801         /*
1802          * When we lookup "." we still can be asked to lock it
1803          * differently...
1804          */
1805         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1806         if (ltype != VOP_ISLOCKED(*vpp)) {
1807                 if (ltype == LK_EXCLUSIVE) {
1808                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1809                         if (VN_IS_DOOMED((*vpp))) {
1810                                 /* forced unmount */
1811                                 vrele(*vpp);
1812                                 *vpp = NULL;
1813                                 return (ENOENT);
1814                         }
1815                 } else
1816                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1817         }
1818         return (-1);
1819 }
1820
1821 static int __noinline
1822 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1823     struct timespec *tsp, int *ticksp)
1824 {
1825         struct namecache_ts *ncp_ts;
1826         struct namecache *ncp;
1827         struct mtx *dvlp;
1828         enum vgetstate vs;
1829         int error, ltype;
1830         bool whiteout;
1831
1832         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1833
1834         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1835                 cache_remove_cnp(dvp, cnp);
1836                 return (0);
1837         }
1838
1839         counter_u64_add(dotdothits, 1);
1840 retry:
1841         dvlp = VP2VNODELOCK(dvp);
1842         mtx_lock(dvlp);
1843         ncp = dvp->v_cache_dd;
1844         if (ncp == NULL) {
1845                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1846                 mtx_unlock(dvlp);
1847                 return (0);
1848         }
1849         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1850                 if (ncp->nc_flag & NCF_NEGATIVE)
1851                         *vpp = NULL;
1852                 else
1853                         *vpp = ncp->nc_vp;
1854         } else
1855                 *vpp = ncp->nc_dvp;
1856         if (*vpp == NULL)
1857                 goto negative_success;
1858         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1859         cache_out_ts(ncp, tsp, ticksp);
1860         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1861             NCF_DTS && tsp != NULL) {
1862                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1863                 *tsp = ncp_ts->nc_dotdottime;
1864         }
1865
1866         MPASS(dvp != *vpp);
1867         ltype = VOP_ISLOCKED(dvp);
1868         VOP_UNLOCK(dvp);
1869         vs = vget_prep(*vpp);
1870         mtx_unlock(dvlp);
1871         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1872         vn_lock(dvp, ltype | LK_RETRY);
1873         if (VN_IS_DOOMED(dvp)) {
1874                 if (error == 0)
1875                         vput(*vpp);
1876                 *vpp = NULL;
1877                 return (ENOENT);
1878         }
1879         if (error) {
1880                 *vpp = NULL;
1881                 goto retry;
1882         }
1883         return (-1);
1884 negative_success:
1885         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1886                 if (cnp->cn_flags & ISLASTCN) {
1887                         counter_u64_add(numnegzaps, 1);
1888                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1889                         mtx_unlock(dvlp);
1890                         cache_free(ncp);
1891                         return (0);
1892                 }
1893         }
1894
1895         whiteout = (ncp->nc_flag & NCF_WHITE);
1896         cache_out_ts(ncp, tsp, ticksp);
1897         if (cache_neg_hit_prep(ncp))
1898                 cache_neg_promote(ncp);
1899         else
1900                 cache_neg_hit_finish(ncp);
1901         mtx_unlock(dvlp);
1902         if (whiteout)
1903                 cnp->cn_flags |= ISWHITEOUT;
1904         return (ENOENT);
1905 }
1906
1907 /**
1908  * Lookup a name in the name cache
1909  *
1910  * # Arguments
1911  *
1912  * - dvp:       Parent directory in which to search.
1913  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1914  * - cnp:       Parameters of the name search.  The most interesting bits of
1915  *              the cn_flags field have the following meanings:
1916  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1917  *                      it up.
1918  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1919  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1920  *              or negative) lookup, tsp will be filled with any timespec that
1921  *              was stored when this cache entry was created.  However, it will
1922  *              be clear for "." entries.
1923  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1924  *              (positive or negative) lookup, it will contain the ticks value
1925  *              that was current when the cache entry was created, unless cnp
1926  *              was ".".
1927  *
1928  * Either both tsp and ticks have to be provided or neither of them.
1929  *
1930  * # Returns
1931  *
1932  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1933  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1934  *              to a forced unmount.  vpp will not be modified.  If the entry
1935  *              is a whiteout, then the ISWHITEOUT flag will be set in
1936  *              cnp->cn_flags.
1937  * - 0:         A cache miss.  vpp will not be modified.
1938  *
1939  * # Locking
1940  *
1941  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1942  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1943  * lock is not recursively acquired.
1944  */
1945 static int __noinline
1946 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1947     struct timespec *tsp, int *ticksp)
1948 {
1949         struct namecache *ncp;
1950         struct mtx *blp;
1951         uint32_t hash;
1952         enum vgetstate vs;
1953         int error;
1954         bool whiteout;
1955
1956         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1957         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1958
1959 retry:
1960         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1961         blp = HASH2BUCKETLOCK(hash);
1962         mtx_lock(blp);
1963
1964         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1965                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1966                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1967                         break;
1968         }
1969
1970         if (__predict_false(ncp == NULL)) {
1971                 mtx_unlock(blp);
1972                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
1973                 counter_u64_add(nummiss, 1);
1974                 return (0);
1975         }
1976
1977         if (ncp->nc_flag & NCF_NEGATIVE)
1978                 goto negative_success;
1979
1980         counter_u64_add(numposhits, 1);
1981         *vpp = ncp->nc_vp;
1982         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1983         cache_out_ts(ncp, tsp, ticksp);
1984         MPASS(dvp != *vpp);
1985         vs = vget_prep(*vpp);
1986         mtx_unlock(blp);
1987         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1988         if (error) {
1989                 *vpp = NULL;
1990                 goto retry;
1991         }
1992         return (-1);
1993 negative_success:
1994         /*
1995          * We don't get here with regular lookup apart from corner cases.
1996          */
1997         if (__predict_true(cnp->cn_nameiop == CREATE)) {
1998                 if (cnp->cn_flags & ISLASTCN) {
1999                         counter_u64_add(numnegzaps, 1);
2000                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2001                         if (__predict_false(error != 0)) {
2002                                 zap_bucket_fail2++;
2003                                 goto retry;
2004                         }
2005                         cache_free(ncp);
2006                         return (0);
2007                 }
2008         }
2009
2010         whiteout = (ncp->nc_flag & NCF_WHITE);
2011         cache_out_ts(ncp, tsp, ticksp);
2012         if (cache_neg_hit_prep(ncp))
2013                 cache_neg_promote(ncp);
2014         else
2015                 cache_neg_hit_finish(ncp);
2016         mtx_unlock(blp);
2017         if (whiteout)
2018                 cnp->cn_flags |= ISWHITEOUT;
2019         return (ENOENT);
2020 }
2021
2022 int
2023 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2024     struct timespec *tsp, int *ticksp)
2025 {
2026         struct namecache *ncp;
2027         uint32_t hash;
2028         enum vgetstate vs;
2029         int error;
2030         bool whiteout, neg_promote;
2031         u_short nc_flag;
2032
2033         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2034
2035 #ifdef DEBUG_CACHE
2036         if (__predict_false(!doingcache)) {
2037                 cnp->cn_flags &= ~MAKEENTRY;
2038                 return (0);
2039         }
2040 #endif
2041
2042         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2043                 if (cnp->cn_namelen == 1)
2044                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2045                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2046                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2047         }
2048
2049         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2050
2051         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2052                 cache_remove_cnp(dvp, cnp);
2053                 return (0);
2054         }
2055
2056         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2057         vfs_smr_enter();
2058
2059         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2060                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2061                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2062                         break;
2063         }
2064
2065         if (__predict_false(ncp == NULL)) {
2066                 vfs_smr_exit();
2067                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2068                 counter_u64_add(nummiss, 1);
2069                 return (0);
2070         }
2071
2072         nc_flag = atomic_load_char(&ncp->nc_flag);
2073         if (nc_flag & NCF_NEGATIVE)
2074                 goto negative_success;
2075
2076         counter_u64_add(numposhits, 1);
2077         *vpp = ncp->nc_vp;
2078         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2079         cache_out_ts(ncp, tsp, ticksp);
2080         MPASS(dvp != *vpp);
2081         if (!cache_ncp_canuse(ncp)) {
2082                 vfs_smr_exit();
2083                 *vpp = NULL;
2084                 goto out_fallback;
2085         }
2086         vs = vget_prep_smr(*vpp);
2087         vfs_smr_exit();
2088         if (__predict_false(vs == VGET_NONE)) {
2089                 *vpp = NULL;
2090                 goto out_fallback;
2091         }
2092         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2093         if (error) {
2094                 *vpp = NULL;
2095                 goto out_fallback;
2096         }
2097         return (-1);
2098 negative_success:
2099         if (cnp->cn_nameiop == CREATE) {
2100                 if (cnp->cn_flags & ISLASTCN) {
2101                         vfs_smr_exit();
2102                         goto out_fallback;
2103                 }
2104         }
2105
2106         cache_out_ts(ncp, tsp, ticksp);
2107         whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2108         neg_promote = cache_neg_hit_prep(ncp);
2109         if (!cache_ncp_canuse(ncp)) {
2110                 cache_neg_hit_abort(ncp);
2111                 vfs_smr_exit();
2112                 goto out_fallback;
2113         }
2114         if (neg_promote) {
2115                 vfs_smr_exit();
2116                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2117                         goto out_fallback;
2118         } else {
2119                 cache_neg_hit_finish(ncp);
2120                 vfs_smr_exit();
2121         }
2122         if (whiteout)
2123                 cnp->cn_flags |= ISWHITEOUT;
2124         return (ENOENT);
2125 out_fallback:
2126         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2127 }
2128
2129 struct celockstate {
2130         struct mtx *vlp[3];
2131         struct mtx *blp[2];
2132 };
2133 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2134 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2135
2136 static inline void
2137 cache_celockstate_init(struct celockstate *cel)
2138 {
2139
2140         bzero(cel, sizeof(*cel));
2141 }
2142
2143 static void
2144 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2145     struct vnode *dvp)
2146 {
2147         struct mtx *vlp1, *vlp2;
2148
2149         MPASS(cel->vlp[0] == NULL);
2150         MPASS(cel->vlp[1] == NULL);
2151         MPASS(cel->vlp[2] == NULL);
2152
2153         MPASS(vp != NULL || dvp != NULL);
2154
2155         vlp1 = VP2VNODELOCK(vp);
2156         vlp2 = VP2VNODELOCK(dvp);
2157         cache_sort_vnodes(&vlp1, &vlp2);
2158
2159         if (vlp1 != NULL) {
2160                 mtx_lock(vlp1);
2161                 cel->vlp[0] = vlp1;
2162         }
2163         mtx_lock(vlp2);
2164         cel->vlp[1] = vlp2;
2165 }
2166
2167 static void
2168 cache_unlock_vnodes_cel(struct celockstate *cel)
2169 {
2170
2171         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2172
2173         if (cel->vlp[0] != NULL)
2174                 mtx_unlock(cel->vlp[0]);
2175         if (cel->vlp[1] != NULL)
2176                 mtx_unlock(cel->vlp[1]);
2177         if (cel->vlp[2] != NULL)
2178                 mtx_unlock(cel->vlp[2]);
2179 }
2180
2181 static bool
2182 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2183 {
2184         struct mtx *vlp;
2185         bool ret;
2186
2187         cache_assert_vlp_locked(cel->vlp[0]);
2188         cache_assert_vlp_locked(cel->vlp[1]);
2189         MPASS(cel->vlp[2] == NULL);
2190
2191         MPASS(vp != NULL);
2192         vlp = VP2VNODELOCK(vp);
2193
2194         ret = true;
2195         if (vlp >= cel->vlp[1]) {
2196                 mtx_lock(vlp);
2197         } else {
2198                 if (mtx_trylock(vlp))
2199                         goto out;
2200                 cache_lock_vnodes_cel_3_failures++;
2201                 cache_unlock_vnodes_cel(cel);
2202                 if (vlp < cel->vlp[0]) {
2203                         mtx_lock(vlp);
2204                         mtx_lock(cel->vlp[0]);
2205                         mtx_lock(cel->vlp[1]);
2206                 } else {
2207                         if (cel->vlp[0] != NULL)
2208                                 mtx_lock(cel->vlp[0]);
2209                         mtx_lock(vlp);
2210                         mtx_lock(cel->vlp[1]);
2211                 }
2212                 ret = false;
2213         }
2214 out:
2215         cel->vlp[2] = vlp;
2216         return (ret);
2217 }
2218
2219 static void
2220 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2221     struct mtx *blp2)
2222 {
2223
2224         MPASS(cel->blp[0] == NULL);
2225         MPASS(cel->blp[1] == NULL);
2226
2227         cache_sort_vnodes(&blp1, &blp2);
2228
2229         if (blp1 != NULL) {
2230                 mtx_lock(blp1);
2231                 cel->blp[0] = blp1;
2232         }
2233         mtx_lock(blp2);
2234         cel->blp[1] = blp2;
2235 }
2236
2237 static void
2238 cache_unlock_buckets_cel(struct celockstate *cel)
2239 {
2240
2241         if (cel->blp[0] != NULL)
2242                 mtx_unlock(cel->blp[0]);
2243         mtx_unlock(cel->blp[1]);
2244 }
2245
2246 /*
2247  * Lock part of the cache affected by the insertion.
2248  *
2249  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2250  * However, insertion can result in removal of an old entry. In this
2251  * case we have an additional vnode and bucketlock pair to lock.
2252  *
2253  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2254  * preserving the locking order (smaller address first).
2255  */
2256 static void
2257 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2258     uint32_t hash)
2259 {
2260         struct namecache *ncp;
2261         struct mtx *blps[2];
2262         u_char nc_flag;
2263
2264         blps[0] = HASH2BUCKETLOCK(hash);
2265         for (;;) {
2266                 blps[1] = NULL;
2267                 cache_lock_vnodes_cel(cel, dvp, vp);
2268                 if (vp == NULL || vp->v_type != VDIR)
2269                         break;
2270                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2271                 if (ncp == NULL)
2272                         break;
2273                 nc_flag = atomic_load_char(&ncp->nc_flag);
2274                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2275                         break;
2276                 MPASS(ncp->nc_dvp == vp);
2277                 blps[1] = NCP2BUCKETLOCK(ncp);
2278                 if ((nc_flag & NCF_NEGATIVE) != 0)
2279                         break;
2280                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2281                         break;
2282                 /*
2283                  * All vnodes got re-locked. Re-validate the state and if
2284                  * nothing changed we are done. Otherwise restart.
2285                  */
2286                 if (ncp == vp->v_cache_dd &&
2287                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2288                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2289                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2290                         break;
2291                 cache_unlock_vnodes_cel(cel);
2292                 cel->vlp[0] = NULL;
2293                 cel->vlp[1] = NULL;
2294                 cel->vlp[2] = NULL;
2295         }
2296         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2297 }
2298
2299 static void
2300 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2301     uint32_t hash)
2302 {
2303         struct namecache *ncp;
2304         struct mtx *blps[2];
2305         u_char nc_flag;
2306
2307         blps[0] = HASH2BUCKETLOCK(hash);
2308         for (;;) {
2309                 blps[1] = NULL;
2310                 cache_lock_vnodes_cel(cel, dvp, vp);
2311                 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2312                 if (ncp == NULL)
2313                         break;
2314                 nc_flag = atomic_load_char(&ncp->nc_flag);
2315                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2316                         break;
2317                 MPASS(ncp->nc_dvp == dvp);
2318                 blps[1] = NCP2BUCKETLOCK(ncp);
2319                 if ((nc_flag & NCF_NEGATIVE) != 0)
2320                         break;
2321                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2322                         break;
2323                 if (ncp == dvp->v_cache_dd &&
2324                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2325                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2326                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2327                         break;
2328                 cache_unlock_vnodes_cel(cel);
2329                 cel->vlp[0] = NULL;
2330                 cel->vlp[1] = NULL;
2331                 cel->vlp[2] = NULL;
2332         }
2333         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2334 }
2335
2336 static void
2337 cache_enter_unlock(struct celockstate *cel)
2338 {
2339
2340         cache_unlock_buckets_cel(cel);
2341         cache_unlock_vnodes_cel(cel);
2342 }
2343
2344 static void __noinline
2345 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2346     struct componentname *cnp)
2347 {
2348         struct celockstate cel;
2349         struct namecache *ncp;
2350         uint32_t hash;
2351         int len;
2352
2353         if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2354                 return;
2355         len = cnp->cn_namelen;
2356         cache_celockstate_init(&cel);
2357         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2358         cache_enter_lock_dd(&cel, dvp, vp, hash);
2359         ncp = dvp->v_cache_dd;
2360         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2361                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2362                 cache_zap_locked(ncp);
2363         } else {
2364                 ncp = NULL;
2365         }
2366         atomic_store_ptr(&dvp->v_cache_dd, NULL);
2367         cache_enter_unlock(&cel);
2368         if (ncp != NULL)
2369                 cache_free(ncp);
2370 }
2371
2372 /*
2373  * Add an entry to the cache.
2374  */
2375 void
2376 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2377     struct timespec *tsp, struct timespec *dtsp)
2378 {
2379         struct celockstate cel;
2380         struct namecache *ncp, *n2, *ndd;
2381         struct namecache_ts *ncp_ts;
2382         struct nchashhead *ncpp;
2383         uint32_t hash;
2384         int flag;
2385         int len;
2386
2387         KASSERT(cnp->cn_namelen <= NAME_MAX,
2388             ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2389             NAME_MAX));
2390         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2391         VNPASS(dvp->v_type != VNON, dvp);
2392         if (vp != NULL) {
2393                 VNPASS(!VN_IS_DOOMED(vp), vp);
2394                 VNPASS(vp->v_type != VNON, vp);
2395         }
2396         if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2397                 KASSERT(dvp == vp,
2398                     ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2399                     dvp, vp));
2400         } else {
2401                 KASSERT(dvp != vp,
2402                     ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2403                     cnp->cn_nameptr, dvp));
2404         }
2405
2406 #ifdef DEBUG_CACHE
2407         if (__predict_false(!doingcache))
2408                 return;
2409 #endif
2410
2411         flag = 0;
2412         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2413                 if (cnp->cn_namelen == 1)
2414                         return;
2415                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2416                         cache_enter_dotdot_prep(dvp, vp, cnp);
2417                         flag = NCF_ISDOTDOT;
2418                 }
2419         }
2420
2421         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2422         if (ncp == NULL)
2423                 return;
2424
2425         cache_celockstate_init(&cel);
2426         ndd = NULL;
2427         ncp_ts = NULL;
2428
2429         /*
2430          * Calculate the hash key and setup as much of the new
2431          * namecache entry as possible before acquiring the lock.
2432          */
2433         ncp->nc_flag = flag | NCF_WIP;
2434         ncp->nc_vp = vp;
2435         if (vp == NULL)
2436                 cache_neg_init(ncp);
2437         ncp->nc_dvp = dvp;
2438         if (tsp != NULL) {
2439                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2440                 ncp_ts->nc_time = *tsp;
2441                 ncp_ts->nc_ticks = ticks;
2442                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2443                 if (dtsp != NULL) {
2444                         ncp_ts->nc_dotdottime = *dtsp;
2445                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2446                 }
2447         }
2448         len = ncp->nc_nlen = cnp->cn_namelen;
2449         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2450         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2451         ncp->nc_name[len] = '\0';
2452         cache_enter_lock(&cel, dvp, vp, hash);
2453
2454         /*
2455          * See if this vnode or negative entry is already in the cache
2456          * with this name.  This can happen with concurrent lookups of
2457          * the same path name.
2458          */
2459         ncpp = NCHHASH(hash);
2460         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2461                 if (n2->nc_dvp == dvp &&
2462                     n2->nc_nlen == cnp->cn_namelen &&
2463                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2464                         MPASS(cache_ncp_canuse(n2));
2465                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2466                                 KASSERT(vp == NULL,
2467                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2468                                     __func__, NULL, vp, cnp->cn_nameptr));
2469                         else
2470                                 KASSERT(n2->nc_vp == vp,
2471                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2472                                     __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2473                         /*
2474                          * Entries are supposed to be immutable unless in the
2475                          * process of getting destroyed. Accommodating for
2476                          * changing timestamps is possible but not worth it.
2477                          * This should be harmless in terms of correctness, in
2478                          * the worst case resulting in an earlier expiration.
2479                          * Alternatively, the found entry can be replaced
2480                          * altogether.
2481                          */
2482                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2483 #if 0
2484                         if (tsp != NULL) {
2485                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2486                                     ("no NCF_TS"));
2487                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2488                                 n2_ts->nc_time = ncp_ts->nc_time;
2489                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2490                                 if (dtsp != NULL) {
2491                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2492                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2493                                 }
2494                         }
2495 #endif
2496                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2497                             vp);
2498                         goto out_unlock_free;
2499                 }
2500         }
2501
2502         if (flag == NCF_ISDOTDOT) {
2503                 /*
2504                  * See if we are trying to add .. entry, but some other lookup
2505                  * has populated v_cache_dd pointer already.
2506                  */
2507                 if (dvp->v_cache_dd != NULL)
2508                         goto out_unlock_free;
2509                 KASSERT(vp == NULL || vp->v_type == VDIR,
2510                     ("wrong vnode type %p", vp));
2511                 atomic_thread_fence_rel();
2512                 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2513         }
2514
2515         if (vp != NULL) {
2516                 if (flag != NCF_ISDOTDOT) {
2517                         /*
2518                          * For this case, the cache entry maps both the
2519                          * directory name in it and the name ".." for the
2520                          * directory's parent.
2521                          */
2522                         if ((ndd = vp->v_cache_dd) != NULL) {
2523                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2524                                         cache_zap_locked(ndd);
2525                                 else
2526                                         ndd = NULL;
2527                         }
2528                         atomic_thread_fence_rel();
2529                         atomic_store_ptr(&vp->v_cache_dd, ncp);
2530                 } else if (vp->v_type != VDIR) {
2531                         if (vp->v_cache_dd != NULL) {
2532                                 atomic_store_ptr(&vp->v_cache_dd, NULL);
2533                         }
2534                 }
2535         }
2536
2537         if (flag != NCF_ISDOTDOT) {
2538                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2539                         cache_hold_vnode(dvp);
2540                 }
2541                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2542         }
2543
2544         /*
2545          * If the entry is "negative", we place it into the
2546          * "negative" cache queue, otherwise, we place it into the
2547          * destination vnode's cache entries queue.
2548          */
2549         if (vp != NULL) {
2550                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2551                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2552                     vp);
2553         } else {
2554                 if (cnp->cn_flags & ISWHITEOUT)
2555                         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2556                 cache_neg_insert(ncp);
2557                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2558                     ncp->nc_name);
2559         }
2560
2561         /*
2562          * Insert the new namecache entry into the appropriate chain
2563          * within the cache entries table.
2564          */
2565         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2566
2567         atomic_thread_fence_rel();
2568         /*
2569          * Mark the entry as fully constructed.
2570          * It is immutable past this point until its removal.
2571          */
2572         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2573
2574         cache_enter_unlock(&cel);
2575         if (ndd != NULL)
2576                 cache_free(ndd);
2577         return;
2578 out_unlock_free:
2579         cache_enter_unlock(&cel);
2580         cache_free(ncp);
2581         return;
2582 }
2583
2584 /*
2585  * A variant of the above accepting flags.
2586  *
2587  * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2588  *
2589  * TODO: this routine is a hack. It blindly removes the old entry, even if it
2590  * happens to match and it is doing it in an inefficient manner. It was added
2591  * to accommodate NFS which runs into a case where the target for a given name
2592  * may change from under it. Note this does nothing to solve the following
2593  * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2594  * the same [dvp, cnp]. It may be argued that code doing this is broken.
2595  */
2596 void
2597 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2598     struct timespec *tsp, struct timespec *dtsp, int flags)
2599 {
2600
2601         MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2602
2603         if (flags & VFS_CACHE_DROPOLD)
2604                 cache_remove_cnp(dvp, cnp);
2605         cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2606 }
2607
2608 static u_int
2609 cache_roundup_2(u_int val)
2610 {
2611         u_int res;
2612
2613         for (res = 1; res <= val; res <<= 1)
2614                 continue;
2615
2616         return (res);
2617 }
2618
2619 static struct nchashhead *
2620 nchinittbl(u_long elements, u_long *hashmask)
2621 {
2622         struct nchashhead *hashtbl;
2623         u_long hashsize, i;
2624
2625         hashsize = cache_roundup_2(elements) / 2;
2626
2627         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2628         for (i = 0; i < hashsize; i++)
2629                 CK_SLIST_INIT(&hashtbl[i]);
2630         *hashmask = hashsize - 1;
2631         return (hashtbl);
2632 }
2633
2634 static void
2635 ncfreetbl(struct nchashhead *hashtbl)
2636 {
2637
2638         free(hashtbl, M_VFSCACHE);
2639 }
2640
2641 /*
2642  * Name cache initialization, from vfs_init() when we are booting
2643  */
2644 static void
2645 nchinit(void *dummy __unused)
2646 {
2647         u_int i;
2648
2649         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2650             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2651         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2652             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2653         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2654             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2655         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2656             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2657
2658         VFS_SMR_ZONE_SET(cache_zone_small);
2659         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2660         VFS_SMR_ZONE_SET(cache_zone_large);
2661         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2662
2663         ncsize = desiredvnodes * ncsizefactor;
2664         cache_recalc_neg_min(ncnegminpct);
2665         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2666         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2667         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2668                 ncbuckethash = 7;
2669         if (ncbuckethash > nchash)
2670                 ncbuckethash = nchash;
2671         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2672             M_WAITOK | M_ZERO);
2673         for (i = 0; i < numbucketlocks; i++)
2674                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2675         ncvnodehash = ncbuckethash;
2676         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2677             M_WAITOK | M_ZERO);
2678         for (i = 0; i < numvnodelocks; i++)
2679                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2680
2681         for (i = 0; i < numneglists; i++) {
2682                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2683                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2684                 TAILQ_INIT(&neglists[i].nl_list);
2685                 TAILQ_INIT(&neglists[i].nl_hotlist);
2686         }
2687 }
2688 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2689
2690 void
2691 cache_vnode_init(struct vnode *vp)
2692 {
2693
2694         LIST_INIT(&vp->v_cache_src);
2695         TAILQ_INIT(&vp->v_cache_dst);
2696         vp->v_cache_dd = NULL;
2697         cache_prehash(vp);
2698 }
2699
2700 /*
2701  * Induce transient cache misses for lockless operation in cache_lookup() by
2702  * using a temporary hash table.
2703  *
2704  * This will force a fs lookup.
2705  *
2706  * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
2707  * to observe all CPUs not performing the lookup.
2708  */
2709 static void
2710 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
2711 {
2712
2713         MPASS(temphash < nchash);
2714         /*
2715          * Change the size. The new size is smaller and can safely be used
2716          * against the existing table. All lookups which now hash wrong will
2717          * result in a cache miss, which all callers are supposed to know how
2718          * to handle.
2719          */
2720         atomic_store_long(&nchash, temphash);
2721         atomic_thread_fence_rel();
2722         vfs_smr_synchronize();
2723         /*
2724          * At this point everyone sees the updated hash value, but they still
2725          * see the old table.
2726          */
2727         atomic_store_ptr(&nchashtbl, temptbl);
2728         atomic_thread_fence_rel();
2729         vfs_smr_synchronize();
2730         /*
2731          * At this point everyone sees the updated table pointer and size pair.
2732          */
2733 }
2734
2735 /*
2736  * Set the new hash table.
2737  *
2738  * Similarly to cache_changesize_set_temp(), this has to synchronize against
2739  * lockless operation in cache_lookup().
2740  */
2741 static void
2742 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
2743 {
2744
2745         MPASS(nchash < new_hash);
2746         /*
2747          * Change the pointer first. This wont result in out of bounds access
2748          * since the temporary table is guaranteed to be smaller.
2749          */
2750         atomic_store_ptr(&nchashtbl, new_tbl);
2751         atomic_thread_fence_rel();
2752         vfs_smr_synchronize();
2753         /*
2754          * At this point everyone sees the updated pointer value, but they
2755          * still see the old size.
2756          */
2757         atomic_store_long(&nchash, new_hash);
2758         atomic_thread_fence_rel();
2759         vfs_smr_synchronize();
2760         /*
2761          * At this point everyone sees the updated table pointer and size pair.
2762          */
2763 }
2764
2765 void
2766 cache_changesize(u_long newmaxvnodes)
2767 {
2768         struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2769         u_long new_nchash, old_nchash, temphash;
2770         struct namecache *ncp;
2771         uint32_t hash;
2772         u_long newncsize;
2773         int i;
2774
2775         newncsize = newmaxvnodes * ncsizefactor;
2776         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2777         if (newmaxvnodes < numbucketlocks)
2778                 newmaxvnodes = numbucketlocks;
2779
2780         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2781         /* If same hash table size, nothing to do */
2782         if (nchash == new_nchash) {
2783                 ncfreetbl(new_nchashtbl);
2784                 return;
2785         }
2786
2787         temptbl = nchinittbl(1, &temphash);
2788
2789         /*
2790          * Move everything from the old hash table to the new table.
2791          * None of the namecache entries in the table can be removed
2792          * because to do so, they have to be removed from the hash table.
2793          */
2794         cache_lock_all_vnodes();
2795         cache_lock_all_buckets();
2796         old_nchashtbl = nchashtbl;
2797         old_nchash = nchash;
2798         cache_changesize_set_temp(temptbl, temphash);
2799         for (i = 0; i <= old_nchash; i++) {
2800                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2801                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2802                             ncp->nc_dvp);
2803                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2804                         CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2805                 }
2806         }
2807         ncsize = newncsize;
2808         cache_recalc_neg_min(ncnegminpct);
2809         cache_changesize_set_new(new_nchashtbl, new_nchash);
2810         cache_unlock_all_buckets();
2811         cache_unlock_all_vnodes();
2812         ncfreetbl(old_nchashtbl);
2813         ncfreetbl(temptbl);
2814 }
2815
2816 /*
2817  * Remove all entries from and to a particular vnode.
2818  */
2819 static void
2820 cache_purge_impl(struct vnode *vp)
2821 {
2822         struct cache_freebatch batch;
2823         struct namecache *ncp;
2824         struct mtx *vlp, *vlp2;
2825
2826         TAILQ_INIT(&batch);
2827         vlp = VP2VNODELOCK(vp);
2828         vlp2 = NULL;
2829         mtx_lock(vlp);
2830 retry:
2831         while (!LIST_EMPTY(&vp->v_cache_src)) {
2832                 ncp = LIST_FIRST(&vp->v_cache_src);
2833                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2834                         goto retry;
2835                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2836         }
2837         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2838                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2839                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2840                         goto retry;
2841                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2842         }
2843         ncp = vp->v_cache_dd;
2844         if (ncp != NULL) {
2845                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2846                    ("lost dotdot link"));
2847                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2848                         goto retry;
2849                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2850         }
2851         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2852         mtx_unlock(vlp);
2853         if (vlp2 != NULL)
2854                 mtx_unlock(vlp2);
2855         cache_free_batch(&batch);
2856 }
2857
2858 /*
2859  * Opportunistic check to see if there is anything to do.
2860  */
2861 static bool
2862 cache_has_entries(struct vnode *vp)
2863 {
2864
2865         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2866             atomic_load_ptr(&vp->v_cache_dd) == NULL)
2867                 return (false);
2868         return (true);
2869 }
2870
2871 void
2872 cache_purge(struct vnode *vp)
2873 {
2874
2875         SDT_PROBE1(vfs, namecache, purge, done, vp);
2876         if (!cache_has_entries(vp))
2877                 return;
2878         cache_purge_impl(vp);
2879 }
2880
2881 /*
2882  * Only to be used by vgone.
2883  */
2884 void
2885 cache_purge_vgone(struct vnode *vp)
2886 {
2887         struct mtx *vlp;
2888
2889         VNPASS(VN_IS_DOOMED(vp), vp);
2890         if (cache_has_entries(vp)) {
2891                 cache_purge_impl(vp);
2892                 return;
2893         }
2894
2895         /*
2896          * Serialize against a potential thread doing cache_purge.
2897          */
2898         vlp = VP2VNODELOCK(vp);
2899         mtx_wait_unlocked(vlp);
2900         if (cache_has_entries(vp)) {
2901                 cache_purge_impl(vp);
2902                 return;
2903         }
2904         return;
2905 }
2906
2907 /*
2908  * Remove all negative entries for a particular directory vnode.
2909  */
2910 void
2911 cache_purge_negative(struct vnode *vp)
2912 {
2913         struct cache_freebatch batch;
2914         struct namecache *ncp, *nnp;
2915         struct mtx *vlp;
2916
2917         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2918         if (LIST_EMPTY(&vp->v_cache_src))
2919                 return;
2920         TAILQ_INIT(&batch);
2921         vlp = VP2VNODELOCK(vp);
2922         mtx_lock(vlp);
2923         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2924                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2925                         continue;
2926                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2927                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2928         }
2929         mtx_unlock(vlp);
2930         cache_free_batch(&batch);
2931 }
2932
2933 /*
2934  * Entry points for modifying VOP operations.
2935  */
2936 void
2937 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2938     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2939 {
2940
2941         ASSERT_VOP_IN_SEQC(fdvp);
2942         ASSERT_VOP_IN_SEQC(fvp);
2943         ASSERT_VOP_IN_SEQC(tdvp);
2944         if (tvp != NULL)
2945                 ASSERT_VOP_IN_SEQC(tvp);
2946
2947         cache_purge(fvp);
2948         if (tvp != NULL) {
2949                 cache_purge(tvp);
2950                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2951                     ("%s: lingering negative entry", __func__));
2952         } else {
2953                 cache_remove_cnp(tdvp, tcnp);
2954         }
2955
2956         /*
2957          * TODO
2958          *
2959          * Historically renaming was always purging all revelang entries,
2960          * but that's quite wasteful. In particular turns out that in many cases
2961          * the target file is immediately accessed after rename, inducing a cache
2962          * miss.
2963          *
2964          * Recode this to reduce relocking and reuse the existing entry (if any)
2965          * instead of just removing it above and allocating a new one here.
2966          */
2967         if (cache_rename_add) {
2968                 cache_enter(tdvp, fvp, tcnp);
2969         }
2970 }
2971
2972 void
2973 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2974 {
2975
2976         ASSERT_VOP_IN_SEQC(dvp);
2977         ASSERT_VOP_IN_SEQC(vp);
2978         cache_purge(vp);
2979 }
2980
2981 #ifdef INVARIANTS
2982 /*
2983  * Validate that if an entry exists it matches.
2984  */
2985 void
2986 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2987 {
2988         struct namecache *ncp;
2989         struct mtx *blp;
2990         uint32_t hash;
2991
2992         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2993         if (CK_SLIST_EMPTY(NCHHASH(hash)))
2994                 return;
2995         blp = HASH2BUCKETLOCK(hash);
2996         mtx_lock(blp);
2997         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2998                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2999                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
3000                         if (ncp->nc_vp != vp)
3001                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3002                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3003                 }
3004         }
3005         mtx_unlock(blp);
3006 }
3007 #endif
3008
3009 /*
3010  * Flush all entries referencing a particular filesystem.
3011  */
3012 void
3013 cache_purgevfs(struct mount *mp)
3014 {
3015         struct vnode *vp, *mvp;
3016         size_t visited, purged;
3017
3018         visited = purged = 0;
3019         /*
3020          * Somewhat wasteful iteration over all vnodes. Would be better to
3021          * support filtering and avoid the interlock to begin with.
3022          */
3023         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3024                 visited++;
3025                 if (!cache_has_entries(vp)) {
3026                         VI_UNLOCK(vp);
3027                         continue;
3028                 }
3029                 vholdl(vp);
3030                 VI_UNLOCK(vp);
3031                 cache_purge(vp);
3032                 purged++;
3033                 vdrop(vp);
3034         }
3035
3036         SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3037 }
3038
3039 /*
3040  * Perform canonical checks and cache lookup and pass on to filesystem
3041  * through the vop_cachedlookup only if needed.
3042  */
3043
3044 int
3045 vfs_cache_lookup(struct vop_lookup_args *ap)
3046 {
3047         struct vnode *dvp;
3048         int error;
3049         struct vnode **vpp = ap->a_vpp;
3050         struct componentname *cnp = ap->a_cnp;
3051         int flags = cnp->cn_flags;
3052
3053         *vpp = NULL;
3054         dvp = ap->a_dvp;
3055
3056         if (dvp->v_type != VDIR)
3057                 return (ENOTDIR);
3058
3059         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3060             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3061                 return (EROFS);
3062
3063         error = vn_dir_check_exec(dvp, cnp);
3064         if (error != 0)
3065                 return (error);
3066
3067         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3068         if (error == 0)
3069                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3070         if (error == -1)
3071                 return (0);
3072         return (error);
3073 }
3074
3075 /* Implementation of the getcwd syscall. */
3076 int
3077 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3078 {
3079         char *buf, *retbuf;
3080         size_t buflen;
3081         int error;
3082
3083         buflen = uap->buflen;
3084         if (__predict_false(buflen < 2))
3085                 return (EINVAL);
3086         if (buflen > MAXPATHLEN)
3087                 buflen = MAXPATHLEN;
3088
3089         buf = uma_zalloc(namei_zone, M_WAITOK);
3090         error = vn_getcwd(buf, &retbuf, &buflen);
3091         if (error == 0)
3092                 error = copyout(retbuf, uap->buf, buflen);
3093         uma_zfree(namei_zone, buf);
3094         return (error);
3095 }
3096
3097 int
3098 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3099 {
3100         struct pwd *pwd;
3101         int error;
3102
3103         vfs_smr_enter();
3104         pwd = pwd_get_smr();
3105         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3106             buflen, 0);
3107         VFS_SMR_ASSERT_NOT_ENTERED();
3108         if (error < 0) {
3109                 pwd = pwd_hold(curthread);
3110                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3111                     retbuf, buflen);
3112                 pwd_drop(pwd);
3113         }
3114
3115 #ifdef KTRACE
3116         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3117                 ktrnamei(*retbuf);
3118 #endif
3119         return (error);
3120 }
3121
3122 static int
3123 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3124     size_t size, int flags, enum uio_seg pathseg)
3125 {
3126         struct nameidata nd;
3127         char *retbuf, *freebuf;
3128         int error;
3129
3130         if (flags != 0)
3131                 return (EINVAL);
3132         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
3133             pathseg, path, fd, &cap_fstat_rights, td);
3134         if ((error = namei(&nd)) != 0)
3135                 return (error);
3136
3137         if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3138             (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3139                 /*
3140                  * This happens if vp is a file mount. The call to
3141                  * vn_fullpath_hardlink can panic if path resolution can't be
3142                  * handled without the directory.
3143                  *
3144                  * To resolve this, we find the vnode which was mounted on -
3145                  * this should have a unique global path since we disallow
3146                  * mounting on linked files.
3147                  */
3148                 struct vnode *covered_vp;
3149                 error = vn_lock(nd.ni_vp, LK_SHARED);
3150                 if (error != 0)
3151                         goto out;
3152                 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3153                 vref(covered_vp);
3154                 VOP_UNLOCK(nd.ni_vp);
3155                 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3156                 vrele(covered_vp);
3157         } else {
3158                 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3159                     nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3160         }
3161         if (error == 0) {
3162                 error = copyout(retbuf, buf, size);
3163                 free(freebuf, M_TEMP);
3164         }
3165 out:
3166         NDFREE(&nd, 0);
3167         return (error);
3168 }
3169
3170 int
3171 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3172 {
3173
3174         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3175             uap->flags, UIO_USERSPACE));
3176 }
3177
3178 /*
3179  * Retrieve the full filesystem path that correspond to a vnode from the name
3180  * cache (if available)
3181  */
3182 int
3183 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3184 {
3185         struct pwd *pwd;
3186         char *buf;
3187         size_t buflen;
3188         int error;
3189
3190         if (__predict_false(vp == NULL))
3191                 return (EINVAL);
3192
3193         buflen = MAXPATHLEN;
3194         buf = malloc(buflen, M_TEMP, M_WAITOK);
3195         vfs_smr_enter();
3196         pwd = pwd_get_smr();
3197         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3198         VFS_SMR_ASSERT_NOT_ENTERED();
3199         if (error < 0) {
3200                 pwd = pwd_hold(curthread);
3201                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3202                 pwd_drop(pwd);
3203         }
3204         if (error == 0)
3205                 *freebuf = buf;
3206         else
3207                 free(buf, M_TEMP);
3208         return (error);
3209 }
3210
3211 /*
3212  * This function is similar to vn_fullpath, but it attempts to lookup the
3213  * pathname relative to the global root mount point.  This is required for the
3214  * auditing sub-system, as audited pathnames must be absolute, relative to the
3215  * global root mount point.
3216  */
3217 int
3218 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3219 {
3220         char *buf;
3221         size_t buflen;
3222         int error;
3223
3224         if (__predict_false(vp == NULL))
3225                 return (EINVAL);
3226         buflen = MAXPATHLEN;
3227         buf = malloc(buflen, M_TEMP, M_WAITOK);
3228         vfs_smr_enter();
3229         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3230         VFS_SMR_ASSERT_NOT_ENTERED();
3231         if (error < 0) {
3232                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3233         }
3234         if (error == 0)
3235                 *freebuf = buf;
3236         else
3237                 free(buf, M_TEMP);
3238         return (error);
3239 }
3240
3241 static struct namecache *
3242 vn_dd_from_dst(struct vnode *vp)
3243 {
3244         struct namecache *ncp;
3245
3246         cache_assert_vnode_locked(vp);
3247         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3248                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3249                         return (ncp);
3250         }
3251         return (NULL);
3252 }
3253
3254 int
3255 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
3256 {
3257         struct vnode *dvp;
3258         struct namecache *ncp;
3259         struct mtx *vlp;
3260         int error;
3261
3262         vlp = VP2VNODELOCK(*vp);
3263         mtx_lock(vlp);
3264         ncp = (*vp)->v_cache_dd;
3265         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
3266                 KASSERT(ncp == vn_dd_from_dst(*vp),
3267                     ("%s: mismatch for dd entry (%p != %p)", __func__,
3268                     ncp, vn_dd_from_dst(*vp)));
3269         } else {
3270                 ncp = vn_dd_from_dst(*vp);
3271         }
3272         if (ncp != NULL) {
3273                 if (*buflen < ncp->nc_nlen) {
3274                         mtx_unlock(vlp);
3275                         vrele(*vp);
3276                         counter_u64_add(numfullpathfail4, 1);
3277                         error = ENOMEM;
3278                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3279                             vp, NULL);
3280                         return (error);
3281                 }
3282                 *buflen -= ncp->nc_nlen;
3283                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3284                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
3285                     ncp->nc_name, vp);
3286                 dvp = *vp;
3287                 *vp = ncp->nc_dvp;
3288                 vref(*vp);
3289                 mtx_unlock(vlp);
3290                 vrele(dvp);
3291                 return (0);
3292         }
3293         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
3294
3295         mtx_unlock(vlp);
3296         vn_lock(*vp, LK_SHARED | LK_RETRY);
3297         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
3298         vput(*vp);
3299         if (error) {
3300                 counter_u64_add(numfullpathfail2, 1);
3301                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
3302                 return (error);
3303         }
3304
3305         *vp = dvp;
3306         if (VN_IS_DOOMED(dvp)) {
3307                 /* forced unmount */
3308                 vrele(dvp);
3309                 error = ENOENT;
3310                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
3311                 return (error);
3312         }
3313         /*
3314          * *vp has its use count incremented still.
3315          */
3316
3317         return (0);
3318 }
3319
3320 /*
3321  * Resolve a directory to a pathname.
3322  *
3323  * The name of the directory can always be found in the namecache or fetched
3324  * from the filesystem. There is also guaranteed to be only one parent, meaning
3325  * we can just follow vnodes up until we find the root.
3326  *
3327  * The vnode must be referenced.
3328  */
3329 static int
3330 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3331     size_t *len, size_t addend)
3332 {
3333 #ifdef KDTRACE_HOOKS
3334         struct vnode *startvp = vp;
3335 #endif
3336         struct vnode *vp1;
3337         size_t buflen;
3338         int error;
3339         bool slash_prefixed;
3340
3341         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3342         VNPASS(vp->v_usecount > 0, vp);
3343
3344         buflen = *len;
3345
3346         slash_prefixed = true;
3347         if (addend == 0) {
3348                 MPASS(*len >= 2);
3349                 buflen--;
3350                 buf[buflen] = '\0';
3351                 slash_prefixed = false;
3352         }
3353
3354         error = 0;
3355
3356         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3357         counter_u64_add(numfullpathcalls, 1);
3358         while (vp != rdir && vp != rootvnode) {
3359                 /*
3360                  * The vp vnode must be already fully constructed,
3361                  * since it is either found in namecache or obtained
3362                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3363                  * without obtaining the vnode lock.
3364                  */
3365                 if ((vp->v_vflag & VV_ROOT) != 0) {
3366                         vn_lock(vp, LK_RETRY | LK_SHARED);
3367
3368                         /*
3369                          * With the vnode locked, check for races with
3370                          * unmount, forced or not.  Note that we
3371                          * already verified that vp is not equal to
3372                          * the root vnode, which means that
3373                          * mnt_vnodecovered can be NULL only for the
3374                          * case of unmount.
3375                          */
3376                         if (VN_IS_DOOMED(vp) ||
3377                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3378                             vp1->v_mountedhere != vp->v_mount) {
3379                                 vput(vp);
3380                                 error = ENOENT;
3381                                 SDT_PROBE3(vfs, namecache, fullpath, return,
3382                                     error, vp, NULL);
3383                                 break;
3384                         }
3385
3386                         vref(vp1);
3387                         vput(vp);
3388                         vp = vp1;
3389                         continue;
3390                 }
3391                 if (vp->v_type != VDIR) {
3392                         vrele(vp);
3393                         counter_u64_add(numfullpathfail1, 1);
3394                         error = ENOTDIR;
3395                         SDT_PROBE3(vfs, namecache, fullpath, return,
3396                             error, vp, NULL);
3397                         break;
3398                 }
3399                 error = vn_vptocnp(&vp, buf, &buflen);
3400                 if (error)
3401                         break;
3402                 if (buflen == 0) {
3403                         vrele(vp);
3404                         error = ENOMEM;
3405                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3406                             startvp, NULL);
3407                         break;
3408                 }
3409                 buf[--buflen] = '/';
3410                 slash_prefixed = true;
3411         }
3412         if (error)
3413                 return (error);
3414         if (!slash_prefixed) {
3415                 if (buflen == 0) {
3416                         vrele(vp);
3417                         counter_u64_add(numfullpathfail4, 1);
3418                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3419                             startvp, NULL);
3420                         return (ENOMEM);
3421                 }
3422                 buf[--buflen] = '/';
3423         }
3424         counter_u64_add(numfullpathfound, 1);
3425         vrele(vp);
3426
3427         *retbuf = buf + buflen;
3428         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3429         *len -= buflen;
3430         *len += addend;
3431         return (0);
3432 }
3433
3434 /*
3435  * Resolve an arbitrary vnode to a pathname.
3436  *
3437  * Note 2 caveats:
3438  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3439  *   resolve to a different path than the one used to find it
3440  * - namecache is not mandatory, meaning names are not guaranteed to be added
3441  *   (in which case resolving fails)
3442  */
3443 static void __inline
3444 cache_rev_failed_impl(int *reason, int line)
3445 {
3446
3447         *reason = line;
3448 }
3449 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3450
3451 static int
3452 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3453     char **retbuf, size_t *buflen, size_t addend)
3454 {
3455 #ifdef KDTRACE_HOOKS
3456         struct vnode *startvp = vp;
3457 #endif
3458         struct vnode *tvp;
3459         struct mount *mp;
3460         struct namecache *ncp;
3461         size_t orig_buflen;
3462         int reason;
3463         int error;
3464 #ifdef KDTRACE_HOOKS
3465         int i;
3466 #endif
3467         seqc_t vp_seqc, tvp_seqc;
3468         u_char nc_flag;
3469
3470         VFS_SMR_ASSERT_ENTERED();
3471
3472         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3473                 vfs_smr_exit();
3474                 return (-1);
3475         }
3476
3477         orig_buflen = *buflen;
3478
3479         if (addend == 0) {
3480                 MPASS(*buflen >= 2);
3481                 *buflen -= 1;
3482                 buf[*buflen] = '\0';
3483         }
3484
3485         if (vp == rdir || vp == rootvnode) {
3486                 if (addend == 0) {
3487                         *buflen -= 1;
3488                         buf[*buflen] = '/';
3489                 }
3490                 goto out_ok;
3491         }
3492
3493 #ifdef KDTRACE_HOOKS
3494         i = 0;
3495 #endif
3496         error = -1;
3497         ncp = NULL; /* for sdt probe down below */
3498         vp_seqc = vn_seqc_read_any(vp);
3499         if (seqc_in_modify(vp_seqc)) {
3500                 cache_rev_failed(&reason);
3501                 goto out_abort;
3502         }
3503
3504         for (;;) {
3505 #ifdef KDTRACE_HOOKS
3506                 i++;
3507 #endif
3508                 if ((vp->v_vflag & VV_ROOT) != 0) {
3509                         mp = atomic_load_ptr(&vp->v_mount);
3510                         if (mp == NULL) {
3511                                 cache_rev_failed(&reason);
3512                                 goto out_abort;
3513                         }
3514                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3515                         tvp_seqc = vn_seqc_read_any(tvp);
3516                         if (seqc_in_modify(tvp_seqc)) {
3517                                 cache_rev_failed(&reason);
3518                                 goto out_abort;
3519                         }
3520                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3521                                 cache_rev_failed(&reason);
3522                                 goto out_abort;
3523                         }
3524                         vp = tvp;
3525                         vp_seqc = tvp_seqc;
3526                         continue;
3527                 }
3528                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
3529                 if (ncp == NULL) {
3530                         cache_rev_failed(&reason);
3531                         goto out_abort;
3532                 }
3533                 nc_flag = atomic_load_char(&ncp->nc_flag);
3534                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3535                         cache_rev_failed(&reason);
3536                         goto out_abort;
3537                 }
3538                 if (ncp->nc_nlen >= *buflen) {
3539                         cache_rev_failed(&reason);
3540                         error = ENOMEM;
3541                         goto out_abort;
3542                 }
3543                 *buflen -= ncp->nc_nlen;
3544                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3545                 *buflen -= 1;
3546                 buf[*buflen] = '/';
3547                 tvp = ncp->nc_dvp;
3548                 tvp_seqc = vn_seqc_read_any(tvp);
3549                 if (seqc_in_modify(tvp_seqc)) {
3550                         cache_rev_failed(&reason);
3551                         goto out_abort;
3552                 }
3553                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3554                         cache_rev_failed(&reason);
3555                         goto out_abort;
3556                 }
3557                 /*
3558                  * Acquire fence provided by vn_seqc_read_any above.
3559                  */
3560                 if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
3561                         cache_rev_failed(&reason);
3562                         goto out_abort;
3563                 }
3564                 if (!cache_ncp_canuse(ncp)) {
3565                         cache_rev_failed(&reason);
3566                         goto out_abort;
3567                 }
3568                 vp = tvp;
3569                 vp_seqc = tvp_seqc;
3570                 if (vp == rdir || vp == rootvnode)
3571                         break;
3572         }
3573 out_ok:
3574         vfs_smr_exit();
3575         *retbuf = buf + *buflen;
3576         *buflen = orig_buflen - *buflen + addend;
3577         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3578         return (0);
3579
3580 out_abort:
3581         *buflen = orig_buflen;
3582         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3583         vfs_smr_exit();
3584         return (error);
3585 }
3586
3587 static int
3588 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3589     size_t *buflen)
3590 {
3591         size_t orig_buflen, addend;
3592         int error;
3593
3594         if (*buflen < 2)
3595                 return (EINVAL);
3596
3597         orig_buflen = *buflen;
3598
3599         vref(vp);
3600         addend = 0;
3601         if (vp->v_type != VDIR) {
3602                 *buflen -= 1;
3603                 buf[*buflen] = '\0';
3604                 error = vn_vptocnp(&vp, buf, buflen);
3605                 if (error)
3606                         return (error);
3607                 if (*buflen == 0) {
3608                         vrele(vp);
3609                         return (ENOMEM);
3610                 }
3611                 *buflen -= 1;
3612                 buf[*buflen] = '/';
3613                 addend = orig_buflen - *buflen;
3614         }
3615
3616         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3617 }
3618
3619 /*
3620  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3621  *
3622  * Since the namecache does not track hardlinks, the caller is
3623  * expected to first look up the target vnode with SAVENAME |
3624  * WANTPARENT flags passed to namei to get dvp and vp.
3625  *
3626  * Then we have 2 cases:
3627  * - if the found vnode is a directory, the path can be constructed just by
3628  *   following names up the chain
3629  * - otherwise we populate the buffer with the saved name and start resolving
3630  *   from the parent
3631  */
3632 int
3633 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
3634     const char *hrdl_name, size_t hrdl_name_length,
3635     char **retbuf, char **freebuf, size_t *buflen)
3636 {
3637         char *buf, *tmpbuf;
3638         struct pwd *pwd;
3639         size_t addend;
3640         int error;
3641         enum vtype type;
3642
3643         if (*buflen < 2)
3644                 return (EINVAL);
3645         if (*buflen > MAXPATHLEN)
3646                 *buflen = MAXPATHLEN;
3647
3648         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3649
3650         addend = 0;
3651
3652         /*
3653          * Check for VBAD to work around the vp_crossmp bug in lookup().
3654          *
3655          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3656          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3657          * If the type is VDIR (like in this very case) we can skip looking
3658          * at ni_dvp in the first place. However, since vnodes get passed here
3659          * unlocked the target may transition to doomed state (type == VBAD)
3660          * before we get to evaluate the condition. If this happens, we will
3661          * populate part of the buffer and descend to vn_fullpath_dir with
3662          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3663          *
3664          * This should be atomic_load(&vp->v_type) but it is illegal to take
3665          * an address of a bit field, even if said field is sized to char.
3666          * Work around the problem by reading the value into a full-sized enum
3667          * and then re-reading it with atomic_load which will still prevent
3668          * the compiler from re-reading down the road.
3669          */
3670         type = vp->v_type;
3671         type = atomic_load_int(&type);
3672         if (type == VBAD) {
3673                 error = ENOENT;
3674                 goto out_bad;
3675         }
3676         if (type != VDIR) {
3677                 addend = hrdl_name_length + 2;
3678                 if (*buflen < addend) {
3679                         error = ENOMEM;
3680                         goto out_bad;
3681                 }
3682                 *buflen -= addend;
3683                 tmpbuf = buf + *buflen;
3684                 tmpbuf[0] = '/';
3685                 memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
3686                 tmpbuf[addend - 1] = '\0';
3687                 vp = dvp;
3688         }
3689
3690         vfs_smr_enter();
3691         pwd = pwd_get_smr();
3692         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3693             addend);
3694         VFS_SMR_ASSERT_NOT_ENTERED();
3695         if (error < 0) {
3696                 pwd = pwd_hold(curthread);
3697                 vref(vp);
3698                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3699                     addend);
3700                 pwd_drop(pwd);
3701         }
3702         if (error != 0)
3703                 goto out_bad;
3704
3705         *freebuf = buf;
3706
3707         return (0);
3708 out_bad:
3709         free(buf, M_TEMP);
3710         return (error);
3711 }
3712
3713 struct vnode *
3714 vn_dir_dd_ino(struct vnode *vp)
3715 {
3716         struct namecache *ncp;
3717         struct vnode *ddvp;
3718         struct mtx *vlp;
3719         enum vgetstate vs;
3720
3721         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3722         vlp = VP2VNODELOCK(vp);
3723         mtx_lock(vlp);
3724         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3725                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3726                         continue;
3727                 ddvp = ncp->nc_dvp;
3728                 vs = vget_prep(ddvp);
3729                 mtx_unlock(vlp);
3730                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3731                         return (NULL);
3732                 return (ddvp);
3733         }
3734         mtx_unlock(vlp);
3735         return (NULL);
3736 }
3737
3738 int
3739 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3740 {
3741         struct namecache *ncp;
3742         struct mtx *vlp;
3743         int l;
3744
3745         vlp = VP2VNODELOCK(vp);
3746         mtx_lock(vlp);
3747         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3748                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3749                         break;
3750         if (ncp == NULL) {
3751                 mtx_unlock(vlp);
3752                 return (ENOENT);
3753         }
3754         l = min(ncp->nc_nlen, buflen - 1);
3755         memcpy(buf, ncp->nc_name, l);
3756         mtx_unlock(vlp);
3757         buf[l] = '\0';
3758         return (0);
3759 }
3760
3761 /*
3762  * This function updates path string to vnode's full global path
3763  * and checks the size of the new path string against the pathlen argument.
3764  *
3765  * Requires a locked, referenced vnode.
3766  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3767  *
3768  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3769  * because it falls back to the ".." lookup if the namecache lookup fails.
3770  */
3771 int
3772 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3773     u_int pathlen)
3774 {
3775         struct nameidata nd;
3776         struct vnode *vp1;
3777         char *rpath, *fbuf;
3778         int error;
3779
3780         ASSERT_VOP_ELOCKED(vp, __func__);
3781
3782         /* Construct global filesystem path from vp. */
3783         VOP_UNLOCK(vp);
3784         error = vn_fullpath_global(vp, &rpath, &fbuf);
3785
3786         if (error != 0) {
3787                 vrele(vp);
3788                 return (error);
3789         }
3790
3791         if (strlen(rpath) >= pathlen) {
3792                 vrele(vp);
3793                 error = ENAMETOOLONG;
3794                 goto out;
3795         }
3796
3797         /*
3798          * Re-lookup the vnode by path to detect a possible rename.
3799          * As a side effect, the vnode is relocked.
3800          * If vnode was renamed, return ENOENT.
3801          */
3802         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3803             UIO_SYSSPACE, path, td);
3804         error = namei(&nd);
3805         if (error != 0) {
3806                 vrele(vp);
3807                 goto out;
3808         }
3809         NDFREE(&nd, NDF_ONLY_PNBUF);
3810         vp1 = nd.ni_vp;
3811         vrele(vp);
3812         if (vp1 == vp)
3813                 strcpy(path, rpath);
3814         else {
3815                 vput(vp1);
3816                 error = ENOENT;
3817         }
3818
3819 out:
3820         free(fbuf, M_TEMP);
3821         return (error);
3822 }
3823
3824 /*
3825  * This is similar to vn_path_to_global_path but allows for regular
3826  * files which may not be present in the cache.
3827  *
3828  * Requires a locked, referenced vnode.
3829  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3830  */
3831 int
3832 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3833     struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3834     size_t leaf_length)
3835 {
3836         struct nameidata nd;
3837         struct vnode *vp1;
3838         char *rpath, *fbuf;
3839         size_t len;
3840         int error;
3841
3842         ASSERT_VOP_ELOCKED(vp, __func__);
3843
3844         /*
3845          * Construct global filesystem path from dvp, vp and leaf
3846          * name.
3847          */
3848         VOP_UNLOCK(vp);
3849         error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3850             &rpath, &fbuf, &len);
3851
3852         if (error != 0) {
3853                 vrele(vp);
3854                 goto out;
3855         }
3856
3857         if (strlen(rpath) >= pathlen) {
3858                 vrele(vp);
3859                 error = ENAMETOOLONG;
3860                 goto out;
3861         }
3862
3863         /*
3864          * Re-lookup the vnode by path to detect a possible rename.
3865          * As a side effect, the vnode is relocked.
3866          * If vnode was renamed, return ENOENT.
3867          */
3868         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td);
3869         error = namei(&nd);
3870         if (error != 0) {
3871                 vrele(vp);
3872                 goto out;
3873         }
3874         NDFREE_PNBUF(&nd);
3875         vp1 = nd.ni_vp;
3876         vrele(vp);
3877         if (vp1 == vp)
3878                 strcpy(path, rpath);
3879         else {
3880                 vput(vp1);
3881                 error = ENOENT;
3882         }
3883
3884 out:
3885         free(fbuf, M_TEMP);
3886         return (error);
3887 }
3888
3889 #ifdef DDB
3890 static void
3891 db_print_vpath(struct vnode *vp)
3892 {
3893
3894         while (vp != NULL) {
3895                 db_printf("%p: ", vp);
3896                 if (vp == rootvnode) {
3897                         db_printf("/");
3898                         vp = NULL;
3899                 } else {
3900                         if (vp->v_vflag & VV_ROOT) {
3901                                 db_printf("<mount point>");
3902                                 vp = vp->v_mount->mnt_vnodecovered;
3903                         } else {
3904                                 struct namecache *ncp;
3905                                 char *ncn;
3906                                 int i;
3907
3908                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3909                                 if (ncp != NULL) {
3910                                         ncn = ncp->nc_name;
3911                                         for (i = 0; i < ncp->nc_nlen; i++)
3912                                                 db_printf("%c", *ncn++);
3913                                         vp = ncp->nc_dvp;
3914                                 } else {
3915                                         vp = NULL;
3916                                 }
3917                         }
3918                 }
3919                 db_printf("\n");
3920         }
3921
3922         return;
3923 }
3924
3925 DB_SHOW_COMMAND(vpath, db_show_vpath)
3926 {
3927         struct vnode *vp;
3928
3929         if (!have_addr) {
3930                 db_printf("usage: show vpath <struct vnode *>\n");
3931                 return;
3932         }
3933
3934         vp = (struct vnode *)addr;
3935         db_print_vpath(vp);
3936 }
3937
3938 #endif
3939
3940 static int cache_fast_lookup = 1;
3941
3942 #define CACHE_FPL_FAILED        -2020
3943
3944 void
3945 cache_fast_lookup_enabled_recalc(void)
3946 {
3947         int lookup_flag;
3948         int mac_on;
3949
3950 #ifdef MAC
3951         mac_on = mac_vnode_check_lookup_enabled();
3952         mac_on |= mac_vnode_check_readlink_enabled();
3953 #else
3954         mac_on = 0;
3955 #endif
3956
3957         lookup_flag = atomic_load_int(&cache_fast_lookup);
3958         if (lookup_flag && !mac_on) {
3959                 atomic_store_char(&cache_fast_lookup_enabled, true);
3960         } else {
3961                 atomic_store_char(&cache_fast_lookup_enabled, false);
3962         }
3963 }
3964
3965 static int
3966 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
3967 {
3968         int error, old;
3969
3970         old = atomic_load_int(&cache_fast_lookup);
3971         error = sysctl_handle_int(oidp, arg1, arg2, req);
3972         if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
3973                 cache_fast_lookup_enabled_recalc();
3974         return (error);
3975 }
3976 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
3977     &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
3978
3979 /*
3980  * Components of nameidata (or objects it can point to) which may
3981  * need restoring in case fast path lookup fails.
3982  */
3983 struct nameidata_outer {
3984         size_t ni_pathlen;
3985         int cn_flags;
3986 };
3987
3988 struct nameidata_saved {
3989 #ifdef INVARIANTS
3990         char *cn_nameptr;
3991         size_t ni_pathlen;
3992 #endif
3993 };
3994
3995 #ifdef INVARIANTS
3996 struct cache_fpl_debug {
3997         size_t ni_pathlen;
3998 };
3999 #endif
4000
4001 struct cache_fpl {
4002         struct nameidata *ndp;
4003         struct componentname *cnp;
4004         char *nulchar;
4005         struct vnode *dvp;
4006         struct vnode *tvp;
4007         seqc_t dvp_seqc;
4008         seqc_t tvp_seqc;
4009         uint32_t hash;
4010         struct nameidata_saved snd;
4011         struct nameidata_outer snd_outer;
4012         int line;
4013         enum cache_fpl_status status:8;
4014         bool in_smr;
4015         bool fsearch;
4016         bool savename;
4017         struct pwd **pwd;
4018 #ifdef INVARIANTS
4019         struct cache_fpl_debug debug;
4020 #endif
4021 };
4022
4023 static bool cache_fplookup_mp_supported(struct mount *mp);
4024 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
4025 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
4026 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
4027 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
4028 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
4029 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
4030 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
4031 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
4032 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4033
4034 static void
4035 cache_fpl_cleanup_cnp(struct componentname *cnp)
4036 {
4037
4038         uma_zfree(namei_zone, cnp->cn_pnbuf);
4039 #ifdef DIAGNOSTIC
4040         cnp->cn_pnbuf = NULL;
4041         cnp->cn_nameptr = NULL;
4042 #endif
4043 }
4044
4045 static struct vnode *
4046 cache_fpl_handle_root(struct cache_fpl *fpl)
4047 {
4048         struct nameidata *ndp;
4049         struct componentname *cnp;
4050
4051         ndp = fpl->ndp;
4052         cnp = fpl->cnp;
4053
4054         MPASS(*(cnp->cn_nameptr) == '/');
4055         cnp->cn_nameptr++;
4056         cache_fpl_pathlen_dec(fpl);
4057
4058         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4059                 do {
4060                         cnp->cn_nameptr++;
4061                         cache_fpl_pathlen_dec(fpl);
4062                 } while (*(cnp->cn_nameptr) == '/');
4063         }
4064
4065         return (ndp->ni_rootdir);
4066 }
4067
4068 static void
4069 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4070 {
4071
4072         fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4073         fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4074 }
4075
4076 static void
4077 cache_fpl_checkpoint(struct cache_fpl *fpl)
4078 {
4079
4080 #ifdef INVARIANTS
4081         fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
4082         fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
4083 #endif
4084 }
4085
4086 static void
4087 cache_fpl_restore_partial(struct cache_fpl *fpl)
4088 {
4089
4090         fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
4091 #ifdef INVARIANTS
4092         fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
4093 #endif
4094 }
4095
4096 static void
4097 cache_fpl_restore_abort(struct cache_fpl *fpl)
4098 {
4099
4100         cache_fpl_restore_partial(fpl);
4101         /*
4102          * It is 0 on entry by API contract.
4103          */
4104         fpl->ndp->ni_resflags = 0;
4105         fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4106         fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4107 }
4108
4109 #ifdef INVARIANTS
4110 #define cache_fpl_smr_assert_entered(fpl) ({                    \
4111         struct cache_fpl *_fpl = (fpl);                         \
4112         MPASS(_fpl->in_smr == true);                            \
4113         VFS_SMR_ASSERT_ENTERED();                               \
4114 })
4115 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
4116         struct cache_fpl *_fpl = (fpl);                         \
4117         MPASS(_fpl->in_smr == false);                           \
4118         VFS_SMR_ASSERT_NOT_ENTERED();                           \
4119 })
4120 static void
4121 cache_fpl_assert_status(struct cache_fpl *fpl)
4122 {
4123
4124         switch (fpl->status) {
4125         case CACHE_FPL_STATUS_UNSET:
4126                 __assert_unreachable();
4127                 break;
4128         case CACHE_FPL_STATUS_DESTROYED:
4129         case CACHE_FPL_STATUS_ABORTED:
4130         case CACHE_FPL_STATUS_PARTIAL:
4131         case CACHE_FPL_STATUS_HANDLED:
4132                 break;
4133         }
4134 }
4135 #else
4136 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
4137 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
4138 #define cache_fpl_assert_status(fpl) do { } while (0)
4139 #endif
4140
4141 #define cache_fpl_smr_enter_initial(fpl) ({                     \
4142         struct cache_fpl *_fpl = (fpl);                         \
4143         vfs_smr_enter();                                        \
4144         _fpl->in_smr = true;                                    \
4145 })
4146
4147 #define cache_fpl_smr_enter(fpl) ({                             \
4148         struct cache_fpl *_fpl = (fpl);                         \
4149         MPASS(_fpl->in_smr == false);                           \
4150         vfs_smr_enter();                                        \
4151         _fpl->in_smr = true;                                    \
4152 })
4153
4154 #define cache_fpl_smr_exit(fpl) ({                              \
4155         struct cache_fpl *_fpl = (fpl);                         \
4156         MPASS(_fpl->in_smr == true);                            \
4157         vfs_smr_exit();                                         \
4158         _fpl->in_smr = false;                                   \
4159 })
4160
4161 static int
4162 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4163 {
4164
4165         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4166                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4167                     ("%s: converting to abort from %d at %d, set at %d\n",
4168                     __func__, fpl->status, line, fpl->line));
4169         }
4170         cache_fpl_smr_assert_not_entered(fpl);
4171         fpl->status = CACHE_FPL_STATUS_ABORTED;
4172         fpl->line = line;
4173         return (CACHE_FPL_FAILED);
4174 }
4175
4176 #define cache_fpl_aborted_early(x)      cache_fpl_aborted_early_impl((x), __LINE__)
4177
4178 static int __noinline
4179 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4180 {
4181         struct nameidata *ndp;
4182         struct componentname *cnp;
4183
4184         ndp = fpl->ndp;
4185         cnp = fpl->cnp;
4186
4187         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4188                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4189                     ("%s: converting to abort from %d at %d, set at %d\n",
4190                     __func__, fpl->status, line, fpl->line));
4191         }
4192         fpl->status = CACHE_FPL_STATUS_ABORTED;
4193         fpl->line = line;
4194         if (fpl->in_smr)
4195                 cache_fpl_smr_exit(fpl);
4196         cache_fpl_restore_abort(fpl);
4197         /*
4198          * Resolving symlinks overwrites data passed by the caller.
4199          * Let namei know.
4200          */
4201         if (ndp->ni_loopcnt > 0) {
4202                 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4203                 cache_fpl_cleanup_cnp(cnp);
4204         }
4205         return (CACHE_FPL_FAILED);
4206 }
4207
4208 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
4209
4210 static int __noinline
4211 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4212 {
4213
4214         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4215             ("%s: setting to partial at %d, but already set to %d at %d\n",
4216             __func__, line, fpl->status, fpl->line));
4217         cache_fpl_smr_assert_entered(fpl);
4218         fpl->status = CACHE_FPL_STATUS_PARTIAL;
4219         fpl->line = line;
4220         return (cache_fplookup_partial_setup(fpl));
4221 }
4222
4223 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
4224
4225 static int
4226 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4227 {
4228
4229         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4230             ("%s: setting to handled at %d, but already set to %d at %d\n",
4231             __func__, line, fpl->status, fpl->line));
4232         cache_fpl_smr_assert_not_entered(fpl);
4233         fpl->status = CACHE_FPL_STATUS_HANDLED;
4234         fpl->line = line;
4235         return (0);
4236 }
4237
4238 #define cache_fpl_handled(x)    cache_fpl_handled_impl((x), __LINE__)
4239
4240 static int
4241 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4242 {
4243
4244         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4245             ("%s: setting to handled at %d, but already set to %d at %d\n",
4246             __func__, line, fpl->status, fpl->line));
4247         MPASS(error != 0);
4248         MPASS(error != CACHE_FPL_FAILED);
4249         cache_fpl_smr_assert_not_entered(fpl);
4250         fpl->status = CACHE_FPL_STATUS_HANDLED;
4251         fpl->line = line;
4252         fpl->dvp = NULL;
4253         fpl->tvp = NULL;
4254         fpl->savename = false;
4255         return (error);
4256 }
4257
4258 #define cache_fpl_handled_error(x, e)   cache_fpl_handled_error_impl((x), (e), __LINE__)
4259
4260 static bool
4261 cache_fpl_terminated(struct cache_fpl *fpl)
4262 {
4263
4264         return (fpl->status != CACHE_FPL_STATUS_UNSET);
4265 }
4266
4267 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4268         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4269          FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | SAVENAME | SAVESTART | \
4270          WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | \
4271          WANTIOCTLCAPS)
4272
4273 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4274         (ISDOTDOT | MAKEENTRY | ISLASTCN)
4275
4276 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4277     "supported and internal flags overlap");
4278
4279 static bool
4280 cache_fpl_islastcn(struct nameidata *ndp)
4281 {
4282
4283         return (*ndp->ni_next == 0);
4284 }
4285
4286 static bool
4287 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4288 {
4289
4290         MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4291         return (*(fpl->nulchar - 1) == '/');
4292 }
4293
4294 static bool
4295 cache_fpl_isdotdot(struct componentname *cnp)
4296 {
4297
4298         if (cnp->cn_namelen == 2 &&
4299             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4300                 return (true);
4301         return (false);
4302 }
4303
4304 static bool
4305 cache_can_fplookup(struct cache_fpl *fpl)
4306 {
4307         struct nameidata *ndp;
4308         struct componentname *cnp;
4309         struct thread *td;
4310
4311         ndp = fpl->ndp;
4312         cnp = fpl->cnp;
4313         td = cnp->cn_thread;
4314
4315         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4316                 cache_fpl_aborted_early(fpl);
4317                 return (false);
4318         }
4319         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4320                 cache_fpl_aborted_early(fpl);
4321                 return (false);
4322         }
4323         if (IN_CAPABILITY_MODE(td)) {
4324                 cache_fpl_aborted_early(fpl);
4325                 return (false);
4326         }
4327         if (AUDITING_TD(td)) {
4328                 cache_fpl_aborted_early(fpl);
4329                 return (false);
4330         }
4331         if (ndp->ni_startdir != NULL) {
4332                 cache_fpl_aborted_early(fpl);
4333                 return (false);
4334         }
4335         return (true);
4336 }
4337
4338 static int __noinline
4339 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4340 {
4341         struct nameidata *ndp;
4342         struct componentname *cnp;
4343         int error;
4344         bool fsearch;
4345
4346         ndp = fpl->ndp;
4347         cnp = fpl->cnp;
4348
4349         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4350         if (__predict_false(error != 0)) {
4351                 return (cache_fpl_aborted(fpl));
4352         }
4353         fpl->fsearch = fsearch;
4354         if ((*vpp)->v_type != VDIR) {
4355                 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4356                         cache_fpl_smr_exit(fpl);
4357                         return (cache_fpl_handled_error(fpl, ENOTDIR));
4358                 }
4359         }
4360         return (0);
4361 }
4362
4363 static int __noinline
4364 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4365     uint32_t hash)
4366 {
4367         struct componentname *cnp;
4368         struct vnode *dvp;
4369
4370         cnp = fpl->cnp;
4371         dvp = fpl->dvp;
4372
4373         cache_fpl_smr_exit(fpl);
4374         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4375                 return (cache_fpl_handled_error(fpl, ENOENT));
4376         else
4377                 return (cache_fpl_aborted(fpl));
4378 }
4379
4380 /*
4381  * The target vnode is not supported, prepare for the slow path to take over.
4382  */
4383 static int __noinline
4384 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4385 {
4386         struct nameidata *ndp;
4387         struct componentname *cnp;
4388         enum vgetstate dvs;
4389         struct vnode *dvp;
4390         struct pwd *pwd;
4391         seqc_t dvp_seqc;
4392
4393         ndp = fpl->ndp;
4394         cnp = fpl->cnp;
4395         pwd = *(fpl->pwd);
4396         dvp = fpl->dvp;
4397         dvp_seqc = fpl->dvp_seqc;
4398
4399         if (!pwd_hold_smr(pwd)) {
4400                 return (cache_fpl_aborted(fpl));
4401         }
4402
4403         /*
4404          * Note that seqc is checked before the vnode is locked, so by
4405          * the time regular lookup gets to it it may have moved.
4406          *
4407          * Ultimately this does not affect correctness, any lookup errors
4408          * are userspace racing with itself. It is guaranteed that any
4409          * path which ultimately gets found could also have been found
4410          * by regular lookup going all the way in absence of concurrent
4411          * modifications.
4412          */
4413         dvs = vget_prep_smr(dvp);
4414         cache_fpl_smr_exit(fpl);
4415         if (__predict_false(dvs == VGET_NONE)) {
4416                 pwd_drop(pwd);
4417                 return (cache_fpl_aborted(fpl));
4418         }
4419
4420         vget_finish_ref(dvp, dvs);
4421         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4422                 vrele(dvp);
4423                 pwd_drop(pwd);
4424                 return (cache_fpl_aborted(fpl));
4425         }
4426
4427         cache_fpl_restore_partial(fpl);
4428 #ifdef INVARIANTS
4429         if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4430                 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4431                     cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4432         }
4433 #endif
4434
4435         ndp->ni_startdir = dvp;
4436         cnp->cn_flags |= MAKEENTRY;
4437         if (cache_fpl_islastcn(ndp))
4438                 cnp->cn_flags |= ISLASTCN;
4439         if (cache_fpl_isdotdot(cnp))
4440                 cnp->cn_flags |= ISDOTDOT;
4441
4442         /*
4443          * Skip potential extra slashes parsing did not take care of.
4444          * cache_fplookup_skip_slashes explains the mechanism.
4445          */
4446         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4447                 do {
4448                         cnp->cn_nameptr++;
4449                         cache_fpl_pathlen_dec(fpl);
4450                 } while (*(cnp->cn_nameptr) == '/');
4451         }
4452
4453         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4454 #ifdef INVARIANTS
4455         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4456                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4457                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4458                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4459         }
4460 #endif
4461         return (0);
4462 }
4463
4464 static int
4465 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4466 {
4467         struct componentname *cnp;
4468         struct vnode *tvp;
4469         seqc_t tvp_seqc;
4470         int error, lkflags;
4471
4472         cnp = fpl->cnp;
4473         tvp = fpl->tvp;
4474         tvp_seqc = fpl->tvp_seqc;
4475
4476         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4477                 lkflags = LK_SHARED;
4478                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4479                         lkflags = LK_EXCLUSIVE;
4480                 error = vget_finish(tvp, lkflags, tvs);
4481                 if (__predict_false(error != 0)) {
4482                         return (cache_fpl_aborted(fpl));
4483                 }
4484         } else {
4485                 vget_finish_ref(tvp, tvs);
4486         }
4487
4488         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4489                 if ((cnp->cn_flags & LOCKLEAF) != 0)
4490                         vput(tvp);
4491                 else
4492                         vrele(tvp);
4493                 return (cache_fpl_aborted(fpl));
4494         }
4495
4496         return (cache_fpl_handled(fpl));
4497 }
4498
4499 /*
4500  * They want to possibly modify the state of the namecache.
4501  */
4502 static int __noinline
4503 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4504 {
4505         struct nameidata *ndp;
4506         struct componentname *cnp;
4507         enum vgetstate dvs;
4508         struct vnode *dvp, *tvp;
4509         struct mount *mp;
4510         seqc_t dvp_seqc;
4511         int error;
4512         bool docache;
4513
4514         ndp = fpl->ndp;
4515         cnp = fpl->cnp;
4516         dvp = fpl->dvp;
4517         dvp_seqc = fpl->dvp_seqc;
4518
4519         MPASS(*(cnp->cn_nameptr) != '/');
4520         MPASS(cache_fpl_islastcn(ndp));
4521         if ((cnp->cn_flags & LOCKPARENT) == 0)
4522                 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4523         MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4524         MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4525             cnp->cn_nameiop == RENAME);
4526         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4527         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4528
4529         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4530         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4531                 docache = false;
4532
4533         /*
4534          * Regular lookup nulifies the slash, which we don't do here.
4535          * Don't take chances with filesystem routines seeing it for
4536          * the last entry.
4537          */
4538         if (cache_fpl_istrailingslash(fpl)) {
4539                 return (cache_fpl_partial(fpl));
4540         }
4541
4542         mp = atomic_load_ptr(&dvp->v_mount);
4543         if (__predict_false(mp == NULL)) {
4544                 return (cache_fpl_aborted(fpl));
4545         }
4546
4547         if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4548                 cache_fpl_smr_exit(fpl);
4549                 /*
4550                  * Original code keeps not checking for CREATE which
4551                  * might be a bug. For now let the old lookup decide.
4552                  */
4553                 if (cnp->cn_nameiop == CREATE) {
4554                         return (cache_fpl_aborted(fpl));
4555                 }
4556                 return (cache_fpl_handled_error(fpl, EROFS));
4557         }
4558
4559         if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4560                 cache_fpl_smr_exit(fpl);
4561                 return (cache_fpl_handled_error(fpl, EEXIST));
4562         }
4563
4564         /*
4565          * Secure access to dvp; check cache_fplookup_partial_setup for
4566          * reasoning.
4567          *
4568          * XXX At least UFS requires its lookup routine to be called for
4569          * the last path component, which leads to some level of complication
4570          * and inefficiency:
4571          * - the target routine always locks the target vnode, but our caller
4572          *   may not need it locked
4573          * - some of the VOP machinery asserts that the parent is locked, which
4574          *   once more may be not required
4575          *
4576          * TODO: add a flag for filesystems which don't need this.
4577          */
4578         dvs = vget_prep_smr(dvp);
4579         cache_fpl_smr_exit(fpl);
4580         if (__predict_false(dvs == VGET_NONE)) {
4581                 return (cache_fpl_aborted(fpl));
4582         }
4583
4584         vget_finish_ref(dvp, dvs);
4585         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4586                 vrele(dvp);
4587                 return (cache_fpl_aborted(fpl));
4588         }
4589
4590         error = vn_lock(dvp, LK_EXCLUSIVE);
4591         if (__predict_false(error != 0)) {
4592                 vrele(dvp);
4593                 return (cache_fpl_aborted(fpl));
4594         }
4595
4596         tvp = NULL;
4597         cnp->cn_flags |= ISLASTCN;
4598         if (docache)
4599                 cnp->cn_flags |= MAKEENTRY;
4600         if (cache_fpl_isdotdot(cnp))
4601                 cnp->cn_flags |= ISDOTDOT;
4602         cnp->cn_lkflags = LK_EXCLUSIVE;
4603         error = VOP_LOOKUP(dvp, &tvp, cnp);
4604         switch (error) {
4605         case EJUSTRETURN:
4606         case 0:
4607                 break;
4608         case ENOTDIR:
4609         case ENOENT:
4610                 vput(dvp);
4611                 return (cache_fpl_handled_error(fpl, error));
4612         default:
4613                 vput(dvp);
4614                 return (cache_fpl_aborted(fpl));
4615         }
4616
4617         fpl->tvp = tvp;
4618         fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
4619
4620         if (tvp == NULL) {
4621                 if ((cnp->cn_flags & SAVESTART) != 0) {
4622                         ndp->ni_startdir = dvp;
4623                         vrefact(ndp->ni_startdir);
4624                         cnp->cn_flags |= SAVENAME;
4625                         fpl->savename = true;
4626                 }
4627                 MPASS(error == EJUSTRETURN);
4628                 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4629                         VOP_UNLOCK(dvp);
4630                 }
4631                 return (cache_fpl_handled(fpl));
4632         }
4633
4634         /*
4635          * There are very hairy corner cases concerning various flag combinations
4636          * and locking state. In particular here we only hold one lock instead of
4637          * two.
4638          *
4639          * Skip the complexity as it is of no significance for normal workloads.
4640          */
4641         if (__predict_false(tvp == dvp)) {
4642                 vput(dvp);
4643                 vrele(tvp);
4644                 return (cache_fpl_aborted(fpl));
4645         }
4646
4647         /*
4648          * If they want the symlink itself we are fine, but if they want to
4649          * follow it regular lookup has to be engaged.
4650          */
4651         if (tvp->v_type == VLNK) {
4652                 if ((cnp->cn_flags & FOLLOW) != 0) {
4653                         vput(dvp);
4654                         vput(tvp);
4655                         return (cache_fpl_aborted(fpl));
4656                 }
4657         }
4658
4659         /*
4660          * Since we expect this to be the terminal vnode it should almost never
4661          * be a mount point.
4662          */
4663         if (__predict_false(cache_fplookup_is_mp(fpl))) {
4664                 vput(dvp);
4665                 vput(tvp);
4666                 return (cache_fpl_aborted(fpl));
4667         }
4668
4669         if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4670                 vput(dvp);
4671                 vput(tvp);
4672                 return (cache_fpl_handled_error(fpl, EEXIST));
4673         }
4674
4675         if ((cnp->cn_flags & LOCKLEAF) == 0) {
4676                 VOP_UNLOCK(tvp);
4677         }
4678
4679         if ((cnp->cn_flags & LOCKPARENT) == 0) {
4680                 VOP_UNLOCK(dvp);
4681         }
4682
4683         if ((cnp->cn_flags & SAVESTART) != 0) {
4684                 ndp->ni_startdir = dvp;
4685                 vrefact(ndp->ni_startdir);
4686                 cnp->cn_flags |= SAVENAME;
4687                 fpl->savename = true;
4688         }
4689
4690         return (cache_fpl_handled(fpl));
4691 }
4692
4693 static int __noinline
4694 cache_fplookup_modifying(struct cache_fpl *fpl)
4695 {
4696         struct nameidata *ndp;
4697
4698         ndp = fpl->ndp;
4699
4700         if (!cache_fpl_islastcn(ndp)) {
4701                 return (cache_fpl_partial(fpl));
4702         }
4703         return (cache_fplookup_final_modifying(fpl));
4704 }
4705
4706 static int __noinline
4707 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4708 {
4709         struct componentname *cnp;
4710         enum vgetstate dvs, tvs;
4711         struct vnode *dvp, *tvp;
4712         seqc_t dvp_seqc;
4713         int error;
4714
4715         cnp = fpl->cnp;
4716         dvp = fpl->dvp;
4717         dvp_seqc = fpl->dvp_seqc;
4718         tvp = fpl->tvp;
4719
4720         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4721
4722         /*
4723          * This is less efficient than it can be for simplicity.
4724          */
4725         dvs = vget_prep_smr(dvp);
4726         if (__predict_false(dvs == VGET_NONE)) {
4727                 return (cache_fpl_aborted(fpl));
4728         }
4729         tvs = vget_prep_smr(tvp);
4730         if (__predict_false(tvs == VGET_NONE)) {
4731                 cache_fpl_smr_exit(fpl);
4732                 vget_abort(dvp, dvs);
4733                 return (cache_fpl_aborted(fpl));
4734         }
4735
4736         cache_fpl_smr_exit(fpl);
4737
4738         if ((cnp->cn_flags & LOCKPARENT) != 0) {
4739                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4740                 if (__predict_false(error != 0)) {
4741                         vget_abort(tvp, tvs);
4742                         return (cache_fpl_aborted(fpl));
4743                 }
4744         } else {
4745                 vget_finish_ref(dvp, dvs);
4746         }
4747
4748         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4749                 vget_abort(tvp, tvs);
4750                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4751                         vput(dvp);
4752                 else
4753                         vrele(dvp);
4754                 return (cache_fpl_aborted(fpl));
4755         }
4756
4757         error = cache_fplookup_final_child(fpl, tvs);
4758         if (__predict_false(error != 0)) {
4759                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4760                     fpl->status == CACHE_FPL_STATUS_DESTROYED);
4761                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4762                         vput(dvp);
4763                 else
4764                         vrele(dvp);
4765                 return (error);
4766         }
4767
4768         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4769         return (0);
4770 }
4771
4772 static int
4773 cache_fplookup_final(struct cache_fpl *fpl)
4774 {
4775         struct componentname *cnp;
4776         enum vgetstate tvs;
4777         struct vnode *dvp, *tvp;
4778         seqc_t dvp_seqc;
4779
4780         cnp = fpl->cnp;
4781         dvp = fpl->dvp;
4782         dvp_seqc = fpl->dvp_seqc;
4783         tvp = fpl->tvp;
4784
4785         MPASS(*(cnp->cn_nameptr) != '/');
4786
4787         if (cnp->cn_nameiop != LOOKUP) {
4788                 return (cache_fplookup_final_modifying(fpl));
4789         }
4790
4791         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4792                 return (cache_fplookup_final_withparent(fpl));
4793
4794         tvs = vget_prep_smr(tvp);
4795         if (__predict_false(tvs == VGET_NONE)) {
4796                 return (cache_fpl_partial(fpl));
4797         }
4798
4799         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4800                 cache_fpl_smr_exit(fpl);
4801                 vget_abort(tvp, tvs);
4802                 return (cache_fpl_aborted(fpl));
4803         }
4804
4805         cache_fpl_smr_exit(fpl);
4806         return (cache_fplookup_final_child(fpl, tvs));
4807 }
4808
4809 /*
4810  * Comment from locked lookup:
4811  * Check for degenerate name (e.g. / or "") which is a way of talking about a
4812  * directory, e.g. like "/." or ".".
4813  */
4814 static int __noinline
4815 cache_fplookup_degenerate(struct cache_fpl *fpl)
4816 {
4817         struct componentname *cnp;
4818         struct vnode *dvp;
4819         enum vgetstate dvs;
4820         int error, lkflags;
4821 #ifdef INVARIANTS
4822         char *cp;
4823 #endif
4824
4825         fpl->tvp = fpl->dvp;
4826         fpl->tvp_seqc = fpl->dvp_seqc;
4827
4828         cnp = fpl->cnp;
4829         dvp = fpl->dvp;
4830
4831 #ifdef INVARIANTS
4832         for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4833                 KASSERT(*cp == '/',
4834                     ("%s: encountered non-slash; string [%s]\n", __func__,
4835                     cnp->cn_pnbuf));
4836         }
4837 #endif
4838
4839         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4840                 cache_fpl_smr_exit(fpl);
4841                 return (cache_fpl_handled_error(fpl, EISDIR));
4842         }
4843
4844         MPASS((cnp->cn_flags & SAVESTART) == 0);
4845
4846         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4847                 return (cache_fplookup_final_withparent(fpl));
4848         }
4849
4850         dvs = vget_prep_smr(dvp);
4851         cache_fpl_smr_exit(fpl);
4852         if (__predict_false(dvs == VGET_NONE)) {
4853                 return (cache_fpl_aborted(fpl));
4854         }
4855
4856         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4857                 lkflags = LK_SHARED;
4858                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4859                         lkflags = LK_EXCLUSIVE;
4860                 error = vget_finish(dvp, lkflags, dvs);
4861                 if (__predict_false(error != 0)) {
4862                         return (cache_fpl_aborted(fpl));
4863                 }
4864         } else {
4865                 vget_finish_ref(dvp, dvs);
4866         }
4867         return (cache_fpl_handled(fpl));
4868 }
4869
4870 static int __noinline
4871 cache_fplookup_emptypath(struct cache_fpl *fpl)
4872 {
4873         struct nameidata *ndp;
4874         struct componentname *cnp;
4875         enum vgetstate tvs;
4876         struct vnode *tvp;
4877         int error, lkflags;
4878
4879         fpl->tvp = fpl->dvp;
4880         fpl->tvp_seqc = fpl->dvp_seqc;
4881
4882         ndp = fpl->ndp;
4883         cnp = fpl->cnp;
4884         tvp = fpl->tvp;
4885
4886         MPASS(*cnp->cn_pnbuf == '\0');
4887
4888         if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
4889                 cache_fpl_smr_exit(fpl);
4890                 return (cache_fpl_handled_error(fpl, ENOENT));
4891         }
4892
4893         MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
4894
4895         tvs = vget_prep_smr(tvp);
4896         cache_fpl_smr_exit(fpl);
4897         if (__predict_false(tvs == VGET_NONE)) {
4898                 return (cache_fpl_aborted(fpl));
4899         }
4900
4901         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4902                 lkflags = LK_SHARED;
4903                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4904                         lkflags = LK_EXCLUSIVE;
4905                 error = vget_finish(tvp, lkflags, tvs);
4906                 if (__predict_false(error != 0)) {
4907                         return (cache_fpl_aborted(fpl));
4908                 }
4909         } else {
4910                 vget_finish_ref(tvp, tvs);
4911         }
4912
4913         ndp->ni_resflags |= NIRES_EMPTYPATH;
4914         return (cache_fpl_handled(fpl));
4915 }
4916
4917 static int __noinline
4918 cache_fplookup_noentry(struct cache_fpl *fpl)
4919 {
4920         struct nameidata *ndp;
4921         struct componentname *cnp;
4922         enum vgetstate dvs;
4923         struct vnode *dvp, *tvp;
4924         seqc_t dvp_seqc;
4925         int error;
4926
4927         ndp = fpl->ndp;
4928         cnp = fpl->cnp;
4929         dvp = fpl->dvp;
4930         dvp_seqc = fpl->dvp_seqc;
4931
4932         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4933         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4934         if (cnp->cn_nameiop == LOOKUP)
4935                 MPASS((cnp->cn_flags & NOCACHE) == 0);
4936         MPASS(!cache_fpl_isdotdot(cnp));
4937
4938         /*
4939          * Hack: delayed name len checking.
4940          */
4941         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4942                 cache_fpl_smr_exit(fpl);
4943                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
4944         }
4945
4946         if (cnp->cn_nameptr[0] == '/') {
4947                 return (cache_fplookup_skip_slashes(fpl));
4948         }
4949
4950         if (cnp->cn_pnbuf[0] == '\0') {
4951                 return (cache_fplookup_emptypath(fpl));
4952         }
4953
4954         if (cnp->cn_nameptr[0] == '\0') {
4955                 if (fpl->tvp == NULL) {
4956                         return (cache_fplookup_degenerate(fpl));
4957                 }
4958                 return (cache_fplookup_trailingslash(fpl));
4959         }
4960
4961         if (cnp->cn_nameiop != LOOKUP) {
4962                 fpl->tvp = NULL;
4963                 return (cache_fplookup_modifying(fpl));
4964         }
4965
4966         MPASS((cnp->cn_flags & SAVESTART) == 0);
4967
4968         /*
4969          * Only try to fill in the component if it is the last one,
4970          * otherwise not only there may be several to handle but the
4971          * walk may be complicated.
4972          */
4973         if (!cache_fpl_islastcn(ndp)) {
4974                 return (cache_fpl_partial(fpl));
4975         }
4976
4977         /*
4978          * Regular lookup nulifies the slash, which we don't do here.
4979          * Don't take chances with filesystem routines seeing it for
4980          * the last entry.
4981          */
4982         if (cache_fpl_istrailingslash(fpl)) {
4983                 return (cache_fpl_partial(fpl));
4984         }
4985
4986         /*
4987          * Secure access to dvp; check cache_fplookup_partial_setup for
4988          * reasoning.
4989          */
4990         dvs = vget_prep_smr(dvp);
4991         cache_fpl_smr_exit(fpl);
4992         if (__predict_false(dvs == VGET_NONE)) {
4993                 return (cache_fpl_aborted(fpl));
4994         }
4995
4996         vget_finish_ref(dvp, dvs);
4997         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4998                 vrele(dvp);
4999                 return (cache_fpl_aborted(fpl));
5000         }
5001
5002         error = vn_lock(dvp, LK_SHARED);
5003         if (__predict_false(error != 0)) {
5004                 vrele(dvp);
5005                 return (cache_fpl_aborted(fpl));
5006         }
5007
5008         tvp = NULL;
5009         /*
5010          * TODO: provide variants which don't require locking either vnode.
5011          */
5012         cnp->cn_flags |= ISLASTCN | MAKEENTRY;
5013         cnp->cn_lkflags = LK_SHARED;
5014         if ((cnp->cn_flags & LOCKSHARED) == 0) {
5015                 cnp->cn_lkflags = LK_EXCLUSIVE;
5016         }
5017         error = VOP_LOOKUP(dvp, &tvp, cnp);
5018         switch (error) {
5019         case EJUSTRETURN:
5020         case 0:
5021                 break;
5022         case ENOTDIR:
5023         case ENOENT:
5024                 vput(dvp);
5025                 return (cache_fpl_handled_error(fpl, error));
5026         default:
5027                 vput(dvp);
5028                 return (cache_fpl_aborted(fpl));
5029         }
5030
5031         fpl->tvp = tvp;
5032         if (!fpl->savename) {
5033                 MPASS((cnp->cn_flags & SAVENAME) == 0);
5034         }
5035
5036         if (tvp == NULL) {
5037                 MPASS(error == EJUSTRETURN);
5038                 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5039                         vput(dvp);
5040                 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5041                         VOP_UNLOCK(dvp);
5042                 }
5043                 return (cache_fpl_handled(fpl));
5044         }
5045
5046         if (tvp->v_type == VLNK) {
5047                 if ((cnp->cn_flags & FOLLOW) != 0) {
5048                         vput(dvp);
5049                         vput(tvp);
5050                         return (cache_fpl_aborted(fpl));
5051                 }
5052         }
5053
5054         if (__predict_false(cache_fplookup_is_mp(fpl))) {
5055                 vput(dvp);
5056                 vput(tvp);
5057                 return (cache_fpl_aborted(fpl));
5058         }
5059
5060         if ((cnp->cn_flags & LOCKLEAF) == 0) {
5061                 VOP_UNLOCK(tvp);
5062         }
5063
5064         if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
5065                 vput(dvp);
5066         } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
5067                 VOP_UNLOCK(dvp);
5068         }
5069         return (cache_fpl_handled(fpl));
5070 }
5071
5072 static int __noinline
5073 cache_fplookup_dot(struct cache_fpl *fpl)
5074 {
5075         int error;
5076
5077         MPASS(!seqc_in_modify(fpl->dvp_seqc));
5078
5079         if (__predict_false(fpl->dvp->v_type != VDIR)) {
5080                 cache_fpl_smr_exit(fpl);
5081                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5082         }
5083
5084         /*
5085          * Just re-assign the value. seqc will be checked later for the first
5086          * non-dot path component in line and/or before deciding to return the
5087          * vnode.
5088          */
5089         fpl->tvp = fpl->dvp;
5090         fpl->tvp_seqc = fpl->dvp_seqc;
5091
5092         counter_u64_add(dothits, 1);
5093         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5094
5095         error = 0;
5096         if (cache_fplookup_is_mp(fpl)) {
5097                 error = cache_fplookup_cross_mount(fpl);
5098         }
5099         return (error);
5100 }
5101
5102 static int __noinline
5103 cache_fplookup_dotdot(struct cache_fpl *fpl)
5104 {
5105         struct nameidata *ndp;
5106         struct componentname *cnp;
5107         struct namecache *ncp;
5108         struct vnode *dvp;
5109         struct prison *pr;
5110         u_char nc_flag;
5111
5112         ndp = fpl->ndp;
5113         cnp = fpl->cnp;
5114         dvp = fpl->dvp;
5115
5116         MPASS(cache_fpl_isdotdot(cnp));
5117
5118         /*
5119          * XXX this is racy the same way regular lookup is
5120          */
5121         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
5122             pr = pr->pr_parent)
5123                 if (dvp == pr->pr_root)
5124                         break;
5125
5126         if (dvp == ndp->ni_rootdir ||
5127             dvp == ndp->ni_topdir ||
5128             dvp == rootvnode ||
5129             pr != NULL) {
5130                 fpl->tvp = dvp;
5131                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5132                 if (seqc_in_modify(fpl->tvp_seqc)) {
5133                         return (cache_fpl_aborted(fpl));
5134                 }
5135                 return (0);
5136         }
5137
5138         if ((dvp->v_vflag & VV_ROOT) != 0) {
5139                 /*
5140                  * TODO
5141                  * The opposite of climb mount is needed here.
5142                  */
5143                 return (cache_fpl_partial(fpl));
5144         }
5145
5146         if (__predict_false(dvp->v_type != VDIR)) {
5147                 cache_fpl_smr_exit(fpl);
5148                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5149         }
5150
5151         ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5152         if (ncp == NULL) {
5153                 return (cache_fpl_aborted(fpl));
5154         }
5155
5156         nc_flag = atomic_load_char(&ncp->nc_flag);
5157         if ((nc_flag & NCF_ISDOTDOT) != 0) {
5158                 if ((nc_flag & NCF_NEGATIVE) != 0)
5159                         return (cache_fpl_aborted(fpl));
5160                 fpl->tvp = ncp->nc_vp;
5161         } else {
5162                 fpl->tvp = ncp->nc_dvp;
5163         }
5164
5165         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5166         if (seqc_in_modify(fpl->tvp_seqc)) {
5167                 return (cache_fpl_partial(fpl));
5168         }
5169
5170         /*
5171          * Acquire fence provided by vn_seqc_read_any above.
5172          */
5173         if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5174                 return (cache_fpl_aborted(fpl));
5175         }
5176
5177         if (!cache_ncp_canuse(ncp)) {
5178                 return (cache_fpl_aborted(fpl));
5179         }
5180
5181         counter_u64_add(dotdothits, 1);
5182         return (0);
5183 }
5184
5185 static int __noinline
5186 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5187 {
5188         u_char nc_flag __diagused;
5189         bool neg_promote;
5190
5191 #ifdef INVARIANTS
5192         nc_flag = atomic_load_char(&ncp->nc_flag);
5193         MPASS((nc_flag & NCF_NEGATIVE) != 0);
5194 #endif
5195         /*
5196          * If they want to create an entry we need to replace this one.
5197          */
5198         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5199                 fpl->tvp = NULL;
5200                 return (cache_fplookup_modifying(fpl));
5201         }
5202         neg_promote = cache_neg_hit_prep(ncp);
5203         if (!cache_fpl_neg_ncp_canuse(ncp)) {
5204                 cache_neg_hit_abort(ncp);
5205                 return (cache_fpl_partial(fpl));
5206         }
5207         if (neg_promote) {
5208                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5209         }
5210         cache_neg_hit_finish(ncp);
5211         cache_fpl_smr_exit(fpl);
5212         return (cache_fpl_handled_error(fpl, ENOENT));
5213 }
5214
5215 /*
5216  * Resolve a symlink. Called by filesystem-specific routines.
5217  *
5218  * Code flow is:
5219  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5220  */
5221 int
5222 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5223 {
5224         struct nameidata *ndp;
5225         struct componentname *cnp;
5226         size_t adjust;
5227
5228         ndp = fpl->ndp;
5229         cnp = fpl->cnp;
5230
5231         if (__predict_false(len == 0)) {
5232                 return (ENOENT);
5233         }
5234
5235         if (__predict_false(len > MAXPATHLEN - 2)) {
5236                 if (cache_fpl_istrailingslash(fpl)) {
5237                         return (EAGAIN);
5238                 }
5239         }
5240
5241         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5242 #ifdef INVARIANTS
5243         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5244                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5245                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5246                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5247         }
5248 #endif
5249
5250         if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5251                 return (ENAMETOOLONG);
5252         }
5253
5254         if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5255                 return (ELOOP);
5256         }
5257
5258         adjust = len;
5259         if (ndp->ni_pathlen > 1) {
5260                 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5261         } else {
5262                 if (cache_fpl_istrailingslash(fpl)) {
5263                         adjust = len + 1;
5264                         cnp->cn_pnbuf[len] = '/';
5265                         cnp->cn_pnbuf[len + 1] = '\0';
5266                 } else {
5267                         cnp->cn_pnbuf[len] = '\0';
5268                 }
5269         }
5270         bcopy(string, cnp->cn_pnbuf, len);
5271
5272         ndp->ni_pathlen += adjust;
5273         cache_fpl_pathlen_add(fpl, adjust);
5274         cnp->cn_nameptr = cnp->cn_pnbuf;
5275         fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5276         fpl->tvp = NULL;
5277         return (0);
5278 }
5279
5280 static int __noinline
5281 cache_fplookup_symlink(struct cache_fpl *fpl)
5282 {
5283         struct mount *mp;
5284         struct nameidata *ndp;
5285         struct componentname *cnp;
5286         struct vnode *dvp, *tvp;
5287         int error;
5288
5289         ndp = fpl->ndp;
5290         cnp = fpl->cnp;
5291         dvp = fpl->dvp;
5292         tvp = fpl->tvp;
5293
5294         if (cache_fpl_islastcn(ndp)) {
5295                 if ((cnp->cn_flags & FOLLOW) == 0) {
5296                         return (cache_fplookup_final(fpl));
5297                 }
5298         }
5299
5300         mp = atomic_load_ptr(&dvp->v_mount);
5301         if (__predict_false(mp == NULL)) {
5302                 return (cache_fpl_aborted(fpl));
5303         }
5304
5305         /*
5306          * Note this check races against setting the flag just like regular
5307          * lookup.
5308          */
5309         if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5310                 cache_fpl_smr_exit(fpl);
5311                 return (cache_fpl_handled_error(fpl, EACCES));
5312         }
5313
5314         error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5315         if (__predict_false(error != 0)) {
5316                 switch (error) {
5317                 case EAGAIN:
5318                         return (cache_fpl_partial(fpl));
5319                 case ENOENT:
5320                 case ENAMETOOLONG:
5321                 case ELOOP:
5322                         cache_fpl_smr_exit(fpl);
5323                         return (cache_fpl_handled_error(fpl, error));
5324                 default:
5325                         return (cache_fpl_aborted(fpl));
5326                 }
5327         }
5328
5329         if (*(cnp->cn_nameptr) == '/') {
5330                 fpl->dvp = cache_fpl_handle_root(fpl);
5331                 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5332                 if (seqc_in_modify(fpl->dvp_seqc)) {
5333                         return (cache_fpl_aborted(fpl));
5334                 }
5335                 /*
5336                  * The main loop assumes that ->dvp points to a vnode belonging
5337                  * to a filesystem which can do lockless lookup, but the absolute
5338                  * symlink can be wandering off to one which does not.
5339                  */
5340                 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5341                 if (__predict_false(mp == NULL)) {
5342                         return (cache_fpl_aborted(fpl));
5343                 }
5344                 if (!cache_fplookup_mp_supported(mp)) {
5345                         cache_fpl_checkpoint(fpl);
5346                         return (cache_fpl_partial(fpl));
5347                 }
5348         }
5349         return (0);
5350 }
5351
5352 static int
5353 cache_fplookup_next(struct cache_fpl *fpl)
5354 {
5355         struct componentname *cnp;
5356         struct namecache *ncp;
5357         struct vnode *dvp, *tvp;
5358         u_char nc_flag;
5359         uint32_t hash;
5360         int error;
5361
5362         cnp = fpl->cnp;
5363         dvp = fpl->dvp;
5364         hash = fpl->hash;
5365
5366         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5367                 if (cnp->cn_namelen == 1) {
5368                         return (cache_fplookup_dot(fpl));
5369                 }
5370                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5371                         return (cache_fplookup_dotdot(fpl));
5372                 }
5373         }
5374
5375         MPASS(!cache_fpl_isdotdot(cnp));
5376
5377         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5378                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5379                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5380                         break;
5381         }
5382
5383         if (__predict_false(ncp == NULL)) {
5384                 return (cache_fplookup_noentry(fpl));
5385         }
5386
5387         tvp = atomic_load_ptr(&ncp->nc_vp);
5388         nc_flag = atomic_load_char(&ncp->nc_flag);
5389         if ((nc_flag & NCF_NEGATIVE) != 0) {
5390                 return (cache_fplookup_neg(fpl, ncp, hash));
5391         }
5392
5393         if (!cache_ncp_canuse(ncp)) {
5394                 return (cache_fpl_partial(fpl));
5395         }
5396
5397         fpl->tvp = tvp;
5398         fpl->tvp_seqc = vn_seqc_read_any(tvp);
5399         if (seqc_in_modify(fpl->tvp_seqc)) {
5400                 return (cache_fpl_partial(fpl));
5401         }
5402
5403         counter_u64_add(numposhits, 1);
5404         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5405
5406         error = 0;
5407         if (cache_fplookup_is_mp(fpl)) {
5408                 error = cache_fplookup_cross_mount(fpl);
5409         }
5410         return (error);
5411 }
5412
5413 static bool
5414 cache_fplookup_mp_supported(struct mount *mp)
5415 {
5416
5417         MPASS(mp != NULL);
5418         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5419                 return (false);
5420         return (true);
5421 }
5422
5423 /*
5424  * Walk up the mount stack (if any).
5425  *
5426  * Correctness is provided in the following ways:
5427  * - all vnodes are protected from freeing with SMR
5428  * - struct mount objects are type stable making them always safe to access
5429  * - stability of the particular mount is provided by busying it
5430  * - relationship between the vnode which is mounted on and the mount is
5431  *   verified with the vnode sequence counter after busying
5432  * - association between root vnode of the mount and the mount is protected
5433  *   by busy
5434  *
5435  * From that point on we can read the sequence counter of the root vnode
5436  * and get the next mount on the stack (if any) using the same protection.
5437  *
5438  * By the end of successful walk we are guaranteed the reached state was
5439  * indeed present at least at some point which matches the regular lookup.
5440  */
5441 static int __noinline
5442 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5443 {
5444         struct mount *mp, *prev_mp;
5445         struct mount_pcpu *mpcpu, *prev_mpcpu;
5446         struct vnode *vp;
5447         seqc_t vp_seqc;
5448
5449         vp = fpl->tvp;
5450         vp_seqc = fpl->tvp_seqc;
5451
5452         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5453         mp = atomic_load_ptr(&vp->v_mountedhere);
5454         if (__predict_false(mp == NULL)) {
5455                 return (0);
5456         }
5457
5458         prev_mp = NULL;
5459         for (;;) {
5460                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5461                         if (prev_mp != NULL)
5462                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5463                         return (cache_fpl_partial(fpl));
5464                 }
5465                 if (prev_mp != NULL)
5466                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5467                 if (!vn_seqc_consistent(vp, vp_seqc)) {
5468                         vfs_op_thread_exit_crit(mp, mpcpu);
5469                         return (cache_fpl_partial(fpl));
5470                 }
5471                 if (!cache_fplookup_mp_supported(mp)) {
5472                         vfs_op_thread_exit_crit(mp, mpcpu);
5473                         return (cache_fpl_partial(fpl));
5474                 }
5475                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5476                 if (vp == NULL) {
5477                         vfs_op_thread_exit_crit(mp, mpcpu);
5478                         return (cache_fpl_partial(fpl));
5479                 }
5480                 vp_seqc = vn_seqc_read_any(vp);
5481                 if (seqc_in_modify(vp_seqc)) {
5482                         vfs_op_thread_exit_crit(mp, mpcpu);
5483                         return (cache_fpl_partial(fpl));
5484                 }
5485                 prev_mp = mp;
5486                 prev_mpcpu = mpcpu;
5487                 mp = atomic_load_ptr(&vp->v_mountedhere);
5488                 if (mp == NULL)
5489                         break;
5490         }
5491
5492         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5493         fpl->tvp = vp;
5494         fpl->tvp_seqc = vp_seqc;
5495         return (0);
5496 }
5497
5498 static int __noinline
5499 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5500 {
5501         struct mount *mp;
5502         struct mount_pcpu *mpcpu;
5503         struct vnode *vp;
5504         seqc_t vp_seqc;
5505
5506         vp = fpl->tvp;
5507         vp_seqc = fpl->tvp_seqc;
5508
5509         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5510         mp = atomic_load_ptr(&vp->v_mountedhere);
5511         if (__predict_false(mp == NULL)) {
5512                 return (0);
5513         }
5514
5515         if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5516                 return (cache_fpl_partial(fpl));
5517         }
5518         if (!vn_seqc_consistent(vp, vp_seqc)) {
5519                 vfs_op_thread_exit_crit(mp, mpcpu);
5520                 return (cache_fpl_partial(fpl));
5521         }
5522         if (!cache_fplookup_mp_supported(mp)) {
5523                 vfs_op_thread_exit_crit(mp, mpcpu);
5524                 return (cache_fpl_partial(fpl));
5525         }
5526         vp = atomic_load_ptr(&mp->mnt_rootvnode);
5527         if (__predict_false(vp == NULL)) {
5528                 vfs_op_thread_exit_crit(mp, mpcpu);
5529                 return (cache_fpl_partial(fpl));
5530         }
5531         vp_seqc = vn_seqc_read_any(vp);
5532         vfs_op_thread_exit_crit(mp, mpcpu);
5533         if (seqc_in_modify(vp_seqc)) {
5534                 return (cache_fpl_partial(fpl));
5535         }
5536         mp = atomic_load_ptr(&vp->v_mountedhere);
5537         if (__predict_false(mp != NULL)) {
5538                 /*
5539                  * There are possibly more mount points on top.
5540                  * Normally this does not happen so for simplicity just start
5541                  * over.
5542                  */
5543                 return (cache_fplookup_climb_mount(fpl));
5544         }
5545
5546         fpl->tvp = vp;
5547         fpl->tvp_seqc = vp_seqc;
5548         return (0);
5549 }
5550
5551 /*
5552  * Check if a vnode is mounted on.
5553  */
5554 static bool
5555 cache_fplookup_is_mp(struct cache_fpl *fpl)
5556 {
5557         struct vnode *vp;
5558
5559         vp = fpl->tvp;
5560         return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5561 }
5562
5563 /*
5564  * Parse the path.
5565  *
5566  * The code was originally copy-pasted from regular lookup and despite
5567  * clean ups leaves performance on the table. Any modifications here
5568  * must take into account that in case off fallback the resulting
5569  * nameidata state has to be compatible with the original.
5570  */
5571
5572 /*
5573  * Debug ni_pathlen tracking.
5574  */
5575 #ifdef INVARIANTS
5576 static void
5577 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5578 {
5579
5580         fpl->debug.ni_pathlen += n;
5581         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5582             ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5583 }
5584
5585 static void
5586 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5587 {
5588
5589         fpl->debug.ni_pathlen -= n;
5590         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5591             ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5592 }
5593
5594 static void
5595 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5596 {
5597
5598         cache_fpl_pathlen_add(fpl, 1);
5599 }
5600
5601 static void
5602 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5603 {
5604
5605         cache_fpl_pathlen_sub(fpl, 1);
5606 }
5607 #else
5608 static void
5609 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5610 {
5611 }
5612
5613 static void
5614 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5615 {
5616 }
5617
5618 static void
5619 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5620 {
5621 }
5622
5623 static void
5624 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5625 {
5626 }
5627 #endif
5628
5629 static void
5630 cache_fplookup_parse(struct cache_fpl *fpl)
5631 {
5632         struct nameidata *ndp;
5633         struct componentname *cnp;
5634         struct vnode *dvp;
5635         char *cp;
5636         uint32_t hash;
5637
5638         ndp = fpl->ndp;
5639         cnp = fpl->cnp;
5640         dvp = fpl->dvp;
5641
5642         /*
5643          * Find the end of this path component, it is either / or nul.
5644          *
5645          * Store / as a temporary sentinel so that we only have one character
5646          * to test for. Pathnames tend to be short so this should not be
5647          * resulting in cache misses.
5648          *
5649          * TODO: fix this to be word-sized.
5650          */
5651         MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5652         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5653             ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5654             __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5655             fpl->nulchar, cnp->cn_pnbuf));
5656         KASSERT(*fpl->nulchar == '\0',
5657             ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5658             cnp->cn_pnbuf));
5659         hash = cache_get_hash_iter_start(dvp);
5660         *fpl->nulchar = '/';
5661         for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5662                 KASSERT(*cp != '\0',
5663                     ("%s: encountered unexpected nul; string [%s]\n", __func__,
5664                     cnp->cn_nameptr));
5665                 hash = cache_get_hash_iter(*cp, hash);
5666                 continue;
5667         }
5668         *fpl->nulchar = '\0';
5669         fpl->hash = cache_get_hash_iter_finish(hash);
5670
5671         cnp->cn_namelen = cp - cnp->cn_nameptr;
5672         cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5673
5674 #ifdef INVARIANTS
5675         /*
5676          * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5677          * we are going to fail this lookup with ENAMETOOLONG (see below).
5678          */
5679         if (cnp->cn_namelen <= NAME_MAX) {
5680                 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5681                         panic("%s: mismatched hash for [%s] len %ld", __func__,
5682                             cnp->cn_nameptr, cnp->cn_namelen);
5683                 }
5684         }
5685 #endif
5686
5687         /*
5688          * Hack: we have to check if the found path component's length exceeds
5689          * NAME_MAX. However, the condition is very rarely true and check can
5690          * be elided in the common case -- if an entry was found in the cache,
5691          * then it could not have been too long to begin with.
5692          */
5693         ndp->ni_next = cp;
5694 }
5695
5696 static void
5697 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5698 {
5699         struct nameidata *ndp;
5700         struct componentname *cnp;
5701
5702         ndp = fpl->ndp;
5703         cnp = fpl->cnp;
5704
5705         cnp->cn_nameptr = ndp->ni_next;
5706         KASSERT(*(cnp->cn_nameptr) == '/',
5707             ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5708             cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5709         cnp->cn_nameptr++;
5710         cache_fpl_pathlen_dec(fpl);
5711 }
5712
5713 /*
5714  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5715  *
5716  * Lockless lookup tries to elide checking for spurious slashes and should they
5717  * be present is guaranteed to fail to find an entry. In this case the caller
5718  * must check if the name starts with a slash and call this routine.  It is
5719  * going to fast forward across the spurious slashes and set the state up for
5720  * retry.
5721  */
5722 static int __noinline
5723 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5724 {
5725         struct nameidata *ndp;
5726         struct componentname *cnp;
5727
5728         ndp = fpl->ndp;
5729         cnp = fpl->cnp;
5730
5731         MPASS(*(cnp->cn_nameptr) == '/');
5732         do {
5733                 cnp->cn_nameptr++;
5734                 cache_fpl_pathlen_dec(fpl);
5735         } while (*(cnp->cn_nameptr) == '/');
5736
5737         /*
5738          * Go back to one slash so that cache_fplookup_parse_advance has
5739          * something to skip.
5740          */
5741         cnp->cn_nameptr--;
5742         cache_fpl_pathlen_inc(fpl);
5743
5744         /*
5745          * cache_fplookup_parse_advance starts from ndp->ni_next
5746          */
5747         ndp->ni_next = cnp->cn_nameptr;
5748
5749         /*
5750          * See cache_fplookup_dot.
5751          */
5752         fpl->tvp = fpl->dvp;
5753         fpl->tvp_seqc = fpl->dvp_seqc;
5754
5755         return (0);
5756 }
5757
5758 /*
5759  * Handle trailing slashes (e.g., "foo/").
5760  *
5761  * If a trailing slash is found the terminal vnode must be a directory.
5762  * Regular lookup shortens the path by nulifying the first trailing slash and
5763  * sets the TRAILINGSLASH flag to denote this took place. There are several
5764  * checks on it performed later.
5765  *
5766  * Similarly to spurious slashes, lockless lookup handles this in a speculative
5767  * manner relying on an invariant that a non-directory vnode will get a miss.
5768  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5769  *
5770  * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5771  * and denotes this is the last path component, which avoids looping back.
5772  *
5773  * Only plain lookups are supported for now to restrict corner cases to handle.
5774  */
5775 static int __noinline
5776 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5777 {
5778 #ifdef INVARIANTS
5779         size_t ni_pathlen;
5780 #endif
5781         struct nameidata *ndp;
5782         struct componentname *cnp;
5783         struct namecache *ncp;
5784         struct vnode *tvp;
5785         char *cn_nameptr_orig, *cn_nameptr_slash;
5786         seqc_t tvp_seqc;
5787         u_char nc_flag;
5788
5789         ndp = fpl->ndp;
5790         cnp = fpl->cnp;
5791         tvp = fpl->tvp;
5792         tvp_seqc = fpl->tvp_seqc;
5793
5794         MPASS(fpl->dvp == fpl->tvp);
5795         KASSERT(cache_fpl_istrailingslash(fpl),
5796             ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5797             cnp->cn_pnbuf));
5798         KASSERT(cnp->cn_nameptr[0] == '\0',
5799             ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5800             cnp->cn_pnbuf));
5801         KASSERT(cnp->cn_namelen == 0,
5802             ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5803             cnp->cn_pnbuf));
5804         MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5805
5806         if (cnp->cn_nameiop != LOOKUP) {
5807                 return (cache_fpl_aborted(fpl));
5808         }
5809
5810         if (__predict_false(tvp->v_type != VDIR)) {
5811                 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5812                         return (cache_fpl_aborted(fpl));
5813                 }
5814                 cache_fpl_smr_exit(fpl);
5815                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5816         }
5817
5818         /*
5819          * Denote the last component.
5820          */
5821         ndp->ni_next = &cnp->cn_nameptr[0];
5822         MPASS(cache_fpl_islastcn(ndp));
5823
5824         /*
5825          * Unwind trailing slashes.
5826          */
5827         cn_nameptr_orig = cnp->cn_nameptr;
5828         while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5829                 cnp->cn_nameptr--;
5830                 if (cnp->cn_nameptr[0] != '/') {
5831                         break;
5832                 }
5833         }
5834
5835         /*
5836          * Unwind to the beginning of the path component.
5837          *
5838          * Note the path may or may not have started with a slash.
5839          */
5840         cn_nameptr_slash = cnp->cn_nameptr;
5841         while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5842                 cnp->cn_nameptr--;
5843                 if (cnp->cn_nameptr[0] == '/') {
5844                         break;
5845                 }
5846         }
5847         if (cnp->cn_nameptr[0] == '/') {
5848                 cnp->cn_nameptr++;
5849         }
5850
5851         cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5852         cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5853         cache_fpl_checkpoint(fpl);
5854
5855 #ifdef INVARIANTS
5856         ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5857         if (ni_pathlen != fpl->debug.ni_pathlen) {
5858                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5859                     __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5860                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5861         }
5862 #endif
5863
5864         /*
5865          * If this was a "./" lookup the parent directory is already correct.
5866          */
5867         if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5868                 return (0);
5869         }
5870
5871         /*
5872          * Otherwise we need to look it up.
5873          */
5874         tvp = fpl->tvp;
5875         ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
5876         if (__predict_false(ncp == NULL)) {
5877                 return (cache_fpl_aborted(fpl));
5878         }
5879         nc_flag = atomic_load_char(&ncp->nc_flag);
5880         if ((nc_flag & NCF_ISDOTDOT) != 0) {
5881                 return (cache_fpl_aborted(fpl));
5882         }
5883         fpl->dvp = ncp->nc_dvp;
5884         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5885         if (seqc_in_modify(fpl->dvp_seqc)) {
5886                 return (cache_fpl_aborted(fpl));
5887         }
5888         return (0);
5889 }
5890
5891 /*
5892  * See the API contract for VOP_FPLOOKUP_VEXEC.
5893  */
5894 static int __noinline
5895 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5896 {
5897         struct componentname *cnp;
5898         struct vnode *dvp;
5899         seqc_t dvp_seqc;
5900
5901         cnp = fpl->cnp;
5902         dvp = fpl->dvp;
5903         dvp_seqc = fpl->dvp_seqc;
5904
5905         /*
5906          * Hack: delayed empty path checking.
5907          */
5908         if (cnp->cn_pnbuf[0] == '\0') {
5909                 return (cache_fplookup_emptypath(fpl));
5910         }
5911
5912         /*
5913          * TODO: Due to ignoring trailing slashes lookup will perform a
5914          * permission check on the last dir when it should not be doing it.  It
5915          * may fail, but said failure should be ignored. It is possible to fix
5916          * it up fully without resorting to regular lookup, but for now just
5917          * abort.
5918          */
5919         if (cache_fpl_istrailingslash(fpl)) {
5920                 return (cache_fpl_aborted(fpl));
5921         }
5922
5923         /*
5924          * Hack: delayed degenerate path checking.
5925          */
5926         if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
5927                 return (cache_fplookup_degenerate(fpl));
5928         }
5929
5930         /*
5931          * Hack: delayed name len checking.
5932          */
5933         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5934                 cache_fpl_smr_exit(fpl);
5935                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5936         }
5937
5938         /*
5939          * Hack: they may be looking up foo/bar, where foo is not a directory.
5940          * In such a case we need to return ENOTDIR, but we may happen to get
5941          * here with a different error.
5942          */
5943         if (dvp->v_type != VDIR) {
5944                 error = ENOTDIR;
5945         }
5946
5947         /*
5948          * Hack: handle O_SEARCH.
5949          *
5950          * Open Group Base Specifications Issue 7, 2018 edition states:
5951          * <quote>
5952          * If the access mode of the open file description associated with the
5953          * file descriptor is not O_SEARCH, the function shall check whether
5954          * directory searches are permitted using the current permissions of
5955          * the directory underlying the file descriptor. If the access mode is
5956          * O_SEARCH, the function shall not perform the check.
5957          * </quote>
5958          *
5959          * Regular lookup tests for the NOEXECCHECK flag for every path
5960          * component to decide whether to do the permission check. However,
5961          * since most lookups never have the flag (and when they do it is only
5962          * present for the first path component), lockless lookup only acts on
5963          * it if there is a permission problem. Here the flag is represented
5964          * with a boolean so that we don't have to clear it on the way out.
5965          *
5966          * For simplicity this always aborts.
5967          * TODO: check if this is the first lookup and ignore the permission
5968          * problem. Note the flag has to survive fallback (if it happens to be
5969          * performed).
5970          */
5971         if (fpl->fsearch) {
5972                 return (cache_fpl_aborted(fpl));
5973         }
5974
5975         switch (error) {
5976         case EAGAIN:
5977                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5978                         error = cache_fpl_aborted(fpl);
5979                 } else {
5980                         cache_fpl_partial(fpl);
5981                 }
5982                 break;
5983         default:
5984                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5985                         error = cache_fpl_aborted(fpl);
5986                 } else {
5987                         cache_fpl_smr_exit(fpl);
5988                         cache_fpl_handled_error(fpl, error);
5989                 }
5990                 break;
5991         }
5992         return (error);
5993 }
5994
5995 static int
5996 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
5997 {
5998         struct nameidata *ndp;
5999         struct componentname *cnp;
6000         struct mount *mp;
6001         int error;
6002
6003         ndp = fpl->ndp;
6004         cnp = fpl->cnp;
6005
6006         cache_fpl_checkpoint(fpl);
6007
6008         /*
6009          * The vnode at hand is almost always stable, skip checking for it.
6010          * Worst case this postpones the check towards the end of the iteration
6011          * of the main loop.
6012          */
6013         fpl->dvp = dvp;
6014         fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6015
6016         mp = atomic_load_ptr(&dvp->v_mount);
6017         if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6018                 return (cache_fpl_aborted(fpl));
6019         }
6020
6021         MPASS(fpl->tvp == NULL);
6022
6023         for (;;) {
6024                 cache_fplookup_parse(fpl);
6025
6026                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6027                 if (__predict_false(error != 0)) {
6028                         error = cache_fplookup_failed_vexec(fpl, error);
6029                         break;
6030                 }
6031
6032                 error = cache_fplookup_next(fpl);
6033                 if (__predict_false(cache_fpl_terminated(fpl))) {
6034                         break;
6035                 }
6036
6037                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6038
6039                 if (fpl->tvp->v_type == VLNK) {
6040                         error = cache_fplookup_symlink(fpl);
6041                         if (cache_fpl_terminated(fpl)) {
6042                                 break;
6043                         }
6044                 } else {
6045                         if (cache_fpl_islastcn(ndp)) {
6046                                 error = cache_fplookup_final(fpl);
6047                                 break;
6048                         }
6049
6050                         if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6051                                 error = cache_fpl_aborted(fpl);
6052                                 break;
6053                         }
6054
6055                         fpl->dvp = fpl->tvp;
6056                         fpl->dvp_seqc = fpl->tvp_seqc;
6057                         cache_fplookup_parse_advance(fpl);
6058                 }
6059
6060                 cache_fpl_checkpoint(fpl);
6061         }
6062
6063         return (error);
6064 }
6065
6066 /*
6067  * Fast path lookup protected with SMR and sequence counters.
6068  *
6069  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6070  *
6071  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6072  * outlined below.
6073  *
6074  * Traditional vnode lookup conceptually looks like this:
6075  *
6076  * vn_lock(current);
6077  * for (;;) {
6078  *      next = find();
6079  *      vn_lock(next);
6080  *      vn_unlock(current);
6081  *      current = next;
6082  *      if (last)
6083  *          break;
6084  * }
6085  * return (current);
6086  *
6087  * Each jump to the next vnode is safe memory-wise and atomic with respect to
6088  * any modifications thanks to holding respective locks.
6089  *
6090  * The same guarantee can be provided with a combination of safe memory
6091  * reclamation and sequence counters instead. If all operations which affect
6092  * the relationship between the current vnode and the one we are looking for
6093  * also modify the counter, we can verify whether all the conditions held as
6094  * we made the jump. This includes things like permissions, mount points etc.
6095  * Counter modification is provided by enclosing relevant places in
6096  * vn_seqc_write_begin()/end() calls.
6097  *
6098  * Thus this translates to:
6099  *
6100  * vfs_smr_enter();
6101  * dvp_seqc = seqc_read_any(dvp);
6102  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6103  *     abort();
6104  * for (;;) {
6105  *      tvp = find();
6106  *      tvp_seqc = seqc_read_any(tvp);
6107  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6108  *          abort();
6109  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
6110  *          abort();
6111  *      dvp = tvp; // we know nothing of importance has changed
6112  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6113  *      if (last)
6114  *          break;
6115  * }
6116  * vget(); // secure the vnode
6117  * if (!seqc_consistent(tvp, tvp_seqc) // final check
6118  *          abort();
6119  * // at this point we know nothing has changed for any parent<->child pair
6120  * // as they were crossed during the lookup, meaning we matched the guarantee
6121  * // of the locked variant
6122  * return (tvp);
6123  *
6124  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6125  * - they are called while within vfs_smr protection which they must never exit
6126  * - EAGAIN can be returned to denote checking could not be performed, it is
6127  *   always valid to return it
6128  * - if the sequence counter has not changed the result must be valid
6129  * - if the sequence counter has changed both false positives and false negatives
6130  *   are permitted (since the result will be rejected later)
6131  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6132  *
6133  * Caveats to watch out for:
6134  * - vnodes are passed unlocked and unreferenced with nothing stopping
6135  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6136  *   to use atomic_load_ptr to fetch it.
6137  * - the aforementioned object can also get freed, meaning absent other means it
6138  *   should be protected with vfs_smr
6139  * - either safely checking permissions as they are modified or guaranteeing
6140  *   their stability is left to the routine
6141  */
6142 int
6143 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6144     struct pwd **pwdp)
6145 {
6146         struct cache_fpl fpl;
6147         struct pwd *pwd;
6148         struct vnode *dvp;
6149         struct componentname *cnp;
6150         int error;
6151
6152         fpl.status = CACHE_FPL_STATUS_UNSET;
6153         fpl.in_smr = false;
6154         fpl.ndp = ndp;
6155         fpl.cnp = cnp = &ndp->ni_cnd;
6156         MPASS(ndp->ni_lcf == 0);
6157         MPASS(curthread == cnp->cn_thread);
6158         KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6159             ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6160             cnp->cn_flags));
6161         if ((cnp->cn_flags & SAVESTART) != 0) {
6162                 MPASS(cnp->cn_nameiop != LOOKUP);
6163         }
6164         MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6165
6166         if (__predict_false(!cache_can_fplookup(&fpl))) {
6167                 *status = fpl.status;
6168                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6169                 return (EOPNOTSUPP);
6170         }
6171
6172         cache_fpl_checkpoint_outer(&fpl);
6173
6174         cache_fpl_smr_enter_initial(&fpl);
6175 #ifdef INVARIANTS
6176         fpl.debug.ni_pathlen = ndp->ni_pathlen;
6177 #endif
6178         fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6179         fpl.fsearch = false;
6180         fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
6181         fpl.tvp = NULL; /* for degenerate path handling */
6182         fpl.pwd = pwdp;
6183         pwd = pwd_get_smr();
6184         *(fpl.pwd) = pwd;
6185         ndp->ni_rootdir = pwd->pwd_rdir;
6186         ndp->ni_topdir = pwd->pwd_jdir;
6187
6188         if (cnp->cn_pnbuf[0] == '/') {
6189                 dvp = cache_fpl_handle_root(&fpl);
6190                 MPASS(ndp->ni_resflags == 0);
6191                 ndp->ni_resflags = NIRES_ABS;
6192         } else {
6193                 if (ndp->ni_dirfd == AT_FDCWD) {
6194                         dvp = pwd->pwd_cdir;
6195                 } else {
6196                         error = cache_fplookup_dirfd(&fpl, &dvp);
6197                         if (__predict_false(error != 0)) {
6198                                 goto out;
6199                         }
6200                 }
6201         }
6202
6203         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6204         error = cache_fplookup_impl(dvp, &fpl);
6205 out:
6206         cache_fpl_smr_assert_not_entered(&fpl);
6207         cache_fpl_assert_status(&fpl);
6208         *status = fpl.status;
6209         if (SDT_PROBES_ENABLED()) {
6210                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6211                 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6212                         SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6213                             ndp);
6214         }
6215
6216         if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6217                 MPASS(error != CACHE_FPL_FAILED);
6218                 if (error != 0) {
6219                         MPASS(fpl.dvp == NULL);
6220                         MPASS(fpl.tvp == NULL);
6221                         MPASS(fpl.savename == false);
6222                 }
6223                 ndp->ni_dvp = fpl.dvp;
6224                 ndp->ni_vp = fpl.tvp;
6225                 if (fpl.savename) {
6226                         cnp->cn_flags |= HASBUF;
6227                 } else {
6228                         cache_fpl_cleanup_cnp(cnp);
6229                 }
6230         }
6231         return (error);
6232 }