sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 #include "opt_ddb.h"
  39 #include "opt_ktrace.h"
  40
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/capsicum.h>
  44 #include <sys/counter.h>
  45 #include <sys/filedesc.h>
  46 #include <sys/fnv_hash.h>
  47 #include <sys/kernel.h>
  48 #include <sys/ktr.h>
  49 #include <sys/lock.h>
  50 #include <sys/malloc.h>
  51 #include <sys/fcntl.h>
  52 #include <sys/jail.h>
  53 #include <sys/mount.h>
  54 #include <sys/namei.h>
  55 #include <sys/proc.h>
  56 #include <sys/seqc.h>
  57 #include <sys/sdt.h>
  58 #include <sys/smr.h>
  59 #include <sys/smp.h>
  60 #include <sys/syscallsubr.h>
  61 #include <sys/sysctl.h>
  62 #include <sys/sysproto.h>
  63 #include <sys/vnode.h>
  64 #include <ck_queue.h>
  65 #ifdef KTRACE
  66 #include <sys/ktrace.h>
  67 #endif
  68 #ifdef INVARIANTS
  69 #include <machine/_inttypes.h>
  70 #endif
  71
  72 #include <security/audit/audit.h>
  73 #include <security/mac/mac_framework.h>
  74
  75 #ifdef DDB
  76 #include <ddb/ddb.h>
  77 #endif
  78
  79 #include <vm/uma.h>
  80
  81 /*
  82  * High level overview of name caching in the VFS layer.
  83  *
  84  * Originally caching was implemented as part of UFS, later extracted to allow
  85  * use by other filesystems. A decision was made to make it optional and
  86  * completely detached from the rest of the kernel, which comes with limitations
  87  * outlined near the end of this comment block.
  88  *
  89  * This fundamental choice needs to be revisited. In the meantime, the current
  90  * state is described below. Significance of all notable routines is explained
  91  * in comments placed above their implementation. Scattered throughout the
  92  * file are TODO comments indicating shortcomings which can be fixed without
  93  * reworking everything (most of the fixes will likely be reusable). Various
  94  * details are omitted from this explanation to not clutter the overview, they
  95  * have to be checked by reading the code and associated commentary.
  96  *
  97  * Keep in mind that it's individual path components which are cached, not full
  98  * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
  99  * one for each name.
 100  *
 101  * I. Data organization
 102  *
 103  * Entries are described by "struct namecache" objects and stored in a hash
 104  * table. See cache_get_hash for more information.
 105  *
 106  * "struct vnode" contains pointers to source entries (names which can be found
 107  * when traversing through said vnode), destination entries (names of that
 108  * vnode, see "Limitations" for a breakdown on the subject) and a pointer to
 109  * the parent vnode.
 110  *
 111  * The (directory vnode; name) tuple reliably determines the target entry if
 112  * it exists.
 113  *
 114  * Since there are no small locks at this time (all are 32 bytes in size on
 115  * LP64), the code works around the problem by introducing lock arrays to
 116  * protect hash buckets and vnode lists.
 117  *
 118  * II. Filesystem integration
 119  *
 120  * Filesystems participating in name caching do the following:
 121  * - set vop_lookup routine to vfs_cache_lookup
 122  * - set vop_cachedlookup to whatever can perform the lookup if the above fails
 123  * - if they support lockless lookup (see below), vop_fplookup_vexec and
 124  *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
 125  *   mount point
 126  * - call cache_purge or cache_vop_* routines to eliminate stale entries as
 127  *   applicable
 128  * - call cache_enter to add entries depending on the MAKEENTRY flag
 129  *
 130  * With the above in mind, there are 2 entry points when doing lookups:
 131  * - ... -> namei -> cache_fplookup -- this is the default
 132  * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
 133  *   should the above fail
 134  *
 135  * Example code flow how an entry is added:
 136  * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
 137  * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
 138  *
 139  * III. Performance considerations
 140  *
 141  * For lockless case forward lookup avoids any writes to shared areas apart
 142  * from the terminal path component. In other words non-modifying lookups of
 143  * different files don't suffer any scalability problems in the namecache.
 144  * Looking up the same file is limited by VFS and goes beyond the scope of this
 145  * file.
 146  *
 147  * At least on amd64 the single-threaded bottleneck for long paths is hashing
 148  * (see cache_get_hash). There are cases where the code issues acquire fence
 149  * multiple times, they can be combined on architectures which suffer from it.
 150  *
 151  * For locked case each encountered vnode has to be referenced and locked in
 152  * order to be handed out to the caller (normally that's namei). This
 153  * introduces a significant single-threaded hit and multi-threaded serialization.
 154  *
 155  * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
 156  * avoids any writes to shared areas to any components.
 157  *
 158  * Unrelated insertions are partially serialized on updating the global entry
 159  * counter and possibly serialized on colliding bucket or vnode locks.
 160  *
 161  * IV. Observability
 162  *
 163  * Note that not everything has an explicit dtrace probe, nor should it, thus
 164  * some of the one-liners below depend on implementation details.
 165  *
 166  * Examples:
 167  *
 168  * # Check what lookups failed to be handled in a lockless manner. Column 1 is
 169  * # line number, column 2 is status code (see cache_fpl_status)
 170  * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
 171  *
 172  * # Lengths of names added by binary name
 173  * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
 174  *
 175  * # Same as above but only those which exceed 64 characters
 176  * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
 177  *
 178  * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
 179  * # path is it
 180  * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
 181  *
 182  * V. Limitations and implementation defects
 183  *
 184  * - since it is possible there is no entry for an open file, tools like
 185  *   "procstat" may fail to resolve fd -> vnode -> path to anything
 186  * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
 187  *   shortage) in which case the above problem applies
 188  * - hardlinks are not tracked, thus if a vnode is reachable in more than one
 189  *   way, resolving a name may return a different path than the one used to
 190  *   open it (even if said path is still valid)
 191  * - by default entries are not added for newly created files
 192  * - adding an entry may need to evict negative entry first, which happens in 2
 193  *   distinct places (evicting on lookup, adding in a later VOP) making it
 194  *   impossible to simply reuse it
 195  * - there is a simple scheme to evict negative entries as the cache is approaching
 196  *   its capacity, but it is very unclear if doing so is a good idea to begin with
 197  * - vnodes are subject to being recycled even if target inode is left in memory,
 198  *   which loses the name cache entries when it perhaps should not. In case of tmpfs
 199  *   names get duplicated -- kept by filesystem itself and namecache separately
 200  * - struct namecache has a fixed size and comes in 2 variants, often wasting
 201  *   space.  Now hard to replace with malloc due to dependence on SMR, which
 202  *   requires UMA zones to opt in
 203  * - lack of better integration with the kernel also turns nullfs into a layered
 204  *   filesystem instead of something which can take advantage of caching
 205  *
 206  * Appendix A: where is the time lost, expanding on paragraph III
 207  *
 208  * While some care went into optimizing lookups, there is still plenty of
 209  * performance left on the table, most notably from single-threaded standpoint.
 210  * Below is a woefully incomplete list of changes which can help.  Ideas are
 211  * mostly sketched out, no claim is made all kinks or prerequisites are laid
 212  * out.
 213  *
 214  * Note there is performance lost all over VFS.
 215  *
 216  * === SMR-only lookup
 217  *
 218  * For commonly used ops like stat(2), when the terminal vnode *is* cached,
 219  * lockless lookup could refrain from refing/locking the found vnode and
 220  * instead return while within the SMR section. Then a call to, say,
 221  * vop_stat_smr could do the work (or fail with EAGAIN), finally the result
 222  * would be validated with seqc not changing. This would be faster
 223  * single-threaded as it dodges atomics and would provide full scalability for
 224  * multicore uses. This would *not* work for open(2) or other calls which need
 225  * the vnode to hang around for the long haul, but would work for aforementioned
 226  * stat(2) but also access(2), readlink(2), realpathat(2) and probably more.
 227  *
 228  * === hotpatching for sdt probes
 229  *
 230  * They result in *tons* of branches all over with rather regrettable codegen
 231  * at times. Removing sdt probes altogether gives over 2% boost in lookup rate.
 232  * Reworking the code to patch itself at runtime with asm goto would solve it.
 233  * asm goto is fully supported by gcc and clang.
 234  *
 235  * === copyinstr
 236  *
 237  * On all architectures it operates one byte at a time, while it could be
 238  * word-sized instead thanks to the Mycroft trick.
 239  *
 240  * API itself is rather pessimal for path lookup, accepting arbitrary sizes and
 241  * *optionally* filling in the length parameter.
 242  *
 243  * Instead a new routine (copyinpath?) could be introduced, demanding a buffer
 244  * size which is a multiple of the word size (and never zero), with the length
 245  * always returned. On top of it the routine could be allowed to transform the
 246  * buffer in arbitrary ways, most notably writing past the found length (not to
 247  * be confused with writing past buffer size) -- this would allow word-sized
 248  * movs while checking for '\0' later.
 249  *
 250  * === detour through namei
 251  *
 252  * Currently one suffers being called from namei, which then has to check if
 253  * things worked out locklessly. Instead the lockless lookup could be the
 254  * actual entry point which calls what is currently namei as a fallback.
 255  *
 256  * === avoidable branches in cache_can_fplookup
 257  *
 258  * The cache_fast_lookup_enabled flag check could be hotpatchable (in fact if
 259  * this is off, none of fplookup code should execute).
 260  *
 261  * Both audit and capsicum branches can be combined into one, but it requires
 262  * paying off a lot of tech debt first.
 263  *
 264  * ni_startdir could be indicated with a flag in cn_flags, eliminating the
 265  * branch.
 266  *
 267  * === mount stacks
 268  *
 269  * Crossing a mount requires checking if perhaps something is mounted on top.
 270  * Instead, an additional entry could be added to struct mount with a pointer
 271  * to the final mount on the stack. This would be recalculated on each
 272  * mount/unmount.
 273  *
 274  * === root vnodes
 275  *
 276  * It could become part of the API contract to *always* have a rootvnode set in
 277  * mnt_rootvnode. Such vnodes are annotated with VV_ROOT and vnlru would have
 278  * to be modified to always skip them.
 279  */
 280
     /*
      * Root of the "vfs.cache" sysctl subtree. The param, stats and debug nodes
      * declared further down in this file are all parented to it.
      */
 281 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 282     "Name cache");
 283
     /*
      * Static DTrace probes.
      *
      * Everything under vfs:namecache and vfs:fplookup is defined here. The two
      * vfs:namei:lookup probes at the end are only declared, so their
      * definitions have to come from another compilation unit.
      *
      * The strings are the argument types advertised to DTrace consumers.
      */
 284 SDT_PROVIDER_DECLARE(vfs);
 285 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
 286     "struct vnode *");
 287 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
 288     "struct vnode *");
 289 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
 290     "char *");
 291 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
 292     "const char *");
 293 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
 294     "struct namecache *", "int", "int");
 295 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 296 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
 297     "char *", "struct vnode *");
 298 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 299 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 300     "struct vnode *", "char *");
 301 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 302     "struct vnode *");
 303 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 304     "struct vnode *", "char *");
 305 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 306     "char *");
 307 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 308     "struct componentname *");
 309 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 310     "struct componentname *");
 311 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
 312 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
 313 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 314 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 315 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 316     "struct vnode *");
 317 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 318     "char *");
 319 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 320     "char *");
 321 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
 322
     /*
      * NOTE(review): the first argument type below reads "struct nameidata"
      * with no '*'; it is likely meant to be a pointer type -- confirm against
      * the place where the probe fires before touching the string.
      */
 323 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 324 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 325 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 326
     /*
      * Switch for the lockless ("fast path") lookup, see the notes about
      * cache_can_fplookup in the overview comment at the top of the file.
      *
      * NOTE(review): declared as char rather than bool even though it is
      * initialized with true -- presumably so that it can be accessed with
      * char-sized atomics; confirm before changing the type.
      */
 327 static char __read_frequently cache_fast_lookup_enabled = true;
 328
 329 /*
 330  * State specific to negative entries.
 331  *
      * It lives in struct namecache in a union with the pointer to the target
      * vnode (n_un), which is why the assertion below caps its size at that of
      * a pointer.
 332  */
 333 struct negstate {
 334         u_char neg_flag;        /* NEG_* flags */
 335         u_char neg_hit;         /* NOTE(review): presumably a hit counter -- confirm */
 336 };
 337 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 338     "the state must fit in a union with a pointer without growing it");
 339
     /*
      * A single cached path component: the name nc_name (nc_nlen bytes plus the
      * terminating nul) as seen in the directory nc_dvp.
      *
      * The union holds either the vnode the name resolves to or, for entries
      * flagged NCF_NEGATIVE, the negative entry state. The nc_vp and nc_neg
      * macros defined below are shorthands for its members.
      *
      * nc_name is a flexible array member, the room for it comes from the size
      * of the zone the entry is allocated from (see CACHE_ZONE_*_SIZE).
      */
 340 struct  namecache {
 341         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 342         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 343         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 344         struct  vnode *nc_dvp;          /* vnode of parent of name */
 345         union {
 346                 struct  vnode *nu_vp;   /* vnode the name refers to */
 347                 struct  negstate nu_neg;/* negative entry state */
 348         } n_un;
 349         u_char  nc_flag;                /* flag bits */
 350         u_char  nc_nlen;                /* length of name */
 351         char    nc_name[];              /* segment name + nul */
 352 };
 353
 354 /*
 355  * struct namecache_ts repeats struct namecache layout up to the
 356  * nc_nlen member.
 357  * struct namecache_ts is used in place of struct namecache when time(s) need
 358  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 359  * both a non-dotdot directory name plus dotdot for the directory's
 360  * parent.
 361  *
 362  * See below for alignment requirement.
 363  */
 364 struct  namecache_ts {
 365         struct  timespec nc_time;       /* timespec provided by fs */
 366         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 367         int     nc_ticks;               /* ticks value when entry was added */
             /*
              * NOTE(review): explicit padding, presumably there to keep nc_nc
              * suitably aligned after the odd int above -- confirm.
              */
 368         int     nc_pad;
             /*
              * The regular entry. It has to stay the last member as it ends
              * with a flexible array (nc_name).
              */
 369         struct namecache nc_nc;
 370 };
 371
     /*
      * Head type for a tail queue of entries.
      *
      * NOTE(review): going by the name this collects entries which are about to
      * be freed in one go -- confirm against the users, none are in sight here.
      */
 372 TAILQ_HEAD(cache_freebatch, namecache);
 373
 374 /*
 375  * At least mips n32 performs 64-bit accesses to timespec as found
 376  * in namecache_ts and requires them to be aligned. Since others
 377  * may be in the same spot suffer a little bit and enforce the
 378  * alignment for everyone. Note this is a nop for 64-bit platforms.
      *
      * The value is used as an alignment mask rather than a byte count: the
      * zone size assertions further down check divisibility by
      * (CACHE_ZONE_ALIGNMENT + 1).
 379  */
 380 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 381
 382 /*
 383  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 384  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 385  * smaller and the value was bumped to retain the total size, but it
 386  * was never re-evaluated for suitability. A simple test counting
 387  * lengths during package building shows that the value of 45 covers
 388  * about 86% of all added entries, reaching 99% at 65.
 389  *
 390  * Regardless of the above, use of dedicated zones instead of malloc may be
 391  * inducing additional waste. This may be hard to address as said zones are
 392  * tied to VFS SMR. Even if retaining them, the current split should be
 393  * re-evaluated.
 394  */
     /*
      * CACHE_PATH_CUTOFF is the longest name which still goes into a small
      * item, see cache_alloc_uma. CACHE_LARGE_PAD is added on top of large
      * items.
      * NOTE(review): the pad presumably exists only to make the large sizes
      * pass the alignment assertions below -- confirm.
      */
 395 #ifdef __LP64__
 396 #define CACHE_PATH_CUTOFF       45
 397 #define CACHE_LARGE_PAD         6
 398 #else
 399 #define CACHE_PATH_CUTOFF       41
 400 #define CACHE_LARGE_PAD         2
 401 #endif
 402
     /*
      * Item sizes: {small, large} x {without, with timestamps}.
      *
      * The plain variants are the fixed part of struct namecache followed by
      * room for the name (CACHE_PATH_CUTOFF or NAME_MAX bytes) and its
      * terminating nul. The _TS variants additionally account for the part of
      * struct namecache_ts which precedes the embedded entry.
      */
 403 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 404 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 405 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 406 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 407
     /* Each size has to be a multiple of the alignment (mask + 1). */
 408 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 409 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 410 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 411 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 412
     /* Shorthands for the members of the n_un union in struct namecache. */
 413 #define nc_vp           n_un.nu_vp
 414 #define nc_neg          n_un.nu_neg
 415
 416 /*
 417  * Flags in namecache.nc_flag
      *
      * Only flags whose role can be read off the nearby code are annotated.
 418  */
 419 #define NCF_WHITE       0x01    /* rejected by cache_fpl_neg_ncp_canuse */
 420 #define NCF_ISDOTDOT    0x02
 421 #define NCF_TS          0x04    /* timestamps are available, see cache_out_ts */
 422 #define NCF_DTS         0x08
 423 #define NCF_DVDROP      0x10
 424 #define NCF_NEGATIVE    0x20    /* negative entry, see NCP2NEGSTATE */
 425 #define NCF_INVALID     0x40    /* set by cache_ncp_invalidate */
 426 #define NCF_WIP         0x80    /* rejected by cache_ncp_canuse */
 427
 428 /*
 429  * Flags in negstate.neg_flag
 430  */
 431 #define NEG_HOT         0x01
 432
     /* Forward declaration, the sysctl commentary below refers to it. */
 433 static bool     cache_neg_evict_cond(u_long lnumcache);
 434
 435 /*
 436  * Mark an entry as invalid.
 437  *
 438  * This is called before it starts getting deconstructed.
      *
      * The flag is stored first and the release fence is issued afterwards,
      * which orders the store ahead of whatever the caller writes next.
      * Lockless readers do the mirror image in cache_ncp_canuse: an acquire
      * fence once they are done reading the entry, then a load of the flag.
      *
      * Invalidating the same entry twice is a bug and trips the assertion.
 439  */
 440 static void
 441 cache_ncp_invalidate(struct namecache *ncp)
 442 {
 443
 444         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 445             ("%s: entry %p already invalid", __func__, ncp));
 446         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 447         atomic_thread_fence_rel();
 448 }
 449
 450 /*
 451  * Check whether the entry can be safely used.
 452  *
 453  * All places which elide locks are supposed to call this after they are
 454  * done with reading from an entry.
      *
      * Evaluates to true if neither NCF_INVALID nor NCF_WIP is set, with the
      * result hinted as likely. The acquire fence is issued before the flag is
      * loaded so that the reads the caller already performed on the entry are
      * ordered ahead of the check.
      *
      * This is a statement expression, the argument is evaluated exactly once.
 455  */
 456 #define cache_ncp_canuse(ncp)   ({                                      \
 457         struct namecache *_ncp = (ncp);                                 \
 458         u_char _nc_flag;                                                \
 459                                                                         \
 460         atomic_thread_fence_acq();                                      \
 461         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 462         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
 463 })
 464
 465 /*
 466  * Like the above but also checks NCF_WHITE.
      *
      * That is, evaluates to true only if none of NCF_INVALID, NCF_WIP and
      * NCF_WHITE are set. Same fence-then-load sequence, same single
      * evaluation of the argument.
 467  */
 468 #define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
 469         struct namecache *_ncp = (ncp);                                 \
 470         u_char _nc_flag;                                                \
 471                                                                         \
 472         atomic_thread_fence_acq();                                      \
 473         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 474         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
 475 })
 476
     /*
      * NOTE(review): presumably pulls in the SMR state shared with the rest of
      * VFS, which the uma_zalloc_smr/uma_zfree_smr calls in this file depend
      * on -- confirm against the macro definition.
      */
 477 VFS_SMR_DECLARE;
 478
     /*
      * Knobs and derived values exported under vfs.cache.param. Nodes created
      * with CTLFLAG_RD are read-only, the CTLFLAG_RW ones are writable.
      */
 479 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 480     "Name cache parameters");
 481
 482 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 483 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
 484     "Total namecache capacity");
 485
     /* Not static, thus visible to other compilation units. */
 486 u_int ncsizefactor = 2;
 487 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 488     "Size factor for namecache");
 489
 490 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 491 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 492     "Ratio of negative namecache entries");
 493
 494 /*
 495  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 496  *
 497  * Check cache_neg_evict_cond for details.
      *
      * No sysctl is attached to the percentage right here, only the value
      * derived from it (neg_min) is exported below.
 498  */
 499 static u_int ncnegminpct = 3;
 500
 501 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 502 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 503     "Negative entry count above which automatic eviction is allowed");
 504
 505 /*
 506  * Structures associated with name caching.
 507  */
     /*
      * Map a hash value to the head of its chain.
      *
      * NOTE(review): nchash is applied as a bit mask here while both the
      * comment on the variable and the sysctl description call it the size of
      * the table -- presumably it holds the size minus one; confirm where it
      * gets initialized.
      */
 508 #define NCHHASH(hash) \
 509         (&nchashtbl[(hash) & nchash])
 510 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 511 static u_long __read_mostly     nchash;                 /* size of hash table */
 512 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 513     "Size of namecache hash table");
 514 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 515 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 516
 517 struct nchstats nchstats;               /* cache effectiveness statistics */
 518
 519 static u_int __exclusive_cache_line neg_cycle;
 520
     /*
      * Negative entries are spread over numneglists lists. ncneghash is the
      * mask NCP2NEGLIST applies to pick one, so the count is the mask plus one.
      */
 521 #define ncneghash       3
 522 #define numneglists     (ncneghash + 1)
 523
     /*
      * One set of negative entries: two queues (nl_list and nl_hotlist), a
      * count (nl_hotnum) and two mutexes. nl_lock and the structure as a whole
      * are aligned to the cache line size.
      */
 524 struct neglist {
 525         struct mtx              nl_evict_lock;
 526         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 527         TAILQ_HEAD(, namecache) nl_list;
 528         TAILQ_HEAD(, namecache) nl_hotlist;
 529         u_long                  nl_hotnum;
 530 } __aligned(CACHE_LINE_SIZE);
 531
 532 static struct neglist neglists[numneglists];
 533
 534 static inline struct neglist *
 535 NCP2NEGLIST(struct namecache *ncp)
 536 {
 537
 538         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 539 }
 540
     /*
      * Return the negative entry state embedded in the entry.
      *
      * Only valid for entries flagged NCF_NEGATIVE as otherwise the storage
      * holds the target vnode pointer; this is asserted with MPASS.
      */
 541 static inline struct negstate *
 542 NCP2NEGSTATE(struct namecache *ncp)
 543 {
 544
 545         MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
 546         return (&ncp->nc_neg);
 547 }
 548
     /*
      * Lock arrays standing in for per-bucket and per-vnode locks, see "Data
      * organization" in the overview comment at the top of the file.
      *
      * ncbuckethash and ncvnodehash are masks applied to a hash value or to a
      * vnode address respectively, the number of locks is the mask plus one.
      * HASH2BUCKETLOCK hands out the padded bucket mutex as a plain
      * struct mtx pointer.
      */
 549 #define numbucketlocks (ncbuckethash + 1)
 550 static u_int __read_mostly  ncbuckethash;
 551 static struct mtx_padalign __read_mostly  *bucketlocks;
 552 #define HASH2BUCKETLOCK(hash) \
 553         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 554
 555 #define numvnodelocks (ncvnodehash + 1)
 556 static u_int __read_mostly  ncvnodehash;
 557 static struct mtx __read_mostly *vnodelocks;
 558 static inline struct mtx *
 559 VP2VNODELOCK(struct vnode *vp)
 560 {
 561
 562         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 563 }
 564
 565 static void
 566 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 567 {
 568         struct namecache_ts *ncp_ts;
 569
 570         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 571             (tsp == NULL && ticksp == NULL),
 572             ("No NCF_TS"));
 573
 574         if (tsp == NULL)
 575                 return;
 576
 577         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 578         *tsp = ncp_ts->nc_time;
 579         *ticksp = ncp_ts->nc_ticks;
 580 }
 581
     /* Only with DEBUG_CACHE: a writable debug.vfscache knob. */
 582 #ifdef DEBUG_CACHE
 583 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 584 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 585     "VFS namecache enabled");
 586 #endif
 587
 588 /* Export size information to userland */
     /*
      * Note the reported value does not include any room for the name as
      * nc_name is a flexible array member.
      */
 589 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 590     sizeof(struct namecache), "sizeof(struct namecache)");
 591
 592 /*
 593  * The new name cache statistics
      *
      * Everything below shows up read-only under vfs.cache.stats.
 594  */
 595 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 596     "Name cache statistics");
 597
     /*
      * STATNODE_ULONG exports an already existing u_long.
      * STATNODE_COUNTER also defines the counter variable it exports.
      *
      * Both expansions already end with a semicolon, the one placed after each
      * use is redundant.
      */
 598 #define STATNODE_ULONG(name, varname, descr)                                    \
 599         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 600 #define STATNODE_COUNTER(name, varname, descr)                                  \
 601         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 602         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 603             descr);
 604 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 605 STATNODE_ULONG(count, numcache, "Number of cache entries");
 606 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 607 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 608 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 609 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 610 STATNODE_COUNTER(poszaps, numposzaps,
 611     "Number of cache hits (positive) we do not want to cache");
 612 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 613 STATNODE_COUNTER(negzaps, numnegzaps,
 614     "Number of cache hits (negative) we do not want to cache");
 615 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 616 /* These count for vn_getcwd(), too. */
 617 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 618 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 619     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 620 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 621 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 622 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
 623
 624 /*
 625  * Debug or developer statistics.
      *
      * Plain u_long variables exported read-only under vfs.cache.debug.
 626  */
 627 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 628     "Name cache debugging");
     /* The expansion ends with a semicolon, same as with the STATNODE macros. */
 629 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 630         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 631 static u_long zap_bucket_relock_success;
 632 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
 633     "Number of successful removals after relocking");
     /*
      * NOTE(review): the next two are exported with an empty description, what
      * they count has to be derived from the code which bumps them.
      */
 634 static u_long zap_bucket_fail;
 635 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 636 static u_long zap_bucket_fail2;
 637 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 638 static u_long cache_lock_vnodes_cel_3_failures;
 639 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 640     "Number of times 3-way vnode locking failed");
 641
     /* Forward declarations of static routines used ahead of their definition. */
 642 static void cache_zap_locked(struct namecache *ncp);
 643 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 644     char **retbuf, size_t *buflen, size_t addend);
 645 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 646     char **retbuf, size_t *buflen);
 647 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 648     char **retbuf, size_t *len, size_t addend);
 649
     /*
      * Malloc type for allocations made by the name cache with malloc. The
      * entries themselves come from the UMA zones declared further down.
      */
 650 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 651
 652 static inline void
 653 cache_assert_vlp_locked(struct mtx *vlp)
 654 {
 655
 656         if (vlp != NULL)
 657                 mtx_assert(vlp, MA_OWNED);
 658 }
 659
 660 static inline void
 661 cache_assert_vnode_locked(struct vnode *vp)
 662 {
 663         struct mtx *vlp;
 664
 665         vlp = VP2VNODELOCK(vp);
 666         cache_assert_vlp_locked(vlp);
 667 }
 668
 669 /*
 670  * Directory vnodes with entries are held for two reasons:
 671  * 1. make them less of a target for reclamation in vnlru
 672  * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 673  *
 674  * It will be feasible to stop doing it altogether if all filesystems start
 675  * supporting lockless lookup.
      *
      * The routine takes the hold and bumps the counter exported as
      * vfs.cache.stats.heldvnodes. As asserted below it expects the lock
      * covering the vnode to be owned and v_cache_src to still be empty,
      * meaning it is to be called ahead of linking in the first entry.
 676  */
 677 static void
 678 cache_hold_vnode(struct vnode *vp)
 679 {
 680
 681         cache_assert_vnode_locked(vp);
 682         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 683         vhold(vp);
 684         counter_u64_add(numcachehv, 1);
 685 }
 686
     /*
      * Counterpart to cache_hold_vnode: release the hold on the vnode and
      * decrement the counter of held vnodes.
      */
 687 static void
 688 cache_drop_vnode(struct vnode *vp)
 689 {
 690
 691         /*
 692          * Called after all locks are dropped, meaning we can't assert
 693          * on the state of v_cache_src.
 694          */
 695         vdrop(vp);
 696         counter_u64_add(numcachehv, -1);
 697 }
 698
 699 /*
 700  * UMA zones.
      *
      * One per combination of item size (small, large) and presence of
      * timestamps (_ts); cache_alloc_uma chooses between them. The two zones
      * without timestamps also back cache_symlink_alloc/cache_symlink_free.
 701  */
 702 static uma_zone_t __read_mostly cache_zone_small;
 703 static uma_zone_t __read_mostly cache_zone_small_ts;
 704 static uma_zone_t __read_mostly cache_zone_large;
 705 static uma_zone_t __read_mostly cache_zone_large_ts;
 706
 707 char *
 708 cache_symlink_alloc(size_t size, int flags)
 709 {
 710
 711         if (size < CACHE_ZONE_SMALL_SIZE) {
 712                 return (uma_zalloc_smr(cache_zone_small, flags));
 713         }
 714         if (size < CACHE_ZONE_LARGE_SIZE) {
 715                 return (uma_zalloc_smr(cache_zone_large, flags));
 716         }
 717         counter_u64_add(symlinktoobig, 1);
 718         SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
 719         return (NULL);
 720 }
 721
 722 void
 723 cache_symlink_free(char *string, size_t size)
 724 {
 725
 726         MPASS(string != NULL);
 727         KASSERT(size < CACHE_ZONE_LARGE_SIZE,
 728             ("%s: size %zu too big", __func__, size));
 729
 730         if (size < CACHE_ZONE_SMALL_SIZE) {
 731                 uma_zfree_smr(cache_zone_small, string);
 732                 return;
 733         }
 734         if (size < CACHE_ZONE_LARGE_SIZE) {
 735                 uma_zfree_smr(cache_zone_large, string);
 736                 return;
 737         }
 738         __assert_unreachable();
 739 }
 740
 741 static struct namecache *
 742 cache_alloc_uma(int len, bool ts)
 743 {
 744         struct namecache_ts *ncp_ts;
 745         struct namecache *ncp;
 746
 747         if (__predict_false(ts)) {
 748                 if (len <= CACHE_PATH_CUTOFF)
 749                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 750                 else
 751                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 752                 ncp = &ncp_ts->nc_nc;
 753         } else {
 754                 if (len <= CACHE_PATH_CUTOFF)
 755                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 756                 else
 757                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 758         }
 759         return (ncp);
 760 }
 761
 762 static void
 763 cache_free_uma(struct namecache *ncp)
 764 {
 765         struct namecache_ts *ncp_ts;
 766
 767         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 768                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 769                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 770                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 771                 else
 772                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 773         } else {
 774                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 775                         uma_zfree_smr(cache_zone_small, ncp);
 776                 else
 777                         uma_zfree_smr(cache_zone_large, ncp);
 778         }
 779 }
 780
/*
 * Allocate a new namecache entry, enforcing the ncsize limit.
 *
 * Returns NULL (and bumps numdrops) if the cache is at capacity, otherwise
 * the result of cache_alloc_uma().  On success numcache stays incremented
 * on behalf of the new entry; cache_free() decrements it.
 */
static struct namecache *
cache_alloc(int len, bool ts)
{
        u_long lnumcache;

        /*
         * Avoid blowout in namecache entries.
         *
         * Bugs:
         * 1. filesystems may end up trying to add an already existing entry
         * (for example this can happen after a cache miss during concurrent
         * lookup), in which case we will call cache_neg_evict despite not
         * adding anything.
         * 2. the routine may fail to free anything and no provisions are made
         * to make it try harder (see the inside for failure modes)
         * 3. it only ever looks at negative entries.
         */
        /* Reserve a slot up front; lnumcache is the post-increment count. */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (cache_neg_evict_cond(lnumcache)) {
                /* Something got evicted, re-read the current count. */
                lnumcache = atomic_load_long(&numcache);
        }
        if (__predict_false(lnumcache >= ncsize)) {
                /* Over the limit: give the reservation back. */
                atomic_subtract_long(&numcache, 1);
                counter_u64_add(numdrops, 1);
                return (NULL);
        }
        return (cache_alloc_uma(len, ts));
}
 809
/*
 * Free a single entry which was already removed from the cache.
 *
 * If the entry is marked NCF_DVDROP (set by cache_zap_locked() when the
 * directory lost its last source entry) the hold on the directory vnode is
 * dropped as well.  numcache is decremented.
 */
static void
cache_free(struct namecache *ncp)
{

        MPASS(ncp != NULL);
        if ((ncp->nc_flag & NCF_DVDROP) != 0) {
                /* Must happen before the entry memory is released. */
                cache_drop_vnode(ncp->nc_dvp);
        }
        cache_free_uma(ncp);
        atomic_subtract_long(&numcache, 1);
}
 821
 822 static void
 823 cache_free_batch(struct cache_freebatch *batch)
 824 {
 825         struct namecache *ncp, *nnp;
 826         int i;
 827
 828         i = 0;
 829         if (TAILQ_EMPTY(batch))
 830                 goto out;
 831         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 832                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 833                         cache_drop_vnode(ncp->nc_dvp);
 834                 }
 835                 cache_free_uma(ncp);
 836                 i++;
 837         }
 838         atomic_subtract_long(&numcache, i);
 839 out:
 840         SDT_PROBE1(vfs, namecache, purge, batch, i);
 841 }
 842
 843 /*
 844  * Hashing.
 845  *
 846  * The code was made to use FNV in 2001 and this choice needs to be revisited.
 847  *
 848  * Short summary of the difficulty:
 849  * The longest name which can be inserted is NAME_MAX characters in length (or
 850  * 255 at the time of writing this comment), while majority of names used in
 851  * practice are significantly shorter (mostly below 10). More importantly,
 852  * the majority of lookups performed find names which are even shorter than that.
 853  *
 854  * This poses a problem where hashes which do better than FNV past word size
 855  * (or so) tend to come with additional overhead when finalizing the result,
 856  * making them noticeably slower for the most commonly used range.
 857  *
 858  * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
 859  *
 860  * When looking it up the most time consuming part by a large margin (at least
 861  * on amd64) is hashing.  Replacing FNV with something which pessimizes short
 862  * input would make the slowest part stand out even more.
 863  */
 864
/*
 * Precompute the per-vnode part of the hash and store it in v_nchash.
 * The stored value is later used as the initial hash state when hashing
 * names looked up in this vnode (see cache_get_hash()).
 *
 * TODO: With the value stored we can do better than computing the hash based
 * on the address.
 */
static void
cache_prehash(struct vnode *vp)
{

        /* Hashes the bytes of the pointer value itself, not the vnode. */
        vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
}
 875
 876 static uint32_t
 877 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 878 {
 879
 880         return (fnv_32_buf(name, len, dvp->v_nchash));
 881 }
 882
/*
 * Incremental hashing, step 1: obtain the initial state for names looked
 * up in dvp.  This is the same seed cache_get_hash() starts from.
 */
static uint32_t
cache_get_hash_iter_start(struct vnode *dvp)
{

        return (dvp->v_nchash);
}
 889
/*
 * Incremental hashing, step 2: fold one more character of the name into
 * the running hash and return the new state.
 */
static uint32_t
cache_get_hash_iter(char c, uint32_t hash)
{

        return (fnv_32_buf(&c, 1, hash));
}
 896
 897 static uint32_t
 898 cache_get_hash_iter_finish(uint32_t hash)
 899 {
 900
 901         return (hash);
 902 }
 903
/*
 * Return the hash chain head for the entry, recomputing the hash from the
 * entry's name, name length and directory vnode.
 */
static inline struct nchashhead *
NCP2BUCKET(struct namecache *ncp)
{
        uint32_t hash;

        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
        return (NCHHASH(hash));
}
 912
/*
 * Return the bucket lock covering the entry, recomputing the hash from the
 * entry's name, name length and directory vnode.
 */
static inline struct mtx *
NCP2BUCKETLOCK(struct namecache *ncp)
{
        uint32_t hash;

        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
        return (HASH2BUCKETLOCK(hash));
}
 921
 922 #ifdef INVARIANTS
 923 static void
 924 cache_assert_bucket_locked(struct namecache *ncp)
 925 {
 926         struct mtx *blp;
 927
 928         blp = NCP2BUCKETLOCK(ncp);
 929         mtx_assert(blp, MA_OWNED);
 930 }
 931
 932 static void
 933 cache_assert_bucket_unlocked(struct namecache *ncp)
 934 {
 935         struct mtx *blp;
 936
 937         blp = NCP2BUCKETLOCK(ncp);
 938         mtx_assert(blp, MA_NOTOWNED);
 939 }
 940 #else
 941 #define cache_assert_bucket_locked(x) do { } while (0)
 942 #define cache_assert_bucket_unlocked(x) do { } while (0)
 943 #endif
 944
 945 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 946 static void
 947 _cache_sort_vnodes(void **p1, void **p2)
 948 {
 949         void *tmp;
 950
 951         MPASS(*p1 != NULL || *p2 != NULL);
 952
 953         if (*p1 > *p2) {
 954                 tmp = *p2;
 955                 *p2 = *p1;
 956                 *p1 = tmp;
 957         }
 958 }
 959
 960 static void
 961 cache_lock_all_buckets(void)
 962 {
 963         u_int i;
 964
 965         for (i = 0; i < numbucketlocks; i++)
 966                 mtx_lock(&bucketlocks[i]);
 967 }
 968
 969 static void
 970 cache_unlock_all_buckets(void)
 971 {
 972         u_int i;
 973
 974         for (i = 0; i < numbucketlocks; i++)
 975                 mtx_unlock(&bucketlocks[i]);
 976 }
 977
 978 static void
 979 cache_lock_all_vnodes(void)
 980 {
 981         u_int i;
 982
 983         for (i = 0; i < numvnodelocks; i++)
 984                 mtx_lock(&vnodelocks[i]);
 985 }
 986
 987 static void
 988 cache_unlock_all_vnodes(void)
 989 {
 990         u_int i;
 991
 992         for (i = 0; i < numvnodelocks; i++)
 993                 mtx_unlock(&vnodelocks[i]);
 994 }
 995
 996 static int
 997 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 998 {
 999
1000         cache_sort_vnodes(&vlp1, &vlp2);
1001
1002         if (vlp1 != NULL) {
1003                 if (!mtx_trylock(vlp1))
1004                         return (EAGAIN);
1005         }
1006         if (!mtx_trylock(vlp2)) {
1007                 if (vlp1 != NULL)
1008                         mtx_unlock(vlp1);
1009                 return (EAGAIN);
1010         }
1011
1012         return (0);
1013 }
1014
/*
 * Lock up to two vnode locks, blocking as needed.
 *
 * Unlike cache_trylock_vnodes() this routine does not sort: the caller has
 * to pass the locks already ordered by address (vlp1 <= vlp2), which is
 * asserted.  Either pointer may be NULL, but not both.
 */
static void
cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

        MPASS(vlp1 != NULL || vlp2 != NULL);
        MPASS(vlp1 <= vlp2);

        if (vlp1 != NULL)
                mtx_lock(vlp1);
        if (vlp2 != NULL)
                mtx_lock(vlp2);
}
1027
/*
 * Unlock up to two vnode locks.  Either pointer may be NULL, but not both.
 * No ordering between the two is required here.
 */
static void
cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
{

        MPASS(vlp1 != NULL || vlp2 != NULL);

        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
}
1039
/*
 * Sysctl handler exporting struct nchstats.
 *
 * A copy of the global nchstats is taken and the hit/miss fields are then
 * overwritten with values fetched from the per-CPU counters.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
        struct nchstats snap;

        /* Size probe only: report how much space is needed. */
        if (req->oldptr == NULL)
                return (SYSCTL_OUT(req, 0, sizeof(snap)));

        snap = nchstats;
        snap.ncs_goodhits = counter_u64_fetch(numposhits);
        snap.ncs_neghits = counter_u64_fetch(numneghits);
        /* "Bad hits" are the sum of positive and negative zaps. */
        snap.ncs_badhits = counter_u64_fetch(numposzaps) +
            counter_u64_fetch(numnegzaps);
        snap.ncs_miss = counter_u64_fetch(nummisszap) +
            counter_u64_fetch(nummiss);

        return (SYSCTL_OUT(req, &snap, sizeof(snap)));
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
    "VFS cache effectiveness statistics");
1061
/*
 * Recompute neg_min as ncnegminpct percent of ncsize.  The division
 * truncates.
 */
static void
cache_recalc_neg_min(void)
{

        neg_min = (ncsize * ncnegminpct) / 100;
}
1068
1069 static int
1070 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
1071 {
1072         u_int val;
1073         int error;
1074
1075         val = ncnegminpct;
1076         error = sysctl_handle_int(oidp, &val, 0, req);
1077         if (error != 0 || req->newptr == NULL)
1078                 return (error);
1079
1080         if (val == ncnegminpct)
1081                 return (0);
1082         if (val < 0 || val > 99)
1083                 return (EINVAL);
1084         ncnegminpct = val;
1085         cache_recalc_neg_min();
1086         return (0);
1087 }
1088
1089 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
1090     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
1091     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
1092
1093 #ifdef DEBUG_CACHE
1094 /*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
/* Parent node for the hash table statistics handlers defined below. */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
1100
/*
 * Sysctl handler reporting the length of every hash chain as an array of
 * ints, one per bucket.
 *
 * The buffer is allocated before the bucket locks are taken.  If the table
 * size changed in between, everything is dropped and the procedure starts
 * over.
 */
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
        struct nchashhead *ncpp;
        struct namecache *ncp;
        int i, error, n_nchash, *cntbuf;

retry:
        n_nchash = nchash + 1;  /* nchash is max index, not count */
        /* Size probe only: report how much space is needed. */
        if (req->oldptr == NULL)
                return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
        cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
        cache_lock_all_buckets();
        /* Recheck the size now that the table is locked. */
        if (n_nchash != nchash + 1) {
                cache_unlock_all_buckets();
                free(cntbuf, M_TEMP);
                goto retry;
        }
        /* Scan hash tables counting entries */
        for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
                CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
                        cntbuf[i]++;
        cache_unlock_all_buckets();
        /* Copy out with the locks dropped, one counter at a time. */
        for (error = 0, i = 0; i < n_nchash; i++)
                if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
                        break;
        free(cntbuf, M_TEMP);
        return (error);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
    CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
    "nchash chain lengths");
1133
1134 static int
1135 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
1136 {
1137         int error;
1138         struct nchashhead *ncpp;
1139         struct namecache *ncp;
1140         int n_nchash;
1141         int count, maxlength, used, pct;
1142
1143         if (!req->oldptr)
1144                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
1145
1146         cache_lock_all_buckets();
1147         n_nchash = nchash + 1;  /* nchash is max index, not count */
1148         used = 0;
1149         maxlength = 0;
1150
1151         /* Scan hash tables for applicable entries */
1152         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
1153                 count = 0;
1154                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
1155                         count++;
1156                 }
1157                 if (count)
1158                         used++;
1159                 if (maxlength < count)
1160                         maxlength = count;
1161         }
1162         n_nchash = nchash + 1;
1163         cache_unlock_all_buckets();
1164         pct = (used * 100) / (n_nchash / 100);
1165         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
1166         if (error)
1167                 return (error);
1168         error = SYSCTL_OUT(req, &used, sizeof(used));
1169         if (error)
1170                 return (error);
1171         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
1172         if (error)
1173                 return (error);
1174         error = SYSCTL_OUT(req, &pct, sizeof(pct));
1175         if (error)
1176                 return (error);
1177         return (0);
1178 }
1179 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1180     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1181     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1182 #endif
1183
1184 /*
1185  * Negative entries management
1186  *
1187  * Various workloads create plenty of negative entries and barely use them
1188  * afterwards. Moreover malicious users can keep performing bogus lookups
1189  * adding even more entries. For example "make tinderbox" as of writing this
1190  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1191  * negative.
1192  *
1193  * As such, a rather aggressive eviction method is needed. The currently
1194  * employed method is a placeholder.
1195  *
1196  * Entries are split over numneglists separate lists, each of which is further
1197  * split into hot and cold entries. Entries get promoted after getting a hit.
1198  * Eviction happens on addition of new entry.
1199  */
/* Parent node for the negative entry statistics below. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

/* Maintained by cache_neg_insert() and cache_neg_remove(). */
SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

/* Bumped by cache_neg_init(). */
static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

/* The counters below are bumped by cache_neg_evict() and its helpers. */
static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

/* Bumped by cache_neg_hit_finish(). */
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");
1231
/*
 * Sysctl handler reporting the total number of hot negative entries, summed
 * over all neglists.  The per-list counters are read without taking the
 * list locks.
 */
static int
sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
{
        int i, out;

        out = 0;
        for (i = 0; i < numneglists; i++)
                out += neglists[i].nl_hotnum;

        return (SYSCTL_OUT(req, &out, sizeof(out)));
}
SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
    "Number of hot negative entries");
1246
/*
 * Initialize an entry as negative: set NCF_NEGATIVE, reset the negative
 * state (flags and hit count) and bump the neg_created counter.
 */
static void
cache_neg_init(struct namecache *ncp)
{
        struct negstate *ns;

        /* The flag is set first, before NCP2NEGSTATE() is used. */
        ncp->nc_flag |= NCF_NEGATIVE;
        ns = NCP2NEGSTATE(ncp);
        ns->neg_flag = 0;
        ns->neg_hit = 0;
        counter_u64_add(neg_created, 1);
}
1258
/*
 * Hit count at which cache_neg_hit_prep() tells the caller to promote.
 */
#define CACHE_NEG_PROMOTION_THRESH 2

/*
 * Record a hit on a negative entry.
 *
 * Atomically increments neg_hit unless it already reached
 * CACHE_NEG_PROMOTION_THRESH.  Returns true only for the caller whose
 * increment made the count reach the threshold and false in all other
 * cases.
 */
static bool
cache_neg_hit_prep(struct namecache *ncp)
{
        struct negstate *ns;
        u_char n;

        ns = NCP2NEGSTATE(ncp);
        n = atomic_load_char(&ns->neg_hit);
        for (;;) {
                /* Saturated, the threshold was already reached. */
                if (n >= CACHE_NEG_PROMOTION_THRESH)
                        return (false);
                /* On failure n holds the value found in memory; retry. */
                if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
                        break;
        }
        return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
}
1277
/*
 * Nothing to do here but it is provided for completeness as some
 * cache_neg_hit_prep callers may end up returning without even
 * trying to promote.
 *
 * Expands to an empty statement; the argument is not evaluated.
 */
#define cache_neg_hit_abort(ncp)        do { } while (0)
1284
/*
 * Account for a completed negative hit: fire the hit__negative probe and
 * bump numneghits.
 */
static void
cache_neg_hit_finish(struct namecache *ncp)
{

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
        counter_u64_add(numneghits, 1);
}
1292
1293 /*
1294  * Move a negative entry to the hot list.
1295  */
1296 static void
1297 cache_neg_promote_locked(struct namecache *ncp)
1298 {
1299         struct neglist *nl;
1300         struct negstate *ns;
1301
1302         ns = NCP2NEGSTATE(ncp);
1303         nl = NCP2NEGLIST(ncp);
1304         mtx_assert(&nl->nl_lock, MA_OWNED);
1305         if ((ns->neg_flag & NEG_HOT) == 0) {
1306                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1307                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1308                 nl->nl_hotnum++;
1309                 ns->neg_flag |= NEG_HOT;
1310         }
1311 }
1312
/*
 * Move a hot negative entry to the cold list.
 *
 * The neglist lock has to be held and the entry has to be hot.  The hit
 * count is reset, so the entry has to collect hits again before
 * cache_neg_hit_prep() asks for another promotion.
 */
static void
cache_neg_demote_locked(struct namecache *ncp)
{
        struct neglist *nl;
        struct negstate *ns;

        ns = NCP2NEGSTATE(ncp);
        nl = NCP2NEGLIST(ncp);
        mtx_assert(&nl->nl_lock, MA_OWNED);
        MPASS(ns->neg_flag & NEG_HOT);
        TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
        TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
        nl->nl_hotnum--;
        ns->neg_flag &= ~NEG_HOT;
        /* neg_hit is also updated locklessly, hence the atomic store. */
        atomic_store_char(&ns->neg_hit, 0);
}
1332
/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 *
 * oncp is the entry found earlier and hash is the hash of the looked up
 * name.  Returns true if the entry was revalidated, promoted and the hit
 * was accounted for, false if the caller's entry can no longer be trusted.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
        struct namecache *ncp;
        struct neglist *nl;
        u_char nc_flag;

        /* Only the pointer value of oncp is used until it is revalidated. */
        nl = NCP2NEGLIST(oncp);

        mtx_lock(&nl->nl_lock);
        /*
         * For hash iteration.
         */
        vfs_smr_enter();

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         * XXX There are no provisions to keep the vnode around, meaning we may
         * end up promoting a negative entry for a *new* vnode and returning
         * ENOENT on its account. This is the error we want to return anyway
         * and promotion is harmless.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (!cache_ncp_canuse(ncp)) {
                goto out_abort;
        }

        /* Same entry, still negative and usable: promote and count the hit. */
        cache_neg_promote_locked(ncp);
        cache_neg_hit_finish(ncp);
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (true);
out_abort:
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (false);
}
1411
/*
 * Promote a negative entry, taking the lock of its neglist around
 * cache_neg_promote_locked().
 */
static void
cache_neg_promote(struct namecache *ncp)
{
        struct neglist *nl;

        nl = NCP2NEGLIST(ncp);
        mtx_lock(&nl->nl_lock);
        cache_neg_promote_locked(ncp);
        mtx_unlock(&nl->nl_lock);
}
1422
/*
 * Add a negative entry to the tail of the cold list of its neglist and
 * bump numneg.
 *
 * The entry has to be marked NCF_NEGATIVE already and the bucket lock
 * covering it has to be held (checked with INVARIANTS).
 */
static void
cache_neg_insert(struct namecache *ncp)
{
        struct neglist *nl;

        MPASS(ncp->nc_flag & NCF_NEGATIVE);
        cache_assert_bucket_locked(ncp);
        nl = NCP2NEGLIST(ncp);
        mtx_lock(&nl->nl_lock);
        TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
        mtx_unlock(&nl->nl_lock);
        atomic_add_long(&numneg, 1);
}
1436
1437 static void
1438 cache_neg_remove(struct namecache *ncp)
1439 {
1440         struct neglist *nl;
1441         struct negstate *ns;
1442
1443         cache_assert_bucket_locked(ncp);
1444         nl = NCP2NEGLIST(ncp);
1445         ns = NCP2NEGSTATE(ncp);
1446         mtx_lock(&nl->nl_lock);
1447         if ((ns->neg_flag & NEG_HOT) != 0) {
1448                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1449                 nl->nl_hotnum--;
1450         } else {
1451                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1452         }
1453         mtx_unlock(&nl->nl_lock);
1454         atomic_subtract_long(&numneg, 1);
1455 }
1456
1457 static struct neglist *
1458 cache_neg_evict_select_list(void)
1459 {
1460         struct neglist *nl;
1461         u_int c;
1462
1463         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1464         nl = &neglists[c % numneglists];
1465         if (!mtx_trylock(&nl->nl_evict_lock)) {
1466                 counter_u64_add(neg_evict_skipped_contended, 1);
1467                 return (NULL);
1468         }
1469         return (nl);
1470 }
1471
1472 static struct namecache *
1473 cache_neg_evict_select_entry(struct neglist *nl)
1474 {
1475         struct namecache *ncp, *lncp;
1476         struct negstate *ns, *lns;
1477         int i;
1478
1479         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1480         mtx_assert(&nl->nl_lock, MA_OWNED);
1481         ncp = TAILQ_FIRST(&nl->nl_list);
1482         if (ncp == NULL)
1483                 return (NULL);
1484         lncp = ncp;
1485         lns = NCP2NEGSTATE(lncp);
1486         for (i = 1; i < 4; i++) {
1487                 ncp = TAILQ_NEXT(ncp, nc_dst);
1488                 if (ncp == NULL)
1489                         break;
1490                 ns = NCP2NEGSTATE(ncp);
1491                 if (ns->neg_hit < lns->neg_hit) {
1492                         lncp = ncp;
1493                         lns = ns;
1494                 }
1495         }
1496         return (lncp);
1497 }
1498
/*
 * Try to evict one negative entry.
 *
 * A neglist is selected, its first hot entry (if any) is demoted and a
 * victim is picked from the cold list.  The neglist locks are then dropped
 * in favor of the vnode and bucket locks needed for zapping, after which
 * the victim is revalidated by walking its hash chain.
 *
 * Returns true if an entry was zapped and freed, false otherwise.
 */
static bool
cache_neg_evict(void)
{
        struct namecache *ncp, *ncp2;
        struct neglist *nl;
        struct vnode *dvp;
        struct mtx *dvlp;
        struct mtx *blp;
        uint32_t hash;
        u_char nlen;
        bool evicted;

        /* On success the list comes back with nl_evict_lock held. */
        nl = cache_neg_evict_select_list();
        if (nl == NULL) {
                return (false);
        }

        mtx_lock(&nl->nl_lock);
        /* Age the hot list by demoting its oldest entry. */
        ncp = TAILQ_FIRST(&nl->nl_hotlist);
        if (ncp != NULL) {
                cache_neg_demote_locked(ncp);
        }
        ncp = cache_neg_evict_select_entry(nl);
        if (ncp == NULL) {
                counter_u64_add(neg_evict_skipped_empty, 1);
                mtx_unlock(&nl->nl_lock);
                mtx_unlock(&nl->nl_evict_lock);
                return (false);
        }
        /*
         * Snapshot everything needed to find the entry again while the
         * neglist lock still keeps it on the list.
         */
        nlen = ncp->nc_nlen;
        dvp = ncp->nc_dvp;
        hash = cache_get_hash(ncp->nc_name, nlen, dvp);
        dvlp = VP2VNODELOCK(dvp);
        blp = HASH2BUCKETLOCK(hash);
        mtx_unlock(&nl->nl_lock);
        mtx_unlock(&nl->nl_evict_lock);
        mtx_lock(dvlp);
        mtx_lock(blp);
        /*
         * Note that since all locks were dropped above, the entry may be
         * gone or reallocated to be something else.
         */
        CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
                if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
                    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
                        break;
        }
        if (ncp2 == NULL) {
                counter_u64_add(neg_evict_skipped_missed, 1);
                /* Make sure the stale pointer is not freed below. */
                ncp = NULL;
                evicted = false;
        } else {
                MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
                MPASS(blp == NCP2BUCKETLOCK(ncp));
                SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                cache_zap_locked(ncp);
                counter_u64_add(neg_evicted, 1);
                evicted = true;
        }
        mtx_unlock(blp);
        mtx_unlock(dvlp);
        /* Freeing is done with all locks dropped. */
        if (ncp != NULL)
                cache_free(ncp);
        return (evicted);
}
1565
1566 /*
1567  * Maybe evict a negative entry to create more room.
1568  *
1569  * The ncnegfactor parameter limits what fraction of the total count
1570  * can comprise of negative entries. However, if the cache is just
1571  * warming up this leads to excessive evictions.  As such, ncnegminpct
1572  * (recomputed to neg_min) dictates whether the above should be
1573  * applied.
1574  *
1575  * Try evicting if the cache is close to full capacity regardless of
1576  * other considerations.
1577  */
1578 static bool
1579 cache_neg_evict_cond(u_long lnumcache)
1580 {
1581         u_long lnumneg;
1582
1583         if (ncsize - 1000 < lnumcache)
1584                 goto out_evict;
1585         lnumneg = atomic_load_long(&numneg);
1586         if (lnumneg < neg_min)
1587                 return (false);
1588         if (lnumneg * ncnegfactor < lnumcache)
1589                 return (false);
1590 out_evict:
1591         return (cache_neg_evict());
1592 }
1593
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller has to hold the vnode lock of the directory, the vnode lock
 *   of the target (positive entries only) and the bucket lock.  The entry
 *   is only unlinked here; freeing it is up to the caller.  If the directory
 *   is left without any source entries, the entry is marked NCF_DVDROP so
 *   that cache_free() drops the hold on the directory.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
        struct nchashhead *ncpp;
        struct vnode *dvp, *vp;

        dvp = ncp->nc_dvp;
        vp = ncp->nc_vp;

        if (!(ncp->nc_flag & NCF_NEGATIVE))
                cache_assert_vnode_locked(vp);
        cache_assert_vnode_locked(dvp);
        cache_assert_bucket_locked(ncp);

        cache_ncp_invalidate(ncp);

        /* Unlink from the hash chain. */
        ncpp = NCP2BUCKET(ncp);
        CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
        if (!(ncp->nc_flag & NCF_NEGATIVE)) {
                /* Positive entry: unlink from the target vnode. */
                SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
                TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
                if (ncp == vp->v_cache_dd) {
                        atomic_store_ptr(&vp->v_cache_dd, NULL);
                }
        } else {
                /* Negative entry: unlink from its neglist. */
                SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
                cache_neg_remove(ncp);
        }
        if (ncp->nc_flag & NCF_ISDOTDOT) {
                /* ".." entries are not linked on v_cache_src. */
                if (ncp == dvp->v_cache_dd) {
                        atomic_store_ptr(&dvp->v_cache_dd, NULL);
                }
        } else {
                LIST_REMOVE(ncp, nc_src);
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        /* Last entry for the directory, see cache_free(). */
                        ncp->nc_flag |= NCF_DVDROP;
                }
        }
}
1639
/*
 * Zap a negative entry of the directory vp.
 *
 * The vnode lock of vp has to be held by the caller and remains held on
 * return.  The bucket lock is taken and released internally.
 */
static void
cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
{
        struct mtx *blp;

        MPASS(ncp->nc_dvp == vp);
        MPASS(ncp->nc_flag & NCF_NEGATIVE);
        cache_assert_vnode_locked(vp);

        blp = NCP2BUCKETLOCK(ncp);
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
}
1654
/*
 * Zap an entry while holding the vnode lock of vp, which is either the
 * directory or the target of the entry.
 *
 * *vlpp is either NULL or an additional vnode lock held by the caller.
 *
 * Returns true if the entry was zapped.  In that case *vlpp is NULL and any
 * lock passed through it was released; the lock of vp remains held.
 *
 * Returns false if the second vnode lock could not be taken without
 * violating the lock order.  The lock of vp was dropped and reacquired in
 * that case, nothing was zapped, both vnode locks of the entry are held
 * and *vlpp points at the lower-addressed one.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        /* Negative entries only need the directory lock, which is held. */
        if (ncp->nc_flag & NCF_NEGATIVE) {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                return (true);
        }

        pvlp = VP2VNODELOCK(vp);
        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);

        if (*vlpp == vlp1 || *vlpp == vlp2) {
                /* The extra lock is one of the locks needed, reuse it. */
                to_unlock = *vlpp;
                *vlpp = NULL;
        } else {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_sort_vnodes(&vlp1, &vlp2);
                if (vlp1 == pvlp) {
                        /* The held lock sorts first, blocking is fine. */
                        mtx_lock(vlp2);
                        to_unlock = vlp2;
                } else {
                        /* Taking vlp1 now would be out of order, only try. */
                        if (!mtx_trylock(vlp1))
                                goto out_relock;
                        to_unlock = vlp1;
                }
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        if (to_unlock != NULL)
                mtx_unlock(to_unlock);
        return (true);

out_relock:
        /* Drop the held lock and take both in the correct order. */
        mtx_unlock(vlp2);
        mtx_lock(vlp1);
        mtx_lock(vlp2);
        MPASS(*vlpp == NULL);
        *vlpp = vlp1;
        return (false);
}
1712
1713 /*
1714  * If trylocking failed we can get here. We know enough to take all needed locks
1715  * in the right order and re-lookup the entry.
1716  */
1717 static int
1718 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1719     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1720     struct mtx *blp)
1721 {
1722         struct namecache *rncp;
1723         struct mtx *rvlp;
1724
1725         cache_assert_bucket_unlocked(ncp);
1726
1727         cache_sort_vnodes(&dvlp, &vlp);
1728         cache_lock_vnodes(dvlp, vlp);
1729         mtx_lock(blp);
1730         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1731                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1732                     rncp->nc_nlen == cnp->cn_namelen &&
1733                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1734                         break;
1735         }
1736
1737         if (rncp == NULL)
1738                 goto out_mismatch;
1739
1740         if (!(ncp->nc_flag & NCF_NEGATIVE))
1741                 rvlp = VP2VNODELOCK(rncp->nc_vp);
1742         else
1743                 rvlp = NULL;
1744         if (rvlp != vlp)
1745                 goto out_mismatch;
1746
1747         cache_zap_locked(rncp);
1748         mtx_unlock(blp);
1749         cache_unlock_vnodes(dvlp, vlp);
1750         atomic_add_long(&zap_bucket_relock_success, 1);
1751         return (0);
1752
1753 out_mismatch:
1754         mtx_unlock(blp);
1755         cache_unlock_vnodes(dvlp, vlp);
1756         return (EAGAIN);
1757 }
1758
1759 static int __noinline
1760 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1761     uint32_t hash, struct mtx *blp)
1762 {
1763         struct mtx *dvlp, *vlp;
1764         struct vnode *dvp;
1765
1766         cache_assert_bucket_locked(ncp);
1767
1768         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1769         vlp = NULL;
1770         if (!(ncp->nc_flag & NCF_NEGATIVE))
1771                 vlp = VP2VNODELOCK(ncp->nc_vp);
1772         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1773                 cache_zap_locked(ncp);
1774                 mtx_unlock(blp);
1775                 cache_unlock_vnodes(dvlp, vlp);
1776                 return (0);
1777         }
1778
1779         dvp = ncp->nc_dvp;
1780         mtx_unlock(blp);
1781         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1782 }
1783
/*
 * Remove the name cache entry for the name in cnp under directory dvp, if
 * there is one.
 *
 * Returns 1 if something was removed (for ".." this includes the case where
 * only dvp's v_cache_dd pointer was cleared), 0 if nothing was found.
 */
1784 static __noinline int
1785 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1786 {
1787         struct namecache *ncp;
1788         struct mtx *blp;
1789         struct mtx *dvlp, *dvlp2;
1790         uint32_t hash;
1791         int error;
1792
             /* ".." is not looked up by hash; it is reached through dvp->v_cache_dd. */
1793         if (cnp->cn_namelen == 2 &&
1794             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1795                 dvlp = VP2VNODELOCK(dvp);
1796                 dvlp2 = NULL;
1797                 mtx_lock(dvlp);
1798 retry_dotdot:
1799                 ncp = dvp->v_cache_dd;
1800                 if (ncp == NULL) {
1801                         mtx_unlock(dvlp);
1802                         if (dvlp2 != NULL)
1803                                 mtx_unlock(dvlp2);
1804                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1805                         return (0);
1806                 }
1807                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                             /*
                              * A false return means locks were retaken (the extra
                              * one is now recorded in dvlp2) and nothing was
                              * zapped, so v_cache_dd has to be read again.
                              */
1808                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1809                                 goto retry_dotdot;
1810                         MPASS(dvp->v_cache_dd == NULL);
1811                         mtx_unlock(dvlp);
1812                         if (dvlp2 != NULL)
1813                                 mtx_unlock(dvlp2);
1814                         cache_free(ncp);
1815                 } else {
                             /*
                              * v_cache_dd refers to an entry that is not a ".."
                              * entry: only the pointer is cleared here, the entry
                              * itself is neither zapped nor freed.
                              */
1816                         atomic_store_ptr(&dvp->v_cache_dd, NULL);
1817                         mtx_unlock(dvlp);
1818                         if (dvlp2 != NULL)
1819                                 mtx_unlock(dvlp2);
1820                 }
1821                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1822                 return (1);
1823         }
1824
1825         /*
1826          * XXX note that access here is completely unlocked with no provisions
1827          * to keep the hash allocated. If one is sufficiently unlucky a
1828          * parallel cache resize can reallocate the hash, unmap backing pages
1829          * and cause the empty check below to fault.
1830          *
1831          * Fixing this has epsilon priority, but can be done with no overhead
1832          * for this codepath with sufficient effort.
1833          */
1834         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1835         blp = HASH2BUCKETLOCK(hash);
1836 retry:
             /* Skip taking the bucket lock when the chain is empty. */
1837         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1838                 goto out_no_entry;
1839
1840         mtx_lock(blp);
1841
1842         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1843                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1844                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1845                         break;
1846         }
1847
1848         if (ncp == NULL) {
1849                 mtx_unlock(blp);
1850                 goto out_no_entry;
1851         }
1852
             /*
              * cache_zap_locked_bucket() releases blp on every path.  A non-zero
              * result means the entry could not be zapped this time around, so
              * the lookup is redone from scratch.
              */
1853         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1854         if (__predict_false(error != 0)) {
1855                 atomic_add_long(&zap_bucket_fail, 1);
1856                 goto retry;
1857         }
1858         counter_u64_add(numposzaps, 1);
1859         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1860         cache_free(ncp);
1861         return (1);
1862 out_no_entry:
1863         counter_u64_add(nummisszap, 1);
1864         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1865         return (0);
1866 }
1867
1868 static int __noinline
1869 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1870     struct timespec *tsp, int *ticksp)
1871 {
1872         int ltype;
1873
1874         *vpp = dvp;
1875         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1876         if (tsp != NULL)
1877                 timespecclear(tsp);
1878         if (ticksp != NULL)
1879                 *ticksp = ticks;
1880         vrefact(*vpp);
1881         /*
1882          * When we lookup "." we still can be asked to lock it
1883          * differently...
1884          */
1885         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1886         if (ltype != VOP_ISLOCKED(*vpp)) {
1887                 if (ltype == LK_EXCLUSIVE) {
1888                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1889                         if (VN_IS_DOOMED((*vpp))) {
1890                                 /* forced unmount */
1891                                 vrele(*vpp);
1892                                 *vpp = NULL;
1893                                 return (ENOENT);
1894                         }
1895                 } else
1896                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1897         }
1898         return (-1);
1899 }
1900
/*
 * Handle a lookup of ".." relative to dvp, using dvp->v_cache_dd.
 *
 * Returns:
 *  0       - miss; also returned after removing the entry because MAKEENTRY
 *            is clear, and after zapping a negative entry for a CREATE of
 *            the last component.
 *  -1      - positive hit; *vpp was obtained with vget_prep()/vget_finish()
 *            using cnp->cn_lkflags.
 *  ENOENT  - negative hit (ISWHITEOUT is set in cnp->cn_flags for a whiteout
 *            entry), or dvp was found doomed after being relocked.
 *
 * On the positive path dvp is unlocked while the parent is being locked and
 * is relocked afterwards with the lock type it had before.
 */
1901 static int __noinline
1902 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1903     struct timespec *tsp, int *ticksp)
1904 {
1905         struct namecache_ts *ncp_ts;
1906         struct namecache *ncp;
1907         struct mtx *dvlp;
1908         enum vgetstate vs;
1909         int error, ltype;
1910         bool whiteout;
1911
1912         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1913
1914         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1915                 cache_remove_cnp(dvp, cnp);
1916                 return (0);
1917         }
1918
1919 retry:
1920         dvlp = VP2VNODELOCK(dvp);
1921         mtx_lock(dvlp);
1922         ncp = dvp->v_cache_dd;
1923         if (ncp == NULL) {
1924                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
1925                 mtx_unlock(dvlp);
1926                 return (0);
1927         }
             /*
              * v_cache_dd either points at a ".." entry, whose target (if any)
              * is the answer, or at another kind of entry, in which case that
              * entry's nc_dvp is used as the answer.
              */
1928         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1929                 if (ncp->nc_flag & NCF_NEGATIVE)
1930                         *vpp = NULL;
1931                 else
1932                         *vpp = ncp->nc_vp;
1933         } else
1934                 *vpp = ncp->nc_dvp;
1935         if (*vpp == NULL)
1936                 goto negative_success;
1937         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1938         cache_out_ts(ncp, tsp, ticksp);
             /*
              * For an entry that is not a ".." entry but carries NCF_DTS, report
              * its nc_dotdottime rather than what cache_out_ts() stored.
              */
1939         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1940             NCF_DTS && tsp != NULL) {
1941                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1942                 *tsp = ncp_ts->nc_dotdottime;
1943         }
1944
1945         MPASS(dvp != *vpp);
             /*
              * Remember how dvp is locked, unlock it, lock the result and then
              * relock dvp the same way as before.
              */
1946         ltype = VOP_ISLOCKED(dvp);
1947         VOP_UNLOCK(dvp);
1948         vs = vget_prep(*vpp);
1949         mtx_unlock(dvlp);
1950         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1951         vn_lock(dvp, ltype | LK_RETRY);
1952         if (VN_IS_DOOMED(dvp)) {
                     /* Release the result only if it was actually obtained. */
1953                 if (error == 0)
1954                         vput(*vpp);
1955                 *vpp = NULL;
1956                 return (ENOENT);
1957         }
1958         if (error) {
1959                 *vpp = NULL;
1960                 goto retry;
1961         }
1962         return (-1);
1963 negative_success:
             /*
              * Creating the last path component: the negative entry is zapped
              * and a miss is reported instead of a negative hit.
              */
1964         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1965                 if (cnp->cn_flags & ISLASTCN) {
1966                         counter_u64_add(numnegzaps, 1);
1967                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1968                         mtx_unlock(dvlp);
1969                         cache_free(ncp);
1970                         return (0);
1971                 }
1972         }
1973
             /* Read the whiteout flag while dvlp is still held. */
1974         whiteout = (ncp->nc_flag & NCF_WHITE);
1975         cache_out_ts(ncp, tsp, ticksp);
1976         if (cache_neg_hit_prep(ncp))
1977                 cache_neg_promote(ncp);
1978         else
1979                 cache_neg_hit_finish(ncp);
1980         mtx_unlock(dvlp);
1981         if (whiteout)
1982                 cnp->cn_flags |= ISWHITEOUT;
1983         return (ENOENT);
1984 }
1985
1986 /**
1987  * Lookup a name in the name cache
1988  *
1989  * # Arguments
1990  *
1991  * - dvp:       Parent directory in which to search.
1992  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1993  * - cnp:       Parameters of the name search.  The most interesting bits of
1994  *              the cn_flags field have the following meanings:
1995  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1996  *                      it up.
1997  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1998  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1999  *              or negative) lookup, tsp will be filled with any timespec that
2000  *              was stored when this cache entry was created.  However, it will
2001  *              be clear for "." entries.
2002  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
2003  *              (positive or negative) lookup, it will contain the ticks value
2004  *              that was current when the cache entry was created, unless cnp
2005  *              was ".".
2006  *
2007  * Either both tsp and ticksp have to be provided or neither of them.
2008  *
2009  * # Returns
2010  *
2011  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
2012  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
2013  *              to a forced unmount.  vpp will not be modified.  If the entry
2014  *              is a whiteout, then the ISWHITEOUT flag will be set in
2015  *              cnp->cn_flags.
2016  * - 0:         A cache miss.  vpp will not be modified.
2017  *
2018  * # Locking
2019  *
2020  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
2021  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
2022  * lock is not recursively acquired.
2023  */
/*
 * Locked variant of the lookup, performed under the bucket lock.  It is
 * called from cache_lookup() whenever the lockless attempt cannot be
 * completed.  Neither "." nor ".." is handled here (ISDOTDOT must be clear).
 *
 * Return values follow cache_lookup(): -1 for a positive hit, ENOENT for a
 * negative hit, 0 for a miss (including a negative entry that got zapped
 * because the last component is being created).
 */
2024 static int __noinline
2025 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2026     struct timespec *tsp, int *ticksp)
2027 {
2028         struct namecache *ncp;
2029         struct mtx *blp;
2030         uint32_t hash;
2031         enum vgetstate vs;
2032         int error;
2033         bool whiteout;
2034
2035         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2036         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
2037
2038 retry:
2039         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2040         blp = HASH2BUCKETLOCK(hash);
2041         mtx_lock(blp);
2042
2043         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2044                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2045                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2046                         break;
2047         }
2048
2049         if (__predict_false(ncp == NULL)) {
2050                 mtx_unlock(blp);
2051                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2052                 counter_u64_add(nummiss, 1);
2053                 return (0);
2054         }
2055
2056         if (ncp->nc_flag & NCF_NEGATIVE)
2057                 goto negative_success;
2058
2059         counter_u64_add(numposhits, 1);
2060         *vpp = ncp->nc_vp;
2061         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2062         cache_out_ts(ncp, tsp, ticksp);
2063         MPASS(dvp != *vpp);
             /*
              * vget_prep() is called with the bucket lock held; the lock is
              * dropped before vget_finish() locks the vnode.
              */
2064         vs = vget_prep(*vpp);
2065         mtx_unlock(blp);
2066         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2067         if (error) {
2068                 *vpp = NULL;
2069                 goto retry;
2070         }
2071         return (-1);
2072 negative_success:
2073         /*
2074          * We don't get here with regular lookup apart from corner cases.
2075          */
2076         if (__predict_true(cnp->cn_nameiop == CREATE)) {
2077                 if (cnp->cn_flags & ISLASTCN) {
2078                         counter_u64_add(numnegzaps, 1);
                             /*
                              * cache_zap_locked_bucket() drops blp whether or not
                              * it succeeds; on failure everything is redone.
                              */
2079                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
2080                         if (__predict_false(error != 0)) {
2081                                 atomic_add_long(&zap_bucket_fail2, 1);
2082                                 goto retry;
2083                         }
2084                         cache_free(ncp);
2085                         return (0);
2086                 }
2087         }
2088
             /* Read the whiteout flag while the bucket lock is still held. */
2089         whiteout = (ncp->nc_flag & NCF_WHITE);
2090         cache_out_ts(ncp, tsp, ticksp);
2091         if (cache_neg_hit_prep(ncp))
2092                 cache_neg_promote(ncp);
2093         else
2094                 cache_neg_hit_finish(ncp);
2095         mtx_unlock(blp);
2096         if (whiteout)
2097                 cnp->cn_flags |= ISWHITEOUT;
2098         return (ENOENT);
2099 }
2100
/*
 * Look up the name in cnp under directory dvp in the name cache.
 *
 * "." and ".." are dispatched to cache_lookup_dot() and
 * cache_lookup_dotdot().  If neither MAKEENTRY nor NC_KEEPPOSENTRY is set,
 * the entry is removed and a miss is reported.  Everything else is first
 * attempted between vfs_smr_enter() and vfs_smr_exit() without taking the
 * bucket lock; whenever that attempt cannot be completed the lookup is
 * redone by cache_lookup_fallback().
 *
 * Returns -1 for a positive hit (*vpp set), ENOENT for a negative hit and 0
 * for a miss.  tsp and ticksp must be both NULL or both non-NULL.
 */
2101 int
2102 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
2103     struct timespec *tsp, int *ticksp)
2104 {
2105         struct namecache *ncp;
2106         uint32_t hash;
2107         enum vgetstate vs;
2108         int error;
2109         bool whiteout, neg_promote;
             /*
              * NOTE(review): declared u_short but filled with atomic_load_char();
              * cache_enter_lock() uses u_char for the same purpose -- confirm
              * which width is intended.
              */
2110         u_short nc_flag;
2111
2112         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
2113
2114 #ifdef DEBUG_CACHE
2115         if (__predict_false(!doingcache)) {
2116                 cnp->cn_flags &= ~MAKEENTRY;
2117                 return (0);
2118         }
2119 #endif
2120
2121         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2122                 if (cnp->cn_namelen == 1)
2123                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
2124                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
2125                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
2126         }
2127
2128         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
2129
2130         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
2131                 cache_remove_cnp(dvp, cnp);
2132                 return (0);
2133         }
2134
2135         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2136         vfs_smr_enter();
2137
2138         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2139                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2140                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
2141                         break;
2142         }
2143
2144         if (__predict_false(ncp == NULL)) {
2145                 vfs_smr_exit();
2146                 SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
2147                 counter_u64_add(nummiss, 1);
2148                 return (0);
2149         }
2150
2151         nc_flag = atomic_load_char(&ncp->nc_flag);
2152         if (nc_flag & NCF_NEGATIVE)
2153                 goto negative_success;
2154
2155         counter_u64_add(numposhits, 1);
2156         *vpp = ncp->nc_vp;
2157         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
2158         cache_out_ts(ncp, tsp, ticksp);
2159         MPASS(dvp != *vpp);
             /*
              * The entry was read without a lock; cache_ncp_canuse() is checked
              * only after all fields have been read, and a failure sends the
              * lookup to the locked fallback.
              */
2160         if (!cache_ncp_canuse(ncp)) {
2161                 vfs_smr_exit();
2162                 *vpp = NULL;
2163                 goto out_fallback;
2164         }
2165         vs = vget_prep_smr(*vpp);
2166         vfs_smr_exit();
2167         if (__predict_false(vs == VGET_NONE)) {
2168                 *vpp = NULL;
2169                 goto out_fallback;
2170         }
2171         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
2172         if (error) {
2173                 *vpp = NULL;
2174                 goto out_fallback;
2175         }
2176         return (-1);
2177 negative_success:
             /*
              * A CREATE of the last component is left to the fallback, which
              * zaps the negative entry under the proper locks.
              */
2178         if (cnp->cn_nameiop == CREATE) {
2179                 if (cnp->cn_flags & ISLASTCN) {
2180                         vfs_smr_exit();
2181                         goto out_fallback;
2182                 }
2183         }
2184
2185         cache_out_ts(ncp, tsp, ticksp);
2186         whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
2187         neg_promote = cache_neg_hit_prep(ncp);
             /* Undo the hit preparation if the entry can no longer be used. */
2188         if (!cache_ncp_canuse(ncp)) {
2189                 cache_neg_hit_abort(ncp);
2190                 vfs_smr_exit();
2191                 goto out_fallback;
2192         }
2193         if (neg_promote) {
                     /* The promotion is attempted after leaving the SMR section. */
2194                 vfs_smr_exit();
2195                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2196                         goto out_fallback;
2197         } else {
2198                 cache_neg_hit_finish(ncp);
2199                 vfs_smr_exit();
2200         }
2201         if (whiteout)
2202                 cnp->cn_flags |= ISWHITEOUT;
2203         return (ENOENT);
2204 out_fallback:
2205         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2206 }
2207
/*
 * Set of locks held while an entry is being added to the cache: up to three
 * vnode locks and up to two bucket locks.  Unused slots are NULL.
 */
2208 struct celockstate {
             /* Vnode locks; [0] and [1] are the sorted pair, [2] is the extra one. */
2209         struct mtx *vlp[3];
             /* Bucket locks; [0] may be NULL, [1] is always taken. */
2210         struct mtx *blp[2];
2211 };
     /* The lock/unlock helpers index the arrays directly, so pin their sizes. */
2212 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2213 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2214
2215 static inline void
2216 cache_celockstate_init(struct celockstate *cel)
2217 {
2218
2219         bzero(cel, sizeof(*cel));
2220 }
2221
2222 static void
2223 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2224     struct vnode *dvp)
2225 {
2226         struct mtx *vlp1, *vlp2;
2227
2228         MPASS(cel->vlp[0] == NULL);
2229         MPASS(cel->vlp[1] == NULL);
2230         MPASS(cel->vlp[2] == NULL);
2231
2232         MPASS(vp != NULL || dvp != NULL);
2233
2234         vlp1 = VP2VNODELOCK(vp);
2235         vlp2 = VP2VNODELOCK(dvp);
2236         cache_sort_vnodes(&vlp1, &vlp2);
2237
2238         if (vlp1 != NULL) {
2239                 mtx_lock(vlp1);
2240                 cel->vlp[0] = vlp1;
2241         }
2242         mtx_lock(vlp2);
2243         cel->vlp[1] = vlp2;
2244 }
2245
2246 static void
2247 cache_unlock_vnodes_cel(struct celockstate *cel)
2248 {
2249
2250         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2251
2252         if (cel->vlp[0] != NULL)
2253                 mtx_unlock(cel->vlp[0]);
2254         if (cel->vlp[1] != NULL)
2255                 mtx_unlock(cel->vlp[1]);
2256         if (cel->vlp[2] != NULL)
2257                 mtx_unlock(cel->vlp[2]);
2258 }
2259
/*
 * Take a third vnode lock, the one for vp, in addition to the locks already
 * recorded in cel->vlp[0] and cel->vlp[1], and record it in cel->vlp[2].
 *
 * Returns true if the lock was obtained while the other two stayed held.
 * Returns false if the held locks had to be dropped and all three were
 * retaken; the caller must then re-validate whatever it derived under the
 * old locks.  Either way all locks are held on return.
 */
2260 static bool
2261 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2262 {
2263         struct mtx *vlp;
2264         bool ret;
2265
2266         cache_assert_vlp_locked(cel->vlp[0]);
2267         cache_assert_vlp_locked(cel->vlp[1]);
2268         MPASS(cel->vlp[2] == NULL);
2269
2270         MPASS(vp != NULL);
2271         vlp = VP2VNODELOCK(vp);
2272
2273         ret = true;
             /*
              * A lock that does not sort below the highest one already held can
              * be taken with a blocking lock without violating the order.
              */
2274         if (vlp >= cel->vlp[1]) {
2275                 mtx_lock(vlp);
2276         } else {
                     /* Out of order: try without blocking first. */
2277                 if (mtx_trylock(vlp))
2278                         goto out;
                     /*
                      * Trylock failed.  Drop what is held and take all three in
                      * ascending address order; cel->vlp[0] may be NULL.
                      */
2279                 cache_unlock_vnodes_cel(cel);
2280                 atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
2281                 if (vlp < cel->vlp[0]) {
2282                         mtx_lock(vlp);
2283                         mtx_lock(cel->vlp[0]);
2284                         mtx_lock(cel->vlp[1]);
2285                 } else {
2286                         if (cel->vlp[0] != NULL)
2287                                 mtx_lock(cel->vlp[0]);
2288                         mtx_lock(vlp);
2289                         mtx_lock(cel->vlp[1]);
2290                 }
2291                 ret = false;
2292         }
2293 out:
2294         cel->vlp[2] = vlp;
2295         return (ret);
2296 }
2297
2298 static void
2299 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2300     struct mtx *blp2)
2301 {
2302
2303         MPASS(cel->blp[0] == NULL);
2304         MPASS(cel->blp[1] == NULL);
2305
2306         cache_sort_vnodes(&blp1, &blp2);
2307
2308         if (blp1 != NULL) {
2309                 mtx_lock(blp1);
2310                 cel->blp[0] = blp1;
2311         }
2312         mtx_lock(blp2);
2313         cel->blp[1] = blp2;
2314 }
2315
2316 static void
2317 cache_unlock_buckets_cel(struct celockstate *cel)
2318 {
2319
2320         if (cel->blp[0] != NULL)
2321                 mtx_unlock(cel->blp[0]);
2322         mtx_unlock(cel->blp[1]);
2323 }
2324
2325 /*
2326  * Lock part of the cache affected by the insertion.
2327  *
2328  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2329  * However, insertion can result in removal of an old entry. In this
2330  * case we have an additional vnode and bucketlock pair to lock.
2331  *
2332  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2333  * preserving the locking order (smaller address first).
      *
      * The old entry in question is the ".." entry hanging off vp->v_cache_dd
      * when vp is a directory.  The locks taken are recorded in cel and are
      * released with cache_enter_unlock().
2334  */
2335 static void
2336 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2337     uint32_t hash)
2338 {
2339         struct namecache *ncp;
2340         struct mtx *blps[2];
2341         u_char nc_flag;
2342
             /* blps[0]: bucket of the new entry; blps[1]: bucket of the old one. */
2343         blps[0] = HASH2BUCKETLOCK(hash);
2344         for (;;) {
2345                 blps[1] = NULL;
                     /*
                      * The callee names its parameters (vp, dvp); the order of the
                      * two vnodes does not matter since it sorts the locks.
                      */
2346                 cache_lock_vnodes_cel(cel, dvp, vp);
2347                 if (vp == NULL || vp->v_type != VDIR)
2348                         break;
2349                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2350                 if (ncp == NULL)
2351                         break;
2352                 nc_flag = atomic_load_char(&ncp->nc_flag);
2353                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2354                         break;
2355                 MPASS(ncp->nc_dvp == vp);
2356                 blps[1] = NCP2BUCKETLOCK(ncp);
                     /* A negative ".." entry has no target vnode to lock. */
2357                 if ((nc_flag & NCF_NEGATIVE) != 0)
2358                         break;
                     /* True means the third lock was taken without dropping any. */
2359                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2360                         break;
2361                 /*
2362                  * All vnodes got re-locked. Re-validate the state and if
2363                  * nothing changed we are done. Otherwise restart.
2364                  */
2365                 if (ncp == vp->v_cache_dd &&
2366                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2367                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2368                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2369                         break;
                     /* State changed: drop everything and reset cel for the retry. */
2370                 cache_unlock_vnodes_cel(cel);
2371                 cel->vlp[0] = NULL;
2372                 cel->vlp[1] = NULL;
2373                 cel->vlp[2] = NULL;
2374         }
2375         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2376 }
2377
/*
 * Variant of cache_enter_lock() that inspects dvp->v_cache_dd instead of
 * vp->v_cache_dd: besides the vnode locks for dvp and vp and the bucket lock
 * for hash, it takes the locks needed to zap the ".." entry currently hanging
 * off dvp, if there is one.  The locks taken are recorded in cel.
 */
2378 static void
2379 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2380     uint32_t hash)
2381 {
2382         struct namecache *ncp;
2383         struct mtx *blps[2];
2384         u_char nc_flag;
2385
             /* blps[0]: bucket for hash; blps[1]: bucket of dvp's ".." entry. */
2386         blps[0] = HASH2BUCKETLOCK(hash);
2387         for (;;) {
2388                 blps[1] = NULL;
2389                 cache_lock_vnodes_cel(cel, dvp, vp);
2390                 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2391                 if (ncp == NULL)
2392                         break;
2393                 nc_flag = atomic_load_char(&ncp->nc_flag);
2394                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2395                         break;
2396                 MPASS(ncp->nc_dvp == dvp);
2397                 blps[1] = NCP2BUCKETLOCK(ncp);
                     /* A negative ".." entry has no target vnode to lock. */
2398                 if ((nc_flag & NCF_NEGATIVE) != 0)
2399                         break;
                     /* True means the third lock was taken without dropping any. */
2400                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2401                         break;
                     /*
                      * The vnode locks were dropped and retaken.  If the entry is
                      * still the same and maps to the same locks we are done,
                      * otherwise release everything and start over.
                      */
2402                 if (ncp == dvp->v_cache_dd &&
2403                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2404                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2405                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2406                         break;
2407                 cache_unlock_vnodes_cel(cel);
2408                 cel->vlp[0] = NULL;
2409                 cel->vlp[1] = NULL;
2410                 cel->vlp[2] = NULL;
2411         }
2412         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2413 }
2414
/*
 * Release everything recorded in cel by cache_enter_lock() or
 * cache_enter_lock_dd(): bucket locks first, then vnode locks, i.e. the
 * reverse of the order in which they were taken.
 */
2415 static void
2416 cache_enter_unlock(struct celockstate *cel)
2417 {
2418
2419         cache_unlock_buckets_cel(cel);
2420         cache_unlock_vnodes_cel(cel);
2421 }
2422
2423 static void __noinline
2424 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2425     struct componentname *cnp)
2426 {
2427         struct celockstate cel;
2428         struct namecache *ncp;
2429         uint32_t hash;
2430         int len;
2431
2432         if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2433                 return;
2434         len = cnp->cn_namelen;
2435         cache_celockstate_init(&cel);
2436         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2437         cache_enter_lock_dd(&cel, dvp, vp, hash);
2438         ncp = dvp->v_cache_dd;
2439         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2440                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2441                 cache_zap_locked(ncp);
2442         } else {
2443                 ncp = NULL;
2444         }
2445         atomic_store_ptr(&dvp->v_cache_dd, NULL);
2446         cache_enter_unlock(&cel);
2447         if (ncp != NULL)
2448                 cache_free(ncp);
2449 }
2450
2451 /*
2452  * Add an entry to the cache.
2453  */
2454 void
2455 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2456     struct timespec *tsp, struct timespec *dtsp)
2457 {
2458         struct celockstate cel;
2459         struct namecache *ncp, *n2, *ndd;
2460         struct namecache_ts *ncp_ts;
2461         struct nchashhead *ncpp;
2462         uint32_t hash;
2463         int flag;
2464         int len;
2465
2466         KASSERT(cnp->cn_namelen <= NAME_MAX,
2467             ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2468             NAME_MAX));
2469         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2470         VNPASS(dvp->v_type != VNON, dvp);
2471         if (vp != NULL) {
2472                 VNPASS(!VN_IS_DOOMED(vp), vp);
2473                 VNPASS(vp->v_type != VNON, vp);
2474         }
2475         if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
2476                 KASSERT(dvp == vp,
2477                     ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
2478                     dvp, vp));
2479         } else {
2480                 KASSERT(dvp != vp,
2481                     ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
2482                     cnp->cn_nameptr, dvp));
2483         }
2484
2485 #ifdef DEBUG_CACHE
2486         if (__predict_false(!doingcache))
2487                 return;
2488 #endif
2489
2490         flag = 0;
2491         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2492                 if (cnp->cn_namelen == 1)
2493                         return;
2494                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2495                         cache_enter_dotdot_prep(dvp, vp, cnp);
2496                         flag = NCF_ISDOTDOT;
2497                 }
2498         }
2499
2500         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2501         if (ncp == NULL)
2502                 return;
2503
2504         cache_celockstate_init(&cel);
2505         ndd = NULL;
2506         ncp_ts = NULL;
2507
2508         /*
2509          * Calculate the hash key and setup as much of the new
2510          * namecache entry as possible before acquiring the lock.
2511          */
2512         ncp->nc_flag = flag | NCF_WIP;
2513         ncp->nc_vp = vp;
2514         if (vp == NULL)
2515                 cache_neg_init(ncp);
2516         ncp->nc_dvp = dvp;
2517         if (tsp != NULL) {
2518                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2519                 ncp_ts->nc_time = *tsp;
2520                 ncp_ts->nc_ticks = ticks;
2521                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2522                 if (dtsp != NULL) {
2523                         ncp_ts->nc_dotdottime = *dtsp;
2524                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2525                 }
2526         }
2527         len = ncp->nc_nlen = cnp->cn_namelen;
2528         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2529         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2530         ncp->nc_name[len] = '\0';
2531         cache_enter_lock(&cel, dvp, vp, hash);
2532
2533         /*
2534          * See if this vnode or negative entry is already in the cache
2535          * with this name.  This can happen with concurrent lookups of
2536          * the same path name.
2537          */
2538         ncpp = NCHHASH(hash);
2539         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2540                 if (n2->nc_dvp == dvp &&
2541                     n2->nc_nlen == cnp->cn_namelen &&
2542                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2543                         MPASS(cache_ncp_canuse(n2));
2544                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2545                                 KASSERT(vp == NULL,
2546                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2547                                     __func__, NULL, vp, cnp->cn_nameptr));
2548                         else
2549                                 KASSERT(n2->nc_vp == vp,
2550                                     ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
2551                                     __func__, n2->nc_vp, vp, cnp->cn_nameptr));
2552                         /*
2553                          * Entries are supposed to be immutable unless in the
2554                          * process of getting destroyed. Accommodating for
2555                          * changing timestamps is possible but not worth it.
2556                          * This should be harmless in terms of correctness, in
2557                          * the worst case resulting in an earlier expiration.
2558                          * Alternatively, the found entry can be replaced
2559                          * altogether.
2560                          */
2561                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2562 #if 0
2563                         if (tsp != NULL) {
2564                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2565                                     ("no NCF_TS"));
2566                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2567                                 n2_ts->nc_time = ncp_ts->nc_time;
2568                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2569                                 if (dtsp != NULL) {
2570                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2571                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2572                                 }
2573                         }
2574 #endif
2575                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2576                             vp);
2577                         goto out_unlock_free;
2578                 }
2579         }
2580
2581         if (flag == NCF_ISDOTDOT) {
2582                 /*
2583                  * See if we are trying to add .. entry, but some other lookup
2584                  * has populated v_cache_dd pointer already.
2585                  */
2586                 if (dvp->v_cache_dd != NULL)
2587                         goto out_unlock_free;
2588                 KASSERT(vp == NULL || vp->v_type == VDIR,
2589                     ("wrong vnode type %p", vp));
2590                 atomic_thread_fence_rel();
2591                 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2592         }
2593
2594         if (vp != NULL) {
2595                 if (flag != NCF_ISDOTDOT) {
2596                         /*
2597                          * For this case, the cache entry maps both the
2598                          * directory name in it and the name ".." for the
2599                          * directory's parent.
2600                          */
2601                         if ((ndd = vp->v_cache_dd) != NULL) {
2602                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2603                                         cache_zap_locked(ndd);
2604                                 else
2605                                         ndd = NULL;
2606                         }
2607                         atomic_thread_fence_rel();
2608                         atomic_store_ptr(&vp->v_cache_dd, ncp);
2609                 } else if (vp->v_type != VDIR) {
2610                         if (vp->v_cache_dd != NULL) {
2611                                 atomic_store_ptr(&vp->v_cache_dd, NULL);
2612                         }
2613                 }
2614         }
2615
2616         if (flag != NCF_ISDOTDOT) {
2617                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2618                         cache_hold_vnode(dvp);
2619                 }
2620                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2621         }
2622
2623         /*
2624          * If the entry is "negative", we place it into the
2625          * "negative" cache queue, otherwise, we place it into the
2626          * destination vnode's cache entries queue.
2627          */
2628         if (vp != NULL) {
2629                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2630                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2631                     vp);
2632         } else {
2633                 if (cnp->cn_flags & ISWHITEOUT)
2634                         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2635                 cache_neg_insert(ncp);
2636                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2637                     ncp->nc_name);
2638         }
2639
2640         /*
2641          * Insert the new namecache entry into the appropriate chain
2642          * within the cache entries table.
2643          */
2644         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2645
2646         atomic_thread_fence_rel();
2647         /*
2648          * Mark the entry as fully constructed.
2649          * It is immutable past this point until its removal.
2650          */
2651         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2652
2653         cache_enter_unlock(&cel);
2654         if (ndd != NULL)
2655                 cache_free(ndd);
2656         return;
2657 out_unlock_free:
2658         cache_enter_unlock(&cel);
2659         cache_free(ncp);
2660         return;
2661 }
2662
2663 /*
2664  * A variant of the above accepting flags.
2665  *
2666  * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
2667  *
2668  * TODO: this routine is a hack. It blindly removes the old entry, even if it
2669  * happens to match and it is doing it in an inefficient manner. It was added
2670  * to accommodate NFS which runs into a case where the target for a given name
2671  * may change from under it. Note this does nothing to solve the following
2672  * race: 2 callers of cache_enter_time_flags pass a different target vnode for
2673  * the same [dvp, cnp]. It may be argued that code doing this is broken.
2674  */
2675 void
2676 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2677     struct timespec *tsp, struct timespec *dtsp, int flags)
2678 {
2679
2680         MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
2681
2682         if (flags & VFS_CACHE_DROPOLD)
2683                 cache_remove_cnp(dvp, cnp);
2684         cache_enter_time(dvp, vp, cnp, tsp, dtsp);
2685 }
2686
/*
 * Return the smallest power of two strictly greater than val.
 *
 * When no such value is representable in u_long (val has its most
 * significant bit set), the largest representable power of two is returned.
 * Without this bound the candidate would be shifted out to zero, which still
 * compares <= val, and the loop would never terminate.
 */
static u_long
cache_roundup_2(u_long val)
{
	const u_long maxpow2 = ~(~(u_long)0 >> 1);
	u_long res;

	for (res = 1; res <= val; res <<= 1) {
		/* One more shift would wrap res around to 0. */
		if (res == maxpow2)
			break;
	}

	return (res);
}
2697
2698 static struct nchashhead *
2699 nchinittbl(u_long elements, u_long *hashmask)
2700 {
2701         struct nchashhead *hashtbl;
2702         u_long hashsize, i;
2703
2704         hashsize = cache_roundup_2(elements) / 2;
2705
2706         hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2707         for (i = 0; i < hashsize; i++)
2708                 CK_SLIST_INIT(&hashtbl[i]);
2709         *hashmask = hashsize - 1;
2710         return (hashtbl);
2711 }
2712
2713 static void
2714 ncfreetbl(struct nchashhead *hashtbl)
2715 {
2716
2717         free(hashtbl, M_VFSCACHE);
2718 }
2719
2720 /*
2721  * Name cache initialization, from vfs_init() when we are booting
2722  */
2723 static void
2724 nchinit(void *dummy __unused)
2725 {
2726         u_int i;
2727
2728         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2729             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2730         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2731             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2732         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2733             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2734         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2735             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2736
2737         VFS_SMR_ZONE_SET(cache_zone_small);
2738         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2739         VFS_SMR_ZONE_SET(cache_zone_large);
2740         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2741
2742         ncsize = desiredvnodes * ncsizefactor;
2743         cache_recalc_neg_min();
2744         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2745         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2746         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2747                 ncbuckethash = 7;
2748         if (ncbuckethash > nchash)
2749                 ncbuckethash = nchash;
2750         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2751             M_WAITOK | M_ZERO);
2752         for (i = 0; i < numbucketlocks; i++)
2753                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2754         ncvnodehash = ncbuckethash;
2755         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2756             M_WAITOK | M_ZERO);
2757         for (i = 0; i < numvnodelocks; i++)
2758                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2759
2760         for (i = 0; i < numneglists; i++) {
2761                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2762                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2763                 TAILQ_INIT(&neglists[i].nl_list);
2764                 TAILQ_INIT(&neglists[i].nl_hotlist);
2765         }
2766 }
2767 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2768
2769 void
2770 cache_vnode_init(struct vnode *vp)
2771 {
2772
2773         LIST_INIT(&vp->v_cache_src);
2774         TAILQ_INIT(&vp->v_cache_dst);
2775         vp->v_cache_dd = NULL;
2776         cache_prehash(vp);
2777 }
2778
/*
 * Induce transient cache misses for lockless operation in cache_lookup() by
 * using a temporary hash table.
 *
 * This will force a fs lookup.
 *
 * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
 * to observe all CPUs not performing the lookup.
 *
 * temptbl is the temporary table to install and temphash is its mask, which
 * must be smaller than the current nchash. The mask is published first and
 * the table pointer second, so that a lockless reader never pairs the old
 * (larger) mask with the new (smaller) table.
 */
static void
cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
{

	MPASS(temphash < nchash);
	/*
	 * Change the size. The new size is smaller and can safely be used
	 * against the existing table. All lookups which now hash wrong will
	 * result in a cache miss, which all callers are supposed to know how
	 * to handle.
	 */
	atomic_store_long(&nchash, temphash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated hash value, but they still
	 * see the old table.
	 */
	atomic_store_ptr(&nchashtbl, temptbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}
2813
/*
 * Set the new hash table.
 *
 * Similarly to cache_changesize_set_temp(), this has to synchronize against
 * lockless operation in cache_lookup().
 *
 * new_tbl is the table to install and new_hash is its mask, which must be
 * larger than the current nchash. The order is the reverse of
 * cache_changesize_set_temp(): the table pointer is published first and the
 * mask second, so that a lockless reader never pairs the new (larger) mask
 * with the old (smaller) table.
 */
static void
cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
{

	MPASS(nchash < new_hash);
	/*
	 * Change the pointer first. This won't result in out of bounds access
	 * since the temporary table is guaranteed to be smaller.
	 */
	atomic_store_ptr(&nchashtbl, new_tbl);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated pointer value, but they
	 * still see the old size.
	 */
	atomic_store_long(&nchash, new_hash);
	atomic_thread_fence_rel();
	vfs_smr_synchronize();
	/*
	 * At this point everyone sees the updated table pointer and size pair.
	 */
}
2843
2844 void
2845 cache_changesize(u_long newmaxvnodes)
2846 {
2847         struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
2848         u_long new_nchash, old_nchash, temphash;
2849         struct namecache *ncp;
2850         uint32_t hash;
2851         u_long newncsize;
2852         u_long i;
2853
2854         newncsize = newmaxvnodes * ncsizefactor;
2855         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2856         if (newmaxvnodes < numbucketlocks)
2857                 newmaxvnodes = numbucketlocks;
2858
2859         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2860         /* If same hash table size, nothing to do */
2861         if (nchash == new_nchash) {
2862                 ncfreetbl(new_nchashtbl);
2863                 return;
2864         }
2865
2866         temptbl = nchinittbl(1, &temphash);
2867
2868         /*
2869          * Move everything from the old hash table to the new table.
2870          * None of the namecache entries in the table can be removed
2871          * because to do so, they have to be removed from the hash table.
2872          */
2873         cache_lock_all_vnodes();
2874         cache_lock_all_buckets();
2875         old_nchashtbl = nchashtbl;
2876         old_nchash = nchash;
2877         cache_changesize_set_temp(temptbl, temphash);
2878         for (i = 0; i <= old_nchash; i++) {
2879                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2880                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2881                             ncp->nc_dvp);
2882                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2883                         CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
2884                 }
2885         }
2886         ncsize = newncsize;
2887         cache_recalc_neg_min();
2888         cache_changesize_set_new(new_nchashtbl, new_nchash);
2889         cache_unlock_all_buckets();
2890         cache_unlock_all_vnodes();
2891         ncfreetbl(old_nchashtbl);
2892         ncfreetbl(temptbl);
2893 }
2894
/*
 * Remove all entries from and to a particular vnode.
 *
 * Three sets are drained while holding the vnode's lock: entries for which
 * vp is the directory (v_cache_src), entries naming vp (v_cache_dst), and
 * the ".." entry hanging off v_cache_dd. Zapped entries are collected on a
 * local batch, chained through their nc_dst linkage, and only freed after
 * the locks have been released.
 */
static void
cache_purge_impl(struct vnode *vp)
{
	struct cache_freebatch batch;
	struct namecache *ncp;
	struct mtx *vlp, *vlp2;

	TAILQ_INIT(&batch);
	vlp = VP2VNODELOCK(vp);
	/* Second vnode lock, managed by cache_zap_locked_vnode_kl2(). */
	vlp2 = NULL;
	mtx_lock(vlp);
retry:
	/*
	 * Whenever cache_zap_locked_vnode_kl2() reports failure the whole scan
	 * is restarted from the first list.
	 * NOTE(review): presumably the helper had to drop and retake locks in
	 * that case, invalidating the list heads read here -- confirm against
	 * its implementation.
	 */
	while (!LIST_EMPTY(&vp->v_cache_src)) {
		ncp = LIST_FIRST(&vp->v_cache_src);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	/*
	 * Both lists are empty now, so anything still reachable through
	 * v_cache_dd is expected to be a ".." entry.
	 */
	ncp = vp->v_cache_dd;
	if (ncp != NULL) {
		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
		   ("lost dotdot link"));
		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
			goto retry;
		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
	}
	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
	mtx_unlock(vlp);
	if (vlp2 != NULL)
		mtx_unlock(vlp2);
	/* Free the collected entries with no locks held. */
	cache_free_batch(&batch);
}
2936
2937 /*
2938  * Opportunistic check to see if there is anything to do.
2939  */
2940 static bool
2941 cache_has_entries(struct vnode *vp)
2942 {
2943
2944         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2945             atomic_load_ptr(&vp->v_cache_dd) == NULL)
2946                 return (false);
2947         return (true);
2948 }
2949
2950 void
2951 cache_purge(struct vnode *vp)
2952 {
2953
2954         SDT_PROBE1(vfs, namecache, purge, done, vp);
2955         if (!cache_has_entries(vp))
2956                 return;
2957         cache_purge_impl(vp);
2958 }
2959
/*
 * Purge variant for a vnode which is already doomed.
 *
 * Only to be used by vgone.
 */
void
cache_purge_vgone(struct vnode *vp)
{

	VNPASS(VN_IS_DOOMED(vp), vp);
	if (!cache_has_entries(vp)) {
		/*
		 * Nothing visible right now. Serialize against a potential
		 * thread doing cache_purge by waiting for the vnode lock to
		 * be released, then look once more.
		 */
		mtx_wait_unlocked(VP2VNODELOCK(vp));
		if (!cache_has_entries(vp))
			return;
	}
	cache_purge_impl(vp);
}
2985
2986 /*
2987  * Remove all negative entries for a particular directory vnode.
2988  */
2989 void
2990 cache_purge_negative(struct vnode *vp)
2991 {
2992         struct cache_freebatch batch;
2993         struct namecache *ncp, *nnp;
2994         struct mtx *vlp;
2995
2996         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2997         if (LIST_EMPTY(&vp->v_cache_src))
2998                 return;
2999         TAILQ_INIT(&batch);
3000         vlp = VP2VNODELOCK(vp);
3001         mtx_lock(vlp);
3002         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
3003                 if (!(ncp->nc_flag & NCF_NEGATIVE))
3004                         continue;
3005                 cache_zap_negative_locked_vnode_kl(ncp, vp);
3006                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
3007         }
3008         mtx_unlock(vlp);
3009         cache_free_batch(&batch);
3010 }
3011
3012 /*
3013  * Entry points for modifying VOP operations.
3014  */
3015 void
3016 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
3017     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
3018 {
3019
3020         ASSERT_VOP_IN_SEQC(fdvp);
3021         ASSERT_VOP_IN_SEQC(fvp);
3022         ASSERT_VOP_IN_SEQC(tdvp);
3023         if (tvp != NULL)
3024                 ASSERT_VOP_IN_SEQC(tvp);
3025
3026         cache_purge(fvp);
3027         if (tvp != NULL) {
3028                 cache_purge(tvp);
3029                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
3030                     ("%s: lingering negative entry", __func__));
3031         } else {
3032                 cache_remove_cnp(tdvp, tcnp);
3033         }
3034
3035         /*
3036          * TODO
3037          *
3038          * Historically renaming was always purging all revelang entries,
3039          * but that's quite wasteful. In particular turns out that in many cases
3040          * the target file is immediately accessed after rename, inducing a cache
3041          * miss.
3042          *
3043          * Recode this to reduce relocking and reuse the existing entry (if any)
3044          * instead of just removing it above and allocating a new one here.
3045          */
3046         cache_enter(tdvp, fvp, tcnp);
3047 }
3048
/*
 * Namecache hook for directory removal: purge every entry associated with
 * the directory being removed. The parent is only checked for being within
 * a seqc modification section.
 */
void
cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
{

	ASSERT_VOP_IN_SEQC(dvp);
	ASSERT_VOP_IN_SEQC(vp);

	cache_purge(vp);
}
3057
3058 #ifdef INVARIANTS
3059 /*
3060  * Validate that if an entry exists it matches.
3061  */
3062 void
3063 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
3064 {
3065         struct namecache *ncp;
3066         struct mtx *blp;
3067         uint32_t hash;
3068
3069         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3070         if (CK_SLIST_EMPTY(NCHHASH(hash)))
3071                 return;
3072         blp = HASH2BUCKETLOCK(hash);
3073         mtx_lock(blp);
3074         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3075                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3076                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
3077                         if (ncp->nc_vp != vp)
3078                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
3079                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
3080                 }
3081         }
3082         mtx_unlock(blp);
3083 }
3084
3085 void
3086 cache_assert_no_entries(struct vnode *vp)
3087 {
3088
3089         VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
3090         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
3091         VNPASS(vp->v_cache_dd == NULL, vp);
3092 }
3093 #endif
3094
3095 /*
3096  * Flush all entries referencing a particular filesystem.
3097  */
3098 void
3099 cache_purgevfs(struct mount *mp)
3100 {
3101         struct vnode *vp, *mvp;
3102         size_t visited __sdt_used, purged __sdt_used;
3103
3104         visited = purged = 0;
3105         /*
3106          * Somewhat wasteful iteration over all vnodes. Would be better to
3107          * support filtering and avoid the interlock to begin with.
3108          */
3109         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3110                 visited++;
3111                 if (!cache_has_entries(vp)) {
3112                         VI_UNLOCK(vp);
3113                         continue;
3114                 }
3115                 vholdl(vp);
3116                 VI_UNLOCK(vp);
3117                 cache_purge(vp);
3118                 purged++;
3119                 vdrop(vp);
3120         }
3121
3122         SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
3123 }
3124
3125 /*
3126  * Perform canonical checks and cache lookup and pass on to filesystem
3127  * through the vop_cachedlookup only if needed.
3128  */
3129
3130 int
3131 vfs_cache_lookup(struct vop_lookup_args *ap)
3132 {
3133         struct vnode *dvp;
3134         int error;
3135         struct vnode **vpp = ap->a_vpp;
3136         struct componentname *cnp = ap->a_cnp;
3137         int flags = cnp->cn_flags;
3138
3139         *vpp = NULL;
3140         dvp = ap->a_dvp;
3141
3142         if (dvp->v_type != VDIR)
3143                 return (ENOTDIR);
3144
3145         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
3146             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
3147                 return (EROFS);
3148
3149         error = vn_dir_check_exec(dvp, cnp);
3150         if (error != 0)
3151                 return (error);
3152
3153         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
3154         if (error == 0)
3155                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
3156         if (error == -1)
3157                 return (0);
3158         return (error);
3159 }
3160
3161 /* Implementation of the getcwd syscall. */
3162 int
3163 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
3164 {
3165         char *buf, *retbuf;
3166         size_t buflen;
3167         int error;
3168
3169         buflen = uap->buflen;
3170         if (__predict_false(buflen < 2))
3171                 return (EINVAL);
3172         if (buflen > MAXPATHLEN)
3173                 buflen = MAXPATHLEN;
3174
3175         buf = uma_zalloc(namei_zone, M_WAITOK);
3176         error = vn_getcwd(buf, &retbuf, &buflen);
3177         if (error == 0)
3178                 error = copyout(retbuf, uap->buf, buflen);
3179         uma_zfree(namei_zone, buf);
3180         return (error);
3181 }
3182
3183 int
3184 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
3185 {
3186         struct pwd *pwd;
3187         int error;
3188
3189         vfs_smr_enter();
3190         pwd = pwd_get_smr();
3191         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
3192             buflen, 0);
3193         VFS_SMR_ASSERT_NOT_ENTERED();
3194         if (error < 0) {
3195                 pwd = pwd_hold(curthread);
3196                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
3197                     retbuf, buflen);
3198                 pwd_drop(pwd);
3199         }
3200
3201 #ifdef KTRACE
3202         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
3203                 ktrnamei(*retbuf);
3204 #endif
3205         return (error);
3206 }
3207
3208 /*
3209  * Canonicalize a path by walking it forward and back.
3210  *
3211  * BUGS:
3212  * - Nothing guarantees the integrity of the entire chain. Consider the case
3213  *   where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
3214  *   "foo" into "quux" during the backwards walk. The result will be
3215  *   "quux/bar/baz/qux", which could not have been obtained by an incremental
3216  *   walk in userspace. Moreover, the path we return is inaccessible if the
3217  *   calling thread lacks permission to traverse "quux".
3218  */
3219 static int
3220 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
3221     size_t size, int flags, enum uio_seg pathseg)
3222 {
3223         struct nameidata nd;
3224         char *retbuf, *freebuf;
3225         int error;
3226
3227         if (flags != 0)
3228                 return (EINVAL);
3229         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
3230             pathseg, path, fd, &cap_fstat_rights);
3231         if ((error = namei(&nd)) != 0)
3232                 return (error);
3233
3234         if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
3235             (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
3236                 /*
3237                  * This happens if vp is a file mount. The call to
3238                  * vn_fullpath_hardlink can panic if path resolution can't be
3239                  * handled without the directory.
3240                  *
3241                  * To resolve this, we find the vnode which was mounted on -
3242                  * this should have a unique global path since we disallow
3243                  * mounting on linked files.
3244                  */
3245                 struct vnode *covered_vp;
3246                 error = vn_lock(nd.ni_vp, LK_SHARED);
3247                 if (error != 0)
3248                         goto out;
3249                 covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
3250                 vref(covered_vp);
3251                 VOP_UNLOCK(nd.ni_vp);
3252                 error = vn_fullpath(covered_vp, &retbuf, &freebuf);
3253                 vrele(covered_vp);
3254         } else {
3255                 error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr,
3256                     nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size);
3257         }
3258         if (error == 0) {
3259                 error = copyout(retbuf, buf, size);
3260                 free(freebuf, M_TEMP);
3261         }
3262 out:
3263         vrele(nd.ni_vp);
3264         vrele(nd.ni_dvp);
3265         NDFREE_PNBUF(&nd);
3266         return (error);
3267 }
3268
3269 int
3270 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
3271 {
3272
3273         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
3274             uap->flags, UIO_USERSPACE));
3275 }
3276
3277 /*
3278  * Retrieve the full filesystem path that correspond to a vnode from the name
3279  * cache (if available)
3280  */
3281 int
3282 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
3283 {
3284         struct pwd *pwd;
3285         char *buf;
3286         size_t buflen;
3287         int error;
3288
3289         if (__predict_false(vp == NULL))
3290                 return (EINVAL);
3291
3292         buflen = MAXPATHLEN;
3293         buf = malloc(buflen, M_TEMP, M_WAITOK);
3294         vfs_smr_enter();
3295         pwd = pwd_get_smr();
3296         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
3297         VFS_SMR_ASSERT_NOT_ENTERED();
3298         if (error < 0) {
3299                 pwd = pwd_hold(curthread);
3300                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
3301                 pwd_drop(pwd);
3302         }
3303         if (error == 0)
3304                 *freebuf = buf;
3305         else
3306                 free(buf, M_TEMP);
3307         return (error);
3308 }
3309
3310 /*
3311  * This function is similar to vn_fullpath, but it attempts to lookup the
3312  * pathname relative to the global root mount point.  This is required for the
3313  * auditing sub-system, as audited pathnames must be absolute, relative to the
3314  * global root mount point.
3315  */
3316 int
3317 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
3318 {
3319         char *buf;
3320         size_t buflen;
3321         int error;
3322
3323         if (__predict_false(vp == NULL))
3324                 return (EINVAL);
3325         buflen = MAXPATHLEN;
3326         buf = malloc(buflen, M_TEMP, M_WAITOK);
3327         vfs_smr_enter();
3328         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
3329         VFS_SMR_ASSERT_NOT_ENTERED();
3330         if (error < 0) {
3331                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
3332         }
3333         if (error == 0)
3334                 *freebuf = buf;
3335         else
3336                 free(buf, M_TEMP);
3337         return (error);
3338 }
3339
3340 static struct namecache *
3341 vn_dd_from_dst(struct vnode *vp)
3342 {
3343         struct namecache *ncp;
3344
3345         cache_assert_vnode_locked(vp);
3346         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3347                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3348                         return (ncp);
3349         }
3350         return (NULL);
3351 }
3352
/*
 * Step one component up: find a name of *vp and the directory containing it.
 *
 * The name is stored (without a terminating NUL) immediately in front of
 * offset *buflen in buf, and *buflen is lowered to the start of the name.
 * On success *vp is replaced with the parent directory, which is returned
 * referenced, and the reference held on the original vnode is released.
 *
 * The name cache is consulted first; on a miss the filesystem is asked
 * through VOP_VPTOCNP().
 *
 * Returns 0 on success, ENOMEM if a cached name does not fit in the buffer,
 * ENOENT if the directory obtained from the filesystem is doomed, or the
 * error reported by VOP_VPTOCNP(). In every error case the reference the
 * caller held has been released by the time this returns.
 */
int
vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
{
	struct vnode *dvp;
	struct namecache *ncp;
	struct mtx *vlp;
	int error;

	vlp = VP2VNODELOCK(*vp);
	mtx_lock(vlp);
	/*
	 * Prefer the entry cached in v_cache_dd when it is a regular (not
	 * "..") entry; otherwise scan the destination list.
	 */
	ncp = (*vp)->v_cache_dd;
	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
		KASSERT(ncp == vn_dd_from_dst(*vp),
		    ("%s: mismatch for dd entry (%p != %p)", __func__,
		    ncp, vn_dd_from_dst(*vp)));
	} else {
		ncp = vn_dd_from_dst(*vp);
	}
	if (ncp != NULL) {
		/* Cache hit. */
		if (*buflen < ncp->nc_nlen) {
			mtx_unlock(vlp);
			vrele(*vp);
			counter_u64_add(numfullpathfail4, 1);
			error = ENOMEM;
			SDT_PROBE3(vfs, namecache, fullpath, return, error,
			    vp, NULL);
			return (error);
		}
		*buflen -= ncp->nc_nlen;
		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
		    ncp->nc_name, vp);
		/*
		 * Swap the reference: the parent is referenced while still
		 * under the vnode lock, the child is released only after the
		 * lock has been dropped.
		 */
		dvp = *vp;
		*vp = ncp->nc_dvp;
		vref(*vp);
		mtx_unlock(vlp);
		vrele(dvp);
		return (0);
	}
	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

	/* Cache miss: ask the filesystem, with the vnode locked shared. */
	mtx_unlock(vlp);
	vn_lock(*vp, LK_SHARED | LK_RETRY);
	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
	/* Unlocks the vnode and releases the reference held on it. */
	vput(*vp);
	if (error) {
		counter_u64_add(numfullpathfail2, 1);
		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
		return (error);
	}

	*vp = dvp;
	if (VN_IS_DOOMED(dvp)) {
		/* forced unmount */
		vrele(dvp);
		error = ENOENT;
		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
		return (error);
	}
	/*
	 * *vp has its use count incremented still.
	 */

	return (0);
}
3418
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.  The reference is consumed: it is released
 * on success and on every error path (either here or by vn_vptocnp()).
 *
 * The path is assembled back to front in buf, which offers *len bytes for
 * this routine to fill.  A non-zero addend tells that the caller already
 * stored that many bytes (a "/name" leaf and the terminator) past those *len
 * bytes; with addend == 0 the NUL terminator is stored here.
 *
 * On success *retbuf points at the first character of the path within buf
 * and *len is updated to the number of bytes occupied by the result, addend
 * included.
 *
 * Returns 0 on success, ENOMEM if the path does not fit, ENOENT if an unmount
 * was detected while crossing a mount point, or the error from vn_vptocnp().
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;
        size_t buflen;
        int error;
        bool slash_prefixed;

        VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
        VNPASS(vp->v_usecount > 0, vp);

        buflen = *len;

        /*
         * slash_prefixed tracks whether the part assembled so far already
         * begins with '/'.  This is taken to be the case for whatever the
         * caller stored when addend != 0.
         */
        slash_prefixed = true;
        if (addend == 0) {
                MPASS(*len >= 2);
                buflen--;
                buf[buflen] = '\0';
                slash_prefixed = false;
        }

        error = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if (VN_IS_DOOMED(vp) ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        /*
                         * Continue the walk from the covered vnode; no name
                         * is emitted for the mount point root itself.
                         */
                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
                /* On failure vn_vptocnp() already dropped the reference. */
                error = vn_vptocnp(&vp, buf, &buflen);
                if (error)
                        break;
                if (buflen == 0) {
                        /* No room left for the separator. */
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = true;
        }
        if (error)
                return (error);
        if (!slash_prefixed) {
                /*
                 * Nothing was emitted, i.e. the starting vnode is the root
                 * itself: the result is a lone "/".
                 */
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        *retbuf = buf + buflen;
        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        /* Convert "bytes left" into "bytes used", caller's part included. */
        *len -= buflen;
        *len += addend;
        return (0);
}
3525
3526 /*
3527  * Resolve an arbitrary vnode to a pathname.
3528  *
3529  * Note 2 caveats:
3530  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3531  *   resolve to a different path than the one used to find it
3532  * - namecache is not mandatory, meaning names are not guaranteed to be added
3533  *   (in which case resolving fails)
3534  */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{
        /*
         * Remember the source line at which the lockless reverse lookup
         * gave up; the value is only reported for diagnostic purposes.
         */
        *reason = line;
}
/* Record the line of the call site in *var. */
#define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3542
/*
 * Lockless reverse lookup relying solely on the namecache.
 *
 * Must be called within a vfs_smr section.  The section is exited on every
 * return path.
 *
 * The path is assembled back to front in buf by following v_cache_dd entries
 * (and mnt_vnodecovered when standing on the root of a mount) until rdir or
 * rootvnode is reached.  No locks or references are taken; instead each step
 * is validated with the sequence counters of the vnodes involved and the
 * walk is aborted on any sign of a concurrent modification.
 *
 * A non-zero addend is the number of bytes the caller already stored at the
 * tail of buf; with addend == 0 the NUL terminator is stored here.
 *
 * Return values:
 * 0       success; *retbuf points at the start of the path within buf and
 *         *buflen is set to the number of bytes occupied, addend included
 * -1      no answer could be produced (fast lookup disabled or the walk had
 *         to be aborted); *buflen holds its value on entry so the caller can
 *         retry with the locked variant
 * ENOMEM  a name did not fit in the buffer; *buflen is reset as well
 */
static int
vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *tvp;
        struct mount *mp;
        struct namecache *ncp;
        size_t orig_buflen;
        int reason;
        int error;
#ifdef KDTRACE_HOOKS
        int i;
#endif
        seqc_t vp_seqc, tvp_seqc;
        u_char nc_flag;

        VFS_SMR_ASSERT_ENTERED();

        if (!atomic_load_char(&cache_fast_lookup_enabled)) {
                vfs_smr_exit();
                return (-1);
        }

        orig_buflen = *buflen;

        if (addend == 0) {
                MPASS(*buflen >= 2);
                *buflen -= 1;
                buf[*buflen] = '\0';
        }

        /* Already at the top: the path is "/" unless the caller added more. */
        if (vp == rdir || vp == rootvnode) {
                if (addend == 0) {
                        *buflen -= 1;
                        buf[*buflen] = '/';
                }
                goto out_ok;
        }

#ifdef KDTRACE_HOOKS
        /* Number of loop iterations, reported by the miss probe. */
        i = 0;
#endif
        error = -1;
        ncp = NULL; /* for sdt probe down below */
        vp_seqc = vn_seqc_read_any(vp);
        if (seqc_in_modify(vp_seqc)) {
                cache_rev_failed(&reason);
                goto out_abort;
        }

        for (;;) {
#ifdef KDTRACE_HOOKS
                i++;
#endif
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        /*
                         * Root of a mount: continue from the vnode the
                         * filesystem is mounted on, emitting no name.
                         */
                        mp = atomic_load_ptr(&vp->v_mount);
                        if (mp == NULL) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
                        tvp_seqc = vn_seqc_read_any(tvp);
                        if (seqc_in_modify(tvp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        /* vp must not have changed while it was inspected. */
                        if (!vn_seqc_consistent(vp, vp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        vp = tvp;
                        vp_seqc = tvp_seqc;
                        continue;
                }
                ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
                if (ncp == NULL) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* A ".." entry does not name vp within its parent. */
                nc_flag = atomic_load_char(&ncp->nc_flag);
                if ((nc_flag & NCF_ISDOTDOT) != 0) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* Room is needed for the name and the '/' in front of it. */
                if (ncp->nc_nlen >= *buflen) {
                        cache_rev_failed(&reason);
                        error = ENOMEM;
                        goto out_abort;
                }
                /*
                 * Copy the name first and validate afterwards; the copy is
                 * disregarded if any of the checks below fails.
                 */
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                *buflen -= 1;
                buf[*buflen] = '/';
                tvp = ncp->nc_dvp;
                tvp_seqc = vn_seqc_read_any(tvp);
                if (seqc_in_modify(tvp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /*
                 * Acquire fence provided by vn_seqc_read_any above.
                 */
                if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                if (!cache_ncp_canuse(ncp)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                vp = tvp;
                vp_seqc = tvp_seqc;
                if (vp == rdir || vp == rootvnode)
                        break;
        }
out_ok:
        vfs_smr_exit();
        *retbuf = buf + *buflen;
        /* Convert "bytes left" into "bytes used", caller's part included. */
        *buflen = orig_buflen - *buflen + addend;
        SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
        return (0);

out_abort:
        /* Undo the length accounting so that the caller can start over. */
        *buflen = orig_buflen;
        SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
        vfs_smr_exit();
        return (error);
}
3678
3679 static int
3680 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3681     size_t *buflen)
3682 {
3683         size_t orig_buflen, addend;
3684         int error;
3685
3686         if (*buflen < 2)
3687                 return (EINVAL);
3688
3689         orig_buflen = *buflen;
3690
3691         vref(vp);
3692         addend = 0;
3693         if (vp->v_type != VDIR) {
3694                 *buflen -= 1;
3695                 buf[*buflen] = '\0';
3696                 error = vn_vptocnp(&vp, buf, buflen);
3697                 if (error)
3698                         return (error);
3699                 if (*buflen == 0) {
3700                         vrele(vp);
3701                         return (ENOMEM);
3702                 }
3703                 *buflen -= 1;
3704                 buf[*buflen] = '/';
3705                 addend = orig_buflen - *buflen;
3706         }
3707
3708         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3709 }
3710
/*
 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
 *
 * Since the namecache does not track hardlinks, the caller is expected to
 * first look up the target vnode with WANTPARENT flag passed to namei to get
 * dvp and vp.
 *
 * Then we have 2 cases:
 * - if the found vnode is a directory, the path can be constructed just by
 *   following names up the chain
 * - otherwise we populate the buffer with the saved name and start resolving
 *   from the parent
 *
 * hrdl_name/hrdl_name_length describe the saved leaf name; it is copied by
 * length, no terminator is expected in it.
 *
 * *buflen is the buffer size to use on entry (clamped to MAXPATHLEN, EINVAL
 * if smaller than 2) and is updated by the resolving routines on success.
 * On success *retbuf points at the path inside a buffer allocated from
 * M_TEMP, which is returned in *freebuf for the caller to free.  On error the
 * buffer is freed here.
 */
int
vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
    const char *hrdl_name, size_t hrdl_name_length,
    char **retbuf, char **freebuf, size_t *buflen)
{
        char *buf, *tmpbuf;
        struct pwd *pwd;
        size_t addend;
        int error;
        __enum_uint8(vtype) type;

        if (*buflen < 2)
                return (EINVAL);
        if (*buflen > MAXPATHLEN)
                *buflen = MAXPATHLEN;

        buf = malloc(*buflen, M_TEMP, M_WAITOK);

        addend = 0;

        /*
         * Check for VBAD to work around the vp_crossmp bug in lookup().
         *
         * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
         * set to mount point's root vnode while ni_dvp will be vp_crossmp.
         * If the type is VDIR (like in this very case) we can skip looking
         * at ni_dvp in the first place. However, since vnodes get passed here
         * unlocked the target may transition to doomed state (type == VBAD)
         * before we get to evaluate the condition. If this happens, we will
         * populate part of the buffer and descend to vn_fullpath_dir with
         * vp == vp_crossmp. Prevent the problem by checking for VBAD.
         */
        type = atomic_load_8(&vp->v_type);
        if (type == VBAD) {
                error = ENOENT;
                goto out_bad;
        }
        if (type != VDIR) {
                /*
                 * Store "/" + name + NUL at the very end of the buffer and
                 * resolve the parent directory instead.
                 */
                addend = hrdl_name_length + 2;
                if (*buflen < addend) {
                        error = ENOMEM;
                        goto out_bad;
                }
                *buflen -= addend;
                tmpbuf = buf + *buflen;
                tmpbuf[0] = '/';
                memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
                tmpbuf[addend - 1] = '\0';
                vp = dvp;
        }

        /* Lockless attempt; the callee exits the SMR section. */
        vfs_smr_enter();
        pwd = pwd_get_smr();
        error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
            addend);
        VFS_SMR_ASSERT_NOT_ENTERED();
        if (error < 0) {
                /*
                 * Fall back to the locked walk.  vn_fullpath_dir() consumes
                 * a reference on the vnode, hence the vref().
                 */
                pwd = pwd_hold(curthread);
                vref(vp);
                error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
                    addend);
                pwd_drop(pwd);
        }
        if (error != 0)
                goto out_bad;

        *freebuf = buf;

        return (0);
out_bad:
        free(buf, M_TEMP);
        return (error);
}
3797
3798 struct vnode *
3799 vn_dir_dd_ino(struct vnode *vp)
3800 {
3801         struct namecache *ncp;
3802         struct vnode *ddvp;
3803         struct mtx *vlp;
3804         enum vgetstate vs;
3805
3806         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3807         vlp = VP2VNODELOCK(vp);
3808         mtx_lock(vlp);
3809         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3810                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3811                         continue;
3812                 ddvp = ncp->nc_dvp;
3813                 vs = vget_prep(ddvp);
3814                 mtx_unlock(vlp);
3815                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3816                         return (NULL);
3817                 return (ddvp);
3818         }
3819         mtx_unlock(vlp);
3820         return (NULL);
3821 }
3822
3823 int
3824 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3825 {
3826         struct namecache *ncp;
3827         struct mtx *vlp;
3828         int l;
3829
3830         vlp = VP2VNODELOCK(vp);
3831         mtx_lock(vlp);
3832         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3833                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3834                         break;
3835         if (ncp == NULL) {
3836                 mtx_unlock(vlp);
3837                 return (ENOENT);
3838         }
3839         l = min(ncp->nc_nlen, buflen - 1);
3840         memcpy(buf, ncp->nc_name, l);
3841         mtx_unlock(vlp);
3842         buf[l] = '\0';
3843         return (0);
3844 }
3845
3846 /*
3847  * This function updates path string to vnode's full global path
3848  * and checks the size of the new path string against the pathlen argument.
3849  *
3850  * Requires a locked, referenced vnode.
3851  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3852  *
3853  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3854  * because it falls back to the ".." lookup if the namecache lookup fails.
3855  */
3856 int
3857 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3858     u_int pathlen)
3859 {
3860         struct nameidata nd;
3861         struct vnode *vp1;
3862         char *rpath, *fbuf;
3863         int error;
3864
3865         ASSERT_VOP_ELOCKED(vp, __func__);
3866
3867         /* Construct global filesystem path from vp. */
3868         VOP_UNLOCK(vp);
3869         error = vn_fullpath_global(vp, &rpath, &fbuf);
3870
3871         if (error != 0) {
3872                 vrele(vp);
3873                 return (error);
3874         }
3875
3876         if (strlen(rpath) >= pathlen) {
3877                 vrele(vp);
3878                 error = ENAMETOOLONG;
3879                 goto out;
3880         }
3881
3882         /*
3883          * Re-lookup the vnode by path to detect a possible rename.
3884          * As a side effect, the vnode is relocked.
3885          * If vnode was renamed, return ENOENT.
3886          */
3887         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3888         error = namei(&nd);
3889         if (error != 0) {
3890                 vrele(vp);
3891                 goto out;
3892         }
3893         NDFREE_PNBUF(&nd);
3894         vp1 = nd.ni_vp;
3895         vrele(vp);
3896         if (vp1 == vp)
3897                 strcpy(path, rpath);
3898         else {
3899                 vput(vp1);
3900                 error = ENOENT;
3901         }
3902
3903 out:
3904         free(fbuf, M_TEMP);
3905         return (error);
3906 }
3907
3908 /*
3909  * This is similar to vn_path_to_global_path but allows for regular
3910  * files which may not be present in the cache.
3911  *
3912  * Requires a locked, referenced vnode.
3913  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3914  */
3915 int
3916 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
3917     struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
3918     size_t leaf_length)
3919 {
3920         struct nameidata nd;
3921         struct vnode *vp1;
3922         char *rpath, *fbuf;
3923         size_t len;
3924         int error;
3925
3926         ASSERT_VOP_ELOCKED(vp, __func__);
3927
3928         /*
3929          * Construct global filesystem path from dvp, vp and leaf
3930          * name.
3931          */
3932         VOP_UNLOCK(vp);
3933         len = pathlen;
3934         error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
3935             &rpath, &fbuf, &len);
3936
3937         if (error != 0) {
3938                 vrele(vp);
3939                 return (error);
3940         }
3941
3942         if (strlen(rpath) >= pathlen) {
3943                 vrele(vp);
3944                 error = ENAMETOOLONG;
3945                 goto out;
3946         }
3947
3948         /*
3949          * Re-lookup the vnode by path to detect a possible rename.
3950          * As a side effect, the vnode is relocked.
3951          * If vnode was renamed, return ENOENT.
3952          */
3953         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
3954         error = namei(&nd);
3955         if (error != 0) {
3956                 vrele(vp);
3957                 goto out;
3958         }
3959         NDFREE_PNBUF(&nd);
3960         vp1 = nd.ni_vp;
3961         vrele(vp);
3962         if (vp1 == vp)
3963                 strcpy(path, rpath);
3964         else {
3965                 vput(vp1);
3966                 error = ENOENT;
3967         }
3968
3969 out:
3970         free(fbuf, M_TEMP);
3971         return (error);
3972 }
3973
3974 #ifdef DDB
3975 static void
3976 db_print_vpath(struct vnode *vp)
3977 {
3978
3979         while (vp != NULL) {
3980                 db_printf("%p: ", vp);
3981                 if (vp == rootvnode) {
3982                         db_printf("/");
3983                         vp = NULL;
3984                 } else {
3985                         if (vp->v_vflag & VV_ROOT) {
3986                                 db_printf("<mount point>");
3987                                 vp = vp->v_mount->mnt_vnodecovered;
3988                         } else {
3989                                 struct namecache *ncp;
3990                                 char *ncn;
3991                                 int i;
3992
3993                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3994                                 if (ncp != NULL) {
3995                                         ncn = ncp->nc_name;
3996                                         for (i = 0; i < ncp->nc_nlen; i++)
3997                                                 db_printf("%c", *ncn++);
3998                                         vp = ncp->nc_dvp;
3999                                 } else {
4000                                         vp = NULL;
4001                                 }
4002                         }
4003                 }
4004                 db_printf("\n");
4005         }
4006
4007         return;
4008 }
4009
4010 DB_SHOW_COMMAND(vpath, db_show_vpath)
4011 {
4012         struct vnode *vp;
4013
4014         if (!have_addr) {
4015                 db_printf("usage: show vpath <struct vnode *>\n");
4016                 return;
4017         }
4018
4019         vp = (struct vnode *)addr;
4020         db_print_vpath(vp);
4021 }
4022
4023 #endif
4024
/*
 * Administrative knob for the fast path lookup, exported as the fast_lookup
 * sysctl below.  It is one of the inputs of
 * cache_fast_lookup_enabled_recalc().
 */
static int cache_fast_lookup = 1;

/*
 * Value returned by the fast path lookup helpers when they give up, see
 * cache_fpl_aborted_early_impl().
 */
#define CACHE_FPL_FAILED        -2020
4028
/*
 * Placeholder vop_fplookup_vexec installed by cache_vop_vector_register() in
 * vectors which do not provide the fplookup ops.  Prints the vnode and
 * panics.
 */
static int
cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
{
        vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
        panic("no proper vop_fplookup_vexec");
}
4035
/*
 * Placeholder vop_fplookup_symlink installed by cache_vop_vector_register()
 * in vectors which do not provide the fplookup ops.  Prints the vnode and
 * panics.
 */
static int
cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
{
        vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
        panic("no proper vop_fplookup_symlink");
}
4042
4043 void
4044 cache_vop_vector_register(struct vop_vector *v)
4045 {
4046         size_t ops;
4047
4048         ops = 0;
4049         if (v->vop_fplookup_vexec != NULL) {
4050                 ops++;
4051         }
4052         if (v->vop_fplookup_symlink != NULL) {
4053                 ops++;
4054         }
4055
4056         if (ops == 2) {
4057                 return;
4058         }
4059
4060         if (ops == 0) {
4061                 v->vop_fplookup_vexec = cache_vop_bad_vexec;
4062                 v->vop_fplookup_symlink = cache_vop_bad_symlink;
4063                 return;
4064         }
4065
4066         printf("%s: invalid vop vector %p -- either all or none fplookup vops "
4067             "need to be provided",  __func__, v);
4068         if (v->vop_fplookup_vexec == NULL) {
4069                 printf("%s: missing vop_fplookup_vexec\n", __func__);
4070         }
4071         if (v->vop_fplookup_symlink == NULL) {
4072                 printf("%s: missing vop_fplookup_symlink\n", __func__);
4073         }
4074         panic("bad vop vector %p", v);
4075 }
4076
4077 #ifdef INVARIANTS
4078 void
4079 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
4080 {
4081         if (mp == NULL)
4082                 return;
4083
4084         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4085                 return;
4086
4087         if (vops->vop_fplookup_vexec == NULL ||
4088             vops->vop_fplookup_vexec == cache_vop_bad_vexec)
4089                 panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
4090                     vops, mp->mnt_vfc->vfc_name);
4091
4092         if (vops->vop_fplookup_symlink == NULL ||
4093             vops->vop_fplookup_symlink == cache_vop_bad_symlink)
4094                 panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
4095                     vops, mp->mnt_vfc->vfc_name);
4096 }
4097 #endif
4098
4099 void
4100 cache_fast_lookup_enabled_recalc(void)
4101 {
4102         int lookup_flag;
4103         int mac_on;
4104
4105 #ifdef MAC
4106         mac_on = mac_vnode_check_lookup_enabled();
4107         mac_on |= mac_vnode_check_readlink_enabled();
4108 #else
4109         mac_on = 0;
4110 #endif
4111
4112         lookup_flag = atomic_load_int(&cache_fast_lookup);
4113         if (lookup_flag && !mac_on) {
4114                 atomic_store_char(&cache_fast_lookup_enabled, true);
4115         } else {
4116                 atomic_store_char(&cache_fast_lookup_enabled, false);
4117         }
4118 }
4119
4120 static int
4121 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
4122 {
4123         int error, old;
4124
4125         old = atomic_load_int(&cache_fast_lookup);
4126         error = sysctl_handle_int(oidp, arg1, arg2, req);
4127         if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
4128                 cache_fast_lookup_enabled_recalc();
4129         return (error);
4130 }
4131 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
4132     &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
4133
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * The values are captured by cache_fpl_checkpoint_outer().  cn_flags is put
 * back by cache_fpl_restore_partial() and ni_pathlen by
 * cache_fpl_restore_abort().
 */
struct nameidata_outer {
        size_t ni_pathlen;      /* copy of ndp->ni_pathlen */
        int cn_flags;           /* copy of ndp->ni_cnd.cn_flags */
};
4142
/*
 * Snapshot taken by cache_fpl_checkpoint().  The fields only exist in
 * kernels built with INVARIANTS; the structure is empty otherwise.
 */
struct nameidata_saved {
#ifdef INVARIANTS
        char *cn_nameptr;       /* copy of ndp->ni_cnd.cn_nameptr */
        size_t ni_pathlen;      /* copy of fpl->debug.ni_pathlen */
#endif
};
4149
#ifdef INVARIANTS
/*
 * Debug-only state of a fast path lookup.  ni_pathlen is saved by
 * cache_fpl_checkpoint() and put back by cache_fpl_restore_partial().
 * NOTE(review): presumably a shadow copy of the remaining path length kept
 * by the cache_fpl_pathlen_* helpers -- confirm against their definitions.
 */
struct cache_fpl_debug {
        size_t ni_pathlen;
};
#endif
4155
/*
 * State of a single fast path lookup, passed to all cache_fpl_* and
 * cache_fplookup_* helpers.
 *
 * NOTE(review): several fields are not referenced in the part of the file
 * documented here; the notes on them are assumptions based on their names
 * and need confirming against the code which uses them.
 */
struct cache_fpl {
        struct nameidata *ndp;          /* nameidata of the lookup */
        struct componentname *cnp;      /* presumably &ndp->ni_cnd -- confirm */
        char *nulchar;                  /* not used in this part of the file */
        struct vnode *dvp;              /* presumably the current directory */
        struct vnode *tvp;              /* presumably the current target */
        seqc_t dvp_seqc;                /* presumably seqc snapshot of dvp */
        seqc_t tvp_seqc;                /* presumably seqc snapshot of tvp */
        uint32_t hash;                  /* not used in this part of the file */
        struct nameidata_saved snd;     /* see cache_fpl_checkpoint() */
        struct nameidata_outer snd_outer; /* see cache_fpl_checkpoint_outer() */
        int line;                       /* source line which set the status */
        enum cache_fpl_status status:8; /* outcome, CACHE_FPL_STATUS_* */
        bool in_smr;                    /* kept by the cache_fpl_smr_* macros */
        bool fsearch;                   /* not used in this part of the file */
        struct pwd **pwd;               /* not used in this part of the file */
#ifdef INVARIANTS
        struct cache_fpl_debug debug;   /* debug-only state */
#endif
};
4176
/*
 * Forward declarations of static helpers, some of which are used before
 * their definitions (e.g. cache_fpl_pathlen_dec() in
 * cache_fpl_handle_root()).
 */
static bool cache_fplookup_mp_supported(struct mount *mp);
static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
4187
/*
 * Return the pathname buffer of cnp to namei_zone and clear the pointers
 * into it so that they are not left dangling.
 */
static void
cache_fpl_cleanup_cnp(struct componentname *cnp)
{

        uma_zfree(namei_zone, cnp->cn_pnbuf);
        cnp->cn_pnbuf = NULL;
        cnp->cn_nameptr = NULL;
}
4196
4197 static struct vnode *
4198 cache_fpl_handle_root(struct cache_fpl *fpl)
4199 {
4200         struct nameidata *ndp;
4201         struct componentname *cnp;
4202
4203         ndp = fpl->ndp;
4204         cnp = fpl->cnp;
4205
4206         MPASS(*(cnp->cn_nameptr) == '/');
4207         cnp->cn_nameptr++;
4208         cache_fpl_pathlen_dec(fpl);
4209
4210         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4211                 do {
4212                         cnp->cn_nameptr++;
4213                         cache_fpl_pathlen_dec(fpl);
4214                 } while (*(cnp->cn_nameptr) == '/');
4215         }
4216
4217         return (ndp->ni_rootdir);
4218 }
4219
4220 static void
4221 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
4222 {
4223
4224         fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
4225         fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
4226 }
4227
/*
 * Record the current name pointer and the debug path length in fpl->snd.
 * This only does anything in kernels built with INVARIANTS.
 */
static void
cache_fpl_checkpoint(struct cache_fpl *fpl)
{

#ifdef INVARIANTS
        fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
        fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
#endif
}
4237
/*
 * Put back the component name flags saved by cache_fpl_checkpoint_outer()
 * and, with INVARIANTS, the debug path length saved by
 * cache_fpl_checkpoint().  The name pointer is left alone.
 */
static void
cache_fpl_restore_partial(struct cache_fpl *fpl)
{

        fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
#ifdef INVARIANTS
        fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
#endif
}
4247
4248 static void
4249 cache_fpl_restore_abort(struct cache_fpl *fpl)
4250 {
4251
4252         cache_fpl_restore_partial(fpl);
4253         /*
4254          * It is 0 on entry by API contract.
4255          */
4256         fpl->ndp->ni_resflags = 0;
4257         fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
4258         fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
4259 }
4260
#ifdef INVARIANTS
/*
 * Assert that the lookup is inside an SMR section, both according to the
 * in_smr flag and to the SMR machinery itself.
 */
#define cache_fpl_smr_assert_entered(fpl) ({                    \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        VFS_SMR_ASSERT_ENTERED();                               \
})
/*
 * Assert that the lookup is outside of an SMR section.
 */
#define cache_fpl_smr_assert_not_entered(fpl) ({                \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        VFS_SMR_ASSERT_NOT_ENTERED();                           \
})
/*
 * Assert that a status was set: CACHE_FPL_STATUS_UNSET is the only value
 * which is rejected.  The switch has no default case, presumably so that
 * the compiler can point out values added to the enum later.
 */
static void
cache_fpl_assert_status(struct cache_fpl *fpl)
{

        switch (fpl->status) {
        case CACHE_FPL_STATUS_UNSET:
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_DESTROYED:
        case CACHE_FPL_STATUS_ABORTED:
        case CACHE_FPL_STATUS_PARTIAL:
        case CACHE_FPL_STATUS_HANDLED:
                break;
        }
}
#else
/* Without INVARIANTS the assertions compile to nothing. */
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#define cache_fpl_assert_status(fpl) do { } while (0)
#endif
4292
/*
 * Enter the VFS SMR section and record that in fpl->in_smr.  Unlike
 * cache_fpl_smr_enter, the previous value of in_smr is not asserted.
 */
#define cache_fpl_smr_enter_initial(fpl) ({                     \
        struct cache_fpl *_fpl = (fpl);                         \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})
4298
/*
 * Enter the VFS SMR section, asserting (via MPASS) that the fpl is not
 * already marked as being inside it, and set fpl->in_smr.
 */
#define cache_fpl_smr_enter(fpl) ({                             \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})
4305
/*
 * Leave the VFS SMR section, asserting (via MPASS) that the fpl is marked
 * as being inside it, and clear fpl->in_smr.
 */
#define cache_fpl_smr_exit(fpl) ({                              \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        vfs_smr_exit();                                         \
        _fpl->in_smr = false;                                   \
})
4312
4313 static int
4314 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
4315 {
4316
4317         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4318                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4319                     ("%s: converting to abort from %d at %d, set at %d\n",
4320                     __func__, fpl->status, line, fpl->line));
4321         }
4322         cache_fpl_smr_assert_not_entered(fpl);
4323         fpl->status = CACHE_FPL_STATUS_ABORTED;
4324         fpl->line = line;
4325         return (CACHE_FPL_FAILED);
4326 }
4327
4328 #define cache_fpl_aborted_early(x)      cache_fpl_aborted_early_impl((x), __LINE__)
4329
4330 static int __noinline
4331 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
4332 {
4333         struct nameidata *ndp;
4334         struct componentname *cnp;
4335
4336         ndp = fpl->ndp;
4337         cnp = fpl->cnp;
4338
4339         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
4340                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
4341                     ("%s: converting to abort from %d at %d, set at %d\n",
4342                     __func__, fpl->status, line, fpl->line));
4343         }
4344         fpl->status = CACHE_FPL_STATUS_ABORTED;
4345         fpl->line = line;
4346         if (fpl->in_smr)
4347                 cache_fpl_smr_exit(fpl);
4348         cache_fpl_restore_abort(fpl);
4349         /*
4350          * Resolving symlinks overwrites data passed by the caller.
4351          * Let namei know.
4352          */
4353         if (ndp->ni_loopcnt > 0) {
4354                 fpl->status = CACHE_FPL_STATUS_DESTROYED;
4355                 cache_fpl_cleanup_cnp(cnp);
4356         }
4357         return (CACHE_FPL_FAILED);
4358 }
4359
4360 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
4361
4362 static int __noinline
4363 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
4364 {
4365
4366         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4367             ("%s: setting to partial at %d, but already set to %d at %d\n",
4368             __func__, line, fpl->status, fpl->line));
4369         cache_fpl_smr_assert_entered(fpl);
4370         fpl->status = CACHE_FPL_STATUS_PARTIAL;
4371         fpl->line = line;
4372         return (cache_fplookup_partial_setup(fpl));
4373 }
4374
4375 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
4376
4377 static int
4378 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
4379 {
4380
4381         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4382             ("%s: setting to handled at %d, but already set to %d at %d\n",
4383             __func__, line, fpl->status, fpl->line));
4384         cache_fpl_smr_assert_not_entered(fpl);
4385         fpl->status = CACHE_FPL_STATUS_HANDLED;
4386         fpl->line = line;
4387         return (0);
4388 }
4389
4390 #define cache_fpl_handled(x)    cache_fpl_handled_impl((x), __LINE__)
4391
4392 static int
4393 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
4394 {
4395
4396         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
4397             ("%s: setting to handled at %d, but already set to %d at %d\n",
4398             __func__, line, fpl->status, fpl->line));
4399         MPASS(error != 0);
4400         MPASS(error != CACHE_FPL_FAILED);
4401         cache_fpl_smr_assert_not_entered(fpl);
4402         fpl->status = CACHE_FPL_STATUS_HANDLED;
4403         fpl->line = line;
4404         fpl->dvp = NULL;
4405         fpl->tvp = NULL;
4406         return (error);
4407 }
4408
4409 #define cache_fpl_handled_error(x, e)   cache_fpl_handled_error_impl((x), (e), __LINE__)
4410
4411 static bool
4412 cache_fpl_terminated(struct cache_fpl *fpl)
4413 {
4414
4415         return (fpl->status != CACHE_FPL_STATUS_UNSET);
4416 }
4417
4418 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
4419         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
4420          FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
4421          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
4422          OPENWRITE | WANTIOCTLCAPS)
4423
4424 #define CACHE_FPL_INTERNAL_CN_FLAGS \
4425         (ISDOTDOT | MAKEENTRY | ISLASTCN)
4426
4427 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
4428     "supported and internal flags overlap");
4429
4430 static bool
4431 cache_fpl_islastcn(struct nameidata *ndp)
4432 {
4433
4434         return (*ndp->ni_next == 0);
4435 }
4436
4437 static bool
4438 cache_fpl_istrailingslash(struct cache_fpl *fpl)
4439 {
4440
4441         MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf);
4442         return (*(fpl->nulchar - 1) == '/');
4443 }
4444
4445 static bool
4446 cache_fpl_isdotdot(struct componentname *cnp)
4447 {
4448
4449         if (cnp->cn_namelen == 2 &&
4450             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
4451                 return (true);
4452         return (false);
4453 }
4454
4455 static bool
4456 cache_can_fplookup(struct cache_fpl *fpl)
4457 {
4458         struct nameidata *ndp;
4459         struct componentname *cnp;
4460         struct thread *td;
4461
4462         ndp = fpl->ndp;
4463         cnp = fpl->cnp;
4464         td = curthread;
4465
4466         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
4467                 cache_fpl_aborted_early(fpl);
4468                 return (false);
4469         }
4470         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4471                 cache_fpl_aborted_early(fpl);
4472                 return (false);
4473         }
4474         if (IN_CAPABILITY_MODE(td)) {
4475                 cache_fpl_aborted_early(fpl);
4476                 return (false);
4477         }
4478         if (AUDITING_TD(td)) {
4479                 cache_fpl_aborted_early(fpl);
4480                 return (false);
4481         }
4482         if (ndp->ni_startdir != NULL) {
4483                 cache_fpl_aborted_early(fpl);
4484                 return (false);
4485         }
4486         return (true);
4487 }
4488
4489 static int __noinline
4490 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4491 {
4492         struct nameidata *ndp;
4493         struct componentname *cnp;
4494         int error;
4495         bool fsearch;
4496
4497         ndp = fpl->ndp;
4498         cnp = fpl->cnp;
4499
4500         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4501         if (__predict_false(error != 0)) {
4502                 return (cache_fpl_aborted(fpl));
4503         }
4504         fpl->fsearch = fsearch;
4505         if ((*vpp)->v_type != VDIR) {
4506                 if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) {
4507                         cache_fpl_smr_exit(fpl);
4508                         return (cache_fpl_handled_error(fpl, ENOTDIR));
4509                 }
4510         }
4511         return (0);
4512 }
4513
4514 static int __noinline
4515 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4516     uint32_t hash)
4517 {
4518         struct componentname *cnp;
4519         struct vnode *dvp;
4520
4521         cnp = fpl->cnp;
4522         dvp = fpl->dvp;
4523
4524         cache_fpl_smr_exit(fpl);
4525         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4526                 return (cache_fpl_handled_error(fpl, ENOENT));
4527         else
4528                 return (cache_fpl_aborted(fpl));
4529 }
4530
4531 /*
4532  * The target vnode is not supported, prepare for the slow path to take over.
4533  */
4534 static int __noinline
4535 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4536 {
4537         struct nameidata *ndp;
4538         struct componentname *cnp;
4539         enum vgetstate dvs;
4540         struct vnode *dvp;
4541         struct pwd *pwd;
4542         seqc_t dvp_seqc;
4543
4544         ndp = fpl->ndp;
4545         cnp = fpl->cnp;
4546         pwd = *(fpl->pwd);
4547         dvp = fpl->dvp;
4548         dvp_seqc = fpl->dvp_seqc;
4549
4550         if (!pwd_hold_smr(pwd)) {
4551                 return (cache_fpl_aborted(fpl));
4552         }
4553
4554         /*
4555          * Note that seqc is checked before the vnode is locked, so by
4556          * the time regular lookup gets to it it may have moved.
4557          *
4558          * Ultimately this does not affect correctness, any lookup errors
4559          * are userspace racing with itself. It is guaranteed that any
4560          * path which ultimately gets found could also have been found
4561          * by regular lookup going all the way in absence of concurrent
4562          * modifications.
4563          */
4564         dvs = vget_prep_smr(dvp);
4565         cache_fpl_smr_exit(fpl);
4566         if (__predict_false(dvs == VGET_NONE)) {
4567                 pwd_drop(pwd);
4568                 return (cache_fpl_aborted(fpl));
4569         }
4570
4571         vget_finish_ref(dvp, dvs);
4572         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4573                 vrele(dvp);
4574                 pwd_drop(pwd);
4575                 return (cache_fpl_aborted(fpl));
4576         }
4577
4578         cache_fpl_restore_partial(fpl);
4579 #ifdef INVARIANTS
4580         if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4581                 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4582                     cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4583         }
4584 #endif
4585
4586         ndp->ni_startdir = dvp;
4587         cnp->cn_flags |= MAKEENTRY;
4588         if (cache_fpl_islastcn(ndp))
4589                 cnp->cn_flags |= ISLASTCN;
4590         if (cache_fpl_isdotdot(cnp))
4591                 cnp->cn_flags |= ISDOTDOT;
4592
4593         /*
4594          * Skip potential extra slashes parsing did not take care of.
4595          * cache_fplookup_skip_slashes explains the mechanism.
4596          */
4597         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4598                 do {
4599                         cnp->cn_nameptr++;
4600                         cache_fpl_pathlen_dec(fpl);
4601                 } while (*(cnp->cn_nameptr) == '/');
4602         }
4603
4604         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4605 #ifdef INVARIANTS
4606         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4607                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4608                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4609                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4610         }
4611 #endif
4612         return (0);
4613 }
4614
4615 static int
4616 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4617 {
4618         struct componentname *cnp;
4619         struct vnode *tvp;
4620         seqc_t tvp_seqc;
4621         int error, lkflags;
4622
4623         cnp = fpl->cnp;
4624         tvp = fpl->tvp;
4625         tvp_seqc = fpl->tvp_seqc;
4626
4627         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4628                 lkflags = LK_SHARED;
4629                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4630                         lkflags = LK_EXCLUSIVE;
4631                 error = vget_finish(tvp, lkflags, tvs);
4632                 if (__predict_false(error != 0)) {
4633                         return (cache_fpl_aborted(fpl));
4634                 }
4635         } else {
4636                 vget_finish_ref(tvp, tvs);
4637         }
4638
4639         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4640                 if ((cnp->cn_flags & LOCKLEAF) != 0)
4641                         vput(tvp);
4642                 else
4643                         vrele(tvp);
4644                 return (cache_fpl_aborted(fpl));
4645         }
4646
4647         return (cache_fpl_handled(fpl));
4648 }
4649
/*
 * They want to possibly modify the state of the namecache.
 *
 * Handles the last path component of a CREATE, DELETE or RENAME lookup
 * (asserted below).  After a few checks done while still in the SMR
 * section, the parent directory is referenced, locked exclusively and
 * VOP_LOOKUP is called on it.
 *
 * Possible outcomes:
 * - cache_fpl_handled: success, fpl->tvp is the found vnode or NULL when
 *   the filesystem returned EJUSTRETURN; dvp and tvp stay locked only if
 *   LOCKPARENT and LOCKLEAF respectively were requested
 * - cache_fpl_handled_error: definitive EROFS, EEXIST, ENOTDIR or ENOENT
 * - cache_fpl_partial: there is a trailing slash, the slow path continues
 * - cache_fpl_aborted: everything else, all acquired vnodes are released
 */
static int __noinline
cache_fplookup_final_modifying(struct cache_fpl *fpl)
{
        struct nameidata *ndp __diagused;
        struct componentname *cnp;
        enum vgetstate dvs;
        struct vnode *dvp, *tvp;
        struct mount *mp;
        seqc_t dvp_seqc;
        int error;
        bool docache;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        MPASS(*(cnp->cn_nameptr) != '/');
        MPASS(cache_fpl_islastcn(ndp));
        if ((cnp->cn_flags & LOCKPARENT) == 0)
                MPASS((cnp->cn_flags & WANTPARENT) != 0);
        MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
        MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
            cnp->cn_nameiop == RENAME);
        MPASS((cnp->cn_flags & MAKEENTRY) == 0);
        MPASS((cnp->cn_flags & ISDOTDOT) == 0);

        /*
         * docache is true when NOCACHE is not set, but never for DELETE
         * and RENAME.  It decides whether MAKEENTRY is passed to the
         * filesystem below.
         */
        docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
        if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
                docache = false;

        /*
         * Regular lookup nullifies the slash, which we don't do here.
         * Don't take chances with filesystem routines seeing it for
         * the last entry.
         */
        if (cache_fpl_istrailingslash(fpl)) {
                return (cache_fpl_partial(fpl));
        }

        /* v_mount is read without holding any lock and can be NULL. */
        mp = atomic_load_ptr(&dvp->v_mount);
        if (__predict_false(mp == NULL)) {
                return (cache_fpl_aborted(fpl));
        }

        if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
                cache_fpl_smr_exit(fpl);
                /*
                 * Original code keeps not checking for CREATE which
                 * might be a bug. For now let the old lookup decide.
                 */
                if (cnp->cn_nameiop == CREATE) {
                        return (cache_fpl_aborted(fpl));
                }
                return (cache_fpl_handled_error(fpl, EROFS));
        }

        /* The target is already known to exist, no need to ask the fs. */
        if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled_error(fpl, EEXIST));
        }

        /*
         * Secure access to dvp; check cache_fplookup_partial_setup for
         * reasoning.
         *
         * XXX At least UFS requires its lookup routine to be called for
         * the last path component, which leads to some level of complication
         * and inefficiency:
         * - the target routine always locks the target vnode, but our caller
         *   may not need it locked
         * - some of the VOP machinery asserts that the parent is locked, which
         *   once more may be not required
         *
         * TODO: add a flag for filesystems which don't need this.
         */
        dvs = vget_prep_smr(dvp);
        cache_fpl_smr_exit(fpl);
        if (__predict_false(dvs == VGET_NONE)) {
                return (cache_fpl_aborted(fpl));
        }

        vget_finish_ref(dvp, dvs);
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        error = vn_lock(dvp, LK_EXCLUSIVE);
        if (__predict_false(error != 0)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        /*
         * From here on dvp is referenced and locked: every exit path has
         * to vput it, or unlock it if only the reference is to be kept.
         */
        tvp = NULL;
        cnp->cn_flags |= ISLASTCN;
        if (docache)
                cnp->cn_flags |= MAKEENTRY;
        if (cache_fpl_isdotdot(cnp))
                cnp->cn_flags |= ISDOTDOT;
        cnp->cn_lkflags = LK_EXCLUSIVE;
        error = VOP_LOOKUP(dvp, &tvp, cnp);
        switch (error) {
        case EJUSTRETURN:
        case 0:
                break;
        case ENOTDIR:
        case ENOENT:
                vput(dvp);
                return (cache_fpl_handled_error(fpl, error));
        default:
                vput(dvp);
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp = tvp;

        /* No target vnode: only legal with EJUSTRETURN, asserted below. */
        if (tvp == NULL) {
                MPASS(error == EJUSTRETURN);
                if ((cnp->cn_flags & LOCKPARENT) == 0) {
                        VOP_UNLOCK(dvp);
                }
                return (cache_fpl_handled(fpl));
        }

        /*
         * There are very hairy corner cases concerning various flag combinations
         * and locking state. In particular here we only hold one lock instead of
         * two.
         *
         * Skip the complexity as it is of no significance for normal workloads.
         */
        if (__predict_false(tvp == dvp)) {
                vput(dvp);
                vrele(tvp);
                return (cache_fpl_aborted(fpl));
        }

        /*
         * If they want the symlink itself we are fine, but if they want to
         * follow it regular lookup has to be engaged.
         */
        if (tvp->v_type == VLNK) {
                if ((cnp->cn_flags & FOLLOW) != 0) {
                        vput(dvp);
                        vput(tvp);
                        return (cache_fpl_aborted(fpl));
                }
        }

        /*
         * Since we expect this to be the terminal vnode it should almost never
         * be a mount point.
         */
        if (__predict_false(cache_fplookup_is_mp(fpl))) {
                vput(dvp);
                vput(tvp);
                return (cache_fpl_aborted(fpl));
        }

        if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
                vput(dvp);
                vput(tvp);
                return (cache_fpl_handled_error(fpl, EEXIST));
        }

        /* Success: drop the locks the caller did not ask for. */
        if ((cnp->cn_flags & LOCKLEAF) == 0) {
                VOP_UNLOCK(tvp);
        }

        if ((cnp->cn_flags & LOCKPARENT) == 0) {
                VOP_UNLOCK(dvp);
        }

        return (cache_fpl_handled(fpl));
}
4829
4830 static int __noinline
4831 cache_fplookup_modifying(struct cache_fpl *fpl)
4832 {
4833         struct nameidata *ndp;
4834
4835         ndp = fpl->ndp;
4836
4837         if (!cache_fpl_islastcn(ndp)) {
4838                 return (cache_fpl_partial(fpl));
4839         }
4840         return (cache_fplookup_final_modifying(fpl));
4841 }
4842
4843 static int __noinline
4844 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4845 {
4846         struct componentname *cnp;
4847         enum vgetstate dvs, tvs;
4848         struct vnode *dvp, *tvp;
4849         seqc_t dvp_seqc;
4850         int error;
4851
4852         cnp = fpl->cnp;
4853         dvp = fpl->dvp;
4854         dvp_seqc = fpl->dvp_seqc;
4855         tvp = fpl->tvp;
4856
4857         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4858
4859         /*
4860          * This is less efficient than it can be for simplicity.
4861          */
4862         dvs = vget_prep_smr(dvp);
4863         if (__predict_false(dvs == VGET_NONE)) {
4864                 return (cache_fpl_aborted(fpl));
4865         }
4866         tvs = vget_prep_smr(tvp);
4867         if (__predict_false(tvs == VGET_NONE)) {
4868                 cache_fpl_smr_exit(fpl);
4869                 vget_abort(dvp, dvs);
4870                 return (cache_fpl_aborted(fpl));
4871         }
4872
4873         cache_fpl_smr_exit(fpl);
4874
4875         if ((cnp->cn_flags & LOCKPARENT) != 0) {
4876                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4877                 if (__predict_false(error != 0)) {
4878                         vget_abort(tvp, tvs);
4879                         return (cache_fpl_aborted(fpl));
4880                 }
4881         } else {
4882                 vget_finish_ref(dvp, dvs);
4883         }
4884
4885         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4886                 vget_abort(tvp, tvs);
4887                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4888                         vput(dvp);
4889                 else
4890                         vrele(dvp);
4891                 return (cache_fpl_aborted(fpl));
4892         }
4893
4894         error = cache_fplookup_final_child(fpl, tvs);
4895         if (__predict_false(error != 0)) {
4896                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
4897                     fpl->status == CACHE_FPL_STATUS_DESTROYED);
4898                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4899                         vput(dvp);
4900                 else
4901                         vrele(dvp);
4902                 return (error);
4903         }
4904
4905         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4906         return (0);
4907 }
4908
4909 static int
4910 cache_fplookup_final(struct cache_fpl *fpl)
4911 {
4912         struct componentname *cnp;
4913         enum vgetstate tvs;
4914         struct vnode *dvp, *tvp;
4915         seqc_t dvp_seqc;
4916
4917         cnp = fpl->cnp;
4918         dvp = fpl->dvp;
4919         dvp_seqc = fpl->dvp_seqc;
4920         tvp = fpl->tvp;
4921
4922         MPASS(*(cnp->cn_nameptr) != '/');
4923
4924         if (cnp->cn_nameiop != LOOKUP) {
4925                 return (cache_fplookup_final_modifying(fpl));
4926         }
4927
4928         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4929                 return (cache_fplookup_final_withparent(fpl));
4930
4931         tvs = vget_prep_smr(tvp);
4932         if (__predict_false(tvs == VGET_NONE)) {
4933                 return (cache_fpl_partial(fpl));
4934         }
4935
4936         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4937                 cache_fpl_smr_exit(fpl);
4938                 vget_abort(tvp, tvs);
4939                 return (cache_fpl_aborted(fpl));
4940         }
4941
4942         cache_fpl_smr_exit(fpl);
4943         return (cache_fplookup_final_child(fpl, tvs));
4944 }
4945
4946 /*
4947  * Comment from locked lookup:
4948  * Check for degenerate name (e.g. / or "") which is a way of talking about a
4949  * directory, e.g. like "/." or ".".
4950  */
4951 static int __noinline
4952 cache_fplookup_degenerate(struct cache_fpl *fpl)
4953 {
4954         struct componentname *cnp;
4955         struct vnode *dvp;
4956         enum vgetstate dvs;
4957         int error, lkflags;
4958 #ifdef INVARIANTS
4959         char *cp;
4960 #endif
4961
4962         fpl->tvp = fpl->dvp;
4963         fpl->tvp_seqc = fpl->dvp_seqc;
4964
4965         cnp = fpl->cnp;
4966         dvp = fpl->dvp;
4967
4968 #ifdef INVARIANTS
4969         for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) {
4970                 KASSERT(*cp == '/',
4971                     ("%s: encountered non-slash; string [%s]\n", __func__,
4972                     cnp->cn_pnbuf));
4973         }
4974 #endif
4975
4976         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4977                 cache_fpl_smr_exit(fpl);
4978                 return (cache_fpl_handled_error(fpl, EISDIR));
4979         }
4980
4981         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4982                 return (cache_fplookup_final_withparent(fpl));
4983         }
4984
4985         dvs = vget_prep_smr(dvp);
4986         cache_fpl_smr_exit(fpl);
4987         if (__predict_false(dvs == VGET_NONE)) {
4988                 return (cache_fpl_aborted(fpl));
4989         }
4990
4991         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4992                 lkflags = LK_SHARED;
4993                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4994                         lkflags = LK_EXCLUSIVE;
4995                 error = vget_finish(dvp, lkflags, dvs);
4996                 if (__predict_false(error != 0)) {
4997                         return (cache_fpl_aborted(fpl));
4998                 }
4999         } else {
5000                 vget_finish_ref(dvp, dvs);
5001         }
5002         return (cache_fpl_handled(fpl));
5003 }
5004
5005 static int __noinline
5006 cache_fplookup_emptypath(struct cache_fpl *fpl)
5007 {
5008         struct nameidata *ndp;
5009         struct componentname *cnp;
5010         enum vgetstate tvs;
5011         struct vnode *tvp;
5012         int error, lkflags;
5013
5014         fpl->tvp = fpl->dvp;
5015         fpl->tvp_seqc = fpl->dvp_seqc;
5016
5017         ndp = fpl->ndp;
5018         cnp = fpl->cnp;
5019         tvp = fpl->tvp;
5020
5021         MPASS(*cnp->cn_pnbuf == '\0');
5022
5023         if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
5024                 cache_fpl_smr_exit(fpl);
5025                 return (cache_fpl_handled_error(fpl, ENOENT));
5026         }
5027
5028         MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
5029
5030         tvs = vget_prep_smr(tvp);
5031         cache_fpl_smr_exit(fpl);
5032         if (__predict_false(tvs == VGET_NONE)) {
5033                 return (cache_fpl_aborted(fpl));
5034         }
5035
5036         if ((cnp->cn_flags & LOCKLEAF) != 0) {
5037                 lkflags = LK_SHARED;
5038                 if ((cnp->cn_flags & LOCKSHARED) == 0)
5039                         lkflags = LK_EXCLUSIVE;
5040                 error = vget_finish(tvp, lkflags, tvs);
5041                 if (__predict_false(error != 0)) {
5042                         return (cache_fpl_aborted(fpl));
5043                 }
5044         } else {
5045                 vget_finish_ref(tvp, tvs);
5046         }
5047
5048         ndp->ni_resflags |= NIRES_EMPTYPATH;
5049         return (cache_fpl_handled(fpl));
5050 }
5051
/*
 * Handle a path component which could not be resolved from the cache
 * (NOTE(review): inferred from the name and the call to VOP_LOOKUP below,
 * the caller is not visible here).
 *
 * Special cases are dispatched first: overlong names, a leading slash, an
 * empty path, an empty component and operations other than LOOKUP.  What
 * remains is a plain LOOKUP.  If the component is the last one and there is
 * no trailing slash, the parent is referenced, locked shared and the
 * filesystem is asked with VOP_LOOKUP; otherwise the slow path takes over.
 *
 * On success fpl->tvp is the vnode returned by the filesystem (NULL if it
 * returned EJUSTRETURN); dvp is kept referenced only with WANTPARENT or
 * LOCKPARENT and kept locked only with LOCKPARENT, tvp is kept locked only
 * with LOCKLEAF.
 */
static int __noinline
cache_fplookup_noentry(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        enum vgetstate dvs;
        struct vnode *dvp, *tvp;
        seqc_t dvp_seqc;
        int error;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        MPASS((cnp->cn_flags & MAKEENTRY) == 0);
        MPASS((cnp->cn_flags & ISDOTDOT) == 0);
        if (cnp->cn_nameiop == LOOKUP)
                MPASS((cnp->cn_flags & NOCACHE) == 0);
        MPASS(!cache_fpl_isdotdot(cnp));

        /*
         * Hack: delayed name len checking.
         */
        if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
        }

        /* Extra slashes in front of the component. */
        if (cnp->cn_nameptr[0] == '/') {
                return (cache_fplookup_skip_slashes(fpl));
        }

        /* The entire path is empty. */
        if (cnp->cn_pnbuf[0] == '\0') {
                return (cache_fplookup_emptypath(fpl));
        }

        /*
         * The component is empty, but the path is not: either nothing has
         * been resolved yet (degenerate path) or the path ends in a slash.
         */
        if (cnp->cn_nameptr[0] == '\0') {
                if (fpl->tvp == NULL) {
                        return (cache_fplookup_degenerate(fpl));
                }
                return (cache_fplookup_trailingslash(fpl));
        }

        if (cnp->cn_nameiop != LOOKUP) {
                fpl->tvp = NULL;
                return (cache_fplookup_modifying(fpl));
        }

        /*
         * Only try to fill in the component if it is the last one,
         * otherwise not only there may be several to handle but the
         * walk may be complicated.
         */
        if (!cache_fpl_islastcn(ndp)) {
                return (cache_fpl_partial(fpl));
        }

        /*
         * Regular lookup nullifies the slash, which we don't do here.
         * Don't take chances with filesystem routines seeing it for
         * the last entry.
         */
        if (cache_fpl_istrailingslash(fpl)) {
                return (cache_fpl_partial(fpl));
        }

        /*
         * Secure access to dvp; check cache_fplookup_partial_setup for
         * reasoning.
         */
        dvs = vget_prep_smr(dvp);
        cache_fpl_smr_exit(fpl);
        if (__predict_false(dvs == VGET_NONE)) {
                return (cache_fpl_aborted(fpl));
        }

        vget_finish_ref(dvp, dvs);
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        error = vn_lock(dvp, LK_SHARED);
        if (__predict_false(error != 0)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        /*
         * From here on dvp is referenced and locked: every exit path has
         * to vput it, or unlock it if only the reference is to be kept.
         */
        tvp = NULL;
        /*
         * TODO: provide variants which don't require locking either vnode.
         */
        cnp->cn_flags |= ISLASTCN | MAKEENTRY;
        /* Lock type requested for the target: shared only with LOCKSHARED. */
        cnp->cn_lkflags = LK_SHARED;
        if ((cnp->cn_flags & LOCKSHARED) == 0) {
                cnp->cn_lkflags = LK_EXCLUSIVE;
        }
        error = VOP_LOOKUP(dvp, &tvp, cnp);
        switch (error) {
        case EJUSTRETURN:
        case 0:
                break;
        case ENOTDIR:
        case ENOENT:
                vput(dvp);
                return (cache_fpl_handled_error(fpl, error));
        default:
                vput(dvp);
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp = tvp;

        /* No target vnode: only legal with EJUSTRETURN, asserted below. */
        if (tvp == NULL) {
                MPASS(error == EJUSTRETURN);
                if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
                        vput(dvp);
                } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
                        VOP_UNLOCK(dvp);
                }
                return (cache_fpl_handled(fpl));
        }

        /* Following a symlink is left to the regular lookup. */
        if (tvp->v_type == VLNK) {
                if ((cnp->cn_flags & FOLLOW) != 0) {
                        vput(dvp);
                        vput(tvp);
                        return (cache_fpl_aborted(fpl));
                }
        }

        /* So is the case where cache_fplookup_is_mp reports a mount point. */
        if (__predict_false(cache_fplookup_is_mp(fpl))) {
                vput(dvp);
                vput(tvp);
                return (cache_fpl_aborted(fpl));
        }

        /* Success: drop the locks and references the caller did not ask for. */
        if ((cnp->cn_flags & LOCKLEAF) == 0) {
                VOP_UNLOCK(tvp);
        }

        if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
                vput(dvp);
        } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
                VOP_UNLOCK(dvp);
        }
        return (cache_fpl_handled(fpl));
}
5201
5202 static int __noinline
5203 cache_fplookup_dot(struct cache_fpl *fpl)
5204 {
5205         int error;
5206
5207         MPASS(!seqc_in_modify(fpl->dvp_seqc));
5208
5209         if (__predict_false(fpl->dvp->v_type != VDIR)) {
5210                 cache_fpl_smr_exit(fpl);
5211                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5212         }
5213
5214         /*
5215          * Just re-assign the value. seqc will be checked later for the first
5216          * non-dot path component in line and/or before deciding to return the
5217          * vnode.
5218          */
5219         fpl->tvp = fpl->dvp;
5220         fpl->tvp_seqc = fpl->dvp_seqc;
5221
5222         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
5223
5224         error = 0;
5225         if (cache_fplookup_is_mp(fpl)) {
5226                 error = cache_fplookup_cross_mount(fpl);
5227         }
5228         return (error);
5229 }
5230
5231 static int __noinline
5232 cache_fplookup_dotdot(struct cache_fpl *fpl)
5233 {
5234         struct nameidata *ndp;
5235         struct componentname *cnp;
5236         struct namecache *ncp;
5237         struct vnode *dvp;
5238         struct prison *pr;
5239         u_char nc_flag;
5240
5241         ndp = fpl->ndp;
5242         cnp = fpl->cnp;
5243         dvp = fpl->dvp;
5244
5245         MPASS(cache_fpl_isdotdot(cnp));
5246
5247         /*
5248          * XXX this is racy the same way regular lookup is
5249          */
5250         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
5251             pr = pr->pr_parent)
5252                 if (dvp == pr->pr_root)
5253                         break;
5254
5255         if (dvp == ndp->ni_rootdir ||
5256             dvp == ndp->ni_topdir ||
5257             dvp == rootvnode ||
5258             pr != NULL) {
5259                 fpl->tvp = dvp;
5260                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
5261                 if (seqc_in_modify(fpl->tvp_seqc)) {
5262                         return (cache_fpl_aborted(fpl));
5263                 }
5264                 return (0);
5265         }
5266
5267         if ((dvp->v_vflag & VV_ROOT) != 0) {
5268                 /*
5269                  * TODO
5270                  * The opposite of climb mount is needed here.
5271                  */
5272                 return (cache_fpl_partial(fpl));
5273         }
5274
5275         if (__predict_false(dvp->v_type != VDIR)) {
5276                 cache_fpl_smr_exit(fpl);
5277                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5278         }
5279
5280         ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
5281         if (ncp == NULL) {
5282                 return (cache_fpl_aborted(fpl));
5283         }
5284
5285         nc_flag = atomic_load_char(&ncp->nc_flag);
5286         if ((nc_flag & NCF_ISDOTDOT) != 0) {
5287                 if ((nc_flag & NCF_NEGATIVE) != 0)
5288                         return (cache_fpl_aborted(fpl));
5289                 fpl->tvp = ncp->nc_vp;
5290         } else {
5291                 fpl->tvp = ncp->nc_dvp;
5292         }
5293
5294         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
5295         if (seqc_in_modify(fpl->tvp_seqc)) {
5296                 return (cache_fpl_partial(fpl));
5297         }
5298
5299         /*
5300          * Acquire fence provided by vn_seqc_read_any above.
5301          */
5302         if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
5303                 return (cache_fpl_aborted(fpl));
5304         }
5305
5306         if (!cache_ncp_canuse(ncp)) {
5307                 return (cache_fpl_aborted(fpl));
5308         }
5309
5310         return (0);
5311 }
5312
5313 static int __noinline
5314 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
5315 {
5316         u_char nc_flag __diagused;
5317         bool neg_promote;
5318
5319 #ifdef INVARIANTS
5320         nc_flag = atomic_load_char(&ncp->nc_flag);
5321         MPASS((nc_flag & NCF_NEGATIVE) != 0);
5322 #endif
5323         /*
5324          * If they want to create an entry we need to replace this one.
5325          */
5326         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
5327                 fpl->tvp = NULL;
5328                 return (cache_fplookup_modifying(fpl));
5329         }
5330         neg_promote = cache_neg_hit_prep(ncp);
5331         if (!cache_fpl_neg_ncp_canuse(ncp)) {
5332                 cache_neg_hit_abort(ncp);
5333                 return (cache_fpl_partial(fpl));
5334         }
5335         if (neg_promote) {
5336                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
5337         }
5338         cache_neg_hit_finish(ncp);
5339         cache_fpl_smr_exit(fpl);
5340         return (cache_fpl_handled_error(fpl, ENOENT));
5341 }
5342
5343 /*
5344  * Resolve a symlink. Called by filesystem-specific routines.
5345  *
5346  * Code flow is:
5347  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
5348  */
5349 int
5350 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
5351 {
5352         struct nameidata *ndp;
5353         struct componentname *cnp;
5354         size_t adjust;
5355
5356         ndp = fpl->ndp;
5357         cnp = fpl->cnp;
5358
5359         if (__predict_false(len == 0)) {
5360                 return (ENOENT);
5361         }
5362
5363         if (__predict_false(len > MAXPATHLEN - 2)) {
5364                 if (cache_fpl_istrailingslash(fpl)) {
5365                         return (EAGAIN);
5366                 }
5367         }
5368
5369         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
5370 #ifdef INVARIANTS
5371         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
5372                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5373                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5374                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5375         }
5376 #endif
5377
5378         if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
5379                 return (ENAMETOOLONG);
5380         }
5381
5382         if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
5383                 return (ELOOP);
5384         }
5385
5386         adjust = len;
5387         if (ndp->ni_pathlen > 1) {
5388                 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
5389         } else {
5390                 if (cache_fpl_istrailingslash(fpl)) {
5391                         adjust = len + 1;
5392                         cnp->cn_pnbuf[len] = '/';
5393                         cnp->cn_pnbuf[len + 1] = '\0';
5394                 } else {
5395                         cnp->cn_pnbuf[len] = '\0';
5396                 }
5397         }
5398         bcopy(string, cnp->cn_pnbuf, len);
5399
5400         ndp->ni_pathlen += adjust;
5401         cache_fpl_pathlen_add(fpl, adjust);
5402         cnp->cn_nameptr = cnp->cn_pnbuf;
5403         fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5404         fpl->tvp = NULL;
5405         return (0);
5406 }
5407
5408 static int __noinline
5409 cache_fplookup_symlink(struct cache_fpl *fpl)
5410 {
5411         struct mount *mp;
5412         struct nameidata *ndp;
5413         struct componentname *cnp;
5414         struct vnode *dvp, *tvp;
5415         int error;
5416
5417         ndp = fpl->ndp;
5418         cnp = fpl->cnp;
5419         dvp = fpl->dvp;
5420         tvp = fpl->tvp;
5421
5422         if (cache_fpl_islastcn(ndp)) {
5423                 if ((cnp->cn_flags & FOLLOW) == 0) {
5424                         return (cache_fplookup_final(fpl));
5425                 }
5426         }
5427
5428         mp = atomic_load_ptr(&dvp->v_mount);
5429         if (__predict_false(mp == NULL)) {
5430                 return (cache_fpl_aborted(fpl));
5431         }
5432
5433         /*
5434          * Note this check races against setting the flag just like regular
5435          * lookup.
5436          */
5437         if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
5438                 cache_fpl_smr_exit(fpl);
5439                 return (cache_fpl_handled_error(fpl, EACCES));
5440         }
5441
5442         error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
5443         if (__predict_false(error != 0)) {
5444                 switch (error) {
5445                 case EAGAIN:
5446                         return (cache_fpl_partial(fpl));
5447                 case ENOENT:
5448                 case ENAMETOOLONG:
5449                 case ELOOP:
5450                         cache_fpl_smr_exit(fpl);
5451                         return (cache_fpl_handled_error(fpl, error));
5452                 default:
5453                         return (cache_fpl_aborted(fpl));
5454                 }
5455         }
5456
5457         if (*(cnp->cn_nameptr) == '/') {
5458                 fpl->dvp = cache_fpl_handle_root(fpl);
5459                 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
5460                 if (seqc_in_modify(fpl->dvp_seqc)) {
5461                         return (cache_fpl_aborted(fpl));
5462                 }
5463                 /*
5464                  * The main loop assumes that ->dvp points to a vnode belonging
5465                  * to a filesystem which can do lockless lookup, but the absolute
5466                  * symlink can be wandering off to one which does not.
5467                  */
5468                 mp = atomic_load_ptr(&fpl->dvp->v_mount);
5469                 if (__predict_false(mp == NULL)) {
5470                         return (cache_fpl_aborted(fpl));
5471                 }
5472                 if (!cache_fplookup_mp_supported(mp)) {
5473                         cache_fpl_checkpoint(fpl);
5474                         return (cache_fpl_partial(fpl));
5475                 }
5476         }
5477         return (0);
5478 }
5479
5480 static int
5481 cache_fplookup_next(struct cache_fpl *fpl)
5482 {
5483         struct componentname *cnp;
5484         struct namecache *ncp;
5485         struct vnode *dvp, *tvp;
5486         u_char nc_flag;
5487         uint32_t hash;
5488         int error;
5489
5490         cnp = fpl->cnp;
5491         dvp = fpl->dvp;
5492         hash = fpl->hash;
5493
5494         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
5495                 if (cnp->cn_namelen == 1) {
5496                         return (cache_fplookup_dot(fpl));
5497                 }
5498                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
5499                         return (cache_fplookup_dotdot(fpl));
5500                 }
5501         }
5502
5503         MPASS(!cache_fpl_isdotdot(cnp));
5504
5505         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
5506                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
5507                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
5508                         break;
5509         }
5510
5511         if (__predict_false(ncp == NULL)) {
5512                 return (cache_fplookup_noentry(fpl));
5513         }
5514
5515         tvp = atomic_load_ptr(&ncp->nc_vp);
5516         nc_flag = atomic_load_char(&ncp->nc_flag);
5517         if ((nc_flag & NCF_NEGATIVE) != 0) {
5518                 return (cache_fplookup_neg(fpl, ncp, hash));
5519         }
5520
5521         if (!cache_ncp_canuse(ncp)) {
5522                 return (cache_fpl_partial(fpl));
5523         }
5524
5525         fpl->tvp = tvp;
5526         fpl->tvp_seqc = vn_seqc_read_any(tvp);
5527         if (seqc_in_modify(fpl->tvp_seqc)) {
5528                 return (cache_fpl_partial(fpl));
5529         }
5530
5531         counter_u64_add(numposhits, 1);
5532         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
5533
5534         error = 0;
5535         if (cache_fplookup_is_mp(fpl)) {
5536                 error = cache_fplookup_cross_mount(fpl);
5537         }
5538         return (error);
5539 }
5540
5541 static bool
5542 cache_fplookup_mp_supported(struct mount *mp)
5543 {
5544
5545         MPASS(mp != NULL);
5546         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
5547                 return (false);
5548         return (true);
5549 }
5550
5551 /*
5552  * Walk up the mount stack (if any).
5553  *
5554  * Correctness is provided in the following ways:
5555  * - all vnodes are protected from freeing with SMR
5556  * - struct mount objects are type stable making them always safe to access
5557  * - stability of the particular mount is provided by busying it
5558  * - relationship between the vnode which is mounted on and the mount is
5559  *   verified with the vnode sequence counter after busying
5560  * - association between root vnode of the mount and the mount is protected
5561  *   by busy
5562  *
5563  * From that point on we can read the sequence counter of the root vnode
5564  * and get the next mount on the stack (if any) using the same protection.
5565  *
5566  * By the end of successful walk we are guaranteed the reached state was
5567  * indeed present at least at some point which matches the regular lookup.
5568  */
5569 static int __noinline
5570 cache_fplookup_climb_mount(struct cache_fpl *fpl)
5571 {
5572         struct mount *mp, *prev_mp;
5573         struct mount_pcpu *mpcpu, *prev_mpcpu;
5574         struct vnode *vp;
5575         seqc_t vp_seqc;
5576
5577         vp = fpl->tvp;
5578         vp_seqc = fpl->tvp_seqc;
5579
5580         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5581         mp = atomic_load_ptr(&vp->v_mountedhere);
5582         if (__predict_false(mp == NULL)) {
5583                 return (0);
5584         }
5585
5586         prev_mp = NULL;
5587         for (;;) {
5588                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5589                         if (prev_mp != NULL)
5590                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5591                         return (cache_fpl_partial(fpl));
5592                 }
5593                 if (prev_mp != NULL)
5594                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5595                 if (!vn_seqc_consistent(vp, vp_seqc)) {
5596                         vfs_op_thread_exit_crit(mp, mpcpu);
5597                         return (cache_fpl_partial(fpl));
5598                 }
5599                 if (!cache_fplookup_mp_supported(mp)) {
5600                         vfs_op_thread_exit_crit(mp, mpcpu);
5601                         return (cache_fpl_partial(fpl));
5602                 }
5603                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5604                 if (vp == NULL) {
5605                         vfs_op_thread_exit_crit(mp, mpcpu);
5606                         return (cache_fpl_partial(fpl));
5607                 }
5608                 vp_seqc = vn_seqc_read_any(vp);
5609                 if (seqc_in_modify(vp_seqc)) {
5610                         vfs_op_thread_exit_crit(mp, mpcpu);
5611                         return (cache_fpl_partial(fpl));
5612                 }
5613                 prev_mp = mp;
5614                 prev_mpcpu = mpcpu;
5615                 mp = atomic_load_ptr(&vp->v_mountedhere);
5616                 if (mp == NULL)
5617                         break;
5618         }
5619
5620         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5621         fpl->tvp = vp;
5622         fpl->tvp_seqc = vp_seqc;
5623         return (0);
5624 }
5625
5626 static int __noinline
5627 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5628 {
5629         struct mount *mp;
5630         struct mount_pcpu *mpcpu;
5631         struct vnode *vp;
5632         seqc_t vp_seqc;
5633
5634         vp = fpl->tvp;
5635         vp_seqc = fpl->tvp_seqc;
5636
5637         VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
5638         mp = atomic_load_ptr(&vp->v_mountedhere);
5639         if (__predict_false(mp == NULL)) {
5640                 return (0);
5641         }
5642
5643         if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5644                 return (cache_fpl_partial(fpl));
5645         }
5646         if (!vn_seqc_consistent(vp, vp_seqc)) {
5647                 vfs_op_thread_exit_crit(mp, mpcpu);
5648                 return (cache_fpl_partial(fpl));
5649         }
5650         if (!cache_fplookup_mp_supported(mp)) {
5651                 vfs_op_thread_exit_crit(mp, mpcpu);
5652                 return (cache_fpl_partial(fpl));
5653         }
5654         vp = atomic_load_ptr(&mp->mnt_rootvnode);
5655         if (__predict_false(vp == NULL)) {
5656                 vfs_op_thread_exit_crit(mp, mpcpu);
5657                 return (cache_fpl_partial(fpl));
5658         }
5659         vp_seqc = vn_seqc_read_any(vp);
5660         vfs_op_thread_exit_crit(mp, mpcpu);
5661         if (seqc_in_modify(vp_seqc)) {
5662                 return (cache_fpl_partial(fpl));
5663         }
5664         mp = atomic_load_ptr(&vp->v_mountedhere);
5665         if (__predict_false(mp != NULL)) {
5666                 /*
5667                  * There are possibly more mount points on top.
5668                  * Normally this does not happen so for simplicity just start
5669                  * over.
5670                  */
5671                 return (cache_fplookup_climb_mount(fpl));
5672         }
5673
5674         fpl->tvp = vp;
5675         fpl->tvp_seqc = vp_seqc;
5676         return (0);
5677 }
5678
5679 /*
5680  * Check if a vnode is mounted on.
5681  */
5682 static bool
5683 cache_fplookup_is_mp(struct cache_fpl *fpl)
5684 {
5685         struct vnode *vp;
5686
5687         vp = fpl->tvp;
5688         return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5689 }
5690
5691 /*
5692  * Parse the path.
5693  *
5694  * The code was originally copy-pasted from regular lookup and despite
5695  * clean ups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
5697  * nameidata state has to be compatible with the original.
5698  */
5699
5700 /*
5701  * Debug ni_pathlen tracking.
5702  */
5703 #ifdef INVARIANTS
5704 static void
5705 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5706 {
5707
5708         fpl->debug.ni_pathlen += n;
5709         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5710             ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5711 }
5712
5713 static void
5714 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5715 {
5716
5717         fpl->debug.ni_pathlen -= n;
5718         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5719             ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5720 }
5721
5722 static void
5723 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5724 {
5725
5726         cache_fpl_pathlen_add(fpl, 1);
5727 }
5728
5729 static void
5730 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5731 {
5732
5733         cache_fpl_pathlen_sub(fpl, 1);
5734 }
5735 #else
5736 static void
5737 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5738 {
5739 }
5740
5741 static void
5742 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5743 {
5744 }
5745
5746 static void
5747 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5748 {
5749 }
5750
5751 static void
5752 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5753 {
5754 }
5755 #endif
5756
5757 static void
5758 cache_fplookup_parse(struct cache_fpl *fpl)
5759 {
5760         struct nameidata *ndp;
5761         struct componentname *cnp;
5762         struct vnode *dvp;
5763         char *cp;
5764         uint32_t hash;
5765
5766         ndp = fpl->ndp;
5767         cnp = fpl->cnp;
5768         dvp = fpl->dvp;
5769
5770         /*
5771          * Find the end of this path component, it is either / or nul.
5772          *
5773          * Store / as a temporary sentinel so that we only have one character
5774          * to test for. Pathnames tend to be short so this should not be
5775          * resulting in cache misses.
5776          *
5777          * TODO: fix this to be word-sized.
5778          */
5779         MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
5780         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5781             ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5782             __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5783             fpl->nulchar, cnp->cn_pnbuf));
5784         KASSERT(*fpl->nulchar == '\0',
5785             ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5786             cnp->cn_pnbuf));
5787         hash = cache_get_hash_iter_start(dvp);
5788         *fpl->nulchar = '/';
5789         for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5790                 KASSERT(*cp != '\0',
5791                     ("%s: encountered unexpected nul; string [%s]\n", __func__,
5792                     cnp->cn_nameptr));
5793                 hash = cache_get_hash_iter(*cp, hash);
5794                 continue;
5795         }
5796         *fpl->nulchar = '\0';
5797         fpl->hash = cache_get_hash_iter_finish(hash);
5798
5799         cnp->cn_namelen = cp - cnp->cn_nameptr;
5800         cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5801
5802 #ifdef INVARIANTS
5803         /*
5804          * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
5805          * we are going to fail this lookup with ENAMETOOLONG (see below).
5806          */
5807         if (cnp->cn_namelen <= NAME_MAX) {
5808                 if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
5809                         panic("%s: mismatched hash for [%s] len %ld", __func__,
5810                             cnp->cn_nameptr, cnp->cn_namelen);
5811                 }
5812         }
5813 #endif
5814
5815         /*
5816          * Hack: we have to check if the found path component's length exceeds
5817          * NAME_MAX. However, the condition is very rarely true and check can
5818          * be elided in the common case -- if an entry was found in the cache,
5819          * then it could not have been too long to begin with.
5820          */
5821         ndp->ni_next = cp;
5822 }
5823
5824 static void
5825 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5826 {
5827         struct nameidata *ndp;
5828         struct componentname *cnp;
5829
5830         ndp = fpl->ndp;
5831         cnp = fpl->cnp;
5832
5833         cnp->cn_nameptr = ndp->ni_next;
5834         KASSERT(*(cnp->cn_nameptr) == '/',
5835             ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5836             cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
5837         cnp->cn_nameptr++;
5838         cache_fpl_pathlen_dec(fpl);
5839 }
5840
5841 /*
5842  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5843  *
5844  * Lockless lookup tries to elide checking for spurious slashes and should they
5845  * be present is guaranteed to fail to find an entry. In this case the caller
5846  * must check if the name starts with a slash and call this routine.  It is
5847  * going to fast forward across the spurious slashes and set the state up for
5848  * retry.
5849  */
5850 static int __noinline
5851 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5852 {
5853         struct nameidata *ndp;
5854         struct componentname *cnp;
5855
5856         ndp = fpl->ndp;
5857         cnp = fpl->cnp;
5858
5859         MPASS(*(cnp->cn_nameptr) == '/');
5860         do {
5861                 cnp->cn_nameptr++;
5862                 cache_fpl_pathlen_dec(fpl);
5863         } while (*(cnp->cn_nameptr) == '/');
5864
5865         /*
5866          * Go back to one slash so that cache_fplookup_parse_advance has
5867          * something to skip.
5868          */
5869         cnp->cn_nameptr--;
5870         cache_fpl_pathlen_inc(fpl);
5871
5872         /*
5873          * cache_fplookup_parse_advance starts from ndp->ni_next
5874          */
5875         ndp->ni_next = cnp->cn_nameptr;
5876
5877         /*
5878          * See cache_fplookup_dot.
5879          */
5880         fpl->tvp = fpl->dvp;
5881         fpl->tvp_seqc = fpl->dvp_seqc;
5882
5883         return (0);
5884 }
5885
5886 /*
5887  * Handle trailing slashes (e.g., "foo/").
5888  *
5889  * If a trailing slash is found the terminal vnode must be a directory.
5890  * Regular lookup shortens the path by nulifying the first trailing slash and
5891  * sets the TRAILINGSLASH flag to denote this took place. There are several
5892  * checks on it performed later.
5893  *
5894  * Similarly to spurious slashes, lockless lookup handles this in a speculative
5895  * manner relying on an invariant that a non-directory vnode will get a miss.
5896  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
5897  *
5898  * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
5899  * and denotes this is the last path component, which avoids looping back.
5900  *
5901  * Only plain lookups are supported for now to restrict corner cases to handle.
5902  */
5903 static int __noinline
5904 cache_fplookup_trailingslash(struct cache_fpl *fpl)
5905 {
5906 #ifdef INVARIANTS
5907         size_t ni_pathlen;
5908 #endif
5909         struct nameidata *ndp;
5910         struct componentname *cnp;
5911         struct namecache *ncp;
5912         struct vnode *tvp;
5913         char *cn_nameptr_orig, *cn_nameptr_slash;
5914         seqc_t tvp_seqc;
5915         u_char nc_flag;
5916
5917         ndp = fpl->ndp;
5918         cnp = fpl->cnp;
5919         tvp = fpl->tvp;
5920         tvp_seqc = fpl->tvp_seqc;
5921
5922         MPASS(fpl->dvp == fpl->tvp);
5923         KASSERT(cache_fpl_istrailingslash(fpl),
5924             ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
5925             cnp->cn_pnbuf));
5926         KASSERT(cnp->cn_nameptr[0] == '\0',
5927             ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
5928             cnp->cn_pnbuf));
5929         KASSERT(cnp->cn_namelen == 0,
5930             ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
5931             cnp->cn_pnbuf));
5932         MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
5933
5934         if (cnp->cn_nameiop != LOOKUP) {
5935                 return (cache_fpl_aborted(fpl));
5936         }
5937
5938         if (__predict_false(tvp->v_type != VDIR)) {
5939                 if (!vn_seqc_consistent(tvp, tvp_seqc)) {
5940                         return (cache_fpl_aborted(fpl));
5941                 }
5942                 cache_fpl_smr_exit(fpl);
5943                 return (cache_fpl_handled_error(fpl, ENOTDIR));
5944         }
5945
5946         /*
5947          * Denote the last component.
5948          */
5949         ndp->ni_next = &cnp->cn_nameptr[0];
5950         MPASS(cache_fpl_islastcn(ndp));
5951
5952         /*
5953          * Unwind trailing slashes.
5954          */
5955         cn_nameptr_orig = cnp->cn_nameptr;
5956         while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
5957                 cnp->cn_nameptr--;
5958                 if (cnp->cn_nameptr[0] != '/') {
5959                         break;
5960                 }
5961         }
5962
5963         /*
5964          * Unwind to the beginning of the path component.
5965          *
5966          * Note the path may or may not have started with a slash.
5967          */
5968         cn_nameptr_slash = cnp->cn_nameptr;
5969         while (cnp->cn_nameptr > cnp->cn_pnbuf) {
5970                 cnp->cn_nameptr--;
5971                 if (cnp->cn_nameptr[0] == '/') {
5972                         break;
5973                 }
5974         }
5975         if (cnp->cn_nameptr[0] == '/') {
5976                 cnp->cn_nameptr++;
5977         }
5978
5979         cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
5980         cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
5981         cache_fpl_checkpoint(fpl);
5982
5983 #ifdef INVARIANTS
5984         ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
5985         if (ni_pathlen != fpl->debug.ni_pathlen) {
5986                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
5987                     __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
5988                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
5989         }
5990 #endif
5991
5992         /*
5993          * If this was a "./" lookup the parent directory is already correct.
5994          */
5995         if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
5996                 return (0);
5997         }
5998
5999         /*
6000          * Otherwise we need to look it up.
6001          */
6002         tvp = fpl->tvp;
6003         ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
6004         if (__predict_false(ncp == NULL)) {
6005                 return (cache_fpl_aborted(fpl));
6006         }
6007         nc_flag = atomic_load_char(&ncp->nc_flag);
6008         if ((nc_flag & NCF_ISDOTDOT) != 0) {
6009                 return (cache_fpl_aborted(fpl));
6010         }
6011         fpl->dvp = ncp->nc_dvp;
6012         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
6013         if (seqc_in_modify(fpl->dvp_seqc)) {
6014                 return (cache_fpl_aborted(fpl));
6015         }
6016         return (0);
6017 }
6018
6019 /*
6020  * See the API contract for VOP_FPLOOKUP_VEXEC.
6021  */
6022 static int __noinline
6023 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
6024 {
6025         struct componentname *cnp;
6026         struct vnode *dvp;
6027         seqc_t dvp_seqc;
6028
6029         cnp = fpl->cnp;
6030         dvp = fpl->dvp;
6031         dvp_seqc = fpl->dvp_seqc;
6032
6033         /*
6034          * Hack: delayed empty path checking.
6035          */
6036         if (cnp->cn_pnbuf[0] == '\0') {
6037                 return (cache_fplookup_emptypath(fpl));
6038         }
6039
6040         /*
6041          * TODO: Due to ignoring trailing slashes lookup will perform a
6042          * permission check on the last dir when it should not be doing it.  It
6043          * may fail, but said failure should be ignored. It is possible to fix
6044          * it up fully without resorting to regular lookup, but for now just
6045          * abort.
6046          */
6047         if (cache_fpl_istrailingslash(fpl)) {
6048                 return (cache_fpl_aborted(fpl));
6049         }
6050
6051         /*
6052          * Hack: delayed degenerate path checking.
6053          */
6054         if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
6055                 return (cache_fplookup_degenerate(fpl));
6056         }
6057
6058         /*
6059          * Hack: delayed name len checking.
6060          */
6061         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
6062                 cache_fpl_smr_exit(fpl);
6063                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
6064         }
6065
6066         /*
6067          * Hack: they may be looking up foo/bar, where foo is not a directory.
6068          * In such a case we need to return ENOTDIR, but we may happen to get
6069          * here with a different error.
6070          */
6071         if (dvp->v_type != VDIR) {
6072                 error = ENOTDIR;
6073         }
6074
6075         /*
6076          * Hack: handle O_SEARCH.
6077          *
6078          * Open Group Base Specifications Issue 7, 2018 edition states:
6079          * <quote>
6080          * If the access mode of the open file description associated with the
6081          * file descriptor is not O_SEARCH, the function shall check whether
6082          * directory searches are permitted using the current permissions of
6083          * the directory underlying the file descriptor. If the access mode is
6084          * O_SEARCH, the function shall not perform the check.
6085          * </quote>
6086          *
6087          * Regular lookup tests for the NOEXECCHECK flag for every path
6088          * component to decide whether to do the permission check. However,
6089          * since most lookups never have the flag (and when they do it is only
6090          * present for the first path component), lockless lookup only acts on
6091          * it if there is a permission problem. Here the flag is represented
6092          * with a boolean so that we don't have to clear it on the way out.
6093          *
6094          * For simplicity this always aborts.
6095          * TODO: check if this is the first lookup and ignore the permission
6096          * problem. Note the flag has to survive fallback (if it happens to be
6097          * performed).
6098          */
6099         if (fpl->fsearch) {
6100                 return (cache_fpl_aborted(fpl));
6101         }
6102
6103         switch (error) {
6104         case EAGAIN:
6105                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6106                         error = cache_fpl_aborted(fpl);
6107                 } else {
6108                         cache_fpl_partial(fpl);
6109                 }
6110                 break;
6111         default:
6112                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
6113                         error = cache_fpl_aborted(fpl);
6114                 } else {
6115                         cache_fpl_smr_exit(fpl);
6116                         cache_fpl_handled_error(fpl, error);
6117                 }
6118                 break;
6119         }
6120         return (error);
6121 }
6122
6123 static int
6124 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
6125 {
6126         struct nameidata *ndp;
6127         struct componentname *cnp;
6128         struct mount *mp;
6129         int error;
6130
6131         ndp = fpl->ndp;
6132         cnp = fpl->cnp;
6133
6134         cache_fpl_checkpoint(fpl);
6135
6136         /*
6137          * The vnode at hand is almost always stable, skip checking for it.
6138          * Worst case this postpones the check towards the end of the iteration
6139          * of the main loop.
6140          */
6141         fpl->dvp = dvp;
6142         fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
6143
6144         mp = atomic_load_ptr(&dvp->v_mount);
6145         if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
6146                 return (cache_fpl_aborted(fpl));
6147         }
6148
6149         MPASS(fpl->tvp == NULL);
6150
6151         for (;;) {
6152                 cache_fplookup_parse(fpl);
6153
6154                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
6155                 if (__predict_false(error != 0)) {
6156                         error = cache_fplookup_failed_vexec(fpl, error);
6157                         break;
6158                 }
6159
6160                 error = cache_fplookup_next(fpl);
6161                 if (__predict_false(cache_fpl_terminated(fpl))) {
6162                         break;
6163                 }
6164
6165                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
6166
6167                 if (fpl->tvp->v_type == VLNK) {
6168                         error = cache_fplookup_symlink(fpl);
6169                         if (cache_fpl_terminated(fpl)) {
6170                                 break;
6171                         }
6172                 } else {
6173                         if (cache_fpl_islastcn(ndp)) {
6174                                 error = cache_fplookup_final(fpl);
6175                                 break;
6176                         }
6177
6178                         if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
6179                                 error = cache_fpl_aborted(fpl);
6180                                 break;
6181                         }
6182
6183                         fpl->dvp = fpl->tvp;
6184                         fpl->dvp_seqc = fpl->tvp_seqc;
6185                         cache_fplookup_parse_advance(fpl);
6186                 }
6187
6188                 cache_fpl_checkpoint(fpl);
6189         }
6190
6191         return (error);
6192 }
6193
6194 /*
6195  * Fast path lookup protected with SMR and sequence counters.
6196  *
6197  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
6198  *
6199  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
6200  * outlined below.
6201  *
6202  * Traditional vnode lookup conceptually looks like this:
6203  *
6204  * vn_lock(current);
6205  * for (;;) {
6206  *      next = find();
6207  *      vn_lock(next);
6208  *      vn_unlock(current);
6209  *      current = next;
6210  *      if (last)
6211  *          break;
6212  * }
6213  * return (current);
6214  *
6215  * Each jump to the next vnode is safe memory-wise and atomic with respect to
6216  * any modifications thanks to holding respective locks.
6217  *
6218  * The same guarantee can be provided with a combination of safe memory
6219  * reclamation and sequence counters instead. If all operations which affect
6220  * the relationship between the current vnode and the one we are looking for
6221  * also modify the counter, we can verify whether all the conditions held as
6222  * we made the jump. This includes things like permissions, mount points etc.
6223  * Counter modification is provided by enclosing relevant places in
6224  * vn_seqc_write_begin()/end() calls.
6225  *
6226  * Thus this translates to:
6227  *
6228  * vfs_smr_enter();
6229  * dvp_seqc = seqc_read_any(dvp);
6230  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
6231  *     abort();
6232  * for (;;) {
6233  *      tvp = find();
6234  *      tvp_seqc = seqc_read_any(tvp);
6235  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
6236  *          abort();
6237  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
6238  *          abort();
6239  *      dvp = tvp; // we know nothing of importance has changed
6240  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
6241  *      if (last)
6242  *          break;
6243  * }
6244  * vget(); // secure the vnode
6245  * if (!seqc_consistent(tvp, tvp_seqc) // final check
6246  *          abort();
6247  * // at this point we know nothing has changed for any parent<->child pair
6248  * // as they were crossed during the lookup, meaning we matched the guarantee
6249  * // of the locked variant
6250  * return (tvp);
6251  *
6252  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
6253  * - they are called while within vfs_smr protection which they must never exit
6254  * - EAGAIN can be returned to denote checking could not be performed, it is
6255  *   always valid to return it
6256  * - if the sequence counter has not changed the result must be valid
6257  * - if the sequence counter has changed both false positives and false negatives
6258  *   are permitted (since the result will be rejected later)
6259  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
6260  *
6261  * Caveats to watch out for:
6262  * - vnodes are passed unlocked and unreferenced with nothing stopping
6263  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
6264  *   to use atomic_load_ptr to fetch it.
6265  * - the aforementioned object can also get freed, meaning absent other means it
6266  *   should be protected with vfs_smr
6267  * - either safely checking permissions as they are modified or guaranteeing
6268  *   their stability is left to the routine
6269  */
6270 int
6271 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
6272     struct pwd **pwdp)
6273 {
6274         struct cache_fpl fpl;
6275         struct pwd *pwd;
6276         struct vnode *dvp;
6277         struct componentname *cnp;
6278         int error;
6279
6280         fpl.status = CACHE_FPL_STATUS_UNSET;
6281         fpl.in_smr = false;
6282         fpl.ndp = ndp;
6283         fpl.cnp = cnp = &ndp->ni_cnd;
6284         MPASS(ndp->ni_lcf == 0);
6285         KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
6286             ("%s: internal flags found in cn_flags %" PRIx64, __func__,
6287             cnp->cn_flags));
6288         MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
6289         MPASS(ndp->ni_resflags == 0);
6290
6291         if (__predict_false(!cache_can_fplookup(&fpl))) {
6292                 *status = fpl.status;
6293                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6294                 return (EOPNOTSUPP);
6295         }
6296
6297         cache_fpl_checkpoint_outer(&fpl);
6298
6299         cache_fpl_smr_enter_initial(&fpl);
6300 #ifdef INVARIANTS
6301         fpl.debug.ni_pathlen = ndp->ni_pathlen;
6302 #endif
6303         fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
6304         fpl.fsearch = false;
6305         fpl.tvp = NULL; /* for degenerate path handling */
6306         fpl.pwd = pwdp;
6307         pwd = pwd_get_smr();
6308         *(fpl.pwd) = pwd;
6309         namei_setup_rootdir(ndp, cnp, pwd);
6310         ndp->ni_topdir = pwd->pwd_jdir;
6311
6312         if (cnp->cn_pnbuf[0] == '/') {
6313                 dvp = cache_fpl_handle_root(&fpl);
6314                 ndp->ni_resflags = NIRES_ABS;
6315         } else {
6316                 if (ndp->ni_dirfd == AT_FDCWD) {
6317                         dvp = pwd->pwd_cdir;
6318                 } else {
6319                         error = cache_fplookup_dirfd(&fpl, &dvp);
6320                         if (__predict_false(error != 0)) {
6321                                 goto out;
6322                         }
6323                 }
6324         }
6325
6326         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
6327         error = cache_fplookup_impl(dvp, &fpl);
6328 out:
6329         cache_fpl_smr_assert_not_entered(&fpl);
6330         cache_fpl_assert_status(&fpl);
6331         *status = fpl.status;
6332         if (SDT_PROBES_ENABLED()) {
6333                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
6334                 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
6335                         SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
6336                             ndp);
6337         }
6338
6339         if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
6340                 MPASS(error != CACHE_FPL_FAILED);
6341                 if (error != 0) {
6342                         cache_fpl_cleanup_cnp(fpl.cnp);
6343                         MPASS(fpl.dvp == NULL);
6344                         MPASS(fpl.tvp == NULL);
6345                 }
6346                 ndp->ni_dvp = fpl.dvp;
6347                 ndp->ni_vp = fpl.tvp;
6348         }
6349         return (error);
6350 }