sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70 #ifdef INVARIANTS
  71 #include <machine/_inttypes.h>
  72 #endif
  73
  74 #include <sys/capsicum.h>
  75
  76 #include <security/audit/audit.h>
  77 #include <security/mac/mac_framework.h>
  78
  79 #ifdef DDB
  80 #include <ddb/ddb.h>
  81 #endif
  82
  83 #include <vm/uma.h>
  84
  85 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  86     "Name cache");    /* vfs.cache: parent of the param, stats and debug nodes declared in this file */
  87
  88 SDT_PROVIDER_DECLARE(vfs);
     /*
      * Static tracing probes of the name cache.  Every definition lists the
      * provider, module, function and probe name, followed by one string per
      * probe argument which spells out the C type of that argument; the digit
      * in the macro name is the argument count.
      */
  89 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  90     "struct vnode *");
  91 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  92     "struct vnode *");
  93 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  94     "char *");
  95 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  96     "const char *");
  97 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  98     "struct namecache *", "int", "int");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
 101     "char *", "struct vnode *");
 102 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 103 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 104     "struct vnode *", "char *");
 105 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 106     "struct vnode *");
 107 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 108     "struct vnode *", "char *");
 109 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 110     "char *");
 111 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 112     "struct componentname *");
 113 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 114     "struct componentname *");
 115 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 116 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");        /* fired by cache_free_batch() with the number of entries freed */
 117 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 118 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 119 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 120     "struct vnode *");
 121 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 122     "char *");
 123 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 124     "char *");
 125
 126 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
     /* The namei probes are only declared here; presumably defined with namei itself -- TODO confirm. */
 127 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 128 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 129
 130 /*
 131  * This structure describes the elements in the cache of recent
 132  * names looked up by namei.
 133  */
     /*
      * NOTE(review): the comment above reads as a description of struct
      * namecache, which follows struct negstate.
      *
      * struct negstate is the per-entry state of a negative entry.  It lives
      * in the n_un union of struct namecache next to the target vnode pointer,
      * which is why the assertion below requires it to be no larger than a
      * pointer.
      */
 134 struct negstate {
 135         u_char neg_flag;        /* NEG_* flags */
 136         u_char neg_hit;         /* presumably a hit counter -- TODO confirm against its users */
 137 };
 138 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 139     "the state must fit in a union with a pointer without growing it");
 140
 141 struct  namecache {
         /*
          * A single name cache entry.
          *
          * n_un holds the target vnode or, for entries flagged NCF_NEGATIVE,
          * the negative entry state; use the nc_vp and nc_neg shorthands.
          * nc_dst is also used to chain entries on a struct cache_freebatch
          * (see cache_free_batch()).
          */
 142         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 143         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 144         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 145         struct  vnode *nc_dvp;          /* vnode of parent of name */
 146         union {
 147                 struct  vnode *nu_vp;   /* vnode the name refers to */
 148                 struct  negstate nu_neg;/* negative entry state */
 149         } n_un;
 150         u_char  nc_flag;                /* flag bits */
 151         u_char  nc_nlen;                /* length of name */
         /*
          * Zero-length trailing array: the name is stored right behind the
          * structure.  The room available is fixed by the zone the entry was
          * allocated from (see the CACHE_ZONE_*_SIZE macros).
          */
 152         char    nc_name[0];             /* segment name + nul */
 153 };
 154
 155 /*
 156  * struct namecache_ts repeats struct namecache layout up to the
 157  * nc_nlen member.
 158  * struct namecache_ts is used in place of struct namecache when time(s) need
 159  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 160  * both a non-dotdot directory name plus dotdot for the directory's
 161  * parent.
 162  *
 163  * See below for alignment requirement.
      *
      * NOTE(review): the first sentence looks stale.  As declared below, the
      * structure embeds a complete struct namecache as its last member (nc_nc)
      * instead of repeating its fields.  Code holding a struct namecache with
      * NCF_TS set gets back to the enclosing structure with __containerof()
      * (see cache_out_ts() and cache_free_uma()).
 164  */
 165 struct  namecache_ts {
 166         struct  timespec nc_time;       /* timespec provided by fs */
 167         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 168         int     nc_ticks;               /* ticks value when entry was added */
 169         int     nc_pad;                 /* presumably explicit padding in front of nc_nc -- TODO confirm */
 170         struct namecache nc_nc;         /* must stay last: ends in the variable-length name */
 171 };
 172
 173 TAILQ_HEAD(cache_freebatch, namecache);        /* head of a batch of entries to free, linked through nc_dst; see cache_free_batch() */
 174
 175 /*
 176  * At least mips n32 performs 64-bit accesses to timespec as found
 177  * in namecache_ts and requires them to be aligned. Since others
 178  * may be in the same spot suffer a little bit and enforce the
 179  * alignment for everyone. Note this is a nop for 64-bit platforms.
      *
      * The value is used as "alignment - 1" by the size assertions further
      * down, which test against (CACHE_ZONE_ALIGNMENT + 1).
 180  */
 181 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 182
 183 /*
 184  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 185  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 186  * smaller and the value was bumped to retain the total size, but it
 187  * was never re-evaluated for suitability. A simple test counting
 188  * lengths during package building shows that the value of 45 covers
 189  * about 86% of all added entries, reaching 99% at 65.
 190  *
 191  * Regardless of the above, use of dedicated zones instead of malloc may be
 192  * inducing additional waste. This may be hard to address as said zones are
 193  * tied to VFS SMR. Even if retaining them, the current split should be
 194  * re-evaluated.
 195  */
 196 #ifdef __LP64__
 197 #define CACHE_PATH_CUTOFF       45
 198 #define CACHE_LARGE_PAD         6
 199 #else
 200 #define CACHE_PATH_CUTOFF       41
 201 #define CACHE_LARGE_PAD         2
 202 #endif
 203
     /*
      * Item sizes of the four entry zones.
      *
      * The "small" sizes leave room for a name of up to CACHE_PATH_CUTOFF
      * bytes plus the terminating nul; the "large" sizes leave room for
      * NAME_MAX bytes plus the nul and CACHE_LARGE_PAD additional bytes.
      * The _TS variants add everything which precedes nc_nc in
      * struct namecache_ts.
      */
 204 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 205 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 206 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 207 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 208
     /*
      * Every size has to be a multiple of (CACHE_ZONE_ALIGNMENT + 1).  The
      * cutoff and pad values above are presumably picked to satisfy this;
      * changing any of them or the structure layout has to keep these
      * assertions passing.
      */
 209 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 210 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 211 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 212 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 213
 214 #define nc_vp           n_un.nu_vp      /* shorthand for the target vnode */
 215 #define nc_neg          n_un.nu_neg     /* shorthand for the negative entry state */
 216
 217 /*
 218  * Flags in namecache.nc_flag
 219  */
 220 #define NCF_WHITE       0x01    /* additionally rejected by cache_fpl_neg_ncp_canuse() */
 221 #define NCF_ISDOTDOT    0x02
 222 #define NCF_TS          0x04    /* entry is embedded in a struct namecache_ts */
 223 #define NCF_DTS         0x08
 224 #define NCF_DVDROP      0x10    /* freeing the entry drops a hold on nc_dvp */
 225 #define NCF_NEGATIVE    0x20    /* negative entry; asserted by NCP2NEGSTATE() */
 226 #define NCF_INVALID     0x40    /* set by cache_ncp_invalidate() */
 227 #define NCF_WIP         0x80    /* rejected by cache_ncp_canuse() along with NCF_INVALID */
 228
 229 /*
 230  * Flags in negstate.neg_flag
 231  */
 232 #define NEG_HOT         0x01
 233
 234 static bool     cache_neg_evict_cond(u_long lnumcache);        /* forward declaration, needed by cache_alloc() */
 235
 236 /*
 237  * Mark an entry as invalid.
 238  *
 239  * This is called before it starts getting deconstructed.
      *
      * NCF_INVALID is or-ed into nc_flag with an atomic store, which is then
      * followed by a release fence.  Lockless readers check the flag with
      * cache_ncp_canuse(), which issues an acquire fence before loading
      * nc_flag.  Invalidating an already invalid entry trips the KASSERT.
 240  */
 241 static void
 242 cache_ncp_invalidate(struct namecache *ncp)
 243 {
 244
 245         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 246             ("%s: entry %p already invalid", __func__, ncp));
 247         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 248         atomic_thread_fence_rel();
 249 }
 250
 251 /*
 252  * Check whether the entry can be safely used.
 253  *
 254  * All places which elide locks are supposed to call this after they are
 255  * done with reading from an entry.
      *
      * Evaluates to true when neither NCF_INVALID nor NCF_WIP is set in
      * nc_flag.  An acquire fence is issued before the flags are loaded.
      * The argument is evaluated exactly once.
 256  */
 257 #define cache_ncp_canuse(ncp)   ({                                      \
 258         struct namecache *_ncp = (ncp);                                 \
 259         u_char _nc_flag;                                                \
 260                                                                         \
 261         atomic_thread_fence_acq();                                      \
 262         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 263         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
 264 })
 265
 266 /*
 267  * Like the above but also checks NCF_WHITE.
      *
      * Evaluates to true only when none of NCF_INVALID, NCF_WIP and NCF_WHITE
      * is set.  The fence and single-evaluation properties are the same as
      * for cache_ncp_canuse().
 268  */
 269 #define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
 270         struct namecache *_ncp = (ncp);                                 \
 271         u_char _nc_flag;                                                \
 272                                                                         \
 273         atomic_thread_fence_acq();                                      \
 274         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 275         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
 276 })
 277
 278 /*
 279  * Name caching works as follows:
 280  *
 281  * Names found by directory scans are retained in a cache
 282  * for future reference.  It is managed LRU, so frequently
 283  * used names will hang around.  Cache is indexed by hash value
 284  * obtained from (dvp, name) where dvp refers to the directory
 285  * containing name.
 286  *
 287  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 288  * exist) the vnode pointer will be NULL.
 289  *
 290  * Upon reaching the last segment of a path, if the reference
 291  * is for DELETE, or NOCACHE is set (rewrite), and the
 292  * name is located in the cache, it will be dropped.
 293  *
 294  * These locks are used (in the order in which they can be taken):
 295  * NAME         TYPE    ROLE
 296  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 297  * bucketlock   mtx     for access to given set of hash buckets
 298  * neglist      mtx     negative entry LRU management
 299  *
 300  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 301  * order is lower address first. Both are recursive.
 302  *
 303  * "." lookups are lockless.
 304  *
 305  * ".." and vnode -> name lookups require vnodelock.
 306  *
 307  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 308  *
 309  * Insertions and removals of entries require involved vnodes and bucketlocks
 310  * to be locked to provide safe operation against other threads modifying the
 311  * cache.
 312  *
 313  * Some lookups result in removal of the found entry (e.g. getting rid of a
 314  * negative entry with the intent to create a positive one), which poses a
 315  * problem when multiple threads reach the state. Similarly, two different
 316  * threads can purge two different vnodes and try to remove the same name.
 317  *
 318  * If the already held vnode lock is lower than the second required lock, we
 319  * can just take the other lock. However, in the opposite case, this could
 320  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 321  * the first node, locking everything in order and revalidating the state.
 322  */
 323
 324 VFS_SMR_DECLARE;
 325
     /* vfs.cache.param: tunables of the name cache. */
 326 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 327     "Name cache parameters");
 328
     /*
      * Upper bound on the number of entries, enforced by cache_alloc().
      * NOTE(review): the sysctl is writable and stores straight into ncsize;
      * nothing visible here recomputes neg_min when that happens -- confirm
      * whether this is intended.
      */
 329 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 330 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 331     "Total namecache capacity");
 332
     /* Not static; presumably referenced from outside this file -- TODO confirm. */
 333 u_int ncsizefactor = 2;
 334 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 335     "Size factor for namecache");
 336
 337 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 338 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 339     "Ratio of negative namecache entries");
 340
 341 /*
 342  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 343  *
 344  * Check cache_neg_evict_cond for details.
      *
      * The variable is not exported directly; it is read and updated through
      * the sysctl_negminpct() handler, which also refreshes neg_min.
 345  */
 346 static u_int ncnegminpct = 3;
 347
 348 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 349 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 350     "Negative entry count above which automatic eviction is allowed");
 351
 352 /*
 353  * Structures associated with name caching.
      *
      * NCHHASH() maps a hash value to its chain head.  nchash is applied as a
      * bit mask there, so despite being described as the table size it
      * presumably holds "number of chains - 1" -- TODO confirm at the place
      * where the table is allocated.
 354  */
 355 #define NCHHASH(hash) \
 356         (&nchashtbl[(hash) & nchash])
 357 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 358 static u_long __read_mostly     nchash;                 /* size of hash table */
 359 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 360     "Size of namecache hash table");
     /* numcache is adjusted with atomic ops in cache_alloc(), cache_free() and cache_free_batch(). */
 361 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 362 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 363
     /* Copied out, with the hit/miss counters filled in, by sysctl_nchstats(). */
 364 struct nchstats nchstats;               /* cache effectiveness statistics */
 365
     /* Exported as vfs.cache_fast_revlookup with an empty description. */
 366 static bool __read_frequently cache_fast_revlookup = true;
 367 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 368     &cache_fast_revlookup, 0, "");
 369
 370 static u_int __exclusive_cache_line neg_cycle;
 371
 372 #define ncneghash       3       /* bit mask applied by NCP2NEGLIST() */
 373 #define numneglists     (ncneghash + 1)
 374
     /*
      * One of the numneglists lists of negative entries; an entry is assigned
      * to a list based on its address, see NCP2NEGLIST().  The structure and
      * its nl_lock member are both aligned to the cache line size.
      */
 375 struct neglist {
 376         struct mtx              nl_evict_lock;
 377         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 378         TAILQ_HEAD(, namecache) nl_list;
 379         TAILQ_HEAD(, namecache) nl_hotlist;
 380         u_long                  nl_hotnum;      /* presumably the length of nl_hotlist -- TODO confirm */
 381 } __aligned(CACHE_LINE_SIZE);
 382
 383 static struct neglist neglists[numneglists];
 384
 385 static inline struct neglist *
 386 NCP2NEGLIST(struct namecache *ncp)
 387 {
 388
 389         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 390 }
 391
 392 static inline struct negstate *
 393 NCP2NEGSTATE(struct namecache *ncp)
 394 {
 395
 396         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 397         return (&ncp->nc_neg);
 398 }
 399
 400 #define numbucketlocks (ncbuckethash + 1)      /* ncbuckethash is a bit mask, see HASH2BUCKETLOCK() */
 401 static u_int __read_mostly  ncbuckethash;
 402 static struct mtx_padalign __read_mostly  *bucketlocks;
     /* Map a hash value to the bucket lock covering it; the padded mutex is handed out as a plain struct mtx. */
 403 #define HASH2BUCKETLOCK(hash) \
 404         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 405
 406 #define numvnodelocks (ncvnodehash + 1)        /* ncvnodehash is a bit mask, see VP2VNODELOCK() */
 407 static u_int __read_mostly  ncvnodehash;
 408 static struct mtx __read_mostly *vnodelocks;
 409 static inline struct mtx *
 410 VP2VNODELOCK(struct vnode *vp)
 411 {
 412
 413         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 414 }
 415
 416 static void
 417 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 418 {
 419         struct namecache_ts *ncp_ts;
 420
 421         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 422             (tsp == NULL && ticksp == NULL),
 423             ("No NCF_TS"));
 424
 425         if (tsp == NULL)
 426                 return;
 427
 428         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 429         *tsp = ncp_ts->nc_time;
 430         *ticksp = ncp_ts->nc_ticks;
 431 }
 432
 433 #ifdef DEBUG_CACHE
     /* Only built with DEBUG_CACHE: debug.vfscache switch for the name cache. */
 434 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 435 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 436     "VFS namecache enabled");
 437 #endif
 438
 439 /* Export size information to userland */
     /* No backing variable: the value is the compile-time constant passed as the argument. */
 440 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 441     sizeof(struct namecache), "sizeof(struct namecache)");
 442
 443 /*
 444  * The new name cache statistics
      *
      * STATNODE_ULONG(name, varname, descr) exports an already existing
      * u_long as the read-only sysctl vfs.cache.stats.<name>.
      * STATNODE_COUNTER(name, varname, descr) additionally defines the static
      * counter variable varname before exporting it.
      *
      * Both macros already end in a semicolon, so the semicolon written at
      * each use leaves an empty declaration behind.
 445  */
 446 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 447     "Name cache statistics");
 448
 449 #define STATNODE_ULONG(name, varname, descr)                                    \
 450         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 451 #define STATNODE_COUNTER(name, varname, descr)                                  \
 452         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 453         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 454             descr);
 455 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 456 STATNODE_ULONG(count, numcache, "Number of cache entries");
 457 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 458 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 459 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
     /*
      * NOTE(review): the sysctl names "dotdothis" and "posszaps" below do not
      * match their variables (dotdothits, numposzaps) and look like typos.
      * They are visible to userland, so renaming them is an interface change.
      */
 460 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
 461 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 462 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 463 STATNODE_COUNTER(posszaps, numposzaps,
 464     "Number of cache hits (positive) we do not want to cache");
 465 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 466 STATNODE_COUNTER(negzaps, numnegzaps,
 467     "Number of cache hits (negative) we do not want to cache");
 468 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 469 /* These count for vn_getcwd(), too. */
 470 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 471 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 472 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 473     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 474 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 475 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 476
 477 /*
 478  * Debug or developer statistics.
      *
      * DEBUGNODE_ULONG and DEBUGNODE_COUNTER mirror STATNODE_ULONG and
      * STATNODE_COUNTER, but place the nodes under vfs.cache.debug.
      *
      * NOTE(review): the three variables exported with DEBUGNODE_ULONG are
      * declared as (signed) long while SYSCTL_ULONG is used for them.
 479  */
 480 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 481     "Name cache debugging");
 482 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 483         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 484 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 485         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 486         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 487             descr);
 488 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 489     "Number of successful removals after relocking");
 490 static long zap_bucket_fail;
 491 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 492 static long zap_bucket_fail2;
 493 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 494 static long cache_lock_vnodes_cel_3_failures;
 495 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 496     "Number of times 3-way vnode locking failed");
 498 static void cache_zap_locked(struct namecache *ncp);   /* forward declarations of static routines used ahead of their definitions */
 499 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 500     char **freebuf, size_t *buflen);
 501 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 502     char **retbuf, size_t *buflen, size_t addend);
 503 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 504     char **retbuf, size_t *buflen);
 505 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 506     char **retbuf, size_t *len, size_t addend);
 507
     /* malloc(9) type "vfscache"; the entries themselves come from the UMA zones, see cache_alloc_uma(). */
 508 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 509
 510 static inline void
 511 cache_assert_vlp_locked(struct mtx *vlp)
 512 {
 513
 514         if (vlp != NULL)
 515                 mtx_assert(vlp, MA_OWNED);
 516 }
 517
 518 static inline void
 519 cache_assert_vnode_locked(struct vnode *vp)
 520 {
 521         struct mtx *vlp;
 522
 523         vlp = VP2VNODELOCK(vp);
 524         cache_assert_vlp_locked(vlp);
 525 }
 526
 527 /*
 528  * Directory vnodes with entries are held for two reasons:
 529  * 1. make them less of a target for reclamation in vnlru
 530  * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided
 531  *
 532  * It will be feasible to stop doing it altogether if all filesystems start
 533  * supporting lockless lookup.
 534  */
 535 static void
 536 cache_hold_vnode(struct vnode *vp)
 537 {
 538
 539         cache_assert_vnode_locked(vp);
 540         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 541         vhold(vp);
 542         counter_u64_add(numcachehv, 1);
 543 }
 544
 545 static void
 546 cache_drop_vnode(struct vnode *vp)
 547 {
 548
 549         /*
 550          * Called after all locks are dropped, meaning we can't assert
 551          * on the state of v_cache_src.
 552          */
 553         vdrop(vp);
 554         counter_u64_add(numcachehv, -1);
 555 }
 556
 557 /*
 558  * UMA zones.
      *
      * Entries come from one of four zones.  The zone is picked by the length
      * of the name (small up to CACHE_PATH_CUTOFF, large above it) and by
      * whether timestamps are stored with the entry (the _ts zones); see
      * cache_alloc_uma() and cache_free_uma().
 559  */
 560 static uma_zone_t __read_mostly cache_zone_small;
 561 static uma_zone_t __read_mostly cache_zone_small_ts;
 562 static uma_zone_t __read_mostly cache_zone_large;
 563 static uma_zone_t __read_mostly cache_zone_large_ts;
 564
 565 static struct namecache *
 566 cache_alloc_uma(int len, bool ts)
 567 {
 568         struct namecache_ts *ncp_ts;
 569         struct namecache *ncp;
 570
 571         if (__predict_false(ts)) {
 572                 if (len <= CACHE_PATH_CUTOFF)
 573                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 574                 else
 575                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 576                 ncp = &ncp_ts->nc_nc;
 577         } else {
 578                 if (len <= CACHE_PATH_CUTOFF)
 579                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 580                 else
 581                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 582         }
 583         return (ncp);
 584 }
 585
 586 static void
 587 cache_free_uma(struct namecache *ncp)
 588 {
 589         struct namecache_ts *ncp_ts;
 590
 591         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 592                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 593                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 594                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 595                 else
 596                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 597         } else {
 598                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 599                         uma_zfree_smr(cache_zone_small, ncp);
 600                 else
 601                         uma_zfree_smr(cache_zone_large, ncp);
 602         }
 603 }
 604
 605 static struct namecache *
 606 cache_alloc(int len, bool ts)
 607 {
 608         u_long lnumcache;
 609
 610         /*
 611          * Avoid blowout in namecache entries.
 612          *
 613          * Bugs:
 614          * 1. filesystems may end up trying to add an already existing entry
 615          * (for example this can happen after a cache miss during concurrent
 616          * lookup), in which case we will call cache_neg_evict despite not
 617          * adding anything.
 618          * 2. the routine may fail to free anything and no provisions are made
 619          * to make it try harder (see the inside for failure modes)
 620          * 3. it only ever looks at negative entries.
 621          */
 622         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 623         if (cache_neg_evict_cond(lnumcache)) {
 624                 lnumcache = atomic_load_long(&numcache);
 625         }
 626         if (__predict_false(lnumcache >= ncsize)) {
 627                 atomic_subtract_long(&numcache, 1);
 628                 counter_u64_add(numdrops, 1);
 629                 return (NULL);
 630         }
 631         return (cache_alloc_uma(len, ts));
 632 }
 633
 634 static void
 635 cache_free(struct namecache *ncp)
 636 {
 637
 638         MPASS(ncp != NULL);
 639         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 640                 cache_drop_vnode(ncp->nc_dvp);
 641         }
 642         cache_free_uma(ncp);
 643         atomic_subtract_long(&numcache, 1);
 644 }
 645
 646 static void
 647 cache_free_batch(struct cache_freebatch *batch)
 648 {
 649         struct namecache *ncp, *nnp;
 650         int i;
 651
 652         i = 0;
 653         if (TAILQ_EMPTY(batch))
 654                 goto out;
 655         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 656                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 657                         cache_drop_vnode(ncp->nc_dvp);
 658                 }
 659                 cache_free_uma(ncp);
 660                 i++;
 661         }
 662         atomic_subtract_long(&numcache, i);
 663 out:
 664         SDT_PROBE1(vfs, namecache, purge, batch, i);
 665 }
 666
 667 /*
 668  * TODO: With the value stored we can do better than computing the hash based
 669  * on the address. The choice of FNV should also be revisited.
 670  */
 671 static void
 672 cache_prehash(struct vnode *vp)
 673 {
 674
 675         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 676 }
 677
 678 static uint32_t
 679 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 680 {
 681
 682         return (fnv_32_buf(name, len, dvp->v_nchash));
 683 }
 684
 685 static inline struct nchashhead *
 686 NCP2BUCKET(struct namecache *ncp)
 687 {
 688         uint32_t hash;
 689
 690         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 691         return (NCHHASH(hash));
 692 }
 693
 694 static inline struct mtx *
 695 NCP2BUCKETLOCK(struct namecache *ncp)
 696 {
 697         uint32_t hash;
 698
 699         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 700         return (HASH2BUCKETLOCK(hash));
 701 }
 702
 703 #ifdef INVARIANTS
 704 static void
 705 cache_assert_bucket_locked(struct namecache *ncp)
 706 {
 707         struct mtx *blp;
 708
 709         blp = NCP2BUCKETLOCK(ncp);
 710         mtx_assert(blp, MA_OWNED);
 711 }
 712
 713 static void
 714 cache_assert_bucket_unlocked(struct namecache *ncp)
 715 {
 716         struct mtx *blp;
 717
 718         blp = NCP2BUCKETLOCK(ncp);
 719         mtx_assert(blp, MA_NOTOWNED);
 720 }
 721 #else
 722 #define cache_assert_bucket_locked(x) do { } while (0)
 723 #define cache_assert_bucket_unlocked(x) do { } while (0)
 724 #endif
 725
 726 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 727 static void
 728 _cache_sort_vnodes(void **p1, void **p2)
 729 {
 730         void *tmp;
 731
 732         MPASS(*p1 != NULL || *p2 != NULL);
 733
 734         if (*p1 > *p2) {
 735                 tmp = *p2;
 736                 *p2 = *p1;
 737                 *p1 = tmp;
 738         }
 739 }
 740
 741 static void
 742 cache_lock_all_buckets(void)
 743 {
 744         u_int i;
 745
 746         for (i = 0; i < numbucketlocks; i++)
 747                 mtx_lock(&bucketlocks[i]);
 748 }
 749
 750 static void
 751 cache_unlock_all_buckets(void)
 752 {
 753         u_int i;
 754
 755         for (i = 0; i < numbucketlocks; i++)
 756                 mtx_unlock(&bucketlocks[i]);
 757 }
 758
 759 static void
 760 cache_lock_all_vnodes(void)
 761 {
 762         u_int i;
 763
 764         for (i = 0; i < numvnodelocks; i++)
 765                 mtx_lock(&vnodelocks[i]);
 766 }
 767
 768 static void
 769 cache_unlock_all_vnodes(void)
 770 {
 771         u_int i;
 772
 773         for (i = 0; i < numvnodelocks; i++)
 774                 mtx_unlock(&vnodelocks[i]);
 775 }
 776
 777 static int
 778 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 779 {
 780
 781         cache_sort_vnodes(&vlp1, &vlp2);
 782
 783         if (vlp1 != NULL) {
 784                 if (!mtx_trylock(vlp1))
 785                         return (EAGAIN);
 786         }
 787         if (!mtx_trylock(vlp2)) {
 788                 if (vlp1 != NULL)
 789                         mtx_unlock(vlp1);
 790                 return (EAGAIN);
 791         }
 792
 793         return (0);
 794 }
 795
 796 static void
 797 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 798 {
 799
 800         MPASS(vlp1 != NULL || vlp2 != NULL);
 801         MPASS(vlp1 <= vlp2);
 802
 803         if (vlp1 != NULL)
 804                 mtx_lock(vlp1);
 805         if (vlp2 != NULL)
 806                 mtx_lock(vlp2);
 807 }
 808
 809 static void
 810 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 811 {
 812
 813         MPASS(vlp1 != NULL || vlp2 != NULL);
 814
 815         if (vlp1 != NULL)
 816                 mtx_unlock(vlp1);
 817         if (vlp2 != NULL)
 818                 mtx_unlock(vlp2);
 819 }
 820
 821 static int
 822 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 823 {
 824         struct nchstats snap;
 825
 826         if (req->oldptr == NULL)
 827                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 828
 829         snap = nchstats;
 830         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 831         snap.ncs_neghits = counter_u64_fetch(numneghits);
 832         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 833             counter_u64_fetch(numnegzaps);
 834         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 835             counter_u64_fetch(nummiss);
 836
 837         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 838 }
 839 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 840     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 841     "VFS cache effectiveness statistics");
 842
 843 static void
 844 cache_recalc_neg_min(u_int val)
 845 {
 846
 847         neg_min = (ncsize * val) / 100;
 848 }
 849
 850 static int
 851 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 852 {
 853         u_int val;
 854         int error;
 855
 856         val = ncnegminpct;
 857         error = sysctl_handle_int(oidp, &val, 0, req);
 858         if (error != 0 || req->newptr == NULL)
 859                 return (error);
 860
 861         if (val == ncnegminpct)
 862                 return (0);
 863         if (val < 0 || val > 99)
 864                 return (EINVAL);
 865         ncnegminpct = val;
 866         cache_recalc_neg_min(val);
 867         return (0);
 868 }
 869
 870 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 871     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 872     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
 873
 874 #ifdef DIAGNOSTIC
 875 /*
 876  * Grab an atomic snapshot of the name cache hash chain lengths
 877  */
 878 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 879     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 880     "hash table stats");
 881
 882 static int
 883 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 884 {
 885         struct nchashhead *ncpp;
 886         struct namecache *ncp;
 887         int i, error, n_nchash, *cntbuf;
 888
 889 retry:
 890         n_nchash = nchash + 1;  /* nchash is max index, not count */
 891         if (req->oldptr == NULL)
 892                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 893         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 894         cache_lock_all_buckets();
 895         if (n_nchash != nchash + 1) {
 896                 cache_unlock_all_buckets();
 897                 free(cntbuf, M_TEMP);
 898                 goto retry;
 899         }
 900         /* Scan hash tables counting entries */
 901         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 902                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 903                         cntbuf[i]++;
 904         cache_unlock_all_buckets();
 905         for (error = 0, i = 0; i < n_nchash; i++)
 906                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 907                         break;
 908         free(cntbuf, M_TEMP);
 909         return (error);
 910 }
 911 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 912     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 913     "nchash chain lengths");
 914
 915 static int
 916 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 917 {
 918         int error;
 919         struct nchashhead *ncpp;
 920         struct namecache *ncp;
 921         int n_nchash;
 922         int count, maxlength, used, pct;
 923
 924         if (!req->oldptr)
 925                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 926
 927         cache_lock_all_buckets();
 928         n_nchash = nchash + 1;  /* nchash is max index, not count */
 929         used = 0;
 930         maxlength = 0;
 931
 932         /* Scan hash tables for applicable entries */
 933         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 934                 count = 0;
 935                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 936                         count++;
 937                 }
 938                 if (count)
 939                         used++;
 940                 if (maxlength < count)
 941                         maxlength = count;
 942         }
 943         n_nchash = nchash + 1;
 944         cache_unlock_all_buckets();
 945         pct = (used * 100) / (n_nchash / 100);
 946         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 947         if (error)
 948                 return (error);
 949         error = SYSCTL_OUT(req, &used, sizeof(used));
 950         if (error)
 951                 return (error);
 952         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 953         if (error)
 954                 return (error);
 955         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 956         if (error)
 957                 return (error);
 958         return (0);
 959 }
 960 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 961     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 962     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 963 #endif
 964
 965 /*
 966  * Negative entries management
 967  *
 968  * Various workloads create plenty of negative entries and barely use them
 969  * afterwards. Moreover malicious users can keep performing bogus lookups
 970  * adding even more entries. For example "make tinderbox" as of writing this
 971  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 972  * negative.
 973  *
 974  * As such, a rather aggressive eviction method is needed. The currently
 975  * employed method is a placeholder.
 976  *
 977  * Entries are split over numneglists separate lists, each of which is further
 978  * split into hot and cold entries. Entries get promoted after getting a hit.
 * Eviction happens on addition of a new entry.
 980  */
/* Parent sysctl node for the negative entry statistics declared below. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

/* Read-only view of numneg, the current number of negative entries. */
SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

/* Bumped by cache_neg_init() for every negative entry set up. */
static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

/* Bumped by cache_neg_evict() for every entry it zaps. */
static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

/* Eviction attempt found no candidate on the selected list. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

/* Eviction candidate was gone by the time it was looked up again. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

/* Eviction attempt could not trylock the selected list's evict lock. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

/* numneghits is defined elsewhere; it is only exported here. */
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");
1012
1013 static int
1014 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1015 {
1016         int i, out;
1017
1018         out = 0;
1019         for (i = 0; i < numneglists; i++)
1020                 out += neglists[i].nl_hotnum;
1021
1022         return (SYSCTL_OUT(req, &out, sizeof(out)));
1023 }
1024 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1025     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1026     "Number of hot negative entries");
1027
1028 static void
1029 cache_neg_init(struct namecache *ncp)
1030 {
1031         struct negstate *ns;
1032
1033         ncp->nc_flag |= NCF_NEGATIVE;
1034         ns = NCP2NEGSTATE(ncp);
1035         ns->neg_flag = 0;
1036         ns->neg_hit = 0;
1037         counter_u64_add(neg_created, 1);
1038 }
1039
1040 #define CACHE_NEG_PROMOTION_THRESH 2
1041
1042 static bool
1043 cache_neg_hit_prep(struct namecache *ncp)
1044 {
1045         struct negstate *ns;
1046         u_char n;
1047
1048         ns = NCP2NEGSTATE(ncp);
1049         n = atomic_load_char(&ns->neg_hit);
1050         for (;;) {
1051                 if (n >= CACHE_NEG_PROMOTION_THRESH)
1052                         return (false);
1053                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1054                         break;
1055         }
1056         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1057 }
1058
/*
 * Counterpart of cache_neg_hit_prep for callers which end up returning
 * without even trying to promote the entry.
 *
 * There is nothing to undo, so this expands to an empty statement. It is
 * provided for completeness.
 */
#define cache_neg_hit_abort(ncp)        do { } while (0)
1065
1066 static void
1067 cache_neg_hit_finish(struct namecache *ncp)
1068 {
1069
1070         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1071         counter_u64_add(numneghits, 1);
1072 }
1073
1074 /*
1075  * Move a negative entry to the hot list.
1076  */
1077 static void
1078 cache_neg_promote_locked(struct namecache *ncp)
1079 {
1080         struct neglist *nl;
1081         struct negstate *ns;
1082
1083         ns = NCP2NEGSTATE(ncp);
1084         nl = NCP2NEGLIST(ncp);
1085         mtx_assert(&nl->nl_lock, MA_OWNED);
1086         if ((ns->neg_flag & NEG_HOT) == 0) {
1087                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1088                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1089                 nl->nl_hotnum++;
1090                 ns->neg_flag |= NEG_HOT;
1091         }
1092 }
1093
1094 /*
1095  * Move a hot negative entry to the cold list.
1096  */
1097 static void
1098 cache_neg_demote_locked(struct namecache *ncp)
1099 {
1100         struct neglist *nl;
1101         struct negstate *ns;
1102
1103         ns = NCP2NEGSTATE(ncp);
1104         nl = NCP2NEGLIST(ncp);
1105         mtx_assert(&nl->nl_lock, MA_OWNED);
1106         MPASS(ns->neg_flag & NEG_HOT);
1107         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1108         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1109         nl->nl_hotnum--;
1110         ns->neg_flag &= ~NEG_HOT;
1111         atomic_store_char(&ns->neg_hit, 0);
1112 }
1113
/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 *
 * Arguments:
 * - dvp, cnp:  directory and name the original lookup was performed for
 * - oncp:      the negative entry found by that lookup
 * - hash:      hash used to pick the bucket to search
 *
 * Returns true if the very same entry was found again, still matches the
 * lookup, is still negative and usable; in that case it got promoted and
 * the hit was accounted with cache_neg_hit_finish. Returns false otherwise,
 * with nothing modified.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
        struct namecache *ncp;
        struct neglist *nl;
        u_char nc_flag;

        nl = NCP2NEGLIST(oncp);

        /*
         * The neglist lock is taken before entering the smr section, so
         * any waiting for it happens outside of the section.
         */
        mtx_lock(&nl->nl_lock);
        /*
         * For hash iteration.
         */
        vfs_smr_enter();

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         * XXX There are no provisions to keep the vnode around, meaning we may
         * end up promoting a negative entry for a *new* vnode and returning
         * ENOENT on its account. This is the error we want to return anyway
         * and promotion is harmless.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (!cache_ncp_canuse(ncp)) {
                goto out_abort;
        }

        cache_neg_promote_locked(ncp);
        cache_neg_hit_finish(ncp);
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (true);
out_abort:
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (false);
}
1192
1193 static void
1194 cache_neg_promote(struct namecache *ncp)
1195 {
1196         struct neglist *nl;
1197
1198         nl = NCP2NEGLIST(ncp);
1199         mtx_lock(&nl->nl_lock);
1200         cache_neg_promote_locked(ncp);
1201         mtx_unlock(&nl->nl_lock);
1202 }
1203
1204 static void
1205 cache_neg_insert(struct namecache *ncp)
1206 {
1207         struct neglist *nl;
1208
1209         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1210         cache_assert_bucket_locked(ncp);
1211         nl = NCP2NEGLIST(ncp);
1212         mtx_lock(&nl->nl_lock);
1213         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1214         mtx_unlock(&nl->nl_lock);
1215         atomic_add_long(&numneg, 1);
1216 }
1217
1218 static void
1219 cache_neg_remove(struct namecache *ncp)
1220 {
1221         struct neglist *nl;
1222         struct negstate *ns;
1223
1224         cache_assert_bucket_locked(ncp);
1225         nl = NCP2NEGLIST(ncp);
1226         ns = NCP2NEGSTATE(ncp);
1227         mtx_lock(&nl->nl_lock);
1228         if ((ns->neg_flag & NEG_HOT) != 0) {
1229                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1230                 nl->nl_hotnum--;
1231         } else {
1232                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1233         }
1234         mtx_unlock(&nl->nl_lock);
1235         atomic_subtract_long(&numneg, 1);
1236 }
1237
1238 static struct neglist *
1239 cache_neg_evict_select_list(void)
1240 {
1241         struct neglist *nl;
1242         u_int c;
1243
1244         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1245         nl = &neglists[c % numneglists];
1246         if (!mtx_trylock(&nl->nl_evict_lock)) {
1247                 counter_u64_add(neg_evict_skipped_contended, 1);
1248                 return (NULL);
1249         }
1250         return (nl);
1251 }
1252
1253 static struct namecache *
1254 cache_neg_evict_select_entry(struct neglist *nl)
1255 {
1256         struct namecache *ncp, *lncp;
1257         struct negstate *ns, *lns;
1258         int i;
1259
1260         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1261         mtx_assert(&nl->nl_lock, MA_OWNED);
1262         ncp = TAILQ_FIRST(&nl->nl_list);
1263         if (ncp == NULL)
1264                 return (NULL);
1265         lncp = ncp;
1266         lns = NCP2NEGSTATE(lncp);
1267         for (i = 1; i < 4; i++) {
1268                 ncp = TAILQ_NEXT(ncp, nc_dst);
1269                 if (ncp == NULL)
1270                         break;
1271                 ns = NCP2NEGSTATE(ncp);
1272                 if (ns->neg_hit < lns->neg_hit) {
1273                         lncp = ncp;
1274                         lns = ns;
1275                 }
1276         }
1277         return (lncp);
1278 }
1279
/*
 * Try to evict one negative entry.
 *
 * A neglist is selected (giving up if its evict lock is contended), the
 * first entry of its hot list, if any, is demoted to the cold list and a
 * candidate is picked from the cold list. The neglist locks are then
 * dropped, the vnode lock of the parent directory and the bucket lock are
 * taken, and the candidate is looked up again before being zapped.
 *
 * Returns true if an entry was zapped and freed, false otherwise.
 */
static bool
cache_neg_evict(void)
{
        struct namecache *ncp, *ncp2;
        struct neglist *nl;
        struct vnode *dvp;
        struct mtx *dvlp;
        struct mtx *blp;
        uint32_t hash;
        u_char nlen;
        bool evicted;

        /* On success the list comes back with nl_evict_lock held. */
        nl = cache_neg_evict_select_list();
        if (nl == NULL) {
                return (false);
        }

        mtx_lock(&nl->nl_lock);
        /* Demote one hot entry per eviction attempt. */
        ncp = TAILQ_FIRST(&nl->nl_hotlist);
        if (ncp != NULL) {
                cache_neg_demote_locked(ncp);
        }
        ncp = cache_neg_evict_select_entry(nl);
        if (ncp == NULL) {
                counter_u64_add(neg_evict_skipped_empty, 1);
                mtx_unlock(&nl->nl_lock);
                mtx_unlock(&nl->nl_evict_lock);
                return (false);
        }
        /*
         * Record everything needed to find the entry again while the
         * neglist lock is still held.
         */
        nlen = ncp->nc_nlen;
        dvp = ncp->nc_dvp;
        hash = cache_get_hash(ncp->nc_name, nlen, dvp);
        dvlp = VP2VNODELOCK(dvp);
        blp = HASH2BUCKETLOCK(hash);
        mtx_unlock(&nl->nl_lock);
        mtx_unlock(&nl->nl_evict_lock);
        mtx_lock(dvlp);
        mtx_lock(blp);
        /*
         * Note that since all locks were dropped above, the entry may be
         * gone or reallocated to be something else.
         */
        CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
                if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
                    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
                        break;
        }
        if (ncp2 == NULL) {
                counter_u64_add(neg_evict_skipped_missed, 1);
                /* Nothing was zapped, so there is nothing to free below. */
                ncp = NULL;
                evicted = false;
        } else {
                MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
                MPASS(blp == NCP2BUCKETLOCK(ncp));
                SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                cache_zap_locked(ncp);
                counter_u64_add(neg_evicted, 1);
                evicted = true;
        }
        mtx_unlock(blp);
        mtx_unlock(dvlp);
        /* The zapped entry is freed only after all locks are dropped. */
        if (ncp != NULL)
                cache_free(ncp);
        return (evicted);
}
1346
1347 /*
1348  * Maybe evict a negative entry to create more room.
1349  *
1350  * The ncnegfactor parameter limits what fraction of the total count
1351  * can comprise of negative entries. However, if the cache is just
1352  * warming up this leads to excessive evictions.  As such, ncnegminpct
1353  * (recomputed to neg_min) dictates whether the above should be
1354  * applied.
1355  *
1356  * Try evicting if the cache is close to full capacity regardless of
1357  * other considerations.
1358  */
1359 static bool
1360 cache_neg_evict_cond(u_long lnumcache)
1361 {
1362         u_long lnumneg;
1363
1364         if (ncsize - 1000 < lnumcache)
1365                 goto out_evict;
1366         lnumneg = atomic_load_long(&numneg);
1367         if (lnumneg < neg_min)
1368                 return (false);
1369         if (lnumneg * ncnegfactor < lnumcache)
1370                 return (false);
1371 out_evict:
1372         return (cache_neg_evict());
1373 }
1374
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller has to hold the bucket lock of the entry, the vnode lock of
 *   the parent directory and, for positive entries, the vnode lock of the
 *   target vnode (all of these are asserted). The entry is only unlinked
 *   here, it is not freed.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
        struct nchashhead *ncpp;

        if (!(ncp->nc_flag & NCF_NEGATIVE))
                cache_assert_vnode_locked(ncp->nc_vp);
        cache_assert_vnode_locked(ncp->nc_dvp);
        cache_assert_bucket_locked(ncp);

        cache_ncp_invalidate(ncp);

        /* Unlink from the hash chain. */
        ncpp = NCP2BUCKET(ncp);
        CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
        if (!(ncp->nc_flag & NCF_NEGATIVE)) {
                /*
                 * Positive entry: unlink from the target vnode and clear
                 * its v_cache_dd if it points at this entry.
                 */
                SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
                    ncp->nc_name, ncp->nc_vp);
                TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
                if (ncp == ncp->nc_vp->v_cache_dd) {
                        vn_seqc_write_begin_unheld(ncp->nc_vp);
                        ncp->nc_vp->v_cache_dd = NULL;
                        vn_seqc_write_end(ncp->nc_vp);
                }
        } else {
                /* Negative entry: take it off its neglist. */
                SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                cache_neg_remove(ncp);
        }
        if (ncp->nc_flag & NCF_ISDOTDOT) {
                /*
                 * ".." entries are not linked on v_cache_src of the
                 * directory, only v_cache_dd may point at them.
                 */
                if (ncp == ncp->nc_dvp->v_cache_dd) {
                        vn_seqc_write_begin_unheld(ncp->nc_dvp);
                        ncp->nc_dvp->v_cache_dd = NULL;
                        vn_seqc_write_end(ncp->nc_dvp);
                }
        } else {
                LIST_REMOVE(ncp, nc_src);
                /*
                 * This was the last entry sourced from the directory.
                 * NOTE(review): NCF_DVDROP is presumably acted upon when
                 * the entry is freed -- confirm against cache_free().
                 */
                if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
                        ncp->nc_flag |= NCF_DVDROP;
                }
        }
}
1422
1423 static void
1424 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1425 {
1426         struct mtx *blp;
1427
1428         MPASS(ncp->nc_dvp == vp);
1429         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1430         cache_assert_vnode_locked(vp);
1431
1432         blp = NCP2BUCKETLOCK(ncp);
1433         mtx_lock(blp);
1434         cache_zap_locked(ncp);
1435         mtx_unlock(blp);
1436 }
1437
/*
 * Zap an entry while keeping the vnode lock of vp held.
 *
 * vp has to be either the directory or the target vnode of the entry and
 * its vnode lock has to be held by the caller. *vlpp is either NULL or
 * an additional vnode lock held by the caller.
 *
 * Returns true if the entry was zapped; *vlpp is NULL on return and any
 * lock it pointed to has been released. Returns false if the second vnode
 * lock could not be acquired without violating the lock order. In that
 * case the lock of vp was dropped and retaken, both vnode locks of the
 * entry are held and *vlpp points to the one which is not the lock of vp.
 * Since the lock of vp was dropped, the caller has to revalidate the entry.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        if (ncp->nc_flag & NCF_NEGATIVE) {
                /* Negative entries only need the lock of vp. */
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                return (true);
        }

        pvlp = VP2VNODELOCK(vp);
        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);

        if (*vlpp == vlp1 || *vlpp == vlp2) {
                /* The extra lock passed in is one of the needed locks. */
                to_unlock = *vlpp;
                *vlpp = NULL;
        } else {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_sort_vnodes(&vlp1, &vlp2);
                if (vlp1 == pvlp) {
                        /* The held lock sorts first, vlp2 is taken next. */
                        mtx_lock(vlp2);
                        to_unlock = vlp2;
                } else {
                        /*
                         * The held lock sorts second, so the other one
                         * can only be tried.
                         */
                        if (!mtx_trylock(vlp1))
                                goto out_relock;
                        to_unlock = vlp1;
                }
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        if (to_unlock != NULL)
                mtx_unlock(to_unlock);
        return (true);

out_relock:
        /*
         * Only reached when vlp1 is not the lock of vp, so given the
         * assertion at the top vlp2 is. Drop it and take both locks in
         * the sorted order.
         */
        mtx_unlock(vlp2);
        mtx_lock(vlp1);
        mtx_lock(vlp2);
        MPASS(*vlpp == NULL);
        *vlpp = vlp1;
        return (false);
}
1495
1496 /*
1497  * If trylocking failed we can get here. We know enough to take all needed locks
1498  * in the right order and re-lookup the entry.
1499  */
1500 static int
1501 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1502     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1503     struct mtx *blp)
1504 {
1505         struct namecache *rncp;
1506
1507         cache_assert_bucket_unlocked(ncp);
1508
1509         cache_sort_vnodes(&dvlp, &vlp);
1510         cache_lock_vnodes(dvlp, vlp);
1511         mtx_lock(blp);
1512         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1513                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1514                     rncp->nc_nlen == cnp->cn_namelen &&
1515                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1516                         break;
1517         }
1518         if (rncp != NULL) {
1519                 cache_zap_locked(rncp);
1520                 mtx_unlock(blp);
1521                 cache_unlock_vnodes(dvlp, vlp);
1522                 counter_u64_add(zap_bucket_relock_success, 1);
1523                 return (0);
1524         }
1525
1526         mtx_unlock(blp);
1527         cache_unlock_vnodes(dvlp, vlp);
1528         return (EAGAIN);
1529 }
1530
1531 static int __noinline
1532 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1533     uint32_t hash, struct mtx *blp)
1534 {
1535         struct mtx *dvlp, *vlp;
1536         struct vnode *dvp;
1537
1538         cache_assert_bucket_locked(ncp);
1539
1540         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1541         vlp = NULL;
1542         if (!(ncp->nc_flag & NCF_NEGATIVE))
1543                 vlp = VP2VNODELOCK(ncp->nc_vp);
1544         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1545                 cache_zap_locked(ncp);
1546                 mtx_unlock(blp);
1547                 cache_unlock_vnodes(dvlp, vlp);
1548                 return (0);
1549         }
1550
1551         dvp = ncp->nc_dvp;
1552         mtx_unlock(blp);
1553         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1554 }
1555
/*
 * Remove the entry for the name described by cnp in directory dvp.
 *
 * ".." is handled through dvp->v_cache_dd, everything else through a hash
 * lookup.
 *
 * Returns 1 if an entry was removed (or, for "..", v_cache_dd got cleared)
 * and 0 if there was nothing to remove.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        struct mtx *dvlp, *dvlp2;
        uint32_t hash;
        int error;

        if (cnp->cn_namelen == 2 &&
            cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
                dvlp = VP2VNODELOCK(dvp);
                dvlp2 = NULL;
                mtx_lock(dvlp);
retry_dotdot:
                ncp = dvp->v_cache_dd;
                if (ncp == NULL) {
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
                        return (0);
                }
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                        /*
                         * A failed attempt may leave an extra vnode lock
                         * in dvlp2; v_cache_dd is read again on retry.
                         */
                        if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
                                goto retry_dotdot;
                        MPASS(dvp->v_cache_dd == NULL);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        cache_free(ncp);
                } else {
                        /*
                         * v_cache_dd points at an entry which is not a
                         * ".." entry; only the pointer is cleared, the
                         * entry itself stays in the cache.
                         */
                        vn_seqc_write_begin(dvp);
                        dvp->v_cache_dd = NULL;
                        vn_seqc_write_end(dvp);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                }
                SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
                return (1);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        blp = HASH2BUCKETLOCK(hash);
retry:
        /* Unlocked check to avoid taking the lock for an empty chain. */
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                goto out_no_entry;

        mtx_lock(blp);

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (ncp == NULL) {
                mtx_unlock(blp);
                goto out_no_entry;
        }

        /* The bucket lock is released by the callee on all paths. */
        error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
        if (__predict_false(error != 0)) {
                zap_bucket_fail++;
                goto retry;
        }
        counter_u64_add(numposzaps, 1);
        SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
        cache_free(ncp);
        return (1);
out_no_entry:
        counter_u64_add(nummisszap, 1);
        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
        return (0);
}
1632
1633 static int __noinline
1634 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1635     struct timespec *tsp, int *ticksp)
1636 {
1637         int ltype;
1638
1639         *vpp = dvp;
1640         counter_u64_add(dothits, 1);
1641         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1642         if (tsp != NULL)
1643                 timespecclear(tsp);
1644         if (ticksp != NULL)
1645                 *ticksp = ticks;
1646         vrefact(*vpp);
1647         /*
1648          * When we lookup "." we still can be asked to lock it
1649          * differently...
1650          */
1651         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1652         if (ltype != VOP_ISLOCKED(*vpp)) {
1653                 if (ltype == LK_EXCLUSIVE) {
1654                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1655                         if (VN_IS_DOOMED((*vpp))) {
1656                                 /* forced unmount */
1657                                 vrele(*vpp);
1658                                 *vpp = NULL;
1659                                 return (ENOENT);
1660                         }
1661                 } else
1662                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1663         }
1664         return (-1);
1665 }
1666
/*
 * Handle a lookup of "..", based on dvp->v_cache_dd.
 *
 * If MAKEENTRY is clear the cached entry is removed instead of being
 * looked up.
 *
 * Returns:
 * - 0 on a miss, after removing the entry because MAKEENTRY was clear, or
 *   after zapping a negative entry for a CREATE of the last component
 * - -1 on a positive hit, with *vpp set to the vnode obtained with
 *   vget_finish using cn_lkflags
 * - ENOENT on a negative hit, or if dvp got doomed while it was unlocked
 */
static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache_ts *ncp_ts;
        struct namecache *ncp;
        struct mtx *dvlp;
        enum vgetstate vs;
        int error, ltype;
        bool whiteout;

        MPASS((cnp->cn_flags & ISDOTDOT) != 0);

        if ((cnp->cn_flags & MAKEENTRY) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        counter_u64_add(dotdothits, 1);
retry:
        dvlp = VP2VNODELOCK(dvp);
        mtx_lock(dvlp);
        ncp = dvp->v_cache_dd;
        if (ncp == NULL) {
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
                mtx_unlock(dvlp);
                return (0);
        }
        /*
         * v_cache_dd either points at a ".." entry, whose target (if any)
         * is the parent, or at an entry which is not a ".." entry, in which
         * case nc_dvp of that entry is used as the parent.
         */
        if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                if (ncp->nc_flag & NCF_NEGATIVE)
                        *vpp = NULL;
                else
                        *vpp = ncp->nc_vp;
        } else
                *vpp = ncp->nc_dvp;
        if (*vpp == NULL)
                goto negative_success;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        /*
         * For an entry with timestamps which is not a ".." entry, the
         * timestamp reported is nc_dotdottime.
         */
        if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
            NCF_DTS && tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                *tsp = ncp_ts->nc_dotdottime;
        }

        MPASS(dvp != *vpp);
        /*
         * dvp is unlocked before the parent gets locked and relocked with
         * the same lock type afterwards.
         */
        ltype = VOP_ISLOCKED(dvp);
        VOP_UNLOCK(dvp);
        vs = vget_prep(*vpp);
        mtx_unlock(dvlp);
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        vn_lock(dvp, ltype | LK_RETRY);
        if (VN_IS_DOOMED(dvp)) {
                /* dvp went away while it was unlocked. */
                if (error == 0)
                        vput(*vpp);
                *vpp = NULL;
                return (ENOENT);
        }
        if (error) {
                *vpp = NULL;
                goto retry;
        }
        return (-1);
negative_success:
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        /*
                         * The name is about to be created, so the negative
                         * entry is zapped and a miss is reported.
                         */
                        counter_u64_add(numnegzaps, 1);
                        cache_zap_negative_locked_vnode_kl(ncp, dvp);
                        mtx_unlock(dvlp);
                        cache_free(ncp);
                        return (0);
                }
        }

        /* Sample the flag while the entry is protected by dvlp. */
        whiteout = (ncp->nc_flag & NCF_WHITE);
        cache_out_ts(ncp, tsp, ticksp);
        if (cache_neg_hit_prep(ncp))
                cache_neg_promote(ncp);
        else
                cache_neg_hit_finish(ncp);
        mtx_unlock(dvlp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
}
1752
1753 /**
1754  * Lookup a name in the name cache
1755  *
1756  * # Arguments
1757  *
1758  * - dvp:       Parent directory in which to search.
1759  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1760  * - cnp:       Parameters of the name search.  The most interesting bits of
1761  *              the cn_flags field have the following meanings:
1762  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1763  *                      it up.
1764  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1765  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1766  *              or negative) lookup, tsp will be filled with any timespec that
1767  *              was stored when this cache entry was created.  However, it will
1768  *              be clear for "." entries.
 * - ticksp:    Return storage for alternate cache timestamp.  On a successful
 *              (positive or negative) lookup, it will contain the ticks value
 *              that was current when the cache entry was created, unless cnp
 *              was ".".
 *
 * Either both tsp and ticksp have to be provided or neither of them.
1775  *
1776  * # Returns
1777  *
1778  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1779  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1780  *              to a forced unmount.  vpp will not be modified.  If the entry
1781  *              is a whiteout, then the ISWHITEOUT flag will be set in
1782  *              cnp->cn_flags.
1783  * - 0:         A cache miss.  vpp will not be modified.
1784  *
1785  * # Locking
1786  *
1787  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1788  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1789  * lock is not recursively acquired.
1790  */
1791 static int __noinline
1792 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1793     struct timespec *tsp, int *ticksp)
1794 {
1795         struct namecache *ncp;
1796         struct mtx *blp;
1797         uint32_t hash;
1798         enum vgetstate vs;
1799         int error;
1800         bool whiteout;
1801
1802         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1803         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1804
1805 retry:
1806         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1807         blp = HASH2BUCKETLOCK(hash);
1808         mtx_lock(blp);
1809
1810         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1811                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1812                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1813                         break;
1814         }
1815
1816         if (__predict_false(ncp == NULL)) {
1817                 mtx_unlock(blp);
1818                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1819                     NULL);
1820                 counter_u64_add(nummiss, 1);
1821                 return (0);
1822         }
1823
1824         if (ncp->nc_flag & NCF_NEGATIVE)
1825                 goto negative_success;
1826
1827         counter_u64_add(numposhits, 1);
1828         *vpp = ncp->nc_vp;
1829         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1830         cache_out_ts(ncp, tsp, ticksp);
1831         MPASS(dvp != *vpp);
1832         vs = vget_prep(*vpp);
1833         mtx_unlock(blp);
1834         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1835         if (error) {
1836                 *vpp = NULL;
1837                 goto retry;
1838         }
1839         return (-1);
1840 negative_success:
1841         /*
1842          * We don't get here with regular lookup apart from corner cases.
1843          */
1844         if (__predict_true(cnp->cn_nameiop == CREATE)) {
1845                 if (cnp->cn_flags & ISLASTCN) {
1846                         counter_u64_add(numnegzaps, 1);
1847                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1848                         if (__predict_false(error != 0)) {
1849                                 zap_bucket_fail2++;
1850                                 goto retry;
1851                         }
1852                         cache_free(ncp);
1853                         return (0);
1854                 }
1855         }
1856
1857         whiteout = (ncp->nc_flag & NCF_WHITE);
1858         cache_out_ts(ncp, tsp, ticksp);
1859         if (cache_neg_hit_prep(ncp))
1860                 cache_neg_promote(ncp);
1861         else
1862                 cache_neg_hit_finish(ncp);
1863         mtx_unlock(blp);
1864         if (whiteout)
1865                 cnp->cn_flags |= ISWHITEOUT;
1866         return (ENOENT);
1867 }
1868
/*
 * Look up a name in the name cache without taking bucket locks.
 *
 * "." and ".." are passed to cache_lookup_dot() and cache_lookup_dotdot().
 * Every other name is searched for inside an SMR section.  Whenever this
 * lockless path cannot finish (entry not usable, vnode reference not
 * obtainable, negative entry that has to be zapped, or a negative entry
 * whose promotion did not succeed) the request is redone by
 * cache_lookup_fallback().
 *
 * Returns -1 for a positive hit, ENOENT for a negative hit and 0 for a miss
 * or when the entry was removed because neither MAKEENTRY nor
 * NC_KEEPPOSENTRY was set.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache *ncp;
        uint32_t hash;
        enum vgetstate vs;
        int error;
        bool whiteout, neg_promote;
        u_short nc_flag;

        /* Both timestamp outputs must be supplied, or neither. */
        MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache)) {
                cnp->cn_flags &= ~MAKEENTRY;
                return (0);
        }
#endif

        /* "." and ".." have dedicated handlers. */
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                if (cnp->cn_namelen == 1)
                        return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
                        return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
        }

        MPASS((cnp->cn_flags & ISDOTDOT) == 0);

        /* The caller wants the entry gone rather than looked up. */
        if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        /*
         * The hash chain is walked inside an SMR section; no bucket lock is
         * taken on this path.
         */
        vfs_smr_enter();

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (__predict_false(ncp == NULL)) {
                vfs_smr_exit();
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
                    NULL);
                counter_u64_add(nummiss, 1);
                return (0);
        }

        nc_flag = atomic_load_char(&ncp->nc_flag);
        if (nc_flag & NCF_NEGATIVE)
                goto negative_success;

        counter_u64_add(numposhits, 1);
        *vpp = ncp->nc_vp;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        MPASS(dvp != *vpp);
        /*
         * The entry's fields were read without a lock; check that the entry
         * may be used before trusting them.
         * NOTE(review): cache_ncp_canuse() presumably rejects entries that
         * are still being constructed or are being torn down -- confirm
         * against its definition.
         */
        if (!cache_ncp_canuse(ncp)) {
                vfs_smr_exit();
                *vpp = NULL;
                goto out_fallback;
        }
        /* The reference is prepared inside the SMR section, finished outside. */
        vs = vget_prep_smr(*vpp);
        vfs_smr_exit();
        if (__predict_false(vs == VGET_NONE)) {
                *vpp = NULL;
                goto out_fallback;
        }
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        if (error) {
                *vpp = NULL;
                goto out_fallback;
        }
        return (-1);
negative_success:
        /*
         * A CREATE of the last component requires the negative entry to be
         * zapped, which is left to the locked fallback path.
         */
        if (cnp->cn_nameiop == CREATE) {
                if (cnp->cn_flags & ISLASTCN) {
                        vfs_smr_exit();
                        goto out_fallback;
                }
        }

        cache_out_ts(ncp, tsp, ticksp);
        whiteout = (ncp->nc_flag & NCF_WHITE);
        neg_promote = cache_neg_hit_prep(ncp);
        /* Undo the hit accounting if the entry turns out to be unusable. */
        if (!cache_ncp_canuse(ncp)) {
                cache_neg_hit_abort(ncp);
                vfs_smr_exit();
                goto out_fallback;
        }
        if (neg_promote) {
                /* Promotion is attempted after leaving the SMR section. */
                vfs_smr_exit();
                if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
                        goto out_fallback;
        } else {
                cache_neg_hit_finish(ncp);
                vfs_smr_exit();
        }
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
out_fallback:
        /* Every path jumping here has already left the SMR section. */
        return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
1976
/*
 * Record of the locks held while a cache entry is being inserted: up to
 * three vnode locks (vlp) and up to two bucket locks (blp).  Unused slots
 * are NULL.
 */
struct celockstate {
        struct mtx *vlp[3];
        struct mtx *blp[2];
};
/* The locking helpers address the slots by fixed index; pin the array sizes. */
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1983
1984 static inline void
1985 cache_celockstate_init(struct celockstate *cel)
1986 {
1987
1988         bzero(cel, sizeof(*cel));
1989 }
1990
1991 static void
1992 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1993     struct vnode *dvp)
1994 {
1995         struct mtx *vlp1, *vlp2;
1996
1997         MPASS(cel->vlp[0] == NULL);
1998         MPASS(cel->vlp[1] == NULL);
1999         MPASS(cel->vlp[2] == NULL);
2000
2001         MPASS(vp != NULL || dvp != NULL);
2002
2003         vlp1 = VP2VNODELOCK(vp);
2004         vlp2 = VP2VNODELOCK(dvp);
2005         cache_sort_vnodes(&vlp1, &vlp2);
2006
2007         if (vlp1 != NULL) {
2008                 mtx_lock(vlp1);
2009                 cel->vlp[0] = vlp1;
2010         }
2011         mtx_lock(vlp2);
2012         cel->vlp[1] = vlp2;
2013 }
2014
2015 static void
2016 cache_unlock_vnodes_cel(struct celockstate *cel)
2017 {
2018
2019         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2020
2021         if (cel->vlp[0] != NULL)
2022                 mtx_unlock(cel->vlp[0]);
2023         if (cel->vlp[1] != NULL)
2024                 mtx_unlock(cel->vlp[1]);
2025         if (cel->vlp[2] != NULL)
2026                 mtx_unlock(cel->vlp[2]);
2027 }
2028
2029 static bool
2030 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2031 {
2032         struct mtx *vlp;
2033         bool ret;
2034
2035         cache_assert_vlp_locked(cel->vlp[0]);
2036         cache_assert_vlp_locked(cel->vlp[1]);
2037         MPASS(cel->vlp[2] == NULL);
2038
2039         MPASS(vp != NULL);
2040         vlp = VP2VNODELOCK(vp);
2041
2042         ret = true;
2043         if (vlp >= cel->vlp[1]) {
2044                 mtx_lock(vlp);
2045         } else {
2046                 if (mtx_trylock(vlp))
2047                         goto out;
2048                 cache_lock_vnodes_cel_3_failures++;
2049                 cache_unlock_vnodes_cel(cel);
2050                 if (vlp < cel->vlp[0]) {
2051                         mtx_lock(vlp);
2052                         mtx_lock(cel->vlp[0]);
2053                         mtx_lock(cel->vlp[1]);
2054                 } else {
2055                         if (cel->vlp[0] != NULL)
2056                                 mtx_lock(cel->vlp[0]);
2057                         mtx_lock(vlp);
2058                         mtx_lock(cel->vlp[1]);
2059                 }
2060                 ret = false;
2061         }
2062 out:
2063         cel->vlp[2] = vlp;
2064         return (ret);
2065 }
2066
2067 static void
2068 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2069     struct mtx *blp2)
2070 {
2071
2072         MPASS(cel->blp[0] == NULL);
2073         MPASS(cel->blp[1] == NULL);
2074
2075         cache_sort_vnodes(&blp1, &blp2);
2076
2077         if (blp1 != NULL) {
2078                 mtx_lock(blp1);
2079                 cel->blp[0] = blp1;
2080         }
2081         mtx_lock(blp2);
2082         cel->blp[1] = blp2;
2083 }
2084
2085 static void
2086 cache_unlock_buckets_cel(struct celockstate *cel)
2087 {
2088
2089         if (cel->blp[0] != NULL)
2090                 mtx_unlock(cel->blp[0]);
2091         mtx_unlock(cel->blp[1]);
2092 }
2093
2094 /*
2095  * Lock part of the cache affected by the insertion.
2096  *
2097  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2098  * However, insertion can result in removal of an old entry. In this
2099  * case we have an additional vnode and bucketlock pair to lock.
2100  *
2101  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2102  * preserving the locking order (smaller address first).
2103  */
2104 static void
2105 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2106     uint32_t hash)
2107 {
2108         struct namecache *ncp;
2109         struct mtx *blps[2];
2110
2111         blps[0] = HASH2BUCKETLOCK(hash);
2112         for (;;) {
2113                 blps[1] = NULL;
2114                 cache_lock_vnodes_cel(cel, dvp, vp);
2115                 if (vp == NULL || vp->v_type != VDIR)
2116                         break;
2117                 ncp = vp->v_cache_dd;
2118                 if (ncp == NULL)
2119                         break;
2120                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2121                         break;
2122                 MPASS(ncp->nc_dvp == vp);
2123                 blps[1] = NCP2BUCKETLOCK(ncp);
2124                 if (ncp->nc_flag & NCF_NEGATIVE)
2125                         break;
2126                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2127                         break;
2128                 /*
2129                  * All vnodes got re-locked. Re-validate the state and if
2130                  * nothing changed we are done. Otherwise restart.
2131                  */
2132                 if (ncp == vp->v_cache_dd &&
2133                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2134                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2135                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2136                         break;
2137                 cache_unlock_vnodes_cel(cel);
2138                 cel->vlp[0] = NULL;
2139                 cel->vlp[1] = NULL;
2140                 cel->vlp[2] = NULL;
2141         }
2142         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2143 }
2144
2145 static void
2146 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2147     uint32_t hash)
2148 {
2149         struct namecache *ncp;
2150         struct mtx *blps[2];
2151
2152         blps[0] = HASH2BUCKETLOCK(hash);
2153         for (;;) {
2154                 blps[1] = NULL;
2155                 cache_lock_vnodes_cel(cel, dvp, vp);
2156                 ncp = dvp->v_cache_dd;
2157                 if (ncp == NULL)
2158                         break;
2159                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2160                         break;
2161                 MPASS(ncp->nc_dvp == dvp);
2162                 blps[1] = NCP2BUCKETLOCK(ncp);
2163                 if (ncp->nc_flag & NCF_NEGATIVE)
2164                         break;
2165                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2166                         break;
2167                 if (ncp == dvp->v_cache_dd &&
2168                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2169                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2170                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2171                         break;
2172                 cache_unlock_vnodes_cel(cel);
2173                 cel->vlp[0] = NULL;
2174                 cel->vlp[1] = NULL;
2175                 cel->vlp[2] = NULL;
2176         }
2177         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2178 }
2179
/*
 * Release everything taken by cache_enter_lock() or cache_enter_lock_dd():
 * bucket locks first, vnode locks second.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{
        cache_unlock_buckets_cel(cel);
        cache_unlock_vnodes_cel(cel);
}
2187
2188 static void __noinline
2189 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2190     struct componentname *cnp)
2191 {
2192         struct celockstate cel;
2193         struct namecache *ncp;
2194         uint32_t hash;
2195         int len;
2196
2197         if (dvp->v_cache_dd == NULL)
2198                 return;
2199         len = cnp->cn_namelen;
2200         cache_celockstate_init(&cel);
2201         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2202         cache_enter_lock_dd(&cel, dvp, vp, hash);
2203         vn_seqc_write_begin(dvp);
2204         ncp = dvp->v_cache_dd;
2205         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2206                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2207                 cache_zap_locked(ncp);
2208         } else {
2209                 ncp = NULL;
2210         }
2211         dvp->v_cache_dd = NULL;
2212         vn_seqc_write_end(dvp);
2213         cache_enter_unlock(&cel);
2214         if (ncp != NULL)
2215                 cache_free(ncp);
2216 }
2217
/*
 * Add an entry to the cache.
 *
 * - dvp:       Directory the name lives in.
 * - vp:        Vnode the name resolves to, or NULL to enter a negative entry.
 * - cnp:       The name.  "." is never entered; ".." produces an entry
 *              flagged NCF_ISDOTDOT.
 * - tsp:       Optional timestamp; when non-NULL it is stored along with the
 *              current ticks value and the entry is flagged NCF_TS.
 * - dtsp:      Optional second timestamp, only looked at when tsp is
 *              non-NULL; stored as nc_dotdottime and flagged NCF_DTS.
 *
 * The function returns without caching anything when cache_alloc() fails,
 * when an entry with the same name already exists under dvp, or when a ".."
 * entry is being added but dvp->v_cache_dd got populated in the meantime.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
        struct celockstate cel;
        struct namecache *ncp, *n2, *ndd;
        struct namecache_ts *ncp_ts;
        struct nchashhead *ncpp;
        uint32_t hash;
        int flag;
        int len;

        VNPASS(dvp != vp, dvp);
        VNPASS(!VN_IS_DOOMED(dvp), dvp);
        VNPASS(dvp->v_type != VNON, dvp);
        if (vp != NULL) {
                VNPASS(!VN_IS_DOOMED(vp), vp);
                VNPASS(vp->v_type != VNON, vp);
        }

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache))
                return;
#endif

        flag = 0;
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                /* "." is not cached at all. */
                if (cnp->cn_namelen == 1)
                        return;
                /*
                 * "..": drop whatever dvp->v_cache_dd refers to right now
                 * before the new entry is built.
                 */
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        cache_enter_dotdot_prep(dvp, vp, cnp);
                        flag = NCF_ISDOTDOT;
                }
        }

        ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
        if (ncp == NULL)
                return;

        cache_celockstate_init(&cel);
        ndd = NULL;
        ncp_ts = NULL;

        /*
         * Calculate the hash key and setup as much of the new
         * namecache entry as possible before acquiring the lock.
         * NCF_WIP stays set until the entry is fully linked in.
         */
        ncp->nc_flag = flag | NCF_WIP;
        ncp->nc_vp = vp;
        if (vp == NULL)
                cache_neg_init(ncp);
        ncp->nc_dvp = dvp;
        if (tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                ncp_ts->nc_time = *tsp;
                ncp_ts->nc_ticks = ticks;
                ncp_ts->nc_nc.nc_flag |= NCF_TS;
                if (dtsp != NULL) {
                        ncp_ts->nc_dotdottime = *dtsp;
                        ncp_ts->nc_nc.nc_flag |= NCF_DTS;
                }
        }
        len = ncp->nc_nlen = cnp->cn_namelen;
        hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
        memcpy(ncp->nc_name, cnp->cn_nameptr, len);
        ncp->nc_name[len] = '\0';
        cache_enter_lock(&cel, dvp, vp, hash);

        /*
         * See if this vnode or negative entry is already in the cache
         * with this name.  This can happen with concurrent lookups of
         * the same path name.
         */
        ncpp = NCHHASH(hash);
        CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
                if (n2->nc_dvp == dvp &&
                    n2->nc_nlen == cnp->cn_namelen &&
                    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
                        MPASS(cache_ncp_canuse(n2));
                        if ((n2->nc_flag & NCF_NEGATIVE) != 0)
                                KASSERT(vp == NULL,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, NULL, vp));
                        else
                                KASSERT(n2->nc_vp == vp,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, n2->nc_vp, vp));
                        /*
                         * Entries are supposed to be immutable unless in the
                         * process of getting destroyed. Accommodating for
                         * changing timestamps is possible but not worth it.
                         * This should be harmless in terms of correctness, in
                         * the worst case resulting in an earlier expiration.
                         * Alternatively, the found entry can be replaced
                         * altogether.
                         */
                        MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
#if 0
                        if (tsp != NULL) {
                                KASSERT((n2->nc_flag & NCF_TS) != 0,
                                    ("no NCF_TS"));
                                n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
                                n2_ts->nc_time = ncp_ts->nc_time;
                                n2_ts->nc_ticks = ncp_ts->nc_ticks;
                                if (dtsp != NULL) {
                                        n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
                                        n2_ts->nc_nc.nc_flag |= NCF_DTS;
                                }
                        }
#endif
                        SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
                            vp);
                        /* Keep the existing entry, discard the new one. */
                        goto out_unlock_free;
                }
        }

        if (flag == NCF_ISDOTDOT) {
                /*
                 * See if we are trying to add .. entry, but some other lookup
                 * has populated v_cache_dd pointer already.
                 */
                if (dvp->v_cache_dd != NULL)
                        goto out_unlock_free;
                KASSERT(vp == NULL || vp->v_type == VDIR,
                    ("wrong vnode type %p", vp));
                vn_seqc_write_begin(dvp);
                dvp->v_cache_dd = ncp;
                vn_seqc_write_end(dvp);
        }

        if (vp != NULL) {
                if (flag != NCF_ISDOTDOT) {
                        /*
                         * For this case, the cache entry maps both the
                         * directory name in it and the name ".." for the
                         * directory's parent.
                         *
                         * A ".." entry previously hanging off vp is zapped
                         * here and freed once the locks are dropped; any
                         * other kind of entry is merely unlinked from
                         * v_cache_dd.
                         */
                        vn_seqc_write_begin(vp);
                        if ((ndd = vp->v_cache_dd) != NULL) {
                                if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
                                        cache_zap_locked(ndd);
                                else
                                        ndd = NULL;
                        }
                        vp->v_cache_dd = ncp;
                        vn_seqc_write_end(vp);
                } else if (vp->v_type != VDIR) {
                        if (vp->v_cache_dd != NULL) {
                                vn_seqc_write_begin(vp);
                                vp->v_cache_dd = NULL;
                                vn_seqc_write_end(vp);
                        }
                }
        }

        if (flag != NCF_ISDOTDOT) {
                /*
                 * cache_hold_vnode() is called when dvp gets its first
                 * outgoing entry.
                 */
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        cache_hold_vnode(dvp);
                }
                LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
        }

        /*
         * If the entry is "negative", we place it into the
         * "negative" cache queue, otherwise, we place it into the
         * destination vnode's cache entries queue.
         */
        if (vp != NULL) {
                TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
                SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
                    vp);
        } else {
                if (cnp->cn_flags & ISWHITEOUT)
                        ncp->nc_flag |= NCF_WHITE;
                cache_neg_insert(ncp);
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    ncp->nc_name);
        }

        /*
         * Insert the new namecache entry into the appropriate chain
         * within the cache entries table.
         */
        CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

        /*
         * Release fence: everything stored above is ordered before the
         * NCF_WIP flag is cleared below.
         */
        atomic_thread_fence_rel();
        /*
         * Mark the entry as fully constructed.
         * It is immutable past this point until its removal.
         */
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

        cache_enter_unlock(&cel);
        /* Free the replaced ".." entry, if any, with no locks held. */
        if (ndd != NULL)
                cache_free(ndd);
        return;
out_unlock_free:
        cache_enter_unlock(&cel);
        cache_free(ncp);
        return;
}
2422
/*
 * Return the smallest power of two strictly greater than val.
 *
 * When no such power of two is representable in a u_int (val has its top
 * bit set), the result saturates at the largest representable power of two.
 * Without the saturation check the shift would wrap res around to 0, which
 * is always <= val, and the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
        /* Only the most significant bit set: the largest power of two. */
        const u_int top = ~(~(u_int)0 >> 1);
        u_int res;

        for (res = 1; res <= val && res != top; res <<= 1)
                continue;

        return (res);
}
2433
2434 static struct nchashhead *
2435 nchinittbl(u_long elements, u_long *hashmask)
2436 {
2437         struct nchashhead *hashtbl;
2438         u_long hashsize, i;
2439
2440         hashsize = cache_roundup_2(elements) / 2;
2441
2442         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2443         for (i = 0; i < hashsize; i++)
2444                 CK_SLIST_INIT(&hashtbl[i]);
2445         *hashmask = hashsize - 1;
2446         return (hashtbl);
2447 }
2448
2449 static void
2450 ncfreetbl(struct nchashhead *hashtbl)
2451 {
2452
2453         free(hashtbl, M_VFSCACHE);
2454 }
2455
2456 /*
2457  * Name cache initialization, from vfs_init() when we are booting
2458  */
2459 static void
2460 nchinit(void *dummy __unused)
2461 {
2462         u_int i;
2463
2464         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2465             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2466         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2467             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2468         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2469             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2470         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2471             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2472
2473         VFS_SMR_ZONE_SET(cache_zone_small);
2474         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2475         VFS_SMR_ZONE_SET(cache_zone_large);
2476         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2477
2478         ncsize = desiredvnodes * ncsizefactor;
2479         cache_recalc_neg_min(ncnegminpct);
2480         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2481         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2482         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2483                 ncbuckethash = 7;
2484         if (ncbuckethash > nchash)
2485                 ncbuckethash = nchash;
2486         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2487             M_WAITOK | M_ZERO);
2488         for (i = 0; i < numbucketlocks; i++)
2489                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2490         ncvnodehash = ncbuckethash;
2491         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2492             M_WAITOK | M_ZERO);
2493         for (i = 0; i < numvnodelocks; i++)
2494                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2495
2496         for (i = 0; i < numneglists; i++) {
2497                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2498                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2499                 TAILQ_INIT(&neglists[i].nl_list);
2500                 TAILQ_INIT(&neglists[i].nl_hotlist);
2501         }
2502 }
2503 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2504
2505 void
2506 cache_vnode_init(struct vnode *vp)
2507 {
2508
2509         LIST_INIT(&vp->v_cache_src);
2510         TAILQ_INIT(&vp->v_cache_dst);
2511         vp->v_cache_dd = NULL;
2512         cache_prehash(vp);
2513 }
2514
2515 void
2516 cache_changesize(u_long newmaxvnodes)
2517 {
2518         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2519         u_long new_nchash, old_nchash;
2520         struct namecache *ncp;
2521         uint32_t hash;
2522         u_long newncsize;
2523         int i;
2524
2525         newncsize = newmaxvnodes * ncsizefactor;
2526         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2527         if (newmaxvnodes < numbucketlocks)
2528                 newmaxvnodes = numbucketlocks;
2529
2530         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2531         /* If same hash table size, nothing to do */
2532         if (nchash == new_nchash) {
2533                 ncfreetbl(new_nchashtbl);
2534                 return;
2535         }
2536         /*
2537          * Move everything from the old hash table to the new table.
2538          * None of the namecache entries in the table can be removed
2539          * because to do so, they have to be removed from the hash table.
2540          */
2541         cache_lock_all_vnodes();
2542         cache_lock_all_buckets();
2543         old_nchashtbl = nchashtbl;
2544         old_nchash = nchash;
2545         nchashtbl = new_nchashtbl;
2546         nchash = new_nchash;
2547         for (i = 0; i <= old_nchash; i++) {
2548                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2549                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2550                             ncp->nc_dvp);
2551                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2552                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2553                 }
2554         }
2555         ncsize = newncsize;
2556         cache_recalc_neg_min(ncnegminpct);
2557         cache_unlock_all_buckets();
2558         cache_unlock_all_vnodes();
2559         ncfreetbl(old_nchashtbl);
2560 }
2561
/*
 * Invalidate all entries from and to a particular vnode.
 *
 * Every entry on the vnode's v_cache_src and v_cache_dst lists, plus the
 * entry referenced by v_cache_dd, is zapped.  Zapped entries are collected on
 * a local batch and handed to cache_free_batch() only after the vnode locks
 * have been released.
 */
static void
cache_purge_impl(struct vnode *vp)
{
        struct cache_freebatch batch;
        struct namecache *ncp;
        struct mtx *vlp, *vlp2;

        TAILQ_INIT(&batch);
        vlp = VP2VNODELOCK(vp);
        /*
         * Second lock slot handed to cache_zap_locked_vnode_kl2(); whatever
         * it leaves there is unlocked at the end of the function.
         */
        vlp2 = NULL;
        mtx_lock(vlp);
retry:
        /*
         * Whenever a zap attempt reports failure the scan restarts from the
         * top and the list heads are fetched again.
         */
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
        }
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
        }
        /*
         * With both lists drained, anything still referenced by v_cache_dd
         * is expected to be a ".." entry.
         */
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                   ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        cache_free_batch(&batch);
}
2603
2604 /*
2605  * Opportunistic check to see if there is anything to do.
2606  */
2607 static bool
2608 cache_has_entries(struct vnode *vp)
2609 {
2610
2611         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2612             vp->v_cache_dd == NULL)
2613                 return (false);
2614         return (true);
2615 }
2616
2617 void
2618 cache_purge(struct vnode *vp)
2619 {
2620
2621         SDT_PROBE1(vfs, namecache, purge, done, vp);
2622         if (!cache_has_entries(vp))
2623                 return;
2624         cache_purge_impl(vp);
2625 }
2626
/*
 * Only to be used by vgone.
 *
 * The vnode must already be doomed.  When no entries are visible up front,
 * the vnode lock is waited on once and the check is repeated before giving
 * up.
 */
void
cache_purge_vgone(struct vnode *vp)
{

        VNPASS(VN_IS_DOOMED(vp), vp);
        if (!cache_has_entries(vp)) {
                /*
                 * Serialize against a potential thread doing cache_purge.
                 */
                mtx_wait_unlocked(VP2VNODELOCK(vp));
                if (!cache_has_entries(vp))
                        return;
        }
        cache_purge_impl(vp);
}
2652
2653 /*
2654  * Invalidate all negative entries for a particular directory vnode.
2655  */
2656 void
2657 cache_purge_negative(struct vnode *vp)
2658 {
2659         struct cache_freebatch batch;
2660         struct namecache *ncp, *nnp;
2661         struct mtx *vlp;
2662
2663         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2664         if (LIST_EMPTY(&vp->v_cache_src))
2665                 return;
2666         TAILQ_INIT(&batch);
2667         vlp = VP2VNODELOCK(vp);
2668         mtx_lock(vlp);
2669         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2670                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2671                         continue;
2672                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2673                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2674         }
2675         mtx_unlock(vlp);
2676         cache_free_batch(&batch);
2677 }
2678
2679 /*
2680  * Entry points for modifying VOP operations.
2681  */
2682 void
2683 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2684     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2685 {
2686
2687         ASSERT_VOP_IN_SEQC(fdvp);
2688         ASSERT_VOP_IN_SEQC(fvp);
2689         ASSERT_VOP_IN_SEQC(tdvp);
2690         if (tvp != NULL)
2691                 ASSERT_VOP_IN_SEQC(tvp);
2692
2693         cache_purge(fvp);
2694         if (tvp != NULL) {
2695                 cache_purge(tvp);
2696                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2697                     ("%s: lingering negative entry", __func__));
2698         } else {
2699                 cache_remove_cnp(tdvp, tcnp);
2700         }
2701 }
2702
/*
 * Name cache maintenance for rmdir: every entry from and to the removed
 * directory vp is purged.  dvp is only covered by the assertion.
 */
void
cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
{
        ASSERT_VOP_IN_SEQC(dvp);
        ASSERT_VOP_IN_SEQC(vp);

        cache_purge(vp);
}
2711
#ifdef INVARIANTS
/*
 * Validate that if an entry exists it matches.
 *
 * Walks the hash chain for (dvp, name) under the bucket lock and panics if
 * an entry with the same directory and name resolves to a vnode other than
 * vp.
 */
void
cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        uint32_t hash;

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        /* Unlocked shortcut for an empty chain. */
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                return;
        blp = HASH2BUCKETLOCK(hash);
        mtx_lock(blp);
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                /* Skip entries for another directory or another name. */
                if (ncp->nc_dvp != dvp)
                        continue;
                if (ncp->nc_nlen != cnp->cn_namelen)
                        continue;
                if (bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) != 0)
                        continue;
                if (ncp->nc_vp == vp)
                        continue;
                panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
                    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
                    ncp->nc_vp);
        }
        mtx_unlock(blp);
}
#endif
2740
2741 /*
2742  * Flush all entries referencing a particular filesystem.
2743  */
2744 void
2745 cache_purgevfs(struct mount *mp)
2746 {
2747         struct vnode *vp, *mvp;
2748
2749         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2750         /*
2751          * Somewhat wasteful iteration over all vnodes. Would be better to
2752          * support filtering and avoid the interlock to begin with.
2753          */
2754         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2755                 if (!cache_has_entries(vp)) {
2756                         VI_UNLOCK(vp);
2757                         continue;
2758                 }
2759                 vholdl(vp);
2760                 VI_UNLOCK(vp);
2761                 cache_purge(vp);
2762                 vdrop(vp);
2763         }
2764 }
2765
2766 /*
2767  * Perform canonical checks and cache lookup and pass on to filesystem
2768  * through the vop_cachedlookup only if needed.
2769  */
2770
2771 int
2772 vfs_cache_lookup(struct vop_lookup_args *ap)
2773 {
2774         struct vnode *dvp;
2775         int error;
2776         struct vnode **vpp = ap->a_vpp;
2777         struct componentname *cnp = ap->a_cnp;
2778         int flags = cnp->cn_flags;
2779
2780         *vpp = NULL;
2781         dvp = ap->a_dvp;
2782
2783         if (dvp->v_type != VDIR)
2784                 return (ENOTDIR);
2785
2786         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2787             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2788                 return (EROFS);
2789
2790         error = vn_dir_check_exec(dvp, cnp);
2791         if (error != 0)
2792                 return (error);
2793
2794         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2795         if (error == 0)
2796                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2797         if (error == -1)
2798                 return (0);
2799         return (error);
2800 }
2801
2802 /* Implementation of the getcwd syscall. */
2803 int
2804 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2805 {
2806         char *buf, *retbuf;
2807         size_t buflen;
2808         int error;
2809
2810         buflen = uap->buflen;
2811         if (__predict_false(buflen < 2))
2812                 return (EINVAL);
2813         if (buflen > MAXPATHLEN)
2814                 buflen = MAXPATHLEN;
2815
2816         buf = uma_zalloc(namei_zone, M_WAITOK);
2817         error = vn_getcwd(buf, &retbuf, &buflen);
2818         if (error == 0)
2819                 error = copyout(retbuf, uap->buf, buflen);
2820         uma_zfree(namei_zone, buf);
2821         return (error);
2822 }
2823
/*
 * Resolve the current working directory of curthread into buf.
 *
 * The lockless variant is tried first; a negative return from it means it
 * could not produce an answer and the lookup is redone with vn_fullpath_any()
 * on a held pwd.  On success *retbuf points at the start of the path inside
 * buf and *buflen has been updated by the resolver that succeeded.
 *
 * Returns 0 or the error from the resolver.
 */
int
vn_getcwd(char *buf, char **retbuf, size_t *buflen)
{
        struct pwd *pwd;
        int error;

        vfs_smr_enter();
        pwd = pwd_get_smr();
        error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
            buflen, 0);
        /* The SMR section entered above is left by vn_fullpath_any_smr(). */
        VFS_SMR_ASSERT_NOT_ENTERED();
        if (error < 0) {
                pwd = pwd_hold(curthread);
                error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
                    retbuf, buflen);
                pwd_drop(pwd);
        }

#ifdef KTRACE
        /* Only a successfully resolved path is traced. */
        if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
                ktrnamei(*retbuf);
#endif
        return (error);
}
2848
2849 static int
2850 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2851     size_t size, int flags, enum uio_seg pathseg)
2852 {
2853         struct nameidata nd;
2854         char *retbuf, *freebuf;
2855         int error;
2856
2857         if (flags != 0)
2858                 return (EINVAL);
2859         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2860             pathseg, path, fd, &cap_fstat_rights, td);
2861         if ((error = namei(&nd)) != 0)
2862                 return (error);
2863         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2864         if (error == 0) {
2865                 error = copyout(retbuf, buf, size);
2866                 free(freebuf, M_TEMP);
2867         }
2868         NDFREE(&nd, 0);
2869         return (error);
2870 }
2871
2872 int
2873 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2874 {
2875
2876         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2877             uap->flags, UIO_USERSPACE));
2878 }
2879
2880 /*
2881  * Retrieve the full filesystem path that correspond to a vnode from the name
2882  * cache (if available)
2883  */
2884 int
2885 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2886 {
2887         struct pwd *pwd;
2888         char *buf;
2889         size_t buflen;
2890         int error;
2891
2892         if (__predict_false(vp == NULL))
2893                 return (EINVAL);
2894
2895         buflen = MAXPATHLEN;
2896         buf = malloc(buflen, M_TEMP, M_WAITOK);
2897         vfs_smr_enter();
2898         pwd = pwd_get_smr();
2899         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2900         VFS_SMR_ASSERT_NOT_ENTERED();
2901         if (error < 0) {
2902                 pwd = pwd_hold(curthread);
2903                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2904                 pwd_drop(pwd);
2905         }
2906         if (error == 0)
2907                 *freebuf = buf;
2908         else
2909                 free(buf, M_TEMP);
2910         return (error);
2911 }
2912
2913 /*
2914  * This function is similar to vn_fullpath, but it attempts to lookup the
2915  * pathname relative to the global root mount point.  This is required for the
2916  * auditing sub-system, as audited pathnames must be absolute, relative to the
2917  * global root mount point.
2918  */
2919 int
2920 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2921 {
2922         char *buf;
2923         size_t buflen;
2924         int error;
2925
2926         if (__predict_false(vp == NULL))
2927                 return (EINVAL);
2928         buflen = MAXPATHLEN;
2929         buf = malloc(buflen, M_TEMP, M_WAITOK);
2930         vfs_smr_enter();
2931         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2932         VFS_SMR_ASSERT_NOT_ENTERED();
2933         if (error < 0) {
2934                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2935         }
2936         if (error == 0)
2937                 *freebuf = buf;
2938         else
2939                 free(buf, M_TEMP);
2940         return (error);
2941 }
2942
2943 static struct namecache *
2944 vn_dd_from_dst(struct vnode *vp)
2945 {
2946         struct namecache *ncp;
2947
2948         cache_assert_vnode_locked(vp);
2949         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2950                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2951                         return (ncp);
2952         }
2953         return (NULL);
2954 }
2955
/*
 * Translate *vp into a single path component and step to its parent.
 *
 * When the name cache has an entry for *vp, its name is copied into buf so
 * that it ends at offset *buflen, and *buflen is lowered by the length of
 * the name.  No separator is written.  Otherwise the request is passed to
 * VOP_VPTOCNP() with the same buffer arguments.
 *
 * The reference the caller holds on *vp is released on every path.  On
 * success 0 is returned and *vp is the parent directory.  On failure an
 * errno value is returned: ENOMEM when the cached name does not fit, ENOENT
 * when the vnode returned by the filesystem is doomed, or the
 * VOP_VPTOCNP() error.
 */
int
vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        /*
         * Use v_cache_dd directly when it is not a ".." entry; otherwise
         * search the v_cache_dst list for a usable entry.
         */
        ncp = (*vp)->v_cache_dd;
        if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
                KASSERT(ncp == vn_dd_from_dst(*vp),
                    ("%s: mismatch for dd entry (%p != %p)", __func__,
                    ncp, vn_dd_from_dst(*vp)));
        } else {
                ncp = vn_dd_from_dst(*vp);
        }
        if (ncp != NULL) {
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                /* Names are laid out back to front, from the buffer tail. */
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                /*
                 * Reference the parent while the vnode lock is still held
                 * and drop the old vnode only after unlocking.  dvp
                 * temporarily names the old vnode here.
                 */
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        /* Nothing cached: ask the filesystem with the vnode share-locked. */
        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (VN_IS_DOOMED(dvp)) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
3021
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.
 *
 * The path is assembled back to front in buf, whose usable size is *len.
 * With addend == 0 the function writes the terminating NUL itself.  A
 * non-zero addend tells it that the caller has already placed that many
 * bytes of trailing path (starting with '/') right behind the *len bytes;
 * they are added to the length reported on success.
 *
 * The reference on vp is consumed on every path.  On success 0 is returned,
 * *retbuf points at the first character of the path and *len is the number
 * of bytes from there to the end of the buffer contents, terminator
 * included.  On failure an errno value is returned.
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;
        size_t buflen;
        int error;
        bool slash_prefixed;

        VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
        VNPASS(vp->v_usecount > 0, vp);

        buflen = *len;

        /*
         * slash_prefixed tracks whether the text assembled so far begins
         * with '/'.  That is the case for a caller-supplied tail but not
         * for the bare terminator written here.
         */
        slash_prefixed = true;
        if (addend == 0) {
                MPASS(*len >= 2);
                buflen--;
                buf[buflen] = '\0';
                slash_prefixed = false;
        }

        error = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if (VN_IS_DOOMED(vp) ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        /*
                         * Cross the mount point: continue from the covered
                         * vnode without emitting a path component.
                         */
                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                /* On failure vn_vptocnp() has already released vp. */
                error = vn_vptocnp(&vp, buf, &buflen);
                if (error)
                        break;
                /* Room is still needed for the separator before the name. */
                if (buflen == 0) {
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = true;
        }
        if (error)
                return (error);
        /*
         * No component was emitted, i.e. vp itself is the root being
         * resolved against: the path is a lone "/".
         */
        if (!slash_prefixed) {
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        *retbuf = buf + buflen;
        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        /* Report bytes used: consumed part of buf plus the caller's tail. */
        *len -= buflen;
        *len += addend;
        return (0);
}
3135
/*
 * Record the source line at which the lockless reverse lookup gave up.  The
 * value is stored in *reason, which the caller reports through an SDT probe.
 * Use the cache_rev_failed() wrapper so that __LINE__ refers to the call
 * site.
 */
static void __inline
cache_rev_failed_impl(int *reason, int line)
{

        *reason = line;
}
#define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)

/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
3152
/*
 * Lockless variant of the reverse lookup, to be called inside a VFS SMR
 * section.  The section is exited on every return path.
 *
 * The walk goes from vp towards rdir (or rootvnode) using only v_cache_dd
 * links and sequence counters.  Whenever a vnode is seen in modification, a
 * counter changes under us or no usable entry exists, the attempt is
 * abandoned.
 *
 * Returns 0 on success, with *retbuf pointing at the path and *buflen set to
 * the number of bytes used including addend.  Returns -1 when the fast path
 * is disabled or had to be abandoned, and ENOMEM when a name does not fit.
 * On -1 and ENOMEM *buflen is restored to the value it had before the walk,
 * except for the early return taken when the fast path is disabled, which
 * does not touch it.
 *
 * addend has the same meaning as for vn_fullpath_dir(): zero makes this
 * function write the terminating NUL, non-zero is the size of a tail the
 * caller already placed behind the buffer space described by *buflen.
 */
static int
vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *tvp;
        struct mount *mp;
        struct namecache *ncp;
        size_t orig_buflen;
        int reason;
        int error;
#ifdef KDTRACE_HOOKS
        int i;
#endif
        seqc_t vp_seqc, tvp_seqc;
        u_char nc_flag;

        VFS_SMR_ASSERT_ENTERED();

        if (!cache_fast_revlookup) {
                vfs_smr_exit();
                return (-1);
        }

        /* Saved so that an abort can undo the buffer accounting. */
        orig_buflen = *buflen;

        if (addend == 0) {
                MPASS(*buflen >= 2);
                *buflen -= 1;
                buf[*buflen] = '\0';
        }

        /* Already at the root: the path is "/" or just the caller's tail. */
        if (vp == rdir || vp == rootvnode) {
                if (addend == 0) {
                        *buflen -= 1;
                        buf[*buflen] = '/';
                }
                goto out_ok;
        }

#ifdef KDTRACE_HOOKS
        /* Counts loop iterations for the miss probe. */
        i = 0;
#endif
        error = -1;
        ncp = NULL; /* for sdt probe down below */
        vp_seqc = vn_seqc_read_any(vp);
        if (seqc_in_modify(vp_seqc)) {
                cache_rev_failed(&reason);
                goto out_abort;
        }

        for (;;) {
#ifdef KDTRACE_HOOKS
                i++;
#endif
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        /*
                         * Root of a mounted filesystem: hop to the covered
                         * vnode without emitting a component.
                         */
                        mp = atomic_load_ptr(&vp->v_mount);
                        if (mp == NULL) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        /*
                         * NOTE(review): tvp is used without a NULL check,
                         * while vn_fullpath_dir() treats a NULL
                         * mnt_vnodecovered as an unmount race -- confirm
                         * it cannot be NULL here.
                         */
                        tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
                        tvp_seqc = vn_seqc_read_any(tvp);
                        if (seqc_in_modify(tvp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        /*
                         * Re-validate the vnode we came from only after the
                         * next vnode's counter has been sampled.
                         */
                        if (!vn_seqc_consistent(vp, vp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        vp = tvp;
                        vp_seqc = tvp_seqc;
                        continue;
                }
                ncp = atomic_load_ptr(&vp->v_cache_dd);
                if (ncp == NULL) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* A ".." entry does not carry the name of vp. */
                nc_flag = atomic_load_char(&ncp->nc_flag);
                if ((nc_flag & NCF_ISDOTDOT) != 0) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                if (!cache_ncp_canuse(ncp)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* ">=" leaves room for the '/' written in front of the name. */
                if (ncp->nc_nlen >= *buflen) {
                        cache_rev_failed(&reason);
                        error = ENOMEM;
                        goto out_abort;
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                *buflen -= 1;
                buf[*buflen] = '/';
                tvp = ncp->nc_dvp;
                tvp_seqc = vn_seqc_read_any(tvp);
                if (seqc_in_modify(tvp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /*
                 * The bytes copied above are only trusted if vp did not
                 * change since its counter was sampled.
                 */
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                vp = tvp;
                vp_seqc = tvp_seqc;
                if (vp == rdir || vp == rootvnode)
                        break;
        }
out_ok:
        vfs_smr_exit();
        *retbuf = buf + *buflen;
        /* Bytes used in buf plus the caller-provided tail. */
        *buflen = orig_buflen - *buflen + addend;
        SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
        return (0);

out_abort:
        /* Undo the accounting so that a fallback starts from scratch. */
        *buflen = orig_buflen;
        SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
        vfs_smr_exit();
        return (error);
}
3281
3282 static int
3283 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3284     size_t *buflen)
3285 {
3286         size_t orig_buflen, addend;
3287         int error;
3288
3289         if (*buflen < 2)
3290                 return (EINVAL);
3291
3292         orig_buflen = *buflen;
3293
3294         vref(vp);
3295         addend = 0;
3296         if (vp->v_type != VDIR) {
3297                 *buflen -= 1;
3298                 buf[*buflen] = '\0';
3299                 error = vn_vptocnp(&vp, buf, buflen);
3300                 if (error)
3301                         return (error);
3302                 if (*buflen == 0) {
3303                         vrele(vp);
3304                         return (ENOMEM);
3305                 }
3306                 *buflen -= 1;
3307                 buf[*buflen] = '/';
3308                 addend = orig_buflen - *buflen;
3309         }
3310
3311         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3312 }
3313
3314 /*
3315  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3316  *
3317  * Since the namecache does not track hardlinks, the caller is expected to first
3318  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3319  *
3320  * Then we have 2 cases:
3321  * - if the found vnode is a directory, the path can be constructed just by
3322  *   following names up the chain
3323  * - otherwise we populate the buffer with the saved name and start resolving
3324  *   from the parent
3325  */
3326 static int
3327 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3328     size_t *buflen)
3329 {
3330         char *buf, *tmpbuf;
3331         struct pwd *pwd;
3332         struct componentname *cnp;
3333         struct vnode *vp;
3334         size_t addend;
3335         int error;
3336         enum vtype type;
3337
3338         if (*buflen < 2)
3339                 return (EINVAL);
3340         if (*buflen > MAXPATHLEN)
3341                 *buflen = MAXPATHLEN;
3342
3343         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3344
3345         addend = 0;
3346         vp = ndp->ni_vp;
3347         /*
3348          * Check for VBAD to work around the vp_crossmp bug in lookup().
3349          *
3350          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3351          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3352          * If the type is VDIR (like in this very case) we can skip looking
3353          * at ni_dvp in the first place. However, since vnodes get passed here
3354          * unlocked the target may transition to doomed state (type == VBAD)
3355          * before we get to evaluate the condition. If this happens, we will
3356          * populate part of the buffer and descend to vn_fullpath_dir with
3357          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3358          *
3359          * This should be atomic_load(&vp->v_type) but it is illegal to take
3360          * an address of a bit field, even if said field is sized to char.
3361          * Work around the problem by reading the value into a full-sized enum
3362          * and then re-reading it with atomic_load which will still prevent
3363          * the compiler from re-reading down the road.
3364          */
3365         type = vp->v_type;
3366         type = atomic_load_int(&type);
3367         if (type == VBAD) {
3368                 error = ENOENT;
3369                 goto out_bad;
3370         }
3371         if (type != VDIR) {
3372                 cnp = &ndp->ni_cnd;
3373                 addend = cnp->cn_namelen + 2;
3374                 if (*buflen < addend) {
3375                         error = ENOMEM;
3376                         goto out_bad;
3377                 }
3378                 *buflen -= addend;
3379                 tmpbuf = buf + *buflen;
3380                 tmpbuf[0] = '/';
3381                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3382                 tmpbuf[addend - 1] = '\0';
3383                 vp = ndp->ni_dvp;
3384         }
3385
3386         vfs_smr_enter();
3387         pwd = pwd_get_smr();
3388         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3389             addend);
3390         VFS_SMR_ASSERT_NOT_ENTERED();
3391         if (error < 0) {
3392                 pwd = pwd_hold(curthread);
3393                 vref(vp);
3394                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3395                     addend);
3396                 pwd_drop(pwd);
3397                 if (error != 0)
3398                         goto out_bad;
3399         }
3400
3401         *freebuf = buf;
3402
3403         return (0);
3404 out_bad:
3405         free(buf, M_TEMP);
3406         return (error);
3407 }
3408
3409 struct vnode *
3410 vn_dir_dd_ino(struct vnode *vp)
3411 {
3412         struct namecache *ncp;
3413         struct vnode *ddvp;
3414         struct mtx *vlp;
3415         enum vgetstate vs;
3416
3417         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3418         vlp = VP2VNODELOCK(vp);
3419         mtx_lock(vlp);
3420         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3421                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3422                         continue;
3423                 ddvp = ncp->nc_dvp;
3424                 vs = vget_prep(ddvp);
3425                 mtx_unlock(vlp);
3426                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3427                         return (NULL);
3428                 return (ddvp);
3429         }
3430         mtx_unlock(vlp);
3431         return (NULL);
3432 }
3433
3434 int
3435 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3436 {
3437         struct namecache *ncp;
3438         struct mtx *vlp;
3439         int l;
3440
3441         vlp = VP2VNODELOCK(vp);
3442         mtx_lock(vlp);
3443         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3444                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3445                         break;
3446         if (ncp == NULL) {
3447                 mtx_unlock(vlp);
3448                 return (ENOENT);
3449         }
3450         l = min(ncp->nc_nlen, buflen - 1);
3451         memcpy(buf, ncp->nc_name, l);
3452         mtx_unlock(vlp);
3453         buf[l] = '\0';
3454         return (0);
3455 }
3456
3457 /*
3458  * This function updates path string to vnode's full global path
3459  * and checks the size of the new path string against the pathlen argument.
3460  *
3461  * Requires a locked, referenced vnode.
3462  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3463  *
3464  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3465  * because it falls back to the ".." lookup if the namecache lookup fails.
3466  */
3467 int
3468 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3469     u_int pathlen)
3470 {
3471         struct nameidata nd;
3472         struct vnode *vp1;
3473         char *rpath, *fbuf;
3474         int error;
3475
3476         ASSERT_VOP_ELOCKED(vp, __func__);
3477
3478         /* Construct global filesystem path from vp. */
3479         VOP_UNLOCK(vp);
3480         error = vn_fullpath_global(vp, &rpath, &fbuf);
3481
3482         if (error != 0) {
3483                 vrele(vp);
3484                 return (error);
3485         }
3486
3487         if (strlen(rpath) >= pathlen) {
3488                 vrele(vp);
3489                 error = ENAMETOOLONG;
3490                 goto out;
3491         }
3492
3493         /*
3494          * Re-lookup the vnode by path to detect a possible rename.
3495          * As a side effect, the vnode is relocked.
3496          * If vnode was renamed, return ENOENT.
3497          */
3498         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3499             UIO_SYSSPACE, path, td);
3500         error = namei(&nd);
3501         if (error != 0) {
3502                 vrele(vp);
3503                 goto out;
3504         }
3505         NDFREE(&nd, NDF_ONLY_PNBUF);
3506         vp1 = nd.ni_vp;
3507         vrele(vp);
3508         if (vp1 == vp)
3509                 strcpy(path, rpath);
3510         else {
3511                 vput(vp1);
3512                 error = ENOENT;
3513         }
3514
3515 out:
3516         free(fbuf, M_TEMP);
3517         return (error);
3518 }
3519
3520 #ifdef DDB
3521 static void
3522 db_print_vpath(struct vnode *vp)
3523 {
3524
3525         while (vp != NULL) {
3526                 db_printf("%p: ", vp);
3527                 if (vp == rootvnode) {
3528                         db_printf("/");
3529                         vp = NULL;
3530                 } else {
3531                         if (vp->v_vflag & VV_ROOT) {
3532                                 db_printf("<mount point>");
3533                                 vp = vp->v_mount->mnt_vnodecovered;
3534                         } else {
3535                                 struct namecache *ncp;
3536                                 char *ncn;
3537                                 int i;
3538
3539                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3540                                 if (ncp != NULL) {
3541                                         ncn = ncp->nc_name;
3542                                         for (i = 0; i < ncp->nc_nlen; i++)
3543                                                 db_printf("%c", *ncn++);
3544                                         vp = ncp->nc_dvp;
3545                                 } else {
3546                                         vp = NULL;
3547                                 }
3548                         }
3549                 }
3550                 db_printf("\n");
3551         }
3552
3553         return;
3554 }
3555
3556 DB_SHOW_COMMAND(vpath, db_show_vpath)
3557 {
3558         struct vnode *vp;
3559
3560         if (!have_addr) {
3561                 db_printf("usage: show vpath <struct vnode *>\n");
3562                 return;
3563         }
3564
3565         vp = (struct vnode *)addr;
3566         db_print_vpath(vp);
3567 }
3568
3569 #endif
3570
/*
 * Run-time switch for the fast path lookup, exported read-write as the
 * vfs.cache_fast_lookup sysctl.  cache_can_fplookup() aborts the fast path
 * whenever this is false.
 */
3571 static bool __read_frequently cache_fast_lookup = true;
3572 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3573     &cache_fast_lookup, 0, "");
3574
/*
 * Sentinel returned by the fast path when it did not complete the lookup
 * (see cache_fpl_aborted_impl() and cache_fpl_partial_impl()).  It is never
 * a valid lookup result: cache_fpl_handled_impl() asserts that a handled
 * lookup does not return it.
 */
3575 #define CACHE_FPL_FAILED        -2020
3576
3577 /*
3578  * Components of nameidata (or objects it can point to) which may
3579  * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore_partial(): cn_flags, cn_namelen and cn_nameptr mirror
 * the fields of the same name in ndp->ni_cnd, ni_pathlen mirrors
 * ndp->ni_pathlen.
3580  */
3581 struct nameidata_saved {
3582         long cn_namelen;
3583         char *cn_nameptr;
3584         size_t ni_pathlen;
3585         int cn_flags;
3586 };
3587
/*
 * State of one fast path lookup.
 *
 * ndp      - nameidata of the lookup being performed
 * cnp      - component name being processed (presumably &ndp->ni_cnd;
 *            confirm against the code which initializes this structure)
 * pwd      - pwd structure used by the lookup; cache_fplookup_partial_setup()
 *            takes a hold on it with pwd_hold_smr()
 * dvp      - directory vnode currently being searched
 * tvp      - vnode found for the current component, NULL if none
 * dvp_seqc - sequence counter value associated with dvp; compared with
 * tvp_seqc   vn_seqc_consistent() once a reference is held (ditto for tvp)
 * snd      - checkpoint restored by cache_fplookup_partial_setup()
 * line     - source line which last set the status, for diagnostics
 * status   - outcome; CACHE_FPL_STATUS_UNSET until the lookup terminates
 * in_smr   - whether the SMR section is entered, maintained by the
 *            cache_fpl_smr_enter*()/cache_fpl_smr_exit() macros
 * fsearch  - value reported by fgetvp_lookup_smr() for ni_dirfd, stored by
 *            cache_fplookup_dirfd()
 */
3588 struct cache_fpl {
3589         struct nameidata *ndp;
3590         struct componentname *cnp;
3591         struct pwd *pwd;
3592         struct vnode *dvp;
3593         struct vnode *tvp;
3594         seqc_t dvp_seqc;
3595         seqc_t tvp_seqc;
3596         struct nameidata_saved snd;
3597         int line;
3598         enum cache_fpl_status status:8;
3599         bool in_smr;
3600         bool fsearch;
3601 };
3602
3603 static void
3604 cache_fpl_cleanup_cnp(struct componentname *cnp)
3605 {
3606
3607         uma_zfree(namei_zone, cnp->cn_pnbuf);
3608 #ifdef DIAGNOSTIC
3609         cnp->cn_pnbuf = NULL;
3610         cnp->cn_nameptr = NULL;
3611 #endif
3612 }
3613
3614 static struct vnode *
3615 cache_fpl_handle_root(struct cache_fpl *fpl)
3616 {
3617         struct nameidata *ndp;
3618         struct componentname *cnp;
3619
3620         ndp = fpl->ndp;
3621         cnp = fpl->cnp;
3622
3623         while (*(cnp->cn_nameptr) == '/') {
3624                 cnp->cn_nameptr++;
3625                 ndp->ni_pathlen--;
3626         }
3627
3628         return (ndp->ni_rootdir);
3629 }
3630
3631 static void
3632 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3633 {
3634
3635         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3636         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3637         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3638         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3639 }
3640
3641 static void
3642 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd)
3643 {
3644
3645         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3646         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3647         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3648         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3649 }
3650
3651 static void
3652 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd)
3653 {
3654
3655         cache_fpl_restore_partial(fpl, snd);
3656         /*
3657          * It is 0 on entry by API contract.
3658          */
3659         fpl->ndp->ni_resflags = 0;
3660 }
3661
/*
 * Helpers tracking whether the lookup is inside the VFS SMR section.
 *
 * fpl->in_smr mirrors the real SMR state.  The enter/exit macros below keep
 * the two in sync; the assert macros check both the flag and the real
 * state and compile to nothing without INVARIANTS.  All of them evaluate
 * their argument exactly once.
 */
3662 #ifdef INVARIANTS
/* Assert that the lookup is inside the SMR section. */
3663 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3664         struct cache_fpl *_fpl = (fpl);                         \
3665         MPASS(_fpl->in_smr == true);                            \
3666         VFS_SMR_ASSERT_ENTERED();                               \
3667 })
/* Assert that the lookup is outside of the SMR section. */
3668 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3669         struct cache_fpl *_fpl = (fpl);                         \
3670         MPASS(_fpl->in_smr == false);                           \
3671         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3672 })
3673 #else
3674 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3675 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3676 #endif
3677
/*
 * Enter the SMR section without checking the previous value of in_smr;
 * judging by the name this is meant for the first entry, when the field
 * may not be initialized yet -- confirm against the caller.
 */
3678 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3679         struct cache_fpl *_fpl = (fpl);                         \
3680         vfs_smr_enter();                                        \
3681         _fpl->in_smr = true;                                    \
3682 })
3683
/* Enter the SMR section; asserts that it is not already entered. */
3684 #define cache_fpl_smr_enter(fpl) ({                             \
3685         struct cache_fpl *_fpl = (fpl);                         \
3686         MPASS(_fpl->in_smr == false);                           \
3687         vfs_smr_enter();                                        \
3688         _fpl->in_smr = true;                                    \
3689 })
3690
/* Leave the SMR section; asserts that it is currently entered. */
3691 #define cache_fpl_smr_exit(fpl) ({                              \
3692         struct cache_fpl *_fpl = (fpl);                         \
3693         MPASS(_fpl->in_smr == true);                            \
3694         vfs_smr_exit();                                         \
3695         _fpl->in_smr = false;                                   \
3696 })
3697
3698 static int
3699 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3700 {
3701
3702         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3703                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3704                     ("%s: converting to abort from %d at %d, set at %d\n",
3705                     __func__, fpl->status, line, fpl->line));
3706         }
3707         fpl->status = CACHE_FPL_STATUS_ABORTED;
3708         fpl->line = line;
3709         return (CACHE_FPL_FAILED);
3710 }
3711
3712 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3713
3714 static int
3715 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3716 {
3717
3718         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3719             ("%s: setting to partial at %d, but already set to %d at %d\n",
3720             __func__, line, fpl->status, fpl->line));
3721         cache_fpl_smr_assert_entered(fpl);
3722         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3723         fpl->line = line;
3724         return (CACHE_FPL_FAILED);
3725 }
3726
3727 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3728
3729 static int
3730 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3731 {
3732
3733         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3734             ("%s: setting to handled at %d, but already set to %d at %d\n",
3735             __func__, line, fpl->status, fpl->line));
3736         cache_fpl_smr_assert_not_entered(fpl);
3737         MPASS(error != CACHE_FPL_FAILED);
3738         fpl->status = CACHE_FPL_STATUS_HANDLED;
3739         fpl->line = line;
3740         return (error);
3741 }
3742
3743 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3744
3745 static bool
3746 cache_fpl_terminated(struct cache_fpl *fpl)
3747 {
3748
3749         return (fpl->status != CACHE_FPL_STATUS_UNSET);
3750 }
3751
/*
 * cn_flags which a caller may pass and still get the fast path;
 * cache_can_fplookup() aborts if any other flag is set.
 */
3752 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3753         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
3754          FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \
3755          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3756
/*
 * cn_flags which the fast path sets on its own before calling into the
 * filesystem or handing over to the regular lookup (see e.g.
 * cache_fplookup_partial_setup()).
 */
3757 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3758         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3759
/* The two sets must stay disjoint. */
3760 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3761     "supported and internal flags overlap");
3762
/* Forward declarations; used below before the definitions appear. */
3763 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
3764 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
3765
3766 static bool
3767 cache_fpl_islastcn(struct nameidata *ndp)
3768 {
3769
3770         return (*ndp->ni_next == 0);
3771 }
3772
3773 static bool
3774 cache_fpl_isdotdot(struct componentname *cnp)
3775 {
3776
3777         if (cnp->cn_namelen == 2 &&
3778             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3779                 return (true);
3780         return (false);
3781 }
3782
3783 static bool
3784 cache_can_fplookup(struct cache_fpl *fpl)
3785 {
3786         struct nameidata *ndp;
3787         struct componentname *cnp;
3788         struct thread *td;
3789
3790         ndp = fpl->ndp;
3791         cnp = fpl->cnp;
3792         td = cnp->cn_thread;
3793
3794         if (!cache_fast_lookup) {
3795                 cache_fpl_aborted(fpl);
3796                 return (false);
3797         }
3798 #ifdef MAC
3799         if (mac_vnode_check_lookup_enabled()) {
3800                 cache_fpl_aborted(fpl);
3801                 return (false);
3802         }
3803 #endif
3804         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3805                 cache_fpl_aborted(fpl);
3806                 return (false);
3807         }
3808         if (IN_CAPABILITY_MODE(td)) {
3809                 cache_fpl_aborted(fpl);
3810                 return (false);
3811         }
3812         if (AUDITING_TD(td)) {
3813                 cache_fpl_aborted(fpl);
3814                 return (false);
3815         }
3816         if (ndp->ni_startdir != NULL) {
3817                 cache_fpl_aborted(fpl);
3818                 return (false);
3819         }
3820         return (true);
3821 }
3822
3823 static int
3824 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3825 {
3826         struct nameidata *ndp;
3827         int error;
3828         bool fsearch;
3829
3830         ndp = fpl->ndp;
3831         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3832         if (__predict_false(error != 0)) {
3833                 cache_fpl_smr_exit(fpl);
3834                 return (cache_fpl_aborted(fpl));
3835         }
3836         fpl->fsearch = fsearch;
3837         return (0);
3838 }
3839
3840 static bool
3841 cache_fplookup_vnode_supported(struct vnode *vp)
3842 {
3843
3844         return (vp->v_type != VLNK);
3845 }
3846
3847 static int __noinline
3848 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3849     uint32_t hash)
3850 {
3851         struct componentname *cnp;
3852         struct vnode *dvp;
3853
3854         cnp = fpl->cnp;
3855         dvp = fpl->dvp;
3856
3857         cache_fpl_smr_exit(fpl);
3858         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3859                 return (cache_fpl_handled(fpl, ENOENT));
3860         else
3861                 return (cache_fpl_aborted(fpl));
3862 }
3863
3864 /*
3865  * The target vnode is not supported, prepare for the slow path to take over.
3866  */
3867 static int __noinline
3868 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3869 {
3870         struct nameidata *ndp;
3871         struct componentname *cnp;
3872         enum vgetstate dvs;
3873         struct vnode *dvp;
3874         struct pwd *pwd;
3875         seqc_t dvp_seqc;
3876
3877         ndp = fpl->ndp;
3878         cnp = fpl->cnp;
3879         pwd = fpl->pwd;
3880         dvp = fpl->dvp;
3881         dvp_seqc = fpl->dvp_seqc;
3882
3883         if (!pwd_hold_smr(pwd)) {
3884                 cache_fpl_smr_exit(fpl);
3885                 return (cache_fpl_aborted(fpl));
3886         }
3887
3888         /*
3889          * Note that seqc is checked before the vnode is locked, so by
3890          * the time regular lookup gets to it it may have moved.
3891          *
3892          * Ultimately this does not affect correctness, any lookup errors
3893          * are userspace racing with itself. It is guaranteed that any
3894          * path which ultimatley gets found could also have been found
3895          * by regular lookup going all the way in absence of concurrent
3896          * modifications.
3897          */
3898         dvs = vget_prep_smr(dvp);
3899         cache_fpl_smr_exit(fpl);
3900         if (__predict_false(dvs == VGET_NONE)) {
3901                 pwd_drop(pwd);
3902                 return (cache_fpl_aborted(fpl));
3903         }
3904
3905         vget_finish_ref(dvp, dvs);
3906         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3907                 vrele(dvp);
3908                 pwd_drop(pwd);
3909                 return (cache_fpl_aborted(fpl));
3910         }
3911
3912         cache_fpl_restore_partial(fpl, &fpl->snd);
3913
3914         ndp->ni_startdir = dvp;
3915         cnp->cn_flags |= MAKEENTRY;
3916         if (cache_fpl_islastcn(ndp))
3917                 cnp->cn_flags |= ISLASTCN;
3918         if (cache_fpl_isdotdot(cnp))
3919                 cnp->cn_flags |= ISDOTDOT;
3920
3921         return (0);
3922 }
3923
3924 static int
3925 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3926 {
3927         struct componentname *cnp;
3928         struct vnode *tvp;
3929         seqc_t tvp_seqc;
3930         int error, lkflags;
3931
3932         cnp = fpl->cnp;
3933         tvp = fpl->tvp;
3934         tvp_seqc = fpl->tvp_seqc;
3935
3936         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3937                 lkflags = LK_SHARED;
3938                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3939                         lkflags = LK_EXCLUSIVE;
3940                 error = vget_finish(tvp, lkflags, tvs);
3941                 if (__predict_false(error != 0)) {
3942                         return (cache_fpl_aborted(fpl));
3943                 }
3944         } else {
3945                 vget_finish_ref(tvp, tvs);
3946         }
3947
3948         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3949                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3950                         vput(tvp);
3951                 else
3952                         vrele(tvp);
3953                 return (cache_fpl_aborted(fpl));
3954         }
3955
3956         return (cache_fpl_handled(fpl, 0));
3957 }
3958
3959 /*
3960  * They want to possibly modify the state of the namecache.
3961  */
3962 static int __noinline
3963 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3964 {
3965         struct nameidata *ndp;
3966         struct componentname *cnp;
3967         enum vgetstate dvs;
3968         struct vnode *dvp, *tvp;
3969         struct mount *mp;
3970         seqc_t dvp_seqc;
3971         int error;
3972         bool docache;
3973
3974         ndp = fpl->ndp;
3975         cnp = fpl->cnp;
3976         dvp = fpl->dvp;
3977         dvp_seqc = fpl->dvp_seqc;
3978
3979         MPASS(cache_fpl_islastcn(ndp));
3980         if ((cnp->cn_flags & LOCKPARENT) == 0)
3981                 MPASS((cnp->cn_flags & WANTPARENT) != 0);
3982         MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
3983         MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
3984             cnp->cn_nameiop == RENAME);
3985         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
3986         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
3987
3988         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
3989         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
3990                 docache = false;
3991
3992         mp = atomic_load_ptr(&dvp->v_mount);
3993         if (__predict_false(mp == NULL)) {
3994                 return (cache_fpl_aborted(fpl));
3995         }
3996
3997         if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
3998                 cache_fpl_smr_exit(fpl);
3999                 /*
4000                  * Original code keeps not checking for CREATE which
4001                  * might be a bug. For now let the old lookup decide.
4002                  */
4003                 if (cnp->cn_nameiop == CREATE) {
4004                         return (cache_fpl_aborted(fpl));
4005                 }
4006                 return (cache_fpl_handled(fpl, EROFS));
4007         }
4008
4009         if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4010                 cache_fpl_smr_exit(fpl);
4011                 return (cache_fpl_handled(fpl, EEXIST));
4012         }
4013
4014         /*
4015          * Secure access to dvp; check cache_fplookup_partial_setup for
4016          * reasoning.
4017          *
4018          * XXX At least UFS requires its lookup routine to be called for
4019          * the last path component, which leads to some level of complicaton
4020          * and inefficiency:
4021          * - the target routine always locks the target vnode, but our caller
4022          *   may not need it locked
4023          * - some of the VOP machinery asserts that the parent is locked, which
4024          *   once more may be not required
4025          *
4026          * TODO: add a flag for filesystems which don't need this.
4027          */
4028         dvs = vget_prep_smr(dvp);
4029         cache_fpl_smr_exit(fpl);
4030         if (__predict_false(dvs == VGET_NONE)) {
4031                 return (cache_fpl_aborted(fpl));
4032         }
4033
4034         vget_finish_ref(dvp, dvs);
4035         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4036                 vrele(dvp);
4037                 return (cache_fpl_aborted(fpl));
4038         }
4039
4040         error = vn_lock(dvp, LK_EXCLUSIVE);
4041         if (__predict_false(error != 0)) {
4042                 vrele(dvp);
4043                 return (cache_fpl_aborted(fpl));
4044         }
4045
4046         tvp = NULL;
4047         cnp->cn_flags |= ISLASTCN;
4048         if (docache)
4049                 cnp->cn_flags |= MAKEENTRY;
4050         if (cache_fpl_isdotdot(cnp))
4051                 cnp->cn_flags |= ISDOTDOT;
4052         cnp->cn_lkflags = LK_EXCLUSIVE;
4053         error = VOP_LOOKUP(dvp, &tvp, cnp);
4054         switch (error) {
4055         case EJUSTRETURN:
4056         case 0:
4057                 break;
4058         case ENOTDIR:
4059         case ENOENT:
4060                 vput(dvp);
4061                 return (cache_fpl_handled(fpl, error));
4062         default:
4063                 vput(dvp);
4064                 return (cache_fpl_aborted(fpl));
4065         }
4066
4067         fpl->tvp = tvp;
4068
4069         if (tvp == NULL) {
4070                 if ((cnp->cn_flags & SAVESTART) != 0) {
4071                         ndp->ni_startdir = dvp;
4072                         vrefact(ndp->ni_startdir);
4073                         cnp->cn_flags |= SAVENAME;
4074                 }
4075                 MPASS(error == EJUSTRETURN);
4076                 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4077                         VOP_UNLOCK(dvp);
4078                 }
4079                 return (cache_fpl_handled(fpl, 0));
4080         }
4081
4082         /*
4083          * There are very hairy corner cases concerning various flag combinations
4084          * and locking state. In particular here we only hold one lock instead of
4085          * two.
4086          *
4087          * Skip the complexity as it is of no significance for normal workloads.
4088          */
4089         if (__predict_false(tvp == dvp)) {
4090                 vput(dvp);
4091                 vrele(tvp);
4092                 return (cache_fpl_aborted(fpl));
4093         }
4094
4095         /*
4096          * Check if the target is either a symlink or a mount point.
4097          * Since we expect this to be the terminal vnode it should
4098          * almost never be true.
4099          */
4100         if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
4101             cache_fplookup_is_mp(fpl))) {
4102                 vput(dvp);
4103                 vput(tvp);
4104                 return (cache_fpl_aborted(fpl));
4105         }
4106
4107         if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4108                 vput(dvp);
4109                 vput(tvp);
4110                 return (cache_fpl_handled(fpl, EEXIST));
4111         }
4112
4113         if ((cnp->cn_flags & LOCKLEAF) == 0) {
4114                 VOP_UNLOCK(tvp);
4115         }
4116
4117         if ((cnp->cn_flags & LOCKPARENT) == 0) {
4118                 VOP_UNLOCK(dvp);
4119         }
4120
4121         if ((cnp->cn_flags & SAVESTART) != 0) {
4122                 ndp->ni_startdir = dvp;
4123                 vrefact(ndp->ni_startdir);
4124                 cnp->cn_flags |= SAVENAME;
4125         }
4126
4127         return (cache_fpl_handled(fpl, 0));
4128 }
4129
4130 static int __noinline
4131 cache_fplookup_modifying(struct cache_fpl *fpl)
4132 {
4133         struct nameidata *ndp;
4134
4135         ndp = fpl->ndp;
4136
4137         if (!cache_fpl_islastcn(ndp)) {
4138                 return (cache_fpl_partial(fpl));
4139         }
4140         return  (cache_fplookup_final_modifying(fpl));
4141 }
4142
4143 static int __noinline
4144 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4145 {
4146         struct componentname *cnp;
4147         enum vgetstate dvs, tvs;
4148         struct vnode *dvp, *tvp;
4149         seqc_t dvp_seqc;
4150         int error;
4151
4152         cnp = fpl->cnp;
4153         dvp = fpl->dvp;
4154         dvp_seqc = fpl->dvp_seqc;
4155         tvp = fpl->tvp;
4156
4157         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4158
4159         /*
4160          * This is less efficient than it can be for simplicity.
4161          */
4162         dvs = vget_prep_smr(dvp);
4163         if (__predict_false(dvs == VGET_NONE)) {
4164                 return (cache_fpl_aborted(fpl));
4165         }
4166         tvs = vget_prep_smr(tvp);
4167         if (__predict_false(tvs == VGET_NONE)) {
4168                 cache_fpl_smr_exit(fpl);
4169                 vget_abort(dvp, dvs);
4170                 return (cache_fpl_aborted(fpl));
4171         }
4172
4173         cache_fpl_smr_exit(fpl);
4174
4175         if ((cnp->cn_flags & LOCKPARENT) != 0) {
4176                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4177                 if (__predict_false(error != 0)) {
4178                         vget_abort(tvp, tvs);
4179                         return (cache_fpl_aborted(fpl));
4180                 }
4181         } else {
4182                 vget_finish_ref(dvp, dvs);
4183         }
4184
4185         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4186                 vget_abort(tvp, tvs);
4187                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4188                         vput(dvp);
4189                 else
4190                         vrele(dvp);
4191                 return (cache_fpl_aborted(fpl));
4192         }
4193
4194         error = cache_fplookup_final_child(fpl, tvs);
4195         if (__predict_false(error != 0)) {
4196                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
4197                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4198                         vput(dvp);
4199                 else
4200                         vrele(dvp);
4201                 return (error);
4202         }
4203
4204         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4205         return (0);
4206 }
4207
4208 static int
4209 cache_fplookup_final(struct cache_fpl *fpl)
4210 {
4211         struct componentname *cnp;
4212         enum vgetstate tvs;
4213         struct vnode *dvp, *tvp;
4214         seqc_t dvp_seqc;
4215
4216         cnp = fpl->cnp;
4217         dvp = fpl->dvp;
4218         dvp_seqc = fpl->dvp_seqc;
4219         tvp = fpl->tvp;
4220
4221         if (cnp->cn_nameiop != LOOKUP) {
4222                 return (cache_fplookup_final_modifying(fpl));
4223         }
4224
4225         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4226                 return (cache_fplookup_final_withparent(fpl));
4227
4228         tvs = vget_prep_smr(tvp);
4229         if (__predict_false(tvs == VGET_NONE)) {
4230                 return (cache_fpl_partial(fpl));
4231         }
4232
4233         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4234                 cache_fpl_smr_exit(fpl);
4235                 vget_abort(tvp, tvs);
4236                 return (cache_fpl_aborted(fpl));
4237         }
4238
4239         cache_fpl_smr_exit(fpl);
4240         return (cache_fplookup_final_child(fpl, tvs));
4241 }
4242
4243 /*
4244  * Comment from locked lookup:
4245  * Check for degenerate name (e.g. / or "") which is a way of talking about a
4246  * directory, e.g. like "/." or ".".
4247  */
4248 static int __noinline
4249 cache_fplookup_degenerate(struct cache_fpl *fpl)
4250 {
4251         struct componentname *cnp;
4252         struct vnode *dvp;
4253         enum vgetstate dvs;
4254         int error, lkflags;
4255
4256         fpl->tvp = fpl->dvp;
4257         fpl->tvp_seqc = fpl->dvp_seqc;
4258
4259         cnp = fpl->cnp;
4260         dvp = fpl->dvp;
4261
4262         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4263                 cache_fpl_smr_exit(fpl);
4264                 return (cache_fpl_handled(fpl, EISDIR));
4265         }
4266
4267         MPASS((cnp->cn_flags & SAVESTART) == 0);
4268
4269         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4270                 return (cache_fplookup_final_withparent(fpl));
4271         }
4272
4273         dvs = vget_prep_smr(dvp);
4274         cache_fpl_smr_exit(fpl);
4275         if (__predict_false(dvs == VGET_NONE)) {
4276                 return (cache_fpl_aborted(fpl));
4277         }
4278
4279         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4280                 lkflags = LK_SHARED;
4281                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4282                         lkflags = LK_EXCLUSIVE;
4283                 error = vget_finish(dvp, lkflags, dvs);
4284                 if (__predict_false(error != 0)) {
4285                         return (cache_fpl_aborted(fpl));
4286                 }
4287         } else {
4288                 vget_finish_ref(dvp, dvs);
4289         }
4290         return (cache_fpl_handled(fpl, 0));
4291 }
4292
4293 static int __noinline
4294 cache_fplookup_noentry(struct cache_fpl *fpl)
4295 {
4296         struct nameidata *ndp;
4297         struct componentname *cnp;
4298         enum vgetstate dvs;
4299         struct vnode *dvp, *tvp;
4300         seqc_t dvp_seqc;
4301         int error;
4302         bool docache;
4303
4304         ndp = fpl->ndp;
4305         cnp = fpl->cnp;
4306         dvp = fpl->dvp;
4307         dvp_seqc = fpl->dvp_seqc;
4308
4309         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4310         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4311         MPASS(!cache_fpl_isdotdot(cnp));
4312
4313         if (cnp->cn_nameiop != LOOKUP) {
4314                 fpl->tvp = NULL;
4315                 return (cache_fplookup_modifying(fpl));
4316         }
4317
4318         MPASS((cnp->cn_flags & SAVESTART) == 0);
4319
4320         /*
4321          * Only try to fill in the component if it is the last one,
4322          * otherwise not only there may be several to handle but the
4323          * walk may be complicated.
4324          */
4325         if (!cache_fpl_islastcn(ndp)) {
4326                 return (cache_fpl_partial(fpl));
4327         }
4328
4329         /*
4330          * Secure access to dvp; check cache_fplookup_partial_setup for
4331          * reasoning.
4332          */
4333         dvs = vget_prep_smr(dvp);
4334         cache_fpl_smr_exit(fpl);
4335         if (__predict_false(dvs == VGET_NONE)) {
4336                 return (cache_fpl_aborted(fpl));
4337         }
4338
4339         vget_finish_ref(dvp, dvs);
4340         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4341                 vrele(dvp);
4342                 return (cache_fpl_aborted(fpl));
4343         }
4344
4345         error = vn_lock(dvp, LK_SHARED);
4346         if (__predict_false(error != 0)) {
4347                 vrele(dvp);
4348                 return (cache_fpl_aborted(fpl));
4349         }
4350
4351         tvp = NULL;
4352         /*
4353          * TODO: provide variants which don't require locking either vnode.
4354          */
4355         cnp->cn_flags |= ISLASTCN;
4356         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4357         if (docache)
4358                 cnp->cn_flags |= MAKEENTRY;
4359         cnp->cn_lkflags = LK_SHARED;
4360         if ((cnp->cn_flags & LOCKSHARED) == 0) {
4361                 cnp->cn_lkflags = LK_EXCLUSIVE;
4362         }
4363         error = VOP_LOOKUP(dvp, &tvp, cnp);
4364         switch (error) {
4365         case EJUSTRETURN:
4366         case 0:
4367                 break;
4368         case ENOTDIR:
4369         case ENOENT:
4370                 vput(dvp);
4371                 return (cache_fpl_handled(fpl, error));
4372         default:
4373                 vput(dvp);
4374                 return (cache_fpl_aborted(fpl));
4375         }
4376
4377         fpl->tvp = tvp;
4378
4379         if (tvp == NULL) {
4380                 MPASS(error == EJUSTRETURN);
4381                 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4382                         vput(dvp);
4383                 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4384                         VOP_UNLOCK(dvp);
4385                 }
4386                 return (cache_fpl_handled(fpl, 0));
4387         }
4388
4389         if (__predict_false(!cache_fplookup_vnode_supported(tvp) ||
4390             cache_fplookup_is_mp(fpl))) {
4391                 vput(dvp);
4392                 vput(tvp);
4393                 return (cache_fpl_aborted(fpl));
4394         }
4395
4396         if ((cnp->cn_flags & LOCKLEAF) == 0) {
4397                 VOP_UNLOCK(tvp);
4398         }
4399
4400         if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4401                 vput(dvp);
4402         } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4403                 VOP_UNLOCK(dvp);
4404         }
4405         return (cache_fpl_handled(fpl, 0));
4406 }
4407
4408 static int __noinline
4409 cache_fplookup_dot(struct cache_fpl *fpl)
4410 {
4411         int error;
4412
4413         MPASS(!seqc_in_modify(fpl->dvp_seqc));
4414         /*
4415          * Just re-assign the value. seqc will be checked later for the first
4416          * non-dot path component in line and/or before deciding to return the
4417          * vnode.
4418          */
4419         fpl->tvp = fpl->dvp;
4420         fpl->tvp_seqc = fpl->dvp_seqc;
4421         if (cache_fplookup_is_mp(fpl)) {
4422                 error = cache_fplookup_cross_mount(fpl);
4423                 if (__predict_false(error != 0)) {
4424                         return (error);
4425                 }
4426         }
4427
4428         counter_u64_add(dothits, 1);
4429         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
4430         return (0);
4431 }
4432
/*
 * Resolve a ".." path component.
 *
 * On success returns 0 with fpl->tvp and fpl->tvp_seqc describing the
 * resulting vnode. Otherwise returns the result of cache_fpl_aborted() or
 * cache_fpl_partial(), depending on the failure encountered.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp;
        struct prison *pr;
        u_char nc_flag;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;

        MPASS(cache_fpl_isdotdot(cnp));

        /*
         * Walk the prison chain of the caller's credential; pr ends up
         * non-NULL iff dvp is the root of one of those prisons.
         *
         * XXX this is racy the same way regular lookup is
         */
        for (pr = cnp->cn_cred->cr_prison; pr != NULL;
            pr = pr->pr_parent)
                if (dvp == pr->pr_root)
                        break;

        /*
         * At any of these boundaries ".." resolves to the directory itself.
         */
        if (dvp == ndp->ni_rootdir ||
            dvp == ndp->ni_topdir ||
            dvp == rootvnode ||
            pr != NULL) {
                fpl->tvp = dvp;
                fpl->tvp_seqc = vn_seqc_read_any(dvp);
                if (seqc_in_modify(fpl->tvp_seqc)) {
                        return (cache_fpl_aborted(fpl));
                }
                return (0);
        }

        if ((dvp->v_vflag & VV_ROOT) != 0) {
                /*
                 * TODO
                 * The opposite of climb mount is needed here.
                 */
                return (cache_fpl_aborted(fpl));
        }

        /*
         * The parent is found through the entry hanging off v_cache_dd;
         * without one there is nothing to go by.
         */
        ncp = atomic_load_ptr(&dvp->v_cache_dd);
        if (ncp == NULL) {
                return (cache_fpl_aborted(fpl));
        }

        /*
         * If the entry is flagged NCF_ISDOTDOT the parent is its nc_vp
         * (a negative entry of that kind is not handled here), otherwise
         * the parent is its nc_dvp.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_ISDOTDOT) != 0) {
                if ((nc_flag & NCF_NEGATIVE) != 0)
                        return (cache_fpl_aborted(fpl));
                fpl->tvp = ncp->nc_vp;
        } else {
                fpl->tvp = ncp->nc_dvp;
        }

        /*
         * The entry is checked for usability only after the fields were
         * read from it.
         */
        if (!cache_ncp_canuse(ncp)) {
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(dotdothits, 1);
        return (0);
}
4503
/*
 * Handle a hit on a negative namecache entry.
 *
 * ncp is the negative entry which was found and hash is the hash value it
 * was looked up with (only passed on to cache_fplookup_negative_promote).
 *
 * For operations other than LOOKUP the work is handed off to
 * cache_fplookup_modifying. Otherwise, if the entry remains usable, the
 * lookup terminates with ENOENT.
 */
static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
        u_char nc_flag;
        bool neg_promote;

        nc_flag = atomic_load_char(&ncp->nc_flag);
        MPASS((nc_flag & NCF_NEGATIVE) != 0);
        /*
         * If they want to create an entry we need to replace this one.
         */
        if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
                fpl->tvp = NULL;
                return (cache_fplookup_modifying(fpl));
        }
        /*
         * Register the hit first and only then verify the entry can be used;
         * if it cannot, the hit is backed out before falling back.
         */
        neg_promote = cache_neg_hit_prep(ncp);
        if (!cache_fpl_neg_ncp_canuse(ncp)) {
                cache_neg_hit_abort(ncp);
                return (cache_fpl_partial(fpl));
        }
        if (neg_promote) {
                return (cache_fplookup_negative_promote(fpl, ncp, hash));
        }
        cache_neg_hit_finish(ncp);
        /*
         * The lookup is concluded here, so SMR protection is dropped before
         * reporting the result.
         */
        cache_fpl_smr_exit(fpl);
        return (cache_fpl_handled(fpl, ENOENT));
}
4531
/*
 * Resolve the current path component (cnp->cn_nameptr, cnp->cn_namelen)
 * within the directory fpl->dvp.
 *
 * "." and ".." are dispatched to dedicated routines. Everything else is
 * searched for in the namecache hash; a miss is handed to
 * cache_fplookup_noentry and a negative entry to cache_fplookup_neg.
 *
 * On a positive hit returns 0 with fpl->tvp and fpl->tvp_seqc set (after
 * crossing a mount point, if the found vnode is mounted on).
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp, *tvp;
        u_char nc_flag;
        uint32_t hash;
        int error;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                if (cnp->cn_namelen == 1) {
                        return (cache_fplookup_dot(fpl));
                }
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        return (cache_fplookup_dotdot(fpl));
                }
        }

        MPASS(!cache_fpl_isdotdot(cnp));

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

        /*
         * Find an entry with matching parent directory, name length and
         * name. ncp is NULL after the loop if nothing matched.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (__predict_false(ncp == NULL)) {
                return (cache_fplookup_noentry(fpl));
        }

        /*
         * Read the fields first, then check whether the entry is usable.
         */
        tvp = atomic_load_ptr(&ncp->nc_vp);
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) != 0) {
                return (cache_fplookup_neg(fpl, ncp, hash));
        }

        if (!cache_ncp_canuse(ncp)) {
                return (cache_fpl_partial(fpl));
        }

        fpl->tvp = tvp;
        fpl->tvp_seqc = vn_seqc_read_any(tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        if (!cache_fplookup_vnode_supported(tvp)) {
                return (cache_fpl_partial(fpl));
        }

        if (cache_fplookup_is_mp(fpl)) {
                error = cache_fplookup_cross_mount(fpl);
                if (__predict_false(error != 0)) {
                        return (error);
                }
        }

        counter_u64_add(numposhits, 1);
        /*
         * Note the probe reports the vnode found in the cache entry, which
         * is not necessarily fpl->tvp if a mount point was crossed above.
         */
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
        return (0);
}
4599
4600 static bool
4601 cache_fplookup_mp_supported(struct mount *mp)
4602 {
4603
4604         MPASS(mp != NULL);
4605         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4606                 return (false);
4607         return (true);
4608 }
4609
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * Starts at fpl->tvp/fpl->tvp_seqc. Returns 0 on success, in which case
 * fpl->tvp and fpl->tvp_seqc are updated to the topmost root vnode (or left
 * untouched if nothing is mounted there). Any failure along the way results
 * in cache_fpl_partial() with fpl->tvp left unmodified.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
        struct mount *mp, *prev_mp;
        struct mount_pcpu *mpcpu, *prev_mpcpu;
        struct vnode *vp;
        seqc_t vp_seqc;

        vp = fpl->tvp;
        vp_seqc = fpl->tvp_seqc;

        VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (__predict_false(mp == NULL)) {
                return (0);
        }

        prev_mp = NULL;
        for (;;) {
                /*
                 * Hand-over-hand: the critical section for the next mount is
                 * entered before the one for the previous mount is left.
                 */
                if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
                        if (prev_mp != NULL)
                                vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                if (prev_mp != NULL)
                        vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
                /*
                 * Verify the covered vnode did not change since its counter
                 * was read, now that the mount is pinned.
                 */
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                if (!cache_fplookup_mp_supported(mp)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Move to the root vnode of the mount and snapshot its
                 * counter.
                 */
                vp = atomic_load_ptr(&mp->mnt_rootvnode);
                if (vp == NULL || VN_IS_DOOMED(vp)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                vp_seqc = vn_seqc_read_any(vp);
                if (seqc_in_modify(vp_seqc)) {
                        vfs_op_thread_exit_crit(mp, mpcpu);
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Remember the mount we are still inside of and see if
                 * anything is stacked on top of its root vnode.
                 */
                prev_mp = mp;
                prev_mpcpu = mpcpu;
                mp = atomic_load_ptr(&vp->v_mountedhere);
                if (mp == NULL)
                        break;
        }

        vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
        fpl->tvp = vp;
        fpl->tvp_seqc = vp_seqc;
        return (0);
}
4684
/*
 * Cross a single mount point: replace fpl->tvp with the root vnode of the
 * filesystem mounted on it.
 *
 * This is the common-case variant of cache_fplookup_climb_mount. Should the
 * root vnode itself turn out to be mounted on, the work is redone from the
 * original fpl->tvp by cache_fplookup_climb_mount.
 *
 * Returns 0 on success (including when nothing is mounted on fpl->tvp),
 * otherwise the result of cache_fpl_partial().
 */
static int __noinline
cache_fplookup_cross_mount(struct cache_fpl *fpl)
{
        struct mount *mp;
        struct mount_pcpu *mpcpu;
        struct vnode *vp;
        seqc_t vp_seqc;

        vp = fpl->tvp;
        vp_seqc = fpl->tvp_seqc;

        VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (__predict_false(mp == NULL)) {
                return (0);
        }

        if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
                return (cache_fpl_partial(fpl));
        }
        /*
         * With the critical section entered, verify the covered vnode did
         * not change since its counter was read.
         */
        if (!vn_seqc_consistent(vp, vp_seqc)) {
                vfs_op_thread_exit_crit(mp, mpcpu);
                return (cache_fpl_partial(fpl));
        }
        if (!cache_fplookup_mp_supported(mp)) {
                vfs_op_thread_exit_crit(mp, mpcpu);
                return (cache_fpl_partial(fpl));
        }
        vp = atomic_load_ptr(&mp->mnt_rootvnode);
        if (__predict_false(vp == NULL || VN_IS_DOOMED(vp))) {
                vfs_op_thread_exit_crit(mp, mpcpu);
                return (cache_fpl_partial(fpl));
        }
        /*
         * The counter of the root vnode is read while still inside the
         * critical section; it is inspected after leaving it.
         */
        vp_seqc = vn_seqc_read_any(vp);
        vfs_op_thread_exit_crit(mp, mpcpu);
        if (seqc_in_modify(vp_seqc)) {
                return (cache_fpl_partial(fpl));
        }
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (__predict_false(mp != NULL)) {
                /*
                 * There are possibly more mount points on top.
                 * Normally this does not happen so for simplicity just start
                 * over.
                 */
                return (cache_fplookup_climb_mount(fpl));
        }

        fpl->tvp = vp;
        fpl->tvp_seqc = vp_seqc;
        return (0);
}
4737
4738 /*
4739  * Check if a vnode is mounted on.
4740  */
4741 static bool
4742 cache_fplookup_is_mp(struct cache_fpl *fpl)
4743 {
4744         struct mount *mp;
4745         struct vnode *vp;
4746
4747         vp = fpl->tvp;
4748
4749         /*
4750          * Hack: while this is a union, the pointer tends to be NULL so save on
4751          * a branch.
4752          */
4753         mp = atomic_load_ptr(&vp->v_mountedhere);
4754         if (mp == NULL)
4755                 return (false);
4756         if (vp->v_type == VDIR)
4757                 return (true);
4758         return (false);
4759 }
4760
4761 /*
4762  * Parse the path.
4763  *
4764  * The code was originally copy-pasted from regular lookup and despite
4765  * clean ups leaves performance on the table. Any modifications here
4766  * must take into account that in case off fallback the resulting
4767  * nameidata state has to be compatible with the original.
4768  */
4769 static int
4770 cache_fplookup_preparse(struct cache_fpl *fpl)
4771 {
4772         struct nameidata *ndp;
4773         struct componentname *cnp;
4774
4775         ndp = fpl->ndp;
4776         cnp = fpl->cnp;
4777
4778         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4779                 return (cache_fplookup_degenerate(fpl));
4780         }
4781
4782         /*
4783          * By this point the shortest possible pathname is one character + nul
4784          * terminator, hence 2.
4785          */
4786         KASSERT(ndp->ni_pathlen >= 2, ("%s: ni_pathlen %zu\n", __func__,
4787             ndp->ni_pathlen));
4788
4789         if (__predict_false(cnp->cn_nameptr[ndp->ni_pathlen - 2] == '/')) {
4790                 /*
4791                  * TODO
4792                  * Regular lookup performs the following:
4793                  * *ndp->ni_next = '\0';
4794                  * cnp->cn_flags |= TRAILINGSLASH;
4795                  *
4796                  * Which is problematic since it modifies data read
4797                  * from userspace. Then if fast path lookup was to
4798                  * abort we would have to either restore it or convey
4799                  * the flag. Since this is a corner case just ignore
4800                  * it for simplicity.
4801                  */
4802                 return (cache_fpl_aborted(fpl));
4803         }
4804         return (0);
4805 }
4806
/*
 * Delimit the path component starting at cnp->cn_nameptr.
 *
 * On success returns 0 with cnp->cn_namelen set to the length of the
 * component, ndp->ni_next pointing right past it and ndp->ni_pathlen
 * reduced by the length. Components longer than NAME_MAX terminate the
 * lookup with ENAMETOOLONG.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        char *cp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        /*
         * Find the end of this path component, it is either / or nul.
         *
         * Store / as a temporary sentinel so that we only have one character
         * to test for. Pathnames tend to be short so this should not be
         * resulting in cache misses.
         */
        KASSERT(cnp->cn_nameptr[ndp->ni_pathlen - 1] == '\0',
            ("%s: expected nul at %p + %zu; string [%s]\n", __func__,
            cnp->cn_nameptr, ndp->ni_pathlen - 1, cnp->cn_nameptr));
        cnp->cn_nameptr[ndp->ni_pathlen - 1] = '/';
        for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
                KASSERT(*cp != '\0',
                    ("%s: encountered unexpected nul; string [%s]\n", __func__,
                    cnp->cn_nameptr));
                continue;
        }
        /*
         * Put the nul terminator back in place of the sentinel.
         */
        cnp->cn_nameptr[ndp->ni_pathlen - 1] = '\0';

        cnp->cn_namelen = cp - cnp->cn_nameptr;
        if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled(fpl, ENAMETOOLONG));
        }
        ndp->ni_pathlen -= cnp->cn_namelen;
        KASSERT(ndp->ni_pathlen <= PATH_MAX,
            ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
        ndp->ni_next = cp;

#ifdef INVARIANTS
        /*
         * Code below is only here to assure compatibility with regular lookup.
         * It covers handling of trailing slashes and names like "/", both of
         * which can be taken care of upfront which lockless lookup does
         * in cache_fplookup_preparse. Regular lookup performs these for each
         * path component.
         */
        while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
                cp++;
                if (*cp == '\0') {
                        panic("%s: ran into TRAILINGSLASH handling from [%s]\n",
                            __func__, cnp->cn_pnbuf);
                }
        }

        if (cnp->cn_nameptr[0] == '\0') {
                panic("%s: ran into degenerate name from [%s]\n", __func__, cnp->cn_pnbuf);
        }
#endif
        return (0);
}
4868
4869 static void
4870 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4871 {
4872         struct nameidata *ndp;
4873         struct componentname *cnp;
4874
4875         ndp = fpl->ndp;
4876         cnp = fpl->cnp;
4877
4878         cnp->cn_nameptr = ndp->ni_next;
4879         while (*cnp->cn_nameptr == '/') {
4880                 cnp->cn_nameptr++;
4881                 ndp->ni_pathlen--;
4882         }
4883 }
4884
4885 /*
4886  * See the API contract for VOP_FPLOOKUP_VEXEC.
4887  */
4888 static int __noinline
4889 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4890 {
4891         struct vnode *dvp;
4892         seqc_t dvp_seqc;
4893
4894         dvp = fpl->dvp;
4895         dvp_seqc = fpl->dvp_seqc;
4896
4897         /*
4898          * Hack: they may be looking up foo/bar, where foo is a
4899          * regular file. In such a case we need to turn ENOTDIR,
4900          * but we may happen to get here with a different error.
4901          */
4902         if (dvp->v_type != VDIR) {
4903                 /*
4904                  * The check here is predominantly to catch
4905                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4906                  * gets doomed past this point it is going to
4907                  * fail seqc verification.
4908                  */
4909                 if (VN_IS_DOOMED(dvp)) {
4910                         return (cache_fpl_aborted(fpl));
4911                 }
4912                 error = ENOTDIR;
4913         }
4914
4915         /*
4916          * Hack: handle O_SEARCH.
4917          *
4918          * Open Group Base Specifications Issue 7, 2018 edition states:
4919          * If the access mode of the open file description associated with the
4920          * file descriptor is not O_SEARCH, the function shall check whether
4921          * directory searches are permitted using the current permissions of
4922          * the directory underlying the file descriptor. If the access mode is
4923          * O_SEARCH, the function shall not perform the check.
4924          *
4925          * Regular lookup tests for the NOEXECCHECK flag for every path
4926          * component to decide whether to do the permission check. However,
4927          * since most lookups never have the flag (and when they do it is only
4928          * present for the first path component), lockless lookup only acts on
4929          * it if there is a permission problem. Here the flag is represented
4930          * with a boolean so that we don't have to clear it on the way out.
4931          *
4932          * For simplicity this always aborts.
4933          * TODO: check if this is the first lookup and ignore the permission
4934          * problem. Note the flag has to survive fallback (if it happens to be
4935          * performed).
4936          */
4937         if (fpl->fsearch) {
4938                 return (cache_fpl_aborted(fpl));
4939         }
4940
4941         switch (error) {
4942         case EAGAIN:
4943                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4944                         error = cache_fpl_aborted(fpl);
4945                 } else {
4946                         cache_fpl_partial(fpl);
4947                 }
4948                 break;
4949         default:
4950                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4951                         error = cache_fpl_aborted(fpl);
4952                 } else {
4953                         cache_fpl_smr_exit(fpl);
4954                         cache_fpl_handled(fpl, error);
4955                 }
4956                 break;
4957         }
4958         return (error);
4959 }
4960
/*
 * The main loop of lockless lookup, starting at directory dvp.
 *
 * Each iteration parses one path component, checks search permission on the
 * current directory with VOP_FPLOOKUP_VEXEC and resolves the component with
 * cache_fplookup_next. The loop stops as soon as the lookup status gets set
 * by any of the helpers or the last component is processed.
 *
 * Once out of the loop the status determines the outcome:
 * - partial: state is prepared by cache_fplookup_partial_setup
 * - aborted: CACHE_FPL_FAILED is returned
 * - handled: ni_dvp and ni_vp are populated (NULL on error) and the error
 *   is returned
 */
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct mount *mp;
        int error;

        error = CACHE_FPL_FAILED;
        ndp = fpl->ndp;
        cnp = fpl->cnp;

        cache_fpl_checkpoint(fpl, &fpl->snd);

        fpl->dvp = dvp;
        fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
        if (seqc_in_modify(fpl->dvp_seqc)) {
                cache_fpl_aborted(fpl);
                goto out;
        }
        /*
         * The filesystem the starting directory resides on has to support
         * lockless lookup.
         */
        mp = atomic_load_ptr(&dvp->v_mount);
        if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
                cache_fpl_aborted(fpl);
                goto out;
        }

        VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

        error = cache_fplookup_preparse(fpl);
        if (__predict_false(cache_fpl_terminated(fpl))) {
                goto out;
        }

        for (;;) {
                error = cache_fplookup_parse(fpl);
                if (__predict_false(error != 0)) {
                        break;
                }

                VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

                error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
                if (__predict_false(error != 0)) {
                        error = cache_fplookup_failed_vexec(fpl, error);
                        break;
                }

                error = cache_fplookup_next(fpl);
                if (__predict_false(cache_fpl_terminated(fpl))) {
                        break;
                }

                VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                if (cache_fpl_islastcn(ndp)) {
                        error = cache_fplookup_final(fpl);
                        break;
                }

                /*
                 * Before stepping into the found vnode make sure the
                 * directory it was found in did not change.
                 */
                if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
                        error = cache_fpl_aborted(fpl);
                        break;
                }

                fpl->dvp = fpl->tvp;
                fpl->dvp_seqc = fpl->tvp_seqc;

                cache_fplookup_parse_advance(fpl);
                cache_fpl_checkpoint(fpl, &fpl->snd);
        }
out:
        switch (fpl->status) {
        case CACHE_FPL_STATUS_UNSET:
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_PARTIAL:
                cache_fpl_smr_assert_entered(fpl);
                return (cache_fplookup_partial_setup(fpl));
        case CACHE_FPL_STATUS_ABORTED:
                /*
                 * Aborting can happen both within and outside of the SMR
                 * section, only exit it if still inside.
                 */
                if (fpl->in_smr)
                        cache_fpl_smr_exit(fpl);
                return (CACHE_FPL_FAILED);
        case CACHE_FPL_STATUS_HANDLED:
                MPASS(error != CACHE_FPL_FAILED);
                cache_fpl_smr_assert_not_entered(fpl);
                /*
                 * A common error is ENOENT.
                 */
                if (error != 0) {
                        ndp->ni_dvp = NULL;
                        ndp->ni_vp = NULL;
                        cache_fpl_cleanup_cnp(cnp);
                        return (error);
                }
                ndp->ni_dvp = fpl->dvp;
                ndp->ni_vp = fpl->tvp;
                /*
                 * With SAVENAME the pathname buffer is kept and flagged as
                 * present with HASBUF, otherwise it is cleaned up here.
                 */
                if (cnp->cn_flags & SAVENAME)
                        cnp->cn_flags |= HASBUF;
                else
                        cache_fpl_cleanup_cnp(cnp);
                return (error);
        }
        __assert_unreachable();
}
5065
5066 /*
5067  * Fast path lookup protected with SMR and sequence counters.
5068  *
5069  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
5070  *
5071  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
5072  * outlined below.
5073  *
5074  * Traditional vnode lookup conceptually looks like this:
5075  *
5076  * vn_lock(current);
5077  * for (;;) {
5078  *      next = find();
5079  *      vn_lock(next);
5080  *      vn_unlock(current);
5081  *      current = next;
5082  *      if (last)
5083  *          break;
5084  * }
5085  * return (current);
5086  *
5087  * Each jump to the next vnode is safe memory-wise and atomic with respect to
5088  * any modifications thanks to holding respective locks.
5089  *
5090  * The same guarantee can be provided with a combination of safe memory
5091  * reclamation and sequence counters instead. If all operations which affect
5092  * the relationship between the current vnode and the one we are looking for
5093  * also modify the counter, we can verify whether all the conditions held as
5094  * we made the jump. This includes things like permissions, mount points etc.
5095  * Counter modification is provided by enclosing relevant places in
5096  * vn_seqc_write_begin()/end() calls.
5097  *
5098  * Thus this translates to:
5099  *
5100  * vfs_smr_enter();
5101  * dvp_seqc = seqc_read_any(dvp);
5102  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
5103  *     abort();
5104  * for (;;) {
5105  *      tvp = find();
5106  *      tvp_seqc = seqc_read_any(tvp);
5107  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
5108  *          abort();
 *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
5110  *          abort();
5111  *      dvp = tvp; // we know nothing of importance has changed
5112  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
5113  *      if (last)
5114  *          break;
5115  * }
5116  * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc)) // final check
5118  *          abort();
5119  * // at this point we know nothing has changed for any parent<->child pair
5120  * // as they were crossed during the lookup, meaning we matched the guarantee
5121  * // of the locked variant
5122  * return (tvp);
5123  *
5124  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
5125  * - they are called while within vfs_smr protection which they must never exit
5126  * - EAGAIN can be returned to denote checking could not be performed, it is
5127  *   always valid to return it
5128  * - if the sequence counter has not changed the result must be valid
5129  * - if the sequence counter has changed both false positives and false negatives
5130  *   are permitted (since the result will be rejected later)
5131  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
5132  *
5133  * Caveats to watch out for:
5134  * - vnodes are passed unlocked and unreferenced with nothing stopping
5135  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
5136  *   to use atomic_load_ptr to fetch it.
5137  * - the aforementioned object can also get freed, meaning absent other means it
5138  *   should be protected with vfs_smr
5139  * - either safely checking permissions as they are modified or guaranteeing
5140  *   their stability is left to the routine
5141  */
/*
 * Entry point of lockless lookup.
 *
 * ndp describes the lookup. *status is always set to the final status of the
 * lookup before returning. *pwdp is only set if the status is
 * CACHE_FPL_STATUS_PARTIAL.
 *
 * Returns EOPNOTSUPP if cache_can_fplookup rejects the lookup, otherwise the
 * value produced by cache_fplookup_impl (or by cache_fplookup_dirfd, should
 * it fail).
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
        struct cache_fpl fpl;
        struct pwd *pwd;
        struct vnode *dvp;
        struct componentname *cnp;
        struct nameidata_saved orig;
        int error;

        MPASS(ndp->ni_lcf == 0);

        fpl.status = CACHE_FPL_STATUS_UNSET;
        fpl.ndp = ndp;
        fpl.cnp = &ndp->ni_cnd;
        MPASS(curthread == fpl.cnp->cn_thread);
        KASSERT ((fpl.cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
            ("%s: internal flags found in cn_flags %" PRIx64, __func__,
            fpl.cnp->cn_flags));

        if ((fpl.cnp->cn_flags & SAVESTART) != 0)
                MPASS(fpl.cnp->cn_nameiop != LOOKUP);

        if (!cache_can_fplookup(&fpl)) {
                SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
                *status = fpl.status;
                return (EOPNOTSUPP);
        }

        /*
         * Save the original state so that it can be restored should the
         * lookup get aborted (see the bottom of the function).
         */
        cache_fpl_checkpoint(&fpl, &orig);

        cache_fpl_smr_enter_initial(&fpl);
        fpl.fsearch = false;
        pwd = pwd_get_smr();
        fpl.pwd = pwd;
        ndp->ni_rootdir = pwd->pwd_rdir;
        ndp->ni_topdir = pwd->pwd_jdir;

        /*
         * Pick the directory to start the lookup from: the root for absolute
         * pathnames, otherwise either the current working directory or the
         * directory denoted by ni_dirfd.
         */
        cnp = fpl.cnp;
        cnp->cn_nameptr = cnp->cn_pnbuf;
        if (cnp->cn_pnbuf[0] == '/') {
                dvp = cache_fpl_handle_root(&fpl);
                ndp->ni_resflags |= NIRES_ABS;
        } else {
                if (ndp->ni_dirfd == AT_FDCWD) {
                        dvp = pwd->pwd_cdir;
                } else {
                        error = cache_fplookup_dirfd(&fpl, &dvp);
                        if (__predict_false(error != 0)) {
                                goto out;
                        }
                }
        }

        SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

        error = cache_fplookup_impl(dvp, &fpl);
out:
        /*
         * Regardless of the outcome the SMR section is expected to have been
         * exited by now.
         */
        cache_fpl_smr_assert_not_entered(&fpl);
        SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

        *status = fpl.status;
        switch (fpl.status) {
        case CACHE_FPL_STATUS_UNSET:
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_HANDLED:
                if (error != 0)
                        MPASS(ndp->ni_vp == NULL);
                SDT_PROBE3(vfs, namei, lookup, return, error, ndp->ni_vp, true);
                break;
        case CACHE_FPL_STATUS_PARTIAL:
                *pwdp = fpl.pwd;
                /*
                 * Status restored by cache_fplookup_partial_setup.
                 */
                break;
        case CACHE_FPL_STATUS_ABORTED:
                cache_fpl_restore_abort(&fpl, &orig);
                break;
        }
        return (error);
}