sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  83     "Name cache");
  84
  85 SDT_PROVIDER_DECLARE(vfs);
  86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  87     "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  89     "struct vnode *");
  90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  91     "char *");
  92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  93     "const char *");
  94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  95     "struct namecache *", "int", "int");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  98     "char *", "struct vnode *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 101     "struct vnode *", "char *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 105     "struct vnode *", "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 107     "char *");
 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 109     "struct componentname *");
 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 111     "struct componentname *");
 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 113 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
 114 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 115 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 116 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 117     "struct vnode *");
 118 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 119     "char *");
 120 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 121     "char *");
 122
 123 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 124 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 125 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 126
 127 /*
 128  * This structure describes the elements in the cache of recent
 129  * names looked up by namei.
 130  */
 131 struct negstate {
 132         u_char neg_flag;
 133         u_char neg_hit;
 134 };
 135 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 136     "the state must fit in a union with a pointer without growing it");
 137
 138 struct  namecache {
 139         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 140         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 141         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 142         struct  vnode *nc_dvp;          /* vnode of parent of name */
 143         union {
 144                 struct  vnode *nu_vp;   /* vnode the name refers to */
 145                 struct  negstate nu_neg;/* negative entry state */
 146         } n_un;
 147         u_char  nc_flag;                /* flag bits */
 148         u_char  nc_nlen;                /* length of name */
 149         char    nc_name[0];             /* segment name + nul */
 150 };
 151
 152 /*
 153  * struct namecache_ts repeats struct namecache layout up to the
 154  * nc_nlen member.
 155  * struct namecache_ts is used in place of struct namecache when time(s) need
 156  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 157  * both a non-dotdot directory name plus dotdot for the directory's
 158  * parent.
 159  *
 160  * See below for alignment requirement.
 161  */
 162 struct  namecache_ts {
 163         struct  timespec nc_time;       /* timespec provided by fs */
 164         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 165         int     nc_ticks;               /* ticks value when entry was added */
 166         int     nc_pad;
 167         struct namecache nc_nc;
 168 };
 169
 170 TAILQ_HEAD(cache_freebatch, namecache);
 171
 172 /*
 173  * At least mips n32 performs 64-bit accesses to timespec as found
 174  * in namecache_ts and requires them to be aligned. Since others
 175  * may be in the same spot suffer a little bit and enforce the
 176  * alignment for everyone. Note this is a nop for 64-bit platforms.
 177  */
 178 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 179
 180 /*
 181  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 182  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 183  * smaller and the value was bumped to retain the total size, but it
 184  * was never re-evaluated for suitability. A simple test counting
 185  * lengths during package building shows that the value of 45 covers
 186  * about 86% of all added entries, reaching 99% at 65.
 187  *
 188  * Regardless of the above, use of dedicated zones instead of malloc may be
 189  * inducing additional waste. This may be hard to address as said zones are
 190  * tied to VFS SMR. Even if retaining them, the current split should be
 191  * re-evaluated.
 192  */
 193 #ifdef __LP64__
 194 #define CACHE_PATH_CUTOFF       45
 195 #define CACHE_LARGE_PAD         6
 196 #else
 197 #define CACHE_PATH_CUTOFF       41
 198 #define CACHE_LARGE_PAD         2
 199 #endif
 200
 201 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 202 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 203 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 204 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 205
 206 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 207 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 208 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 209 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 210
 211 #define nc_vp           n_un.nu_vp
 212 #define nc_neg          n_un.nu_neg
 213
 214 /*
 215  * Flags in namecache.nc_flag
 216  */
 217 #define NCF_WHITE       0x01
 218 #define NCF_ISDOTDOT    0x02
 219 #define NCF_TS          0x04
 220 #define NCF_DTS         0x08
 221 #define NCF_DVDROP      0x10
 222 #define NCF_NEGATIVE    0x20
 223 #define NCF_INVALID     0x40
 224 #define NCF_WIP         0x80
 225
 226 /*
 227  * Flags in negstate.neg_flag
 228  */
 229 #define NEG_HOT         0x01
 230
 231 static bool     cache_neg_evict_cond(u_long lnumcache);
 232
 233 /*
 234  * Mark an entry as invalid.
 235  *
 236  * This is called before it starts getting deconstructed.
 237  */
 238 static void
 239 cache_ncp_invalidate(struct namecache *ncp)
 240 {
 241
 242         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 243             ("%s: entry %p already invalid", __func__, ncp));
 244         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 245         atomic_thread_fence_rel();
 246 }
 247
 248 /*
 249  * Check whether the entry can be safely used.
 250  *
 251  * All places which elide locks are supposed to call this after they are
 252  * done with reading from an entry.
 253  */
 254 #define cache_ncp_canuse(ncp)   ({                                      \
 255         struct namecache *_ncp = (ncp);                                 \
 256         u_char _nc_flag;                                                \
 257                                                                         \
 258         atomic_thread_fence_acq();                                      \
 259         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 260         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
 261 })
 262
 263 /*
 264  * Name caching works as follows:
 265  *
 266  * Names found by directory scans are retained in a cache
 267  * for future reference.  It is managed LRU, so frequently
 268  * used names will hang around.  Cache is indexed by hash value
 269  * obtained from (dvp, name) where dvp refers to the directory
 270  * containing name.
 271  *
 272  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 273  * exist) the vnode pointer will be NULL.
 274  *
 275  * Upon reaching the last segment of a path, if the reference
 276  * is for DELETE, or NOCACHE is set (rewrite), and the
 277  * name is located in the cache, it will be dropped.
 278  *
 279  * These locks are used (in the order in which they can be taken):
 280  * NAME         TYPE    ROLE
 281  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 282  * bucketlock   mtx     for access to given set of hash buckets
 283  * neglist      mtx     negative entry LRU management
 284  *
 285  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 286  * order is lower address first. Both are recursive.
 287  *
 288  * "." lookups are lockless.
 289  *
 290  * ".." and vnode -> name lookups require vnodelock.
 291  *
 292  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 293  *
 294  * Insertions and removals of entries require involved vnodes and bucketlocks
 295  * to be locked to provide safe operation against other threads modifying the
 296  * cache.
 297  *
 298  * Some lookups result in removal of the found entry (e.g. getting rid of a
 299  * negative entry with the intent to create a positive one), which poses a
 300  * problem when multiple threads reach the state. Similarly, two different
 301  * threads can purge two different vnodes and try to remove the same name.
 302  *
 303  * If the already held vnode lock is lower than the second required lock, we
 304  * can just take the other lock. However, in the opposite case, this could
 305  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 306  * the first node, locking everything in order and revalidating the state.
 307  */
 308
 309 VFS_SMR_DECLARE;
 310
 311 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 312     "Name cache parameters");
 313
 314 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 316     "Total namecache capacity");
 317
 318 u_int ncsizefactor = 2;
 319 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 320     "Size factor for namecache");
 321
 322 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 323 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 324     "Ratio of negative namecache entries");
 325
 326 /*
 327  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 328  *
 329  * Check cache_neg_evict_cond for details.
 330  */
 331 static u_int ncnegminpct = 3;
 332
 333 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 334 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 335     "Negative entry count above which automatic eviction is allowed");
 336
 337 /*
 338  * Structures associated with name caching.
 339  */
 340 #define NCHHASH(hash) \
 341         (&nchashtbl[(hash) & nchash])
 342 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 343 static u_long __read_mostly     nchash;                 /* size of hash table */
 344 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 345     "Size of namecache hash table");
 346 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 347 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 348
 349 struct nchstats nchstats;               /* cache effectiveness statistics */
 350
 351 static bool __read_frequently cache_fast_revlookup = true;
 352 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 353     &cache_fast_revlookup, 0, "");
 354
 355 static u_int __exclusive_cache_line neg_cycle;
 356
 357 #define ncneghash       3
 358 #define numneglists     (ncneghash + 1)
 359
 360 struct neglist {
 361         struct mtx              nl_evict_lock;
 362         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 363         TAILQ_HEAD(, namecache) nl_list;
 364         TAILQ_HEAD(, namecache) nl_hotlist;
 365         u_long                  nl_hotnum;
 366 } __aligned(CACHE_LINE_SIZE);
 367
 368 static struct neglist neglists[numneglists];
 369
 370 static inline struct neglist *
 371 NCP2NEGLIST(struct namecache *ncp)
 372 {
 373
 374         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 375 }
 376
 377 static inline struct negstate *
 378 NCP2NEGSTATE(struct namecache *ncp)
 379 {
 380
 381         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 382         return (&ncp->nc_neg);
 383 }
 384
 385 #define numbucketlocks (ncbuckethash + 1)
 386 static u_int __read_mostly  ncbuckethash;
 387 static struct mtx_padalign __read_mostly  *bucketlocks;
 388 #define HASH2BUCKETLOCK(hash) \
 389         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 390
 391 #define numvnodelocks (ncvnodehash + 1)
 392 static u_int __read_mostly  ncvnodehash;
 393 static struct mtx __read_mostly *vnodelocks;
 394 static inline struct mtx *
 395 VP2VNODELOCK(struct vnode *vp)
 396 {
 397
 398         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 399 }
 400
 401 static void
 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 403 {
 404         struct namecache_ts *ncp_ts;
 405
 406         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 407             (tsp == NULL && ticksp == NULL),
 408             ("No NCF_TS"));
 409
 410         if (tsp == NULL)
 411                 return;
 412
 413         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 414         *tsp = ncp_ts->nc_time;
 415         *ticksp = ncp_ts->nc_ticks;
 416 }
 417
 418 #ifdef DEBUG_CACHE
 419 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 420 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 421     "VFS namecache enabled");
 422 #endif
 423
 424 /* Export size information to userland */
 425 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 426     sizeof(struct namecache), "sizeof(struct namecache)");
 427
 428 /*
 429  * The new name cache statistics
 430  */
 431 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 432     "Name cache statistics");
 433
 434 #define STATNODE_ULONG(name, varname, descr)                                    \
 435         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 436 #define STATNODE_COUNTER(name, varname, descr)                                  \
 437         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 438         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 439             descr);
 440 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 441 STATNODE_ULONG(count, numcache, "Number of cache entries");
 442 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 443 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 444 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
 445 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
 446 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 447 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 448 STATNODE_COUNTER(posszaps, numposzaps,
 449     "Number of cache hits (positive) we do not want to cache");
 450 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 451 STATNODE_COUNTER(negzaps, numnegzaps,
 452     "Number of cache hits (negative) we do not want to cache");
 453 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 454 /* These count for vn_getcwd(), too. */
 455 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 456 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 457 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 458     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 459 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 460 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 461
 462 /*
 463  * Debug or developer statistics.
 464  */
 465 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 466     "Name cache debugging");
 467 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 468         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 469 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 470         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 471         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 472             descr);
 473 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 474     "Number of successful removals after relocking");
 475 static long zap_bucket_fail;
 476 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 477 static long zap_bucket_fail2;
 478 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 479 static long cache_lock_vnodes_cel_3_failures;
 480 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 481     "Number of times 3-way vnode locking failed");
 482
 483 static void cache_zap_locked(struct namecache *ncp);
 484 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 485     char **freebuf, size_t *buflen);
 486 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 487     char **retbuf, size_t *buflen, size_t addend);
 488 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 489     char **retbuf, size_t *buflen);
 490 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 491     char **retbuf, size_t *len, size_t addend);
 492
 493 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 494
 495 static inline void
 496 cache_assert_vlp_locked(struct mtx *vlp)
 497 {
 498
 499         if (vlp != NULL)
 500                 mtx_assert(vlp, MA_OWNED);
 501 }
 502
 503 static inline void
 504 cache_assert_vnode_locked(struct vnode *vp)
 505 {
 506         struct mtx *vlp;
 507
 508         vlp = VP2VNODELOCK(vp);
 509         cache_assert_vlp_locked(vlp);
 510 }
 511
 512 /*
 513  * Directory vnodes with entries are held for two reasons:
 514  * 1. make them less of a target for reclamation in vnlru
 515  * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided
 516  *
 517  * Note this preferably would not be done and it's a hold over from. It will be
 518  * feasible to eliminate altogether if all filesystems start supporting
 519  * lockless lookup.
 520  */
 521 static void
 522 cache_hold_vnode(struct vnode *vp)
 523 {
 524
 525         cache_assert_vnode_locked(vp);
 526         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 527         vhold(vp);
 528         counter_u64_add(numcachehv, 1);
 529 }
 530
 531 static void
 532 cache_drop_vnode(struct vnode *vp)
 533 {
 534
 535         /*
 536          * Called after all locks are dropped, meaning we can't assert
 537          * on the state of v_cache_src.
 538          */
 539         vdrop(vp);
 540         counter_u64_add(numcachehv, -1);
 541 }
 542
 543 /*
 544  * UMA zones.
 545  */
 546 static uma_zone_t __read_mostly cache_zone_small;
 547 static uma_zone_t __read_mostly cache_zone_small_ts;
 548 static uma_zone_t __read_mostly cache_zone_large;
 549 static uma_zone_t __read_mostly cache_zone_large_ts;
 550
 551 static struct namecache *
 552 cache_alloc_uma(int len, bool ts)
 553 {
 554         struct namecache_ts *ncp_ts;
 555         struct namecache *ncp;
 556
 557         if (__predict_false(ts)) {
 558                 if (len <= CACHE_PATH_CUTOFF)
 559                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 560                 else
 561                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 562                 ncp = &ncp_ts->nc_nc;
 563         } else {
 564                 if (len <= CACHE_PATH_CUTOFF)
 565                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 566                 else
 567                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 568         }
 569         return (ncp);
 570 }
 571
 572 static void
 573 cache_free_uma(struct namecache *ncp)
 574 {
 575         struct namecache_ts *ncp_ts;
 576
 577         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 578                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 579                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 580                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 581                 else
 582                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 583         } else {
 584                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 585                         uma_zfree_smr(cache_zone_small, ncp);
 586                 else
 587                         uma_zfree_smr(cache_zone_large, ncp);
 588         }
 589 }
 590
 591 static struct namecache *
 592 cache_alloc(int len, bool ts)
 593 {
 594         u_long lnumcache;
 595
 596         /*
 597          * Avoid blowout in namecache entries.
 598          *
 599          * Bugs:
 600          * 1. filesystems may end up trying to add an already existing entry
 601          * (for example this can happen after a cache miss during concurrent
 602          * lookup), in which case we will call cache_neg_evict despite not
 603          * adding anything.
 604          * 2. the routine may fail to free anything and no provisions are made
 605          * to make it try harder (see the inside for failure modes)
 606          * 3. it only ever looks at negative entries.
 607          */
 608         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 609         if (cache_neg_evict_cond(lnumcache)) {
 610                 lnumcache = atomic_load_long(&numcache);
 611         }
 612         if (__predict_false(lnumcache >= ncsize)) {
 613                 atomic_subtract_long(&numcache, 1);
 614                 counter_u64_add(numdrops, 1);
 615                 return (NULL);
 616         }
 617         return (cache_alloc_uma(len, ts));
 618 }
 619
 620 static void
 621 cache_free(struct namecache *ncp)
 622 {
 623
 624         MPASS(ncp != NULL);
 625         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 626                 cache_drop_vnode(ncp->nc_dvp);
 627         }
 628         cache_free_uma(ncp);
 629         atomic_subtract_long(&numcache, 1);
 630 }
 631
 632 static void
 633 cache_free_batch(struct cache_freebatch *batch)
 634 {
 635         struct namecache *ncp, *nnp;
 636         int i;
 637
 638         i = 0;
 639         if (TAILQ_EMPTY(batch))
 640                 goto out;
 641         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 642                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 643                         cache_drop_vnode(ncp->nc_dvp);
 644                 }
 645                 cache_free_uma(ncp);
 646                 i++;
 647         }
 648         atomic_subtract_long(&numcache, i);
 649 out:
 650         SDT_PROBE1(vfs, namecache, purge, batch, i);
 651 }
 652
 653 /*
 654  * TODO: With the value stored we can do better than computing the hash based
 655  * on the address. The choice of FNV should also be revisited.
 656  */
 657 static void
 658 cache_prehash(struct vnode *vp)
 659 {
 660
 661         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 662 }
 663
 664 static uint32_t
 665 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 666 {
 667
 668         return (fnv_32_buf(name, len, dvp->v_nchash));
 669 }
 670
 671 static inline struct nchashhead *
 672 NCP2BUCKET(struct namecache *ncp)
 673 {
 674         uint32_t hash;
 675
 676         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 677         return (NCHHASH(hash));
 678 }
 679
 680 static inline struct mtx *
 681 NCP2BUCKETLOCK(struct namecache *ncp)
 682 {
 683         uint32_t hash;
 684
 685         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 686         return (HASH2BUCKETLOCK(hash));
 687 }
 688
 689 #ifdef INVARIANTS
 690 static void
 691 cache_assert_bucket_locked(struct namecache *ncp)
 692 {
 693         struct mtx *blp;
 694
 695         blp = NCP2BUCKETLOCK(ncp);
 696         mtx_assert(blp, MA_OWNED);
 697 }
 698
 699 static void
 700 cache_assert_bucket_unlocked(struct namecache *ncp)
 701 {
 702         struct mtx *blp;
 703
 704         blp = NCP2BUCKETLOCK(ncp);
 705         mtx_assert(blp, MA_NOTOWNED);
 706 }
 707 #else
 708 #define cache_assert_bucket_locked(x) do { } while (0)
 709 #define cache_assert_bucket_unlocked(x) do { } while (0)
 710 #endif
 711
 712 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 713 static void
 714 _cache_sort_vnodes(void **p1, void **p2)
 715 {
 716         void *tmp;
 717
 718         MPASS(*p1 != NULL || *p2 != NULL);
 719
 720         if (*p1 > *p2) {
 721                 tmp = *p2;
 722                 *p2 = *p1;
 723                 *p1 = tmp;
 724         }
 725 }
 726
 727 static void
 728 cache_lock_all_buckets(void)
 729 {
 730         u_int i;
 731
 732         for (i = 0; i < numbucketlocks; i++)
 733                 mtx_lock(&bucketlocks[i]);
 734 }
 735
 736 static void
 737 cache_unlock_all_buckets(void)
 738 {
 739         u_int i;
 740
 741         for (i = 0; i < numbucketlocks; i++)
 742                 mtx_unlock(&bucketlocks[i]);
 743 }
 744
 745 static void
 746 cache_lock_all_vnodes(void)
 747 {
 748         u_int i;
 749
 750         for (i = 0; i < numvnodelocks; i++)
 751                 mtx_lock(&vnodelocks[i]);
 752 }
 753
 754 static void
 755 cache_unlock_all_vnodes(void)
 756 {
 757         u_int i;
 758
 759         for (i = 0; i < numvnodelocks; i++)
 760                 mtx_unlock(&vnodelocks[i]);
 761 }
 762
 763 static int
 764 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 765 {
 766
 767         cache_sort_vnodes(&vlp1, &vlp2);
 768
 769         if (vlp1 != NULL) {
 770                 if (!mtx_trylock(vlp1))
 771                         return (EAGAIN);
 772         }
 773         if (!mtx_trylock(vlp2)) {
 774                 if (vlp1 != NULL)
 775                         mtx_unlock(vlp1);
 776                 return (EAGAIN);
 777         }
 778
 779         return (0);
 780 }
 781
 782 static void
 783 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 784 {
 785
 786         MPASS(vlp1 != NULL || vlp2 != NULL);
 787         MPASS(vlp1 <= vlp2);
 788
 789         if (vlp1 != NULL)
 790                 mtx_lock(vlp1);
 791         if (vlp2 != NULL)
 792                 mtx_lock(vlp2);
 793 }
 794
 795 static void
 796 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 797 {
 798
 799         MPASS(vlp1 != NULL || vlp2 != NULL);
 800
 801         if (vlp1 != NULL)
 802                 mtx_unlock(vlp1);
 803         if (vlp2 != NULL)
 804                 mtx_unlock(vlp2);
 805 }
 806
 807 static int
 808 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 809 {
 810         struct nchstats snap;
 811
 812         if (req->oldptr == NULL)
 813                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 814
 815         snap = nchstats;
 816         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 817         snap.ncs_neghits = counter_u64_fetch(numneghits);
 818         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 819             counter_u64_fetch(numnegzaps);
 820         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 821             counter_u64_fetch(nummiss);
 822
 823         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 824 }
 825 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 826     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 827     "VFS cache effectiveness statistics");
 828
 829 static void
 830 cache_recalc_neg_min(u_int val)
 831 {
 832
 833         neg_min = (ncsize * val) / 100;
 834 }
 835
 836 static int
 837 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 838 {
 839         u_int val;
 840         int error;
 841
 842         val = ncnegminpct;
 843         error = sysctl_handle_int(oidp, &val, 0, req);
 844         if (error != 0 || req->newptr == NULL)
 845                 return (error);
 846
 847         if (val == ncnegminpct)
 848                 return (0);
 849         if (val < 0 || val > 99)
 850                 return (EINVAL);
 851         ncnegminpct = val;
 852         cache_recalc_neg_min(val);
 853         return (0);
 854 }
 855
 856 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 857     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 858     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
 859
 860 #ifdef DIAGNOSTIC
 861 /*
 862  * Grab an atomic snapshot of the name cache hash chain lengths
 863  */
 864 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 865     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 866     "hash table stats");
 867
 868 static int
 869 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 870 {
 871         struct nchashhead *ncpp;
 872         struct namecache *ncp;
 873         int i, error, n_nchash, *cntbuf;
 874
 875 retry:
 876         n_nchash = nchash + 1;  /* nchash is max index, not count */
 877         if (req->oldptr == NULL)
 878                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 879         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 880         cache_lock_all_buckets();
 881         if (n_nchash != nchash + 1) {
 882                 cache_unlock_all_buckets();
 883                 free(cntbuf, M_TEMP);
 884                 goto retry;
 885         }
 886         /* Scan hash tables counting entries */
 887         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 888                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 889                         cntbuf[i]++;
 890         cache_unlock_all_buckets();
 891         for (error = 0, i = 0; i < n_nchash; i++)
 892                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 893                         break;
 894         free(cntbuf, M_TEMP);
 895         return (error);
 896 }
 897 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 898     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 899     "nchash chain lengths");
 900
 901 static int
 902 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 903 {
 904         int error;
 905         struct nchashhead *ncpp;
 906         struct namecache *ncp;
 907         int n_nchash;
 908         int count, maxlength, used, pct;
 909
 910         if (!req->oldptr)
 911                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 912
 913         cache_lock_all_buckets();
 914         n_nchash = nchash + 1;  /* nchash is max index, not count */
 915         used = 0;
 916         maxlength = 0;
 917
 918         /* Scan hash tables for applicable entries */
 919         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 920                 count = 0;
 921                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 922                         count++;
 923                 }
 924                 if (count)
 925                         used++;
 926                 if (maxlength < count)
 927                         maxlength = count;
 928         }
 929         n_nchash = nchash + 1;
 930         cache_unlock_all_buckets();
 931         pct = (used * 100) / (n_nchash / 100);
 932         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 933         if (error)
 934                 return (error);
 935         error = SYSCTL_OUT(req, &used, sizeof(used));
 936         if (error)
 937                 return (error);
 938         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 939         if (error)
 940                 return (error);
 941         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 942         if (error)
 943                 return (error);
 944         return (0);
 945 }
 946 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 947     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 948     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 949 #endif
 950
 951 /*
 952  * Negative entries management
 953  *
 954  * Various workloads create plenty of negative entries and barely use them
 955  * afterwards. Moreover malicious users can keep performing bogus lookups
 956  * adding even more entries. For example "make tinderbox" as of writing this
 957  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 958  * negative.
 959  *
 960  * As such, a rather aggressive eviction method is needed. The currently
 961  * employed method is a placeholder.
 962  *
 963  * Entries are split over numneglists separate lists, each of which is further
 964  * split into hot and cold entries. Entries get promoted after getting a hit.
 965  * Eviction happens on addition of new entry.
 966  */
 967 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 968     "Name cache negative entry statistics");
 969
 970 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 971     "Number of negative cache entries");
 972
 973 static COUNTER_U64_DEFINE_EARLY(neg_created);
 974 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 975     "Number of created negative entries");
 976
 977 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 978 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 979     "Number of evicted negative entries");
 980
 981 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 982 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 983     &neg_evict_skipped_empty,
 984     "Number of times evicting failed due to lack of entries");
 985
 986 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 987 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 988     &neg_evict_skipped_missed,
 989     "Number of times evicting failed due to target entry disappearing");
 990
 991 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 992 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 993     &neg_evict_skipped_contended,
 994     "Number of times evicting failed due to contention");
 995
 996 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 997     "Number of cache hits (negative)");
 998
 999 static int
1000 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1001 {
1002         int i, out;
1003
1004         out = 0;
1005         for (i = 0; i < numneglists; i++)
1006                 out += neglists[i].nl_hotnum;
1007
1008         return (SYSCTL_OUT(req, &out, sizeof(out)));
1009 }
1010 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1011     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1012     "Number of hot negative entries");
1013
1014 static void
1015 cache_neg_init(struct namecache *ncp)
1016 {
1017         struct negstate *ns;
1018
1019         ncp->nc_flag |= NCF_NEGATIVE;
1020         ns = NCP2NEGSTATE(ncp);
1021         ns->neg_flag = 0;
1022         ns->neg_hit = 0;
1023         counter_u64_add(neg_created, 1);
1024 }
1025
1026 #define CACHE_NEG_PROMOTION_THRESH 2
1027
1028 static bool
1029 cache_neg_hit_prep(struct namecache *ncp)
1030 {
1031         struct negstate *ns;
1032         u_char n;
1033
1034         ns = NCP2NEGSTATE(ncp);
1035         n = atomic_load_char(&ns->neg_hit);
1036         for (;;) {
1037                 if (n >= CACHE_NEG_PROMOTION_THRESH)
1038                         return (false);
1039                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1040                         break;
1041         }
1042         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1043 }
1044
1045 /*
1046  * Nothing to do here but it is provided for completeness as some
1047  * cache_neg_hit_prep callers may end up returning without even
1048  * trying to promote.
1049  */
1050 #define cache_neg_hit_abort(ncp)        do { } while (0)
1051
1052 static void
1053 cache_neg_hit_finish(struct namecache *ncp)
1054 {
1055
1056         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1057         counter_u64_add(numneghits, 1);
1058 }
1059
1060 /*
1061  * Move a negative entry to the hot list.
1062  */
1063 static void
1064 cache_neg_promote_locked(struct namecache *ncp)
1065 {
1066         struct neglist *nl;
1067         struct negstate *ns;
1068
1069         ns = NCP2NEGSTATE(ncp);
1070         nl = NCP2NEGLIST(ncp);
1071         mtx_assert(&nl->nl_lock, MA_OWNED);
1072         if ((ns->neg_flag & NEG_HOT) == 0) {
1073                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1074                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1075                 nl->nl_hotnum++;
1076                 ns->neg_flag |= NEG_HOT;
1077         }
1078 }
1079
1080 /*
1081  * Move a hot negative entry to the cold list.
1082  */
1083 static void
1084 cache_neg_demote_locked(struct namecache *ncp)
1085 {
1086         struct neglist *nl;
1087         struct negstate *ns;
1088
1089         ns = NCP2NEGSTATE(ncp);
1090         nl = NCP2NEGLIST(ncp);
1091         mtx_assert(&nl->nl_lock, MA_OWNED);
1092         MPASS(ns->neg_flag & NEG_HOT);
1093         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1094         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1095         nl->nl_hotnum--;
1096         ns->neg_flag &= ~NEG_HOT;
1097         atomic_store_char(&ns->neg_hit, 0);
1098 }
1099
1100 /*
1101  * Move a negative entry to the hot list if it matches the lookup.
1102  *
1103  * We have to take locks, but they may be contended and in the worst
1104  * case we may need to go off CPU. We don't want to spin within the
1105  * smr section and we can't block with it. Exiting the section means
1106  * the found entry could have been evicted. We are going to look it
1107  * up again.
1108  */
1109 static bool
1110 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1111     struct namecache *oncp, uint32_t hash)
1112 {
1113         struct namecache *ncp;
1114         struct neglist *nl;
1115         u_char nc_flag;
1116
1117         nl = NCP2NEGLIST(oncp);
1118
1119         mtx_lock(&nl->nl_lock);
1120         /*
1121          * For hash iteration.
1122          */
1123         vfs_smr_enter();
1124
1125         /*
1126          * Avoid all surprises by only succeeding if we got the same entry and
1127          * bailing completely otherwise.
1128          * XXX There are no provisions to keep the vnode around, meaning we may
1129          * end up promoting a negative entry for a *new* vnode and returning
1130          * ENOENT on its account. This is the error we want to return anyway
1131          * and promotion is harmless.
1132          *
1133          * In particular at this point there can be a new ncp which matches the
1134          * search but hashes to a different neglist.
1135          */
1136         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1137                 if (ncp == oncp)
1138                         break;
1139         }
1140
1141         /*
1142          * No match to begin with.
1143          */
1144         if (__predict_false(ncp == NULL)) {
1145                 goto out_abort;
1146         }
1147
1148         /*
1149          * The newly found entry may be something different...
1150          */
1151         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1152             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1153                 goto out_abort;
1154         }
1155
1156         /*
1157          * ... and not even negative.
1158          */
1159         nc_flag = atomic_load_char(&ncp->nc_flag);
1160         if ((nc_flag & NCF_NEGATIVE) == 0) {
1161                 goto out_abort;
1162         }
1163
1164         if (!cache_ncp_canuse(ncp)) {
1165                 goto out_abort;
1166         }
1167
1168         cache_neg_promote_locked(ncp);
1169         cache_neg_hit_finish(ncp);
1170         vfs_smr_exit();
1171         mtx_unlock(&nl->nl_lock);
1172         return (true);
1173 out_abort:
1174         vfs_smr_exit();
1175         mtx_unlock(&nl->nl_lock);
1176         return (false);
1177 }
1178
1179 static void
1180 cache_neg_promote(struct namecache *ncp)
1181 {
1182         struct neglist *nl;
1183
1184         nl = NCP2NEGLIST(ncp);
1185         mtx_lock(&nl->nl_lock);
1186         cache_neg_promote_locked(ncp);
1187         mtx_unlock(&nl->nl_lock);
1188 }
1189
1190 static void
1191 cache_neg_insert(struct namecache *ncp)
1192 {
1193         struct neglist *nl;
1194
1195         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1196         cache_assert_bucket_locked(ncp);
1197         nl = NCP2NEGLIST(ncp);
1198         mtx_lock(&nl->nl_lock);
1199         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1200         mtx_unlock(&nl->nl_lock);
1201         atomic_add_long(&numneg, 1);
1202 }
1203
1204 static void
1205 cache_neg_remove(struct namecache *ncp)
1206 {
1207         struct neglist *nl;
1208         struct negstate *ns;
1209
1210         cache_assert_bucket_locked(ncp);
1211         nl = NCP2NEGLIST(ncp);
1212         ns = NCP2NEGSTATE(ncp);
1213         mtx_lock(&nl->nl_lock);
1214         if ((ns->neg_flag & NEG_HOT) != 0) {
1215                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1216                 nl->nl_hotnum--;
1217         } else {
1218                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1219         }
1220         mtx_unlock(&nl->nl_lock);
1221         atomic_subtract_long(&numneg, 1);
1222 }
1223
1224 static struct neglist *
1225 cache_neg_evict_select_list(void)
1226 {
1227         struct neglist *nl;
1228         u_int c;
1229
1230         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1231         nl = &neglists[c % numneglists];
1232         if (!mtx_trylock(&nl->nl_evict_lock)) {
1233                 counter_u64_add(neg_evict_skipped_contended, 1);
1234                 return (NULL);
1235         }
1236         return (nl);
1237 }
1238
1239 static struct namecache *
1240 cache_neg_evict_select_entry(struct neglist *nl)
1241 {
1242         struct namecache *ncp, *lncp;
1243         struct negstate *ns, *lns;
1244         int i;
1245
1246         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1247         mtx_assert(&nl->nl_lock, MA_OWNED);
1248         ncp = TAILQ_FIRST(&nl->nl_list);
1249         if (ncp == NULL)
1250                 return (NULL);
1251         lncp = ncp;
1252         lns = NCP2NEGSTATE(lncp);
1253         for (i = 1; i < 4; i++) {
1254                 ncp = TAILQ_NEXT(ncp, nc_dst);
1255                 if (ncp == NULL)
1256                         break;
1257                 ns = NCP2NEGSTATE(ncp);
1258                 if (ns->neg_hit < lns->neg_hit) {
1259                         lncp = ncp;
1260                         lns = ns;
1261                 }
1262         }
1263         return (lncp);
1264 }
1265
1266 static bool
1267 cache_neg_evict(void)
1268 {
1269         struct namecache *ncp, *ncp2;
1270         struct neglist *nl;
1271         struct vnode *dvp;
1272         struct mtx *dvlp;
1273         struct mtx *blp;
1274         uint32_t hash;
1275         u_char nlen;
1276         bool evicted;
1277
1278         nl = cache_neg_evict_select_list();
1279         if (nl == NULL) {
1280                 return (false);
1281         }
1282
1283         mtx_lock(&nl->nl_lock);
1284         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1285         if (ncp != NULL) {
1286                 cache_neg_demote_locked(ncp);
1287         }
1288         ncp = cache_neg_evict_select_entry(nl);
1289         if (ncp == NULL) {
1290                 counter_u64_add(neg_evict_skipped_empty, 1);
1291                 mtx_unlock(&nl->nl_lock);
1292                 mtx_unlock(&nl->nl_evict_lock);
1293                 return (false);
1294         }
1295         nlen = ncp->nc_nlen;
1296         dvp = ncp->nc_dvp;
1297         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1298         dvlp = VP2VNODELOCK(dvp);
1299         blp = HASH2BUCKETLOCK(hash);
1300         mtx_unlock(&nl->nl_lock);
1301         mtx_unlock(&nl->nl_evict_lock);
1302         mtx_lock(dvlp);
1303         mtx_lock(blp);
1304         /*
1305          * Note that since all locks were dropped above, the entry may be
1306          * gone or reallocated to be something else.
1307          */
1308         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1309                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1310                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1311                         break;
1312         }
1313         if (ncp2 == NULL) {
1314                 counter_u64_add(neg_evict_skipped_missed, 1);
1315                 ncp = NULL;
1316                 evicted = false;
1317         } else {
1318                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1319                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1320                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1321                     ncp->nc_name);
1322                 cache_zap_locked(ncp);
1323                 counter_u64_add(neg_evicted, 1);
1324                 evicted = true;
1325         }
1326         mtx_unlock(blp);
1327         mtx_unlock(dvlp);
1328         if (ncp != NULL)
1329                 cache_free(ncp);
1330         return (evicted);
1331 }
1332
1333 /*
1334  * Maybe evict a negative entry to create more room.
1335  *
1336  * The ncnegfactor parameter limits what fraction of the total count
1337  * can comprise of negative entries. However, if the cache is just
1338  * warming up this leads to excessive evictions.  As such, ncnegminpct
1339  * (recomputed to neg_min) dictates whether the above should be
1340  * applied.
1341  *
1342  * Try evicting if the cache is close to full capacity regardless of
1343  * other considerations.
1344  */
1345 static bool
1346 cache_neg_evict_cond(u_long lnumcache)
1347 {
1348         u_long lnumneg;
1349
1350         if (ncsize - 1000 < lnumcache)
1351                 goto out_evict;
1352         lnumneg = atomic_load_long(&numneg);
1353         if (lnumneg < neg_min)
1354                 return (false);
1355         if (lnumneg * ncnegfactor < lnumcache)
1356                 return (false);
1357 out_evict:
1358         return (cache_neg_evict());
1359 }
1360
1361 /*
1362  * cache_zap_locked():
1363  *
1364  *   Removes a namecache entry from cache, whether it contains an actual
1365  *   pointer to a vnode or if it is just a negative cache entry.
1366  */
1367 static void
1368 cache_zap_locked(struct namecache *ncp)
1369 {
1370         struct nchashhead *ncpp;
1371
1372         if (!(ncp->nc_flag & NCF_NEGATIVE))
1373                 cache_assert_vnode_locked(ncp->nc_vp);
1374         cache_assert_vnode_locked(ncp->nc_dvp);
1375         cache_assert_bucket_locked(ncp);
1376
1377         cache_ncp_invalidate(ncp);
1378
1379         ncpp = NCP2BUCKET(ncp);
1380         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1381         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1382                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1383                     ncp->nc_name, ncp->nc_vp);
1384                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1385                 if (ncp == ncp->nc_vp->v_cache_dd) {
1386                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1387                         ncp->nc_vp->v_cache_dd = NULL;
1388                         vn_seqc_write_end(ncp->nc_vp);
1389                 }
1390         } else {
1391                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1392                     ncp->nc_name);
1393                 cache_neg_remove(ncp);
1394         }
1395         if (ncp->nc_flag & NCF_ISDOTDOT) {
1396                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1397                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1398                         ncp->nc_dvp->v_cache_dd = NULL;
1399                         vn_seqc_write_end(ncp->nc_dvp);
1400                 }
1401         } else {
1402                 LIST_REMOVE(ncp, nc_src);
1403                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1404                         ncp->nc_flag |= NCF_DVDROP;
1405                 }
1406         }
1407 }
1408
1409 static void
1410 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1411 {
1412         struct mtx *blp;
1413
1414         MPASS(ncp->nc_dvp == vp);
1415         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1416         cache_assert_vnode_locked(vp);
1417
1418         blp = NCP2BUCKETLOCK(ncp);
1419         mtx_lock(blp);
1420         cache_zap_locked(ncp);
1421         mtx_unlock(blp);
1422 }
1423
1424 static bool
1425 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1426     struct mtx **vlpp)
1427 {
1428         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1429         struct mtx *blp;
1430
1431         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1432         cache_assert_vnode_locked(vp);
1433
1434         if (ncp->nc_flag & NCF_NEGATIVE) {
1435                 if (*vlpp != NULL) {
1436                         mtx_unlock(*vlpp);
1437                         *vlpp = NULL;
1438                 }
1439                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1440                 return (true);
1441         }
1442
1443         pvlp = VP2VNODELOCK(vp);
1444         blp = NCP2BUCKETLOCK(ncp);
1445         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1446         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1447
1448         if (*vlpp == vlp1 || *vlpp == vlp2) {
1449                 to_unlock = *vlpp;
1450                 *vlpp = NULL;
1451         } else {
1452                 if (*vlpp != NULL) {
1453                         mtx_unlock(*vlpp);
1454                         *vlpp = NULL;
1455                 }
1456                 cache_sort_vnodes(&vlp1, &vlp2);
1457                 if (vlp1 == pvlp) {
1458                         mtx_lock(vlp2);
1459                         to_unlock = vlp2;
1460                 } else {
1461                         if (!mtx_trylock(vlp1))
1462                                 goto out_relock;
1463                         to_unlock = vlp1;
1464                 }
1465         }
1466         mtx_lock(blp);
1467         cache_zap_locked(ncp);
1468         mtx_unlock(blp);
1469         if (to_unlock != NULL)
1470                 mtx_unlock(to_unlock);
1471         return (true);
1472
1473 out_relock:
1474         mtx_unlock(vlp2);
1475         mtx_lock(vlp1);
1476         mtx_lock(vlp2);
1477         MPASS(*vlpp == NULL);
1478         *vlpp = vlp1;
1479         return (false);
1480 }
1481
1482 /*
1483  * If trylocking failed we can get here. We know enough to take all needed locks
1484  * in the right order and re-lookup the entry.
1485  */
1486 static int
1487 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1488     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1489     struct mtx *blp)
1490 {
1491         struct namecache *rncp;
1492
1493         cache_assert_bucket_unlocked(ncp);
1494
1495         cache_sort_vnodes(&dvlp, &vlp);
1496         cache_lock_vnodes(dvlp, vlp);
1497         mtx_lock(blp);
1498         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1499                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1500                     rncp->nc_nlen == cnp->cn_namelen &&
1501                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1502                         break;
1503         }
1504         if (rncp != NULL) {
1505                 cache_zap_locked(rncp);
1506                 mtx_unlock(blp);
1507                 cache_unlock_vnodes(dvlp, vlp);
1508                 counter_u64_add(zap_bucket_relock_success, 1);
1509                 return (0);
1510         }
1511
1512         mtx_unlock(blp);
1513         cache_unlock_vnodes(dvlp, vlp);
1514         return (EAGAIN);
1515 }
1516
1517 static int __noinline
1518 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1519     uint32_t hash, struct mtx *blp)
1520 {
1521         struct mtx *dvlp, *vlp;
1522         struct vnode *dvp;
1523
1524         cache_assert_bucket_locked(ncp);
1525
1526         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1527         vlp = NULL;
1528         if (!(ncp->nc_flag & NCF_NEGATIVE))
1529                 vlp = VP2VNODELOCK(ncp->nc_vp);
1530         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1531                 cache_zap_locked(ncp);
1532                 mtx_unlock(blp);
1533                 cache_unlock_vnodes(dvlp, vlp);
1534                 return (0);
1535         }
1536
1537         dvp = ncp->nc_dvp;
1538         mtx_unlock(blp);
1539         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1540 }
1541
1542 static __noinline int
1543 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1544 {
1545         struct namecache *ncp;
1546         struct mtx *blp;
1547         struct mtx *dvlp, *dvlp2;
1548         uint32_t hash;
1549         int error;
1550
1551         if (cnp->cn_namelen == 2 &&
1552             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1553                 dvlp = VP2VNODELOCK(dvp);
1554                 dvlp2 = NULL;
1555                 mtx_lock(dvlp);
1556 retry_dotdot:
1557                 ncp = dvp->v_cache_dd;
1558                 if (ncp == NULL) {
1559                         mtx_unlock(dvlp);
1560                         if (dvlp2 != NULL)
1561                                 mtx_unlock(dvlp2);
1562                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1563                         return (0);
1564                 }
1565                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1566                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1567                                 goto retry_dotdot;
1568                         MPASS(dvp->v_cache_dd == NULL);
1569                         mtx_unlock(dvlp);
1570                         if (dvlp2 != NULL)
1571                                 mtx_unlock(dvlp2);
1572                         cache_free(ncp);
1573                 } else {
1574                         vn_seqc_write_begin(dvp);
1575                         dvp->v_cache_dd = NULL;
1576                         vn_seqc_write_end(dvp);
1577                         mtx_unlock(dvlp);
1578                         if (dvlp2 != NULL)
1579                                 mtx_unlock(dvlp2);
1580                 }
1581                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1582                 return (1);
1583         }
1584
1585         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1586         blp = HASH2BUCKETLOCK(hash);
1587 retry:
1588         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1589                 goto out_no_entry;
1590
1591         mtx_lock(blp);
1592
1593         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1594                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1595                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1596                         break;
1597         }
1598
1599         if (ncp == NULL) {
1600                 mtx_unlock(blp);
1601                 goto out_no_entry;
1602         }
1603
1604         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1605         if (__predict_false(error != 0)) {
1606                 zap_bucket_fail++;
1607                 goto retry;
1608         }
1609         counter_u64_add(numposzaps, 1);
1610         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1611         cache_free(ncp);
1612         return (1);
1613 out_no_entry:
1614         counter_u64_add(nummisszap, 1);
1615         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1616         return (0);
1617 }
1618
1619 static int __noinline
1620 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1621     struct timespec *tsp, int *ticksp)
1622 {
1623         int ltype;
1624
1625         *vpp = dvp;
1626         counter_u64_add(dothits, 1);
1627         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1628         if (tsp != NULL)
1629                 timespecclear(tsp);
1630         if (ticksp != NULL)
1631                 *ticksp = ticks;
1632         vrefact(*vpp);
1633         /*
1634          * When we lookup "." we still can be asked to lock it
1635          * differently...
1636          */
1637         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1638         if (ltype != VOP_ISLOCKED(*vpp)) {
1639                 if (ltype == LK_EXCLUSIVE) {
1640                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1641                         if (VN_IS_DOOMED((*vpp))) {
1642                                 /* forced unmount */
1643                                 vrele(*vpp);
1644                                 *vpp = NULL;
1645                                 return (ENOENT);
1646                         }
1647                 } else
1648                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1649         }
1650         return (-1);
1651 }
1652
1653 static int __noinline
1654 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1655     struct timespec *tsp, int *ticksp)
1656 {
1657         struct namecache_ts *ncp_ts;
1658         struct namecache *ncp;
1659         struct mtx *dvlp;
1660         enum vgetstate vs;
1661         int error, ltype;
1662         bool whiteout;
1663
1664         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1665
1666         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1667                 cache_remove_cnp(dvp, cnp);
1668                 return (0);
1669         }
1670
1671         counter_u64_add(dotdothits, 1);
1672 retry:
1673         dvlp = VP2VNODELOCK(dvp);
1674         mtx_lock(dvlp);
1675         ncp = dvp->v_cache_dd;
1676         if (ncp == NULL) {
1677                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1678                 mtx_unlock(dvlp);
1679                 return (0);
1680         }
1681         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1682                 if (ncp->nc_flag & NCF_NEGATIVE)
1683                         *vpp = NULL;
1684                 else
1685                         *vpp = ncp->nc_vp;
1686         } else
1687                 *vpp = ncp->nc_dvp;
1688         if (*vpp == NULL)
1689                 goto negative_success;
1690         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1691         cache_out_ts(ncp, tsp, ticksp);
1692         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1693             NCF_DTS && tsp != NULL) {
1694                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1695                 *tsp = ncp_ts->nc_dotdottime;
1696         }
1697
1698         MPASS(dvp != *vpp);
1699         ltype = VOP_ISLOCKED(dvp);
1700         VOP_UNLOCK(dvp);
1701         vs = vget_prep(*vpp);
1702         mtx_unlock(dvlp);
1703         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1704         vn_lock(dvp, ltype | LK_RETRY);
1705         if (VN_IS_DOOMED(dvp)) {
1706                 if (error == 0)
1707                         vput(*vpp);
1708                 *vpp = NULL;
1709                 return (ENOENT);
1710         }
1711         if (error) {
1712                 *vpp = NULL;
1713                 goto retry;
1714         }
1715         return (-1);
1716 negative_success:
1717         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1718                 if (cnp->cn_flags & ISLASTCN) {
1719                         counter_u64_add(numnegzaps, 1);
1720                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1721                         mtx_unlock(dvlp);
1722                         cache_free(ncp);
1723                         return (0);
1724                 }
1725         }
1726
1727         whiteout = (ncp->nc_flag & NCF_WHITE);
1728         cache_out_ts(ncp, tsp, ticksp);
1729         if (cache_neg_hit_prep(ncp))
1730                 cache_neg_promote(ncp);
1731         else
1732                 cache_neg_hit_finish(ncp);
1733         mtx_unlock(dvlp);
1734         if (whiteout)
1735                 cnp->cn_flags |= ISWHITEOUT;
1736         return (ENOENT);
1737 }
1738
1739 /**
1740  * Lookup a name in the name cache
1741  *
1742  * # Arguments
1743  *
1744  * - dvp:       Parent directory in which to search.
1745  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1746  * - cnp:       Parameters of the name search.  The most interesting bits of
1747  *              the cn_flags field have the following meanings:
1748  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1749  *                      it up.
1750  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1751  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1752  *              or negative) lookup, tsp will be filled with any timespec that
1753  *              was stored when this cache entry was created.  However, it will
1754  *              be clear for "." entries.
1755  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1756  *              (positive or negative) lookup, it will contain the ticks value
1757  *              that was current when the cache entry was created, unless cnp
1758  *              was ".".
1759  *
1760  * Either both tsp and ticks have to be provided or neither of them.
1761  *
1762  * # Returns
1763  *
1764  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1765  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1766  *              to a forced unmount.  vpp will not be modified.  If the entry
1767  *              is a whiteout, then the ISWHITEOUT flag will be set in
1768  *              cnp->cn_flags.
1769  * - 0:         A cache miss.  vpp will not be modified.
1770  *
1771  * # Locking
1772  *
1773  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1774  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1775  * lock is not recursively acquired.
1776  */
1777 static int __noinline
1778 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1779     struct timespec *tsp, int *ticksp)
1780 {
1781         struct namecache *ncp;
1782         struct mtx *blp;
1783         uint32_t hash;
1784         enum vgetstate vs;
1785         int error;
1786         bool whiteout;
1787
1788         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1789         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1790
1791 retry:
1792         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1793         blp = HASH2BUCKETLOCK(hash);
1794         mtx_lock(blp);
1795
1796         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1797                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1798                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1799                         break;
1800         }
1801
1802         if (__predict_false(ncp == NULL)) {
1803                 mtx_unlock(blp);
1804                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1805                     NULL);
1806                 counter_u64_add(nummiss, 1);
1807                 return (0);
1808         }
1809
1810         if (ncp->nc_flag & NCF_NEGATIVE)
1811                 goto negative_success;
1812
1813         counter_u64_add(numposhits, 1);
1814         *vpp = ncp->nc_vp;
1815         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1816         cache_out_ts(ncp, tsp, ticksp);
1817         MPASS(dvp != *vpp);
1818         vs = vget_prep(*vpp);
1819         mtx_unlock(blp);
1820         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1821         if (error) {
1822                 *vpp = NULL;
1823                 goto retry;
1824         }
1825         return (-1);
1826 negative_success:
1827         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1828                 if (cnp->cn_flags & ISLASTCN) {
1829                         counter_u64_add(numnegzaps, 1);
1830                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1831                         if (__predict_false(error != 0)) {
1832                                 zap_bucket_fail2++;
1833                                 goto retry;
1834                         }
1835                         cache_free(ncp);
1836                         return (0);
1837                 }
1838         }
1839
1840         whiteout = (ncp->nc_flag & NCF_WHITE);
1841         cache_out_ts(ncp, tsp, ticksp);
1842         if (cache_neg_hit_prep(ncp))
1843                 cache_neg_promote(ncp);
1844         else
1845                 cache_neg_hit_finish(ncp);
1846         mtx_unlock(blp);
1847         if (whiteout)
1848                 cnp->cn_flags |= ISWHITEOUT;
1849         return (ENOENT);
1850 }
1851
1852 int
1853 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1854     struct timespec *tsp, int *ticksp)
1855 {
1856         struct namecache *ncp;
1857         uint32_t hash;
1858         enum vgetstate vs;
1859         int error;
1860         bool whiteout, neg_promote;
1861         u_short nc_flag;
1862
1863         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1864
1865 #ifdef DEBUG_CACHE
1866         if (__predict_false(!doingcache)) {
1867                 cnp->cn_flags &= ~MAKEENTRY;
1868                 return (0);
1869         }
1870 #endif
1871
1872         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1873                 if (cnp->cn_namelen == 1)
1874                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1875                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1876                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1877         }
1878
1879         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1880
1881         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1882                 cache_remove_cnp(dvp, cnp);
1883                 return (0);
1884         }
1885
1886         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1887         vfs_smr_enter();
1888
1889         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1890                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1891                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1892                         break;
1893         }
1894
1895         if (__predict_false(ncp == NULL)) {
1896                 vfs_smr_exit();
1897                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1898                     NULL);
1899                 counter_u64_add(nummiss, 1);
1900                 return (0);
1901         }
1902
1903         nc_flag = atomic_load_char(&ncp->nc_flag);
1904         if (nc_flag & NCF_NEGATIVE)
1905                 goto negative_success;
1906
1907         counter_u64_add(numposhits, 1);
1908         *vpp = ncp->nc_vp;
1909         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1910         cache_out_ts(ncp, tsp, ticksp);
1911         MPASS(dvp != *vpp);
1912         if (!cache_ncp_canuse(ncp)) {
1913                 vfs_smr_exit();
1914                 *vpp = NULL;
1915                 goto out_fallback;
1916         }
1917         vs = vget_prep_smr(*vpp);
1918         vfs_smr_exit();
1919         if (__predict_false(vs == VGET_NONE)) {
1920                 *vpp = NULL;
1921                 goto out_fallback;
1922         }
1923         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1924         if (error) {
1925                 *vpp = NULL;
1926                 goto out_fallback;
1927         }
1928         return (-1);
1929 negative_success:
1930         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1931                 if (cnp->cn_flags & ISLASTCN) {
1932                         vfs_smr_exit();
1933                         goto out_fallback;
1934                 }
1935         }
1936
1937         cache_out_ts(ncp, tsp, ticksp);
1938         whiteout = (ncp->nc_flag & NCF_WHITE);
1939         neg_promote = cache_neg_hit_prep(ncp);
1940         if (!cache_ncp_canuse(ncp)) {
1941                 cache_neg_hit_abort(ncp);
1942                 vfs_smr_exit();
1943                 goto out_fallback;
1944         }
1945         if (neg_promote) {
1946                 vfs_smr_exit();
1947                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1948                         goto out_fallback;
1949         } else {
1950                 cache_neg_hit_finish(ncp);
1951                 vfs_smr_exit();
1952         }
1953         if (whiteout)
1954                 cnp->cn_flags |= ISWHITEOUT;
1955         return (ENOENT);
1956 out_fallback:
1957         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1958 }
1959
1960 struct celockstate {
1961         struct mtx *vlp[3];
1962         struct mtx *blp[2];
1963 };
1964 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1965 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1966
1967 static inline void
1968 cache_celockstate_init(struct celockstate *cel)
1969 {
1970
1971         bzero(cel, sizeof(*cel));
1972 }
1973
1974 static void
1975 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1976     struct vnode *dvp)
1977 {
1978         struct mtx *vlp1, *vlp2;
1979
1980         MPASS(cel->vlp[0] == NULL);
1981         MPASS(cel->vlp[1] == NULL);
1982         MPASS(cel->vlp[2] == NULL);
1983
1984         MPASS(vp != NULL || dvp != NULL);
1985
1986         vlp1 = VP2VNODELOCK(vp);
1987         vlp2 = VP2VNODELOCK(dvp);
1988         cache_sort_vnodes(&vlp1, &vlp2);
1989
1990         if (vlp1 != NULL) {
1991                 mtx_lock(vlp1);
1992                 cel->vlp[0] = vlp1;
1993         }
1994         mtx_lock(vlp2);
1995         cel->vlp[1] = vlp2;
1996 }
1997
1998 static void
1999 cache_unlock_vnodes_cel(struct celockstate *cel)
2000 {
2001
2002         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2003
2004         if (cel->vlp[0] != NULL)
2005                 mtx_unlock(cel->vlp[0]);
2006         if (cel->vlp[1] != NULL)
2007                 mtx_unlock(cel->vlp[1]);
2008         if (cel->vlp[2] != NULL)
2009                 mtx_unlock(cel->vlp[2]);
2010 }
2011
2012 static bool
2013 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2014 {
2015         struct mtx *vlp;
2016         bool ret;
2017
2018         cache_assert_vlp_locked(cel->vlp[0]);
2019         cache_assert_vlp_locked(cel->vlp[1]);
2020         MPASS(cel->vlp[2] == NULL);
2021
2022         MPASS(vp != NULL);
2023         vlp = VP2VNODELOCK(vp);
2024
2025         ret = true;
2026         if (vlp >= cel->vlp[1]) {
2027                 mtx_lock(vlp);
2028         } else {
2029                 if (mtx_trylock(vlp))
2030                         goto out;
2031                 cache_lock_vnodes_cel_3_failures++;
2032                 cache_unlock_vnodes_cel(cel);
2033                 if (vlp < cel->vlp[0]) {
2034                         mtx_lock(vlp);
2035                         mtx_lock(cel->vlp[0]);
2036                         mtx_lock(cel->vlp[1]);
2037                 } else {
2038                         if (cel->vlp[0] != NULL)
2039                                 mtx_lock(cel->vlp[0]);
2040                         mtx_lock(vlp);
2041                         mtx_lock(cel->vlp[1]);
2042                 }
2043                 ret = false;
2044         }
2045 out:
2046         cel->vlp[2] = vlp;
2047         return (ret);
2048 }
2049
2050 static void
2051 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2052     struct mtx *blp2)
2053 {
2054
2055         MPASS(cel->blp[0] == NULL);
2056         MPASS(cel->blp[1] == NULL);
2057
2058         cache_sort_vnodes(&blp1, &blp2);
2059
2060         if (blp1 != NULL) {
2061                 mtx_lock(blp1);
2062                 cel->blp[0] = blp1;
2063         }
2064         mtx_lock(blp2);
2065         cel->blp[1] = blp2;
2066 }
2067
2068 static void
2069 cache_unlock_buckets_cel(struct celockstate *cel)
2070 {
2071
2072         if (cel->blp[0] != NULL)
2073                 mtx_unlock(cel->blp[0]);
2074         mtx_unlock(cel->blp[1]);
2075 }
2076
2077 /*
2078  * Lock part of the cache affected by the insertion.
2079  *
2080  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2081  * However, insertion can result in removal of an old entry. In this
2082  * case we have an additional vnode and bucketlock pair to lock.
2083  *
2084  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2085  * preserving the locking order (smaller address first).
2086  */
2087 static void
2088 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2089     uint32_t hash)
2090 {
2091         struct namecache *ncp;
2092         struct mtx *blps[2];
2093
2094         blps[0] = HASH2BUCKETLOCK(hash);
2095         for (;;) {
2096                 blps[1] = NULL;
2097                 cache_lock_vnodes_cel(cel, dvp, vp);
2098                 if (vp == NULL || vp->v_type != VDIR)
2099                         break;
2100                 ncp = vp->v_cache_dd;
2101                 if (ncp == NULL)
2102                         break;
2103                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2104                         break;
2105                 MPASS(ncp->nc_dvp == vp);
2106                 blps[1] = NCP2BUCKETLOCK(ncp);
2107                 if (ncp->nc_flag & NCF_NEGATIVE)
2108                         break;
2109                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2110                         break;
2111                 /*
2112                  * All vnodes got re-locked. Re-validate the state and if
2113                  * nothing changed we are done. Otherwise restart.
2114                  */
2115                 if (ncp == vp->v_cache_dd &&
2116                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2117                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2118                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2119                         break;
2120                 cache_unlock_vnodes_cel(cel);
2121                 cel->vlp[0] = NULL;
2122                 cel->vlp[1] = NULL;
2123                 cel->vlp[2] = NULL;
2124         }
2125         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2126 }
2127
2128 static void
2129 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2130     uint32_t hash)
2131 {
2132         struct namecache *ncp;
2133         struct mtx *blps[2];
2134
2135         blps[0] = HASH2BUCKETLOCK(hash);
2136         for (;;) {
2137                 blps[1] = NULL;
2138                 cache_lock_vnodes_cel(cel, dvp, vp);
2139                 ncp = dvp->v_cache_dd;
2140                 if (ncp == NULL)
2141                         break;
2142                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2143                         break;
2144                 MPASS(ncp->nc_dvp == dvp);
2145                 blps[1] = NCP2BUCKETLOCK(ncp);
2146                 if (ncp->nc_flag & NCF_NEGATIVE)
2147                         break;
2148                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2149                         break;
2150                 if (ncp == dvp->v_cache_dd &&
2151                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2152                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2153                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2154                         break;
2155                 cache_unlock_vnodes_cel(cel);
2156                 cel->vlp[0] = NULL;
2157                 cel->vlp[1] = NULL;
2158                 cel->vlp[2] = NULL;
2159         }
2160         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2161 }
2162
2163 static void
2164 cache_enter_unlock(struct celockstate *cel)
2165 {
2166
2167         cache_unlock_buckets_cel(cel);
2168         cache_unlock_vnodes_cel(cel);
2169 }
2170
2171 static void __noinline
2172 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2173     struct componentname *cnp)
2174 {
2175         struct celockstate cel;
2176         struct namecache *ncp;
2177         uint32_t hash;
2178         int len;
2179
2180         if (dvp->v_cache_dd == NULL)
2181                 return;
2182         len = cnp->cn_namelen;
2183         cache_celockstate_init(&cel);
2184         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2185         cache_enter_lock_dd(&cel, dvp, vp, hash);
2186         vn_seqc_write_begin(dvp);
2187         ncp = dvp->v_cache_dd;
2188         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2189                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2190                 cache_zap_locked(ncp);
2191         } else {
2192                 ncp = NULL;
2193         }
2194         dvp->v_cache_dd = NULL;
2195         vn_seqc_write_end(dvp);
2196         cache_enter_unlock(&cel);
2197         if (ncp != NULL)
2198                 cache_free(ncp);
2199 }
2200
2201 /*
2202  * Add an entry to the cache.
2203  */
2204 void
2205 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2206     struct timespec *tsp, struct timespec *dtsp)
2207 {
2208         struct celockstate cel;
2209         struct namecache *ncp, *n2, *ndd;
2210         struct namecache_ts *ncp_ts;
2211         struct nchashhead *ncpp;
2212         uint32_t hash;
2213         int flag;
2214         int len;
2215
2216         VNPASS(dvp != vp, dvp);
2217         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2218         VNPASS(dvp->v_type != VNON, dvp);
2219         if (vp != NULL) {
2220                 VNPASS(!VN_IS_DOOMED(vp), vp);
2221                 VNPASS(vp->v_type != VNON, vp);
2222         }
2223
2224 #ifdef DEBUG_CACHE
2225         if (__predict_false(!doingcache))
2226                 return;
2227 #endif
2228
2229         flag = 0;
2230         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2231                 if (cnp->cn_namelen == 1)
2232                         return;
2233                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2234                         cache_enter_dotdot_prep(dvp, vp, cnp);
2235                         flag = NCF_ISDOTDOT;
2236                 }
2237         }
2238
2239         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2240         if (ncp == NULL)
2241                 return;
2242
2243         cache_celockstate_init(&cel);
2244         ndd = NULL;
2245         ncp_ts = NULL;
2246
2247         /*
2248          * Calculate the hash key and setup as much of the new
2249          * namecache entry as possible before acquiring the lock.
2250          */
2251         ncp->nc_flag = flag | NCF_WIP;
2252         ncp->nc_vp = vp;
2253         if (vp == NULL)
2254                 cache_neg_init(ncp);
2255         ncp->nc_dvp = dvp;
2256         if (tsp != NULL) {
2257                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2258                 ncp_ts->nc_time = *tsp;
2259                 ncp_ts->nc_ticks = ticks;
2260                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2261                 if (dtsp != NULL) {
2262                         ncp_ts->nc_dotdottime = *dtsp;
2263                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2264                 }
2265         }
2266         len = ncp->nc_nlen = cnp->cn_namelen;
2267         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2268         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2269         ncp->nc_name[len] = '\0';
2270         cache_enter_lock(&cel, dvp, vp, hash);
2271
2272         /*
2273          * See if this vnode or negative entry is already in the cache
2274          * with this name.  This can happen with concurrent lookups of
2275          * the same path name.
2276          */
2277         ncpp = NCHHASH(hash);
2278         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2279                 if (n2->nc_dvp == dvp &&
2280                     n2->nc_nlen == cnp->cn_namelen &&
2281                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2282                         MPASS(cache_ncp_canuse(n2));
2283                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2284                                 KASSERT(vp == NULL,
2285                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2286                                     __func__, NULL, vp));
2287                         else
2288                                 KASSERT(n2->nc_vp == vp,
2289                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2290                                     __func__, n2->nc_vp, vp));
2291                         /*
2292                          * Entries are supposed to be immutable unless in the
2293                          * process of getting destroyed. Accommodating for
2294                          * changing timestamps is possible but not worth it.
2295                          * This should be harmless in terms of correctness, in
2296                          * the worst case resulting in an earlier expiration.
2297                          * Alternatively, the found entry can be replaced
2298                          * altogether.
2299                          */
2300                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2301 #if 0
2302                         if (tsp != NULL) {
2303                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2304                                     ("no NCF_TS"));
2305                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2306                                 n2_ts->nc_time = ncp_ts->nc_time;
2307                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2308                                 if (dtsp != NULL) {
2309                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2310                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2311                                 }
2312                         }
2313 #endif
2314                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2315                             vp);
2316                         goto out_unlock_free;
2317                 }
2318         }
2319
2320         if (flag == NCF_ISDOTDOT) {
2321                 /*
2322                  * See if we are trying to add .. entry, but some other lookup
2323                  * has populated v_cache_dd pointer already.
2324                  */
2325                 if (dvp->v_cache_dd != NULL)
2326                         goto out_unlock_free;
2327                 KASSERT(vp == NULL || vp->v_type == VDIR,
2328                     ("wrong vnode type %p", vp));
2329                 vn_seqc_write_begin(dvp);
2330                 dvp->v_cache_dd = ncp;
2331                 vn_seqc_write_end(dvp);
2332         }
2333
2334         if (vp != NULL) {
2335                 if (flag != NCF_ISDOTDOT) {
2336                         /*
2337                          * For this case, the cache entry maps both the
2338                          * directory name in it and the name ".." for the
2339                          * directory's parent.
2340                          */
2341                         vn_seqc_write_begin(vp);
2342                         if ((ndd = vp->v_cache_dd) != NULL) {
2343                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2344                                         cache_zap_locked(ndd);
2345                                 else
2346                                         ndd = NULL;
2347                         }
2348                         vp->v_cache_dd = ncp;
2349                         vn_seqc_write_end(vp);
2350                 } else if (vp->v_type != VDIR) {
2351                         if (vp->v_cache_dd != NULL) {
2352                                 vn_seqc_write_begin(vp);
2353                                 vp->v_cache_dd = NULL;
2354                                 vn_seqc_write_end(vp);
2355                         }
2356                 }
2357         }
2358
2359         if (flag != NCF_ISDOTDOT) {
2360                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2361                         cache_hold_vnode(dvp);
2362                 }
2363                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2364         }
2365
2366         /*
2367          * If the entry is "negative", we place it into the
2368          * "negative" cache queue, otherwise, we place it into the
2369          * destination vnode's cache entries queue.
2370          */
2371         if (vp != NULL) {
2372                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2373                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2374                     vp);
2375         } else {
2376                 if (cnp->cn_flags & ISWHITEOUT)
2377                         ncp->nc_flag |= NCF_WHITE;
2378                 cache_neg_insert(ncp);
2379                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2380                     ncp->nc_name);
2381         }
2382
2383         /*
2384          * Insert the new namecache entry into the appropriate chain
2385          * within the cache entries table.
2386          */
2387         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2388
2389         atomic_thread_fence_rel();
2390         /*
2391          * Mark the entry as fully constructed.
2392          * It is immutable past this point until its removal.
2393          */
2394         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2395
2396         cache_enter_unlock(&cel);
2397         if (ndd != NULL)
2398                 cache_free(ndd);
2399         return;
2400 out_unlock_free:
2401         cache_enter_unlock(&cel);
2402         cache_free(ncp);
2403         return;
2404 }
2405
2406 static u_int
2407 cache_roundup_2(u_int val)
2408 {
2409         u_int res;
2410
2411         for (res = 1; res <= val; res <<= 1)
2412                 continue;
2413
2414         return (res);
2415 }
2416
2417 static struct nchashhead *
2418 nchinittbl(u_long elements, u_long *hashmask)
2419 {
2420         struct nchashhead *hashtbl;
2421         u_long hashsize, i;
2422
2423         hashsize = cache_roundup_2(elements) / 2;
2424
2425         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2426         for (i = 0; i < hashsize; i++)
2427                 CK_SLIST_INIT(&hashtbl[i]);
2428         *hashmask = hashsize - 1;
2429         return (hashtbl);
2430 }
2431
2432 static void
2433 ncfreetbl(struct nchashhead *hashtbl)
2434 {
2435
2436         free(hashtbl, M_VFSCACHE);
2437 }
2438
2439 /*
2440  * Name cache initialization, from vfs_init() when we are booting
2441  */
2442 static void
2443 nchinit(void *dummy __unused)
2444 {
2445         u_int i;
2446
2447         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2448             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2449         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2450             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2451         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2452             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2453         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2454             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2455
2456         VFS_SMR_ZONE_SET(cache_zone_small);
2457         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2458         VFS_SMR_ZONE_SET(cache_zone_large);
2459         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2460
2461         ncsize = desiredvnodes * ncsizefactor;
2462         cache_recalc_neg_min(ncnegminpct);
2463         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2464         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2465         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2466                 ncbuckethash = 7;
2467         if (ncbuckethash > nchash)
2468                 ncbuckethash = nchash;
2469         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2470             M_WAITOK | M_ZERO);
2471         for (i = 0; i < numbucketlocks; i++)
2472                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2473         ncvnodehash = ncbuckethash;
2474         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2475             M_WAITOK | M_ZERO);
2476         for (i = 0; i < numvnodelocks; i++)
2477                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2478
2479         for (i = 0; i < numneglists; i++) {
2480                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2481                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2482                 TAILQ_INIT(&neglists[i].nl_list);
2483                 TAILQ_INIT(&neglists[i].nl_hotlist);
2484         }
2485 }
2486 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2487
2488 void
2489 cache_vnode_init(struct vnode *vp)
2490 {
2491
2492         LIST_INIT(&vp->v_cache_src);
2493         TAILQ_INIT(&vp->v_cache_dst);
2494         vp->v_cache_dd = NULL;
2495         cache_prehash(vp);
2496 }
2497
2498 void
2499 cache_changesize(u_long newmaxvnodes)
2500 {
2501         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2502         u_long new_nchash, old_nchash;
2503         struct namecache *ncp;
2504         uint32_t hash;
2505         u_long newncsize;
2506         int i;
2507
2508         newncsize = newmaxvnodes * ncsizefactor;
2509         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2510         if (newmaxvnodes < numbucketlocks)
2511                 newmaxvnodes = numbucketlocks;
2512
2513         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2514         /* If same hash table size, nothing to do */
2515         if (nchash == new_nchash) {
2516                 ncfreetbl(new_nchashtbl);
2517                 return;
2518         }
2519         /*
2520          * Move everything from the old hash table to the new table.
2521          * None of the namecache entries in the table can be removed
2522          * because to do so, they have to be removed from the hash table.
2523          */
2524         cache_lock_all_vnodes();
2525         cache_lock_all_buckets();
2526         old_nchashtbl = nchashtbl;
2527         old_nchash = nchash;
2528         nchashtbl = new_nchashtbl;
2529         nchash = new_nchash;
2530         for (i = 0; i <= old_nchash; i++) {
2531                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2532                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2533                             ncp->nc_dvp);
2534                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2535                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2536                 }
2537         }
2538         ncsize = newncsize;
2539         cache_recalc_neg_min(ncnegminpct);
2540         cache_unlock_all_buckets();
2541         cache_unlock_all_vnodes();
2542         ncfreetbl(old_nchashtbl);
2543 }
2544
2545 /*
2546  * Invalidate all entries from and to a particular vnode.
2547  */
2548 static void
2549 cache_purge_impl(struct vnode *vp)
2550 {
2551         struct cache_freebatch batch;
2552         struct namecache *ncp;
2553         struct mtx *vlp, *vlp2;
2554
2555         TAILQ_INIT(&batch);
2556         vlp = VP2VNODELOCK(vp);
2557         vlp2 = NULL;
2558         mtx_lock(vlp);
2559 retry:
2560         while (!LIST_EMPTY(&vp->v_cache_src)) {
2561                 ncp = LIST_FIRST(&vp->v_cache_src);
2562                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2563                         goto retry;
2564                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2565         }
2566         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2567                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2568                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2569                         goto retry;
2570                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2571         }
2572         ncp = vp->v_cache_dd;
2573         if (ncp != NULL) {
2574                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2575                    ("lost dotdot link"));
2576                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2577                         goto retry;
2578                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2579         }
2580         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2581         mtx_unlock(vlp);
2582         if (vlp2 != NULL)
2583                 mtx_unlock(vlp2);
2584         cache_free_batch(&batch);
2585 }
2586
2587 /*
2588  * Opportunistic check to see if there is anything to do.
2589  */
2590 static bool
2591 cache_has_entries(struct vnode *vp)
2592 {
2593
2594         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2595             vp->v_cache_dd == NULL)
2596                 return (false);
2597         return (true);
2598 }
2599
2600 void
2601 cache_purge(struct vnode *vp)
2602 {
2603
2604         SDT_PROBE1(vfs, namecache, purge, done, vp);
2605         if (!cache_has_entries(vp))
2606                 return;
2607         cache_purge_impl(vp);
2608 }
2609
2610 /*
2611  * Only to be used by vgone.
2612  */
2613 void
2614 cache_purge_vgone(struct vnode *vp)
2615 {
2616         struct mtx *vlp;
2617
2618         VNPASS(VN_IS_DOOMED(vp), vp);
2619         if (cache_has_entries(vp)) {
2620                 cache_purge_impl(vp);
2621                 return;
2622         }
2623
2624         /*
2625          * Serialize against a potential thread doing cache_purge.
2626          */
2627         vlp = VP2VNODELOCK(vp);
2628         mtx_wait_unlocked(vlp);
2629         if (cache_has_entries(vp)) {
2630                 cache_purge_impl(vp);
2631                 return;
2632         }
2633         return;
2634 }
2635
2636 /*
2637  * Invalidate all negative entries for a particular directory vnode.
2638  */
2639 void
2640 cache_purge_negative(struct vnode *vp)
2641 {
2642         struct cache_freebatch batch;
2643         struct namecache *ncp, *nnp;
2644         struct mtx *vlp;
2645
2646         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2647         if (LIST_EMPTY(&vp->v_cache_src))
2648                 return;
2649         TAILQ_INIT(&batch);
2650         vlp = VP2VNODELOCK(vp);
2651         mtx_lock(vlp);
2652         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2653                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2654                         continue;
2655                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2656                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2657         }
2658         mtx_unlock(vlp);
2659         cache_free_batch(&batch);
2660 }
2661
2662 /*
2663  * Entry points for modifying VOP operations.
2664  */
2665 void
2666 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2667     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2668 {
2669
2670         ASSERT_VOP_IN_SEQC(fdvp);
2671         ASSERT_VOP_IN_SEQC(fvp);
2672         ASSERT_VOP_IN_SEQC(tdvp);
2673         if (tvp != NULL)
2674                 ASSERT_VOP_IN_SEQC(tvp);
2675
2676         cache_purge(fvp);
2677         if (tvp != NULL) {
2678                 cache_purge(tvp);
2679                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2680                     ("%s: lingering negative entry", __func__));
2681         } else {
2682                 cache_remove_cnp(tdvp, tcnp);
2683         }
2684 }
2685
2686 void
2687 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2688 {
2689
2690         ASSERT_VOP_IN_SEQC(dvp);
2691         ASSERT_VOP_IN_SEQC(vp);
2692         cache_purge(vp);
2693 }
2694
2695 #ifdef INVARIANTS
2696 /*
2697  * Validate that if an entry exists it matches.
2698  */
2699 void
2700 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2701 {
2702         struct namecache *ncp;
2703         struct mtx *blp;
2704         uint32_t hash;
2705
2706         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2707         if (CK_SLIST_EMPTY(NCHHASH(hash)))
2708                 return;
2709         blp = HASH2BUCKETLOCK(hash);
2710         mtx_lock(blp);
2711         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2712                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2713                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2714                         if (ncp->nc_vp != vp)
2715                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
2716                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
2717                                     ncp->nc_vp);
2718                 }
2719         }
2720         mtx_unlock(blp);
2721 }
2722 #endif
2723
2724 /*
2725  * Flush all entries referencing a particular filesystem.
2726  */
2727 void
2728 cache_purgevfs(struct mount *mp)
2729 {
2730         struct vnode *vp, *mvp;
2731
2732         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2733         /*
2734          * Somewhat wasteful iteration over all vnodes. Would be better to
2735          * support filtering and avoid the interlock to begin with.
2736          */
2737         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2738                 if (!cache_has_entries(vp)) {
2739                         VI_UNLOCK(vp);
2740                         continue;
2741                 }
2742                 vholdl(vp);
2743                 VI_UNLOCK(vp);
2744                 cache_purge(vp);
2745                 vdrop(vp);
2746         }
2747 }
2748
2749 /*
2750  * Perform canonical checks and cache lookup and pass on to filesystem
2751  * through the vop_cachedlookup only if needed.
2752  */
2753
2754 int
2755 vfs_cache_lookup(struct vop_lookup_args *ap)
2756 {
2757         struct vnode *dvp;
2758         int error;
2759         struct vnode **vpp = ap->a_vpp;
2760         struct componentname *cnp = ap->a_cnp;
2761         int flags = cnp->cn_flags;
2762
2763         *vpp = NULL;
2764         dvp = ap->a_dvp;
2765
2766         if (dvp->v_type != VDIR)
2767                 return (ENOTDIR);
2768
2769         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2770             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2771                 return (EROFS);
2772
2773         error = vn_dir_check_exec(dvp, cnp);
2774         if (error != 0)
2775                 return (error);
2776
2777         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2778         if (error == 0)
2779                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2780         if (error == -1)
2781                 return (0);
2782         return (error);
2783 }
2784
2785 /* Implementation of the getcwd syscall. */
2786 int
2787 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2788 {
2789         char *buf, *retbuf;
2790         size_t buflen;
2791         int error;
2792
2793         buflen = uap->buflen;
2794         if (__predict_false(buflen < 2))
2795                 return (EINVAL);
2796         if (buflen > MAXPATHLEN)
2797                 buflen = MAXPATHLEN;
2798
2799         buf = uma_zalloc(namei_zone, M_WAITOK);
2800         error = vn_getcwd(buf, &retbuf, &buflen);
2801         if (error == 0)
2802                 error = copyout(retbuf, uap->buf, buflen);
2803         uma_zfree(namei_zone, buf);
2804         return (error);
2805 }
2806
2807 int
2808 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2809 {
2810         struct pwd *pwd;
2811         int error;
2812
2813         vfs_smr_enter();
2814         pwd = pwd_get_smr();
2815         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2816             buflen, 0);
2817         VFS_SMR_ASSERT_NOT_ENTERED();
2818         if (error < 0) {
2819                 pwd = pwd_hold(curthread);
2820                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2821                     retbuf, buflen);
2822                 pwd_drop(pwd);
2823         }
2824
2825 #ifdef KTRACE
2826         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2827                 ktrnamei(*retbuf);
2828 #endif
2829         return (error);
2830 }
2831
2832 static int
2833 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2834     size_t size, int flags, enum uio_seg pathseg)
2835 {
2836         struct nameidata nd;
2837         char *retbuf, *freebuf;
2838         int error;
2839
2840         if (flags != 0)
2841                 return (EINVAL);
2842         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2843             pathseg, path, fd, &cap_fstat_rights, td);
2844         if ((error = namei(&nd)) != 0)
2845                 return (error);
2846         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2847         if (error == 0) {
2848                 error = copyout(retbuf, buf, size);
2849                 free(freebuf, M_TEMP);
2850         }
2851         NDFREE(&nd, 0);
2852         return (error);
2853 }
2854
2855 int
2856 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2857 {
2858
2859         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2860             uap->flags, UIO_USERSPACE));
2861 }
2862
2863 /*
2864  * Retrieve the full filesystem path that correspond to a vnode from the name
2865  * cache (if available)
2866  */
2867 int
2868 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2869 {
2870         struct pwd *pwd;
2871         char *buf;
2872         size_t buflen;
2873         int error;
2874
2875         if (__predict_false(vp == NULL))
2876                 return (EINVAL);
2877
2878         buflen = MAXPATHLEN;
2879         buf = malloc(buflen, M_TEMP, M_WAITOK);
2880         vfs_smr_enter();
2881         pwd = pwd_get_smr();
2882         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2883         VFS_SMR_ASSERT_NOT_ENTERED();
2884         if (error < 0) {
2885                 pwd = pwd_hold(curthread);
2886                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2887                 pwd_drop(pwd);
2888         }
2889         if (error == 0)
2890                 *freebuf = buf;
2891         else
2892                 free(buf, M_TEMP);
2893         return (error);
2894 }
2895
2896 /*
2897  * This function is similar to vn_fullpath, but it attempts to lookup the
2898  * pathname relative to the global root mount point.  This is required for the
2899  * auditing sub-system, as audited pathnames must be absolute, relative to the
2900  * global root mount point.
2901  */
2902 int
2903 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2904 {
2905         char *buf;
2906         size_t buflen;
2907         int error;
2908
2909         if (__predict_false(vp == NULL))
2910                 return (EINVAL);
2911         buflen = MAXPATHLEN;
2912         buf = malloc(buflen, M_TEMP, M_WAITOK);
2913         vfs_smr_enter();
2914         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2915         VFS_SMR_ASSERT_NOT_ENTERED();
2916         if (error < 0) {
2917                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2918         }
2919         if (error == 0)
2920                 *freebuf = buf;
2921         else
2922                 free(buf, M_TEMP);
2923         return (error);
2924 }
2925
2926 static struct namecache *
2927 vn_dd_from_dst(struct vnode *vp)
2928 {
2929         struct namecache *ncp;
2930
2931         cache_assert_vnode_locked(vp);
2932         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2933                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2934                         return (ncp);
2935         }
2936         return (NULL);
2937 }
2938
2939 int
2940 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2941 {
2942         struct vnode *dvp;
2943         struct namecache *ncp;
2944         struct mtx *vlp;
2945         int error;
2946
2947         vlp = VP2VNODELOCK(*vp);
2948         mtx_lock(vlp);
2949         ncp = (*vp)->v_cache_dd;
2950         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2951                 KASSERT(ncp == vn_dd_from_dst(*vp),
2952                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2953                     ncp, vn_dd_from_dst(*vp)));
2954         } else {
2955                 ncp = vn_dd_from_dst(*vp);
2956         }
2957         if (ncp != NULL) {
2958                 if (*buflen < ncp->nc_nlen) {
2959                         mtx_unlock(vlp);
2960                         vrele(*vp);
2961                         counter_u64_add(numfullpathfail4, 1);
2962                         error = ENOMEM;
2963                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2964                             vp, NULL);
2965                         return (error);
2966                 }
2967                 *buflen -= ncp->nc_nlen;
2968                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2969                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2970                     ncp->nc_name, vp);
2971                 dvp = *vp;
2972                 *vp = ncp->nc_dvp;
2973                 vref(*vp);
2974                 mtx_unlock(vlp);
2975                 vrele(dvp);
2976                 return (0);
2977         }
2978         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2979
2980         mtx_unlock(vlp);
2981         vn_lock(*vp, LK_SHARED | LK_RETRY);
2982         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2983         vput(*vp);
2984         if (error) {
2985                 counter_u64_add(numfullpathfail2, 1);
2986                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2987                 return (error);
2988         }
2989
2990         *vp = dvp;
2991         if (VN_IS_DOOMED(dvp)) {
2992                 /* forced unmount */
2993                 vrele(dvp);
2994                 error = ENOENT;
2995                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2996                 return (error);
2997         }
2998         /*
2999          * *vp has its use count incremented still.
3000          */
3001
3002         return (0);
3003 }
3004
3005 /*
3006  * Resolve a directory to a pathname.
3007  *
3008  * The name of the directory can always be found in the namecache or fetched
3009  * from the filesystem. There is also guaranteed to be only one parent, meaning
3010  * we can just follow vnodes up until we find the root.
3011  *
3012  * The vnode must be referenced.
3013  */
3014 static int
3015 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3016     size_t *len, size_t addend)
3017 {
3018 #ifdef KDTRACE_HOOKS
3019         struct vnode *startvp = vp;
3020 #endif
3021         struct vnode *vp1;
3022         size_t buflen;
3023         int error;
3024         bool slash_prefixed;
3025
3026         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3027         VNPASS(vp->v_usecount > 0, vp);
3028
3029         buflen = *len;
3030
3031         slash_prefixed = true;
3032         if (addend == 0) {
3033                 MPASS(*len >= 2);
3034                 buflen--;
3035                 buf[buflen] = '\0';
3036                 slash_prefixed = false;
3037         }
3038
3039         error = 0;
3040
3041         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3042         counter_u64_add(numfullpathcalls, 1);
3043         while (vp != rdir && vp != rootvnode) {
3044                 /*
3045                  * The vp vnode must be already fully constructed,
3046                  * since it is either found in namecache or obtained
3047                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3048                  * without obtaining the vnode lock.
3049                  */
3050                 if ((vp->v_vflag & VV_ROOT) != 0) {
3051                         vn_lock(vp, LK_RETRY | LK_SHARED);
3052
3053                         /*
3054                          * With the vnode locked, check for races with
3055                          * unmount, forced or not.  Note that we
3056                          * already verified that vp is not equal to
3057                          * the root vnode, which means that
3058                          * mnt_vnodecovered can be NULL only for the
3059                          * case of unmount.
3060                          */
3061                         if (VN_IS_DOOMED(vp) ||
3062                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3063                             vp1->v_mountedhere != vp->v_mount) {
3064                                 vput(vp);
3065                                 error = ENOENT;
3066                                 SDT_PROBE3(vfs, namecache, fullpath, return,
3067                                     error, vp, NULL);
3068                                 break;
3069                         }
3070
3071                         vref(vp1);
3072                         vput(vp);
3073                         vp = vp1;
3074                         continue;
3075                 }
3076                 if (vp->v_type != VDIR) {
3077                         vrele(vp);
3078                         counter_u64_add(numfullpathfail1, 1);
3079                         error = ENOTDIR;
3080                         SDT_PROBE3(vfs, namecache, fullpath, return,
3081                             error, vp, NULL);
3082                         break;
3083                 }
3084                 error = vn_vptocnp(&vp, buf, &buflen);
3085                 if (error)
3086                         break;
3087                 if (buflen == 0) {
3088                         vrele(vp);
3089                         error = ENOMEM;
3090                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3091                             startvp, NULL);
3092                         break;
3093                 }
3094                 buf[--buflen] = '/';
3095                 slash_prefixed = true;
3096         }
3097         if (error)
3098                 return (error);
3099         if (!slash_prefixed) {
3100                 if (buflen == 0) {
3101                         vrele(vp);
3102                         counter_u64_add(numfullpathfail4, 1);
3103                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3104                             startvp, NULL);
3105                         return (ENOMEM);
3106                 }
3107                 buf[--buflen] = '/';
3108         }
3109         counter_u64_add(numfullpathfound, 1);
3110         vrele(vp);
3111
3112         *retbuf = buf + buflen;
3113         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3114         *len -= buflen;
3115         *len += addend;
3116         return (0);
3117 }
3118
3119 /*
3120  * Resolve an arbitrary vnode to a pathname.
3121  *
3122  * Note 2 caveats:
3123  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3124  *   resolve to a different path than the one used to find it
3125  * - namecache is not mandatory, meaning names are not guaranteed to be added
3126  *   (in which case resolving fails)
3127  */
3128 static void __inline
3129 cache_rev_failed_impl(int *reason, int line)
3130 {
3131
3132         *reason = line;
3133 }
3134 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3135
3136 static int
3137 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3138     char **retbuf, size_t *buflen, size_t addend)
3139 {
3140 #ifdef KDTRACE_HOOKS
3141         struct vnode *startvp = vp;
3142 #endif
3143         struct vnode *tvp;
3144         struct mount *mp;
3145         struct namecache *ncp;
3146         size_t orig_buflen;
3147         int reason;
3148         int error;
3149 #ifdef KDTRACE_HOOKS
3150         int i;
3151 #endif
3152         seqc_t vp_seqc, tvp_seqc;
3153         u_char nc_flag;
3154
3155         VFS_SMR_ASSERT_ENTERED();
3156
3157         if (!cache_fast_revlookup) {
3158                 vfs_smr_exit();
3159                 return (-1);
3160         }
3161
3162         orig_buflen = *buflen;
3163
3164         if (addend == 0) {
3165                 MPASS(*buflen >= 2);
3166                 *buflen -= 1;
3167                 buf[*buflen] = '\0';
3168         }
3169
3170         if (vp == rdir || vp == rootvnode) {
3171                 if (addend == 0) {
3172                         *buflen -= 1;
3173                         buf[*buflen] = '/';
3174                 }
3175                 goto out_ok;
3176         }
3177
3178 #ifdef KDTRACE_HOOKS
3179         i = 0;
3180 #endif
3181         error = -1;
3182         ncp = NULL; /* for sdt probe down below */
3183         vp_seqc = vn_seqc_read_any(vp);
3184         if (seqc_in_modify(vp_seqc)) {
3185                 cache_rev_failed(&reason);
3186                 goto out_abort;
3187         }
3188
3189         for (;;) {
3190 #ifdef KDTRACE_HOOKS
3191                 i++;
3192 #endif
3193                 if ((vp->v_vflag & VV_ROOT) != 0) {
3194                         mp = atomic_load_ptr(&vp->v_mount);
3195                         if (mp == NULL) {
3196                                 cache_rev_failed(&reason);
3197                                 goto out_abort;
3198                         }
3199                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3200                         tvp_seqc = vn_seqc_read_any(tvp);
3201                         if (seqc_in_modify(tvp_seqc)) {
3202                                 cache_rev_failed(&reason);
3203                                 goto out_abort;
3204                         }
3205                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3206                                 cache_rev_failed(&reason);
3207                                 goto out_abort;
3208                         }
3209                         vp = tvp;
3210                         vp_seqc = tvp_seqc;
3211                         continue;
3212                 }
3213                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3214                 if (ncp == NULL) {
3215                         cache_rev_failed(&reason);
3216                         goto out_abort;
3217                 }
3218                 nc_flag = atomic_load_char(&ncp->nc_flag);
3219                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3220                         cache_rev_failed(&reason);
3221                         goto out_abort;
3222                 }
3223                 if (!cache_ncp_canuse(ncp)) {
3224                         cache_rev_failed(&reason);
3225                         goto out_abort;
3226                 }
3227                 if (ncp->nc_nlen >= *buflen) {
3228                         cache_rev_failed(&reason);
3229                         error = ENOMEM;
3230                         goto out_abort;
3231                 }
3232                 *buflen -= ncp->nc_nlen;
3233                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3234                 *buflen -= 1;
3235                 buf[*buflen] = '/';
3236                 tvp = ncp->nc_dvp;
3237                 tvp_seqc = vn_seqc_read_any(tvp);
3238                 if (seqc_in_modify(tvp_seqc)) {
3239                         cache_rev_failed(&reason);
3240                         goto out_abort;
3241                 }
3242                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3243                         cache_rev_failed(&reason);
3244                         goto out_abort;
3245                 }
3246                 vp = tvp;
3247                 vp_seqc = tvp_seqc;
3248                 if (vp == rdir || vp == rootvnode)
3249                         break;
3250         }
3251 out_ok:
3252         vfs_smr_exit();
3253         *retbuf = buf + *buflen;
3254         *buflen = orig_buflen - *buflen + addend;
3255         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3256         return (0);
3257
3258 out_abort:
3259         *buflen = orig_buflen;
3260         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3261         vfs_smr_exit();
3262         return (error);
3263 }
3264
3265 static int
3266 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3267     size_t *buflen)
3268 {
3269         size_t orig_buflen, addend;
3270         int error;
3271
3272         if (*buflen < 2)
3273                 return (EINVAL);
3274
3275         orig_buflen = *buflen;
3276
3277         vref(vp);
3278         addend = 0;
3279         if (vp->v_type != VDIR) {
3280                 *buflen -= 1;
3281                 buf[*buflen] = '\0';
3282                 error = vn_vptocnp(&vp, buf, buflen);
3283                 if (error)
3284                         return (error);
3285                 if (*buflen == 0) {
3286                         vrele(vp);
3287                         return (ENOMEM);
3288                 }
3289                 *buflen -= 1;
3290                 buf[*buflen] = '/';
3291                 addend = orig_buflen - *buflen;
3292         }
3293
3294         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3295 }
3296
3297 /*
3298  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3299  *
3300  * Since the namecache does not track hardlinks, the caller is expected to first
3301  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3302  *
3303  * Then we have 2 cases:
3304  * - if the found vnode is a directory, the path can be constructed just by
3305  *   following names up the chain
3306  * - otherwise we populate the buffer with the saved name and start resolving
3307  *   from the parent
3308  */
3309 static int
3310 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3311     size_t *buflen)
3312 {
3313         char *buf, *tmpbuf;
3314         struct pwd *pwd;
3315         struct componentname *cnp;
3316         struct vnode *vp;
3317         size_t addend;
3318         int error;
3319         enum vtype type;
3320
3321         if (*buflen < 2)
3322                 return (EINVAL);
3323         if (*buflen > MAXPATHLEN)
3324                 *buflen = MAXPATHLEN;
3325
3326         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3327
3328         addend = 0;
3329         vp = ndp->ni_vp;
3330         /*
3331          * Check for VBAD to work around the vp_crossmp bug in lookup().
3332          *
3333          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3334          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3335          * If the type is VDIR (like in this very case) we can skip looking
3336          * at ni_dvp in the first place. However, since vnodes get passed here
3337          * unlocked the target may transition to doomed state (type == VBAD)
3338          * before we get to evaluate the condition. If this happens, we will
3339          * populate part of the buffer and descend to vn_fullpath_dir with
3340          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3341          *
3342          * This should be atomic_load(&vp->v_type) but it is illegal to take
3343          * an address of a bit field, even if said field is sized to char.
3344          * Work around the problem by reading the value into a full-sized enum
3345          * and then re-reading it with atomic_load which will still prevent
3346          * the compiler from re-reading down the road.
3347          */
3348         type = vp->v_type;
3349         type = atomic_load_int(&type);
3350         if (type == VBAD) {
3351                 error = ENOENT;
3352                 goto out_bad;
3353         }
3354         if (type != VDIR) {
3355                 cnp = &ndp->ni_cnd;
3356                 addend = cnp->cn_namelen + 2;
3357                 if (*buflen < addend) {
3358                         error = ENOMEM;
3359                         goto out_bad;
3360                 }
3361                 *buflen -= addend;
3362                 tmpbuf = buf + *buflen;
3363                 tmpbuf[0] = '/';
3364                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3365                 tmpbuf[addend - 1] = '\0';
3366                 vp = ndp->ni_dvp;
3367         }
3368
3369         vfs_smr_enter();
3370         pwd = pwd_get_smr();
3371         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3372             addend);
3373         VFS_SMR_ASSERT_NOT_ENTERED();
3374         if (error < 0) {
3375                 pwd = pwd_hold(curthread);
3376                 vref(vp);
3377                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3378                     addend);
3379                 pwd_drop(pwd);
3380                 if (error != 0)
3381                         goto out_bad;
3382         }
3383
3384         *freebuf = buf;
3385
3386         return (0);
3387 out_bad:
3388         free(buf, M_TEMP);
3389         return (error);
3390 }
3391
3392 struct vnode *
3393 vn_dir_dd_ino(struct vnode *vp)
3394 {
3395         struct namecache *ncp;
3396         struct vnode *ddvp;
3397         struct mtx *vlp;
3398         enum vgetstate vs;
3399
3400         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3401         vlp = VP2VNODELOCK(vp);
3402         mtx_lock(vlp);
3403         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3404                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3405                         continue;
3406                 ddvp = ncp->nc_dvp;
3407                 vs = vget_prep(ddvp);
3408                 mtx_unlock(vlp);
3409                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3410                         return (NULL);
3411                 return (ddvp);
3412         }
3413         mtx_unlock(vlp);
3414         return (NULL);
3415 }
3416
3417 int
3418 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3419 {
3420         struct namecache *ncp;
3421         struct mtx *vlp;
3422         int l;
3423
3424         vlp = VP2VNODELOCK(vp);
3425         mtx_lock(vlp);
3426         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3427                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3428                         break;
3429         if (ncp == NULL) {
3430                 mtx_unlock(vlp);
3431                 return (ENOENT);
3432         }
3433         l = min(ncp->nc_nlen, buflen - 1);
3434         memcpy(buf, ncp->nc_name, l);
3435         mtx_unlock(vlp);
3436         buf[l] = '\0';
3437         return (0);
3438 }
3439
3440 /*
3441  * This function updates path string to vnode's full global path
3442  * and checks the size of the new path string against the pathlen argument.
3443  *
3444  * Requires a locked, referenced vnode.
3445  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3446  *
3447  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3448  * because it falls back to the ".." lookup if the namecache lookup fails.
3449  */
3450 int
3451 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3452     u_int pathlen)
3453 {
3454         struct nameidata nd;
3455         struct vnode *vp1;
3456         char *rpath, *fbuf;
3457         int error;
3458
3459         ASSERT_VOP_ELOCKED(vp, __func__);
3460
3461         /* Construct global filesystem path from vp. */
3462         VOP_UNLOCK(vp);
3463         error = vn_fullpath_global(vp, &rpath, &fbuf);
3464
3465         if (error != 0) {
3466                 vrele(vp);
3467                 return (error);
3468         }
3469
3470         if (strlen(rpath) >= pathlen) {
3471                 vrele(vp);
3472                 error = ENAMETOOLONG;
3473                 goto out;
3474         }
3475
3476         /*
3477          * Re-lookup the vnode by path to detect a possible rename.
3478          * As a side effect, the vnode is relocked.
3479          * If vnode was renamed, return ENOENT.
3480          */
3481         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3482             UIO_SYSSPACE, path, td);
3483         error = namei(&nd);
3484         if (error != 0) {
3485                 vrele(vp);
3486                 goto out;
3487         }
3488         NDFREE(&nd, NDF_ONLY_PNBUF);
3489         vp1 = nd.ni_vp;
3490         vrele(vp);
3491         if (vp1 == vp)
3492                 strcpy(path, rpath);
3493         else {
3494                 vput(vp1);
3495                 error = ENOENT;
3496         }
3497
3498 out:
3499         free(fbuf, M_TEMP);
3500         return (error);
3501 }
3502
3503 #ifdef DDB
3504 static void
3505 db_print_vpath(struct vnode *vp)
3506 {
3507
3508         while (vp != NULL) {
3509                 db_printf("%p: ", vp);
3510                 if (vp == rootvnode) {
3511                         db_printf("/");
3512                         vp = NULL;
3513                 } else {
3514                         if (vp->v_vflag & VV_ROOT) {
3515                                 db_printf("<mount point>");
3516                                 vp = vp->v_mount->mnt_vnodecovered;
3517                         } else {
3518                                 struct namecache *ncp;
3519                                 char *ncn;
3520                                 int i;
3521
3522                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3523                                 if (ncp != NULL) {
3524                                         ncn = ncp->nc_name;
3525                                         for (i = 0; i < ncp->nc_nlen; i++)
3526                                                 db_printf("%c", *ncn++);
3527                                         vp = ncp->nc_dvp;
3528                                 } else {
3529                                         vp = NULL;
3530                                 }
3531                         }
3532                 }
3533                 db_printf("\n");
3534         }
3535
3536         return;
3537 }
3538
3539 DB_SHOW_COMMAND(vpath, db_show_vpath)
3540 {
3541         struct vnode *vp;
3542
3543         if (!have_addr) {
3544                 db_printf("usage: show vpath <struct vnode *>\n");
3545                 return;
3546         }
3547
3548         vp = (struct vnode *)addr;
3549         db_print_vpath(vp);
3550 }
3551
3552 #endif
3553
3554 static bool __read_frequently cache_fast_lookup = true;
3555 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3556     &cache_fast_lookup, 0, "");
3557
3558 #define CACHE_FPL_FAILED        -2020
3559
3560 static void
3561 cache_fpl_cleanup_cnp(struct componentname *cnp)
3562 {
3563
3564         uma_zfree(namei_zone, cnp->cn_pnbuf);
3565 #ifdef DIAGNOSTIC
3566         cnp->cn_pnbuf = NULL;
3567         cnp->cn_nameptr = NULL;
3568 #endif
3569 }
3570
3571 static void
3572 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3573 {
3574         struct componentname *cnp;
3575
3576         cnp = &ndp->ni_cnd;
3577         while (*(cnp->cn_nameptr) == '/') {
3578                 cnp->cn_nameptr++;
3579                 ndp->ni_pathlen--;
3580         }
3581
3582         *dpp = ndp->ni_rootdir;
3583 }
3584
3585 /*
3586  * Components of nameidata (or objects it can point to) which may
3587  * need restoring in case fast path lookup fails.
3588  */
3589 struct nameidata_saved {
3590         long cn_namelen;
3591         char *cn_nameptr;
3592         size_t ni_pathlen;
3593         int cn_flags;
3594 };
3595
3596 struct cache_fpl {
3597         struct nameidata *ndp;
3598         struct componentname *cnp;
3599         struct pwd *pwd;
3600         struct vnode *dvp;
3601         struct vnode *tvp;
3602         seqc_t dvp_seqc;
3603         seqc_t tvp_seqc;
3604         struct nameidata_saved snd;
3605         int line;
3606         enum cache_fpl_status status:8;
3607         bool in_smr;
3608         bool fsearch;
3609 };
3610
3611 static void
3612 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3613 {
3614
3615         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3616         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3617         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3618         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3619 }
3620
3621 static void
3622 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd)
3623 {
3624
3625         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3626         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3627         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3628         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3629 }
3630
3631 static void
3632 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd)
3633 {
3634
3635         cache_fpl_restore_partial(fpl, snd);
3636         /*
3637          * It is 0 on entry by API contract.
3638          */
3639         fpl->ndp->ni_resflags = 0;
3640 }
3641
3642 #ifdef INVARIANTS
3643 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3644         struct cache_fpl *_fpl = (fpl);                         \
3645         MPASS(_fpl->in_smr == true);                            \
3646         VFS_SMR_ASSERT_ENTERED();                               \
3647 })
3648 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3649         struct cache_fpl *_fpl = (fpl);                         \
3650         MPASS(_fpl->in_smr == false);                           \
3651         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3652 })
3653 #else
3654 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3655 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3656 #endif
3657
3658 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3659         struct cache_fpl *_fpl = (fpl);                         \
3660         vfs_smr_enter();                                        \
3661         _fpl->in_smr = true;                                    \
3662 })
3663
3664 #define cache_fpl_smr_enter(fpl) ({                             \
3665         struct cache_fpl *_fpl = (fpl);                         \
3666         MPASS(_fpl->in_smr == false);                           \
3667         vfs_smr_enter();                                        \
3668         _fpl->in_smr = true;                                    \
3669 })
3670
3671 #define cache_fpl_smr_exit(fpl) ({                              \
3672         struct cache_fpl *_fpl = (fpl);                         \
3673         MPASS(_fpl->in_smr == true);                            \
3674         vfs_smr_exit();                                         \
3675         _fpl->in_smr = false;                                   \
3676 })
3677
3678 static int
3679 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3680 {
3681
3682         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3683                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3684                     ("%s: converting to abort from %d at %d, set at %d\n",
3685                     __func__, fpl->status, line, fpl->line));
3686         }
3687         fpl->status = CACHE_FPL_STATUS_ABORTED;
3688         fpl->line = line;
3689         return (CACHE_FPL_FAILED);
3690 }
3691
3692 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3693
3694 static int
3695 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3696 {
3697
3698         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3699             ("%s: setting to partial at %d, but already set to %d at %d\n",
3700             __func__, line, fpl->status, fpl->line));
3701         cache_fpl_smr_assert_entered(fpl);
3702         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3703         fpl->line = line;
3704         return (CACHE_FPL_FAILED);
3705 }
3706
3707 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3708
3709 static int
3710 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3711 {
3712
3713         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3714             ("%s: setting to handled at %d, but already set to %d at %d\n",
3715             __func__, line, fpl->status, fpl->line));
3716         cache_fpl_smr_assert_not_entered(fpl);
3717         MPASS(error != CACHE_FPL_FAILED);
3718         fpl->status = CACHE_FPL_STATUS_HANDLED;
3719         fpl->line = line;
3720         return (error);
3721 }
3722
3723 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3724
3725 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3726         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
3727          FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \
3728          NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3729
3730 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3731         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3732
3733 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3734     "supported and internal flags overlap");
3735
3736 static bool
3737 cache_fpl_islastcn(struct nameidata *ndp)
3738 {
3739
3740         return (*ndp->ni_next == 0);
3741 }
3742
3743 static bool
3744 cache_fpl_isdotdot(struct componentname *cnp)
3745 {
3746
3747         if (cnp->cn_namelen == 2 &&
3748             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3749                 return (true);
3750         return (false);
3751 }
3752
3753 static bool
3754 cache_can_fplookup(struct cache_fpl *fpl)
3755 {
3756         struct nameidata *ndp;
3757         struct componentname *cnp;
3758         struct thread *td;
3759
3760         ndp = fpl->ndp;
3761         cnp = fpl->cnp;
3762         td = cnp->cn_thread;
3763
3764         if (!cache_fast_lookup) {
3765                 cache_fpl_aborted(fpl);
3766                 return (false);
3767         }
3768 #ifdef MAC
3769         if (mac_vnode_check_lookup_enabled()) {
3770                 cache_fpl_aborted(fpl);
3771                 return (false);
3772         }
3773 #endif
3774         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3775                 cache_fpl_aborted(fpl);
3776                 return (false);
3777         }
3778         if (IN_CAPABILITY_MODE(td)) {
3779                 cache_fpl_aborted(fpl);
3780                 return (false);
3781         }
3782         if (AUDITING_TD(td)) {
3783                 cache_fpl_aborted(fpl);
3784                 return (false);
3785         }
3786         if (ndp->ni_startdir != NULL) {
3787                 cache_fpl_aborted(fpl);
3788                 return (false);
3789         }
3790         return (true);
3791 }
3792
3793 static int
3794 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3795 {
3796         struct nameidata *ndp;
3797         int error;
3798         bool fsearch;
3799
3800         ndp = fpl->ndp;
3801         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3802         if (__predict_false(error != 0)) {
3803                 cache_fpl_smr_exit(fpl);
3804                 return (cache_fpl_aborted(fpl));
3805         }
3806         fpl->fsearch = fsearch;
3807         return (0);
3808 }
3809
3810 static bool
3811 cache_fplookup_vnode_supported(struct vnode *vp)
3812 {
3813
3814         return (vp->v_type != VLNK);
3815 }
3816
3817 static int __noinline
3818 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3819     uint32_t hash)
3820 {
3821         struct componentname *cnp;
3822         struct vnode *dvp;
3823
3824         cnp = fpl->cnp;
3825         dvp = fpl->dvp;
3826
3827         cache_fpl_smr_exit(fpl);
3828         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3829                 return (cache_fpl_handled(fpl, ENOENT));
3830         else
3831                 return (cache_fpl_aborted(fpl));
3832 }
3833
3834 /*
3835  * The target vnode is not supported, prepare for the slow path to take over.
3836  */
3837 static int __noinline
3838 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3839 {
3840         struct nameidata *ndp;
3841         struct componentname *cnp;
3842         enum vgetstate dvs;
3843         struct vnode *dvp;
3844         struct pwd *pwd;
3845         seqc_t dvp_seqc;
3846
3847         ndp = fpl->ndp;
3848         cnp = fpl->cnp;
3849         pwd = fpl->pwd;
3850         dvp = fpl->dvp;
3851         dvp_seqc = fpl->dvp_seqc;
3852
3853         if (!pwd_hold_smr(pwd)) {
3854                 cache_fpl_smr_exit(fpl);
3855                 return (cache_fpl_aborted(fpl));
3856         }
3857
3858         dvs = vget_prep_smr(dvp);
3859         cache_fpl_smr_exit(fpl);
3860         if (__predict_false(dvs == VGET_NONE)) {
3861                 pwd_drop(pwd);
3862                 return (cache_fpl_aborted(fpl));
3863         }
3864
3865         vget_finish_ref(dvp, dvs);
3866         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3867                 vrele(dvp);
3868                 pwd_drop(pwd);
3869                 return (cache_fpl_aborted(fpl));
3870         }
3871
3872         cache_fpl_restore_partial(fpl, &fpl->snd);
3873
3874         ndp->ni_startdir = dvp;
3875         cnp->cn_flags |= MAKEENTRY;
3876         if (cache_fpl_islastcn(ndp))
3877                 cnp->cn_flags |= ISLASTCN;
3878         if (cache_fpl_isdotdot(cnp))
3879                 cnp->cn_flags |= ISDOTDOT;
3880
3881         return (0);
3882 }
3883
3884 static int
3885 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3886 {
3887         struct componentname *cnp;
3888         struct vnode *tvp;
3889         seqc_t tvp_seqc;
3890         int error, lkflags;
3891
3892         cnp = fpl->cnp;
3893         tvp = fpl->tvp;
3894         tvp_seqc = fpl->tvp_seqc;
3895
3896         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3897                 lkflags = LK_SHARED;
3898                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3899                         lkflags = LK_EXCLUSIVE;
3900                 error = vget_finish(tvp, lkflags, tvs);
3901                 if (__predict_false(error != 0)) {
3902                         return (cache_fpl_aborted(fpl));
3903                 }
3904         } else {
3905                 vget_finish_ref(tvp, tvs);
3906         }
3907
3908         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3909                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3910                         vput(tvp);
3911                 else
3912                         vrele(tvp);
3913                 return (cache_fpl_aborted(fpl));
3914         }
3915
3916         return (cache_fpl_handled(fpl, 0));
3917 }
3918
3919 /*
3920  * They want to possibly modify the state of the namecache.
3921  *
3922  * Don't try to match the API contract, just leave.
3923  * TODO: this leaves scalability on the table
3924  */
3925 static int
3926 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3927 {
3928         struct componentname *cnp;
3929
3930         cnp = fpl->cnp;
3931         MPASS(cnp->cn_nameiop != LOOKUP);
3932         return (cache_fpl_partial(fpl));
3933 }
3934
3935 static int __noinline
3936 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3937 {
3938         struct componentname *cnp;
3939         enum vgetstate dvs, tvs;
3940         struct vnode *dvp, *tvp;
3941         seqc_t dvp_seqc;
3942         int error;
3943
3944         cnp = fpl->cnp;
3945         dvp = fpl->dvp;
3946         dvp_seqc = fpl->dvp_seqc;
3947         tvp = fpl->tvp;
3948
3949         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3950
3951         /*
3952          * This is less efficient than it can be for simplicity.
3953          */
3954         dvs = vget_prep_smr(dvp);
3955         if (__predict_false(dvs == VGET_NONE)) {
3956                 return (cache_fpl_aborted(fpl));
3957         }
3958         tvs = vget_prep_smr(tvp);
3959         if (__predict_false(tvs == VGET_NONE)) {
3960                 cache_fpl_smr_exit(fpl);
3961                 vget_abort(dvp, dvs);
3962                 return (cache_fpl_aborted(fpl));
3963         }
3964
3965         cache_fpl_smr_exit(fpl);
3966
3967         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3968                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3969                 if (__predict_false(error != 0)) {
3970                         vget_abort(tvp, tvs);
3971                         return (cache_fpl_aborted(fpl));
3972                 }
3973         } else {
3974                 vget_finish_ref(dvp, dvs);
3975         }
3976
3977         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3978                 vget_abort(tvp, tvs);
3979                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3980                         vput(dvp);
3981                 else
3982                         vrele(dvp);
3983                 return (cache_fpl_aborted(fpl));
3984         }
3985
3986         error = cache_fplookup_final_child(fpl, tvs);
3987         if (__predict_false(error != 0)) {
3988                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3989                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3990                         vput(dvp);
3991                 else
3992                         vrele(dvp);
3993                 return (error);
3994         }
3995
3996         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3997         return (0);
3998 }
3999
4000 static int
4001 cache_fplookup_final(struct cache_fpl *fpl)
4002 {
4003         struct componentname *cnp;
4004         enum vgetstate tvs;
4005         struct vnode *dvp, *tvp;
4006         seqc_t dvp_seqc;
4007
4008         cnp = fpl->cnp;
4009         dvp = fpl->dvp;
4010         dvp_seqc = fpl->dvp_seqc;
4011         tvp = fpl->tvp;
4012
4013         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
4014
4015         if (cnp->cn_nameiop != LOOKUP) {
4016                 return (cache_fplookup_final_modifying(fpl));
4017         }
4018
4019         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4020                 return (cache_fplookup_final_withparent(fpl));
4021
4022         tvs = vget_prep_smr(tvp);
4023         if (__predict_false(tvs == VGET_NONE)) {
4024                 return (cache_fpl_partial(fpl));
4025         }
4026
4027         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4028                 cache_fpl_smr_exit(fpl);
4029                 vget_abort(tvp, tvs);
4030                 return (cache_fpl_aborted(fpl));
4031         }
4032
4033         cache_fpl_smr_exit(fpl);
4034         return (cache_fplookup_final_child(fpl, tvs));
4035 }
4036
4037 static int __noinline
4038 cache_fplookup_dot(struct cache_fpl *fpl)
4039 {
4040         struct vnode *dvp;
4041
4042         dvp = fpl->dvp;
4043
4044         fpl->tvp = dvp;
4045         fpl->tvp_seqc = vn_seqc_read_any(dvp);
4046         if (seqc_in_modify(fpl->tvp_seqc)) {
4047                 return (cache_fpl_aborted(fpl));
4048         }
4049
4050         counter_u64_add(dothits, 1);
4051         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
4052
4053         return (0);
4054 }
4055
4056 static int __noinline
4057 cache_fplookup_dotdot(struct cache_fpl *fpl)
4058 {
4059         struct nameidata *ndp;
4060         struct componentname *cnp;
4061         struct namecache *ncp;
4062         struct vnode *dvp;
4063         struct prison *pr;
4064         u_char nc_flag;
4065
4066         ndp = fpl->ndp;
4067         cnp = fpl->cnp;
4068         dvp = fpl->dvp;
4069
4070         /*
4071          * XXX this is racy the same way regular lookup is
4072          */
4073         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4074             pr = pr->pr_parent)
4075                 if (dvp == pr->pr_root)
4076                         break;
4077
4078         if (dvp == ndp->ni_rootdir ||
4079             dvp == ndp->ni_topdir ||
4080             dvp == rootvnode ||
4081             pr != NULL) {
4082                 fpl->tvp = dvp;
4083                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4084                 if (seqc_in_modify(fpl->tvp_seqc)) {
4085                         return (cache_fpl_aborted(fpl));
4086                 }
4087                 return (0);
4088         }
4089
4090         if ((dvp->v_vflag & VV_ROOT) != 0) {
4091                 /*
4092                  * TODO
4093                  * The opposite of climb mount is needed here.
4094                  */
4095                 return (cache_fpl_aborted(fpl));
4096         }
4097
4098         ncp = atomic_load_ptr(&dvp->v_cache_dd);
4099         if (ncp == NULL) {
4100                 return (cache_fpl_aborted(fpl));
4101         }
4102
4103         nc_flag = atomic_load_char(&ncp->nc_flag);
4104         if ((nc_flag & NCF_ISDOTDOT) != 0) {
4105                 if ((nc_flag & NCF_NEGATIVE) != 0)
4106                         return (cache_fpl_aborted(fpl));
4107                 fpl->tvp = ncp->nc_vp;
4108         } else {
4109                 fpl->tvp = ncp->nc_dvp;
4110         }
4111
4112         if (!cache_ncp_canuse(ncp)) {
4113                 return (cache_fpl_aborted(fpl));
4114         }
4115
4116         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4117         if (seqc_in_modify(fpl->tvp_seqc)) {
4118                 return (cache_fpl_partial(fpl));
4119         }
4120
4121         counter_u64_add(dotdothits, 1);
4122         return (0);
4123 }
4124
4125 static int __noinline
4126 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4127 {
4128         u_char nc_flag;
4129         bool neg_promote;
4130
4131         nc_flag = atomic_load_char(&ncp->nc_flag);
4132         MPASS((nc_flag & NCF_NEGATIVE) != 0);
4133         /*
4134          * If they want to create an entry we need to replace this one.
4135          */
4136         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4137                 /*
4138                  * TODO
4139                  * This should call something similar to
4140                  * cache_fplookup_final_modifying.
4141                  */
4142                 return (cache_fpl_partial(fpl));
4143         }
4144         neg_promote = cache_neg_hit_prep(ncp);
4145         if (!cache_ncp_canuse(ncp)) {
4146                 cache_neg_hit_abort(ncp);
4147                 return (cache_fpl_partial(fpl));
4148         }
4149         if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4150                 cache_neg_hit_abort(ncp);
4151                 return (cache_fpl_partial(fpl));
4152         }
4153         if (neg_promote) {
4154                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4155         }
4156         cache_neg_hit_finish(ncp);
4157         cache_fpl_smr_exit(fpl);
4158         return (cache_fpl_handled(fpl, ENOENT));
4159 }
4160
4161 static int
4162 cache_fplookup_next(struct cache_fpl *fpl)
4163 {
4164         struct componentname *cnp;
4165         struct namecache *ncp;
4166         struct vnode *dvp, *tvp;
4167         u_char nc_flag;
4168         uint32_t hash;
4169
4170         cnp = fpl->cnp;
4171         dvp = fpl->dvp;
4172
4173         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4174                 return (cache_fplookup_dot(fpl));
4175         }
4176
4177         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4178
4179         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4180                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4181                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4182                         break;
4183         }
4184
4185         /*
4186          * If there is no entry we have to punt to the slow path to perform
4187          * actual lookup. Should there be nothing with this name a negative
4188          * entry will be created.
4189          */
4190         if (__predict_false(ncp == NULL)) {
4191                 return (cache_fpl_partial(fpl));
4192         }
4193
4194         tvp = atomic_load_ptr(&ncp->nc_vp);
4195         nc_flag = atomic_load_char(&ncp->nc_flag);
4196         if ((nc_flag & NCF_NEGATIVE) != 0) {
4197                 return (cache_fplookup_neg(fpl, ncp, hash));
4198         }
4199
4200         if (!cache_ncp_canuse(ncp)) {
4201                 return (cache_fpl_partial(fpl));
4202         }
4203
4204         fpl->tvp = tvp;
4205         fpl->tvp_seqc = vn_seqc_read_any(tvp);
4206         if (seqc_in_modify(fpl->tvp_seqc)) {
4207                 return (cache_fpl_partial(fpl));
4208         }
4209
4210         if (!cache_fplookup_vnode_supported(tvp)) {
4211                 return (cache_fpl_partial(fpl));
4212         }
4213
4214         counter_u64_add(numposhits, 1);
4215         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4216         return (0);
4217 }
4218
4219 static bool
4220 cache_fplookup_mp_supported(struct mount *mp)
4221 {
4222
4223         if (mp == NULL)
4224                 return (false);
4225         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4226                 return (false);
4227         return (true);
4228 }
4229
4230 /*
4231  * Walk up the mount stack (if any).
4232  *
4233  * Correctness is provided in the following ways:
4234  * - all vnodes are protected from freeing with SMR
4235  * - struct mount objects are type stable making them always safe to access
4236  * - stability of the particular mount is provided by busying it
4237  * - relationship between the vnode which is mounted on and the mount is
4238  *   verified with the vnode sequence counter after busying
4239  * - association between root vnode of the mount and the mount is protected
4240  *   by busy
4241  *
4242  * From that point on we can read the sequence counter of the root vnode
4243  * and get the next mount on the stack (if any) using the same protection.
4244  *
4245  * By the end of successful walk we are guaranteed the reached state was
4246  * indeed present at least at some point which matches the regular lookup.
4247  */
4248 static int __noinline
4249 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4250 {
4251         struct mount *mp, *prev_mp;
4252         struct mount_pcpu *mpcpu, *prev_mpcpu;
4253         struct vnode *vp;
4254         seqc_t vp_seqc;
4255
4256         vp = fpl->tvp;
4257         vp_seqc = fpl->tvp_seqc;
4258
4259         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4260         mp = atomic_load_ptr(&vp->v_mountedhere);
4261         if (mp == NULL)
4262                 return (0);
4263
4264         prev_mp = NULL;
4265         for (;;) {
4266                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
4267                         if (prev_mp != NULL)
4268                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4269                         return (cache_fpl_partial(fpl));
4270                 }
4271                 if (prev_mp != NULL)
4272                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4273                 if (!vn_seqc_consistent(vp, vp_seqc)) {
4274                         vfs_op_thread_exit_crit(mp, mpcpu);
4275                         return (cache_fpl_partial(fpl));
4276                 }
4277                 if (!cache_fplookup_mp_supported(mp)) {
4278                         vfs_op_thread_exit_crit(mp, mpcpu);
4279                         return (cache_fpl_partial(fpl));
4280                 }
4281                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4282                 if (vp == NULL || VN_IS_DOOMED(vp)) {
4283                         vfs_op_thread_exit_crit(mp, mpcpu);
4284                         return (cache_fpl_partial(fpl));
4285                 }
4286                 vp_seqc = vn_seqc_read_any(vp);
4287                 if (seqc_in_modify(vp_seqc)) {
4288                         vfs_op_thread_exit_crit(mp, mpcpu);
4289                         return (cache_fpl_partial(fpl));
4290                 }
4291                 prev_mp = mp;
4292                 prev_mpcpu = mpcpu;
4293                 mp = atomic_load_ptr(&vp->v_mountedhere);
4294                 if (mp == NULL)
4295                         break;
4296         }
4297
4298         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4299         fpl->tvp = vp;
4300         fpl->tvp_seqc = vp_seqc;
4301         return (0);
4302 }
4303
4304 static bool
4305 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4306 {
4307         struct mount *mp;
4308         struct vnode *vp;
4309
4310         vp = fpl->tvp;
4311
4312         /*
4313          * Hack: while this is a union, the pointer tends to be NULL so save on
4314          * a branch.
4315          */
4316         mp = atomic_load_ptr(&vp->v_mountedhere);
4317         if (mp == NULL)
4318                 return (false);
4319         if (vp->v_type == VDIR)
4320                 return (true);
4321         return (false);
4322 }
4323
4324 /*
4325  * Parse the path.
4326  *
4327  * The code was originally copy-pasted from regular lookup and despite
4328  * clean ups leaves performance on the table. Any modifications here
4329  * must take into account that in case off fallback the resulting
4330  * nameidata state has to be compatible with the original.
4331  */
4332 static int
4333 cache_fplookup_parse(struct cache_fpl *fpl)
4334 {
4335         struct nameidata *ndp;
4336         struct componentname *cnp;
4337         char *cp;
4338
4339         ndp = fpl->ndp;
4340         cnp = fpl->cnp;
4341
4342         /*
4343          * Search a new directory.
4344          *
4345          * The last component of the filename is left accessible via
4346          * cnp->cn_nameptr for callers that need the name. Callers needing
4347          * the name set the SAVENAME flag. When done, they assume
4348          * responsibility for freeing the pathname buffer.
4349          */
4350         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4351                 continue;
4352         cnp->cn_namelen = cp - cnp->cn_nameptr;
4353         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4354                 cache_fpl_smr_exit(fpl);
4355                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4356         }
4357         ndp->ni_pathlen -= cnp->cn_namelen;
4358         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4359             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4360         ndp->ni_next = cp;
4361
4362         /*
4363          * Replace multiple slashes by a single slash and trailing slashes
4364          * by a null.  This must be done before VOP_LOOKUP() because some
4365          * fs's don't know about trailing slashes.  Remember if there were
4366          * trailing slashes to handle symlinks, existing non-directories
4367          * and non-existing files that won't be directories specially later.
4368          */
4369         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4370                 cp++;
4371                 ndp->ni_pathlen--;
4372                 if (*cp == '\0') {
4373                         /*
4374                          * TODO
4375                          * Regular lookup performs the following:
4376                          * *ndp->ni_next = '\0';
4377                          * cnp->cn_flags |= TRAILINGSLASH;
4378                          *
4379                          * Which is problematic since it modifies data read
4380                          * from userspace. Then if fast path lookup was to
4381                          * abort we would have to either restore it or convey
4382                          * the flag. Since this is a corner case just ignore
4383                          * it for simplicity.
4384                          */
4385                         return (cache_fpl_partial(fpl));
4386                 }
4387         }
4388         ndp->ni_next = cp;
4389
4390         /*
4391          * Check for degenerate name (e.g. / or "")
4392          * which is a way of talking about a directory,
4393          * e.g. like "/." or ".".
4394          *
4395          * TODO
4396          * Another corner case handled by the regular lookup
4397          */
4398         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4399                 return (cache_fpl_partial(fpl));
4400         }
4401         return (0);
4402 }
4403
4404 static void
4405 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4406 {
4407         struct nameidata *ndp;
4408         struct componentname *cnp;
4409
4410         ndp = fpl->ndp;
4411         cnp = fpl->cnp;
4412
4413         cnp->cn_nameptr = ndp->ni_next;
4414         while (*cnp->cn_nameptr == '/') {
4415                 cnp->cn_nameptr++;
4416                 ndp->ni_pathlen--;
4417         }
4418 }
4419
4420 /*
4421  * See the API contract for VOP_FPLOOKUP_VEXEC.
4422  */
4423 static int __noinline
4424 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4425 {
4426         struct vnode *dvp;
4427         seqc_t dvp_seqc;
4428
4429         dvp = fpl->dvp;
4430         dvp_seqc = fpl->dvp_seqc;
4431
4432         /*
4433          * Hack: they may be looking up foo/bar, where foo is a
4434          * regular file. In such a case we need to turn ENOTDIR,
4435          * but we may happen to get here with a different error.
4436          */
4437         if (dvp->v_type != VDIR) {
4438                 /*
4439                  * The check here is predominantly to catch
4440                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4441                  * gets doomed past this point it is going to
4442                  * fail seqc verification.
4443                  */
4444                 if (VN_IS_DOOMED(dvp)) {
4445                         return (cache_fpl_aborted(fpl));
4446                 }
4447                 error = ENOTDIR;
4448         }
4449
4450         /*
4451          * Hack: handle O_SEARCH.
4452          *
4453          * Open Group Base Specifications Issue 7, 2018 edition states:
4454          * If the access mode of the open file description associated with the
4455          * file descriptor is not O_SEARCH, the function shall check whether
4456          * directory searches are permitted using the current permissions of
4457          * the directory underlying the file descriptor. If the access mode is
4458          * O_SEARCH, the function shall not perform the check.
4459          *
4460          * Regular lookup tests for the NOEXECCHECK flag for every path
4461          * component to decide whether to do the permission check. However,
4462          * since most lookups never have the flag (and when they do it is only
4463          * present for the first path component), lockless lookup only acts on
4464          * it if there is a permission problem. Here the flag is represented
4465          * with a boolean so that we don't have to clear it on the way out.
4466          *
4467          * For simplicity this always aborts.
4468          * TODO: check if this is the first lookup and ignore the permission
4469          * problem. Note the flag has to survive fallback (if it happens to be
4470          * performed).
4471          */
4472         if (fpl->fsearch) {
4473                 return (cache_fpl_aborted(fpl));
4474         }
4475
4476         switch (error) {
4477         case EAGAIN:
4478                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4479                         error = cache_fpl_aborted(fpl);
4480                 } else {
4481                         cache_fpl_partial(fpl);
4482                 }
4483                 break;
4484         default:
4485                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4486                         error = cache_fpl_aborted(fpl);
4487                 } else {
4488                         cache_fpl_smr_exit(fpl);
4489                         cache_fpl_handled(fpl, error);
4490                 }
4491                 break;
4492         }
4493         return (error);
4494 }
4495
4496 static int
4497 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4498 {
4499         struct nameidata *ndp;
4500         struct componentname *cnp;
4501         struct mount *mp;
4502         int error;
4503
4504         error = CACHE_FPL_FAILED;
4505         ndp = fpl->ndp;
4506         cnp = fpl->cnp;
4507
4508         cache_fpl_checkpoint(fpl, &fpl->snd);
4509
4510         fpl->dvp = dvp;
4511         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4512         if (seqc_in_modify(fpl->dvp_seqc)) {
4513                 cache_fpl_aborted(fpl);
4514                 goto out;
4515         }
4516         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4517         if (!cache_fplookup_mp_supported(mp)) {
4518                 cache_fpl_aborted(fpl);
4519                 goto out;
4520         }
4521
4522         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4523
4524         for (;;) {
4525                 error = cache_fplookup_parse(fpl);
4526                 if (__predict_false(error != 0)) {
4527                         break;
4528                 }
4529
4530                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4531
4532                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4533                 if (__predict_false(error != 0)) {
4534                         error = cache_fplookup_failed_vexec(fpl, error);
4535                         break;
4536                 }
4537
4538                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4539                         error = cache_fplookup_dotdot(fpl);
4540                         if (__predict_false(error != 0)) {
4541                                 break;
4542                         }
4543                 } else {
4544                         error = cache_fplookup_next(fpl);
4545                         if (__predict_false(error != 0)) {
4546                                 break;
4547                         }
4548
4549                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4550
4551                         if (cache_fplookup_need_climb_mount(fpl)) {
4552                                 error = cache_fplookup_climb_mount(fpl);
4553                                 if (__predict_false(error != 0)) {
4554                                         break;
4555                                 }
4556                         }
4557                 }
4558
4559                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4560
4561                 if (cache_fpl_islastcn(ndp)) {
4562                         error = cache_fplookup_final(fpl);
4563                         break;
4564                 }
4565
4566                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4567                         error = cache_fpl_aborted(fpl);
4568                         break;
4569                 }
4570
4571                 fpl->dvp = fpl->tvp;
4572                 fpl->dvp_seqc = fpl->tvp_seqc;
4573
4574                 cache_fplookup_parse_advance(fpl);
4575                 cache_fpl_checkpoint(fpl, &fpl->snd);
4576         }
4577 out:
4578         switch (fpl->status) {
4579         case CACHE_FPL_STATUS_UNSET:
4580                 __assert_unreachable();
4581                 break;
4582         case CACHE_FPL_STATUS_PARTIAL:
4583                 cache_fpl_smr_assert_entered(fpl);
4584                 return (cache_fplookup_partial_setup(fpl));
4585         case CACHE_FPL_STATUS_ABORTED:
4586                 if (fpl->in_smr)
4587                         cache_fpl_smr_exit(fpl);
4588                 return (CACHE_FPL_FAILED);
4589         case CACHE_FPL_STATUS_HANDLED:
4590                 MPASS(error != CACHE_FPL_FAILED);
4591                 cache_fpl_smr_assert_not_entered(fpl);
4592                 if (__predict_false(error != 0)) {
4593                         ndp->ni_dvp = NULL;
4594                         ndp->ni_vp = NULL;
4595                         cache_fpl_cleanup_cnp(cnp);
4596                         return (error);
4597                 }
4598                 ndp->ni_dvp = fpl->dvp;
4599                 ndp->ni_vp = fpl->tvp;
4600                 if (cnp->cn_flags & SAVENAME)
4601                         cnp->cn_flags |= HASBUF;
4602                 else
4603                         cache_fpl_cleanup_cnp(cnp);
4604                 return (error);
4605         }
4606 }
4607
4608 /*
4609  * Fast path lookup protected with SMR and sequence counters.
4610  *
4611  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4612  *
4613  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4614  * outlined below.
4615  *
4616  * Traditional vnode lookup conceptually looks like this:
4617  *
4618  * vn_lock(current);
4619  * for (;;) {
4620  *      next = find();
4621  *      vn_lock(next);
4622  *      vn_unlock(current);
4623  *      current = next;
4624  *      if (last)
4625  *          break;
4626  * }
4627  * return (current);
4628  *
4629  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4630  * any modifications thanks to holding respective locks.
4631  *
4632  * The same guarantee can be provided with a combination of safe memory
4633  * reclamation and sequence counters instead. If all operations which affect
4634  * the relationship between the current vnode and the one we are looking for
4635  * also modify the counter, we can verify whether all the conditions held as
4636  * we made the jump. This includes things like permissions, mount points etc.
4637  * Counter modification is provided by enclosing relevant places in
4638  * vn_seqc_write_begin()/end() calls.
4639  *
4640  * Thus this translates to:
4641  *
4642  * vfs_smr_enter();
4643  * dvp_seqc = seqc_read_any(dvp);
4644  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4645  *     abort();
4646  * for (;;) {
4647  *      tvp = find();
4648  *      tvp_seqc = seqc_read_any(tvp);
4649  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4650  *          abort();
4651  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4652  *          abort();
4653  *      dvp = tvp; // we know nothing of importance has changed
4654  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4655  *      if (last)
4656  *          break;
4657  * }
4658  * vget(); // secure the vnode
4659  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4660  *          abort();
4661  * // at this point we know nothing has changed for any parent<->child pair
4662  * // as they were crossed during the lookup, meaning we matched the guarantee
4663  * // of the locked variant
4664  * return (tvp);
4665  *
4666  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4667  * - they are called while within vfs_smr protection which they must never exit
4668  * - EAGAIN can be returned to denote checking could not be performed, it is
4669  *   always valid to return it
4670  * - if the sequence counter has not changed the result must be valid
4671  * - if the sequence counter has changed both false positives and false negatives
4672  *   are permitted (since the result will be rejected later)
4673  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4674  *
4675  * Caveats to watch out for:
4676  * - vnodes are passed unlocked and unreferenced with nothing stopping
4677  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4678  *   to use atomic_load_ptr to fetch it.
4679  * - the aforementioned object can also get freed, meaning absent other means it
4680  *   should be protected with vfs_smr
4681  * - either safely checking permissions as they are modified or guaranteeing
4682  *   their stability is left to the routine
4683  */
4684 int
4685 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4686     struct pwd **pwdp)
4687 {
4688         struct cache_fpl fpl;
4689         struct pwd *pwd;
4690         struct vnode *dvp;
4691         struct componentname *cnp;
4692         struct nameidata_saved orig;
4693         int error;
4694
4695         MPASS(ndp->ni_lcf == 0);
4696
4697         fpl.status = CACHE_FPL_STATUS_UNSET;
4698         fpl.ndp = ndp;
4699         fpl.cnp = &ndp->ni_cnd;
4700         MPASS(curthread == fpl.cnp->cn_thread);
4701
4702         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4703                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4704
4705         if (!cache_can_fplookup(&fpl)) {
4706                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4707                 *status = fpl.status;
4708                 return (EOPNOTSUPP);
4709         }
4710
4711         cache_fpl_checkpoint(&fpl, &orig);
4712
4713         cache_fpl_smr_enter_initial(&fpl);
4714         fpl.fsearch = false;
4715         pwd = pwd_get_smr();
4716         fpl.pwd = pwd;
4717         ndp->ni_rootdir = pwd->pwd_rdir;
4718         ndp->ni_topdir = pwd->pwd_jdir;
4719
4720         cnp = fpl.cnp;
4721         cnp->cn_nameptr = cnp->cn_pnbuf;
4722         if (cnp->cn_pnbuf[0] == '/') {
4723                 cache_fpl_handle_root(ndp, &dvp);
4724                 ndp->ni_resflags |= NIRES_ABS;
4725         } else {
4726                 if (ndp->ni_dirfd == AT_FDCWD) {
4727                         dvp = pwd->pwd_cdir;
4728                 } else {
4729                         error = cache_fplookup_dirfd(&fpl, &dvp);
4730                         if (__predict_false(error != 0)) {
4731                                 goto out;
4732                         }
4733                 }
4734         }
4735
4736         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4737
4738         error = cache_fplookup_impl(dvp, &fpl);
4739 out:
4740         cache_fpl_smr_assert_not_entered(&fpl);
4741         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4742
4743         *status = fpl.status;
4744         switch (fpl.status) {
4745         case CACHE_FPL_STATUS_UNSET:
4746                 __assert_unreachable();
4747                 break;
4748         case CACHE_FPL_STATUS_HANDLED:
4749                 SDT_PROBE3(vfs, namei, lookup, return, error,
4750                     (error == 0 ? ndp->ni_vp : NULL), true);
4751                 break;
4752         case CACHE_FPL_STATUS_PARTIAL:
4753                 *pwdp = fpl.pwd;
4754                 /*
4755                  * Status restored by cache_fplookup_partial_setup.
4756                  */
4757                 break;
4758         case CACHE_FPL_STATUS_ABORTED:
4759                 cache_fpl_restore_abort(&fpl, &orig);
4760                 break;
4761         }
4762         return (error);
4763 }