sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 SDT_PROVIDER_DECLARE(vfs);
  83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  84     "struct vnode *");
  85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  86     "char *");
  87 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  88     "const char *");
  89 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  90     "struct namecache *", "int", "int");
  91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  93     "char *", "struct vnode *");
  94 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  95 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  96     "struct vnode *", "char *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
  98     "struct vnode *");
  99 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 100     "struct vnode *", "char *");
 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 102     "char *");
 103 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 104     "struct componentname *");
 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 106     "struct componentname *");
 107 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 108 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 109 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 110 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 111     "struct vnode *");
 112 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 113     "char *");
 114 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
 115     "char *");
 116
 117 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 118 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 119 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 120
 121 /*
 122  * This structure describes the elements in the cache of recent
 123  * names looked up by namei.
 124  */
 125 struct negstate {
 126         u_char neg_flag;
 127 };
 128 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 129     "the state must fit in a union with a pointer without growing it");
 130
 131 struct  namecache {
 132         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 133         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 134         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 135         struct  vnode *nc_dvp;          /* vnode of parent of name */
 136         union {
 137                 struct  vnode *nu_vp;   /* vnode the name refers to */
 138                 struct  negstate nu_neg;/* negative entry state */
 139         } n_un;
 140         u_char  nc_flag;                /* flag bits */
 141         u_char  nc_nlen;                /* length of name */
 142         char    nc_name[0];             /* segment name + nul */
 143 };
 144
 145 /*
 146  * struct namecache_ts repeats struct namecache layout up to the
 147  * nc_nlen member.
 148  * struct namecache_ts is used in place of struct namecache when time(s) need
 149  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 150  * both a non-dotdot directory name plus dotdot for the directory's
 151  * parent.
 152  *
 153  * See below for alignment requirement.
 154  */
 155 struct  namecache_ts {
 156         struct  timespec nc_time;       /* timespec provided by fs */
 157         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 158         int     nc_ticks;               /* ticks value when entry was added */
 159         struct namecache nc_nc;
 160 };
 161
 162 /*
 163  * At least mips n32 performs 64-bit accesses to timespec as found
 164  * in namecache_ts and requires them to be aligned. Since others
 165  * may be in the same spot suffer a little bit and enforce the
 166  * alignment for everyone. Note this is a nop for 64-bit platforms.
 167  */
 168 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 169 #define CACHE_PATH_CUTOFF       39
 170
 171 #define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
 172 #define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
 173 #define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
 174 #define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)
 175
 176 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 177 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 178 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 179 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 180
 181 #define nc_vp           n_un.nu_vp
 182 #define nc_neg          n_un.nu_neg
 183
 184 /*
 185  * Flags in namecache.nc_flag
 186  */
 187 #define NCF_WHITE       0x01
 188 #define NCF_ISDOTDOT    0x02
 189 #define NCF_TS          0x04
 190 #define NCF_DTS         0x08
 191 #define NCF_DVDROP      0x10
 192 #define NCF_NEGATIVE    0x20
 193 #define NCF_INVALID     0x40
 194 #define NCF_WIP         0x80
 195
 196 /*
 197  * Flags in negstate.neg_flag
 198  */
 199 #define NEG_HOT         0x01
 200
 201 /*
 202  * Mark an entry as invalid.
 203  *
 204  * This is called before it starts getting deconstructed.
 205  */
 206 static void
 207 cache_ncp_invalidate(struct namecache *ncp)
 208 {
 209
 210         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 211             ("%s: entry %p already invalid", __func__, ncp));
 212         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 213         atomic_thread_fence_rel();
 214 }
 215
 216 /*
 217  * Check whether the entry can be safely used.
 218  *
 219  * All places which elide locks are supposed to call this after they are
 220  * done with reading from an entry.
 221  */
 222 static bool
 223 cache_ncp_canuse(struct namecache *ncp)
 224 {
 225
 226         atomic_thread_fence_acq();
 227         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 228 }
 229
 230 /*
 231  * Name caching works as follows:
 232  *
 233  * Names found by directory scans are retained in a cache
 234  * for future reference.  It is managed LRU, so frequently
 235  * used names will hang around.  Cache is indexed by hash value
 236  * obtained from (dvp, name) where dvp refers to the directory
 237  * containing name.
 238  *
 239  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 240  * exist) the vnode pointer will be NULL.
 241  *
 242  * Upon reaching the last segment of a path, if the reference
 243  * is for DELETE, or NOCACHE is set (rewrite), and the
 244  * name is located in the cache, it will be dropped.
 245  *
 246  * These locks are used (in the order in which they can be taken):
 247  * NAME         TYPE    ROLE
 248  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 249  * bucketlock   mtx     for access to given set of hash buckets
 250  * neglist      mtx     negative entry LRU management
 251  *
 252  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 253  * shrinking the LRU list.
 254  *
 255  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 256  * order is lower address first. Both are recursive.
 257  *
 258  * "." lookups are lockless.
 259  *
 260  * ".." and vnode -> name lookups require vnodelock.
 261  *
 262  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 263  *
 264  * Insertions and removals of entries require involved vnodes and bucketlocks
 265  * to be locked to provide safe operation against other threads modifying the
 266  * cache.
 267  *
 268  * Some lookups result in removal of the found entry (e.g. getting rid of a
 269  * negative entry with the intent to create a positive one), which poses a
 270  * problem when multiple threads reach the state. Similarly, two different
 271  * threads can purge two different vnodes and try to remove the same name.
 272  *
 273  * If the already held vnode lock is lower than the second required lock, we
 274  * can just take the other lock. However, in the opposite case, this could
 275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 276  * the first node, locking everything in order and revalidating the state.
 277  */
 278
 279 VFS_SMR_DECLARE;
 280
 281 /*
 282  * Structures associated with name caching.
 283  */
 284 #define NCHHASH(hash) \
 285         (&nchashtbl[(hash) & nchash])
 286 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 287 static u_long __read_mostly     nchash;                 /* size of hash table */
 288 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 289     "Size of namecache hash table");
 290 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 291 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 292     "Ratio of negative namecache entries");
 293 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 294 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 295 u_int ncsizefactor = 2;
 296 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 297     "Size factor for namecache");
 298 static u_int __read_mostly      ncpurgeminvnodes;
 299 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
 300     "Number of vnodes below which purgevfs ignores the request");
 301 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 302
 303 struct nchstats nchstats;               /* cache effectiveness statistics */
 304
 305 static bool __read_frequently cache_fast_revlookup = true;
 306 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 307     &cache_fast_revlookup, 0, "");
 308
 309 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
 310
 311 struct neglist {
 312         struct mtx              nl_lock;
 313         TAILQ_HEAD(, namecache) nl_list;
 314 } __aligned(CACHE_LINE_SIZE);
 315
 316 static struct neglist __read_mostly     *neglists;
 317 static struct neglist ncneg_hot;
 318 static u_long numhotneg;
 319
 320 #define ncneghash       3
 321 #define numneglists     (ncneghash + 1)
 322 static inline struct neglist *
 323 NCP2NEGLIST(struct namecache *ncp)
 324 {
 325
 326         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 327 }
 328
 329 static inline struct negstate *
 330 NCP2NEGSTATE(struct namecache *ncp)
 331 {
 332
 333         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 334         return (&ncp->nc_neg);
 335 }
 336
 337 #define numbucketlocks (ncbuckethash + 1)
 338 static u_int __read_mostly  ncbuckethash;
 339 static struct mtx_padalign __read_mostly  *bucketlocks;
 340 #define HASH2BUCKETLOCK(hash) \
 341         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 342
 343 #define numvnodelocks (ncvnodehash + 1)
 344 static u_int __read_mostly  ncvnodehash;
 345 static struct mtx __read_mostly *vnodelocks;
 346 static inline struct mtx *
 347 VP2VNODELOCK(struct vnode *vp)
 348 {
 349
 350         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 351 }
 352
 353 /*
 354  * UMA zones for the VFS cache.
 355  *
 356  * The small cache is used for entries with short names, which are the
 357  * most common.  The large cache is used for entries which are too big to
 358  * fit in the small cache.
 359  */
 360 static uma_zone_t __read_mostly cache_zone_small;
 361 static uma_zone_t __read_mostly cache_zone_small_ts;
 362 static uma_zone_t __read_mostly cache_zone_large;
 363 static uma_zone_t __read_mostly cache_zone_large_ts;
 364
 365 static struct namecache *
 366 cache_alloc(int len, int ts)
 367 {
 368         struct namecache_ts *ncp_ts;
 369         struct namecache *ncp;
 370
 371         if (__predict_false(ts)) {
 372                 if (len <= CACHE_PATH_CUTOFF)
 373                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 374                 else
 375                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 376                 ncp = &ncp_ts->nc_nc;
 377         } else {
 378                 if (len <= CACHE_PATH_CUTOFF)
 379                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 380                 else
 381                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 382         }
 383         return (ncp);
 384 }
 385
 386 static void
 387 cache_free(struct namecache *ncp)
 388 {
 389         struct namecache_ts *ncp_ts;
 390
 391         MPASS(ncp != NULL);
 392         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 393                 vdrop(ncp->nc_dvp);
 394         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 395                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 396                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 397                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 398                 else
 399                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 400         } else {
 401                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 402                         uma_zfree_smr(cache_zone_small, ncp);
 403                 else
 404                         uma_zfree_smr(cache_zone_large, ncp);
 405         }
 406 }
 407
 408 static void
 409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 410 {
 411         struct namecache_ts *ncp_ts;
 412
 413         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 414             (tsp == NULL && ticksp == NULL),
 415             ("No NCF_TS"));
 416
 417         if (tsp == NULL)
 418                 return;
 419
 420         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 421         *tsp = ncp_ts->nc_time;
 422         *ticksp = ncp_ts->nc_ticks;
 423 }
 424
 425 #ifdef DEBUG_CACHE
 426 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 427 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 428     "VFS namecache enabled");
 429 #endif
 430
 431 /* Export size information to userland */
 432 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 433     sizeof(struct namecache), "sizeof(struct namecache)");
 434
 435 /*
 436  * The new name cache statistics
 437  */
 438 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 439     "Name cache statistics");
 440 #define STATNODE_ULONG(name, descr)                                     \
 441         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 442 #define STATNODE_COUNTER(name, descr)                                   \
 443         static COUNTER_U64_DEFINE_EARLY(name);                          \
 444         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 445             descr);
 446 STATNODE_ULONG(numneg, "Number of negative cache entries");
 447 STATNODE_ULONG(numcache, "Number of cache entries");
 448 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 449 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 450 STATNODE_COUNTER(dothits, "Number of '.' hits");
 451 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 452 STATNODE_COUNTER(nummiss, "Number of cache misses");
 453 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 454 STATNODE_COUNTER(numposzaps,
 455     "Number of cache hits (positive) we do not want to cache");
 456 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 457 STATNODE_COUNTER(numnegzaps,
 458     "Number of cache hits (negative) we do not want to cache");
 459 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 460 /* These count for vn_getcwd(), too. */
 461 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 462 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 463 STATNODE_COUNTER(numfullpathfail2,
 464     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 465 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 466 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 467 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
 468     "Number of successful removals after relocking");
 469 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
 470     "Number of times zap_and_exit failed to lock");
 471 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
 472     "Number of times zap_and_exit failed to lock");
 473 static long cache_lock_vnodes_cel_3_failures;
 474 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
 475     "Number of times 3-way vnode locking failed");
 476 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
 477 STATNODE_COUNTER(numneg_evicted,
 478     "Number of negative entries evicted when adding a new entry");
 479 STATNODE_COUNTER(shrinking_skipped,
 480     "Number of times shrinking was already in progress");
 481
 482 static void cache_zap_locked(struct namecache *ncp);
 483 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 484     char **freebuf, size_t *buflen);
 485 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 486     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
 487 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 488     char **retbuf, size_t *buflen);
 489 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 490     char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 491
 492 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 493
 494 static int cache_yield;
 495 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
 496     "Number of times cache called yield");
 497
 498 static void __noinline
 499 cache_maybe_yield(void)
 500 {
 501
 502         if (should_yield()) {
 503                 cache_yield++;
 504                 kern_yield(PRI_USER);
 505         }
 506 }
 507
 508 static inline void
 509 cache_assert_vlp_locked(struct mtx *vlp)
 510 {
 511
 512         if (vlp != NULL)
 513                 mtx_assert(vlp, MA_OWNED);
 514 }
 515
 516 static inline void
 517 cache_assert_vnode_locked(struct vnode *vp)
 518 {
 519         struct mtx *vlp;
 520
 521         vlp = VP2VNODELOCK(vp);
 522         cache_assert_vlp_locked(vlp);
 523 }
 524
 525 /*
 526  * TODO: With the value stored we can do better than computing the hash based
 527  * on the address. The choice of FNV should also be revisited.
 528  */
 529 static void
 530 cache_prehash(struct vnode *vp)
 531 {
 532
 533         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 534 }
 535
 536 static uint32_t
 537 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 538 {
 539
 540         return (fnv_32_buf(name, len, dvp->v_nchash));
 541 }
 542
 543 static inline struct nchashhead *
 544 NCP2BUCKET(struct namecache *ncp)
 545 {
 546         uint32_t hash;
 547
 548         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 549         return (NCHHASH(hash));
 550 }
 551
 552 static inline struct mtx *
 553 NCP2BUCKETLOCK(struct namecache *ncp)
 554 {
 555         uint32_t hash;
 556
 557         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 558         return (HASH2BUCKETLOCK(hash));
 559 }
 560
 561 #ifdef INVARIANTS
 562 static void
 563 cache_assert_bucket_locked(struct namecache *ncp)
 564 {
 565         struct mtx *blp;
 566
 567         blp = NCP2BUCKETLOCK(ncp);
 568         mtx_assert(blp, MA_OWNED);
 569 }
 570
 571 static void
 572 cache_assert_bucket_unlocked(struct namecache *ncp)
 573 {
 574         struct mtx *blp;
 575
 576         blp = NCP2BUCKETLOCK(ncp);
 577         mtx_assert(blp, MA_NOTOWNED);
 578 }
 579 #else
 580 #define cache_assert_bucket_locked(x) do { } while (0)
 581 #define cache_assert_bucket_unlocked(x) do { } while (0)
 582 #endif
 583
 584 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 585 static void
 586 _cache_sort_vnodes(void **p1, void **p2)
 587 {
 588         void *tmp;
 589
 590         MPASS(*p1 != NULL || *p2 != NULL);
 591
 592         if (*p1 > *p2) {
 593                 tmp = *p2;
 594                 *p2 = *p1;
 595                 *p1 = tmp;
 596         }
 597 }
 598
 599 static void
 600 cache_lock_all_buckets(void)
 601 {
 602         u_int i;
 603
 604         for (i = 0; i < numbucketlocks; i++)
 605                 mtx_lock(&bucketlocks[i]);
 606 }
 607
 608 static void
 609 cache_unlock_all_buckets(void)
 610 {
 611         u_int i;
 612
 613         for (i = 0; i < numbucketlocks; i++)
 614                 mtx_unlock(&bucketlocks[i]);
 615 }
 616
 617 static void
 618 cache_lock_all_vnodes(void)
 619 {
 620         u_int i;
 621
 622         for (i = 0; i < numvnodelocks; i++)
 623                 mtx_lock(&vnodelocks[i]);
 624 }
 625
 626 static void
 627 cache_unlock_all_vnodes(void)
 628 {
 629         u_int i;
 630
 631         for (i = 0; i < numvnodelocks; i++)
 632                 mtx_unlock(&vnodelocks[i]);
 633 }
 634
 635 static int
 636 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 637 {
 638
 639         cache_sort_vnodes(&vlp1, &vlp2);
 640
 641         if (vlp1 != NULL) {
 642                 if (!mtx_trylock(vlp1))
 643                         return (EAGAIN);
 644         }
 645         if (!mtx_trylock(vlp2)) {
 646                 if (vlp1 != NULL)
 647                         mtx_unlock(vlp1);
 648                 return (EAGAIN);
 649         }
 650
 651         return (0);
 652 }
 653
 654 static void
 655 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 656 {
 657
 658         MPASS(vlp1 != NULL || vlp2 != NULL);
 659         MPASS(vlp1 <= vlp2);
 660
 661         if (vlp1 != NULL)
 662                 mtx_lock(vlp1);
 663         if (vlp2 != NULL)
 664                 mtx_lock(vlp2);
 665 }
 666
 667 static void
 668 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 669 {
 670
 671         MPASS(vlp1 != NULL || vlp2 != NULL);
 672
 673         if (vlp1 != NULL)
 674                 mtx_unlock(vlp1);
 675         if (vlp2 != NULL)
 676                 mtx_unlock(vlp2);
 677 }
 678
 679 static int
 680 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 681 {
 682         struct nchstats snap;
 683
 684         if (req->oldptr == NULL)
 685                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 686
 687         snap = nchstats;
 688         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 689         snap.ncs_neghits = counter_u64_fetch(numneghits);
 690         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 691             counter_u64_fetch(numnegzaps);
 692         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 693             counter_u64_fetch(nummiss);
 694
 695         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 696 }
 697 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 698     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 699     "VFS cache effectiveness statistics");
 700
 701 #ifdef DIAGNOSTIC
 702 /*
 703  * Grab an atomic snapshot of the name cache hash chain lengths
 704  */
 705 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 706     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 707     "hash table stats");
 708
 709 static int
 710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 711 {
 712         struct nchashhead *ncpp;
 713         struct namecache *ncp;
 714         int i, error, n_nchash, *cntbuf;
 715
 716 retry:
 717         n_nchash = nchash + 1;  /* nchash is max index, not count */
 718         if (req->oldptr == NULL)
 719                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 720         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 721         cache_lock_all_buckets();
 722         if (n_nchash != nchash + 1) {
 723                 cache_unlock_all_buckets();
 724                 free(cntbuf, M_TEMP);
 725                 goto retry;
 726         }
 727         /* Scan hash tables counting entries */
 728         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 729                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 730                         cntbuf[i]++;
 731         cache_unlock_all_buckets();
 732         for (error = 0, i = 0; i < n_nchash; i++)
 733                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 734                         break;
 735         free(cntbuf, M_TEMP);
 736         return (error);
 737 }
 738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 739     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 740     "nchash chain lengths");
 741
 742 static int
 743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 744 {
 745         int error;
 746         struct nchashhead *ncpp;
 747         struct namecache *ncp;
 748         int n_nchash;
 749         int count, maxlength, used, pct;
 750
 751         if (!req->oldptr)
 752                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 753
 754         cache_lock_all_buckets();
 755         n_nchash = nchash + 1;  /* nchash is max index, not count */
 756         used = 0;
 757         maxlength = 0;
 758
 759         /* Scan hash tables for applicable entries */
 760         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 761                 count = 0;
 762                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 763                         count++;
 764                 }
 765                 if (count)
 766                         used++;
 767                 if (maxlength < count)
 768                         maxlength = count;
 769         }
 770         n_nchash = nchash + 1;
 771         cache_unlock_all_buckets();
 772         pct = (used * 100) / (n_nchash / 100);
 773         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 774         if (error)
 775                 return (error);
 776         error = SYSCTL_OUT(req, &used, sizeof(used));
 777         if (error)
 778                 return (error);
 779         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 780         if (error)
 781                 return (error);
 782         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 783         if (error)
 784                 return (error);
 785         return (0);
 786 }
 787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 788     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 789     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 790 #endif
 791
 792 /*
 793  * Negative entries management
 794  *
 795  * A variation of LRU scheme is used. New entries are hashed into one of
 796  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 797  *
 798  * The shrinker will demote hot list head and evict from the cold list in a
 799  * round-robin manner.
 800  */
 801 static void
 802 cache_negative_init(struct namecache *ncp)
 803 {
 804         struct negstate *negstate;
 805
 806         ncp->nc_flag |= NCF_NEGATIVE;
 807         negstate = NCP2NEGSTATE(ncp);
 808         negstate->neg_flag = 0;
 809 }
 810
 811 static void
 812 cache_negative_hit(struct namecache *ncp)
 813 {
 814         struct neglist *neglist;
 815         struct negstate *negstate;
 816
 817         negstate = NCP2NEGSTATE(ncp);
 818         if ((negstate->neg_flag & NEG_HOT) != 0)
 819                 return;
 820         neglist = NCP2NEGLIST(ncp);
 821         mtx_lock(&ncneg_hot.nl_lock);
 822         mtx_lock(&neglist->nl_lock);
 823         if ((negstate->neg_flag & NEG_HOT) == 0) {
 824                 numhotneg++;
 825                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 826                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 827                 negstate->neg_flag |= NEG_HOT;
 828         }
 829         mtx_unlock(&neglist->nl_lock);
 830         mtx_unlock(&ncneg_hot.nl_lock);
 831 }
 832
 833 static void
 834 cache_negative_insert(struct namecache *ncp)
 835 {
 836         struct neglist *neglist;
 837
 838         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 839         cache_assert_bucket_locked(ncp);
 840         neglist = NCP2NEGLIST(ncp);
 841         mtx_lock(&neglist->nl_lock);
 842         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 843         mtx_unlock(&neglist->nl_lock);
 844         atomic_add_rel_long(&numneg, 1);
 845 }
 846
 847 static void
 848 cache_negative_remove(struct namecache *ncp)
 849 {
 850         struct neglist *neglist;
 851         struct negstate *negstate;
 852         bool hot_locked = false;
 853         bool list_locked = false;
 854
 855         cache_assert_bucket_locked(ncp);
 856         neglist = NCP2NEGLIST(ncp);
 857         negstate = NCP2NEGSTATE(ncp);
 858         if ((negstate->neg_flag & NEG_HOT) != 0) {
 859                 hot_locked = true;
 860                 mtx_lock(&ncneg_hot.nl_lock);
 861                 if ((negstate->neg_flag & NEG_HOT) == 0) {
 862                         list_locked = true;
 863                         mtx_lock(&neglist->nl_lock);
 864                 }
 865         } else {
 866                 list_locked = true;
 867                 mtx_lock(&neglist->nl_lock);
 868                 /*
 869                  * We may be racing against promotion in lockless lookup.
 870                  */
 871                 if ((negstate->neg_flag & NEG_HOT) != 0) {
 872                         mtx_unlock(&neglist->nl_lock);
 873                         hot_locked = true;
 874                         mtx_lock(&ncneg_hot.nl_lock);
 875                         mtx_lock(&neglist->nl_lock);
 876                 }
 877         }
 878         if ((negstate->neg_flag & NEG_HOT) != 0) {
 879                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
 880                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 881                 numhotneg--;
 882         } else {
 883                 mtx_assert(&neglist->nl_lock, MA_OWNED);
 884                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 885         }
 886         if (list_locked)
 887                 mtx_unlock(&neglist->nl_lock);
 888         if (hot_locked)
 889                 mtx_unlock(&ncneg_hot.nl_lock);
 890         atomic_subtract_rel_long(&numneg, 1);
 891 }
 892
 893 static void
 894 cache_negative_shrink_select(struct namecache **ncpp,
 895     struct neglist **neglistpp)
 896 {
 897         struct neglist *neglist;
 898         struct namecache *ncp;
 899         static u_int cycle;
 900         u_int i;
 901
 902         *ncpp = ncp = NULL;
 903
 904         for (i = 0; i < numneglists; i++) {
 905                 neglist = &neglists[(cycle + i) % numneglists];
 906                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 907                         continue;
 908                 mtx_lock(&neglist->nl_lock);
 909                 ncp = TAILQ_FIRST(&neglist->nl_list);
 910                 if (ncp != NULL)
 911                         break;
 912                 mtx_unlock(&neglist->nl_lock);
 913         }
 914
 915         *neglistpp = neglist;
 916         *ncpp = ncp;
 917         cycle++;
 918 }
 919
 920 static void
 921 cache_negative_zap_one(void)
 922 {
 923         struct namecache *ncp, *ncp2;
 924         struct neglist *neglist;
 925         struct negstate *negstate;
 926         struct mtx *dvlp;
 927         struct mtx *blp;
 928
 929         if (mtx_owner(&ncneg_shrink_lock) != NULL ||
 930             !mtx_trylock(&ncneg_shrink_lock)) {
 931                 counter_u64_add(shrinking_skipped, 1);
 932                 return;
 933         }
 934
 935         mtx_lock(&ncneg_hot.nl_lock);
 936         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
 937         if (ncp != NULL) {
 938                 neglist = NCP2NEGLIST(ncp);
 939                 negstate = NCP2NEGSTATE(ncp);
 940                 mtx_lock(&neglist->nl_lock);
 941                 MPASS((negstate->neg_flag & NEG_HOT) != 0);
 942                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 943                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 944                 negstate->neg_flag &= ~NEG_HOT;
 945                 numhotneg--;
 946                 mtx_unlock(&neglist->nl_lock);
 947         }
 948         mtx_unlock(&ncneg_hot.nl_lock);
 949
 950         cache_negative_shrink_select(&ncp, &neglist);
 951
 952         mtx_unlock(&ncneg_shrink_lock);
 953         if (ncp == NULL)
 954                 return;
 955
 956         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 957         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 958         blp = NCP2BUCKETLOCK(ncp);
 959         mtx_unlock(&neglist->nl_lock);
 960         mtx_lock(dvlp);
 961         mtx_lock(blp);
 962         /*
 963          * Enter SMR to safely check the negative list.
 964          * Even if the found pointer matches, the entry may now be reallocated
 965          * and used by a different vnode.
 966          */
 967         vfs_smr_enter();
 968         ncp2 = TAILQ_FIRST(&neglist->nl_list);
 969         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
 970             blp != NCP2BUCKETLOCK(ncp2)) {
 971                 vfs_smr_exit();
 972                 ncp = NULL;
 973         } else {
 974                 vfs_smr_exit();
 975                 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 976                     ncp->nc_name);
 977                 cache_zap_locked(ncp);
 978                 counter_u64_add(numneg_evicted, 1);
 979         }
 980         mtx_unlock(blp);
 981         mtx_unlock(dvlp);
 982         if (ncp != NULL)
 983                 cache_free(ncp);
 984 }
 985
 986 /*
 987  * cache_zap_locked():
 988  *
 989  *   Removes a namecache entry from cache, whether it contains an actual
 990  *   pointer to a vnode or if it is just a negative cache entry.
 991  */
 992 static void
 993 cache_zap_locked(struct namecache *ncp)
 994 {
 995         struct nchashhead *ncpp;
 996
 997         if (!(ncp->nc_flag & NCF_NEGATIVE))
 998                 cache_assert_vnode_locked(ncp->nc_vp);
 999         cache_assert_vnode_locked(ncp->nc_dvp);
1000         cache_assert_bucket_locked(ncp);
1001
1002         cache_ncp_invalidate(ncp);
1003
1004         ncpp = NCP2BUCKET(ncp);
1005         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1006         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1007                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1008                     ncp->nc_name, ncp->nc_vp);
1009                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1010                 if (ncp == ncp->nc_vp->v_cache_dd) {
1011                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1012                         ncp->nc_vp->v_cache_dd = NULL;
1013                         vn_seqc_write_end(ncp->nc_vp);
1014                 }
1015         } else {
1016                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1017                     ncp->nc_name);
1018                 cache_negative_remove(ncp);
1019         }
1020         if (ncp->nc_flag & NCF_ISDOTDOT) {
1021                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1022                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1023                         ncp->nc_dvp->v_cache_dd = NULL;
1024                         vn_seqc_write_end(ncp->nc_dvp);
1025                 }
1026         } else {
1027                 LIST_REMOVE(ncp, nc_src);
1028                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1029                         ncp->nc_flag |= NCF_DVDROP;
1030                         counter_u64_add(numcachehv, -1);
1031                 }
1032         }
1033         atomic_subtract_rel_long(&numcache, 1);
1034 }
1035
1036 static void
1037 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1038 {
1039         struct mtx *blp;
1040
1041         MPASS(ncp->nc_dvp == vp);
1042         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1043         cache_assert_vnode_locked(vp);
1044
1045         blp = NCP2BUCKETLOCK(ncp);
1046         mtx_lock(blp);
1047         cache_zap_locked(ncp);
1048         mtx_unlock(blp);
1049 }
1050
1051 static bool
1052 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1053     struct mtx **vlpp)
1054 {
1055         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1056         struct mtx *blp;
1057
1058         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1059         cache_assert_vnode_locked(vp);
1060
1061         if (ncp->nc_flag & NCF_NEGATIVE) {
1062                 if (*vlpp != NULL) {
1063                         mtx_unlock(*vlpp);
1064                         *vlpp = NULL;
1065                 }
1066                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1067                 return (true);
1068         }
1069
1070         pvlp = VP2VNODELOCK(vp);
1071         blp = NCP2BUCKETLOCK(ncp);
1072         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1073         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1074
1075         if (*vlpp == vlp1 || *vlpp == vlp2) {
1076                 to_unlock = *vlpp;
1077                 *vlpp = NULL;
1078         } else {
1079                 if (*vlpp != NULL) {
1080                         mtx_unlock(*vlpp);
1081                         *vlpp = NULL;
1082                 }
1083                 cache_sort_vnodes(&vlp1, &vlp2);
1084                 if (vlp1 == pvlp) {
1085                         mtx_lock(vlp2);
1086                         to_unlock = vlp2;
1087                 } else {
1088                         if (!mtx_trylock(vlp1))
1089                                 goto out_relock;
1090                         to_unlock = vlp1;
1091                 }
1092         }
1093         mtx_lock(blp);
1094         cache_zap_locked(ncp);
1095         mtx_unlock(blp);
1096         if (to_unlock != NULL)
1097                 mtx_unlock(to_unlock);
1098         return (true);
1099
1100 out_relock:
1101         mtx_unlock(vlp2);
1102         mtx_lock(vlp1);
1103         mtx_lock(vlp2);
1104         MPASS(*vlpp == NULL);
1105         *vlpp = vlp1;
1106         return (false);
1107 }
1108
1109 static int __noinline
1110 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1111 {
1112         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1113         struct mtx *blp;
1114         int error = 0;
1115
1116         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1117         cache_assert_vnode_locked(vp);
1118
1119         pvlp = VP2VNODELOCK(vp);
1120         if (ncp->nc_flag & NCF_NEGATIVE) {
1121                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1122                 goto out;
1123         }
1124
1125         blp = NCP2BUCKETLOCK(ncp);
1126         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1127         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1128         cache_sort_vnodes(&vlp1, &vlp2);
1129         if (vlp1 == pvlp) {
1130                 mtx_lock(vlp2);
1131                 to_unlock = vlp2;
1132         } else {
1133                 if (!mtx_trylock(vlp1)) {
1134                         /*
1135                          * TODO: Very wasteful but rare.
1136                          */
1137                         mtx_unlock(pvlp);
1138                         mtx_lock(vlp1);
1139                         mtx_lock(vlp2);
1140                         mtx_unlock(vlp2);
1141                         mtx_unlock(vlp1);
1142                         return (EAGAIN);
1143                 }
1144                 to_unlock = vlp1;
1145         }
1146         mtx_lock(blp);
1147         cache_zap_locked(ncp);
1148         mtx_unlock(blp);
1149         mtx_unlock(to_unlock);
1150 out:
1151         mtx_unlock(pvlp);
1152         return (error);
1153 }
1154
1155 /*
1156  * If trylocking failed we can get here. We know enough to take all needed locks
1157  * in the right order and re-lookup the entry.
1158  */
1159 static int
1160 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1161     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1162     struct mtx *blp)
1163 {
1164         struct namecache *rncp;
1165
1166         cache_assert_bucket_unlocked(ncp);
1167
1168         cache_sort_vnodes(&dvlp, &vlp);
1169         cache_lock_vnodes(dvlp, vlp);
1170         mtx_lock(blp);
1171         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1172                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1173                     rncp->nc_nlen == cnp->cn_namelen &&
1174                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1175                         break;
1176         }
1177         if (rncp != NULL) {
1178                 cache_zap_locked(rncp);
1179                 mtx_unlock(blp);
1180                 cache_unlock_vnodes(dvlp, vlp);
1181                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1182                 return (0);
1183         }
1184
1185         mtx_unlock(blp);
1186         cache_unlock_vnodes(dvlp, vlp);
1187         return (EAGAIN);
1188 }
1189
1190 static int __noinline
1191 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1192     uint32_t hash, struct mtx *blp)
1193 {
1194         struct mtx *dvlp, *vlp;
1195         struct vnode *dvp;
1196
1197         cache_assert_bucket_locked(ncp);
1198
1199         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1200         vlp = NULL;
1201         if (!(ncp->nc_flag & NCF_NEGATIVE))
1202                 vlp = VP2VNODELOCK(ncp->nc_vp);
1203         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1204                 cache_zap_locked(ncp);
1205                 mtx_unlock(blp);
1206                 cache_unlock_vnodes(dvlp, vlp);
1207                 return (0);
1208         }
1209
1210         dvp = ncp->nc_dvp;
1211         mtx_unlock(blp);
1212         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1213 }
1214
1215 static int
1216 cache_zap_locked_bucket_kl(struct namecache *ncp, struct mtx *blp,
1217     struct mtx **vlpp1, struct mtx **vlpp2)
1218 {
1219         struct mtx *dvlp, *vlp;
1220
1221         cache_assert_bucket_locked(ncp);
1222
1223         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1224         vlp = NULL;
1225         if (!(ncp->nc_flag & NCF_NEGATIVE))
1226                 vlp = VP2VNODELOCK(ncp->nc_vp);
1227         cache_sort_vnodes(&dvlp, &vlp);
1228
1229         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1230                 cache_zap_locked(ncp);
1231                 cache_unlock_vnodes(dvlp, vlp);
1232                 *vlpp1 = NULL;
1233                 *vlpp2 = NULL;
1234                 return (0);
1235         }
1236
1237         if (*vlpp1 != NULL)
1238                 mtx_unlock(*vlpp1);
1239         if (*vlpp2 != NULL)
1240                 mtx_unlock(*vlpp2);
1241         *vlpp1 = NULL;
1242         *vlpp2 = NULL;
1243
1244         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1245                 cache_zap_locked(ncp);
1246                 cache_unlock_vnodes(dvlp, vlp);
1247                 return (0);
1248         }
1249
1250         mtx_unlock(blp);
1251         *vlpp1 = dvlp;
1252         *vlpp2 = vlp;
1253         if (*vlpp1 != NULL)
1254                 mtx_lock(*vlpp1);
1255         mtx_lock(*vlpp2);
1256         mtx_lock(blp);
1257         return (EAGAIN);
1258 }
1259
1260 static __noinline int
1261 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1262 {
1263         struct namecache *ncp;
1264         struct mtx *blp;
1265         struct mtx *dvlp, *dvlp2;
1266         uint32_t hash;
1267         int error;
1268
1269         if (cnp->cn_namelen == 2 &&
1270             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1271                 dvlp = VP2VNODELOCK(dvp);
1272                 dvlp2 = NULL;
1273                 mtx_lock(dvlp);
1274 retry_dotdot:
1275                 ncp = dvp->v_cache_dd;
1276                 if (ncp == NULL) {
1277                         mtx_unlock(dvlp);
1278                         if (dvlp2 != NULL)
1279                                 mtx_unlock(dvlp2);
1280                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1281                         return (0);
1282                 }
1283                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1284                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1285                                 goto retry_dotdot;
1286                         MPASS(dvp->v_cache_dd == NULL);
1287                         mtx_unlock(dvlp);
1288                         if (dvlp2 != NULL)
1289                                 mtx_unlock(dvlp2);
1290                         cache_free(ncp);
1291                 } else {
1292                         vn_seqc_write_begin(dvp);
1293                         dvp->v_cache_dd = NULL;
1294                         vn_seqc_write_end(dvp);
1295                         mtx_unlock(dvlp);
1296                         if (dvlp2 != NULL)
1297                                 mtx_unlock(dvlp2);
1298                 }
1299                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1300                 return (1);
1301         }
1302
1303         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1304         blp = HASH2BUCKETLOCK(hash);
1305 retry:
1306         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1307                 goto out_no_entry;
1308
1309         mtx_lock(blp);
1310
1311         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1312                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1313                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1314                         break;
1315         }
1316
1317         if (ncp == NULL) {
1318                 mtx_unlock(blp);
1319                 goto out_no_entry;
1320         }
1321
1322         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1323         if (__predict_false(error != 0)) {
1324                 zap_and_exit_bucket_fail++;
1325                 goto retry;
1326         }
1327         counter_u64_add(numposzaps, 1);
1328         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1329         cache_free(ncp);
1330         return (1);
1331 out_no_entry:
1332         counter_u64_add(nummisszap, 1);
1333         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1334         return (0);
1335 }
1336
1337 static int __noinline
1338 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1339     struct timespec *tsp, int *ticksp)
1340 {
1341         int ltype;
1342
1343         *vpp = dvp;
1344         counter_u64_add(dothits, 1);
1345         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1346         if (tsp != NULL)
1347                 timespecclear(tsp);
1348         if (ticksp != NULL)
1349                 *ticksp = ticks;
1350         vrefact(*vpp);
1351         /*
1352          * When we lookup "." we still can be asked to lock it
1353          * differently...
1354          */
1355         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1356         if (ltype != VOP_ISLOCKED(*vpp)) {
1357                 if (ltype == LK_EXCLUSIVE) {
1358                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1359                         if (VN_IS_DOOMED((*vpp))) {
1360                                 /* forced unmount */
1361                                 vrele(*vpp);
1362                                 *vpp = NULL;
1363                                 return (ENOENT);
1364                         }
1365                 } else
1366                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1367         }
1368         return (-1);
1369 }
1370
1371 static int __noinline
1372 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1373     struct timespec *tsp, int *ticksp)
1374 {
1375         struct namecache_ts *ncp_ts;
1376         struct namecache *ncp;
1377         struct mtx *dvlp;
1378         enum vgetstate vs;
1379         int error, ltype;
1380         bool whiteout;
1381
1382         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1383
1384         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1385                 cache_remove_cnp(dvp, cnp);
1386                 return (0);
1387         }
1388
1389         counter_u64_add(dotdothits, 1);
1390 retry:
1391         dvlp = VP2VNODELOCK(dvp);
1392         mtx_lock(dvlp);
1393         ncp = dvp->v_cache_dd;
1394         if (ncp == NULL) {
1395                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1396                 mtx_unlock(dvlp);
1397                 return (0);
1398         }
1399         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1400                 if (ncp->nc_flag & NCF_NEGATIVE)
1401                         *vpp = NULL;
1402                 else
1403                         *vpp = ncp->nc_vp;
1404         } else
1405                 *vpp = ncp->nc_dvp;
1406         if (*vpp == NULL)
1407                 goto negative_success;
1408         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1409         cache_out_ts(ncp, tsp, ticksp);
1410         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1411             NCF_DTS && tsp != NULL) {
1412                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1413                 *tsp = ncp_ts->nc_dotdottime;
1414         }
1415
1416         MPASS(dvp != *vpp);
1417         ltype = VOP_ISLOCKED(dvp);
1418         VOP_UNLOCK(dvp);
1419         vs = vget_prep(*vpp);
1420         mtx_unlock(dvlp);
1421         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1422         vn_lock(dvp, ltype | LK_RETRY);
1423         if (VN_IS_DOOMED(dvp)) {
1424                 if (error == 0)
1425                         vput(*vpp);
1426                 *vpp = NULL;
1427                 return (ENOENT);
1428         }
1429         if (error) {
1430                 *vpp = NULL;
1431                 goto retry;
1432         }
1433         return (-1);
1434 negative_success:
1435         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1436                 if (cnp->cn_flags & ISLASTCN) {
1437                         counter_u64_add(numnegzaps, 1);
1438                         error = cache_zap_locked_vnode(ncp, dvp);
1439                         if (__predict_false(error != 0)) {
1440                                 zap_and_exit_bucket_fail2++;
1441                                 goto retry;
1442                         }
1443                         cache_free(ncp);
1444                         return (0);
1445                 }
1446         }
1447
1448         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1449         cache_out_ts(ncp, tsp, ticksp);
1450         counter_u64_add(numneghits, 1);
1451         whiteout = (ncp->nc_flag & NCF_WHITE);
1452         cache_negative_hit(ncp);
1453         mtx_unlock(dvlp);
1454         if (whiteout)
1455                 cnp->cn_flags |= ISWHITEOUT;
1456         return (ENOENT);
1457 }
1458
1459 /**
1460  * Lookup a name in the name cache
1461  *
1462  * # Arguments
1463  *
1464  * - dvp:       Parent directory in which to search.
1465  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1466  * - cnp:       Parameters of the name search.  The most interesting bits of
1467  *              the cn_flags field have the following meanings:
1468  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1469  *                      it up.
1470  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1471  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1472  *              or negative) lookup, tsp will be filled with any timespec that
1473  *              was stored when this cache entry was created.  However, it will
1474  *              be clear for "." entries.
1475  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1476  *              (positive or negative) lookup, it will contain the ticks value
1477  *              that was current when the cache entry was created, unless cnp
1478  *              was ".".
1479  *
1480  * Either both tsp and ticks have to be provided or neither of them.
1481  *
1482  * # Returns
1483  *
1484  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1485  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1486  *              to a forced unmount.  vpp will not be modified.  If the entry
1487  *              is a whiteout, then the ISWHITEOUT flag will be set in
1488  *              cnp->cn_flags.
1489  * - 0:         A cache miss.  vpp will not be modified.
1490  *
1491  * # Locking
1492  *
1493  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1494  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1495  * lock is not recursively acquired.
1496  */
1497 static int __noinline
1498 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1499     struct timespec *tsp, int *ticksp)
1500 {
1501         struct namecache *ncp;
1502         struct mtx *blp;
1503         uint32_t hash;
1504         enum vgetstate vs;
1505         int error;
1506         bool whiteout;
1507
1508         MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1509
1510 retry:
1511         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1512         blp = HASH2BUCKETLOCK(hash);
1513         mtx_lock(blp);
1514
1515         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1516                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1517                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1518                         break;
1519         }
1520
1521         if (__predict_false(ncp == NULL)) {
1522                 mtx_unlock(blp);
1523                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1524                     NULL);
1525                 counter_u64_add(nummiss, 1);
1526                 return (0);
1527         }
1528
1529         if (ncp->nc_flag & NCF_NEGATIVE)
1530                 goto negative_success;
1531
1532         counter_u64_add(numposhits, 1);
1533         *vpp = ncp->nc_vp;
1534         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1535         cache_out_ts(ncp, tsp, ticksp);
1536         MPASS(dvp != *vpp);
1537         vs = vget_prep(*vpp);
1538         mtx_unlock(blp);
1539         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1540         if (error) {
1541                 *vpp = NULL;
1542                 goto retry;
1543         }
1544         return (-1);
1545 negative_success:
1546         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1547                 if (cnp->cn_flags & ISLASTCN) {
1548                         counter_u64_add(numnegzaps, 1);
1549                         error = cache_zap_locked_vnode(ncp, dvp);
1550                         if (__predict_false(error != 0)) {
1551                                 zap_and_exit_bucket_fail2++;
1552                                 goto retry;
1553                         }
1554                         cache_free(ncp);
1555                         return (0);
1556                 }
1557         }
1558
1559         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1560         cache_out_ts(ncp, tsp, ticksp);
1561         counter_u64_add(numneghits, 1);
1562         whiteout = (ncp->nc_flag & NCF_WHITE);
1563         cache_negative_hit(ncp);
1564         mtx_unlock(blp);
1565         if (whiteout)
1566                 cnp->cn_flags |= ISWHITEOUT;
1567         return (ENOENT);
1568 }
1569
1570 int
1571 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1572     struct timespec *tsp, int *ticksp)
1573 {
1574         struct namecache *ncp;
1575         struct negstate *negstate;
1576         uint32_t hash;
1577         enum vgetstate vs;
1578         int error;
1579         bool whiteout;
1580         u_short nc_flag;
1581
1582         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1583
1584 #ifdef DEBUG_CACHE
1585         if (__predict_false(!doingcache)) {
1586                 cnp->cn_flags &= ~MAKEENTRY;
1587                 return (0);
1588         }
1589 #endif
1590
1591         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1592                 if (cnp->cn_namelen == 1)
1593                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1594                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1595                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1596         }
1597
1598         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1599
1600         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1601                 cache_remove_cnp(dvp, cnp);
1602                 return (0);
1603         }
1604
1605         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1606         vfs_smr_enter();
1607
1608         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1609                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1610                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1611                         break;
1612         }
1613
1614         if (__predict_false(ncp == NULL)) {
1615                 vfs_smr_exit();
1616                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1617                     NULL);
1618                 counter_u64_add(nummiss, 1);
1619                 return (0);
1620         }
1621
1622         nc_flag = atomic_load_char(&ncp->nc_flag);
1623         if (nc_flag & NCF_NEGATIVE)
1624                 goto negative_success;
1625
1626         counter_u64_add(numposhits, 1);
1627         *vpp = ncp->nc_vp;
1628         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1629         cache_out_ts(ncp, tsp, ticksp);
1630         MPASS(dvp != *vpp);
1631         if (!cache_ncp_canuse(ncp)) {
1632                 vfs_smr_exit();
1633                 *vpp = NULL;
1634                 goto out_fallback;
1635         }
1636         vs = vget_prep_smr(*vpp);
1637         vfs_smr_exit();
1638         if (__predict_false(vs == VGET_NONE)) {
1639                 *vpp = NULL;
1640                 goto out_fallback;
1641         }
1642         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1643         if (error) {
1644                 *vpp = NULL;
1645                 goto out_fallback;
1646         }
1647         return (-1);
1648 negative_success:
1649         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1650                 if (cnp->cn_flags & ISLASTCN) {
1651                         vfs_smr_exit();
1652                         goto out_fallback;
1653                 }
1654         }
1655
1656         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1657         cache_out_ts(ncp, tsp, ticksp);
1658         counter_u64_add(numneghits, 1);
1659         whiteout = (ncp->nc_flag & NCF_WHITE);
1660         /*
1661          * TODO: We need to take locks to promote an entry. Code doing it
1662          * in SMR lookup can be modified to be shared.
1663          */
1664         negstate = NCP2NEGSTATE(ncp);
1665         if ((negstate->neg_flag & NEG_HOT) == 0 ||
1666             !cache_ncp_canuse(ncp)) {
1667                 vfs_smr_exit();
1668                 goto out_fallback;
1669         }
1670         vfs_smr_exit();
1671         if (whiteout)
1672                 cnp->cn_flags |= ISWHITEOUT;
1673         return (ENOENT);
1674 out_fallback:
1675         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1676 }
1677
1678 struct celockstate {
1679         struct mtx *vlp[3];
1680         struct mtx *blp[2];
1681 };
1682 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1683 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1684
1685 static inline void
1686 cache_celockstate_init(struct celockstate *cel)
1687 {
1688
1689         bzero(cel, sizeof(*cel));
1690 }
1691
1692 static void
1693 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1694     struct vnode *dvp)
1695 {
1696         struct mtx *vlp1, *vlp2;
1697
1698         MPASS(cel->vlp[0] == NULL);
1699         MPASS(cel->vlp[1] == NULL);
1700         MPASS(cel->vlp[2] == NULL);
1701
1702         MPASS(vp != NULL || dvp != NULL);
1703
1704         vlp1 = VP2VNODELOCK(vp);
1705         vlp2 = VP2VNODELOCK(dvp);
1706         cache_sort_vnodes(&vlp1, &vlp2);
1707
1708         if (vlp1 != NULL) {
1709                 mtx_lock(vlp1);
1710                 cel->vlp[0] = vlp1;
1711         }
1712         mtx_lock(vlp2);
1713         cel->vlp[1] = vlp2;
1714 }
1715
1716 static void
1717 cache_unlock_vnodes_cel(struct celockstate *cel)
1718 {
1719
1720         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1721
1722         if (cel->vlp[0] != NULL)
1723                 mtx_unlock(cel->vlp[0]);
1724         if (cel->vlp[1] != NULL)
1725                 mtx_unlock(cel->vlp[1]);
1726         if (cel->vlp[2] != NULL)
1727                 mtx_unlock(cel->vlp[2]);
1728 }
1729
1730 static bool
1731 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1732 {
1733         struct mtx *vlp;
1734         bool ret;
1735
1736         cache_assert_vlp_locked(cel->vlp[0]);
1737         cache_assert_vlp_locked(cel->vlp[1]);
1738         MPASS(cel->vlp[2] == NULL);
1739
1740         MPASS(vp != NULL);
1741         vlp = VP2VNODELOCK(vp);
1742
1743         ret = true;
1744         if (vlp >= cel->vlp[1]) {
1745                 mtx_lock(vlp);
1746         } else {
1747                 if (mtx_trylock(vlp))
1748                         goto out;
1749                 cache_lock_vnodes_cel_3_failures++;
1750                 cache_unlock_vnodes_cel(cel);
1751                 if (vlp < cel->vlp[0]) {
1752                         mtx_lock(vlp);
1753                         mtx_lock(cel->vlp[0]);
1754                         mtx_lock(cel->vlp[1]);
1755                 } else {
1756                         if (cel->vlp[0] != NULL)
1757                                 mtx_lock(cel->vlp[0]);
1758                         mtx_lock(vlp);
1759                         mtx_lock(cel->vlp[1]);
1760                 }
1761                 ret = false;
1762         }
1763 out:
1764         cel->vlp[2] = vlp;
1765         return (ret);
1766 }
1767
1768 static void
1769 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1770     struct mtx *blp2)
1771 {
1772
1773         MPASS(cel->blp[0] == NULL);
1774         MPASS(cel->blp[1] == NULL);
1775
1776         cache_sort_vnodes(&blp1, &blp2);
1777
1778         if (blp1 != NULL) {
1779                 mtx_lock(blp1);
1780                 cel->blp[0] = blp1;
1781         }
1782         mtx_lock(blp2);
1783         cel->blp[1] = blp2;
1784 }
1785
1786 static void
1787 cache_unlock_buckets_cel(struct celockstate *cel)
1788 {
1789
1790         if (cel->blp[0] != NULL)
1791                 mtx_unlock(cel->blp[0]);
1792         mtx_unlock(cel->blp[1]);
1793 }
1794
1795 /*
1796  * Lock part of the cache affected by the insertion.
1797  *
1798  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1799  * However, insertion can result in removal of an old entry. In this
1800  * case we have an additional vnode and bucketlock pair to lock.
1801  *
1802  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1803  * preserving the locking order (smaller address first).
1804  */
1805 static void
1806 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1807     uint32_t hash)
1808 {
1809         struct namecache *ncp;
1810         struct mtx *blps[2];
1811
1812         blps[0] = HASH2BUCKETLOCK(hash);
1813         for (;;) {
1814                 blps[1] = NULL;
1815                 cache_lock_vnodes_cel(cel, dvp, vp);
1816                 if (vp == NULL || vp->v_type != VDIR)
1817                         break;
1818                 ncp = vp->v_cache_dd;
1819                 if (ncp == NULL)
1820                         break;
1821                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1822                         break;
1823                 MPASS(ncp->nc_dvp == vp);
1824                 blps[1] = NCP2BUCKETLOCK(ncp);
1825                 if (ncp->nc_flag & NCF_NEGATIVE)
1826                         break;
1827                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1828                         break;
1829                 /*
1830                  * All vnodes got re-locked. Re-validate the state and if
1831                  * nothing changed we are done. Otherwise restart.
1832                  */
1833                 if (ncp == vp->v_cache_dd &&
1834                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1835                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1836                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1837                         break;
1838                 cache_unlock_vnodes_cel(cel);
1839                 cel->vlp[0] = NULL;
1840                 cel->vlp[1] = NULL;
1841                 cel->vlp[2] = NULL;
1842         }
1843         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1844 }
1845
1846 static void
1847 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1848     uint32_t hash)
1849 {
1850         struct namecache *ncp;
1851         struct mtx *blps[2];
1852
1853         blps[0] = HASH2BUCKETLOCK(hash);
1854         for (;;) {
1855                 blps[1] = NULL;
1856                 cache_lock_vnodes_cel(cel, dvp, vp);
1857                 ncp = dvp->v_cache_dd;
1858                 if (ncp == NULL)
1859                         break;
1860                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1861                         break;
1862                 MPASS(ncp->nc_dvp == dvp);
1863                 blps[1] = NCP2BUCKETLOCK(ncp);
1864                 if (ncp->nc_flag & NCF_NEGATIVE)
1865                         break;
1866                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1867                         break;
1868                 if (ncp == dvp->v_cache_dd &&
1869                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1870                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1871                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1872                         break;
1873                 cache_unlock_vnodes_cel(cel);
1874                 cel->vlp[0] = NULL;
1875                 cel->vlp[1] = NULL;
1876                 cel->vlp[2] = NULL;
1877         }
1878         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1879 }
1880
1881 static void
1882 cache_enter_unlock(struct celockstate *cel)
1883 {
1884
1885         cache_unlock_buckets_cel(cel);
1886         cache_unlock_vnodes_cel(cel);
1887 }
1888
1889 static void __noinline
1890 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1891     struct componentname *cnp)
1892 {
1893         struct celockstate cel;
1894         struct namecache *ncp;
1895         uint32_t hash;
1896         int len;
1897
1898         if (dvp->v_cache_dd == NULL)
1899                 return;
1900         len = cnp->cn_namelen;
1901         cache_celockstate_init(&cel);
1902         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1903         cache_enter_lock_dd(&cel, dvp, vp, hash);
1904         vn_seqc_write_begin(dvp);
1905         ncp = dvp->v_cache_dd;
1906         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1907                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1908                 cache_zap_locked(ncp);
1909         } else {
1910                 ncp = NULL;
1911         }
1912         dvp->v_cache_dd = NULL;
1913         vn_seqc_write_end(dvp);
1914         cache_enter_unlock(&cel);
1915         if (ncp != NULL)
1916                 cache_free(ncp);
1917 }
1918
1919 /*
1920  * Add an entry to the cache.
1921  */
1922 void
1923 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1924     struct timespec *tsp, struct timespec *dtsp)
1925 {
1926         struct celockstate cel;
1927         struct namecache *ncp, *n2, *ndd;
1928         struct namecache_ts *ncp_ts;
1929         struct nchashhead *ncpp;
1930         uint32_t hash;
1931         int flag;
1932         int len;
1933         u_long lnumcache;
1934
1935         VNPASS(!VN_IS_DOOMED(dvp), dvp);
1936         VNPASS(dvp->v_type != VNON, dvp);
1937         if (vp != NULL) {
1938                 VNPASS(!VN_IS_DOOMED(vp), vp);
1939                 VNPASS(vp->v_type != VNON, vp);
1940         }
1941
1942 #ifdef DEBUG_CACHE
1943         if (__predict_false(!doingcache))
1944                 return;
1945 #endif
1946
1947         flag = 0;
1948         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1949                 if (cnp->cn_namelen == 1)
1950                         return;
1951                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1952                         cache_enter_dotdot_prep(dvp, vp, cnp);
1953                         flag = NCF_ISDOTDOT;
1954                 }
1955         }
1956
1957         /*
1958          * Avoid blowout in namecache entries.
1959          */
1960         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1961         if (__predict_false(lnumcache >= ncsize)) {
1962                 atomic_add_long(&numcache, -1);
1963                 counter_u64_add(numdrops, 1);
1964                 return;
1965         }
1966
1967         cache_celockstate_init(&cel);
1968         ndd = NULL;
1969         ncp_ts = NULL;
1970
1971         /*
1972          * Calculate the hash key and setup as much of the new
1973          * namecache entry as possible before acquiring the lock.
1974          */
1975         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1976         ncp->nc_flag = flag | NCF_WIP;
1977         ncp->nc_vp = vp;
1978         if (vp == NULL)
1979                 cache_negative_init(ncp);
1980         ncp->nc_dvp = dvp;
1981         if (tsp != NULL) {
1982                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1983                 ncp_ts->nc_time = *tsp;
1984                 ncp_ts->nc_ticks = ticks;
1985                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1986                 if (dtsp != NULL) {
1987                         ncp_ts->nc_dotdottime = *dtsp;
1988                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1989                 }
1990         }
1991         len = ncp->nc_nlen = cnp->cn_namelen;
1992         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1993         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1994         ncp->nc_name[len] = '\0';
1995         cache_enter_lock(&cel, dvp, vp, hash);
1996
1997         /*
1998          * See if this vnode or negative entry is already in the cache
1999          * with this name.  This can happen with concurrent lookups of
2000          * the same path name.
2001          */
2002         ncpp = NCHHASH(hash);
2003         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2004                 if (n2->nc_dvp == dvp &&
2005                     n2->nc_nlen == cnp->cn_namelen &&
2006                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2007                         MPASS(cache_ncp_canuse(n2));
2008                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2009                                 KASSERT(vp == NULL,
2010                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2011                                     __func__, NULL, vp));
2012                         else
2013                                 KASSERT(n2->nc_vp == vp,
2014                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2015                                     __func__, n2->nc_vp, vp));
2016                         /*
2017                          * Entries are supposed to be immutable unless in the
2018                          * process of getting destroyed. Accommodating for
2019                          * changing timestamps is possible but not worth it.
2020                          * This should be harmless in terms of correctness, in
2021                          * the worst case resulting in an earlier expiration.
2022                          * Alternatively, the found entry can be replaced
2023                          * altogether.
2024                          */
2025                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2026 #if 0
2027                         if (tsp != NULL) {
2028                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2029                                     ("no NCF_TS"));
2030                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2031                                 n2_ts->nc_time = ncp_ts->nc_time;
2032                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2033                                 if (dtsp != NULL) {
2034                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2035                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2036                                 }
2037                         }
2038 #endif
2039                         goto out_unlock_free;
2040                 }
2041         }
2042
2043         if (flag == NCF_ISDOTDOT) {
2044                 /*
2045                  * See if we are trying to add .. entry, but some other lookup
2046                  * has populated v_cache_dd pointer already.
2047                  */
2048                 if (dvp->v_cache_dd != NULL)
2049                         goto out_unlock_free;
2050                 KASSERT(vp == NULL || vp->v_type == VDIR,
2051                     ("wrong vnode type %p", vp));
2052                 vn_seqc_write_begin(dvp);
2053                 dvp->v_cache_dd = ncp;
2054                 vn_seqc_write_end(dvp);
2055         }
2056
2057         if (vp != NULL) {
2058                 if (flag != NCF_ISDOTDOT) {
2059                         /*
2060                          * For this case, the cache entry maps both the
2061                          * directory name in it and the name ".." for the
2062                          * directory's parent.
2063                          */
2064                         vn_seqc_write_begin(vp);
2065                         if ((ndd = vp->v_cache_dd) != NULL) {
2066                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2067                                         cache_zap_locked(ndd);
2068                                 else
2069                                         ndd = NULL;
2070                         }
2071                         vp->v_cache_dd = ncp;
2072                         vn_seqc_write_end(vp);
2073                 } else if (vp->v_type != VDIR) {
2074                         if (vp->v_cache_dd != NULL) {
2075                                 vn_seqc_write_begin(vp);
2076                                 vp->v_cache_dd = NULL;
2077                                 vn_seqc_write_end(vp);
2078                         }
2079                 }
2080         }
2081
2082         if (flag != NCF_ISDOTDOT) {
2083                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2084                         vhold(dvp);
2085                         counter_u64_add(numcachehv, 1);
2086                 }
2087                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2088         }
2089
2090         /*
2091          * If the entry is "negative", we place it into the
2092          * "negative" cache queue, otherwise, we place it into the
2093          * destination vnode's cache entries queue.
2094          */
2095         if (vp != NULL) {
2096                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2097                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2098                     vp);
2099         } else {
2100                 if (cnp->cn_flags & ISWHITEOUT)
2101                         ncp->nc_flag |= NCF_WHITE;
2102                 cache_negative_insert(ncp);
2103                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2104                     ncp->nc_name);
2105         }
2106
2107         /*
2108          * Insert the new namecache entry into the appropriate chain
2109          * within the cache entries table.
2110          */
2111         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2112
2113         atomic_thread_fence_rel();
2114         /*
2115          * Mark the entry as fully constructed.
2116          * It is immutable past this point until its removal.
2117          */
2118         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2119
2120         cache_enter_unlock(&cel);
2121         if (numneg * ncnegfactor > lnumcache)
2122                 cache_negative_zap_one();
2123         if (ndd != NULL)
2124                 cache_free(ndd);
2125         return;
2126 out_unlock_free:
2127         cache_enter_unlock(&cel);
2128         atomic_add_long(&numcache, -1);
2129         cache_free(ncp);
2130         return;
2131 }
2132
2133 static u_int
2134 cache_roundup_2(u_int val)
2135 {
2136         u_int res;
2137
2138         for (res = 1; res <= val; res <<= 1)
2139                 continue;
2140
2141         return (res);
2142 }
2143
2144 static struct nchashhead *
2145 nchinittbl(u_long elements, u_long *hashmask)
2146 {
2147         struct nchashhead *hashtbl;
2148         u_long hashsize, i;
2149
2150         hashsize = cache_roundup_2(elements) / 2;
2151
2152         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2153         for (i = 0; i < hashsize; i++)
2154                 CK_SLIST_INIT(&hashtbl[i]);
2155         *hashmask = hashsize - 1;
2156         return (hashtbl);
2157 }
2158
2159 static void
2160 ncfreetbl(struct nchashhead *hashtbl)
2161 {
2162
2163         free(hashtbl, M_VFSCACHE);
2164 }
2165
2166 /*
2167  * Name cache initialization, from vfs_init() when we are booting
2168  */
2169 static void
2170 nchinit(void *dummy __unused)
2171 {
2172         u_int i;
2173
2174         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2175             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2176         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2177             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2178         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2179             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2180         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2181             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2182
2183         VFS_SMR_ZONE_SET(cache_zone_small);
2184         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2185         VFS_SMR_ZONE_SET(cache_zone_large);
2186         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2187
2188         ncsize = desiredvnodes * ncsizefactor;
2189         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2190         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2191         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2192                 ncbuckethash = 7;
2193         if (ncbuckethash > nchash)
2194                 ncbuckethash = nchash;
2195         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2196             M_WAITOK | M_ZERO);
2197         for (i = 0; i < numbucketlocks; i++)
2198                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2199         ncvnodehash = ncbuckethash;
2200         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2201             M_WAITOK | M_ZERO);
2202         for (i = 0; i < numvnodelocks; i++)
2203                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2204         ncpurgeminvnodes = numbucketlocks * 2;
2205
2206         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2207             M_WAITOK | M_ZERO);
2208         for (i = 0; i < numneglists; i++) {
2209                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2210                 TAILQ_INIT(&neglists[i].nl_list);
2211         }
2212         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2213         TAILQ_INIT(&ncneg_hot.nl_list);
2214
2215         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2216 }
2217 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2218
2219 void
2220 cache_vnode_init(struct vnode *vp)
2221 {
2222
2223         LIST_INIT(&vp->v_cache_src);
2224         TAILQ_INIT(&vp->v_cache_dst);
2225         vp->v_cache_dd = NULL;
2226         cache_prehash(vp);
2227 }
2228
2229 void
2230 cache_changesize(u_long newmaxvnodes)
2231 {
2232         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2233         u_long new_nchash, old_nchash;
2234         struct namecache *ncp;
2235         uint32_t hash;
2236         u_long newncsize;
2237         int i;
2238
2239         newncsize = newmaxvnodes * ncsizefactor;
2240         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2241         if (newmaxvnodes < numbucketlocks)
2242                 newmaxvnodes = numbucketlocks;
2243
2244         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2245         /* If same hash table size, nothing to do */
2246         if (nchash == new_nchash) {
2247                 ncfreetbl(new_nchashtbl);
2248                 return;
2249         }
2250         /*
2251          * Move everything from the old hash table to the new table.
2252          * None of the namecache entries in the table can be removed
2253          * because to do so, they have to be removed from the hash table.
2254          */
2255         cache_lock_all_vnodes();
2256         cache_lock_all_buckets();
2257         old_nchashtbl = nchashtbl;
2258         old_nchash = nchash;
2259         nchashtbl = new_nchashtbl;
2260         nchash = new_nchash;
2261         for (i = 0; i <= old_nchash; i++) {
2262                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2263                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2264                             ncp->nc_dvp);
2265                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2266                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2267                 }
2268         }
2269         ncsize = newncsize;
2270         cache_unlock_all_buckets();
2271         cache_unlock_all_vnodes();
2272         ncfreetbl(old_nchashtbl);
2273 }
2274
2275 /*
2276  * Invalidate all entries from and to a particular vnode.
2277  */
2278 static void
2279 cache_purge_impl(struct vnode *vp)
2280 {
2281         TAILQ_HEAD(, namecache) ncps;
2282         struct namecache *ncp, *nnp;
2283         struct mtx *vlp, *vlp2;
2284
2285         TAILQ_INIT(&ncps);
2286         vlp = VP2VNODELOCK(vp);
2287         vlp2 = NULL;
2288         mtx_assert(vlp, MA_OWNED);
2289 retry:
2290         while (!LIST_EMPTY(&vp->v_cache_src)) {
2291                 ncp = LIST_FIRST(&vp->v_cache_src);
2292                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2293                         goto retry;
2294                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2295         }
2296         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2297                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2298                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2299                         goto retry;
2300                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2301         }
2302         ncp = vp->v_cache_dd;
2303         if (ncp != NULL) {
2304                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2305                    ("lost dotdot link"));
2306                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2307                         goto retry;
2308                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2309         }
2310         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2311         mtx_unlock(vlp);
2312         if (vlp2 != NULL)
2313                 mtx_unlock(vlp2);
2314         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2315                 cache_free(ncp);
2316         }
2317 }
2318
2319 void
2320 cache_purge(struct vnode *vp)
2321 {
2322         struct mtx *vlp;
2323
2324         SDT_PROBE1(vfs, namecache, purge, done, vp);
2325         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2326             vp->v_cache_dd == NULL)
2327                 return;
2328         vlp = VP2VNODELOCK(vp);
2329         mtx_lock(vlp);
2330         cache_purge_impl(vp);
2331 }
2332
2333 /*
2334  * Only to be used by vgone.
2335  */
2336 void
2337 cache_purge_vgone(struct vnode *vp)
2338 {
2339         struct mtx *vlp;
2340
2341         VNPASS(VN_IS_DOOMED(vp), vp);
2342         vlp = VP2VNODELOCK(vp);
2343         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2344             vp->v_cache_dd == NULL)) {
2345                 mtx_lock(vlp);
2346                 cache_purge_impl(vp);
2347                 mtx_assert(vlp, MA_NOTOWNED);
2348                 return;
2349         }
2350
2351         /*
2352          * All the NULL pointer state we found above may be transient.
2353          * Serialize against a possible thread doing cache_purge.
2354          */
2355         mtx_wait_unlocked(vlp);
2356         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2357             vp->v_cache_dd == NULL)) {
2358                 mtx_lock(vlp);
2359                 cache_purge_impl(vp);
2360                 mtx_assert(vlp, MA_NOTOWNED);
2361                 return;
2362         }
2363         return;
2364 }
2365
2366 /*
2367  * Invalidate all negative entries for a particular directory vnode.
2368  */
2369 void
2370 cache_purge_negative(struct vnode *vp)
2371 {
2372         TAILQ_HEAD(, namecache) ncps;
2373         struct namecache *ncp, *nnp;
2374         struct mtx *vlp;
2375
2376         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2377         if (LIST_EMPTY(&vp->v_cache_src))
2378                 return;
2379         TAILQ_INIT(&ncps);
2380         vlp = VP2VNODELOCK(vp);
2381         mtx_lock(vlp);
2382         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2383                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2384                         continue;
2385                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2386                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2387         }
2388         mtx_unlock(vlp);
2389         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2390                 cache_free(ncp);
2391         }
2392 }
2393
2394 void
2395 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2396     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2397 {
2398
2399         ASSERT_VOP_IN_SEQC(fdvp);
2400         ASSERT_VOP_IN_SEQC(fvp);
2401         ASSERT_VOP_IN_SEQC(tdvp);
2402         if (tvp != NULL)
2403                 ASSERT_VOP_IN_SEQC(tvp);
2404
2405         cache_purge(fvp);
2406         if (tvp != NULL) {
2407                 cache_purge(tvp);
2408                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2409                     ("%s: lingering negative entry", __func__));
2410         } else {
2411                 cache_remove_cnp(tdvp, tcnp);
2412         }
2413 }
2414
2415 /*
2416  * Flush all entries referencing a particular filesystem.
2417  */
2418 void
2419 cache_purgevfs(struct mount *mp, bool force)
2420 {
2421         TAILQ_HEAD(, namecache) ncps;
2422         struct mtx *vlp1, *vlp2;
2423         struct mtx *blp;
2424         struct nchashhead *bucket;
2425         struct namecache *ncp, *nnp;
2426         u_long i, j, n_nchash;
2427         int error;
2428
2429         /* Scan hash tables for applicable entries */
2430         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2431         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2432                 return;
2433         TAILQ_INIT(&ncps);
2434         n_nchash = nchash + 1;
2435         vlp1 = vlp2 = NULL;
2436         for (i = 0; i < numbucketlocks; i++) {
2437                 blp = (struct mtx *)&bucketlocks[i];
2438                 mtx_lock(blp);
2439                 for (j = i; j < n_nchash; j += numbucketlocks) {
2440 retry:
2441                         bucket = &nchashtbl[j];
2442                         CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2443                                 cache_assert_bucket_locked(ncp);
2444                                 if (ncp->nc_dvp->v_mount != mp)
2445                                         continue;
2446                                 error = cache_zap_locked_bucket_kl(ncp, blp,
2447                                     &vlp1, &vlp2);
2448                                 if (error != 0)
2449                                         goto retry;
2450                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2451                         }
2452                 }
2453                 mtx_unlock(blp);
2454                 if (vlp1 == NULL && vlp2 == NULL)
2455                         cache_maybe_yield();
2456         }
2457         if (vlp1 != NULL)
2458                 mtx_unlock(vlp1);
2459         if (vlp2 != NULL)
2460                 mtx_unlock(vlp2);
2461
2462         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2463                 cache_free(ncp);
2464         }
2465 }
2466
2467 /*
2468  * Perform canonical checks and cache lookup and pass on to filesystem
2469  * through the vop_cachedlookup only if needed.
2470  */
2471
2472 int
2473 vfs_cache_lookup(struct vop_lookup_args *ap)
2474 {
2475         struct vnode *dvp;
2476         int error;
2477         struct vnode **vpp = ap->a_vpp;
2478         struct componentname *cnp = ap->a_cnp;
2479         int flags = cnp->cn_flags;
2480
2481         *vpp = NULL;
2482         dvp = ap->a_dvp;
2483
2484         if (dvp->v_type != VDIR)
2485                 return (ENOTDIR);
2486
2487         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2488             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2489                 return (EROFS);
2490
2491         error = vn_dir_check_exec(dvp, cnp);
2492         if (error != 0)
2493                 return (error);
2494
2495         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2496         if (error == 0)
2497                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2498         if (error == -1)
2499                 return (0);
2500         return (error);
2501 }
2502
2503 /* Implementation of the getcwd syscall. */
2504 int
2505 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2506 {
2507         char *buf, *retbuf;
2508         size_t buflen;
2509         int error;
2510
2511         buflen = uap->buflen;
2512         if (__predict_false(buflen < 2))
2513                 return (EINVAL);
2514         if (buflen > MAXPATHLEN)
2515                 buflen = MAXPATHLEN;
2516
2517         buf = uma_zalloc(namei_zone, M_WAITOK);
2518         error = vn_getcwd(buf, &retbuf, &buflen);
2519         if (error == 0)
2520                 error = copyout(retbuf, uap->buf, buflen);
2521         uma_zfree(namei_zone, buf);
2522         return (error);
2523 }
2524
2525 int
2526 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2527 {
2528         struct pwd *pwd;
2529         int error;
2530
2531         vfs_smr_enter();
2532         pwd = pwd_get_smr();
2533         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2534             buflen, false, 0);
2535         VFS_SMR_ASSERT_NOT_ENTERED();
2536         if (error < 0) {
2537                 pwd = pwd_hold(curthread);
2538                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2539                     retbuf, buflen);
2540                 pwd_drop(pwd);
2541         }
2542
2543 #ifdef KTRACE
2544         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2545                 ktrnamei(*retbuf);
2546 #endif
2547         return (error);
2548 }
2549
2550 static int
2551 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2552     size_t size, int flags, enum uio_seg pathseg)
2553 {
2554         struct nameidata nd;
2555         char *retbuf, *freebuf;
2556         int error;
2557
2558         if (flags != 0)
2559                 return (EINVAL);
2560         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2561             pathseg, path, fd, &cap_fstat_rights, td);
2562         if ((error = namei(&nd)) != 0)
2563                 return (error);
2564         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2565         if (error == 0) {
2566                 error = copyout(retbuf, buf, size);
2567                 free(freebuf, M_TEMP);
2568         }
2569         NDFREE(&nd, 0);
2570         return (error);
2571 }
2572
2573 int
2574 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2575 {
2576
2577         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2578             uap->flags, UIO_USERSPACE));
2579 }
2580
2581 /*
2582  * Retrieve the full filesystem path that correspond to a vnode from the name
2583  * cache (if available)
2584  */
2585 int
2586 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2587 {
2588         struct pwd *pwd;
2589         char *buf;
2590         size_t buflen;
2591         int error;
2592
2593         if (__predict_false(vp == NULL))
2594                 return (EINVAL);
2595
2596         buflen = MAXPATHLEN;
2597         buf = malloc(buflen, M_TEMP, M_WAITOK);
2598         vfs_smr_enter();
2599         pwd = pwd_get_smr();
2600         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2601         VFS_SMR_ASSERT_NOT_ENTERED();
2602         if (error < 0) {
2603                 pwd = pwd_hold(curthread);
2604                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2605                 pwd_drop(pwd);
2606         }
2607         if (error == 0)
2608                 *freebuf = buf;
2609         else
2610                 free(buf, M_TEMP);
2611         return (error);
2612 }
2613
2614 /*
2615  * This function is similar to vn_fullpath, but it attempts to lookup the
2616  * pathname relative to the global root mount point.  This is required for the
2617  * auditing sub-system, as audited pathnames must be absolute, relative to the
2618  * global root mount point.
2619  */
2620 int
2621 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2622 {
2623         char *buf;
2624         size_t buflen;
2625         int error;
2626
2627         if (__predict_false(vp == NULL))
2628                 return (EINVAL);
2629         buflen = MAXPATHLEN;
2630         buf = malloc(buflen, M_TEMP, M_WAITOK);
2631         vfs_smr_enter();
2632         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2633         VFS_SMR_ASSERT_NOT_ENTERED();
2634         if (error < 0) {
2635                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2636         }
2637         if (error == 0)
2638                 *freebuf = buf;
2639         else
2640                 free(buf, M_TEMP);
2641         return (error);
2642 }
2643
2644 static struct namecache *
2645 vn_dd_from_dst(struct vnode *vp)
2646 {
2647         struct namecache *ncp;
2648
2649         cache_assert_vnode_locked(vp);
2650         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2651                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2652                         return (ncp);
2653         }
2654         return (NULL);
2655 }
2656
2657 int
2658 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2659 {
2660         struct vnode *dvp;
2661         struct namecache *ncp;
2662         struct mtx *vlp;
2663         int error;
2664
2665         vlp = VP2VNODELOCK(*vp);
2666         mtx_lock(vlp);
2667         ncp = (*vp)->v_cache_dd;
2668         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2669                 KASSERT(ncp == vn_dd_from_dst(*vp),
2670                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2671                     ncp, vn_dd_from_dst(*vp)));
2672         } else {
2673                 ncp = vn_dd_from_dst(*vp);
2674         }
2675         if (ncp != NULL) {
2676                 if (*buflen < ncp->nc_nlen) {
2677                         mtx_unlock(vlp);
2678                         vrele(*vp);
2679                         counter_u64_add(numfullpathfail4, 1);
2680                         error = ENOMEM;
2681                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2682                             vp, NULL);
2683                         return (error);
2684                 }
2685                 *buflen -= ncp->nc_nlen;
2686                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2687                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2688                     ncp->nc_name, vp);
2689                 dvp = *vp;
2690                 *vp = ncp->nc_dvp;
2691                 vref(*vp);
2692                 mtx_unlock(vlp);
2693                 vrele(dvp);
2694                 return (0);
2695         }
2696         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2697
2698         mtx_unlock(vlp);
2699         vn_lock(*vp, LK_SHARED | LK_RETRY);
2700         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2701         vput(*vp);
2702         if (error) {
2703                 counter_u64_add(numfullpathfail2, 1);
2704                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2705                 return (error);
2706         }
2707
2708         *vp = dvp;
2709         if (VN_IS_DOOMED(dvp)) {
2710                 /* forced unmount */
2711                 vrele(dvp);
2712                 error = ENOENT;
2713                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2714                 return (error);
2715         }
2716         /*
2717          * *vp has its use count incremented still.
2718          */
2719
2720         return (0);
2721 }
2722
2723 /*
2724  * Resolve a directory to a pathname.
2725  *
2726  * The name of the directory can always be found in the namecache or fetched
2727  * from the filesystem. There is also guaranteed to be only one parent, meaning
2728  * we can just follow vnodes up until we find the root.
2729  *
2730  * The vnode must be referenced.
2731  */
2732 static int
2733 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2734     size_t *len, bool slash_prefixed, size_t addend)
2735 {
2736 #ifdef KDTRACE_HOOKS
2737         struct vnode *startvp = vp;
2738 #endif
2739         struct vnode *vp1;
2740         size_t buflen;
2741         int error;
2742
2743         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2744         VNPASS(vp->v_usecount > 0, vp);
2745
2746         buflen = *len;
2747
2748         if (!slash_prefixed) {
2749                 MPASS(*len >= 2);
2750                 buflen--;
2751                 buf[buflen] = '\0';
2752         }
2753
2754         error = 0;
2755
2756         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2757         counter_u64_add(numfullpathcalls, 1);
2758         while (vp != rdir && vp != rootvnode) {
2759                 /*
2760                  * The vp vnode must be already fully constructed,
2761                  * since it is either found in namecache or obtained
2762                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2763                  * without obtaining the vnode lock.
2764                  */
2765                 if ((vp->v_vflag & VV_ROOT) != 0) {
2766                         vn_lock(vp, LK_RETRY | LK_SHARED);
2767
2768                         /*
2769                          * With the vnode locked, check for races with
2770                          * unmount, forced or not.  Note that we
2771                          * already verified that vp is not equal to
2772                          * the root vnode, which means that
2773                          * mnt_vnodecovered can be NULL only for the
2774                          * case of unmount.
2775                          */
2776                         if (VN_IS_DOOMED(vp) ||
2777                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2778                             vp1->v_mountedhere != vp->v_mount) {
2779                                 vput(vp);
2780                                 error = ENOENT;
2781                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2782                                     error, vp, NULL);
2783                                 break;
2784                         }
2785
2786                         vref(vp1);
2787                         vput(vp);
2788                         vp = vp1;
2789                         continue;
2790                 }
2791                 if (vp->v_type != VDIR) {
2792                         vrele(vp);
2793                         counter_u64_add(numfullpathfail1, 1);
2794                         error = ENOTDIR;
2795                         SDT_PROBE3(vfs, namecache, fullpath, return,
2796                             error, vp, NULL);
2797                         break;
2798                 }
2799                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2800                 if (error)
2801                         break;
2802                 if (buflen == 0) {
2803                         vrele(vp);
2804                         error = ENOMEM;
2805                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2806                             startvp, NULL);
2807                         break;
2808                 }
2809                 buf[--buflen] = '/';
2810                 slash_prefixed = true;
2811         }
2812         if (error)
2813                 return (error);
2814         if (!slash_prefixed) {
2815                 if (buflen == 0) {
2816                         vrele(vp);
2817                         counter_u64_add(numfullpathfail4, 1);
2818                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2819                             startvp, NULL);
2820                         return (ENOMEM);
2821                 }
2822                 buf[--buflen] = '/';
2823         }
2824         counter_u64_add(numfullpathfound, 1);
2825         vrele(vp);
2826
2827         *retbuf = buf + buflen;
2828         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2829         *len -= buflen;
2830         *len += addend;
2831         return (0);
2832 }
2833
2834 /*
2835  * Resolve an arbitrary vnode to a pathname.
2836  *
2837  * Note 2 caveats:
2838  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2839  *   resolve to a different path than the one used to find it
2840  * - namecache is not mandatory, meaning names are not guaranteed to be added
2841  *   (in which case resolving fails)
2842  */
2843 static void __inline
2844 cache_rev_failed_impl(int *reason, int line)
2845 {
2846
2847         *reason = line;
2848 }
2849 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
2850
2851 static int
2852 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2853     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2854 {
2855 #ifdef KDTRACE_HOOKS
2856         struct vnode *startvp = vp;
2857 #endif
2858         struct vnode *tvp;
2859         struct mount *mp;
2860         struct namecache *ncp;
2861         size_t orig_buflen;
2862         int reason;
2863         int error;
2864 #ifdef KDTRACE_HOOKS
2865         int i;
2866 #endif
2867         seqc_t vp_seqc, tvp_seqc;
2868         u_char nc_flag;
2869
2870         VFS_SMR_ASSERT_ENTERED();
2871
2872         if (!cache_fast_revlookup) {
2873                 vfs_smr_exit();
2874                 return (-1);
2875         }
2876
2877         orig_buflen = *buflen;
2878
2879         if (!slash_prefixed) {
2880                 MPASS(*buflen >= 2);
2881                 *buflen -= 1;
2882                 buf[*buflen] = '\0';
2883         }
2884
2885         if (vp == rdir || vp == rootvnode) {
2886                 if (!slash_prefixed) {
2887                         *buflen -= 1;
2888                         buf[*buflen] = '/';
2889                 }
2890                 goto out_ok;
2891         }
2892
2893 #ifdef KDTRACE_HOOKS
2894         i = 0;
2895 #endif
2896         error = -1;
2897         ncp = NULL; /* for sdt probe down below */
2898         vp_seqc = vn_seqc_read_any(vp);
2899         if (seqc_in_modify(vp_seqc)) {
2900                 cache_rev_failed(&reason);
2901                 goto out_abort;
2902         }
2903
2904         for (;;) {
2905 #ifdef KDTRACE_HOOKS
2906                 i++;
2907 #endif
2908                 if ((vp->v_vflag & VV_ROOT) != 0) {
2909                         mp = atomic_load_ptr(&vp->v_mount);
2910                         if (mp == NULL) {
2911                                 cache_rev_failed(&reason);
2912                                 goto out_abort;
2913                         }
2914                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2915                         tvp_seqc = vn_seqc_read_any(tvp);
2916                         if (seqc_in_modify(tvp_seqc)) {
2917                                 cache_rev_failed(&reason);
2918                                 goto out_abort;
2919                         }
2920                         if (!vn_seqc_consistent(vp, vp_seqc)) {
2921                                 cache_rev_failed(&reason);
2922                                 goto out_abort;
2923                         }
2924                         vp = tvp;
2925                         vp_seqc = tvp_seqc;
2926                         continue;
2927                 }
2928                 ncp = atomic_load_ptr(&vp->v_cache_dd);
2929                 if (ncp == NULL) {
2930                         cache_rev_failed(&reason);
2931                         goto out_abort;
2932                 }
2933                 nc_flag = atomic_load_char(&ncp->nc_flag);
2934                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2935                         cache_rev_failed(&reason);
2936                         goto out_abort;
2937                 }
2938                 if (!cache_ncp_canuse(ncp)) {
2939                         cache_rev_failed(&reason);
2940                         goto out_abort;
2941                 }
2942                 if (ncp->nc_nlen >= *buflen) {
2943                         cache_rev_failed(&reason);
2944                         error = ENOMEM;
2945                         goto out_abort;
2946                 }
2947                 *buflen -= ncp->nc_nlen;
2948                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2949                 *buflen -= 1;
2950                 buf[*buflen] = '/';
2951                 tvp = ncp->nc_dvp;
2952                 tvp_seqc = vn_seqc_read_any(tvp);
2953                 if (seqc_in_modify(tvp_seqc)) {
2954                         cache_rev_failed(&reason);
2955                         goto out_abort;
2956                 }
2957                 if (!vn_seqc_consistent(vp, vp_seqc)) {
2958                         cache_rev_failed(&reason);
2959                         goto out_abort;
2960                 }
2961                 vp = tvp;
2962                 vp_seqc = tvp_seqc;
2963                 if (vp == rdir || vp == rootvnode)
2964                         break;
2965         }
2966 out_ok:
2967         vfs_smr_exit();
2968         *retbuf = buf + *buflen;
2969         *buflen = orig_buflen - *buflen + addend;
2970         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2971         return (0);
2972
2973 out_abort:
2974         *buflen = orig_buflen;
2975         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2976         vfs_smr_exit();
2977         return (error);
2978 }
2979
2980 static int
2981 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2982     size_t *buflen)
2983 {
2984         size_t orig_buflen;
2985         bool slash_prefixed;
2986         int error;
2987
2988         if (*buflen < 2)
2989                 return (EINVAL);
2990
2991         orig_buflen = *buflen;
2992
2993         vref(vp);
2994         slash_prefixed = false;
2995         if (vp->v_type != VDIR) {
2996                 *buflen -= 1;
2997                 buf[*buflen] = '\0';
2998                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2999                 if (error)
3000                         return (error);
3001                 if (*buflen == 0) {
3002                         vrele(vp);
3003                         return (ENOMEM);
3004                 }
3005                 *buflen -= 1;
3006                 buf[*buflen] = '/';
3007                 slash_prefixed = true;
3008         }
3009
3010         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
3011             orig_buflen - *buflen));
3012 }
3013
3014 /*
3015  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3016  *
3017  * Since the namecache does not track handlings, the caller is expected to first
3018  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3019  *
3020  * Then we have 2 cases:
3021  * - if the found vnode is a directory, the path can be constructed just by
3022  *   fullowing names up the chain
3023  * - otherwise we populate the buffer with the saved name and start resolving
3024  *   from the parent
3025  */
3026 static int
3027 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3028     size_t *buflen)
3029 {
3030         char *buf, *tmpbuf;
3031         struct pwd *pwd;
3032         struct componentname *cnp;
3033         struct vnode *vp;
3034         size_t addend;
3035         int error;
3036         bool slash_prefixed;
3037         enum vtype type;
3038
3039         if (*buflen < 2)
3040                 return (EINVAL);
3041         if (*buflen > MAXPATHLEN)
3042                 *buflen = MAXPATHLEN;
3043
3044         slash_prefixed = false;
3045
3046         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3047
3048         addend = 0;
3049         vp = ndp->ni_vp;
3050         /*
3051          * Check for VBAD to work around the vp_crossmp bug in lookup().
3052          *
3053          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3054          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3055          * If the type is VDIR (like in this very case) we can skip looking
3056          * at ni_dvp in the first place. However, since vnodes get passed here
3057          * unlocked the target may transition to doomed state (type == VBAD)
3058          * before we get to evaluate the condition. If this happens, we will
3059          * populate part of the buffer and descend to vn_fullpath_dir with
3060          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3061          *
3062          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3063          * an address of a bit field, even if said field is sized to char.
3064          * Work around the problem by reading the value into a full-sized enum
3065          * and then re-reading it with atomic_load which will still prevent
3066          * the compiler from re-reading down the road.
3067          */
3068         type = vp->v_type;
3069         type = atomic_load_int(&type);
3070         if (type == VBAD) {
3071                 error = ENOENT;
3072                 goto out_bad;
3073         }
3074         if (type != VDIR) {
3075                 cnp = &ndp->ni_cnd;
3076                 addend = cnp->cn_namelen + 2;
3077                 if (*buflen < addend) {
3078                         error = ENOMEM;
3079                         goto out_bad;
3080                 }
3081                 *buflen -= addend;
3082                 tmpbuf = buf + *buflen;
3083                 tmpbuf[0] = '/';
3084                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3085                 tmpbuf[addend - 1] = '\0';
3086                 slash_prefixed = true;
3087                 vp = ndp->ni_dvp;
3088         }
3089
3090         vfs_smr_enter();
3091         pwd = pwd_get_smr();
3092         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3093             slash_prefixed, addend);
3094         VFS_SMR_ASSERT_NOT_ENTERED();
3095         if (error < 0) {
3096                 pwd = pwd_hold(curthread);
3097                 vref(vp);
3098                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3099                     slash_prefixed, addend);
3100                 pwd_drop(pwd);
3101                 if (error != 0)
3102                         goto out_bad;
3103         }
3104
3105         *freebuf = buf;
3106
3107         return (0);
3108 out_bad:
3109         free(buf, M_TEMP);
3110         return (error);
3111 }
3112
3113 struct vnode *
3114 vn_dir_dd_ino(struct vnode *vp)
3115 {
3116         struct namecache *ncp;
3117         struct vnode *ddvp;
3118         struct mtx *vlp;
3119         enum vgetstate vs;
3120
3121         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3122         vlp = VP2VNODELOCK(vp);
3123         mtx_lock(vlp);
3124         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3125                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3126                         continue;
3127                 ddvp = ncp->nc_dvp;
3128                 vs = vget_prep(ddvp);
3129                 mtx_unlock(vlp);
3130                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3131                         return (NULL);
3132                 return (ddvp);
3133         }
3134         mtx_unlock(vlp);
3135         return (NULL);
3136 }
3137
3138 int
3139 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3140 {
3141         struct namecache *ncp;
3142         struct mtx *vlp;
3143         int l;
3144
3145         vlp = VP2VNODELOCK(vp);
3146         mtx_lock(vlp);
3147         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3148                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3149                         break;
3150         if (ncp == NULL) {
3151                 mtx_unlock(vlp);
3152                 return (ENOENT);
3153         }
3154         l = min(ncp->nc_nlen, buflen - 1);
3155         memcpy(buf, ncp->nc_name, l);
3156         mtx_unlock(vlp);
3157         buf[l] = '\0';
3158         return (0);
3159 }
3160
3161 /*
3162  * This function updates path string to vnode's full global path
3163  * and checks the size of the new path string against the pathlen argument.
3164  *
3165  * Requires a locked, referenced vnode.
3166  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3167  *
3168  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3169  * because it falls back to the ".." lookup if the namecache lookup fails.
3170  */
3171 int
3172 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3173     u_int pathlen)
3174 {
3175         struct nameidata nd;
3176         struct vnode *vp1;
3177         char *rpath, *fbuf;
3178         int error;
3179
3180         ASSERT_VOP_ELOCKED(vp, __func__);
3181
3182         /* Construct global filesystem path from vp. */
3183         VOP_UNLOCK(vp);
3184         error = vn_fullpath_global(vp, &rpath, &fbuf);
3185
3186         if (error != 0) {
3187                 vrele(vp);
3188                 return (error);
3189         }
3190
3191         if (strlen(rpath) >= pathlen) {
3192                 vrele(vp);
3193                 error = ENAMETOOLONG;
3194                 goto out;
3195         }
3196
3197         /*
3198          * Re-lookup the vnode by path to detect a possible rename.
3199          * As a side effect, the vnode is relocked.
3200          * If vnode was renamed, return ENOENT.
3201          */
3202         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3203             UIO_SYSSPACE, path, td);
3204         error = namei(&nd);
3205         if (error != 0) {
3206                 vrele(vp);
3207                 goto out;
3208         }
3209         NDFREE(&nd, NDF_ONLY_PNBUF);
3210         vp1 = nd.ni_vp;
3211         vrele(vp);
3212         if (vp1 == vp)
3213                 strcpy(path, rpath);
3214         else {
3215                 vput(vp1);
3216                 error = ENOENT;
3217         }
3218
3219 out:
3220         free(fbuf, M_TEMP);
3221         return (error);
3222 }
3223
3224 #ifdef DDB
3225 static void
3226 db_print_vpath(struct vnode *vp)
3227 {
3228
3229         while (vp != NULL) {
3230                 db_printf("%p: ", vp);
3231                 if (vp == rootvnode) {
3232                         db_printf("/");
3233                         vp = NULL;
3234                 } else {
3235                         if (vp->v_vflag & VV_ROOT) {
3236                                 db_printf("<mount point>");
3237                                 vp = vp->v_mount->mnt_vnodecovered;
3238                         } else {
3239                                 struct namecache *ncp;
3240                                 char *ncn;
3241                                 int i;
3242
3243                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3244                                 if (ncp != NULL) {
3245                                         ncn = ncp->nc_name;
3246                                         for (i = 0; i < ncp->nc_nlen; i++)
3247                                                 db_printf("%c", *ncn++);
3248                                         vp = ncp->nc_dvp;
3249                                 } else {
3250                                         vp = NULL;
3251                                 }
3252                         }
3253                 }
3254                 db_printf("\n");
3255         }
3256
3257         return;
3258 }
3259
3260 DB_SHOW_COMMAND(vpath, db_show_vpath)
3261 {
3262         struct vnode *vp;
3263
3264         if (!have_addr) {
3265                 db_printf("usage: show vpath <struct vnode *>\n");
3266                 return;
3267         }
3268
3269         vp = (struct vnode *)addr;
3270         db_print_vpath(vp);
3271 }
3272
3273 #endif
3274
3275 static bool __read_frequently cache_fast_lookup = true;
3276 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3277     &cache_fast_lookup, 0, "");
3278
3279 #define CACHE_FPL_FAILED        -2020
3280
3281 static void
3282 cache_fpl_cleanup_cnp(struct componentname *cnp)
3283 {
3284
3285         uma_zfree(namei_zone, cnp->cn_pnbuf);
3286 #ifdef DIAGNOSTIC
3287         cnp->cn_pnbuf = NULL;
3288         cnp->cn_nameptr = NULL;
3289 #endif
3290 }
3291
3292 static void
3293 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3294 {
3295         struct componentname *cnp;
3296
3297         cnp = &ndp->ni_cnd;
3298         while (*(cnp->cn_nameptr) == '/') {
3299                 cnp->cn_nameptr++;
3300                 ndp->ni_pathlen--;
3301         }
3302
3303         *dpp = ndp->ni_rootdir;
3304 }
3305
3306 /*
3307  * Components of nameidata (or objects it can point to) which may
3308  * need restoring in case fast path lookup fails.
3309  */
3310 struct nameidata_saved {
3311         long cn_namelen;
3312         char *cn_nameptr;
3313         size_t ni_pathlen;
3314         int cn_flags;
3315 };
3316
3317 struct cache_fpl {
3318         struct nameidata *ndp;
3319         struct componentname *cnp;
3320         struct pwd *pwd;
3321         struct vnode *dvp;
3322         struct vnode *tvp;
3323         seqc_t dvp_seqc;
3324         seqc_t tvp_seqc;
3325         struct nameidata_saved snd;
3326         int line;
3327         enum cache_fpl_status status:8;
3328         bool in_smr;
3329 };
3330
3331 static void
3332 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3333 {
3334
3335         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3336         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3337         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3338         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3339 }
3340
3341 static void
3342 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3343 {
3344
3345         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3346         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3347         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3348         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3349 }
3350
3351 #ifdef INVARIANTS
3352 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3353         struct cache_fpl *_fpl = (fpl);                         \
3354         MPASS(_fpl->in_smr == true);                            \
3355         VFS_SMR_ASSERT_ENTERED();                               \
3356 })
3357 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3358         struct cache_fpl *_fpl = (fpl);                         \
3359         MPASS(_fpl->in_smr == false);                           \
3360         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3361 })
3362 #else
3363 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3364 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3365 #endif
3366
3367 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3368         struct cache_fpl *_fpl = (fpl);                         \
3369         vfs_smr_enter();                                        \
3370         _fpl->in_smr = true;                                    \
3371 })
3372
3373 #define cache_fpl_smr_enter(fpl) ({                             \
3374         struct cache_fpl *_fpl = (fpl);                         \
3375         MPASS(_fpl->in_smr == false);                           \
3376         vfs_smr_enter();                                        \
3377         _fpl->in_smr = true;                                    \
3378 })
3379
3380 #define cache_fpl_smr_exit(fpl) ({                              \
3381         struct cache_fpl *_fpl = (fpl);                         \
3382         MPASS(_fpl->in_smr == true);                            \
3383         vfs_smr_exit();                                         \
3384         _fpl->in_smr = false;                                   \
3385 })
3386
3387 static int
3388 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3389 {
3390
3391         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3392                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3393                     ("%s: converting to abort from %d at %d, set at %d\n",
3394                     __func__, fpl->status, line, fpl->line));
3395         }
3396         fpl->status = CACHE_FPL_STATUS_ABORTED;
3397         fpl->line = line;
3398         return (CACHE_FPL_FAILED);
3399 }
3400
3401 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3402
3403 static int
3404 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3405 {
3406
3407         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3408             ("%s: setting to partial at %d, but already set to %d at %d\n",
3409             __func__, line, fpl->status, fpl->line));
3410         cache_fpl_smr_assert_entered(fpl);
3411         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3412         fpl->line = line;
3413         return (CACHE_FPL_FAILED);
3414 }
3415
3416 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3417
3418 static int
3419 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3420 {
3421
3422         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3423             ("%s: setting to handled at %d, but already set to %d at %d\n",
3424             __func__, line, fpl->status, fpl->line));
3425         cache_fpl_smr_assert_not_entered(fpl);
3426         MPASS(error != CACHE_FPL_FAILED);
3427         fpl->status = CACHE_FPL_STATUS_HANDLED;
3428         fpl->line = line;
3429         return (error);
3430 }
3431
3432 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3433
3434 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3435         (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3436          SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3437
3438 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3439         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3440
3441 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3442     "supported and internal flags overlap");
3443
3444 static bool
3445 cache_fpl_islastcn(struct nameidata *ndp)
3446 {
3447
3448         return (*ndp->ni_next == 0);
3449 }
3450
3451 static bool
3452 cache_fpl_isdotdot(struct componentname *cnp)
3453 {
3454
3455         if (cnp->cn_namelen == 2 &&
3456             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3457                 return (true);
3458         return (false);
3459 }
3460
3461 static bool
3462 cache_can_fplookup(struct cache_fpl *fpl)
3463 {
3464         struct nameidata *ndp;
3465         struct componentname *cnp;
3466         struct thread *td;
3467
3468         ndp = fpl->ndp;
3469         cnp = fpl->cnp;
3470         td = cnp->cn_thread;
3471
3472         if (!cache_fast_lookup) {
3473                 cache_fpl_aborted(fpl);
3474                 return (false);
3475         }
3476 #ifdef MAC
3477         if (mac_vnode_check_lookup_enabled()) {
3478                 cache_fpl_aborted(fpl);
3479                 return (false);
3480         }
3481 #endif
3482         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3483                 cache_fpl_aborted(fpl);
3484                 return (false);
3485         }
3486         if (ndp->ni_dirfd != AT_FDCWD) {
3487                 cache_fpl_aborted(fpl);
3488                 return (false);
3489         }
3490         if (IN_CAPABILITY_MODE(td)) {
3491                 cache_fpl_aborted(fpl);
3492                 return (false);
3493         }
3494         if (AUDITING_TD(td)) {
3495                 cache_fpl_aborted(fpl);
3496                 return (false);
3497         }
3498         if (ndp->ni_startdir != NULL) {
3499                 cache_fpl_aborted(fpl);
3500                 return (false);
3501         }
3502         return (true);
3503 }
3504
3505 static bool
3506 cache_fplookup_vnode_supported(struct vnode *vp)
3507 {
3508
3509         return (vp->v_type != VLNK);
3510 }
3511
3512 /*
3513  * Move a negative entry to the hot list.
3514  *
3515  * We have to take locks, but they may be contended and in the worst
3516  * case we may need to go off CPU. We don't want to spin within the
3517  * smr section and we can't block with it. Instead we are going to
3518  * look up the entry again.
3519  */
3520 static int __noinline
3521 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3522     uint32_t hash)
3523 {
3524         struct componentname *cnp;
3525         struct namecache *ncp;
3526         struct neglist *neglist;
3527         struct negstate *negstate;
3528         struct vnode *dvp;
3529         u_char nc_flag;
3530
3531         cnp = fpl->cnp;
3532         dvp = fpl->dvp;
3533
3534         if (!vhold_smr(dvp))
3535                 return (cache_fpl_aborted(fpl));
3536
3537         neglist = NCP2NEGLIST(oncp);
3538         cache_fpl_smr_exit(fpl);
3539
3540         mtx_lock(&ncneg_hot.nl_lock);
3541         mtx_lock(&neglist->nl_lock);
3542         /*
3543          * For hash iteration.
3544          */
3545         cache_fpl_smr_enter(fpl);
3546
3547         /*
3548          * Avoid all surprises by only succeeding if we got the same entry and
3549          * bailing completely otherwise.
3550          *
3551          * In particular at this point there can be a new ncp which matches the
3552          * search but hashes to a different neglist.
3553          */
3554         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3555                 if (ncp == oncp)
3556                         break;
3557         }
3558
3559         /*
3560          * No match to begin with.
3561          */
3562         if (__predict_false(ncp == NULL)) {
3563                 goto out_abort;
3564         }
3565
3566         /*
3567          * The newly found entry may be something different...
3568          */
3569         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3570             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3571                 goto out_abort;
3572         }
3573
3574         /*
3575          * ... and not even negative.
3576          */
3577         nc_flag = atomic_load_char(&ncp->nc_flag);
3578         if ((nc_flag & NCF_NEGATIVE) == 0) {
3579                 goto out_abort;
3580         }
3581
3582         if (__predict_false(!cache_ncp_canuse(ncp))) {
3583                 goto out_abort;
3584         }
3585
3586         negstate = NCP2NEGSTATE(ncp);
3587         if ((negstate->neg_flag & NEG_HOT) == 0) {
3588                 numhotneg++;
3589                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3590                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3591                 negstate->neg_flag |= NEG_HOT;
3592         }
3593
3594         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3595         counter_u64_add(numneghits, 1);
3596         cache_fpl_smr_exit(fpl);
3597         mtx_unlock(&neglist->nl_lock);
3598         mtx_unlock(&ncneg_hot.nl_lock);
3599         vdrop(dvp);
3600         return (cache_fpl_handled(fpl, ENOENT));
3601 out_abort:
3602         cache_fpl_smr_exit(fpl);
3603         mtx_unlock(&neglist->nl_lock);
3604         mtx_unlock(&ncneg_hot.nl_lock);
3605         vdrop(dvp);
3606         return (cache_fpl_aborted(fpl));
3607 }
3608
3609 /*
3610  * The target vnode is not supported, prepare for the slow path to take over.
3611  */
3612 static int __noinline
3613 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3614 {
3615         struct nameidata *ndp;
3616         struct componentname *cnp;
3617         enum vgetstate dvs;
3618         struct vnode *dvp;
3619         struct pwd *pwd;
3620         seqc_t dvp_seqc;
3621
3622         ndp = fpl->ndp;
3623         cnp = fpl->cnp;
3624         dvp = fpl->dvp;
3625         dvp_seqc = fpl->dvp_seqc;
3626
3627         dvs = vget_prep_smr(dvp);
3628         if (__predict_false(dvs == VGET_NONE)) {
3629                 cache_fpl_smr_exit(fpl);
3630                 return (cache_fpl_aborted(fpl));
3631         }
3632
3633         cache_fpl_smr_exit(fpl);
3634
3635         vget_finish_ref(dvp, dvs);
3636         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3637                 vrele(dvp);
3638                 return (cache_fpl_aborted(fpl));
3639         }
3640
3641         pwd = pwd_hold(curthread);
3642         if (fpl->pwd != pwd) {
3643                 vrele(dvp);
3644                 pwd_drop(pwd);
3645                 return (cache_fpl_aborted(fpl));
3646         }
3647
3648         cache_fpl_restore(fpl, &fpl->snd);
3649
3650         ndp->ni_startdir = dvp;
3651         cnp->cn_flags |= MAKEENTRY;
3652         if (cache_fpl_islastcn(ndp))
3653                 cnp->cn_flags |= ISLASTCN;
3654         if (cache_fpl_isdotdot(cnp))
3655                 cnp->cn_flags |= ISDOTDOT;
3656
3657         return (0);
3658 }
3659
3660 static int
3661 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3662 {
3663         struct componentname *cnp;
3664         struct vnode *tvp;
3665         seqc_t tvp_seqc;
3666         int error, lkflags;
3667
3668         cnp = fpl->cnp;
3669         tvp = fpl->tvp;
3670         tvp_seqc = fpl->tvp_seqc;
3671
3672         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3673                 lkflags = LK_SHARED;
3674                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3675                         lkflags = LK_EXCLUSIVE;
3676                 error = vget_finish(tvp, lkflags, tvs);
3677                 if (__predict_false(error != 0)) {
3678                         return (cache_fpl_aborted(fpl));
3679                 }
3680         } else {
3681                 vget_finish_ref(tvp, tvs);
3682         }
3683
3684         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3685                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3686                         vput(tvp);
3687                 else
3688                         vrele(tvp);
3689                 return (cache_fpl_aborted(fpl));
3690         }
3691
3692         return (cache_fpl_handled(fpl, 0));
3693 }
3694
3695 /*
3696  * They want to possibly modify the state of the namecache.
3697  *
3698  * Don't try to match the API contract, just leave.
3699  * TODO: this leaves scalability on the table
3700  */
3701 static int
3702 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3703 {
3704         struct componentname *cnp;
3705
3706         cnp = fpl->cnp;
3707         MPASS(cnp->cn_nameiop != LOOKUP);
3708         return (cache_fpl_partial(fpl));
3709 }
3710
3711 static int __noinline
3712 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3713 {
3714         struct componentname *cnp;
3715         enum vgetstate dvs, tvs;
3716         struct vnode *dvp, *tvp;
3717         seqc_t dvp_seqc, tvp_seqc;
3718         int error;
3719
3720         cnp = fpl->cnp;
3721         dvp = fpl->dvp;
3722         dvp_seqc = fpl->dvp_seqc;
3723         tvp = fpl->tvp;
3724         tvp_seqc = fpl->tvp_seqc;
3725
3726         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3727
3728         /*
3729          * This is less efficient than it can be for simplicity.
3730          */
3731         dvs = vget_prep_smr(dvp);
3732         if (__predict_false(dvs == VGET_NONE)) {
3733                 return (cache_fpl_aborted(fpl));
3734         }
3735         tvs = vget_prep_smr(tvp);
3736         if (__predict_false(tvs == VGET_NONE)) {
3737                 cache_fpl_smr_exit(fpl);
3738                 vget_abort(dvp, dvs);
3739                 return (cache_fpl_aborted(fpl));
3740         }
3741
3742         cache_fpl_smr_exit(fpl);
3743
3744         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3745                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3746                 if (__predict_false(error != 0)) {
3747                         vget_abort(tvp, tvs);
3748                         return (cache_fpl_aborted(fpl));
3749                 }
3750         } else {
3751                 vget_finish_ref(dvp, dvs);
3752         }
3753
3754         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3755                 vget_abort(tvp, tvs);
3756                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3757                         vput(dvp);
3758                 else
3759                         vrele(dvp);
3760                 return (cache_fpl_aborted(fpl));
3761         }
3762
3763         error = cache_fplookup_final_child(fpl, tvs);
3764         if (__predict_false(error != 0)) {
3765                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3766                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3767                         vput(dvp);
3768                 else
3769                         vrele(dvp);
3770                 return (error);
3771         }
3772
3773         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3774         return (0);
3775 }
3776
3777 static int
3778 cache_fplookup_final(struct cache_fpl *fpl)
3779 {
3780         struct componentname *cnp;
3781         enum vgetstate tvs;
3782         struct vnode *dvp, *tvp;
3783         seqc_t dvp_seqc, tvp_seqc;
3784
3785         cnp = fpl->cnp;
3786         dvp = fpl->dvp;
3787         dvp_seqc = fpl->dvp_seqc;
3788         tvp = fpl->tvp;
3789         tvp_seqc = fpl->tvp_seqc;
3790
3791         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3792
3793         if (cnp->cn_nameiop != LOOKUP) {
3794                 return (cache_fplookup_final_modifying(fpl));
3795         }
3796
3797         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3798                 return (cache_fplookup_final_withparent(fpl));
3799
3800         tvs = vget_prep_smr(tvp);
3801         if (__predict_false(tvs == VGET_NONE)) {
3802                 return (cache_fpl_partial(fpl));
3803         }
3804
3805         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3806                 cache_fpl_smr_exit(fpl);
3807                 vget_abort(tvp, tvs);
3808                 return (cache_fpl_aborted(fpl));
3809         }
3810
3811         cache_fpl_smr_exit(fpl);
3812         return (cache_fplookup_final_child(fpl, tvs));
3813 }
3814
3815 static int __noinline
3816 cache_fplookup_dot(struct cache_fpl *fpl)
3817 {
3818         struct vnode *dvp;
3819
3820         dvp = fpl->dvp;
3821
3822         fpl->tvp = dvp;
3823         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3824         if (seqc_in_modify(fpl->tvp_seqc)) {
3825                 return (cache_fpl_aborted(fpl));
3826         }
3827
3828         counter_u64_add(dothits, 1);
3829         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3830
3831         return (0);
3832 }
3833
3834 static int __noinline
3835 cache_fplookup_dotdot(struct cache_fpl *fpl)
3836 {
3837         struct nameidata *ndp;
3838         struct componentname *cnp;
3839         struct namecache *ncp;
3840         struct vnode *dvp;
3841         struct prison *pr;
3842         u_char nc_flag;
3843
3844         ndp = fpl->ndp;
3845         cnp = fpl->cnp;
3846         dvp = fpl->dvp;
3847
3848         /*
3849          * XXX this is racy the same way regular lookup is
3850          */
3851         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3852             pr = pr->pr_parent)
3853                 if (dvp == pr->pr_root)
3854                         break;
3855
3856         if (dvp == ndp->ni_rootdir ||
3857             dvp == ndp->ni_topdir ||
3858             dvp == rootvnode ||
3859             pr != NULL) {
3860                 fpl->tvp = dvp;
3861                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3862                 if (seqc_in_modify(fpl->tvp_seqc)) {
3863                         return (cache_fpl_aborted(fpl));
3864                 }
3865                 return (0);
3866         }
3867
3868         if ((dvp->v_vflag & VV_ROOT) != 0) {
3869                 /*
3870                  * TODO
3871                  * The opposite of climb mount is needed here.
3872                  */
3873                 return (cache_fpl_aborted(fpl));
3874         }
3875
3876         ncp = atomic_load_ptr(&dvp->v_cache_dd);
3877         if (ncp == NULL) {
3878                 return (cache_fpl_aborted(fpl));
3879         }
3880
3881         nc_flag = atomic_load_char(&ncp->nc_flag);
3882         if ((nc_flag & NCF_ISDOTDOT) != 0) {
3883                 if ((nc_flag & NCF_NEGATIVE) != 0)
3884                         return (cache_fpl_aborted(fpl));
3885                 fpl->tvp = ncp->nc_vp;
3886         } else {
3887                 fpl->tvp = ncp->nc_dvp;
3888         }
3889
3890         if (__predict_false(!cache_ncp_canuse(ncp))) {
3891                 return (cache_fpl_aborted(fpl));
3892         }
3893
3894         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3895         if (seqc_in_modify(fpl->tvp_seqc)) {
3896                 return (cache_fpl_partial(fpl));
3897         }
3898
3899         counter_u64_add(dotdothits, 1);
3900         return (0);
3901 }
3902
3903 static int
3904 cache_fplookup_next(struct cache_fpl *fpl)
3905 {
3906         struct componentname *cnp;
3907         struct namecache *ncp;
3908         struct negstate *negstate;
3909         struct vnode *dvp, *tvp;
3910         u_char nc_flag;
3911         uint32_t hash;
3912         bool neg_hot;
3913
3914         cnp = fpl->cnp;
3915         dvp = fpl->dvp;
3916
3917         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3918                 return (cache_fplookup_dot(fpl));
3919         }
3920
3921         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3922
3923         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3924                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3925                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3926                         break;
3927         }
3928
3929         /*
3930          * If there is no entry we have to punt to the slow path to perform
3931          * actual lookup. Should there be nothing with this name a negative
3932          * entry will be created.
3933          */
3934         if (__predict_false(ncp == NULL)) {
3935                 return (cache_fpl_partial(fpl));
3936         }
3937
3938         tvp = atomic_load_ptr(&ncp->nc_vp);
3939         nc_flag = atomic_load_char(&ncp->nc_flag);
3940         if ((nc_flag & NCF_NEGATIVE) != 0) {
3941                 /*
3942                  * If they want to create an entry we need to replace this one.
3943                  */
3944                 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3945                         return (cache_fpl_partial(fpl));
3946                 }
3947                 negstate = NCP2NEGSTATE(ncp);
3948                 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3949                 if (__predict_false(!cache_ncp_canuse(ncp))) {
3950                         return (cache_fpl_partial(fpl));
3951                 }
3952                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3953                         return (cache_fpl_partial(fpl));
3954                 }
3955                 if (!neg_hot) {
3956                         return (cache_fplookup_negative_promote(fpl, ncp, hash));
3957                 }
3958                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3959                     ncp->nc_name);
3960                 counter_u64_add(numneghits, 1);
3961                 cache_fpl_smr_exit(fpl);
3962                 return (cache_fpl_handled(fpl, ENOENT));
3963         }
3964
3965         if (__predict_false(!cache_ncp_canuse(ncp))) {
3966                 return (cache_fpl_partial(fpl));
3967         }
3968
3969         fpl->tvp = tvp;
3970         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3971         if (seqc_in_modify(fpl->tvp_seqc)) {
3972                 return (cache_fpl_partial(fpl));
3973         }
3974
3975         if (!cache_fplookup_vnode_supported(tvp)) {
3976                 return (cache_fpl_partial(fpl));
3977         }
3978
3979         counter_u64_add(numposhits, 1);
3980         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3981         return (0);
3982 }
3983
3984 static bool
3985 cache_fplookup_mp_supported(struct mount *mp)
3986 {
3987
3988         if (mp == NULL)
3989                 return (false);
3990         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3991                 return (false);
3992         return (true);
3993 }
3994
3995 /*
3996  * Walk up the mount stack (if any).
3997  *
3998  * Correctness is provided in the following ways:
3999  * - all vnodes are protected from freeing with SMR
4000  * - struct mount objects are type stable making them always safe to access
4001  * - stability of the particular mount is provided by busying it
4002  * - relationship between the vnode which is mounted on and the mount is
4003  *   verified with the vnode sequence counter after busying
4004  * - association between root vnode of the mount and the mount is protected
4005  *   by busy
4006  *
4007  * From that point on we can read the sequence counter of the root vnode
4008  * and get the next mount on the stack (if any) using the same protection.
4009  *
4010  * By the end of successful walk we are guaranteed the reached state was
4011  * indeed present at least at some point which matches the regular lookup.
4012  */
4013 static int __noinline
4014 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4015 {
4016         struct mount *mp, *prev_mp;
4017         struct vnode *vp;
4018         seqc_t vp_seqc;
4019
4020         vp = fpl->tvp;
4021         vp_seqc = fpl->tvp_seqc;
4022
4023         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4024         mp = atomic_load_ptr(&vp->v_mountedhere);
4025         if (mp == NULL)
4026                 return (0);
4027
4028         prev_mp = NULL;
4029         for (;;) {
4030                 if (!vfs_op_thread_enter_crit(mp)) {
4031                         if (prev_mp != NULL)
4032                                 vfs_op_thread_exit_crit(prev_mp);
4033                         return (cache_fpl_partial(fpl));
4034                 }
4035                 if (prev_mp != NULL)
4036                         vfs_op_thread_exit_crit(prev_mp);
4037                 if (!vn_seqc_consistent(vp, vp_seqc)) {
4038                         vfs_op_thread_exit_crit(mp);
4039                         return (cache_fpl_partial(fpl));
4040                 }
4041                 if (!cache_fplookup_mp_supported(mp)) {
4042                         vfs_op_thread_exit_crit(mp);
4043                         return (cache_fpl_partial(fpl));
4044                 }
4045                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4046                 if (vp == NULL || VN_IS_DOOMED(vp)) {
4047                         vfs_op_thread_exit_crit(mp);
4048                         return (cache_fpl_partial(fpl));
4049                 }
4050                 vp_seqc = vn_seqc_read_any(vp);
4051                 if (seqc_in_modify(vp_seqc)) {
4052                         vfs_op_thread_exit_crit(mp);
4053                         return (cache_fpl_partial(fpl));
4054                 }
4055                 prev_mp = mp;
4056                 mp = atomic_load_ptr(&vp->v_mountedhere);
4057                 if (mp == NULL)
4058                         break;
4059         }
4060
4061         vfs_op_thread_exit_crit(prev_mp);
4062         fpl->tvp = vp;
4063         fpl->tvp_seqc = vp_seqc;
4064         return (0);
4065 }
4066
4067 static bool
4068 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4069 {
4070         struct mount *mp;
4071         struct vnode *vp;
4072
4073         vp = fpl->tvp;
4074
4075         /*
4076          * Hack: while this is a union, the pointer tends to be NULL so save on
4077          * a branch.
4078          */
4079         mp = atomic_load_ptr(&vp->v_mountedhere);
4080         if (mp == NULL)
4081                 return (false);
4082         if (vp->v_type == VDIR)
4083                 return (true);
4084         return (false);
4085 }
4086
4087 /*
4088  * Parse the path.
4089  *
4090  * The code is mostly copy-pasted from regular lookup, see lookup().
4091  * The structure is maintained along with comments for easier maintenance.
4092  * Deduplicating the code will become feasible after fast path lookup
4093  * becomes more feature-complete.
4094  */
4095 static int
4096 cache_fplookup_parse(struct cache_fpl *fpl)
4097 {
4098         struct nameidata *ndp;
4099         struct componentname *cnp;
4100         char *cp;
4101
4102         ndp = fpl->ndp;
4103         cnp = fpl->cnp;
4104
4105         /*
4106          * Search a new directory.
4107          *
4108          * The last component of the filename is left accessible via
4109          * cnp->cn_nameptr for callers that need the name. Callers needing
4110          * the name set the SAVENAME flag. When done, they assume
4111          * responsibility for freeing the pathname buffer.
4112          */
4113         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4114                 continue;
4115         cnp->cn_namelen = cp - cnp->cn_nameptr;
4116         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4117                 cache_fpl_smr_exit(fpl);
4118                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4119         }
4120         ndp->ni_pathlen -= cnp->cn_namelen;
4121         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4122             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4123         ndp->ni_next = cp;
4124
4125         /*
4126          * Replace multiple slashes by a single slash and trailing slashes
4127          * by a null.  This must be done before VOP_LOOKUP() because some
4128          * fs's don't know about trailing slashes.  Remember if there were
4129          * trailing slashes to handle symlinks, existing non-directories
4130          * and non-existing files that won't be directories specially later.
4131          */
4132         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4133                 cp++;
4134                 ndp->ni_pathlen--;
4135                 if (*cp == '\0') {
4136                         /*
4137                          * TODO
4138                          * Regular lookup performs the following:
4139                          * *ndp->ni_next = '\0';
4140                          * cnp->cn_flags |= TRAILINGSLASH;
4141                          *
4142                          * Which is problematic since it modifies data read
4143                          * from userspace. Then if fast path lookup was to
4144                          * abort we would have to either restore it or convey
4145                          * the flag. Since this is a corner case just ignore
4146                          * it for simplicity.
4147                          */
4148                         return (cache_fpl_partial(fpl));
4149                 }
4150         }
4151         ndp->ni_next = cp;
4152
4153         /*
4154          * Check for degenerate name (e.g. / or "")
4155          * which is a way of talking about a directory,
4156          * e.g. like "/." or ".".
4157          *
4158          * TODO
4159          * Another corner case handled by the regular lookup
4160          */
4161         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4162                 return (cache_fpl_partial(fpl));
4163         }
4164         return (0);
4165 }
4166
4167 static void
4168 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4169 {
4170         struct nameidata *ndp;
4171         struct componentname *cnp;
4172
4173         ndp = fpl->ndp;
4174         cnp = fpl->cnp;
4175
4176         cnp->cn_nameptr = ndp->ni_next;
4177         while (*cnp->cn_nameptr == '/') {
4178                 cnp->cn_nameptr++;
4179                 ndp->ni_pathlen--;
4180         }
4181 }
4182
4183 static int __noinline
4184 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4185 {
4186
4187         switch (error) {
4188         case EAGAIN:
4189                 /*
4190                  * Can happen when racing against vgone.
4191                  * */
4192         case EOPNOTSUPP:
4193                 cache_fpl_partial(fpl);
4194                 break;
4195         default:
4196                 /*
4197                  * See the API contract for VOP_FPLOOKUP_VEXEC.
4198                  */
4199                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4200                         error = cache_fpl_aborted(fpl);
4201                 } else {
4202                         cache_fpl_smr_exit(fpl);
4203                         cache_fpl_handled(fpl, error);
4204                 }
4205                 break;
4206         }
4207         return (error);
4208 }
4209
4210 static int
4211 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4212 {
4213         struct nameidata *ndp;
4214         struct componentname *cnp;
4215         struct mount *mp;
4216         int error;
4217
4218         error = CACHE_FPL_FAILED;
4219         ndp = fpl->ndp;
4220         cnp = fpl->cnp;
4221
4222         cache_fpl_checkpoint(fpl, &fpl->snd);
4223
4224         fpl->dvp = dvp;
4225         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4226         if (seqc_in_modify(fpl->dvp_seqc)) {
4227                 cache_fpl_aborted(fpl);
4228                 goto out;
4229         }
4230         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4231         if (!cache_fplookup_mp_supported(mp)) {
4232                 cache_fpl_aborted(fpl);
4233                 goto out;
4234         }
4235
4236         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4237
4238         for (;;) {
4239                 error = cache_fplookup_parse(fpl);
4240                 if (__predict_false(error != 0)) {
4241                         break;
4242                 }
4243
4244                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4245
4246                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4247                 if (__predict_false(error != 0)) {
4248                         error = cache_fplookup_failed_vexec(fpl, error);
4249                         break;
4250                 }
4251
4252                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4253                         error = cache_fplookup_dotdot(fpl);
4254                         if (__predict_false(error != 0)) {
4255                                 break;
4256                         }
4257                 } else {
4258                         error = cache_fplookup_next(fpl);
4259                         if (__predict_false(error != 0)) {
4260                                 break;
4261                         }
4262
4263                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4264
4265                         if (cache_fplookup_need_climb_mount(fpl)) {
4266                                 error = cache_fplookup_climb_mount(fpl);
4267                                 if (__predict_false(error != 0)) {
4268                                         break;
4269                                 }
4270                         }
4271                 }
4272
4273                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4274
4275                 if (cache_fpl_islastcn(ndp)) {
4276                         error = cache_fplookup_final(fpl);
4277                         break;
4278                 }
4279
4280                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4281                         error = cache_fpl_aborted(fpl);
4282                         break;
4283                 }
4284
4285                 fpl->dvp = fpl->tvp;
4286                 fpl->dvp_seqc = fpl->tvp_seqc;
4287
4288                 cache_fplookup_parse_advance(fpl);
4289                 cache_fpl_checkpoint(fpl, &fpl->snd);
4290         }
4291 out:
4292         switch (fpl->status) {
4293         case CACHE_FPL_STATUS_UNSET:
4294                 __assert_unreachable();
4295                 break;
4296         case CACHE_FPL_STATUS_PARTIAL:
4297                 cache_fpl_smr_assert_entered(fpl);
4298                 return (cache_fplookup_partial_setup(fpl));
4299         case CACHE_FPL_STATUS_ABORTED:
4300                 if (fpl->in_smr)
4301                         cache_fpl_smr_exit(fpl);
4302                 return (CACHE_FPL_FAILED);
4303         case CACHE_FPL_STATUS_HANDLED:
4304                 MPASS(error != CACHE_FPL_FAILED);
4305                 cache_fpl_smr_assert_not_entered(fpl);
4306                 if (__predict_false(error != 0)) {
4307                         ndp->ni_dvp = NULL;
4308                         ndp->ni_vp = NULL;
4309                         cache_fpl_cleanup_cnp(cnp);
4310                         return (error);
4311                 }
4312                 ndp->ni_dvp = fpl->dvp;
4313                 ndp->ni_vp = fpl->tvp;
4314                 if (cnp->cn_flags & SAVENAME)
4315                         cnp->cn_flags |= HASBUF;
4316                 else
4317                         cache_fpl_cleanup_cnp(cnp);
4318                 return (error);
4319         }
4320 }
4321
4322 /*
4323  * Fast path lookup protected with SMR and sequence counters.
4324  *
4325  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4326  *
4327  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4328  * outlined below.
4329  *
4330  * Traditional vnode lookup conceptually looks like this:
4331  *
4332  * vn_lock(current);
4333  * for (;;) {
4334  *      next = find();
4335  *      vn_lock(next);
4336  *      vn_unlock(current);
4337  *      current = next;
4338  *      if (last)
4339  *          break;
4340  * }
4341  * return (current);
4342  *
4343  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4344  * any modifications thanks to holding respective locks.
4345  *
4346  * The same guarantee can be provided with a combination of safe memory
4347  * reclamation and sequence counters instead. If all operations which affect
4348  * the relationship between the current vnode and the one we are looking for
4349  * also modify the counter, we can verify whether all the conditions held as
4350  * we made the jump. This includes things like permissions, mount points etc.
4351  * Counter modification is provided by enclosing relevant places in
4352  * vn_seqc_write_begin()/end() calls.
4353  *
4354  * Thus this translates to:
4355  *
4356  * vfs_smr_enter();
4357  * dvp_seqc = seqc_read_any(dvp);
4358  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4359  *     abort();
4360  * for (;;) {
4361  *      tvp = find();
4362  *      tvp_seqc = seqc_read_any(tvp);
4363  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4364  *          abort();
4365  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4366  *          abort();
4367  *      dvp = tvp; // we know nothing of importance has changed
4368  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4369  *      if (last)
4370  *          break;
4371  * }
4372  * vget(); // secure the vnode
4373  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4374  *          abort();
4375  * // at this point we know nothing has changed for any parent<->child pair
4376  * // as they were crossed during the lookup, meaning we matched the guarantee
4377  * // of the locked variant
4378  * return (tvp);
4379  *
4380  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4381  * - they are called while within vfs_smr protection which they must never exit
4382  * - EAGAIN can be returned to denote checking could not be performed, it is
4383  *   always valid to return it
4384  * - if the sequence counter has not changed the result must be valid
4385  * - if the sequence counter has changed both false positives and false negatives
4386  *   are permitted (since the result will be rejected later)
4387  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4388  *
4389  * Caveats to watch out for:
4390  * - vnodes are passed unlocked and unreferenced with nothing stopping
4391  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4392  *   to use atomic_load_ptr to fetch it.
4393  * - the aforementioned object can also get freed, meaning absent other means it
4394  *   should be protected with vfs_smr
4395  * - either safely checking permissions as they are modified or guaranteeing
4396  *   their stability is left to the routine
4397  */
4398 int
4399 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4400     struct pwd **pwdp)
4401 {
4402         struct cache_fpl fpl;
4403         struct pwd *pwd;
4404         struct vnode *dvp;
4405         struct componentname *cnp;
4406         struct nameidata_saved orig;
4407         int error;
4408
4409         MPASS(ndp->ni_lcf == 0);
4410
4411         fpl.status = CACHE_FPL_STATUS_UNSET;
4412         fpl.ndp = ndp;
4413         fpl.cnp = &ndp->ni_cnd;
4414         MPASS(curthread == fpl.cnp->cn_thread);
4415
4416         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4417                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4418
4419         if (!cache_can_fplookup(&fpl)) {
4420                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4421                 *status = fpl.status;
4422                 return (EOPNOTSUPP);
4423         }
4424
4425         cache_fpl_checkpoint(&fpl, &orig);
4426
4427         cache_fpl_smr_enter_initial(&fpl);
4428         pwd = pwd_get_smr();
4429         fpl.pwd = pwd;
4430         ndp->ni_rootdir = pwd->pwd_rdir;
4431         ndp->ni_topdir = pwd->pwd_jdir;
4432
4433         cnp = fpl.cnp;
4434         cnp->cn_nameptr = cnp->cn_pnbuf;
4435         if (cnp->cn_pnbuf[0] == '/') {
4436                 cache_fpl_handle_root(ndp, &dvp);
4437         } else {
4438                 MPASS(ndp->ni_dirfd == AT_FDCWD);
4439                 dvp = pwd->pwd_cdir;
4440         }
4441
4442         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4443
4444         error = cache_fplookup_impl(dvp, &fpl);
4445         cache_fpl_smr_assert_not_entered(&fpl);
4446         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4447
4448         *status = fpl.status;
4449         switch (fpl.status) {
4450         case CACHE_FPL_STATUS_UNSET:
4451                 __assert_unreachable();
4452                 break;
4453         case CACHE_FPL_STATUS_HANDLED:
4454                 SDT_PROBE3(vfs, namei, lookup, return, error,
4455                     (error == 0 ? ndp->ni_vp : NULL), true);
4456                 break;
4457         case CACHE_FPL_STATUS_PARTIAL:
4458                 *pwdp = fpl.pwd;
4459                 /*
4460                  * Status restored by cache_fplookup_partial_setup.
4461                  */
4462                 break;
4463         case CACHE_FPL_STATUS_ABORTED:
4464                 cache_fpl_restore(&fpl, &orig);
4465                 break;
4466         }
4467         return (error);
4468 }