sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 SDT_PROVIDER_DECLARE(vfs);
  83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  84     "struct vnode *");
  85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  86     "char *");
  87 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  88     "const char *");
  89 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  90     "struct namecache *", "int", "int");
  91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  93     "char *", "struct vnode *");
  94 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  95 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  96     "struct vnode *", "char *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
  98     "struct vnode *");
  99 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 100     "struct vnode *", "char *");
 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 102     "char *");
 103 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 104     "struct componentname *");
 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 106     "struct componentname *");
 107 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 108 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 109 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 110 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 111     "struct vnode *");
 112 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 113     "char *");
 114 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
 115     "char *");
 116
 117 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 118 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 119 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 120
 121 /*
 122  * This structure describes the elements in the cache of recent
 123  * names looked up by namei.
 124  */
 125 struct negstate {
 126         u_char neg_flag;
 127 };
 128 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 129     "the state must fit in a union with a pointer without growing it");
 130
 131 struct  namecache {
 132         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 133         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 134         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 135         struct  vnode *nc_dvp;          /* vnode of parent of name */
 136         union {
 137                 struct  vnode *nu_vp;   /* vnode the name refers to */
 138                 struct  negstate nu_neg;/* negative entry state */
 139         } n_un;
 140         u_char  nc_flag;                /* flag bits */
 141         u_char  nc_nlen;                /* length of name */
 142         char    nc_name[0];             /* segment name + nul */
 143 };
 144
 145 /*
 146  * struct namecache_ts repeats struct namecache layout up to the
 147  * nc_nlen member.
 148  * struct namecache_ts is used in place of struct namecache when time(s) need
 149  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 150  * both a non-dotdot directory name plus dotdot for the directory's
 151  * parent.
 152  *
 153  * See below for alignment requirement.
 154  */
 155 struct  namecache_ts {
 156         struct  timespec nc_time;       /* timespec provided by fs */
 157         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 158         int     nc_ticks;               /* ticks value when entry was added */
 159         struct namecache nc_nc;
 160 };
 161
 162 /*
 163  * At least mips n32 performs 64-bit accesses to timespec as found
 164  * in namecache_ts and requires them to be aligned. Since others
 165  * may be in the same spot suffer a little bit and enforce the
 166  * alignment for everyone. Note this is a nop for 64-bit platforms.
 167  */
 168 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 169 #define CACHE_PATH_CUTOFF       39
 170
 171 #define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
 172 #define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
 173 #define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
 174 #define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)
 175
 176 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 177 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 178 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 179 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 180
 181 #define nc_vp           n_un.nu_vp
 182 #define nc_neg          n_un.nu_neg
 183
 184 /*
 185  * Flags in namecache.nc_flag
 186  */
 187 #define NCF_WHITE       0x01
 188 #define NCF_ISDOTDOT    0x02
 189 #define NCF_TS          0x04
 190 #define NCF_DTS         0x08
 191 #define NCF_DVDROP      0x10
 192 #define NCF_NEGATIVE    0x20
 193 #define NCF_INVALID     0x40
 194 #define NCF_WIP         0x80
 195
 196 /*
 197  * Flags in negstate.neg_flag
 198  */
 199 #define NEG_HOT         0x01
 200
 201 /*
 202  * Mark an entry as invalid.
 203  *
 204  * This is called before it starts getting deconstructed.
 205  */
 206 static void
 207 cache_ncp_invalidate(struct namecache *ncp)
 208 {
 209
 210         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 211             ("%s: entry %p already invalid", __func__, ncp));
 212         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 213         atomic_thread_fence_rel();
 214 }
 215
 216 /*
 217  * Check whether the entry can be safely used.
 218  *
 219  * All places which elide locks are supposed to call this after they are
 220  * done with reading from an entry.
 221  */
 222 static bool
 223 cache_ncp_canuse(struct namecache *ncp)
 224 {
 225
 226         atomic_thread_fence_acq();
 227         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 228 }
 229
 230 /*
 231  * Name caching works as follows:
 232  *
 233  * Names found by directory scans are retained in a cache
 234  * for future reference.  It is managed LRU, so frequently
 235  * used names will hang around.  Cache is indexed by hash value
 236  * obtained from (dvp, name) where dvp refers to the directory
 237  * containing name.
 238  *
 239  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 240  * exist) the vnode pointer will be NULL.
 241  *
 242  * Upon reaching the last segment of a path, if the reference
 243  * is for DELETE, or NOCACHE is set (rewrite), and the
 244  * name is located in the cache, it will be dropped.
 245  *
 246  * These locks are used (in the order in which they can be taken):
 247  * NAME         TYPE    ROLE
 248  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 249  * bucketlock   mtx     for access to given set of hash buckets
 250  * neglist      mtx     negative entry LRU management
 251  *
 252  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 253  * shrinking the LRU list.
 254  *
 255  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 256  * order is lower address first. Both are recursive.
 257  *
 258  * "." lookups are lockless.
 259  *
 260  * ".." and vnode -> name lookups require vnodelock.
 261  *
 262  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 263  *
 264  * Insertions and removals of entries require involved vnodes and bucketlocks
 265  * to be locked to provide safe operation against other threads modifying the
 266  * cache.
 267  *
 268  * Some lookups result in removal of the found entry (e.g. getting rid of a
 269  * negative entry with the intent to create a positive one), which poses a
 270  * problem when multiple threads reach the state. Similarly, two different
 271  * threads can purge two different vnodes and try to remove the same name.
 272  *
 273  * If the already held vnode lock is lower than the second required lock, we
 274  * can just take the other lock. However, in the opposite case, this could
 275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 276  * the first node, locking everything in order and revalidating the state.
 277  */
 278
 279 VFS_SMR_DECLARE;
 280
 281 /*
 282  * Structures associated with name caching.
 283  */
 284 #define NCHHASH(hash) \
 285         (&nchashtbl[(hash) & nchash])
 286 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 287 static u_long __read_mostly     nchash;                 /* size of hash table */
 288 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 289     "Size of namecache hash table");
 290 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 291 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 292     "Ratio of negative namecache entries");
 293 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 294 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 295 u_int ncsizefactor = 2;
 296 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 297     "Size factor for namecache");
 298 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 299
 300 struct nchstats nchstats;               /* cache effectiveness statistics */
 301
 302 static bool __read_frequently cache_fast_revlookup = true;
 303 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 304     &cache_fast_revlookup, 0, "");
 305
 306 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
 307
 308 #define ncneghash       3
 309 #define numneglists     (ncneghash + 1)
 310
 311 struct neglist {
 312         struct mtx              nl_lock;
 313         TAILQ_HEAD(, namecache) nl_list;
 314         TAILQ_HEAD(, namecache) nl_hotlist;
 315         u_long                  nl_hotnum;
 316 } __aligned(CACHE_LINE_SIZE);
 317
 318 static struct neglist neglists[numneglists];
 319
 320 static inline struct neglist *
 321 NCP2NEGLIST(struct namecache *ncp)
 322 {
 323
 324         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 325 }
 326
 327 static inline struct negstate *
 328 NCP2NEGSTATE(struct namecache *ncp)
 329 {
 330
 331         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 332         return (&ncp->nc_neg);
 333 }
 334
 335 #define numbucketlocks (ncbuckethash + 1)
 336 static u_int __read_mostly  ncbuckethash;
 337 static struct mtx_padalign __read_mostly  *bucketlocks;
 338 #define HASH2BUCKETLOCK(hash) \
 339         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 340
 341 #define numvnodelocks (ncvnodehash + 1)
 342 static u_int __read_mostly  ncvnodehash;
 343 static struct mtx __read_mostly *vnodelocks;
 344 static inline struct mtx *
 345 VP2VNODELOCK(struct vnode *vp)
 346 {
 347
 348         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 349 }
 350
 351 /*
 352  * UMA zones for the VFS cache.
 353  *
 354  * The small cache is used for entries with short names, which are the
 355  * most common.  The large cache is used for entries which are too big to
 356  * fit in the small cache.
 357  */
 358 static uma_zone_t __read_mostly cache_zone_small;
 359 static uma_zone_t __read_mostly cache_zone_small_ts;
 360 static uma_zone_t __read_mostly cache_zone_large;
 361 static uma_zone_t __read_mostly cache_zone_large_ts;
 362
 363 static struct namecache *
 364 cache_alloc(int len, int ts)
 365 {
 366         struct namecache_ts *ncp_ts;
 367         struct namecache *ncp;
 368
 369         if (__predict_false(ts)) {
 370                 if (len <= CACHE_PATH_CUTOFF)
 371                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 372                 else
 373                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 374                 ncp = &ncp_ts->nc_nc;
 375         } else {
 376                 if (len <= CACHE_PATH_CUTOFF)
 377                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 378                 else
 379                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 380         }
 381         return (ncp);
 382 }
 383
 384 static void
 385 cache_free(struct namecache *ncp)
 386 {
 387         struct namecache_ts *ncp_ts;
 388
 389         MPASS(ncp != NULL);
 390         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 391                 vdrop(ncp->nc_dvp);
 392         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 393                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 394                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 395                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 396                 else
 397                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 398         } else {
 399                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 400                         uma_zfree_smr(cache_zone_small, ncp);
 401                 else
 402                         uma_zfree_smr(cache_zone_large, ncp);
 403         }
 404 }
 405
 406 static void
 407 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 408 {
 409         struct namecache_ts *ncp_ts;
 410
 411         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 412             (tsp == NULL && ticksp == NULL),
 413             ("No NCF_TS"));
 414
 415         if (tsp == NULL)
 416                 return;
 417
 418         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 419         *tsp = ncp_ts->nc_time;
 420         *ticksp = ncp_ts->nc_ticks;
 421 }
 422
 423 #ifdef DEBUG_CACHE
 424 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 425 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 426     "VFS namecache enabled");
 427 #endif
 428
 429 /* Export size information to userland */
 430 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 431     sizeof(struct namecache), "sizeof(struct namecache)");
 432
 433 /*
 434  * The new name cache statistics
 435  */
 436 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 437     "Name cache statistics");
 438 #define STATNODE_ULONG(name, descr)                                     \
 439         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 440 #define STATNODE_COUNTER(name, descr)                                   \
 441         static COUNTER_U64_DEFINE_EARLY(name);                          \
 442         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 443             descr);
 444 STATNODE_ULONG(numneg, "Number of negative cache entries");
 445 STATNODE_ULONG(numcache, "Number of cache entries");
 446 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 447 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 448 STATNODE_COUNTER(dothits, "Number of '.' hits");
 449 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 450 STATNODE_COUNTER(nummiss, "Number of cache misses");
 451 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 452 STATNODE_COUNTER(numposzaps,
 453     "Number of cache hits (positive) we do not want to cache");
 454 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 455 STATNODE_COUNTER(numnegzaps,
 456     "Number of cache hits (negative) we do not want to cache");
 457 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 458 /* These count for vn_getcwd(), too. */
 459 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 460 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 461 STATNODE_COUNTER(numfullpathfail2,
 462     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 463 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 464 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 465 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
 466     "Number of successful removals after relocking");
 467 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
 468     "Number of times zap_and_exit failed to lock");
 469 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
 470     "Number of times zap_and_exit failed to lock");
 471 static long cache_lock_vnodes_cel_3_failures;
 472 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
 473     "Number of times 3-way vnode locking failed");
 474 STATNODE_COUNTER(numneg_evicted,
 475     "Number of negative entries evicted when adding a new entry");
 476 STATNODE_COUNTER(shrinking_skipped,
 477     "Number of times shrinking was already in progress");
 478
 479 static void cache_zap_locked(struct namecache *ncp);
 480 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 481     char **freebuf, size_t *buflen);
 482 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 483     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
 484 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 485     char **retbuf, size_t *buflen);
 486 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 487     char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 488
 489 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 490
 491 static inline void
 492 cache_assert_vlp_locked(struct mtx *vlp)
 493 {
 494
 495         if (vlp != NULL)
 496                 mtx_assert(vlp, MA_OWNED);
 497 }
 498
 499 static inline void
 500 cache_assert_vnode_locked(struct vnode *vp)
 501 {
 502         struct mtx *vlp;
 503
 504         vlp = VP2VNODELOCK(vp);
 505         cache_assert_vlp_locked(vlp);
 506 }
 507
 508 /*
 509  * TODO: With the value stored we can do better than computing the hash based
 510  * on the address. The choice of FNV should also be revisited.
 511  */
 512 static void
 513 cache_prehash(struct vnode *vp)
 514 {
 515
 516         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 517 }
 518
 519 static uint32_t
 520 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 521 {
 522
 523         return (fnv_32_buf(name, len, dvp->v_nchash));
 524 }
 525
 526 static inline struct nchashhead *
 527 NCP2BUCKET(struct namecache *ncp)
 528 {
 529         uint32_t hash;
 530
 531         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 532         return (NCHHASH(hash));
 533 }
 534
 535 static inline struct mtx *
 536 NCP2BUCKETLOCK(struct namecache *ncp)
 537 {
 538         uint32_t hash;
 539
 540         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 541         return (HASH2BUCKETLOCK(hash));
 542 }
 543
 544 #ifdef INVARIANTS
 545 static void
 546 cache_assert_bucket_locked(struct namecache *ncp)
 547 {
 548         struct mtx *blp;
 549
 550         blp = NCP2BUCKETLOCK(ncp);
 551         mtx_assert(blp, MA_OWNED);
 552 }
 553
 554 static void
 555 cache_assert_bucket_unlocked(struct namecache *ncp)
 556 {
 557         struct mtx *blp;
 558
 559         blp = NCP2BUCKETLOCK(ncp);
 560         mtx_assert(blp, MA_NOTOWNED);
 561 }
 562 #else
 563 #define cache_assert_bucket_locked(x) do { } while (0)
 564 #define cache_assert_bucket_unlocked(x) do { } while (0)
 565 #endif
 566
 567 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 568 static void
 569 _cache_sort_vnodes(void **p1, void **p2)
 570 {
 571         void *tmp;
 572
 573         MPASS(*p1 != NULL || *p2 != NULL);
 574
 575         if (*p1 > *p2) {
 576                 tmp = *p2;
 577                 *p2 = *p1;
 578                 *p1 = tmp;
 579         }
 580 }
 581
 582 static void
 583 cache_lock_all_buckets(void)
 584 {
 585         u_int i;
 586
 587         for (i = 0; i < numbucketlocks; i++)
 588                 mtx_lock(&bucketlocks[i]);
 589 }
 590
 591 static void
 592 cache_unlock_all_buckets(void)
 593 {
 594         u_int i;
 595
 596         for (i = 0; i < numbucketlocks; i++)
 597                 mtx_unlock(&bucketlocks[i]);
 598 }
 599
 600 static void
 601 cache_lock_all_vnodes(void)
 602 {
 603         u_int i;
 604
 605         for (i = 0; i < numvnodelocks; i++)
 606                 mtx_lock(&vnodelocks[i]);
 607 }
 608
 609 static void
 610 cache_unlock_all_vnodes(void)
 611 {
 612         u_int i;
 613
 614         for (i = 0; i < numvnodelocks; i++)
 615                 mtx_unlock(&vnodelocks[i]);
 616 }
 617
 618 static int
 619 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 620 {
 621
 622         cache_sort_vnodes(&vlp1, &vlp2);
 623
 624         if (vlp1 != NULL) {
 625                 if (!mtx_trylock(vlp1))
 626                         return (EAGAIN);
 627         }
 628         if (!mtx_trylock(vlp2)) {
 629                 if (vlp1 != NULL)
 630                         mtx_unlock(vlp1);
 631                 return (EAGAIN);
 632         }
 633
 634         return (0);
 635 }
 636
 637 static void
 638 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 639 {
 640
 641         MPASS(vlp1 != NULL || vlp2 != NULL);
 642         MPASS(vlp1 <= vlp2);
 643
 644         if (vlp1 != NULL)
 645                 mtx_lock(vlp1);
 646         if (vlp2 != NULL)
 647                 mtx_lock(vlp2);
 648 }
 649
 650 static void
 651 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 652 {
 653
 654         MPASS(vlp1 != NULL || vlp2 != NULL);
 655
 656         if (vlp1 != NULL)
 657                 mtx_unlock(vlp1);
 658         if (vlp2 != NULL)
 659                 mtx_unlock(vlp2);
 660 }
 661
 662 static int
 663 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 664 {
 665         struct nchstats snap;
 666
 667         if (req->oldptr == NULL)
 668                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 669
 670         snap = nchstats;
 671         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 672         snap.ncs_neghits = counter_u64_fetch(numneghits);
 673         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 674             counter_u64_fetch(numnegzaps);
 675         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 676             counter_u64_fetch(nummiss);
 677
 678         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 679 }
 680 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 681     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 682     "VFS cache effectiveness statistics");
 683
 684 static int
 685 sysctl_hotnum(SYSCTL_HANDLER_ARGS)
 686 {
 687         int i, out;
 688
 689         out = 0;
 690         for (i = 0; i < numneglists; i++)
 691                 out += neglists[i].nl_hotnum;
 692
 693         return (SYSCTL_OUT(req, &out, sizeof(out)));
 694 }
 695 SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD |
 696     CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I",
 697     "Number of hot negative entries");
 698
 699 #ifdef DIAGNOSTIC
 700 /*
 701  * Grab an atomic snapshot of the name cache hash chain lengths
 702  */
 703 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 704     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 705     "hash table stats");
 706
 707 static int
 708 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 709 {
 710         struct nchashhead *ncpp;
 711         struct namecache *ncp;
 712         int i, error, n_nchash, *cntbuf;
 713
 714 retry:
 715         n_nchash = nchash + 1;  /* nchash is max index, not count */
 716         if (req->oldptr == NULL)
 717                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 718         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 719         cache_lock_all_buckets();
 720         if (n_nchash != nchash + 1) {
 721                 cache_unlock_all_buckets();
 722                 free(cntbuf, M_TEMP);
 723                 goto retry;
 724         }
 725         /* Scan hash tables counting entries */
 726         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 727                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 728                         cntbuf[i]++;
 729         cache_unlock_all_buckets();
 730         for (error = 0, i = 0; i < n_nchash; i++)
 731                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 732                         break;
 733         free(cntbuf, M_TEMP);
 734         return (error);
 735 }
 736 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 737     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 738     "nchash chain lengths");
 739
 740 static int
 741 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 742 {
 743         int error;
 744         struct nchashhead *ncpp;
 745         struct namecache *ncp;
 746         int n_nchash;
 747         int count, maxlength, used, pct;
 748
 749         if (!req->oldptr)
 750                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 751
 752         cache_lock_all_buckets();
 753         n_nchash = nchash + 1;  /* nchash is max index, not count */
 754         used = 0;
 755         maxlength = 0;
 756
 757         /* Scan hash tables for applicable entries */
 758         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 759                 count = 0;
 760                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 761                         count++;
 762                 }
 763                 if (count)
 764                         used++;
 765                 if (maxlength < count)
 766                         maxlength = count;
 767         }
 768         n_nchash = nchash + 1;
 769         cache_unlock_all_buckets();
 770         pct = (used * 100) / (n_nchash / 100);
 771         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 772         if (error)
 773                 return (error);
 774         error = SYSCTL_OUT(req, &used, sizeof(used));
 775         if (error)
 776                 return (error);
 777         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 778         if (error)
 779                 return (error);
 780         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 781         if (error)
 782                 return (error);
 783         return (0);
 784 }
 785 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 786     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 787     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 788 #endif
 789
 790 /*
 791  * Negative entries management
 792  *
 793  * A variation of LRU scheme is used. New entries are hashed into one of
 794  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 795  *
 796  * The shrinker will demote hot list head and evict from the cold list in a
 797  * round-robin manner.
 798  */
 799 static void
 800 cache_negative_init(struct namecache *ncp)
 801 {
 802         struct negstate *ns;
 803
 804         ncp->nc_flag |= NCF_NEGATIVE;
 805         ns = NCP2NEGSTATE(ncp);
 806         ns->neg_flag = 0;
 807 }
 808
 809 static void
 810 cache_negative_promote(struct namecache *ncp)
 811 {
 812         struct neglist *nl;
 813         struct negstate *ns;
 814
 815         ns = NCP2NEGSTATE(ncp);
 816         nl = NCP2NEGLIST(ncp);
 817         mtx_assert(&nl->nl_lock, MA_OWNED);
 818         if ((ns->neg_flag & NEG_HOT) == 0) {
 819                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 820                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 821                 nl->nl_hotnum++;
 822                 ns->neg_flag |= NEG_HOT;
 823         }
 824 }
 825
 826 static void
 827 cache_negative_hit(struct namecache *ncp)
 828 {
 829         struct neglist *nl;
 830         struct negstate *ns;
 831
 832         ns = NCP2NEGSTATE(ncp);
 833         if ((ns->neg_flag & NEG_HOT) != 0)
 834                 return;
 835         nl = NCP2NEGLIST(ncp);
 836         mtx_lock(&nl->nl_lock);
 837         cache_negative_promote(ncp);
 838         mtx_unlock(&nl->nl_lock);
 839 }
 840
 841 static void
 842 cache_negative_insert(struct namecache *ncp)
 843 {
 844         struct neglist *nl;
 845
 846         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 847         cache_assert_bucket_locked(ncp);
 848         nl = NCP2NEGLIST(ncp);
 849         mtx_lock(&nl->nl_lock);
 850         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 851         mtx_unlock(&nl->nl_lock);
 852         atomic_add_long(&numneg, 1);
 853 }
 854
 855 static void
 856 cache_negative_remove(struct namecache *ncp)
 857 {
 858         struct neglist *nl;
 859         struct negstate *ns;
 860
 861         cache_assert_bucket_locked(ncp);
 862         nl = NCP2NEGLIST(ncp);
 863         ns = NCP2NEGSTATE(ncp);
 864         mtx_lock(&nl->nl_lock);
 865         if ((ns->neg_flag & NEG_HOT) != 0) {
 866                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 867                 nl->nl_hotnum--;
 868         } else {
 869                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 870         }
 871         mtx_unlock(&nl->nl_lock);
 872         atomic_subtract_long(&numneg, 1);
 873 }
 874
 875 static struct neglist *
 876 cache_negative_shrink_select(void)
 877 {
 878         struct neglist *nl;
 879         static u_int cycle;
 880         u_int i;
 881
 882         cycle++;
 883         for (i = 0; i < numneglists; i++) {
 884                 nl = &neglists[(cycle + i) % numneglists];
 885                 if (TAILQ_FIRST(&nl->nl_list) == NULL &&
 886                     TAILQ_FIRST(&nl->nl_hotlist) == NULL)
 887                         continue;
 888                 mtx_lock(&nl->nl_lock);
 889                 if (TAILQ_FIRST(&nl->nl_list) != NULL ||
 890                     TAILQ_FIRST(&nl->nl_hotlist) != NULL)
 891                         return (nl);
 892                 mtx_unlock(&nl->nl_lock);
 893         }
 894
 895         return (NULL);
 896 }
 897
 898 static void
 899 cache_negative_zap_one(void)
 900 {
 901         struct namecache *ncp, *ncp2;
 902         struct neglist *nl;
 903         struct negstate *ns;
 904         struct mtx *dvlp;
 905         struct mtx *blp;
 906
 907         if (mtx_owner(&ncneg_shrink_lock) != NULL ||
 908             !mtx_trylock(&ncneg_shrink_lock)) {
 909                 counter_u64_add(shrinking_skipped, 1);
 910                 return;
 911         }
 912
 913         nl = cache_negative_shrink_select();
 914         mtx_unlock(&ncneg_shrink_lock);
 915         if (nl == NULL) {
 916                 return;
 917         }
 918
 919         ncp = TAILQ_FIRST(&nl->nl_hotlist);
 920         if (ncp != NULL) {
 921                 ns = NCP2NEGSTATE(ncp);
 922                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 923                 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 924                 nl->nl_hotnum--;
 925                 ns->neg_flag &= ~NEG_HOT;
 926         }
 927         ncp = TAILQ_FIRST(&nl->nl_list);
 928         MPASS(ncp != NULL);
 929         ns = NCP2NEGSTATE(ncp);
 930         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 931         blp = NCP2BUCKETLOCK(ncp);
 932         mtx_unlock(&nl->nl_lock);
 933         mtx_lock(dvlp);
 934         mtx_lock(blp);
 935         /*
 936          * Enter SMR to safely check the negative list.
 937          * Even if the found pointer matches, the entry may now be reallocated
 938          * and used by a different vnode.
 939          */
 940         vfs_smr_enter();
 941         ncp2 = TAILQ_FIRST(&nl->nl_list);
 942         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
 943             blp != NCP2BUCKETLOCK(ncp2)) {
 944                 vfs_smr_exit();
 945                 ncp = NULL;
 946         } else {
 947                 vfs_smr_exit();
 948                 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 949                     ncp->nc_name);
 950                 cache_zap_locked(ncp);
 951                 counter_u64_add(numneg_evicted, 1);
 952         }
 953         mtx_unlock(blp);
 954         mtx_unlock(dvlp);
 955         if (ncp != NULL)
 956                 cache_free(ncp);
 957 }
 958
 959 /*
 960  * cache_zap_locked():
 961  *
 962  *   Removes a namecache entry from cache, whether it contains an actual
 963  *   pointer to a vnode or if it is just a negative cache entry.
 964  */
 965 static void
 966 cache_zap_locked(struct namecache *ncp)
 967 {
 968         struct nchashhead *ncpp;
 969
 970         if (!(ncp->nc_flag & NCF_NEGATIVE))
 971                 cache_assert_vnode_locked(ncp->nc_vp);
 972         cache_assert_vnode_locked(ncp->nc_dvp);
 973         cache_assert_bucket_locked(ncp);
 974
 975         cache_ncp_invalidate(ncp);
 976
 977         ncpp = NCP2BUCKET(ncp);
 978         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
 979         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 980                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 981                     ncp->nc_name, ncp->nc_vp);
 982                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 983                 if (ncp == ncp->nc_vp->v_cache_dd) {
 984                         vn_seqc_write_begin_unheld(ncp->nc_vp);
 985                         ncp->nc_vp->v_cache_dd = NULL;
 986                         vn_seqc_write_end(ncp->nc_vp);
 987                 }
 988         } else {
 989                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 990                     ncp->nc_name);
 991                 cache_negative_remove(ncp);
 992         }
 993         if (ncp->nc_flag & NCF_ISDOTDOT) {
 994                 if (ncp == ncp->nc_dvp->v_cache_dd) {
 995                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
 996                         ncp->nc_dvp->v_cache_dd = NULL;
 997                         vn_seqc_write_end(ncp->nc_dvp);
 998                 }
 999         } else {
1000                 LIST_REMOVE(ncp, nc_src);
1001                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1002                         ncp->nc_flag |= NCF_DVDROP;
1003                         counter_u64_add(numcachehv, -1);
1004                 }
1005         }
1006         atomic_subtract_long(&numcache, 1);
1007 }
1008
1009 static void
1010 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1011 {
1012         struct mtx *blp;
1013
1014         MPASS(ncp->nc_dvp == vp);
1015         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1016         cache_assert_vnode_locked(vp);
1017
1018         blp = NCP2BUCKETLOCK(ncp);
1019         mtx_lock(blp);
1020         cache_zap_locked(ncp);
1021         mtx_unlock(blp);
1022 }
1023
1024 static bool
1025 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1026     struct mtx **vlpp)
1027 {
1028         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1029         struct mtx *blp;
1030
1031         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1032         cache_assert_vnode_locked(vp);
1033
1034         if (ncp->nc_flag & NCF_NEGATIVE) {
1035                 if (*vlpp != NULL) {
1036                         mtx_unlock(*vlpp);
1037                         *vlpp = NULL;
1038                 }
1039                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1040                 return (true);
1041         }
1042
1043         pvlp = VP2VNODELOCK(vp);
1044         blp = NCP2BUCKETLOCK(ncp);
1045         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1046         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1047
1048         if (*vlpp == vlp1 || *vlpp == vlp2) {
1049                 to_unlock = *vlpp;
1050                 *vlpp = NULL;
1051         } else {
1052                 if (*vlpp != NULL) {
1053                         mtx_unlock(*vlpp);
1054                         *vlpp = NULL;
1055                 }
1056                 cache_sort_vnodes(&vlp1, &vlp2);
1057                 if (vlp1 == pvlp) {
1058                         mtx_lock(vlp2);
1059                         to_unlock = vlp2;
1060                 } else {
1061                         if (!mtx_trylock(vlp1))
1062                                 goto out_relock;
1063                         to_unlock = vlp1;
1064                 }
1065         }
1066         mtx_lock(blp);
1067         cache_zap_locked(ncp);
1068         mtx_unlock(blp);
1069         if (to_unlock != NULL)
1070                 mtx_unlock(to_unlock);
1071         return (true);
1072
1073 out_relock:
1074         mtx_unlock(vlp2);
1075         mtx_lock(vlp1);
1076         mtx_lock(vlp2);
1077         MPASS(*vlpp == NULL);
1078         *vlpp = vlp1;
1079         return (false);
1080 }
1081
1082 /*
1083  * If trylocking failed we can get here. We know enough to take all needed locks
1084  * in the right order and re-lookup the entry.
1085  */
1086 static int
1087 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1088     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1089     struct mtx *blp)
1090 {
1091         struct namecache *rncp;
1092
1093         cache_assert_bucket_unlocked(ncp);
1094
1095         cache_sort_vnodes(&dvlp, &vlp);
1096         cache_lock_vnodes(dvlp, vlp);
1097         mtx_lock(blp);
1098         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1099                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1100                     rncp->nc_nlen == cnp->cn_namelen &&
1101                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1102                         break;
1103         }
1104         if (rncp != NULL) {
1105                 cache_zap_locked(rncp);
1106                 mtx_unlock(blp);
1107                 cache_unlock_vnodes(dvlp, vlp);
1108                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1109                 return (0);
1110         }
1111
1112         mtx_unlock(blp);
1113         cache_unlock_vnodes(dvlp, vlp);
1114         return (EAGAIN);
1115 }
1116
1117 static int __noinline
1118 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1119     uint32_t hash, struct mtx *blp)
1120 {
1121         struct mtx *dvlp, *vlp;
1122         struct vnode *dvp;
1123
1124         cache_assert_bucket_locked(ncp);
1125
1126         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1127         vlp = NULL;
1128         if (!(ncp->nc_flag & NCF_NEGATIVE))
1129                 vlp = VP2VNODELOCK(ncp->nc_vp);
1130         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1131                 cache_zap_locked(ncp);
1132                 mtx_unlock(blp);
1133                 cache_unlock_vnodes(dvlp, vlp);
1134                 return (0);
1135         }
1136
1137         dvp = ncp->nc_dvp;
1138         mtx_unlock(blp);
1139         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1140 }
1141
1142 static __noinline int
1143 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1144 {
1145         struct namecache *ncp;
1146         struct mtx *blp;
1147         struct mtx *dvlp, *dvlp2;
1148         uint32_t hash;
1149         int error;
1150
1151         if (cnp->cn_namelen == 2 &&
1152             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1153                 dvlp = VP2VNODELOCK(dvp);
1154                 dvlp2 = NULL;
1155                 mtx_lock(dvlp);
1156 retry_dotdot:
1157                 ncp = dvp->v_cache_dd;
1158                 if (ncp == NULL) {
1159                         mtx_unlock(dvlp);
1160                         if (dvlp2 != NULL)
1161                                 mtx_unlock(dvlp2);
1162                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1163                         return (0);
1164                 }
1165                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1166                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1167                                 goto retry_dotdot;
1168                         MPASS(dvp->v_cache_dd == NULL);
1169                         mtx_unlock(dvlp);
1170                         if (dvlp2 != NULL)
1171                                 mtx_unlock(dvlp2);
1172                         cache_free(ncp);
1173                 } else {
1174                         vn_seqc_write_begin(dvp);
1175                         dvp->v_cache_dd = NULL;
1176                         vn_seqc_write_end(dvp);
1177                         mtx_unlock(dvlp);
1178                         if (dvlp2 != NULL)
1179                                 mtx_unlock(dvlp2);
1180                 }
1181                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1182                 return (1);
1183         }
1184
1185         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1186         blp = HASH2BUCKETLOCK(hash);
1187 retry:
1188         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1189                 goto out_no_entry;
1190
1191         mtx_lock(blp);
1192
1193         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1194                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1195                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1196                         break;
1197         }
1198
1199         if (ncp == NULL) {
1200                 mtx_unlock(blp);
1201                 goto out_no_entry;
1202         }
1203
1204         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1205         if (__predict_false(error != 0)) {
1206                 zap_and_exit_bucket_fail++;
1207                 goto retry;
1208         }
1209         counter_u64_add(numposzaps, 1);
1210         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1211         cache_free(ncp);
1212         return (1);
1213 out_no_entry:
1214         counter_u64_add(nummisszap, 1);
1215         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1216         return (0);
1217 }
1218
1219 static int __noinline
1220 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1221     struct timespec *tsp, int *ticksp)
1222 {
1223         int ltype;
1224
1225         *vpp = dvp;
1226         counter_u64_add(dothits, 1);
1227         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1228         if (tsp != NULL)
1229                 timespecclear(tsp);
1230         if (ticksp != NULL)
1231                 *ticksp = ticks;
1232         vrefact(*vpp);
1233         /*
1234          * When we lookup "." we still can be asked to lock it
1235          * differently...
1236          */
1237         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1238         if (ltype != VOP_ISLOCKED(*vpp)) {
1239                 if (ltype == LK_EXCLUSIVE) {
1240                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1241                         if (VN_IS_DOOMED((*vpp))) {
1242                                 /* forced unmount */
1243                                 vrele(*vpp);
1244                                 *vpp = NULL;
1245                                 return (ENOENT);
1246                         }
1247                 } else
1248                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1249         }
1250         return (-1);
1251 }
1252
1253 static int __noinline
1254 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1255     struct timespec *tsp, int *ticksp)
1256 {
1257         struct namecache_ts *ncp_ts;
1258         struct namecache *ncp;
1259         struct mtx *dvlp;
1260         enum vgetstate vs;
1261         int error, ltype;
1262         bool whiteout;
1263
1264         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1265
1266         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1267                 cache_remove_cnp(dvp, cnp);
1268                 return (0);
1269         }
1270
1271         counter_u64_add(dotdothits, 1);
1272 retry:
1273         dvlp = VP2VNODELOCK(dvp);
1274         mtx_lock(dvlp);
1275         ncp = dvp->v_cache_dd;
1276         if (ncp == NULL) {
1277                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1278                 mtx_unlock(dvlp);
1279                 return (0);
1280         }
1281         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1282                 if (ncp->nc_flag & NCF_NEGATIVE)
1283                         *vpp = NULL;
1284                 else
1285                         *vpp = ncp->nc_vp;
1286         } else
1287                 *vpp = ncp->nc_dvp;
1288         if (*vpp == NULL)
1289                 goto negative_success;
1290         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1291         cache_out_ts(ncp, tsp, ticksp);
1292         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1293             NCF_DTS && tsp != NULL) {
1294                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1295                 *tsp = ncp_ts->nc_dotdottime;
1296         }
1297
1298         MPASS(dvp != *vpp);
1299         ltype = VOP_ISLOCKED(dvp);
1300         VOP_UNLOCK(dvp);
1301         vs = vget_prep(*vpp);
1302         mtx_unlock(dvlp);
1303         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1304         vn_lock(dvp, ltype | LK_RETRY);
1305         if (VN_IS_DOOMED(dvp)) {
1306                 if (error == 0)
1307                         vput(*vpp);
1308                 *vpp = NULL;
1309                 return (ENOENT);
1310         }
1311         if (error) {
1312                 *vpp = NULL;
1313                 goto retry;
1314         }
1315         return (-1);
1316 negative_success:
1317         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1318                 if (cnp->cn_flags & ISLASTCN) {
1319                         counter_u64_add(numnegzaps, 1);
1320                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1321                         mtx_unlock(dvlp);
1322                         cache_free(ncp);
1323                         return (0);
1324                 }
1325         }
1326
1327         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1328         cache_out_ts(ncp, tsp, ticksp);
1329         counter_u64_add(numneghits, 1);
1330         whiteout = (ncp->nc_flag & NCF_WHITE);
1331         cache_negative_hit(ncp);
1332         mtx_unlock(dvlp);
1333         if (whiteout)
1334                 cnp->cn_flags |= ISWHITEOUT;
1335         return (ENOENT);
1336 }
1337
1338 /**
1339  * Lookup a name in the name cache
1340  *
1341  * # Arguments
1342  *
1343  * - dvp:       Parent directory in which to search.
1344  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1345  * - cnp:       Parameters of the name search.  The most interesting bits of
1346  *              the cn_flags field have the following meanings:
1347  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1348  *                      it up.
1349  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1350  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1351  *              or negative) lookup, tsp will be filled with any timespec that
1352  *              was stored when this cache entry was created.  However, it will
1353  *              be clear for "." entries.
1354  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1355  *              (positive or negative) lookup, it will contain the ticks value
1356  *              that was current when the cache entry was created, unless cnp
1357  *              was ".".
1358  *
1359  * Either both tsp and ticks have to be provided or neither of them.
1360  *
1361  * # Returns
1362  *
1363  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1364  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1365  *              to a forced unmount.  vpp will not be modified.  If the entry
1366  *              is a whiteout, then the ISWHITEOUT flag will be set in
1367  *              cnp->cn_flags.
1368  * - 0:         A cache miss.  vpp will not be modified.
1369  *
1370  * # Locking
1371  *
1372  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1373  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1374  * lock is not recursively acquired.
1375  */
1376 static int __noinline
1377 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1378     struct timespec *tsp, int *ticksp)
1379 {
1380         struct namecache *ncp;
1381         struct mtx *blp;
1382         uint32_t hash;
1383         enum vgetstate vs;
1384         int error;
1385         bool whiteout;
1386
1387         MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1388
1389 retry:
1390         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1391         blp = HASH2BUCKETLOCK(hash);
1392         mtx_lock(blp);
1393
1394         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1395                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1396                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1397                         break;
1398         }
1399
1400         if (__predict_false(ncp == NULL)) {
1401                 mtx_unlock(blp);
1402                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1403                     NULL);
1404                 counter_u64_add(nummiss, 1);
1405                 return (0);
1406         }
1407
1408         if (ncp->nc_flag & NCF_NEGATIVE)
1409                 goto negative_success;
1410
1411         counter_u64_add(numposhits, 1);
1412         *vpp = ncp->nc_vp;
1413         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1414         cache_out_ts(ncp, tsp, ticksp);
1415         MPASS(dvp != *vpp);
1416         vs = vget_prep(*vpp);
1417         mtx_unlock(blp);
1418         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1419         if (error) {
1420                 *vpp = NULL;
1421                 goto retry;
1422         }
1423         return (-1);
1424 negative_success:
1425         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1426                 if (cnp->cn_flags & ISLASTCN) {
1427                         counter_u64_add(numnegzaps, 1);
1428                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1429                         if (__predict_false(error != 0)) {
1430                                 zap_and_exit_bucket_fail2++;
1431                                 goto retry;
1432                         }
1433                         cache_free(ncp);
1434                         return (0);
1435                 }
1436         }
1437
1438         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1439         cache_out_ts(ncp, tsp, ticksp);
1440         counter_u64_add(numneghits, 1);
1441         whiteout = (ncp->nc_flag & NCF_WHITE);
1442         cache_negative_hit(ncp);
1443         mtx_unlock(blp);
1444         if (whiteout)
1445                 cnp->cn_flags |= ISWHITEOUT;
1446         return (ENOENT);
1447 }
1448
1449 int
1450 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1451     struct timespec *tsp, int *ticksp)
1452 {
1453         struct namecache *ncp;
1454         struct negstate *ns;
1455         uint32_t hash;
1456         enum vgetstate vs;
1457         int error;
1458         bool whiteout;
1459         u_short nc_flag;
1460
1461         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1462
1463 #ifdef DEBUG_CACHE
1464         if (__predict_false(!doingcache)) {
1465                 cnp->cn_flags &= ~MAKEENTRY;
1466                 return (0);
1467         }
1468 #endif
1469
1470         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1471                 if (cnp->cn_namelen == 1)
1472                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1473                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1474                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1475         }
1476
1477         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1478
1479         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1480                 cache_remove_cnp(dvp, cnp);
1481                 return (0);
1482         }
1483
1484         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1485         vfs_smr_enter();
1486
1487         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1488                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1489                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1490                         break;
1491         }
1492
1493         if (__predict_false(ncp == NULL)) {
1494                 vfs_smr_exit();
1495                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1496                     NULL);
1497                 counter_u64_add(nummiss, 1);
1498                 return (0);
1499         }
1500
1501         nc_flag = atomic_load_char(&ncp->nc_flag);
1502         if (nc_flag & NCF_NEGATIVE)
1503                 goto negative_success;
1504
1505         counter_u64_add(numposhits, 1);
1506         *vpp = ncp->nc_vp;
1507         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1508         cache_out_ts(ncp, tsp, ticksp);
1509         MPASS(dvp != *vpp);
1510         if (!cache_ncp_canuse(ncp)) {
1511                 vfs_smr_exit();
1512                 *vpp = NULL;
1513                 goto out_fallback;
1514         }
1515         vs = vget_prep_smr(*vpp);
1516         vfs_smr_exit();
1517         if (__predict_false(vs == VGET_NONE)) {
1518                 *vpp = NULL;
1519                 goto out_fallback;
1520         }
1521         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1522         if (error) {
1523                 *vpp = NULL;
1524                 goto out_fallback;
1525         }
1526         return (-1);
1527 negative_success:
1528         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1529                 if (cnp->cn_flags & ISLASTCN) {
1530                         vfs_smr_exit();
1531                         goto out_fallback;
1532                 }
1533         }
1534
1535         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1536         cache_out_ts(ncp, tsp, ticksp);
1537         counter_u64_add(numneghits, 1);
1538         whiteout = (ncp->nc_flag & NCF_WHITE);
1539         /*
1540          * TODO: We need to take locks to promote an entry. Code doing it
1541          * in SMR lookup can be modified to be shared.
1542          */
1543         ns = NCP2NEGSTATE(ncp);
1544         if ((ns->neg_flag & NEG_HOT) == 0 ||
1545             !cache_ncp_canuse(ncp)) {
1546                 vfs_smr_exit();
1547                 goto out_fallback;
1548         }
1549         vfs_smr_exit();
1550         if (whiteout)
1551                 cnp->cn_flags |= ISWHITEOUT;
1552         return (ENOENT);
1553 out_fallback:
1554         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1555 }
1556
1557 struct celockstate {
1558         struct mtx *vlp[3];
1559         struct mtx *blp[2];
1560 };
1561 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1562 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1563
1564 static inline void
1565 cache_celockstate_init(struct celockstate *cel)
1566 {
1567
1568         bzero(cel, sizeof(*cel));
1569 }
1570
1571 static void
1572 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1573     struct vnode *dvp)
1574 {
1575         struct mtx *vlp1, *vlp2;
1576
1577         MPASS(cel->vlp[0] == NULL);
1578         MPASS(cel->vlp[1] == NULL);
1579         MPASS(cel->vlp[2] == NULL);
1580
1581         MPASS(vp != NULL || dvp != NULL);
1582
1583         vlp1 = VP2VNODELOCK(vp);
1584         vlp2 = VP2VNODELOCK(dvp);
1585         cache_sort_vnodes(&vlp1, &vlp2);
1586
1587         if (vlp1 != NULL) {
1588                 mtx_lock(vlp1);
1589                 cel->vlp[0] = vlp1;
1590         }
1591         mtx_lock(vlp2);
1592         cel->vlp[1] = vlp2;
1593 }
1594
1595 static void
1596 cache_unlock_vnodes_cel(struct celockstate *cel)
1597 {
1598
1599         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1600
1601         if (cel->vlp[0] != NULL)
1602                 mtx_unlock(cel->vlp[0]);
1603         if (cel->vlp[1] != NULL)
1604                 mtx_unlock(cel->vlp[1]);
1605         if (cel->vlp[2] != NULL)
1606                 mtx_unlock(cel->vlp[2]);
1607 }
1608
1609 static bool
1610 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1611 {
1612         struct mtx *vlp;
1613         bool ret;
1614
1615         cache_assert_vlp_locked(cel->vlp[0]);
1616         cache_assert_vlp_locked(cel->vlp[1]);
1617         MPASS(cel->vlp[2] == NULL);
1618
1619         MPASS(vp != NULL);
1620         vlp = VP2VNODELOCK(vp);
1621
1622         ret = true;
1623         if (vlp >= cel->vlp[1]) {
1624                 mtx_lock(vlp);
1625         } else {
1626                 if (mtx_trylock(vlp))
1627                         goto out;
1628                 cache_lock_vnodes_cel_3_failures++;
1629                 cache_unlock_vnodes_cel(cel);
1630                 if (vlp < cel->vlp[0]) {
1631                         mtx_lock(vlp);
1632                         mtx_lock(cel->vlp[0]);
1633                         mtx_lock(cel->vlp[1]);
1634                 } else {
1635                         if (cel->vlp[0] != NULL)
1636                                 mtx_lock(cel->vlp[0]);
1637                         mtx_lock(vlp);
1638                         mtx_lock(cel->vlp[1]);
1639                 }
1640                 ret = false;
1641         }
1642 out:
1643         cel->vlp[2] = vlp;
1644         return (ret);
1645 }
1646
1647 static void
1648 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1649     struct mtx *blp2)
1650 {
1651
1652         MPASS(cel->blp[0] == NULL);
1653         MPASS(cel->blp[1] == NULL);
1654
1655         cache_sort_vnodes(&blp1, &blp2);
1656
1657         if (blp1 != NULL) {
1658                 mtx_lock(blp1);
1659                 cel->blp[0] = blp1;
1660         }
1661         mtx_lock(blp2);
1662         cel->blp[1] = blp2;
1663 }
1664
1665 static void
1666 cache_unlock_buckets_cel(struct celockstate *cel)
1667 {
1668
1669         if (cel->blp[0] != NULL)
1670                 mtx_unlock(cel->blp[0]);
1671         mtx_unlock(cel->blp[1]);
1672 }
1673
1674 /*
1675  * Lock part of the cache affected by the insertion.
1676  *
1677  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1678  * However, insertion can result in removal of an old entry. In this
1679  * case we have an additional vnode and bucketlock pair to lock.
1680  *
1681  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1682  * preserving the locking order (smaller address first).
1683  */
1684 static void
1685 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1686     uint32_t hash)
1687 {
1688         struct namecache *ncp;
1689         struct mtx *blps[2];
1690
1691         blps[0] = HASH2BUCKETLOCK(hash);
1692         for (;;) {
1693                 blps[1] = NULL;
1694                 cache_lock_vnodes_cel(cel, dvp, vp);
1695                 if (vp == NULL || vp->v_type != VDIR)
1696                         break;
1697                 ncp = vp->v_cache_dd;
1698                 if (ncp == NULL)
1699                         break;
1700                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1701                         break;
1702                 MPASS(ncp->nc_dvp == vp);
1703                 blps[1] = NCP2BUCKETLOCK(ncp);
1704                 if (ncp->nc_flag & NCF_NEGATIVE)
1705                         break;
1706                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1707                         break;
1708                 /*
1709                  * All vnodes got re-locked. Re-validate the state and if
1710                  * nothing changed we are done. Otherwise restart.
1711                  */
1712                 if (ncp == vp->v_cache_dd &&
1713                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1714                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1715                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1716                         break;
1717                 cache_unlock_vnodes_cel(cel);
1718                 cel->vlp[0] = NULL;
1719                 cel->vlp[1] = NULL;
1720                 cel->vlp[2] = NULL;
1721         }
1722         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1723 }
1724
1725 static void
1726 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1727     uint32_t hash)
1728 {
1729         struct namecache *ncp;
1730         struct mtx *blps[2];
1731
1732         blps[0] = HASH2BUCKETLOCK(hash);
1733         for (;;) {
1734                 blps[1] = NULL;
1735                 cache_lock_vnodes_cel(cel, dvp, vp);
1736                 ncp = dvp->v_cache_dd;
1737                 if (ncp == NULL)
1738                         break;
1739                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1740                         break;
1741                 MPASS(ncp->nc_dvp == dvp);
1742                 blps[1] = NCP2BUCKETLOCK(ncp);
1743                 if (ncp->nc_flag & NCF_NEGATIVE)
1744                         break;
1745                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1746                         break;
1747                 if (ncp == dvp->v_cache_dd &&
1748                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1749                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1750                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1751                         break;
1752                 cache_unlock_vnodes_cel(cel);
1753                 cel->vlp[0] = NULL;
1754                 cel->vlp[1] = NULL;
1755                 cel->vlp[2] = NULL;
1756         }
1757         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1758 }
1759
1760 static void
1761 cache_enter_unlock(struct celockstate *cel)
1762 {
1763
1764         cache_unlock_buckets_cel(cel);
1765         cache_unlock_vnodes_cel(cel);
1766 }
1767
1768 static void __noinline
1769 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1770     struct componentname *cnp)
1771 {
1772         struct celockstate cel;
1773         struct namecache *ncp;
1774         uint32_t hash;
1775         int len;
1776
1777         if (dvp->v_cache_dd == NULL)
1778                 return;
1779         len = cnp->cn_namelen;
1780         cache_celockstate_init(&cel);
1781         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1782         cache_enter_lock_dd(&cel, dvp, vp, hash);
1783         vn_seqc_write_begin(dvp);
1784         ncp = dvp->v_cache_dd;
1785         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1786                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1787                 cache_zap_locked(ncp);
1788         } else {
1789                 ncp = NULL;
1790         }
1791         dvp->v_cache_dd = NULL;
1792         vn_seqc_write_end(dvp);
1793         cache_enter_unlock(&cel);
1794         if (ncp != NULL)
1795                 cache_free(ncp);
1796 }
1797
1798 /*
1799  * Add an entry to the cache.
1800  */
1801 void
1802 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1803     struct timespec *tsp, struct timespec *dtsp)
1804 {
1805         struct celockstate cel;
1806         struct namecache *ncp, *n2, *ndd;
1807         struct namecache_ts *ncp_ts;
1808         struct nchashhead *ncpp;
1809         uint32_t hash;
1810         int flag;
1811         int len;
1812         u_long lnumcache;
1813
1814         VNPASS(!VN_IS_DOOMED(dvp), dvp);
1815         VNPASS(dvp->v_type != VNON, dvp);
1816         if (vp != NULL) {
1817                 VNPASS(!VN_IS_DOOMED(vp), vp);
1818                 VNPASS(vp->v_type != VNON, vp);
1819         }
1820
1821 #ifdef DEBUG_CACHE
1822         if (__predict_false(!doingcache))
1823                 return;
1824 #endif
1825
1826         flag = 0;
1827         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1828                 if (cnp->cn_namelen == 1)
1829                         return;
1830                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1831                         cache_enter_dotdot_prep(dvp, vp, cnp);
1832                         flag = NCF_ISDOTDOT;
1833                 }
1834         }
1835
1836         /*
1837          * Avoid blowout in namecache entries.
1838          */
1839         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1840         if (__predict_false(lnumcache >= ncsize)) {
1841                 atomic_subtract_long(&numcache, 1);
1842                 counter_u64_add(numdrops, 1);
1843                 return;
1844         }
1845
1846         cache_celockstate_init(&cel);
1847         ndd = NULL;
1848         ncp_ts = NULL;
1849
1850         /*
1851          * Calculate the hash key and setup as much of the new
1852          * namecache entry as possible before acquiring the lock.
1853          */
1854         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1855         ncp->nc_flag = flag | NCF_WIP;
1856         ncp->nc_vp = vp;
1857         if (vp == NULL)
1858                 cache_negative_init(ncp);
1859         ncp->nc_dvp = dvp;
1860         if (tsp != NULL) {
1861                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1862                 ncp_ts->nc_time = *tsp;
1863                 ncp_ts->nc_ticks = ticks;
1864                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1865                 if (dtsp != NULL) {
1866                         ncp_ts->nc_dotdottime = *dtsp;
1867                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1868                 }
1869         }
1870         len = ncp->nc_nlen = cnp->cn_namelen;
1871         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1872         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
1873         ncp->nc_name[len] = '\0';
1874         cache_enter_lock(&cel, dvp, vp, hash);
1875
1876         /*
1877          * See if this vnode or negative entry is already in the cache
1878          * with this name.  This can happen with concurrent lookups of
1879          * the same path name.
1880          */
1881         ncpp = NCHHASH(hash);
1882         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
1883                 if (n2->nc_dvp == dvp &&
1884                     n2->nc_nlen == cnp->cn_namelen &&
1885                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1886                         MPASS(cache_ncp_canuse(n2));
1887                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
1888                                 KASSERT(vp == NULL,
1889                                     ("%s: found entry pointing to a different vnode (%p != %p)",
1890                                     __func__, NULL, vp));
1891                         else
1892                                 KASSERT(n2->nc_vp == vp,
1893                                     ("%s: found entry pointing to a different vnode (%p != %p)",
1894                                     __func__, n2->nc_vp, vp));
1895                         /*
1896                          * Entries are supposed to be immutable unless in the
1897                          * process of getting destroyed. Accommodating for
1898                          * changing timestamps is possible but not worth it.
1899                          * This should be harmless in terms of correctness, in
1900                          * the worst case resulting in an earlier expiration.
1901                          * Alternatively, the found entry can be replaced
1902                          * altogether.
1903                          */
1904                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
1905 #if 0
1906                         if (tsp != NULL) {
1907                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
1908                                     ("no NCF_TS"));
1909                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1910                                 n2_ts->nc_time = ncp_ts->nc_time;
1911                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
1912                                 if (dtsp != NULL) {
1913                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1914                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
1915                                 }
1916                         }
1917 #endif
1918                         goto out_unlock_free;
1919                 }
1920         }
1921
1922         if (flag == NCF_ISDOTDOT) {
1923                 /*
1924                  * See if we are trying to add .. entry, but some other lookup
1925                  * has populated v_cache_dd pointer already.
1926                  */
1927                 if (dvp->v_cache_dd != NULL)
1928                         goto out_unlock_free;
1929                 KASSERT(vp == NULL || vp->v_type == VDIR,
1930                     ("wrong vnode type %p", vp));
1931                 vn_seqc_write_begin(dvp);
1932                 dvp->v_cache_dd = ncp;
1933                 vn_seqc_write_end(dvp);
1934         }
1935
1936         if (vp != NULL) {
1937                 if (flag != NCF_ISDOTDOT) {
1938                         /*
1939                          * For this case, the cache entry maps both the
1940                          * directory name in it and the name ".." for the
1941                          * directory's parent.
1942                          */
1943                         vn_seqc_write_begin(vp);
1944                         if ((ndd = vp->v_cache_dd) != NULL) {
1945                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1946                                         cache_zap_locked(ndd);
1947                                 else
1948                                         ndd = NULL;
1949                         }
1950                         vp->v_cache_dd = ncp;
1951                         vn_seqc_write_end(vp);
1952                 } else if (vp->v_type != VDIR) {
1953                         if (vp->v_cache_dd != NULL) {
1954                                 vn_seqc_write_begin(vp);
1955                                 vp->v_cache_dd = NULL;
1956                                 vn_seqc_write_end(vp);
1957                         }
1958                 }
1959         }
1960
1961         if (flag != NCF_ISDOTDOT) {
1962                 if (LIST_EMPTY(&dvp->v_cache_src)) {
1963                         vhold(dvp);
1964                         counter_u64_add(numcachehv, 1);
1965                 }
1966                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1967         }
1968
1969         /*
1970          * If the entry is "negative", we place it into the
1971          * "negative" cache queue, otherwise, we place it into the
1972          * destination vnode's cache entries queue.
1973          */
1974         if (vp != NULL) {
1975                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1976                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1977                     vp);
1978         } else {
1979                 if (cnp->cn_flags & ISWHITEOUT)
1980                         ncp->nc_flag |= NCF_WHITE;
1981                 cache_negative_insert(ncp);
1982                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1983                     ncp->nc_name);
1984         }
1985
1986         /*
1987          * Insert the new namecache entry into the appropriate chain
1988          * within the cache entries table.
1989          */
1990         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1991
1992         atomic_thread_fence_rel();
1993         /*
1994          * Mark the entry as fully constructed.
1995          * It is immutable past this point until its removal.
1996          */
1997         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
1998
1999         cache_enter_unlock(&cel);
2000         if (numneg * ncnegfactor > lnumcache)
2001                 cache_negative_zap_one();
2002         if (ndd != NULL)
2003                 cache_free(ndd);
2004         return;
2005 out_unlock_free:
2006         cache_enter_unlock(&cel);
2007         atomic_subtract_long(&numcache, 1);
2008         cache_free(ncp);
2009         return;
2010 }
2011
2012 static u_int
2013 cache_roundup_2(u_int val)
2014 {
2015         u_int res;
2016
2017         for (res = 1; res <= val; res <<= 1)
2018                 continue;
2019
2020         return (res);
2021 }
2022
2023 static struct nchashhead *
2024 nchinittbl(u_long elements, u_long *hashmask)
2025 {
2026         struct nchashhead *hashtbl;
2027         u_long hashsize, i;
2028
2029         hashsize = cache_roundup_2(elements) / 2;
2030
2031         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2032         for (i = 0; i < hashsize; i++)
2033                 CK_SLIST_INIT(&hashtbl[i]);
2034         *hashmask = hashsize - 1;
2035         return (hashtbl);
2036 }
2037
2038 static void
2039 ncfreetbl(struct nchashhead *hashtbl)
2040 {
2041
2042         free(hashtbl, M_VFSCACHE);
2043 }
2044
2045 /*
2046  * Name cache initialization, from vfs_init() when we are booting
2047  */
2048 static void
2049 nchinit(void *dummy __unused)
2050 {
2051         u_int i;
2052
2053         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2054             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2055         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2056             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2057         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2058             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2059         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2060             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2061
2062         VFS_SMR_ZONE_SET(cache_zone_small);
2063         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2064         VFS_SMR_ZONE_SET(cache_zone_large);
2065         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2066
2067         ncsize = desiredvnodes * ncsizefactor;
2068         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2069         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2070         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2071                 ncbuckethash = 7;
2072         if (ncbuckethash > nchash)
2073                 ncbuckethash = nchash;
2074         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2075             M_WAITOK | M_ZERO);
2076         for (i = 0; i < numbucketlocks; i++)
2077                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2078         ncvnodehash = ncbuckethash;
2079         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2080             M_WAITOK | M_ZERO);
2081         for (i = 0; i < numvnodelocks; i++)
2082                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2083
2084         for (i = 0; i < numneglists; i++) {
2085                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2086                 TAILQ_INIT(&neglists[i].nl_list);
2087                 TAILQ_INIT(&neglists[i].nl_hotlist);
2088         }
2089
2090         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2091 }
2092 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2093
2094 void
2095 cache_vnode_init(struct vnode *vp)
2096 {
2097
2098         LIST_INIT(&vp->v_cache_src);
2099         TAILQ_INIT(&vp->v_cache_dst);
2100         vp->v_cache_dd = NULL;
2101         cache_prehash(vp);
2102 }
2103
2104 void
2105 cache_changesize(u_long newmaxvnodes)
2106 {
2107         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2108         u_long new_nchash, old_nchash;
2109         struct namecache *ncp;
2110         uint32_t hash;
2111         u_long newncsize;
2112         int i;
2113
2114         newncsize = newmaxvnodes * ncsizefactor;
2115         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2116         if (newmaxvnodes < numbucketlocks)
2117                 newmaxvnodes = numbucketlocks;
2118
2119         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2120         /* If same hash table size, nothing to do */
2121         if (nchash == new_nchash) {
2122                 ncfreetbl(new_nchashtbl);
2123                 return;
2124         }
2125         /*
2126          * Move everything from the old hash table to the new table.
2127          * None of the namecache entries in the table can be removed
2128          * because to do so, they have to be removed from the hash table.
2129          */
2130         cache_lock_all_vnodes();
2131         cache_lock_all_buckets();
2132         old_nchashtbl = nchashtbl;
2133         old_nchash = nchash;
2134         nchashtbl = new_nchashtbl;
2135         nchash = new_nchash;
2136         for (i = 0; i <= old_nchash; i++) {
2137                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2138                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2139                             ncp->nc_dvp);
2140                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2141                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2142                 }
2143         }
2144         ncsize = newncsize;
2145         cache_unlock_all_buckets();
2146         cache_unlock_all_vnodes();
2147         ncfreetbl(old_nchashtbl);
2148 }
2149
2150 /*
2151  * Invalidate all entries from and to a particular vnode.
2152  */
2153 static void
2154 cache_purge_impl(struct vnode *vp)
2155 {
2156         TAILQ_HEAD(, namecache) ncps;
2157         struct namecache *ncp, *nnp;
2158         struct mtx *vlp, *vlp2;
2159
2160         TAILQ_INIT(&ncps);
2161         vlp = VP2VNODELOCK(vp);
2162         vlp2 = NULL;
2163         mtx_lock(vlp);
2164 retry:
2165         while (!LIST_EMPTY(&vp->v_cache_src)) {
2166                 ncp = LIST_FIRST(&vp->v_cache_src);
2167                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2168                         goto retry;
2169                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2170         }
2171         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2172                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2173                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2174                         goto retry;
2175                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2176         }
2177         ncp = vp->v_cache_dd;
2178         if (ncp != NULL) {
2179                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2180                    ("lost dotdot link"));
2181                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2182                         goto retry;
2183                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2184         }
2185         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2186         mtx_unlock(vlp);
2187         if (vlp2 != NULL)
2188                 mtx_unlock(vlp2);
2189         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2190                 cache_free(ncp);
2191         }
2192 }
2193
2194 /*
2195  * Opportunistic check to see if there is anything to do.
2196  */
2197 static bool
2198 cache_has_entries(struct vnode *vp)
2199 {
2200
2201         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2202             vp->v_cache_dd == NULL)
2203                 return (false);
2204         return (true);
2205 }
2206
2207 void
2208 cache_purge(struct vnode *vp)
2209 {
2210
2211         SDT_PROBE1(vfs, namecache, purge, done, vp);
2212         if (!cache_has_entries(vp))
2213                 return;
2214         cache_purge_impl(vp);
2215 }
2216
2217 /*
2218  * Only to be used by vgone.
2219  */
2220 void
2221 cache_purge_vgone(struct vnode *vp)
2222 {
2223         struct mtx *vlp;
2224
2225         VNPASS(VN_IS_DOOMED(vp), vp);
2226         if (cache_has_entries(vp)) {
2227                 cache_purge_impl(vp);
2228                 return;
2229         }
2230
2231         /*
2232          * Serialize against a potential thread doing cache_purge.
2233          */
2234         vlp = VP2VNODELOCK(vp);
2235         mtx_wait_unlocked(vlp);
2236         if (cache_has_entries(vp)) {
2237                 cache_purge_impl(vp);
2238                 return;
2239         }
2240         return;
2241 }
2242
2243 /*
2244  * Invalidate all negative entries for a particular directory vnode.
2245  */
2246 void
2247 cache_purge_negative(struct vnode *vp)
2248 {
2249         TAILQ_HEAD(, namecache) ncps;
2250         struct namecache *ncp, *nnp;
2251         struct mtx *vlp;
2252
2253         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2254         if (LIST_EMPTY(&vp->v_cache_src))
2255                 return;
2256         TAILQ_INIT(&ncps);
2257         vlp = VP2VNODELOCK(vp);
2258         mtx_lock(vlp);
2259         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2260                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2261                         continue;
2262                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2263                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2264         }
2265         mtx_unlock(vlp);
2266         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2267                 cache_free(ncp);
2268         }
2269 }
2270
2271 void
2272 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2273     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2274 {
2275
2276         ASSERT_VOP_IN_SEQC(fdvp);
2277         ASSERT_VOP_IN_SEQC(fvp);
2278         ASSERT_VOP_IN_SEQC(tdvp);
2279         if (tvp != NULL)
2280                 ASSERT_VOP_IN_SEQC(tvp);
2281
2282         cache_purge(fvp);
2283         if (tvp != NULL) {
2284                 cache_purge(tvp);
2285                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2286                     ("%s: lingering negative entry", __func__));
2287         } else {
2288                 cache_remove_cnp(tdvp, tcnp);
2289         }
2290 }
2291
2292 /*
2293  * Flush all entries referencing a particular filesystem.
2294  */
2295 void
2296 cache_purgevfs(struct mount *mp)
2297 {
2298         struct vnode *vp, *mvp;
2299
2300         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2301         /*
2302          * Somewhat wasteful iteration over all vnodes. Would be better to
2303          * support filtering and avoid the interlock to begin with.
2304          */
2305         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2306                 if (!cache_has_entries(vp)) {
2307                         VI_UNLOCK(vp);
2308                         continue;
2309                 }
2310                 vholdl(vp);
2311                 VI_UNLOCK(vp);
2312                 cache_purge(vp);
2313                 vdrop(vp);
2314         }
2315 }
2316
2317 /*
2318  * Perform canonical checks and cache lookup and pass on to filesystem
2319  * through the vop_cachedlookup only if needed.
2320  */
2321
2322 int
2323 vfs_cache_lookup(struct vop_lookup_args *ap)
2324 {
2325         struct vnode *dvp;
2326         int error;
2327         struct vnode **vpp = ap->a_vpp;
2328         struct componentname *cnp = ap->a_cnp;
2329         int flags = cnp->cn_flags;
2330
2331         *vpp = NULL;
2332         dvp = ap->a_dvp;
2333
2334         if (dvp->v_type != VDIR)
2335                 return (ENOTDIR);
2336
2337         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2338             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2339                 return (EROFS);
2340
2341         error = vn_dir_check_exec(dvp, cnp);
2342         if (error != 0)
2343                 return (error);
2344
2345         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2346         if (error == 0)
2347                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2348         if (error == -1)
2349                 return (0);
2350         return (error);
2351 }
2352
2353 /* Implementation of the getcwd syscall. */
2354 int
2355 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2356 {
2357         char *buf, *retbuf;
2358         size_t buflen;
2359         int error;
2360
2361         buflen = uap->buflen;
2362         if (__predict_false(buflen < 2))
2363                 return (EINVAL);
2364         if (buflen > MAXPATHLEN)
2365                 buflen = MAXPATHLEN;
2366
2367         buf = uma_zalloc(namei_zone, M_WAITOK);
2368         error = vn_getcwd(buf, &retbuf, &buflen);
2369         if (error == 0)
2370                 error = copyout(retbuf, uap->buf, buflen);
2371         uma_zfree(namei_zone, buf);
2372         return (error);
2373 }
2374
2375 int
2376 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2377 {
2378         struct pwd *pwd;
2379         int error;
2380
2381         vfs_smr_enter();
2382         pwd = pwd_get_smr();
2383         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2384             buflen, false, 0);
2385         VFS_SMR_ASSERT_NOT_ENTERED();
2386         if (error < 0) {
2387                 pwd = pwd_hold(curthread);
2388                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2389                     retbuf, buflen);
2390                 pwd_drop(pwd);
2391         }
2392
2393 #ifdef KTRACE
2394         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2395                 ktrnamei(*retbuf);
2396 #endif
2397         return (error);
2398 }
2399
2400 static int
2401 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2402     size_t size, int flags, enum uio_seg pathseg)
2403 {
2404         struct nameidata nd;
2405         char *retbuf, *freebuf;
2406         int error;
2407
2408         if (flags != 0)
2409                 return (EINVAL);
2410         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2411             pathseg, path, fd, &cap_fstat_rights, td);
2412         if ((error = namei(&nd)) != 0)
2413                 return (error);
2414         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2415         if (error == 0) {
2416                 error = copyout(retbuf, buf, size);
2417                 free(freebuf, M_TEMP);
2418         }
2419         NDFREE(&nd, 0);
2420         return (error);
2421 }
2422
2423 int
2424 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2425 {
2426
2427         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2428             uap->flags, UIO_USERSPACE));
2429 }
2430
2431 /*
2432  * Retrieve the full filesystem path that correspond to a vnode from the name
2433  * cache (if available)
2434  */
2435 int
2436 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2437 {
2438         struct pwd *pwd;
2439         char *buf;
2440         size_t buflen;
2441         int error;
2442
2443         if (__predict_false(vp == NULL))
2444                 return (EINVAL);
2445
2446         buflen = MAXPATHLEN;
2447         buf = malloc(buflen, M_TEMP, M_WAITOK);
2448         vfs_smr_enter();
2449         pwd = pwd_get_smr();
2450         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2451         VFS_SMR_ASSERT_NOT_ENTERED();
2452         if (error < 0) {
2453                 pwd = pwd_hold(curthread);
2454                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2455                 pwd_drop(pwd);
2456         }
2457         if (error == 0)
2458                 *freebuf = buf;
2459         else
2460                 free(buf, M_TEMP);
2461         return (error);
2462 }
2463
2464 /*
2465  * This function is similar to vn_fullpath, but it attempts to lookup the
2466  * pathname relative to the global root mount point.  This is required for the
2467  * auditing sub-system, as audited pathnames must be absolute, relative to the
2468  * global root mount point.
2469  */
2470 int
2471 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2472 {
2473         char *buf;
2474         size_t buflen;
2475         int error;
2476
2477         if (__predict_false(vp == NULL))
2478                 return (EINVAL);
2479         buflen = MAXPATHLEN;
2480         buf = malloc(buflen, M_TEMP, M_WAITOK);
2481         vfs_smr_enter();
2482         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2483         VFS_SMR_ASSERT_NOT_ENTERED();
2484         if (error < 0) {
2485                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2486         }
2487         if (error == 0)
2488                 *freebuf = buf;
2489         else
2490                 free(buf, M_TEMP);
2491         return (error);
2492 }
2493
2494 static struct namecache *
2495 vn_dd_from_dst(struct vnode *vp)
2496 {
2497         struct namecache *ncp;
2498
2499         cache_assert_vnode_locked(vp);
2500         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2501                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2502                         return (ncp);
2503         }
2504         return (NULL);
2505 }
2506
2507 int
2508 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2509 {
2510         struct vnode *dvp;
2511         struct namecache *ncp;
2512         struct mtx *vlp;
2513         int error;
2514
2515         vlp = VP2VNODELOCK(*vp);
2516         mtx_lock(vlp);
2517         ncp = (*vp)->v_cache_dd;
2518         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2519                 KASSERT(ncp == vn_dd_from_dst(*vp),
2520                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2521                     ncp, vn_dd_from_dst(*vp)));
2522         } else {
2523                 ncp = vn_dd_from_dst(*vp);
2524         }
2525         if (ncp != NULL) {
2526                 if (*buflen < ncp->nc_nlen) {
2527                         mtx_unlock(vlp);
2528                         vrele(*vp);
2529                         counter_u64_add(numfullpathfail4, 1);
2530                         error = ENOMEM;
2531                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2532                             vp, NULL);
2533                         return (error);
2534                 }
2535                 *buflen -= ncp->nc_nlen;
2536                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2537                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2538                     ncp->nc_name, vp);
2539                 dvp = *vp;
2540                 *vp = ncp->nc_dvp;
2541                 vref(*vp);
2542                 mtx_unlock(vlp);
2543                 vrele(dvp);
2544                 return (0);
2545         }
2546         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2547
2548         mtx_unlock(vlp);
2549         vn_lock(*vp, LK_SHARED | LK_RETRY);
2550         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2551         vput(*vp);
2552         if (error) {
2553                 counter_u64_add(numfullpathfail2, 1);
2554                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2555                 return (error);
2556         }
2557
2558         *vp = dvp;
2559         if (VN_IS_DOOMED(dvp)) {
2560                 /* forced unmount */
2561                 vrele(dvp);
2562                 error = ENOENT;
2563                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2564                 return (error);
2565         }
2566         /*
2567          * *vp has its use count incremented still.
2568          */
2569
2570         return (0);
2571 }
2572
2573 /*
2574  * Resolve a directory to a pathname.
2575  *
2576  * The name of the directory can always be found in the namecache or fetched
2577  * from the filesystem. There is also guaranteed to be only one parent, meaning
2578  * we can just follow vnodes up until we find the root.
2579  *
2580  * The vnode must be referenced.
2581  */
2582 static int
2583 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2584     size_t *len, bool slash_prefixed, size_t addend)
2585 {
2586 #ifdef KDTRACE_HOOKS
2587         struct vnode *startvp = vp;
2588 #endif
2589         struct vnode *vp1;
2590         size_t buflen;
2591         int error;
2592
2593         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2594         VNPASS(vp->v_usecount > 0, vp);
2595
2596         buflen = *len;
2597
2598         if (!slash_prefixed) {
2599                 MPASS(*len >= 2);
2600                 buflen--;
2601                 buf[buflen] = '\0';
2602         }
2603
2604         error = 0;
2605
2606         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2607         counter_u64_add(numfullpathcalls, 1);
2608         while (vp != rdir && vp != rootvnode) {
2609                 /*
2610                  * The vp vnode must be already fully constructed,
2611                  * since it is either found in namecache or obtained
2612                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2613                  * without obtaining the vnode lock.
2614                  */
2615                 if ((vp->v_vflag & VV_ROOT) != 0) {
2616                         vn_lock(vp, LK_RETRY | LK_SHARED);
2617
2618                         /*
2619                          * With the vnode locked, check for races with
2620                          * unmount, forced or not.  Note that we
2621                          * already verified that vp is not equal to
2622                          * the root vnode, which means that
2623                          * mnt_vnodecovered can be NULL only for the
2624                          * case of unmount.
2625                          */
2626                         if (VN_IS_DOOMED(vp) ||
2627                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2628                             vp1->v_mountedhere != vp->v_mount) {
2629                                 vput(vp);
2630                                 error = ENOENT;
2631                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2632                                     error, vp, NULL);
2633                                 break;
2634                         }
2635
2636                         vref(vp1);
2637                         vput(vp);
2638                         vp = vp1;
2639                         continue;
2640                 }
2641                 if (vp->v_type != VDIR) {
2642                         vrele(vp);
2643                         counter_u64_add(numfullpathfail1, 1);
2644                         error = ENOTDIR;
2645                         SDT_PROBE3(vfs, namecache, fullpath, return,
2646                             error, vp, NULL);
2647                         break;
2648                 }
2649                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2650                 if (error)
2651                         break;
2652                 if (buflen == 0) {
2653                         vrele(vp);
2654                         error = ENOMEM;
2655                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2656                             startvp, NULL);
2657                         break;
2658                 }
2659                 buf[--buflen] = '/';
2660                 slash_prefixed = true;
2661         }
2662         if (error)
2663                 return (error);
2664         if (!slash_prefixed) {
2665                 if (buflen == 0) {
2666                         vrele(vp);
2667                         counter_u64_add(numfullpathfail4, 1);
2668                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2669                             startvp, NULL);
2670                         return (ENOMEM);
2671                 }
2672                 buf[--buflen] = '/';
2673         }
2674         counter_u64_add(numfullpathfound, 1);
2675         vrele(vp);
2676
2677         *retbuf = buf + buflen;
2678         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2679         *len -= buflen;
2680         *len += addend;
2681         return (0);
2682 }
2683
2684 /*
2685  * Resolve an arbitrary vnode to a pathname.
2686  *
2687  * Note 2 caveats:
2688  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2689  *   resolve to a different path than the one used to find it
2690  * - namecache is not mandatory, meaning names are not guaranteed to be added
2691  *   (in which case resolving fails)
2692  */
2693 static void __inline
2694 cache_rev_failed_impl(int *reason, int line)
2695 {
2696
2697         *reason = line;
2698 }
2699 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
2700
2701 static int
2702 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2703     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2704 {
2705 #ifdef KDTRACE_HOOKS
2706         struct vnode *startvp = vp;
2707 #endif
2708         struct vnode *tvp;
2709         struct mount *mp;
2710         struct namecache *ncp;
2711         size_t orig_buflen;
2712         int reason;
2713         int error;
2714 #ifdef KDTRACE_HOOKS
2715         int i;
2716 #endif
2717         seqc_t vp_seqc, tvp_seqc;
2718         u_char nc_flag;
2719
2720         VFS_SMR_ASSERT_ENTERED();
2721
2722         if (!cache_fast_revlookup) {
2723                 vfs_smr_exit();
2724                 return (-1);
2725         }
2726
2727         orig_buflen = *buflen;
2728
2729         if (!slash_prefixed) {
2730                 MPASS(*buflen >= 2);
2731                 *buflen -= 1;
2732                 buf[*buflen] = '\0';
2733         }
2734
2735         if (vp == rdir || vp == rootvnode) {
2736                 if (!slash_prefixed) {
2737                         *buflen -= 1;
2738                         buf[*buflen] = '/';
2739                 }
2740                 goto out_ok;
2741         }
2742
2743 #ifdef KDTRACE_HOOKS
2744         i = 0;
2745 #endif
2746         error = -1;
2747         ncp = NULL; /* for sdt probe down below */
2748         vp_seqc = vn_seqc_read_any(vp);
2749         if (seqc_in_modify(vp_seqc)) {
2750                 cache_rev_failed(&reason);
2751                 goto out_abort;
2752         }
2753
2754         for (;;) {
2755 #ifdef KDTRACE_HOOKS
2756                 i++;
2757 #endif
2758                 if ((vp->v_vflag & VV_ROOT) != 0) {
2759                         mp = atomic_load_ptr(&vp->v_mount);
2760                         if (mp == NULL) {
2761                                 cache_rev_failed(&reason);
2762                                 goto out_abort;
2763                         }
2764                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2765                         tvp_seqc = vn_seqc_read_any(tvp);
2766                         if (seqc_in_modify(tvp_seqc)) {
2767                                 cache_rev_failed(&reason);
2768                                 goto out_abort;
2769                         }
2770                         if (!vn_seqc_consistent(vp, vp_seqc)) {
2771                                 cache_rev_failed(&reason);
2772                                 goto out_abort;
2773                         }
2774                         vp = tvp;
2775                         vp_seqc = tvp_seqc;
2776                         continue;
2777                 }
2778                 ncp = atomic_load_ptr(&vp->v_cache_dd);
2779                 if (ncp == NULL) {
2780                         cache_rev_failed(&reason);
2781                         goto out_abort;
2782                 }
2783                 nc_flag = atomic_load_char(&ncp->nc_flag);
2784                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2785                         cache_rev_failed(&reason);
2786                         goto out_abort;
2787                 }
2788                 if (!cache_ncp_canuse(ncp)) {
2789                         cache_rev_failed(&reason);
2790                         goto out_abort;
2791                 }
2792                 if (ncp->nc_nlen >= *buflen) {
2793                         cache_rev_failed(&reason);
2794                         error = ENOMEM;
2795                         goto out_abort;
2796                 }
2797                 *buflen -= ncp->nc_nlen;
2798                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2799                 *buflen -= 1;
2800                 buf[*buflen] = '/';
2801                 tvp = ncp->nc_dvp;
2802                 tvp_seqc = vn_seqc_read_any(tvp);
2803                 if (seqc_in_modify(tvp_seqc)) {
2804                         cache_rev_failed(&reason);
2805                         goto out_abort;
2806                 }
2807                 if (!vn_seqc_consistent(vp, vp_seqc)) {
2808                         cache_rev_failed(&reason);
2809                         goto out_abort;
2810                 }
2811                 vp = tvp;
2812                 vp_seqc = tvp_seqc;
2813                 if (vp == rdir || vp == rootvnode)
2814                         break;
2815         }
2816 out_ok:
2817         vfs_smr_exit();
2818         *retbuf = buf + *buflen;
2819         *buflen = orig_buflen - *buflen + addend;
2820         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2821         return (0);
2822
2823 out_abort:
2824         *buflen = orig_buflen;
2825         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2826         vfs_smr_exit();
2827         return (error);
2828 }
2829
2830 static int
2831 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2832     size_t *buflen)
2833 {
2834         size_t orig_buflen;
2835         bool slash_prefixed;
2836         int error;
2837
2838         if (*buflen < 2)
2839                 return (EINVAL);
2840
2841         orig_buflen = *buflen;
2842
2843         vref(vp);
2844         slash_prefixed = false;
2845         if (vp->v_type != VDIR) {
2846                 *buflen -= 1;
2847                 buf[*buflen] = '\0';
2848                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2849                 if (error)
2850                         return (error);
2851                 if (*buflen == 0) {
2852                         vrele(vp);
2853                         return (ENOMEM);
2854                 }
2855                 *buflen -= 1;
2856                 buf[*buflen] = '/';
2857                 slash_prefixed = true;
2858         }
2859
2860         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
2861             orig_buflen - *buflen));
2862 }
2863
2864 /*
2865  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2866  *
2867  * Since the namecache does not track handlings, the caller is expected to first
2868  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2869  *
2870  * Then we have 2 cases:
2871  * - if the found vnode is a directory, the path can be constructed just by
2872  *   fullowing names up the chain
2873  * - otherwise we populate the buffer with the saved name and start resolving
2874  *   from the parent
2875  */
2876 static int
2877 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
2878     size_t *buflen)
2879 {
2880         char *buf, *tmpbuf;
2881         struct pwd *pwd;
2882         struct componentname *cnp;
2883         struct vnode *vp;
2884         size_t addend;
2885         int error;
2886         bool slash_prefixed;
2887         enum vtype type;
2888
2889         if (*buflen < 2)
2890                 return (EINVAL);
2891         if (*buflen > MAXPATHLEN)
2892                 *buflen = MAXPATHLEN;
2893
2894         slash_prefixed = false;
2895
2896         buf = malloc(*buflen, M_TEMP, M_WAITOK);
2897
2898         addend = 0;
2899         vp = ndp->ni_vp;
2900         /*
2901          * Check for VBAD to work around the vp_crossmp bug in lookup().
2902          *
2903          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
2904          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
2905          * If the type is VDIR (like in this very case) we can skip looking
2906          * at ni_dvp in the first place. However, since vnodes get passed here
2907          * unlocked the target may transition to doomed state (type == VBAD)
2908          * before we get to evaluate the condition. If this happens, we will
2909          * populate part of the buffer and descend to vn_fullpath_dir with
2910          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
2911          *
2912          * This should be atomic_load(&vp->v_type) but it is ilegal to take
2913          * an address of a bit field, even if said field is sized to char.
2914          * Work around the problem by reading the value into a full-sized enum
2915          * and then re-reading it with atomic_load which will still prevent
2916          * the compiler from re-reading down the road.
2917          */
2918         type = vp->v_type;
2919         type = atomic_load_int(&type);
2920         if (type == VBAD) {
2921                 error = ENOENT;
2922                 goto out_bad;
2923         }
2924         if (type != VDIR) {
2925                 cnp = &ndp->ni_cnd;
2926                 addend = cnp->cn_namelen + 2;
2927                 if (*buflen < addend) {
2928                         error = ENOMEM;
2929                         goto out_bad;
2930                 }
2931                 *buflen -= addend;
2932                 tmpbuf = buf + *buflen;
2933                 tmpbuf[0] = '/';
2934                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2935                 tmpbuf[addend - 1] = '\0';
2936                 slash_prefixed = true;
2937                 vp = ndp->ni_dvp;
2938         }
2939
2940         vfs_smr_enter();
2941         pwd = pwd_get_smr();
2942         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
2943             slash_prefixed, addend);
2944         VFS_SMR_ASSERT_NOT_ENTERED();
2945         if (error < 0) {
2946                 pwd = pwd_hold(curthread);
2947                 vref(vp);
2948                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
2949                     slash_prefixed, addend);
2950                 pwd_drop(pwd);
2951                 if (error != 0)
2952                         goto out_bad;
2953         }
2954
2955         *freebuf = buf;
2956
2957         return (0);
2958 out_bad:
2959         free(buf, M_TEMP);
2960         return (error);
2961 }
2962
2963 struct vnode *
2964 vn_dir_dd_ino(struct vnode *vp)
2965 {
2966         struct namecache *ncp;
2967         struct vnode *ddvp;
2968         struct mtx *vlp;
2969         enum vgetstate vs;
2970
2971         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2972         vlp = VP2VNODELOCK(vp);
2973         mtx_lock(vlp);
2974         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2975                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2976                         continue;
2977                 ddvp = ncp->nc_dvp;
2978                 vs = vget_prep(ddvp);
2979                 mtx_unlock(vlp);
2980                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2981                         return (NULL);
2982                 return (ddvp);
2983         }
2984         mtx_unlock(vlp);
2985         return (NULL);
2986 }
2987
2988 int
2989 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2990 {
2991         struct namecache *ncp;
2992         struct mtx *vlp;
2993         int l;
2994
2995         vlp = VP2VNODELOCK(vp);
2996         mtx_lock(vlp);
2997         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2998                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2999                         break;
3000         if (ncp == NULL) {
3001                 mtx_unlock(vlp);
3002                 return (ENOENT);
3003         }
3004         l = min(ncp->nc_nlen, buflen - 1);
3005         memcpy(buf, ncp->nc_name, l);
3006         mtx_unlock(vlp);
3007         buf[l] = '\0';
3008         return (0);
3009 }
3010
3011 /*
3012  * This function updates path string to vnode's full global path
3013  * and checks the size of the new path string against the pathlen argument.
3014  *
3015  * Requires a locked, referenced vnode.
3016  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3017  *
3018  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3019  * because it falls back to the ".." lookup if the namecache lookup fails.
3020  */
3021 int
3022 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3023     u_int pathlen)
3024 {
3025         struct nameidata nd;
3026         struct vnode *vp1;
3027         char *rpath, *fbuf;
3028         int error;
3029
3030         ASSERT_VOP_ELOCKED(vp, __func__);
3031
3032         /* Construct global filesystem path from vp. */
3033         VOP_UNLOCK(vp);
3034         error = vn_fullpath_global(vp, &rpath, &fbuf);
3035
3036         if (error != 0) {
3037                 vrele(vp);
3038                 return (error);
3039         }
3040
3041         if (strlen(rpath) >= pathlen) {
3042                 vrele(vp);
3043                 error = ENAMETOOLONG;
3044                 goto out;
3045         }
3046
3047         /*
3048          * Re-lookup the vnode by path to detect a possible rename.
3049          * As a side effect, the vnode is relocked.
3050          * If vnode was renamed, return ENOENT.
3051          */
3052         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3053             UIO_SYSSPACE, path, td);
3054         error = namei(&nd);
3055         if (error != 0) {
3056                 vrele(vp);
3057                 goto out;
3058         }
3059         NDFREE(&nd, NDF_ONLY_PNBUF);
3060         vp1 = nd.ni_vp;
3061         vrele(vp);
3062         if (vp1 == vp)
3063                 strcpy(path, rpath);
3064         else {
3065                 vput(vp1);
3066                 error = ENOENT;
3067         }
3068
3069 out:
3070         free(fbuf, M_TEMP);
3071         return (error);
3072 }
3073
3074 #ifdef DDB
3075 static void
3076 db_print_vpath(struct vnode *vp)
3077 {
3078
3079         while (vp != NULL) {
3080                 db_printf("%p: ", vp);
3081                 if (vp == rootvnode) {
3082                         db_printf("/");
3083                         vp = NULL;
3084                 } else {
3085                         if (vp->v_vflag & VV_ROOT) {
3086                                 db_printf("<mount point>");
3087                                 vp = vp->v_mount->mnt_vnodecovered;
3088                         } else {
3089                                 struct namecache *ncp;
3090                                 char *ncn;
3091                                 int i;
3092
3093                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3094                                 if (ncp != NULL) {
3095                                         ncn = ncp->nc_name;
3096                                         for (i = 0; i < ncp->nc_nlen; i++)
3097                                                 db_printf("%c", *ncn++);
3098                                         vp = ncp->nc_dvp;
3099                                 } else {
3100                                         vp = NULL;
3101                                 }
3102                         }
3103                 }
3104                 db_printf("\n");
3105         }
3106
3107         return;
3108 }
3109
3110 DB_SHOW_COMMAND(vpath, db_show_vpath)
3111 {
3112         struct vnode *vp;
3113
3114         if (!have_addr) {
3115                 db_printf("usage: show vpath <struct vnode *>\n");
3116                 return;
3117         }
3118
3119         vp = (struct vnode *)addr;
3120         db_print_vpath(vp);
3121 }
3122
3123 #endif
3124
3125 static bool __read_frequently cache_fast_lookup = true;
3126 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3127     &cache_fast_lookup, 0, "");
3128
3129 #define CACHE_FPL_FAILED        -2020
3130
3131 static void
3132 cache_fpl_cleanup_cnp(struct componentname *cnp)
3133 {
3134
3135         uma_zfree(namei_zone, cnp->cn_pnbuf);
3136 #ifdef DIAGNOSTIC
3137         cnp->cn_pnbuf = NULL;
3138         cnp->cn_nameptr = NULL;
3139 #endif
3140 }
3141
3142 static void
3143 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3144 {
3145         struct componentname *cnp;
3146
3147         cnp = &ndp->ni_cnd;
3148         while (*(cnp->cn_nameptr) == '/') {
3149                 cnp->cn_nameptr++;
3150                 ndp->ni_pathlen--;
3151         }
3152
3153         *dpp = ndp->ni_rootdir;
3154 }
3155
3156 /*
3157  * Components of nameidata (or objects it can point to) which may
3158  * need restoring in case fast path lookup fails.
3159  */
3160 struct nameidata_saved {
3161         long cn_namelen;
3162         char *cn_nameptr;
3163         size_t ni_pathlen;
3164         int cn_flags;
3165 };
3166
3167 struct cache_fpl {
3168         struct nameidata *ndp;
3169         struct componentname *cnp;
3170         struct pwd *pwd;
3171         struct vnode *dvp;
3172         struct vnode *tvp;
3173         seqc_t dvp_seqc;
3174         seqc_t tvp_seqc;
3175         struct nameidata_saved snd;
3176         int line;
3177         enum cache_fpl_status status:8;
3178         bool in_smr;
3179         bool fsearch;
3180 };
3181
3182 static void
3183 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3184 {
3185
3186         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3187         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3188         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3189         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3190 }
3191
3192 static void
3193 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3194 {
3195
3196         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3197         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3198         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3199         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3200 }
3201
3202 #ifdef INVARIANTS
3203 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3204         struct cache_fpl *_fpl = (fpl);                         \
3205         MPASS(_fpl->in_smr == true);                            \
3206         VFS_SMR_ASSERT_ENTERED();                               \
3207 })
3208 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3209         struct cache_fpl *_fpl = (fpl);                         \
3210         MPASS(_fpl->in_smr == false);                           \
3211         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3212 })
3213 #else
3214 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3215 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3216 #endif
3217
3218 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3219         struct cache_fpl *_fpl = (fpl);                         \
3220         vfs_smr_enter();                                        \
3221         _fpl->in_smr = true;                                    \
3222 })
3223
3224 #define cache_fpl_smr_enter(fpl) ({                             \
3225         struct cache_fpl *_fpl = (fpl);                         \
3226         MPASS(_fpl->in_smr == false);                           \
3227         vfs_smr_enter();                                        \
3228         _fpl->in_smr = true;                                    \
3229 })
3230
3231 #define cache_fpl_smr_exit(fpl) ({                              \
3232         struct cache_fpl *_fpl = (fpl);                         \
3233         MPASS(_fpl->in_smr == true);                            \
3234         vfs_smr_exit();                                         \
3235         _fpl->in_smr = false;                                   \
3236 })
3237
3238 static int
3239 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3240 {
3241
3242         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3243                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3244                     ("%s: converting to abort from %d at %d, set at %d\n",
3245                     __func__, fpl->status, line, fpl->line));
3246         }
3247         fpl->status = CACHE_FPL_STATUS_ABORTED;
3248         fpl->line = line;
3249         return (CACHE_FPL_FAILED);
3250 }
3251
3252 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3253
3254 static int
3255 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3256 {
3257
3258         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3259             ("%s: setting to partial at %d, but already set to %d at %d\n",
3260             __func__, line, fpl->status, fpl->line));
3261         cache_fpl_smr_assert_entered(fpl);
3262         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3263         fpl->line = line;
3264         return (CACHE_FPL_FAILED);
3265 }
3266
3267 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3268
3269 static int
3270 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3271 {
3272
3273         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3274             ("%s: setting to handled at %d, but already set to %d at %d\n",
3275             __func__, line, fpl->status, fpl->line));
3276         cache_fpl_smr_assert_not_entered(fpl);
3277         MPASS(error != CACHE_FPL_FAILED);
3278         fpl->status = CACHE_FPL_STATUS_HANDLED;
3279         fpl->line = line;
3280         return (error);
3281 }
3282
3283 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3284
3285 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3286         (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3287          SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3288
3289 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3290         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3291
3292 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3293     "supported and internal flags overlap");
3294
3295 static bool
3296 cache_fpl_islastcn(struct nameidata *ndp)
3297 {
3298
3299         return (*ndp->ni_next == 0);
3300 }
3301
3302 static bool
3303 cache_fpl_isdotdot(struct componentname *cnp)
3304 {
3305
3306         if (cnp->cn_namelen == 2 &&
3307             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3308                 return (true);
3309         return (false);
3310 }
3311
3312 static bool
3313 cache_can_fplookup(struct cache_fpl *fpl)
3314 {
3315         struct nameidata *ndp;
3316         struct componentname *cnp;
3317         struct thread *td;
3318
3319         ndp = fpl->ndp;
3320         cnp = fpl->cnp;
3321         td = cnp->cn_thread;
3322
3323         if (!cache_fast_lookup) {
3324                 cache_fpl_aborted(fpl);
3325                 return (false);
3326         }
3327 #ifdef MAC
3328         if (mac_vnode_check_lookup_enabled()) {
3329                 cache_fpl_aborted(fpl);
3330                 return (false);
3331         }
3332 #endif
3333         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3334                 cache_fpl_aborted(fpl);
3335                 return (false);
3336         }
3337         if (IN_CAPABILITY_MODE(td)) {
3338                 cache_fpl_aborted(fpl);
3339                 return (false);
3340         }
3341         if (AUDITING_TD(td)) {
3342                 cache_fpl_aborted(fpl);
3343                 return (false);
3344         }
3345         if (ndp->ni_startdir != NULL) {
3346                 cache_fpl_aborted(fpl);
3347                 return (false);
3348         }
3349         return (true);
3350 }
3351
3352 static int
3353 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3354 {
3355         struct nameidata *ndp;
3356         int error;
3357         bool fsearch;
3358
3359         ndp = fpl->ndp;
3360         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3361         if (__predict_false(error != 0)) {
3362                 cache_fpl_smr_exit(fpl);
3363                 return (cache_fpl_aborted(fpl));
3364         }
3365         fpl->fsearch = fsearch;
3366         return (0);
3367 }
3368
3369 static bool
3370 cache_fplookup_vnode_supported(struct vnode *vp)
3371 {
3372
3373         return (vp->v_type != VLNK);
3374 }
3375
3376 /*
3377  * Move a negative entry to the hot list.
3378  *
3379  * We have to take locks, but they may be contended and in the worst
3380  * case we may need to go off CPU. We don't want to spin within the
3381  * smr section and we can't block with it. Instead we are going to
3382  * look up the entry again.
3383  */
3384 static int __noinline
3385 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3386     uint32_t hash)
3387 {
3388         struct componentname *cnp;
3389         struct namecache *ncp;
3390         struct neglist *nl;
3391         struct vnode *dvp;
3392         u_char nc_flag;
3393
3394         cnp = fpl->cnp;
3395         dvp = fpl->dvp;
3396
3397         if (!vhold_smr(dvp))
3398                 return (cache_fpl_aborted(fpl));
3399
3400         nl = NCP2NEGLIST(oncp);
3401         cache_fpl_smr_exit(fpl);
3402
3403         mtx_lock(&nl->nl_lock);
3404         /*
3405          * For hash iteration.
3406          */
3407         cache_fpl_smr_enter(fpl);
3408
3409         /*
3410          * Avoid all surprises by only succeeding if we got the same entry and
3411          * bailing completely otherwise.
3412          *
3413          * In particular at this point there can be a new ncp which matches the
3414          * search but hashes to a different neglist.
3415          */
3416         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3417                 if (ncp == oncp)
3418                         break;
3419         }
3420
3421         /*
3422          * No match to begin with.
3423          */
3424         if (__predict_false(ncp == NULL)) {
3425                 goto out_abort;
3426         }
3427
3428         /*
3429          * The newly found entry may be something different...
3430          */
3431         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3432             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3433                 goto out_abort;
3434         }
3435
3436         /*
3437          * ... and not even negative.
3438          */
3439         nc_flag = atomic_load_char(&ncp->nc_flag);
3440         if ((nc_flag & NCF_NEGATIVE) == 0) {
3441                 goto out_abort;
3442         }
3443
3444         if (__predict_false(!cache_ncp_canuse(ncp))) {
3445                 goto out_abort;
3446         }
3447
3448         cache_negative_promote(ncp);
3449
3450         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3451         counter_u64_add(numneghits, 1);
3452         cache_fpl_smr_exit(fpl);
3453         mtx_unlock(&nl->nl_lock);
3454         vdrop(dvp);
3455         return (cache_fpl_handled(fpl, ENOENT));
3456 out_abort:
3457         cache_fpl_smr_exit(fpl);
3458         mtx_unlock(&nl->nl_lock);
3459         vdrop(dvp);
3460         return (cache_fpl_aborted(fpl));
3461 }
3462
3463 /*
3464  * The target vnode is not supported, prepare for the slow path to take over.
3465  */
3466 static int __noinline
3467 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3468 {
3469         struct nameidata *ndp;
3470         struct componentname *cnp;
3471         enum vgetstate dvs;
3472         struct vnode *dvp;
3473         struct pwd *pwd;
3474         seqc_t dvp_seqc;
3475
3476         ndp = fpl->ndp;
3477         cnp = fpl->cnp;
3478         pwd = fpl->pwd;
3479         dvp = fpl->dvp;
3480         dvp_seqc = fpl->dvp_seqc;
3481
3482         if (!pwd_hold_smr(pwd)) {
3483                 cache_fpl_smr_exit(fpl);
3484                 return (cache_fpl_aborted(fpl));
3485         }
3486
3487         dvs = vget_prep_smr(dvp);
3488         cache_fpl_smr_exit(fpl);
3489         if (__predict_false(dvs == VGET_NONE)) {
3490                 pwd_drop(pwd);
3491                 return (cache_fpl_aborted(fpl));
3492         }
3493
3494         vget_finish_ref(dvp, dvs);
3495         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3496                 vrele(dvp);
3497                 pwd_drop(pwd);
3498                 return (cache_fpl_aborted(fpl));
3499         }
3500
3501         cache_fpl_restore(fpl, &fpl->snd);
3502
3503         ndp->ni_startdir = dvp;
3504         cnp->cn_flags |= MAKEENTRY;
3505         if (cache_fpl_islastcn(ndp))
3506                 cnp->cn_flags |= ISLASTCN;
3507         if (cache_fpl_isdotdot(cnp))
3508                 cnp->cn_flags |= ISDOTDOT;
3509
3510         return (0);
3511 }
3512
3513 static int
3514 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3515 {
3516         struct componentname *cnp;
3517         struct vnode *tvp;
3518         seqc_t tvp_seqc;
3519         int error, lkflags;
3520
3521         cnp = fpl->cnp;
3522         tvp = fpl->tvp;
3523         tvp_seqc = fpl->tvp_seqc;
3524
3525         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3526                 lkflags = LK_SHARED;
3527                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3528                         lkflags = LK_EXCLUSIVE;
3529                 error = vget_finish(tvp, lkflags, tvs);
3530                 if (__predict_false(error != 0)) {
3531                         return (cache_fpl_aborted(fpl));
3532                 }
3533         } else {
3534                 vget_finish_ref(tvp, tvs);
3535         }
3536
3537         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3538                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3539                         vput(tvp);
3540                 else
3541                         vrele(tvp);
3542                 return (cache_fpl_aborted(fpl));
3543         }
3544
3545         return (cache_fpl_handled(fpl, 0));
3546 }
3547
3548 /*
3549  * They want to possibly modify the state of the namecache.
3550  *
3551  * Don't try to match the API contract, just leave.
3552  * TODO: this leaves scalability on the table
3553  */
3554 static int
3555 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3556 {
3557         struct componentname *cnp;
3558
3559         cnp = fpl->cnp;
3560         MPASS(cnp->cn_nameiop != LOOKUP);
3561         return (cache_fpl_partial(fpl));
3562 }
3563
3564 static int __noinline
3565 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3566 {
3567         struct componentname *cnp;
3568         enum vgetstate dvs, tvs;
3569         struct vnode *dvp, *tvp;
3570         seqc_t dvp_seqc;
3571         int error;
3572
3573         cnp = fpl->cnp;
3574         dvp = fpl->dvp;
3575         dvp_seqc = fpl->dvp_seqc;
3576         tvp = fpl->tvp;
3577
3578         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3579
3580         /*
3581          * This is less efficient than it can be for simplicity.
3582          */
3583         dvs = vget_prep_smr(dvp);
3584         if (__predict_false(dvs == VGET_NONE)) {
3585                 return (cache_fpl_aborted(fpl));
3586         }
3587         tvs = vget_prep_smr(tvp);
3588         if (__predict_false(tvs == VGET_NONE)) {
3589                 cache_fpl_smr_exit(fpl);
3590                 vget_abort(dvp, dvs);
3591                 return (cache_fpl_aborted(fpl));
3592         }
3593
3594         cache_fpl_smr_exit(fpl);
3595
3596         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3597                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3598                 if (__predict_false(error != 0)) {
3599                         vget_abort(tvp, tvs);
3600                         return (cache_fpl_aborted(fpl));
3601                 }
3602         } else {
3603                 vget_finish_ref(dvp, dvs);
3604         }
3605
3606         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3607                 vget_abort(tvp, tvs);
3608                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3609                         vput(dvp);
3610                 else
3611                         vrele(dvp);
3612                 return (cache_fpl_aborted(fpl));
3613         }
3614
3615         error = cache_fplookup_final_child(fpl, tvs);
3616         if (__predict_false(error != 0)) {
3617                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3618                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3619                         vput(dvp);
3620                 else
3621                         vrele(dvp);
3622                 return (error);
3623         }
3624
3625         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3626         return (0);
3627 }
3628
3629 static int
3630 cache_fplookup_final(struct cache_fpl *fpl)
3631 {
3632         struct componentname *cnp;
3633         enum vgetstate tvs;
3634         struct vnode *dvp, *tvp;
3635         seqc_t dvp_seqc;
3636
3637         cnp = fpl->cnp;
3638         dvp = fpl->dvp;
3639         dvp_seqc = fpl->dvp_seqc;
3640         tvp = fpl->tvp;
3641
3642         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3643
3644         if (cnp->cn_nameiop != LOOKUP) {
3645                 return (cache_fplookup_final_modifying(fpl));
3646         }
3647
3648         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3649                 return (cache_fplookup_final_withparent(fpl));
3650
3651         tvs = vget_prep_smr(tvp);
3652         if (__predict_false(tvs == VGET_NONE)) {
3653                 return (cache_fpl_partial(fpl));
3654         }
3655
3656         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3657                 cache_fpl_smr_exit(fpl);
3658                 vget_abort(tvp, tvs);
3659                 return (cache_fpl_aborted(fpl));
3660         }
3661
3662         cache_fpl_smr_exit(fpl);
3663         return (cache_fplookup_final_child(fpl, tvs));
3664 }
3665
3666 static int __noinline
3667 cache_fplookup_dot(struct cache_fpl *fpl)
3668 {
3669         struct vnode *dvp;
3670
3671         dvp = fpl->dvp;
3672
3673         fpl->tvp = dvp;
3674         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3675         if (seqc_in_modify(fpl->tvp_seqc)) {
3676                 return (cache_fpl_aborted(fpl));
3677         }
3678
3679         counter_u64_add(dothits, 1);
3680         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3681
3682         return (0);
3683 }
3684
3685 static int __noinline
3686 cache_fplookup_dotdot(struct cache_fpl *fpl)
3687 {
3688         struct nameidata *ndp;
3689         struct componentname *cnp;
3690         struct namecache *ncp;
3691         struct vnode *dvp;
3692         struct prison *pr;
3693         u_char nc_flag;
3694
3695         ndp = fpl->ndp;
3696         cnp = fpl->cnp;
3697         dvp = fpl->dvp;
3698
3699         /*
3700          * XXX this is racy the same way regular lookup is
3701          */
3702         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3703             pr = pr->pr_parent)
3704                 if (dvp == pr->pr_root)
3705                         break;
3706
3707         if (dvp == ndp->ni_rootdir ||
3708             dvp == ndp->ni_topdir ||
3709             dvp == rootvnode ||
3710             pr != NULL) {
3711                 fpl->tvp = dvp;
3712                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3713                 if (seqc_in_modify(fpl->tvp_seqc)) {
3714                         return (cache_fpl_aborted(fpl));
3715                 }
3716                 return (0);
3717         }
3718
3719         if ((dvp->v_vflag & VV_ROOT) != 0) {
3720                 /*
3721                  * TODO
3722                  * The opposite of climb mount is needed here.
3723                  */
3724                 return (cache_fpl_aborted(fpl));
3725         }
3726
3727         ncp = atomic_load_ptr(&dvp->v_cache_dd);
3728         if (ncp == NULL) {
3729                 return (cache_fpl_aborted(fpl));
3730         }
3731
3732         nc_flag = atomic_load_char(&ncp->nc_flag);
3733         if ((nc_flag & NCF_ISDOTDOT) != 0) {
3734                 if ((nc_flag & NCF_NEGATIVE) != 0)
3735                         return (cache_fpl_aborted(fpl));
3736                 fpl->tvp = ncp->nc_vp;
3737         } else {
3738                 fpl->tvp = ncp->nc_dvp;
3739         }
3740
3741         if (__predict_false(!cache_ncp_canuse(ncp))) {
3742                 return (cache_fpl_aborted(fpl));
3743         }
3744
3745         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3746         if (seqc_in_modify(fpl->tvp_seqc)) {
3747                 return (cache_fpl_partial(fpl));
3748         }
3749
3750         counter_u64_add(dotdothits, 1);
3751         return (0);
3752 }
3753
3754 static int
3755 cache_fplookup_next(struct cache_fpl *fpl)
3756 {
3757         struct componentname *cnp;
3758         struct namecache *ncp;
3759         struct negstate *ns;
3760         struct vnode *dvp, *tvp;
3761         u_char nc_flag;
3762         uint32_t hash;
3763         bool neg_hot;
3764
3765         cnp = fpl->cnp;
3766         dvp = fpl->dvp;
3767
3768         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3769                 return (cache_fplookup_dot(fpl));
3770         }
3771
3772         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3773
3774         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3775                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3776                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3777                         break;
3778         }
3779
3780         /*
3781          * If there is no entry we have to punt to the slow path to perform
3782          * actual lookup. Should there be nothing with this name a negative
3783          * entry will be created.
3784          */
3785         if (__predict_false(ncp == NULL)) {
3786                 return (cache_fpl_partial(fpl));
3787         }
3788
3789         tvp = atomic_load_ptr(&ncp->nc_vp);
3790         nc_flag = atomic_load_char(&ncp->nc_flag);
3791         if ((nc_flag & NCF_NEGATIVE) != 0) {
3792                 /*
3793                  * If they want to create an entry we need to replace this one.
3794                  */
3795                 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3796                         return (cache_fpl_partial(fpl));
3797                 }
3798                 ns = NCP2NEGSTATE(ncp);
3799                 neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
3800                 if (__predict_false(!cache_ncp_canuse(ncp))) {
3801                         return (cache_fpl_partial(fpl));
3802                 }
3803                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3804                         return (cache_fpl_partial(fpl));
3805                 }
3806                 if (!neg_hot) {
3807                         return (cache_fplookup_negative_promote(fpl, ncp, hash));
3808                 }
3809                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3810                     ncp->nc_name);
3811                 counter_u64_add(numneghits, 1);
3812                 cache_fpl_smr_exit(fpl);
3813                 return (cache_fpl_handled(fpl, ENOENT));
3814         }
3815
3816         if (__predict_false(!cache_ncp_canuse(ncp))) {
3817                 return (cache_fpl_partial(fpl));
3818         }
3819
3820         fpl->tvp = tvp;
3821         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3822         if (seqc_in_modify(fpl->tvp_seqc)) {
3823                 return (cache_fpl_partial(fpl));
3824         }
3825
3826         if (!cache_fplookup_vnode_supported(tvp)) {
3827                 return (cache_fpl_partial(fpl));
3828         }
3829
3830         counter_u64_add(numposhits, 1);
3831         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3832         return (0);
3833 }
3834
3835 static bool
3836 cache_fplookup_mp_supported(struct mount *mp)
3837 {
3838
3839         if (mp == NULL)
3840                 return (false);
3841         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3842                 return (false);
3843         return (true);
3844 }
3845
3846 /*
3847  * Walk up the mount stack (if any).
3848  *
3849  * Correctness is provided in the following ways:
3850  * - all vnodes are protected from freeing with SMR
3851  * - struct mount objects are type stable making them always safe to access
3852  * - stability of the particular mount is provided by busying it
3853  * - relationship between the vnode which is mounted on and the mount is
3854  *   verified with the vnode sequence counter after busying
3855  * - association between root vnode of the mount and the mount is protected
3856  *   by busy
3857  *
3858  * From that point on we can read the sequence counter of the root vnode
3859  * and get the next mount on the stack (if any) using the same protection.
3860  *
3861  * By the end of successful walk we are guaranteed the reached state was
3862  * indeed present at least at some point which matches the regular lookup.
3863  */
3864 static int __noinline
3865 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3866 {
3867         struct mount *mp, *prev_mp;
3868         struct vnode *vp;
3869         seqc_t vp_seqc;
3870
3871         vp = fpl->tvp;
3872         vp_seqc = fpl->tvp_seqc;
3873
3874         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3875         mp = atomic_load_ptr(&vp->v_mountedhere);
3876         if (mp == NULL)
3877                 return (0);
3878
3879         prev_mp = NULL;
3880         for (;;) {
3881                 if (!vfs_op_thread_enter_crit(mp)) {
3882                         if (prev_mp != NULL)
3883                                 vfs_op_thread_exit_crit(prev_mp);
3884                         return (cache_fpl_partial(fpl));
3885                 }
3886                 if (prev_mp != NULL)
3887                         vfs_op_thread_exit_crit(prev_mp);
3888                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3889                         vfs_op_thread_exit_crit(mp);
3890                         return (cache_fpl_partial(fpl));
3891                 }
3892                 if (!cache_fplookup_mp_supported(mp)) {
3893                         vfs_op_thread_exit_crit(mp);
3894                         return (cache_fpl_partial(fpl));
3895                 }
3896                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3897                 if (vp == NULL || VN_IS_DOOMED(vp)) {
3898                         vfs_op_thread_exit_crit(mp);
3899                         return (cache_fpl_partial(fpl));
3900                 }
3901                 vp_seqc = vn_seqc_read_any(vp);
3902                 if (seqc_in_modify(vp_seqc)) {
3903                         vfs_op_thread_exit_crit(mp);
3904                         return (cache_fpl_partial(fpl));
3905                 }
3906                 prev_mp = mp;
3907                 mp = atomic_load_ptr(&vp->v_mountedhere);
3908                 if (mp == NULL)
3909                         break;
3910         }
3911
3912         vfs_op_thread_exit_crit(prev_mp);
3913         fpl->tvp = vp;
3914         fpl->tvp_seqc = vp_seqc;
3915         return (0);
3916 }
3917
3918 static bool
3919 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3920 {
3921         struct mount *mp;
3922         struct vnode *vp;
3923
3924         vp = fpl->tvp;
3925
3926         /*
3927          * Hack: while this is a union, the pointer tends to be NULL so save on
3928          * a branch.
3929          */
3930         mp = atomic_load_ptr(&vp->v_mountedhere);
3931         if (mp == NULL)
3932                 return (false);
3933         if (vp->v_type == VDIR)
3934                 return (true);
3935         return (false);
3936 }
3937
3938 /*
3939  * Parse the path.
3940  *
3941  * The code was originally copy-pasted from regular lookup and despite
3942  * clean ups leaves performance on the table. Any modifications here
3943  * must take into account that in case off fallback the resulting
3944  * nameidata state has to be compatible with the original.
3945  */
3946 static int
3947 cache_fplookup_parse(struct cache_fpl *fpl)
3948 {
3949         struct nameidata *ndp;
3950         struct componentname *cnp;
3951         char *cp;
3952
3953         ndp = fpl->ndp;
3954         cnp = fpl->cnp;
3955
3956         /*
3957          * Search a new directory.
3958          *
3959          * The last component of the filename is left accessible via
3960          * cnp->cn_nameptr for callers that need the name. Callers needing
3961          * the name set the SAVENAME flag. When done, they assume
3962          * responsibility for freeing the pathname buffer.
3963          */
3964         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3965                 continue;
3966         cnp->cn_namelen = cp - cnp->cn_nameptr;
3967         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
3968                 cache_fpl_smr_exit(fpl);
3969                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
3970         }
3971         ndp->ni_pathlen -= cnp->cn_namelen;
3972         KASSERT(ndp->ni_pathlen <= PATH_MAX,
3973             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3974         ndp->ni_next = cp;
3975
3976         /*
3977          * Replace multiple slashes by a single slash and trailing slashes
3978          * by a null.  This must be done before VOP_LOOKUP() because some
3979          * fs's don't know about trailing slashes.  Remember if there were
3980          * trailing slashes to handle symlinks, existing non-directories
3981          * and non-existing files that won't be directories specially later.
3982          */
3983         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3984                 cp++;
3985                 ndp->ni_pathlen--;
3986                 if (*cp == '\0') {
3987                         /*
3988                          * TODO
3989                          * Regular lookup performs the following:
3990                          * *ndp->ni_next = '\0';
3991                          * cnp->cn_flags |= TRAILINGSLASH;
3992                          *
3993                          * Which is problematic since it modifies data read
3994                          * from userspace. Then if fast path lookup was to
3995                          * abort we would have to either restore it or convey
3996                          * the flag. Since this is a corner case just ignore
3997                          * it for simplicity.
3998                          */
3999                         return (cache_fpl_partial(fpl));
4000                 }
4001         }
4002         ndp->ni_next = cp;
4003
4004         /*
4005          * Check for degenerate name (e.g. / or "")
4006          * which is a way of talking about a directory,
4007          * e.g. like "/." or ".".
4008          *
4009          * TODO
4010          * Another corner case handled by the regular lookup
4011          */
4012         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4013                 return (cache_fpl_partial(fpl));
4014         }
4015         return (0);
4016 }
4017
4018 static void
4019 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4020 {
4021         struct nameidata *ndp;
4022         struct componentname *cnp;
4023
4024         ndp = fpl->ndp;
4025         cnp = fpl->cnp;
4026
4027         cnp->cn_nameptr = ndp->ni_next;
4028         while (*cnp->cn_nameptr == '/') {
4029                 cnp->cn_nameptr++;
4030                 ndp->ni_pathlen--;
4031         }
4032 }
4033
4034 /*
4035  * See the API contract for VOP_FPLOOKUP_VEXEC.
4036  */
4037 static int __noinline
4038 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4039 {
4040         struct componentname *cnp;
4041         struct vnode *dvp;
4042         seqc_t dvp_seqc;
4043
4044         cnp = fpl->cnp;
4045         dvp = fpl->dvp;
4046         dvp_seqc = fpl->dvp_seqc;
4047
4048         /*
4049          * Hack: they may be looking up foo/bar, where foo is a
4050          * regular file. In such a case we need to turn ENOTDIR,
4051          * but we may happen to get here with a different error.
4052          */
4053         if (dvp->v_type != VDIR) {
4054                 /*
4055                  * The check here is predominantly to catch
4056                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4057                  * gets doomed past this point it is going to
4058                  * fail seqc verification.
4059                  */
4060                 if (VN_IS_DOOMED(dvp)) {
4061                         return (cache_fpl_aborted(fpl));
4062                 }
4063                 error = ENOTDIR;
4064         }
4065
4066         /*
4067          * Hack: handle O_SEARCH.
4068          *
4069          * Open Group Base Specifications Issue 7, 2018 edition states:
4070          * If the access mode of the open file description associated with the
4071          * file descriptor is not O_SEARCH, the function shall check whether
4072          * directory searches are permitted using the current permissions of
4073          * the directory underlying the file descriptor. If the access mode is
4074          * O_SEARCH, the function shall not perform the check.
4075          *
4076          * Regular lookup tests for the NOEXECCHECK flag for every path
4077          * component to decide whether to do the permission check. However,
4078          * since most lookups never have the flag (and when they do it is only
4079          * present for the first path component), lockless lookup only acts on
4080          * it if there is a permission problem. Here the flag is represented
4081          * with a boolean so that we don't have to clear it on the way out.
4082          *
4083          * For simplicity this always aborts.
4084          * TODO: check if this is the first lookup and ignore the permission
4085          * problem. Note the flag has to survive fallback (if it happens to be
4086          * performed).
4087          */
4088         if (fpl->fsearch) {
4089                 return (cache_fpl_aborted(fpl));
4090         }
4091
4092         switch (error) {
4093         case EAGAIN:
4094                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4095                         error = cache_fpl_aborted(fpl);
4096                 } else {
4097                         cache_fpl_partial(fpl);
4098                 }
4099                 break;
4100         default:
4101                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4102                         error = cache_fpl_aborted(fpl);
4103                 } else {
4104                         cache_fpl_smr_exit(fpl);
4105                         cache_fpl_handled(fpl, error);
4106                 }
4107                 break;
4108         }
4109         return (error);
4110 }
4111
4112 static int
4113 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4114 {
4115         struct nameidata *ndp;
4116         struct componentname *cnp;
4117         struct mount *mp;
4118         int error;
4119
4120         error = CACHE_FPL_FAILED;
4121         ndp = fpl->ndp;
4122         cnp = fpl->cnp;
4123
4124         cache_fpl_checkpoint(fpl, &fpl->snd);
4125
4126         fpl->dvp = dvp;
4127         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4128         if (seqc_in_modify(fpl->dvp_seqc)) {
4129                 cache_fpl_aborted(fpl);
4130                 goto out;
4131         }
4132         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4133         if (!cache_fplookup_mp_supported(mp)) {
4134                 cache_fpl_aborted(fpl);
4135                 goto out;
4136         }
4137
4138         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4139
4140         for (;;) {
4141                 error = cache_fplookup_parse(fpl);
4142                 if (__predict_false(error != 0)) {
4143                         break;
4144                 }
4145
4146                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4147
4148                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4149                 if (__predict_false(error != 0)) {
4150                         error = cache_fplookup_failed_vexec(fpl, error);
4151                         break;
4152                 }
4153
4154                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4155                         error = cache_fplookup_dotdot(fpl);
4156                         if (__predict_false(error != 0)) {
4157                                 break;
4158                         }
4159                 } else {
4160                         error = cache_fplookup_next(fpl);
4161                         if (__predict_false(error != 0)) {
4162                                 break;
4163                         }
4164
4165                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4166
4167                         if (cache_fplookup_need_climb_mount(fpl)) {
4168                                 error = cache_fplookup_climb_mount(fpl);
4169                                 if (__predict_false(error != 0)) {
4170                                         break;
4171                                 }
4172                         }
4173                 }
4174
4175                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4176
4177                 if (cache_fpl_islastcn(ndp)) {
4178                         error = cache_fplookup_final(fpl);
4179                         break;
4180                 }
4181
4182                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4183                         error = cache_fpl_aborted(fpl);
4184                         break;
4185                 }
4186
4187                 fpl->dvp = fpl->tvp;
4188                 fpl->dvp_seqc = fpl->tvp_seqc;
4189
4190                 cache_fplookup_parse_advance(fpl);
4191                 cache_fpl_checkpoint(fpl, &fpl->snd);
4192         }
4193 out:
4194         switch (fpl->status) {
4195         case CACHE_FPL_STATUS_UNSET:
4196                 __assert_unreachable();
4197                 break;
4198         case CACHE_FPL_STATUS_PARTIAL:
4199                 cache_fpl_smr_assert_entered(fpl);
4200                 return (cache_fplookup_partial_setup(fpl));
4201         case CACHE_FPL_STATUS_ABORTED:
4202                 if (fpl->in_smr)
4203                         cache_fpl_smr_exit(fpl);
4204                 return (CACHE_FPL_FAILED);
4205         case CACHE_FPL_STATUS_HANDLED:
4206                 MPASS(error != CACHE_FPL_FAILED);
4207                 cache_fpl_smr_assert_not_entered(fpl);
4208                 if (__predict_false(error != 0)) {
4209                         ndp->ni_dvp = NULL;
4210                         ndp->ni_vp = NULL;
4211                         cache_fpl_cleanup_cnp(cnp);
4212                         return (error);
4213                 }
4214                 ndp->ni_dvp = fpl->dvp;
4215                 ndp->ni_vp = fpl->tvp;
4216                 if (cnp->cn_flags & SAVENAME)
4217                         cnp->cn_flags |= HASBUF;
4218                 else
4219                         cache_fpl_cleanup_cnp(cnp);
4220                 return (error);
4221         }
4222 }
4223
4224 /*
4225  * Fast path lookup protected with SMR and sequence counters.
4226  *
4227  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4228  *
4229  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4230  * outlined below.
4231  *
4232  * Traditional vnode lookup conceptually looks like this:
4233  *
4234  * vn_lock(current);
4235  * for (;;) {
4236  *      next = find();
4237  *      vn_lock(next);
4238  *      vn_unlock(current);
4239  *      current = next;
4240  *      if (last)
4241  *          break;
4242  * }
4243  * return (current);
4244  *
4245  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4246  * any modifications thanks to holding respective locks.
4247  *
4248  * The same guarantee can be provided with a combination of safe memory
4249  * reclamation and sequence counters instead. If all operations which affect
4250  * the relationship between the current vnode and the one we are looking for
4251  * also modify the counter, we can verify whether all the conditions held as
4252  * we made the jump. This includes things like permissions, mount points etc.
4253  * Counter modification is provided by enclosing relevant places in
4254  * vn_seqc_write_begin()/end() calls.
4255  *
4256  * Thus this translates to:
4257  *
4258  * vfs_smr_enter();
4259  * dvp_seqc = seqc_read_any(dvp);
4260  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4261  *     abort();
4262  * for (;;) {
4263  *      tvp = find();
4264  *      tvp_seqc = seqc_read_any(tvp);
4265  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4266  *          abort();
4267  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4268  *          abort();
4269  *      dvp = tvp; // we know nothing of importance has changed
4270  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4271  *      if (last)
4272  *          break;
4273  * }
4274  * vget(); // secure the vnode
4275  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4276  *          abort();
4277  * // at this point we know nothing has changed for any parent<->child pair
4278  * // as they were crossed during the lookup, meaning we matched the guarantee
4279  * // of the locked variant
4280  * return (tvp);
4281  *
4282  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4283  * - they are called while within vfs_smr protection which they must never exit
4284  * - EAGAIN can be returned to denote checking could not be performed, it is
4285  *   always valid to return it
4286  * - if the sequence counter has not changed the result must be valid
4287  * - if the sequence counter has changed both false positives and false negatives
4288  *   are permitted (since the result will be rejected later)
4289  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4290  *
4291  * Caveats to watch out for:
4292  * - vnodes are passed unlocked and unreferenced with nothing stopping
4293  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4294  *   to use atomic_load_ptr to fetch it.
4295  * - the aforementioned object can also get freed, meaning absent other means it
4296  *   should be protected with vfs_smr
4297  * - either safely checking permissions as they are modified or guaranteeing
4298  *   their stability is left to the routine
4299  */
4300 int
4301 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4302     struct pwd **pwdp)
4303 {
4304         struct cache_fpl fpl;
4305         struct pwd *pwd;
4306         struct vnode *dvp;
4307         struct componentname *cnp;
4308         struct nameidata_saved orig;
4309         int error;
4310
4311         MPASS(ndp->ni_lcf == 0);
4312
4313         fpl.status = CACHE_FPL_STATUS_UNSET;
4314         fpl.ndp = ndp;
4315         fpl.cnp = &ndp->ni_cnd;
4316         MPASS(curthread == fpl.cnp->cn_thread);
4317
4318         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4319                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4320
4321         if (!cache_can_fplookup(&fpl)) {
4322                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4323                 *status = fpl.status;
4324                 return (EOPNOTSUPP);
4325         }
4326
4327         cache_fpl_checkpoint(&fpl, &orig);
4328
4329         cache_fpl_smr_enter_initial(&fpl);
4330         fpl.fsearch = false;
4331         pwd = pwd_get_smr();
4332         fpl.pwd = pwd;
4333         ndp->ni_rootdir = pwd->pwd_rdir;
4334         ndp->ni_topdir = pwd->pwd_jdir;
4335
4336         cnp = fpl.cnp;
4337         cnp->cn_nameptr = cnp->cn_pnbuf;
4338         if (cnp->cn_pnbuf[0] == '/') {
4339                 cache_fpl_handle_root(ndp, &dvp);
4340         } else {
4341                 if (ndp->ni_dirfd == AT_FDCWD) {
4342                         dvp = pwd->pwd_cdir;
4343                 } else {
4344                         error = cache_fplookup_dirfd(&fpl, &dvp);
4345                         if (__predict_false(error != 0)) {
4346                                 goto out;
4347                         }
4348                 }
4349         }
4350
4351         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4352
4353         error = cache_fplookup_impl(dvp, &fpl);
4354 out:
4355         cache_fpl_smr_assert_not_entered(&fpl);
4356         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4357
4358         *status = fpl.status;
4359         switch (fpl.status) {
4360         case CACHE_FPL_STATUS_UNSET:
4361                 __assert_unreachable();
4362                 break;
4363         case CACHE_FPL_STATUS_HANDLED:
4364                 SDT_PROBE3(vfs, namei, lookup, return, error,
4365                     (error == 0 ? ndp->ni_vp : NULL), true);
4366                 break;
4367         case CACHE_FPL_STATUS_PARTIAL:
4368                 *pwdp = fpl.pwd;
4369                 /*
4370                  * Status restored by cache_fplookup_partial_setup.
4371                  */
4372                 break;
4373         case CACHE_FPL_STATUS_ABORTED:
4374                 cache_fpl_restore(&fpl, &orig);
4375                 break;
4376         }
4377         return (error);
4378 }