sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
/*
 * Statically defined tracing (SDT) probes for the name cache, all under the
 * "vfs" provider.  The trailing string arguments of each definition name the
 * types of the probe arguments.
 */
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");

/*
 * The fplookup probe is defined here; the namei lookup probes are only
 * declared, so their definitions live elsewhere.
 */
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 120
/*
 * State kept for a negative entry.  It lives in a union with the target
 * vnode pointer of struct namecache, hence the size assertion below.
 */
struct negstate {
        u_char neg_flag;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 *
 * The name is stored inline directly after the fixed part (nc_name), so
 * entries are allocated with room for the name and its terminating nul.
 */
struct  namecache {
        LIST_ENTRY(namecache) nc_src;   /* source vnode list */
        TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
        CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
        struct  vnode *nc_dvp;          /* vnode of parent of name */
        union {
                struct  vnode *nu_vp;   /* vnode the name refers to */
                struct  negstate nu_neg;/* negative entry state */
        } n_un;
        u_char  nc_flag;                /* flag bits */
        u_char  nc_nlen;                /* length of name */
        char    nc_name[0];             /* segment name + nul */
};
 144
/*
 * struct namecache_ts places the timestamp fields in front of an embedded
 * struct namecache (nc_nc), which is its last member so that the inline name
 * still trails the entry.
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * Code holding a pointer to the embedded struct namecache gets back to the
 * container with __containerof() (see cache_free() and cache_out_ts()).
 *
 * See below for alignment requirement.
 */
struct  namecache_ts {
        struct  timespec nc_time;       /* timespec provided by fs */
        struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
        int     nc_ticks;               /* ticks value when entry was added */
        struct namecache nc_nc;
};
 161
/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since others
 * may be in the same spot suffer a little bit and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
/*
 * Longest name (not counting the terminating nul) which still fits in the
 * small zones; anything longer is served from the large zones.
 */
#define CACHE_PATH_CUTOFF       39

/* Item sizes: the fixed part of the entry plus the name and its nul. */
#define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)

/*
 * Every item size must be a multiple of the enforced alignment.
 * NOTE(review): the "+ 1" presumes UMA_ALIGNOF() yields an alignment mask
 * (alignment - 1) rather than the alignment itself -- confirm in vm/uma.h.
 */
_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 180
/* Shorthands for the members of the n_un union in struct namecache. */
#define nc_vp           n_un.nu_vp
#define nc_neg          n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 *
 * NCF_TS:       the entry is embedded in a struct namecache_ts.
 * NCF_DVDROP:   cache_free() calls vdrop() on nc_dvp.
 * NCF_NEGATIVE: negative entry; nc_neg is in use instead of nc_vp.
 * NCF_INVALID:  set by cache_ncp_invalidate().
 * NCF_INVALID and NCF_WIP both make cache_ncp_canuse() reject the entry.
 */
#define NCF_WHITE       0x01
#define NCF_ISDOTDOT    0x02
#define NCF_TS          0x04
#define NCF_DTS         0x08
#define NCF_DVDROP      0x10
#define NCF_NEGATIVE    0x20
#define NCF_INVALID     0x40
#define NCF_WIP         0x80

/*
 * Flags in negstate.neg_flag
 *
 * NEG_HOT: the entry sits on the hot list (ncneg_hot) rather than on one of
 * the cold lists.
 */
#define NEG_HOT         0x01
 200
/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 *
 * The entry must not be invalid already (asserted).  NCF_INVALID is added
 * with an atomic store, which is then followed by a release fence;
 * cache_ncp_canuse() is the reader-side counterpart which issues an acquire
 * fence before loading the flags.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

        KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
            ("%s: entry %p already invalid", __func__, ncp));
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
        atomic_thread_fence_rel();
}
 215
 216 /*
 217  * Check whether the entry can be safely used.
 218  *
 219  * All places which elide locks are supposed to call this after they are
 220  * done with reading from an entry.
 221  */
 222 static bool
 223 cache_ncp_canuse(struct namecache *ncp)
 224 {
 225
 226         atomic_thread_fence_acq();
 227         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 228 }
 229
 230 /*
 231  * Name caching works as follows:
 232  *
 233  * Names found by directory scans are retained in a cache
 234  * for future reference.  It is managed LRU, so frequently
 235  * used names will hang around.  Cache is indexed by hash value
 236  * obtained from (dvp, name) where dvp refers to the directory
 237  * containing name.
 238  *
 239  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 240  * exist) the vnode pointer will be NULL.
 241  *
 242  * Upon reaching the last segment of a path, if the reference
 243  * is for DELETE, or NOCACHE is set (rewrite), and the
 244  * name is located in the cache, it will be dropped.
 245  *
 246  * These locks are used (in the order in which they can be taken):
 247  * NAME         TYPE    ROLE
 248  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 249  * bucketlock   mtx     for access to given set of hash buckets
 250  * neglist      mtx     negative entry LRU management
 251  *
 252  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 253  * shrinking the LRU list.
 254  *
 255  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 256  * order is lower address first. Both are recursive.
 257  *
 258  * "." lookups are lockless.
 259  *
 260  * ".." and vnode -> name lookups require vnodelock.
 261  *
 262  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 263  *
 264  * Insertions and removals of entries require involved vnodes and bucketlocks
 265  * to be locked to provide safe operation against other threads modifying the
 266  * cache.
 267  *
 268  * Some lookups result in removal of the found entry (e.g. getting rid of a
 269  * negative entry with the intent to create a positive one), which poses a
 270  * problem when multiple threads reach the state. Similarly, two different
 271  * threads can purge two different vnodes and try to remove the same name.
 272  *
 273  * If the already held vnode lock is lower than the second required lock, we
 274  * can just take the other lock. However, in the opposite case, this could
 275  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 276  * the first node, locking everything in order and revalidating the state.
 277  */
 278
 279 VFS_SMR_DECLARE;
 280
/*
 * Structures associated with name caching.
 */
/* Map a hash value to its chain head; nchash is applied as a mask. */
#define NCHHASH(hash) \
        (&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly     nchash;                 /* hash mask: max index, not count */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");
static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
u_int ncsizefactor = 2;                 /* not static: visible outside this file */
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");
static u_int __read_mostly      ncpurgeminvnodes;
SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
    "Number of vnodes below which purgevfs ignores the request");
static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */

struct nchstats nchstats;               /* cache effectiveness statistics */

/* Runtime tunable exported as vfs.cache_fast_revlookup; enabled by default. */
static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");
 308
/* Taken by the thread shrinking the negative entry LRU. */
static struct mtx __exclusive_cache_line        ncneg_shrink_lock;

/*
 * A list of negative entries together with the mutex protecting it.
 * Entries are linked through nc_dst.
 */
struct neglist {
        struct mtx              nl_lock;
        TAILQ_HEAD(, namecache) nl_list;
} __aligned(CACHE_LINE_SIZE);

static struct neglist __read_mostly     *neglists;      /* cold lists, indexed by NCP2NEGLIST() */
static struct neglist ncneg_hot;                        /* the single hot list */
static u_long numhotneg;                                /* entries on the hot list */

/* ncneghash is the index mask for neglists; there are mask + 1 lists. */
#define ncneghash       3
#define numneglists     (ncneghash + 1)
 322 static inline struct neglist *
 323 NCP2NEGLIST(struct namecache *ncp)
 324 {
 325
 326         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 327 }
 328
 329 static inline struct negstate *
 330 NCP2NEGSTATE(struct namecache *ncp)
 331 {
 332
 333         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 334         return (&ncp->nc_neg);
 335 }
 336
/*
 * Bucket locks.  ncbuckethash is the index mask, so there are mask + 1 locks;
 * HASH2BUCKETLOCK() maps a hash value to the lock covering it.
 */
#define numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define HASH2BUCKETLOCK(hash) \
        ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

/*
 * Vnode locks.  ncvnodehash is the index mask, so there are mask + 1 locks;
 * see VP2VNODELOCK() for the vnode to lock mapping.
 */
#define numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
 346 static inline struct mtx *
 347 VP2VNODELOCK(struct vnode *vp)
 348 {
 349
 350         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 351 }
 352
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.
 *
 * The _ts zones hold struct namecache_ts items, the others plain
 * struct namecache items.  cache_alloc() and cache_free() select the zone
 * from the name length (CACHE_PATH_CUTOFF) and whether timestamps are kept.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;
 364
 365 static struct namecache *
 366 cache_alloc(int len, int ts)
 367 {
 368         struct namecache_ts *ncp_ts;
 369         struct namecache *ncp;
 370
 371         if (__predict_false(ts)) {
 372                 if (len <= CACHE_PATH_CUTOFF)
 373                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 374                 else
 375                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 376                 ncp = &ncp_ts->nc_nc;
 377         } else {
 378                 if (len <= CACHE_PATH_CUTOFF)
 379                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 380                 else
 381                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 382         }
 383         return (ncp);
 384 }
 385
 386 static void
 387 cache_free(struct namecache *ncp)
 388 {
 389         struct namecache_ts *ncp_ts;
 390
 391         MPASS(ncp != NULL);
 392         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 393                 vdrop(ncp->nc_dvp);
 394         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 395                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 396                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 397                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 398                 else
 399                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 400         } else {
 401                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 402                         uma_zfree_smr(cache_zone_small, ncp);
 403                 else
 404                         uma_zfree_smr(cache_zone_large, ncp);
 405         }
 406 }
 407
 408 static void
 409 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 410 {
 411         struct namecache_ts *ncp_ts;
 412
 413         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 414             (tsp == NULL && ticksp == NULL),
 415             ("No NCF_TS"));
 416
 417         if (tsp == NULL)
 418                 return;
 419
 420         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 421         *tsp = ncp_ts->nc_time;
 422         *ticksp = ncp_ts->nc_ticks;
 423 }
 424
 425 #ifdef DEBUG_CACHE
 426 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 427 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 428     "VFS namecache enabled");
 429 #endif
 430
 431 /* Export size information to userland */
 432 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 433     sizeof(struct namecache), "sizeof(struct namecache)");
 434
/*
 * The new name cache statistics
 *
 * Everything below is exported read-only under the vfs.cache sysctl node.
 * STATNODE_ULONG() exports an already declared variable of the same name;
 * STATNODE_COUNTER() additionally defines a counter(9) counter of that name.
 */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");
#define STATNODE_ULONG(name, descr)                                     \
        SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
#define STATNODE_COUNTER(name, descr)                                   \
        static COUNTER_U64_DEFINE_EARLY(name);                          \
        SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
            descr);
STATNODE_ULONG(numneg, "Number of negative cache entries");
STATNODE_ULONG(numcache, "Number of cache entries");
STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothits, "Number of '..' hits");
STATNODE_COUNTER(nummiss, "Number of cache misses");
STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
    "Number of successful removals after relocking");
/*
 * NOTE(review): the next three variables are declared long but exported with
 * SYSCTL_ULONG, and the two zap_and_exit_bucket_fail* nodes carry the same
 * description text, so they cannot be told apart from the description alone.
 */
static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
    "Number of times zap_and_exit failed to lock");
static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
    "Number of times zap_and_exit failed to lock");
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
STATNODE_ULONG(numhotneg, "Number of hot negative entries");
STATNODE_COUNTER(numneg_evicted,
    "Number of negative entries evicted when adding a new entry");
STATNODE_COUNTER(shrinking_skipped,
    "Number of times shrinking was already in progress");
 481
/* Forward declarations for functions defined later in this file. */
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, bool slash_prefixed, size_t addend);

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/* Bumped by cache_maybe_yield() each time it yields; a plain, unlocked counter. */
static int cache_yield;
SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
    "Number of times cache called yield");
 497
 498 static void __noinline
 499 cache_maybe_yield(void)
 500 {
 501
 502         if (should_yield()) {
 503                 cache_yield++;
 504                 kern_yield(PRI_USER);
 505         }
 506 }
 507
 508 static inline void
 509 cache_assert_vlp_locked(struct mtx *vlp)
 510 {
 511
 512         if (vlp != NULL)
 513                 mtx_assert(vlp, MA_OWNED);
 514 }
 515
 516 static inline void
 517 cache_assert_vnode_locked(struct vnode *vp)
 518 {
 519         struct mtx *vlp;
 520
 521         vlp = VP2VNODELOCK(vp);
 522         cache_assert_vlp_locked(vlp);
 523 }
 524
 525 /*
 526  * TODO: With the value stored we can do better than computing the hash based
 527  * on the address. The choice of FNV should also be revisited.
 528  */
 529 static void
 530 cache_prehash(struct vnode *vp)
 531 {
 532
 533         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 534 }
 535
 536 static uint32_t
 537 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 538 {
 539
 540         return (fnv_32_buf(name, len, dvp->v_nchash));
 541 }
 542
 543 static inline struct nchashhead *
 544 NCP2BUCKET(struct namecache *ncp)
 545 {
 546         uint32_t hash;
 547
 548         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 549         return (NCHHASH(hash));
 550 }
 551
 552 static inline struct mtx *
 553 NCP2BUCKETLOCK(struct namecache *ncp)
 554 {
 555         uint32_t hash;
 556
 557         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 558         return (HASH2BUCKETLOCK(hash));
 559 }
 560
 561 #ifdef INVARIANTS
 562 static void
 563 cache_assert_bucket_locked(struct namecache *ncp)
 564 {
 565         struct mtx *blp;
 566
 567         blp = NCP2BUCKETLOCK(ncp);
 568         mtx_assert(blp, MA_OWNED);
 569 }
 570
 571 static void
 572 cache_assert_bucket_unlocked(struct namecache *ncp)
 573 {
 574         struct mtx *blp;
 575
 576         blp = NCP2BUCKETLOCK(ncp);
 577         mtx_assert(blp, MA_NOTOWNED);
 578 }
 579 #else
 580 #define cache_assert_bucket_locked(x) do { } while (0)
 581 #define cache_assert_bucket_unlocked(x) do { } while (0)
 582 #endif
 583
 584 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 585 static void
 586 _cache_sort_vnodes(void **p1, void **p2)
 587 {
 588         void *tmp;
 589
 590         MPASS(*p1 != NULL || *p2 != NULL);
 591
 592         if (*p1 > *p2) {
 593                 tmp = *p2;
 594                 *p2 = *p1;
 595                 *p1 = tmp;
 596         }
 597 }
 598
 599 static void
 600 cache_lock_all_buckets(void)
 601 {
 602         u_int i;
 603
 604         for (i = 0; i < numbucketlocks; i++)
 605                 mtx_lock(&bucketlocks[i]);
 606 }
 607
 608 static void
 609 cache_unlock_all_buckets(void)
 610 {
 611         u_int i;
 612
 613         for (i = 0; i < numbucketlocks; i++)
 614                 mtx_unlock(&bucketlocks[i]);
 615 }
 616
 617 static void
 618 cache_lock_all_vnodes(void)
 619 {
 620         u_int i;
 621
 622         for (i = 0; i < numvnodelocks; i++)
 623                 mtx_lock(&vnodelocks[i]);
 624 }
 625
 626 static void
 627 cache_unlock_all_vnodes(void)
 628 {
 629         u_int i;
 630
 631         for (i = 0; i < numvnodelocks; i++)
 632                 mtx_unlock(&vnodelocks[i]);
 633 }
 634
 635 static int
 636 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 637 {
 638
 639         cache_sort_vnodes(&vlp1, &vlp2);
 640
 641         if (vlp1 != NULL) {
 642                 if (!mtx_trylock(vlp1))
 643                         return (EAGAIN);
 644         }
 645         if (!mtx_trylock(vlp2)) {
 646                 if (vlp1 != NULL)
 647                         mtx_unlock(vlp1);
 648                 return (EAGAIN);
 649         }
 650
 651         return (0);
 652 }
 653
 654 static void
 655 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 656 {
 657
 658         MPASS(vlp1 != NULL || vlp2 != NULL);
 659         MPASS(vlp1 <= vlp2);
 660
 661         if (vlp1 != NULL)
 662                 mtx_lock(vlp1);
 663         if (vlp2 != NULL)
 664                 mtx_lock(vlp2);
 665 }
 666
 667 static void
 668 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 669 {
 670
 671         MPASS(vlp1 != NULL || vlp2 != NULL);
 672
 673         if (vlp1 != NULL)
 674                 mtx_unlock(vlp1);
 675         if (vlp2 != NULL)
 676                 mtx_unlock(vlp2);
 677 }
 678
 679 static int
 680 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 681 {
 682         struct nchstats snap;
 683
 684         if (req->oldptr == NULL)
 685                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 686
 687         snap = nchstats;
 688         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 689         snap.ncs_neghits = counter_u64_fetch(numneghits);
 690         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 691             counter_u64_fetch(numnegzaps);
 692         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 693             counter_u64_fetch(nummiss);
 694
 695         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 696 }
 697 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 698     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 699     "VFS cache effectiveness statistics");
 700
 701 #ifdef DIAGNOSTIC
 702 /*
 703  * Grab an atomic snapshot of the name cache hash chain lengths
 704  */
 705 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 706     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 707     "hash table stats");
 708
 709 static int
 710 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 711 {
 712         struct nchashhead *ncpp;
 713         struct namecache *ncp;
 714         int i, error, n_nchash, *cntbuf;
 715
 716 retry:
 717         n_nchash = nchash + 1;  /* nchash is max index, not count */
 718         if (req->oldptr == NULL)
 719                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 720         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 721         cache_lock_all_buckets();
 722         if (n_nchash != nchash + 1) {
 723                 cache_unlock_all_buckets();
 724                 free(cntbuf, M_TEMP);
 725                 goto retry;
 726         }
 727         /* Scan hash tables counting entries */
 728         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 729                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 730                         cntbuf[i]++;
 731         cache_unlock_all_buckets();
 732         for (error = 0, i = 0; i < n_nchash; i++)
 733                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 734                         break;
 735         free(cntbuf, M_TEMP);
 736         return (error);
 737 }
 738 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 739     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 740     "nchash chain lengths");
 741
 742 static int
 743 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 744 {
 745         int error;
 746         struct nchashhead *ncpp;
 747         struct namecache *ncp;
 748         int n_nchash;
 749         int count, maxlength, used, pct;
 750
 751         if (!req->oldptr)
 752                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 753
 754         cache_lock_all_buckets();
 755         n_nchash = nchash + 1;  /* nchash is max index, not count */
 756         used = 0;
 757         maxlength = 0;
 758
 759         /* Scan hash tables for applicable entries */
 760         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 761                 count = 0;
 762                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 763                         count++;
 764                 }
 765                 if (count)
 766                         used++;
 767                 if (maxlength < count)
 768                         maxlength = count;
 769         }
 770         n_nchash = nchash + 1;
 771         cache_unlock_all_buckets();
 772         pct = (used * 100) / (n_nchash / 100);
 773         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 774         if (error)
 775                 return (error);
 776         error = SYSCTL_OUT(req, &used, sizeof(used));
 777         if (error)
 778                 return (error);
 779         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 780         if (error)
 781                 return (error);
 782         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 783         if (error)
 784                 return (error);
 785         return (0);
 786 }
 787 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 788     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 789     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 790 #endif
 791
 792 /*
 793  * Negative entries management
 794  *
 795  * A variation of LRU scheme is used. New entries are hashed into one of
 796  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 797  *
 798  * The shrinker will demote hot list head and evict from the cold list in a
 799  * round-robin manner.
 800  */
 801 static void
 802 cache_negative_init(struct namecache *ncp)
 803 {
 804         struct negstate *negstate;
 805
 806         ncp->nc_flag |= NCF_NEGATIVE;
 807         negstate = NCP2NEGSTATE(ncp);
 808         negstate->neg_flag = 0;
 809 }
 810
 811 static void
 812 cache_negative_hit(struct namecache *ncp)
 813 {
 814         struct neglist *neglist;
 815         struct negstate *negstate;
 816
 817         negstate = NCP2NEGSTATE(ncp);
 818         if ((negstate->neg_flag & NEG_HOT) != 0)
 819                 return;
 820         neglist = NCP2NEGLIST(ncp);
 821         mtx_lock(&ncneg_hot.nl_lock);
 822         mtx_lock(&neglist->nl_lock);
 823         if ((negstate->neg_flag & NEG_HOT) == 0) {
 824                 numhotneg++;
 825                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 826                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 827                 negstate->neg_flag |= NEG_HOT;
 828         }
 829         mtx_unlock(&neglist->nl_lock);
 830         mtx_unlock(&ncneg_hot.nl_lock);
 831 }
 832
 833 static void
 834 cache_negative_insert(struct namecache *ncp)
 835 {
 836         struct neglist *neglist;
 837
 838         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 839         cache_assert_bucket_locked(ncp);
 840         neglist = NCP2NEGLIST(ncp);
 841         mtx_lock(&neglist->nl_lock);
 842         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 843         mtx_unlock(&neglist->nl_lock);
 844         atomic_add_rel_long(&numneg, 1);
 845 }
 846
/*
 * Unlink a negative entry from whichever negative list it is on (the hot
 * list or its cold list) and decrement the global negative entry count.
 *
 * The caller must hold the entry's bucket lock.  NEG_HOT is sampled before
 * any list lock is taken, so it is tested again after locking and any
 * additionally needed lock is acquired; hot_locked/list_locked track which
 * locks have to be released at the end.
 */
static void
cache_negative_remove(struct namecache *ncp)
{
        struct neglist *neglist;
        struct negstate *negstate;
        bool hot_locked = false;
        bool list_locked = false;

        cache_assert_bucket_locked(ncp);
        neglist = NCP2NEGLIST(ncp);
        negstate = NCP2NEGSTATE(ncp);
        if ((negstate->neg_flag & NEG_HOT) != 0) {
                hot_locked = true;
                mtx_lock(&ncneg_hot.nl_lock);
                /*
                 * The entry is no longer hot by the time the hot list lock
                 * was obtained; the cold list lock is needed as well.
                 */
                if ((negstate->neg_flag & NEG_HOT) == 0) {
                        list_locked = true;
                        mtx_lock(&neglist->nl_lock);
                }
        } else {
                list_locked = true;
                mtx_lock(&neglist->nl_lock);
                /*
                 * We may be racing against promotion in lockless lookup.
                 * If the entry turned hot, drop the cold list lock and take
                 * both locks with the hot list lock first.
                 */
                if ((negstate->neg_flag & NEG_HOT) != 0) {
                        mtx_unlock(&neglist->nl_lock);
                        hot_locked = true;
                        mtx_lock(&ncneg_hot.nl_lock);
                        mtx_lock(&neglist->nl_lock);
                }
        }
        /* Remove from the list matching the flag as seen with locks held. */
        if ((negstate->neg_flag & NEG_HOT) != 0) {
                mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
                TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
                numhotneg--;
        } else {
                mtx_assert(&neglist->nl_lock, MA_OWNED);
                TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
        }
        if (list_locked)
                mtx_unlock(&neglist->nl_lock);
        if (hot_locked)
                mtx_unlock(&ncneg_hot.nl_lock);
        atomic_subtract_rel_long(&numneg, 1);
}
 892
 893 static void
 894 cache_negative_shrink_select(struct namecache **ncpp,
 895     struct neglist **neglistpp)
 896 {
 897         struct neglist *neglist;
 898         struct namecache *ncp;
 899         static u_int cycle;
 900         u_int i;
 901
 902         *ncpp = ncp = NULL;
 903
 904         for (i = 0; i < numneglists; i++) {
 905                 neglist = &neglists[(cycle + i) % numneglists];
 906                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 907                         continue;
 908                 mtx_lock(&neglist->nl_lock);
 909                 ncp = TAILQ_FIRST(&neglist->nl_list);
 910                 if (ncp != NULL)
 911                         break;
 912                 mtx_unlock(&neglist->nl_lock);
 913         }
 914
 915         *neglistpp = neglist;
 916         *ncpp = ncp;
 917         cycle++;
 918 }
 919
/*
 * Shrinker step for negative entries: demote the head of the hot list to a
 * cold list, then try to evict the head of one cold list.
 *
 * Only one thread shrinks at a time; if ncneg_shrink_lock is owned or cannot
 * be obtained without blocking, the attempt is counted in shrinking_skipped
 * and abandoned.
 */
static void
cache_negative_zap_one(void)
{
        struct namecache *ncp, *ncp2;
        struct neglist *neglist;
        struct negstate *negstate;
        struct mtx *dvlp;
        struct mtx *blp;

        if (mtx_owner(&ncneg_shrink_lock) != NULL ||
            !mtx_trylock(&ncneg_shrink_lock)) {
                counter_u64_add(shrinking_skipped, 1);
                return;
        }

        /* Demote the hot list head, if any, to the tail of its cold list. */
        mtx_lock(&ncneg_hot.nl_lock);
        ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
        if (ncp != NULL) {
                neglist = NCP2NEGLIST(ncp);
                negstate = NCP2NEGSTATE(ncp);
                mtx_lock(&neglist->nl_lock);
                MPASS((negstate->neg_flag & NEG_HOT) != 0);
                TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
                TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
                negstate->neg_flag &= ~NEG_HOT;
                numhotneg--;
                mtx_unlock(&neglist->nl_lock);
        }
        mtx_unlock(&ncneg_hot.nl_lock);

        /* On success this returns with neglist->nl_lock held. */
        cache_negative_shrink_select(&ncp, &neglist);

        mtx_unlock(&ncneg_shrink_lock);
        if (ncp == NULL)
                return;

        MPASS(ncp->nc_flag & NCF_NEGATIVE);
        /*
         * Record which vnode and bucket locks the candidate needs while the
         * list lock still pins it, then drop the list lock before taking
         * them (elsewhere the list lock is acquired with the bucket lock
         * already held).
         */
        dvlp = VP2VNODELOCK(ncp->nc_dvp);
        blp = NCP2BUCKETLOCK(ncp);
        mtx_unlock(&neglist->nl_lock);
        mtx_lock(dvlp);
        mtx_lock(blp);
        /*
         * Enter SMR to safely check the negative list.
         * Even if the found pointer matches, the entry may now be reallocated
         * and used by a different vnode.
         */
        vfs_smr_enter();
        ncp2 = TAILQ_FIRST(&neglist->nl_list);
        if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
            blp != NCP2BUCKETLOCK(ncp2)) {
                /* The candidate changed while unlocked; evict nothing. */
                vfs_smr_exit();
                ncp = NULL;
        } else {
                vfs_smr_exit();
                SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                cache_zap_locked(ncp);
                counter_u64_add(numneg_evicted, 1);
        }
        mtx_unlock(blp);
        mtx_unlock(dvlp);
        /* The entry is freed only after all locks have been released. */
        if (ncp != NULL)
                cache_free(ncp);
}
 985
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller must hold the bucket lock of the entry, the vnode lock of
 *   nc_dvp and, for positive entries, the vnode lock of nc_vp.  The entry is
 *   only unlinked here, not freed; callers in this file pass it to
 *   cache_free() once their locks are dropped.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
        struct nchashhead *ncpp;

        if (!(ncp->nc_flag & NCF_NEGATIVE))
                cache_assert_vnode_locked(ncp->nc_vp);
        cache_assert_vnode_locked(ncp->nc_dvp);
        cache_assert_bucket_locked(ncp);

        /* Invalidate the entry before it is taken off the hash chain. */
        cache_ncp_invalidate(ncp);

        ncpp = NCP2BUCKET(ncp);
        CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
        if (!(ncp->nc_flag & NCF_NEGATIVE)) {
                SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
                    ncp->nc_name, ncp->nc_vp);
                /* Positive entry: unlink from the target vnode. */
                TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
                if (ncp == ncp->nc_vp->v_cache_dd) {
                        vn_seqc_write_begin_unheld(ncp->nc_vp);
                        ncp->nc_vp->v_cache_dd = NULL;
                        vn_seqc_write_end(ncp->nc_vp);
                }
        } else {
                SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                /* Negative entry: unlink from the hot or cold list. */
                cache_negative_remove(ncp);
        }
        if (ncp->nc_flag & NCF_ISDOTDOT) {
                /* A ".." entry is referenced through nc_dvp's v_cache_dd. */
                if (ncp == ncp->nc_dvp->v_cache_dd) {
                        vn_seqc_write_begin_unheld(ncp->nc_dvp);
                        ncp->nc_dvp->v_cache_dd = NULL;
                        vn_seqc_write_end(ncp->nc_dvp);
                }
        } else {
                LIST_REMOVE(ncp, nc_src);
                /*
                 * This was the last entry sourced from the directory: flag
                 * the entry with NCF_DVDROP and account for one less
                 * directory with cached names.
                 */
                if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
                        ncp->nc_flag |= NCF_DVDROP;
                        counter_u64_add(numcachehv, -1);
                }
        }
        atomic_subtract_rel_long(&numcache, 1);
}
1035
1036 static void
1037 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1038 {
1039         struct mtx *blp;
1040
1041         MPASS(ncp->nc_dvp == vp);
1042         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1043         cache_assert_vnode_locked(vp);
1044
1045         blp = NCP2BUCKETLOCK(ncp);
1046         mtx_lock(blp);
1047         cache_zap_locked(ncp);
1048         mtx_unlock(blp);
1049 }
1050
/*
 * Zap an entry while keeping the lock of vp held.
 *
 * The caller holds the vnode lock of vp, which is either nc_dvp or nc_vp of
 * the entry.  *vlpp is either NULL or an additional vnode lock the caller
 * holds, typically left over from a previous failed attempt.
 *
 * Returns true when the entry was zapped; *vlpp is then NULL and the lock it
 * named has been released.  Returns false when the second vnode lock could
 * not be obtained without blocking; in that case both vnode locks have been
 * taken in sorted order, *vlpp names the one that is not vp's, nothing was
 * zapped and the caller is expected to re-validate and retry.  The lock of
 * vp is held on return in both cases.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        /* Negative entries need no second vnode lock. */
        if (ncp->nc_flag & NCF_NEGATIVE) {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                return (true);
        }

        pvlp = VP2VNODELOCK(vp);
        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);

        if (*vlpp == vlp1 || *vlpp == vlp2) {
                /* The extra lock already held is one this entry needs. */
                to_unlock = *vlpp;
                *vlpp = NULL;
        } else {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_sort_vnodes(&vlp1, &vlp2);
                if (vlp1 == pvlp) {
                        /* The held lock sorts first; blocking is fine. */
                        mtx_lock(vlp2);
                        to_unlock = vlp2;
                } else {
                        /*
                         * The held lock sorts second; only a trylock can be
                         * attempted on the first one without dropping it.
                         */
                        if (!mtx_trylock(vlp1))
                                goto out_relock;
                        to_unlock = vlp1;
                }
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        if (to_unlock != NULL)
                mtx_unlock(to_unlock);
        return (true);

out_relock:
        /* Here vlp2 is the lock of vp: release it and take both in order. */
        mtx_unlock(vlp2);
        mtx_lock(vlp1);
        mtx_lock(vlp2);
        MPASS(*vlpp == NULL);
        *vlpp = vlp1;
        return (false);
}
1108
/*
 * Zap an entry given that the caller holds the vnode lock of vp, which is
 * either nc_dvp or nc_vp of the entry.
 *
 * The lock of vp is released on every return path.  Returns 0 when the entry
 * was zapped (the caller still has to free it) and EAGAIN when the other
 * vnode lock could not be obtained without blocking, in which case nothing
 * was zapped and the caller has to start over.
 */
static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;
        int error = 0;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        pvlp = VP2VNODELOCK(vp);
        /* Negative entries only need the already held directory lock. */
        if (ncp->nc_flag & NCF_NEGATIVE) {
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                goto out;
        }

        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);
        cache_sort_vnodes(&vlp1, &vlp2);
        if (vlp1 == pvlp) {
                /* The held lock sorts first; the second may be waited for. */
                mtx_lock(vlp2);
                to_unlock = vlp2;
        } else {
                if (!mtx_trylock(vlp1)) {
                        /*
                         * TODO: Very wasteful but rare.
                         *
                         * Drop the held lock, then take and release both
                         * locks in sorted order before reporting EAGAIN;
                         * presumably this waits out the current holder so
                         * that the caller's retry is likely to succeed.
                         */
                        mtx_unlock(pvlp);
                        mtx_lock(vlp1);
                        mtx_lock(vlp2);
                        mtx_unlock(vlp2);
                        mtx_unlock(vlp1);
                        return (EAGAIN);
                }
                to_unlock = vlp1;
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        mtx_unlock(to_unlock);
out:
        mtx_unlock(pvlp);
        return (error);
}
1154
1155 /*
1156  * If trylocking failed we can get here. We know enough to take all needed locks
1157  * in the right order and re-lookup the entry.
1158  */
1159 static int
1160 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1161     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1162     struct mtx *blp)
1163 {
1164         struct namecache *rncp;
1165
1166         cache_assert_bucket_unlocked(ncp);
1167
1168         cache_sort_vnodes(&dvlp, &vlp);
1169         cache_lock_vnodes(dvlp, vlp);
1170         mtx_lock(blp);
1171         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1172                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1173                     rncp->nc_nlen == cnp->cn_namelen &&
1174                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1175                         break;
1176         }
1177         if (rncp != NULL) {
1178                 cache_zap_locked(rncp);
1179                 mtx_unlock(blp);
1180                 cache_unlock_vnodes(dvlp, vlp);
1181                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1182                 return (0);
1183         }
1184
1185         mtx_unlock(blp);
1186         cache_unlock_vnodes(dvlp, vlp);
1187         return (EAGAIN);
1188 }
1189
1190 static int __noinline
1191 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1192     uint32_t hash, struct mtx *blp)
1193 {
1194         struct mtx *dvlp, *vlp;
1195         struct vnode *dvp;
1196
1197         cache_assert_bucket_locked(ncp);
1198
1199         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1200         vlp = NULL;
1201         if (!(ncp->nc_flag & NCF_NEGATIVE))
1202                 vlp = VP2VNODELOCK(ncp->nc_vp);
1203         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1204                 cache_zap_locked(ncp);
1205                 mtx_unlock(blp);
1206                 cache_unlock_vnodes(dvlp, vlp);
1207                 return (0);
1208         }
1209
1210         dvp = ncp->nc_dvp;
1211         mtx_unlock(blp);
1212         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1213 }
1214
/*
 * Zap an entry with the bucket lock blp held and kept.
 *
 * *vlpp1 and *vlpp2 name the vnode locks currently held by the caller (or
 * NULL).  Returns 0 when the entry was zapped; no vnode locks are held then
 * and both pointers are NULL.  Returns EAGAIN when the required vnode locks
 * could not be obtained without blocking; in that case the bucket lock was
 * dropped, the required vnode locks were taken in sorted order, the bucket
 * lock was retaken and the held vnode locks are reported through *vlpp1 and
 * *vlpp2.  The entry was not zapped and has to be looked up again by the
 * caller since the bucket was unlocked in between.
 */
static int
cache_zap_locked_bucket_kl(struct namecache *ncp, struct mtx *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
        struct mtx *dvlp, *vlp;

        cache_assert_bucket_locked(ncp);

        dvlp = VP2VNODELOCK(ncp->nc_dvp);
        vlp = NULL;
        if (!(ncp->nc_flag & NCF_NEGATIVE))
                vlp = VP2VNODELOCK(ncp->nc_vp);
        cache_sort_vnodes(&dvlp, &vlp);

        /* The caller already holds exactly the locks this entry needs. */
        if (*vlpp1 == dvlp && *vlpp2 == vlp) {
                cache_zap_locked(ncp);
                cache_unlock_vnodes(dvlp, vlp);
                *vlpp1 = NULL;
                *vlpp2 = NULL;
                return (0);
        }

        /* Wrong set of locks: release whatever is held and start over. */
        if (*vlpp1 != NULL)
                mtx_unlock(*vlpp1);
        if (*vlpp2 != NULL)
                mtx_unlock(*vlpp2);
        *vlpp1 = NULL;
        *vlpp2 = NULL;

        if (cache_trylock_vnodes(dvlp, vlp) == 0) {
                cache_zap_locked(ncp);
                cache_unlock_vnodes(dvlp, vlp);
                return (0);
        }

        /*
         * Block for the vnode locks with the bucket unlocked.  Only the
         * first slot is checked for NULL; after sorting, the missing lock of
         * a negative entry presumably ends up there.
         */
        mtx_unlock(blp);
        *vlpp1 = dvlp;
        *vlpp2 = vlp;
        if (*vlpp1 != NULL)
                mtx_lock(*vlpp1);
        mtx_lock(*vlpp2);
        mtx_lock(blp);
        return (EAGAIN);
}
1259
/*
 * Remove the cache entry for the name in cnp within directory dvp.
 *
 * Returns 1 if an entry was removed (or, for "..", the cached pointer was
 * cleared) and 0 if there was nothing to remove.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        struct mtx *dvlp, *dvlp2;
        uint32_t hash;
        int error;

        /* ".." is not found through the hash but through dvp->v_cache_dd. */
        if (cnp->cn_namelen == 2 &&
            cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
                dvlp = VP2VNODELOCK(dvp);
                dvlp2 = NULL;
                mtx_lock(dvlp);
retry_dotdot:
                ncp = dvp->v_cache_dd;
                if (ncp == NULL) {
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
                        return (0);
                }
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                        /*
                         * A real ".." entry: zap it.  On failure the helper
                         * leaves an extra vnode lock in dvlp2 and the
                         * pointer is re-read before trying again.
                         */
                        if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
                                goto retry_dotdot;
                        MPASS(dvp->v_cache_dd == NULL);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        cache_free(ncp);
                } else {
                        /*
                         * v_cache_dd refers to an entry that is not a ".."
                         * entry: only the pointer is cleared, the entry
                         * itself stays in the cache.
                         */
                        vn_seqc_write_begin(dvp);
                        dvp->v_cache_dd = NULL;
                        vn_seqc_write_end(dvp);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                }
                SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
                return (1);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        blp = HASH2BUCKETLOCK(hash);
retry:
        /* Unlocked peek: skip taking the bucket lock for an empty chain. */
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                goto out_no_entry;

        mtx_lock(blp);

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (ncp == NULL) {
                mtx_unlock(blp);
                goto out_no_entry;
        }

        /* Releases blp whether or not the entry could be zapped. */
        error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
        if (__predict_false(error != 0)) {
                zap_and_exit_bucket_fail++;
                goto retry;
        }
        counter_u64_add(numposzaps, 1);
        SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
        cache_free(ncp);
        return (1);
out_no_entry:
        counter_u64_add(nummisszap, 1);
        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
        return (0);
}
1336
1337 static int __noinline
1338 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1339     struct timespec *tsp, int *ticksp)
1340 {
1341         int ltype;
1342
1343         *vpp = dvp;
1344         counter_u64_add(dothits, 1);
1345         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1346         if (tsp != NULL)
1347                 timespecclear(tsp);
1348         if (ticksp != NULL)
1349                 *ticksp = ticks;
1350         vrefact(*vpp);
1351         /*
1352          * When we lookup "." we still can be asked to lock it
1353          * differently...
1354          */
1355         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1356         if (ltype != VOP_ISLOCKED(*vpp)) {
1357                 if (ltype == LK_EXCLUSIVE) {
1358                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1359                         if (VN_IS_DOOMED((*vpp))) {
1360                                 /* forced unmount */
1361                                 vrele(*vpp);
1362                                 *vpp = NULL;
1363                                 return (ENOENT);
1364                         }
1365                 } else
1366                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1367         }
1368         return (-1);
1369 }
1370
/*
 * Resolve ".." in dvp using dvp->v_cache_dd.
 *
 * Without MAKEENTRY the cached entry is removed instead and 0 is returned.
 * Otherwise returns -1 on a positive hit with *vpp referenced and locked as
 * requested by cnp->cn_lkflags, ENOENT on a negative hit or when dvp got
 * doomed while its lock was dropped, and 0 on a miss or after purging a
 * negative entry for a CREATE of the last component.
 */
static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache_ts *ncp_ts;
        struct namecache *ncp;
        struct mtx *dvlp;
        enum vgetstate vs;
        int error, ltype;
        bool whiteout;

        MPASS((cnp->cn_flags & ISDOTDOT) != 0);

        if ((cnp->cn_flags & MAKEENTRY) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        counter_u64_add(dotdothits, 1);
retry:
        dvlp = VP2VNODELOCK(dvp);
        mtx_lock(dvlp);
        ncp = dvp->v_cache_dd;
        if (ncp == NULL) {
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
                mtx_unlock(dvlp);
                return (0);
        }
        /*
         * A ".." entry names the parent in nc_vp (or is negative); any other
         * entry found through v_cache_dd has the parent in nc_dvp.
         */
        if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                if (ncp->nc_flag & NCF_NEGATIVE)
                        *vpp = NULL;
                else
                        *vpp = ncp->nc_vp;
        } else
                *vpp = ncp->nc_dvp;
        if (*vpp == NULL)
                goto negative_success;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        /*
         * For a timestamped entry that is not a ".." entry, report the
         * separately stored ".." timestamp instead.
         */
        if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
            NCF_DTS && tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                *tsp = ncp_ts->nc_dotdottime;
        }

        MPASS(dvp != *vpp);
        /*
         * dvp is unlocked while the parent is being locked and relocked with
         * its previous lock type afterwards.
         */
        ltype = VOP_ISLOCKED(dvp);
        VOP_UNLOCK(dvp);
        vs = vget_prep(*vpp);
        mtx_unlock(dvlp);
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        vn_lock(dvp, ltype | LK_RETRY);
        if (VN_IS_DOOMED(dvp)) {
                /* dvp went away meanwhile; give up the parent if we got it. */
                if (error == 0)
                        vput(*vpp);
                *vpp = NULL;
                return (ENOENT);
        }
        if (error) {
                *vpp = NULL;
                goto retry;
        }
        return (-1);
negative_success:
        /*
         * A negative entry in the way of creating the last component is
         * purged.  dvlp is held here and cache_zap_locked_vnode() releases
         * it on both success and failure.
         */
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        counter_u64_add(numnegzaps, 1);
                        error = cache_zap_locked_vnode(ncp, dvp);
                        if (__predict_false(error != 0)) {
                                zap_and_exit_bucket_fail2++;
                                goto retry;
                        }
                        cache_free(ncp);
                        return (0);
                }
        }

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
        cache_out_ts(ncp, tsp, ticksp);
        counter_u64_add(numneghits, 1);
        /* Sample the whiteout flag before the vnode lock is dropped. */
        whiteout = (ncp->nc_flag & NCF_WHITE);
        cache_negative_hit(ncp);
        mtx_unlock(dvlp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
}
1458
1459 /**
1460  * Lookup a name in the name cache
1461  *
1462  * # Arguments
1463  *
1464  * - dvp:       Parent directory in which to search.
1465  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1466  * - cnp:       Parameters of the name search.  The most interesting bits of
1467  *              the cn_flags field have the following meanings:
1468  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1469  *                      it up.
1470  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1471  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1472  *              or negative) lookup, tsp will be filled with any timespec that
1473  *              was stored when this cache entry was created.  However, it will
1474  *              be clear for "." entries.
1475  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1476  *              (positive or negative) lookup, it will contain the ticks value
1477  *              that was current when the cache entry was created, unless cnp
1478  *              was ".".
1479  *
1480  * Either both tsp and ticks have to be provided or neither of them.
1481  *
1482  * # Returns
1483  *
1484  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1485  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1486  *              to a forced unmount.  vpp will not be modified.  If the entry
1487  *              is a whiteout, then the ISWHITEOUT flag will be set in
1488  *              cnp->cn_flags.
1489  * - 0:         A cache miss.  vpp will not be modified.
1490  *
1491  * # Locking
1492  *
1493  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1494  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1495  * lock is not recursively acquired.
1496  */
1497 static int __noinline
1498 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1499     struct timespec *tsp, int *ticksp)
1500 {
1501         struct namecache *ncp;
1502         struct mtx *blp;
1503         uint32_t hash;
1504         enum vgetstate vs;
1505         int error;
1506         bool whiteout;
1507
1508         MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1509
1510 retry:
1511         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1512         blp = HASH2BUCKETLOCK(hash);
1513         mtx_lock(blp);
1514
1515         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1516                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1517                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1518                         break;
1519         }
1520
1521         if (__predict_false(ncp == NULL)) {
1522                 mtx_unlock(blp);
1523                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1524                     NULL);
1525                 counter_u64_add(nummiss, 1);
1526                 return (0);
1527         }
1528
1529         if (ncp->nc_flag & NCF_NEGATIVE)
1530                 goto negative_success;
1531
1532         counter_u64_add(numposhits, 1);
1533         *vpp = ncp->nc_vp;
1534         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1535         cache_out_ts(ncp, tsp, ticksp);
1536         MPASS(dvp != *vpp);
1537         vs = vget_prep(*vpp);
1538         mtx_unlock(blp);
1539         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1540         if (error) {
1541                 *vpp = NULL;
1542                 goto retry;
1543         }
1544         return (-1);
1545 negative_success:
1546         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1547                 if (cnp->cn_flags & ISLASTCN) {
1548                         counter_u64_add(numnegzaps, 1);
1549                         error = cache_zap_locked_vnode(ncp, dvp);
1550                         if (__predict_false(error != 0)) {
1551                                 zap_and_exit_bucket_fail2++;
1552                                 goto retry;
1553                         }
1554                         cache_free(ncp);
1555                         return (0);
1556                 }
1557         }
1558
1559         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1560         cache_out_ts(ncp, tsp, ticksp);
1561         counter_u64_add(numneghits, 1);
1562         whiteout = (ncp->nc_flag & NCF_WHITE);
1563         cache_negative_hit(ncp);
1564         mtx_unlock(blp);
1565         if (whiteout)
1566                 cnp->cn_flags |= ISWHITEOUT;
1567         return (ENOENT);
1568 }
1569
/*
 * Name cache lookup entry point; see the interface description preceding
 * cache_lookup_fallback() for arguments and return values.
 *
 * "." and ".." are dispatched to dedicated helpers and a lookup without
 * MAKEENTRY removes the entry instead.  Everything else is first attempted
 * without taking any cache locks inside an SMR section; whenever the result
 * cannot be trusted or needs locks to complete, the lookup is redone by
 * cache_lookup_fallback().
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache *ncp;
        struct negstate *negstate;
        uint32_t hash;
        enum vgetstate vs;
        int error;
        bool whiteout;
        u_short nc_flag;

        MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
        /* Caching disabled: report a miss and prevent entry creation. */
        if (__predict_false(!doingcache)) {
                cnp->cn_flags &= ~MAKEENTRY;
                return (0);
        }
#endif

        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                if (cnp->cn_namelen == 1)
                        return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
                        return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
        }

        MPASS((cnp->cn_flags & ISDOTDOT) == 0);

        if ((cnp->cn_flags & MAKEENTRY) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        /* The hash chain is walked without the bucket lock from here on. */
        vfs_smr_enter();

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (__predict_false(ncp == NULL)) {
                vfs_smr_exit();
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
                    NULL);
                counter_u64_add(nummiss, 1);
                return (0);
        }

        nc_flag = atomic_load_char(&ncp->nc_flag);
        if (nc_flag & NCF_NEGATIVE)
                goto negative_success;

        counter_u64_add(numposhits, 1);
        *vpp = ncp->nc_vp;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        MPASS(dvp != *vpp);
        /*
         * Only after the fields have been read is it checked whether the
         * entry was still usable; if not, redo the lookup with locks.
         */
        if (!cache_ncp_canuse(ncp)) {
                vfs_smr_exit();
                *vpp = NULL;
                goto out_fallback;
        }
        vs = vget_prep_smr(*vpp);
        vfs_smr_exit();
        if (__predict_false(vs == VGET_NONE)) {
                *vpp = NULL;
                goto out_fallback;
        }
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        if (error) {
                *vpp = NULL;
                goto out_fallback;
        }
        return (-1);
negative_success:
        /* Purging the entry requires locks; leave it to the fallback. */
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        vfs_smr_exit();
                        goto out_fallback;
                }
        }

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
        cache_out_ts(ncp, tsp, ticksp);
        counter_u64_add(numneghits, 1);
        whiteout = (ncp->nc_flag & NCF_WHITE);
        /*
         * TODO: We need to take locks to promote an entry. Code doing it
         * in SMR lookup can be modified to be shared.
         *
         * Until then only entries which are already hot are served here;
         * cold ones go through the fallback, which promotes them.
         */
        negstate = NCP2NEGSTATE(ncp);
        if ((negstate->neg_flag & NEG_HOT) == 0 ||
            !cache_ncp_canuse(ncp)) {
                vfs_smr_exit();
                goto out_fallback;
        }
        vfs_smr_exit();
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
out_fallback:
        return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
1677
/*
 * Bookkeeping of the locks held by one multi-lock cache operation: up to
 * three vnode locks (vlp) and up to two further locks (blp, presumably hash
 * bucket locks, as that is what "blp" denotes elsewhere in this file).
 * A NULL slot means no lock is recorded there.
 */
struct celockstate {
        struct mtx *vlp[3];
        struct mtx *blp[2];
};
/* Code handling this structure relies on these exact array sizes. */
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1684
/*
 * Reset a lock state so that no locks are recorded: every slot in both
 * arrays becomes NULL.
 */
static inline void
cache_celockstate_init(struct celockstate *cel)
{

        bzero(cel, sizeof(*cel));
}
1691
1692 static void
1693 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1694     struct vnode *dvp)
1695 {
1696         struct mtx *vlp1, *vlp2;
1697
1698         MPASS(cel->vlp[0] == NULL);
1699         MPASS(cel->vlp[1] == NULL);
1700         MPASS(cel->vlp[2] == NULL);
1701
1702         MPASS(vp != NULL || dvp != NULL);
1703
1704         vlp1 = VP2VNODELOCK(vp);
1705         vlp2 = VP2VNODELOCK(dvp);
1706         cache_sort_vnodes(&vlp1, &vlp2);
1707
1708         if (vlp1 != NULL) {
1709                 mtx_lock(vlp1);
1710                 cel->vlp[0] = vlp1;
1711         }
1712         mtx_lock(vlp2);
1713         cel->vlp[1] = vlp2;
1714 }
1715
1716 static void
1717 cache_unlock_vnodes_cel(struct celockstate *cel)
1718 {
1719
1720         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1721
1722         if (cel->vlp[0] != NULL)
1723                 mtx_unlock(cel->vlp[0]);
1724         if (cel->vlp[1] != NULL)
1725                 mtx_unlock(cel->vlp[1]);
1726         if (cel->vlp[2] != NULL)
1727                 mtx_unlock(cel->vlp[2]);
1728 }
1729
/*
 * Acquire a third vnode lock (the one backing vp) on top of the locks
 * already recorded in cel, honouring the address-based lock order.
 *
 * Returns true if the new lock was obtained while the existing locks stayed
 * held.  Returns false if the existing locks had to be dropped and were then
 * re-taken together with the new one; in that case the caller must
 * re-validate anything it looked at under the old locks.  Either way the
 * new lock ends up recorded in cel->vlp[2].
 */
static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
        struct mtx *vlp;
        bool ret;

        cache_assert_vlp_locked(cel->vlp[0]);
        cache_assert_vlp_locked(cel->vlp[1]);
        MPASS(cel->vlp[2] == NULL);

        MPASS(vp != NULL);
        vlp = VP2VNODELOCK(vp);

        ret = true;
        if (vlp >= cel->vlp[1]) {
                /* Not below the highest lock held: blocking is in order. */
                mtx_lock(vlp);
        } else {
                /* Out of order: only a non-blocking attempt is made first. */
                if (mtx_trylock(vlp))
                        goto out;
                cache_lock_vnodes_cel_3_failures++;
                /* Drop everything and re-take all locks in address order. */
                cache_unlock_vnodes_cel(cel);
                if (vlp < cel->vlp[0]) {
                        mtx_lock(vlp);
                        mtx_lock(cel->vlp[0]);
                        mtx_lock(cel->vlp[1]);
                } else {
                        /* vlp[0] is optional, see cache_lock_vnodes_cel(). */
                        if (cel->vlp[0] != NULL)
                                mtx_lock(cel->vlp[0]);
                        mtx_lock(vlp);
                        mtx_lock(cel->vlp[1]);
                }
                ret = false;
        }
out:
        cel->vlp[2] = vlp;
        return (ret);
}
1767
1768 static void
1769 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1770     struct mtx *blp2)
1771 {
1772
1773         MPASS(cel->blp[0] == NULL);
1774         MPASS(cel->blp[1] == NULL);
1775
1776         cache_sort_vnodes(&blp1, &blp2);
1777
1778         if (blp1 != NULL) {
1779                 mtx_lock(blp1);
1780                 cel->blp[0] = blp1;
1781         }
1782         mtx_lock(blp2);
1783         cel->blp[1] = blp2;
1784 }
1785
1786 static void
1787 cache_unlock_buckets_cel(struct celockstate *cel)
1788 {
1789
1790         if (cel->blp[0] != NULL)
1791                 mtx_unlock(cel->blp[0]);
1792         mtx_unlock(cel->blp[1]);
1793 }
1794
/*
 * Lock the part of the cache affected by the insertion.
 *
 * This means the vnodelocks for dvp and vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry (the ".." entry
 * hanging off vp->v_cache_dd).  In this case there is an additional vnode
 * and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks,
 * while preserving the locking order (smaller address first).
 *
 * On return all acquired locks are recorded in cel; release them with
 * cache_enter_unlock().
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct mtx *blps[2];

        /* Bucket lock for the entry about to be inserted. */
        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                /* Only a directory target can carry a ".." entry to evict. */
                if (vp == NULL || vp->v_type != VDIR)
                        break;
                ncp = vp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == vp);
                /* The old ".." entry lives in this (possibly other) bucket. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                /* A negative entry has no target vnode to lock. */
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * All vnodes got re-locked. Re-validate the state and if
                 * nothing changed we are done. Otherwise restart.
                 */
                if (ncp == vp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                /* State changed: drop the vnode locks and start over. */
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1845
/*
 * Variant of cache_enter_lock() used when the ".." entry to be evicted
 * hangs off dvp->v_cache_dd rather than off vp.
 *
 * Locks the vnodelocks for dvp and vp, the vnodelock of the old ".."
 * entry's target (if it has one) and the bucketlocks for both the given
 * hash and the old entry.  All acquired locks are recorded in cel.
 */
static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct mtx *blps[2];

        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                ncp = dvp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == dvp);
                /* Bucket holding the existing ".." entry. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                /* A negative entry has no target vnode to lock. */
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * The vnode locks were dropped and re-taken.  Accept the
                 * result only if the entry is still the one we inspected.
                 */
                if (ncp == dvp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                /* Something changed underneath us: unlock and retry. */
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1880
/*
 * Release everything taken by cache_enter_lock()/cache_enter_lock_dd():
 * bucket locks first, then vnode locks.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

        cache_unlock_buckets_cel(cel);
        cache_unlock_vnodes_cel(cel);
}
1888
1889 static void __noinline
1890 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1891     struct componentname *cnp)
1892 {
1893         struct celockstate cel;
1894         struct namecache *ncp;
1895         uint32_t hash;
1896         int len;
1897
1898         if (dvp->v_cache_dd == NULL)
1899                 return;
1900         len = cnp->cn_namelen;
1901         cache_celockstate_init(&cel);
1902         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1903         cache_enter_lock_dd(&cel, dvp, vp, hash);
1904         vn_seqc_write_begin(dvp);
1905         ncp = dvp->v_cache_dd;
1906         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1907                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1908                 cache_zap_locked(ncp);
1909         } else {
1910                 ncp = NULL;
1911         }
1912         dvp->v_cache_dd = NULL;
1913         vn_seqc_write_end(dvp);
1914         cache_enter_unlock(&cel);
1915         if (ncp != NULL)
1916                 cache_free(ncp);
1917 }
1918
/*
 * Add an entry to the cache.
 *
 * dvp is the directory, cnp names the component and vp is the vnode the
 * name resolves to; a NULL vp creates a negative entry.  When tsp is
 * non-NULL the entry is allocated with timestamp storage and tsp (and dtsp,
 * if given) are recorded in it.
 *
 * The name "." is never cached.  The entry is silently dropped when the
 * cache is at its size limit, when an identical entry already exists, or
 * when a ".." entry has been installed concurrently.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
        struct celockstate cel;
        struct namecache *ncp, *n2, *ndd;
        struct namecache_ts *ncp_ts, *n2_ts;
        struct nchashhead *ncpp;
        uint32_t hash;
        int flag;
        int len;
        u_long lnumcache;

        VNPASS(!VN_IS_DOOMED(dvp), dvp);
        VNPASS(dvp->v_type != VNON, dvp);
        if (vp != NULL) {
                VNPASS(!VN_IS_DOOMED(vp), vp);
                VNPASS(vp->v_type != VNON, vp);
        }

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache))
                return;
#endif

        flag = 0;
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                /* "." is not cached at all. */
                if (cnp->cn_namelen == 1)
                        return;
                /* "..": get rid of whatever dvp->v_cache_dd points at first. */
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        cache_enter_dotdot_prep(dvp, vp, cnp);
                        flag = NCF_ISDOTDOT;
                }
        }

        /*
         * Avoid blowout in namecache entries.
         */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (__predict_false(lnumcache >= ncsize)) {
                /* Over the limit: undo the reservation and drop the entry. */
                atomic_add_long(&numcache, -1);
                counter_u64_add(numdrops, 1);
                return;
        }

        cache_celockstate_init(&cel);
        ndd = NULL;
        ncp_ts = NULL;

        /*
         * Calculate the hash key and setup as much of the new
         * namecache entry as possible before acquiring the lock.
         */
        ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
        /* NCF_WIP stays set until the entry is fully linked in below. */
        ncp->nc_flag = flag | NCF_WIP;
        ncp->nc_vp = vp;
        if (vp == NULL)
                cache_negative_init(ncp);
        ncp->nc_dvp = dvp;
        if (tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                ncp_ts->nc_time = *tsp;
                ncp_ts->nc_ticks = ticks;
                ncp_ts->nc_nc.nc_flag |= NCF_TS;
                if (dtsp != NULL) {
                        ncp_ts->nc_dotdottime = *dtsp;
                        ncp_ts->nc_nc.nc_flag |= NCF_DTS;
                }
        }
        len = ncp->nc_nlen = cnp->cn_namelen;
        hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
        memcpy(ncp->nc_name, cnp->cn_nameptr, len);
        ncp->nc_name[len] = '\0';
        cache_enter_lock(&cel, dvp, vp, hash);

        /*
         * See if this vnode or negative entry is already in the cache
         * with this name.  This can happen with concurrent lookups of
         * the same path name.
         */
        ncpp = NCHHASH(hash);
        CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
                if (n2->nc_dvp == dvp &&
                    n2->nc_nlen == cnp->cn_namelen &&
                    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
                        MPASS(cache_ncp_canuse(n2));
                        if ((n2->nc_flag & NCF_NEGATIVE) != 0)
                                KASSERT(vp == NULL,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, NULL, vp));
                        else
                                KASSERT(n2->nc_vp == vp,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, n2->nc_vp, vp));
                        /* Refresh the timestamps of the existing entry. */
                        if (tsp != NULL) {
                                KASSERT((n2->nc_flag & NCF_TS) != 0,
                                    ("no NCF_TS"));
                                n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
                                n2_ts->nc_time = ncp_ts->nc_time;
                                n2_ts->nc_ticks = ncp_ts->nc_ticks;
                                if (dtsp != NULL) {
                                        n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
                                        n2_ts->nc_nc.nc_flag |= NCF_DTS;
                                }
                        }
                        goto out_unlock_free;
                }
        }

        if (flag == NCF_ISDOTDOT) {
                /*
                 * See if we are trying to add .. entry, but some other lookup
                 * has populated v_cache_dd pointer already.
                 */
                if (dvp->v_cache_dd != NULL)
                        goto out_unlock_free;
                KASSERT(vp == NULL || vp->v_type == VDIR,
                    ("wrong vnode type %p", vp));
                vn_seqc_write_begin(dvp);
                dvp->v_cache_dd = ncp;
                vn_seqc_write_end(dvp);
        }

        if (vp != NULL) {
                if (flag != NCF_ISDOTDOT) {
                        /*
                         * For this case, the cache entry maps both the
                         * directory name in it and the name ".." for the
                         * directory's parent.
                         */
                        vn_seqc_write_begin(vp);
                        if ((ndd = vp->v_cache_dd) != NULL) {
                                /*
                                 * Zap a stale ".." entry; it is freed after
                                 * the locks are dropped.
                                 */
                                if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
                                        cache_zap_locked(ndd);
                                else
                                        ndd = NULL;
                        }
                        vp->v_cache_dd = ncp;
                        vn_seqc_write_end(vp);
                } else if (vp->v_type != VDIR) {
                        if (vp->v_cache_dd != NULL) {
                                vn_seqc_write_begin(vp);
                                vp->v_cache_dd = NULL;
                                vn_seqc_write_end(vp);
                        }
                }
        }

        if (flag != NCF_ISDOTDOT) {
                /* The first entry sourced from dvp takes a hold on it. */
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        vhold(dvp);
                        counter_u64_add(numcachehv, 1);
                }
                LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
        }

        /*
         * If the entry is "negative", we place it into the
         * "negative" cache queue, otherwise, we place it into the
         * destination vnode's cache entries queue.
         */
        if (vp != NULL) {
                TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
                SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
                    vp);
        } else {
                if (cnp->cn_flags & ISWHITEOUT)
                        ncp->nc_flag |= NCF_WHITE;
                cache_negative_insert(ncp);
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    ncp->nc_name);
        }

        /*
         * Insert the new namecache entry into the appropriate chain
         * within the cache entries table.
         */
        CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

        atomic_thread_fence_rel();
        /*
         * Mark the entry as fully constructed.
         * It is immutable past this point until its removal.
         */
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

        cache_enter_unlock(&cel);
        /* Trim one negative entry if they exceed their share of the cache. */
        if (numneg * ncnegfactor > lnumcache)
                cache_negative_zap_one();
        if (ndd != NULL)
                cache_free(ndd);
        return;
out_unlock_free:
        /* The new entry was not inserted: give back the slot and free it. */
        cache_enter_unlock(&cel);
        atomic_add_long(&numcache, -1);
        cache_free(ncp);
        return;
}
2120
/*
 * Return the smallest power of two strictly greater than val.
 *
 * If no such value fits in a u_int (val has its top bit set), the largest
 * representable power of two is returned instead.  Without that bound the
 * shift would wrap to 0 and the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
        /* Highest power of two representable in a u_int. */
        const u_int top = ~(~(u_int)0 >> 1);
        u_int res;

        for (res = 1; res <= val && res < top; res <<= 1)
                continue;

        return (res);
}
2131
2132 static struct nchashhead *
2133 nchinittbl(u_long elements, u_long *hashmask)
2134 {
2135         struct nchashhead *hashtbl;
2136         u_long hashsize, i;
2137
2138         hashsize = cache_roundup_2(elements) / 2;
2139
2140         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2141         for (i = 0; i < hashsize; i++)
2142                 CK_SLIST_INIT(&hashtbl[i]);
2143         *hashmask = hashsize - 1;
2144         return (hashtbl);
2145 }
2146
/*
 * Free a hash table obtained from nchinittbl().
 */
static void
ncfreetbl(struct nchashhead *hashtbl)
{

        free(hashtbl, M_VFSCACHE);
}
2153
/*
 * Name cache initialization, from vfs_init() when we are booting.
 *
 * Creates the allocation zones for namecache entries, sizes and allocates
 * the hash table, and sets up the bucket locks, vnode locks and the lists
 * used for negative entries.
 */
static void
nchinit(void *dummy __unused)
{
        u_int i;

        /* Separate zones for small/large names, with and without timestamps. */
        cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);

        VFS_SMR_ZONE_SET(cache_zone_small);
        VFS_SMR_ZONE_SET(cache_zone_small_ts);
        VFS_SMR_ZONE_SET(cache_zone_large);
        VFS_SMR_ZONE_SET(cache_zone_large_ts);

        ncsize = desiredvnodes * ncsizefactor;
        nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
        /* Scale the bucket lock mask with the CPU count, clamped below. */
        ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
        if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
                ncbuckethash = 7;
        /* Never use a larger mask than the hash table itself has. */
        if (ncbuckethash > nchash)
                ncbuckethash = nchash;
        bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numbucketlocks; i++)
                mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
        ncvnodehash = ncbuckethash;
        vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numvnodelocks; i++)
                mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
        /* Threshold consulted by cache_purgevfs() for non-forced purges. */
        ncpurgeminvnodes = numbucketlocks * 2;

        neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numneglists; i++) {
                mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
                TAILQ_INIT(&neglists[i].nl_list);
        }
        mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
        TAILQ_INIT(&ncneg_hot.nl_list);

        mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2206
/*
 * Initialize the namecache-related fields of a vnode: the lists of entries
 * sourced from it and pointing to it, the v_cache_dd pointer, and whatever
 * cache_prehash() sets up.
 */
void
cache_vnode_init(struct vnode *vp)
{

        LIST_INIT(&vp->v_cache_src);
        TAILQ_INIT(&vp->v_cache_dst);
        vp->v_cache_dd = NULL;
        cache_prehash(vp);
}
2216
2217 void
2218 cache_changesize(u_long newmaxvnodes)
2219 {
2220         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2221         u_long new_nchash, old_nchash;
2222         struct namecache *ncp;
2223         uint32_t hash;
2224         u_long newncsize;
2225         int i;
2226
2227         newncsize = newmaxvnodes * ncsizefactor;
2228         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2229         if (newmaxvnodes < numbucketlocks)
2230                 newmaxvnodes = numbucketlocks;
2231
2232         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2233         /* If same hash table size, nothing to do */
2234         if (nchash == new_nchash) {
2235                 ncfreetbl(new_nchashtbl);
2236                 return;
2237         }
2238         /*
2239          * Move everything from the old hash table to the new table.
2240          * None of the namecache entries in the table can be removed
2241          * because to do so, they have to be removed from the hash table.
2242          */
2243         cache_lock_all_vnodes();
2244         cache_lock_all_buckets();
2245         old_nchashtbl = nchashtbl;
2246         old_nchash = nchash;
2247         nchashtbl = new_nchashtbl;
2248         nchash = new_nchash;
2249         for (i = 0; i <= old_nchash; i++) {
2250                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2251                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2252                             ncp->nc_dvp);
2253                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2254                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2255                 }
2256         }
2257         ncsize = newncsize;
2258         cache_unlock_all_buckets();
2259         cache_unlock_all_vnodes();
2260         ncfreetbl(old_nchashtbl);
2261 }
2262
/*
 * Invalidate all entries from and to a particular vnode.
 *
 * Must be called with the vnode lock for vp held; the lock is released
 * before returning.  Zapped entries are collected on a local list (linked
 * through nc_dst) and freed only after the locks have been dropped.
 */
static void
cache_purge_impl(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp, *vlp2;

        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        vlp2 = NULL;
        mtx_assert(vlp, MA_OWNED);
retry:
        /* Entries for which vp is the directory. */
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                /* On failure start over from the top. */
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        /* Entries which resolve to vp. */
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        /* Whatever is still hooked up to v_cache_dd must be a ".." entry. */
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                   ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        /* A second vnode lock may have been left held by the zap helper. */
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
2306
2307 void
2308 cache_purge(struct vnode *vp)
2309 {
2310         struct mtx *vlp;
2311
2312         SDT_PROBE1(vfs, namecache, purge, done, vp);
2313         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2314             vp->v_cache_dd == NULL)
2315                 return;
2316         vlp = VP2VNODELOCK(vp);
2317         mtx_lock(vlp);
2318         cache_purge_impl(vp);
2319 }
2320
2321 /*
2322  * Only to be used by vgone.
2323  */
2324 void
2325 cache_purge_vgone(struct vnode *vp)
2326 {
2327         struct mtx *vlp;
2328
2329         VNPASS(VN_IS_DOOMED(vp), vp);
2330         vlp = VP2VNODELOCK(vp);
2331         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2332             vp->v_cache_dd == NULL)) {
2333                 mtx_lock(vlp);
2334                 cache_purge_impl(vp);
2335                 mtx_assert(vlp, MA_NOTOWNED);
2336                 return;
2337         }
2338
2339         /*
2340          * All the NULL pointer state we found above may be transient.
2341          * Serialize against a possible thread doing cache_purge.
2342          */
2343         mtx_wait_unlocked(vlp);
2344         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2345             vp->v_cache_dd == NULL)) {
2346                 mtx_lock(vlp);
2347                 cache_purge_impl(vp);
2348                 mtx_assert(vlp, MA_NOTOWNED);
2349                 return;
2350         }
2351         return;
2352 }
2353
2354 /*
2355  * Invalidate all negative entries for a particular directory vnode.
2356  */
2357 void
2358 cache_purge_negative(struct vnode *vp)
2359 {
2360         TAILQ_HEAD(, namecache) ncps;
2361         struct namecache *ncp, *nnp;
2362         struct mtx *vlp;
2363
2364         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2365         if (LIST_EMPTY(&vp->v_cache_src))
2366                 return;
2367         TAILQ_INIT(&ncps);
2368         vlp = VP2VNODELOCK(vp);
2369         mtx_lock(vlp);
2370         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2371                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2372                         continue;
2373                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2374                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2375         }
2376         mtx_unlock(vlp);
2377         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2378                 cache_free(ncp);
2379         }
2380 }
2381
/*
 * Update the namecache for a rename of fvp (in fdvp) onto the name tcnp in
 * tdvp, where tvp is the vnode currently at the target name, if any.
 *
 * All entries for the source vnode are purged.  If a target vnode exists
 * its entries are purged as well; otherwise any entry for the target name
 * (presumably a negative one) is removed from tdvp.
 *
 * fcnp is not used.
 */
void
cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
    struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
{

        ASSERT_VOP_IN_SEQC(fdvp);
        ASSERT_VOP_IN_SEQC(fvp);
        ASSERT_VOP_IN_SEQC(tdvp);
        if (tvp != NULL)
                ASSERT_VOP_IN_SEQC(tvp);

        cache_purge(fvp);
        if (tvp != NULL) {
                cache_purge(tvp);
                /*
                 * With the target vnode purged no entry for the target name
                 * is expected to remain.
                 * NOTE(review): the removal attempt lives inside KASSERT and
                 * so presumably only happens in assertion-enabled kernels --
                 * confirm this is intended.
                 */
                KASSERT(!cache_remove_cnp(tdvp, tcnp),
                    ("%s: lingering negative entry", __func__));
        } else {
                cache_remove_cnp(tdvp, tcnp);
        }
}
2402
/*
 * Flush all entries referencing a particular filesystem.
 *
 * Unless force is set, mounts with no more than ncpurgeminvnodes vnodes are
 * skipped altogether.  Otherwise the whole hash table is walked, one bucket
 * lock at a time, and every entry whose directory vnode belongs to mp is
 * zapped.  The zapped entries are freed once all locks have been released.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
        TAILQ_HEAD(, namecache) ncps;
        struct mtx *vlp1, *vlp2;
        struct mtx *blp;
        struct nchashhead *bucket;
        struct namecache *ncp, *nnp;
        u_long i, j, n_nchash;
        int error;

        /* Scan hash tables for applicable entries */
        SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
        if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
                return;
        TAILQ_INIT(&ncps);
        n_nchash = nchash + 1;
        vlp1 = vlp2 = NULL;
        for (i = 0; i < numbucketlocks; i++) {
                blp = (struct mtx *)&bucketlocks[i];
                mtx_lock(blp);
                /* Visit every hash chain covered by this bucket lock. */
                for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
                        bucket = &nchashtbl[j];
                        CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
                                cache_assert_bucket_locked(ncp);
                                if (ncp->nc_dvp->v_mount != mp)
                                        continue;
                                error = cache_zap_locked_bucket_kl(ncp, blp,
                                    &vlp1, &vlp2);
                                /* On failure rescan the chain from its head. */
                                if (error != 0)
                                        goto retry;
                                TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
                        }
                }
                mtx_unlock(blp);
                /* Only yield when no vnode lock is being carried along. */
                if (vlp1 == NULL && vlp2 == NULL)
                        cache_maybe_yield();
        }
        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);

        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
2454
2455 /*
2456  * Perform canonical checks and cache lookup and pass on to filesystem
2457  * through the vop_cachedlookup only if needed.
2458  */
2459
2460 int
2461 vfs_cache_lookup(struct vop_lookup_args *ap)
2462 {
2463         struct vnode *dvp;
2464         int error;
2465         struct vnode **vpp = ap->a_vpp;
2466         struct componentname *cnp = ap->a_cnp;
2467         int flags = cnp->cn_flags;
2468
2469         *vpp = NULL;
2470         dvp = ap->a_dvp;
2471
2472         if (dvp->v_type != VDIR)
2473                 return (ENOTDIR);
2474
2475         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2476             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2477                 return (EROFS);
2478
2479         error = vn_dir_check_exec(dvp, cnp);
2480         if (error != 0)
2481                 return (error);
2482
2483         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2484         if (error == 0)
2485                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2486         if (error == -1)
2487                 return (0);
2488         return (error);
2489 }
2490
2491 /* Implementation of the getcwd syscall. */
2492 int
2493 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2494 {
2495         char *buf, *retbuf;
2496         size_t buflen;
2497         int error;
2498
2499         buflen = uap->buflen;
2500         if (__predict_false(buflen < 2))
2501                 return (EINVAL);
2502         if (buflen > MAXPATHLEN)
2503                 buflen = MAXPATHLEN;
2504
2505         buf = uma_zalloc(namei_zone, M_WAITOK);
2506         error = vn_getcwd(buf, &retbuf, &buflen);
2507         if (error == 0)
2508                 error = copyout(retbuf, uap->buf, buflen);
2509         uma_zfree(namei_zone, buf);
2510         return (error);
2511 }
2512
/*
 * Resolve the path of the current working directory into buf.
 *
 * buf and *buflen describe the caller's buffer; *retbuf and *buflen are
 * filled in by the path resolution routines.  The lockless SMR-based
 * resolution is tried first; a negative return from it means it could not
 * complete and the regular variant is used with a held pwd instead.
 */
int
vn_getcwd(char *buf, char **retbuf, size_t *buflen)
{
        struct pwd *pwd;
        int error;

        vfs_smr_enter();
        pwd = pwd_get_smr();
        error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
            buflen, false, 0);
        /* The SMR section has been left by the call above. */
        VFS_SMR_ASSERT_NOT_ENTERED();
        if (error < 0) {
                /* Fall back to the variant working on a referenced pwd. */
                pwd = pwd_hold(curthread);
                error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
                    retbuf, buflen);
                pwd_drop(pwd);
        }

#ifdef KTRACE
        if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
                ktrnamei(*retbuf);
#endif
        return (error);
}
2537
2538 static int
2539 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2540     size_t size, int flags, enum uio_seg pathseg)
2541 {
2542         struct nameidata nd;
2543         char *retbuf, *freebuf;
2544         int error;
2545
2546         if (flags != 0)
2547                 return (EINVAL);
2548         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2549             pathseg, path, fd, &cap_fstat_rights, td);
2550         if ((error = namei(&nd)) != 0)
2551                 return (error);
2552         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2553         if (error == 0) {
2554                 error = copyout(retbuf, buf, size);
2555                 free(freebuf, M_TEMP);
2556         }
2557         NDFREE(&nd, 0);
2558         return (error);
2559 }
2560
2561 int
2562 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2563 {
2564
2565         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2566             uap->flags, UIO_USERSPACE));
2567 }
2568
2569 /*
2570  * Retrieve the full filesystem path that correspond to a vnode from the name
2571  * cache (if available)
2572  */
2573 int
2574 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2575 {
2576         struct pwd *pwd;
2577         char *buf;
2578         size_t buflen;
2579         int error;
2580
2581         if (__predict_false(vp == NULL))
2582                 return (EINVAL);
2583
2584         buflen = MAXPATHLEN;
2585         buf = malloc(buflen, M_TEMP, M_WAITOK);
2586         vfs_smr_enter();
2587         pwd = pwd_get_smr();
2588         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2589         VFS_SMR_ASSERT_NOT_ENTERED();
2590         if (error < 0) {
2591                 pwd = pwd_hold(curthread);
2592                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2593                 pwd_drop(pwd);
2594         }
2595         if (error == 0)
2596                 *freebuf = buf;
2597         else
2598                 free(buf, M_TEMP);
2599         return (error);
2600 }
2601
2602 /*
2603  * This function is similar to vn_fullpath, but it attempts to lookup the
2604  * pathname relative to the global root mount point.  This is required for the
2605  * auditing sub-system, as audited pathnames must be absolute, relative to the
2606  * global root mount point.
2607  */
2608 int
2609 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2610 {
2611         char *buf;
2612         size_t buflen;
2613         int error;
2614
2615         if (__predict_false(vp == NULL))
2616                 return (EINVAL);
2617         buflen = MAXPATHLEN;
2618         buf = malloc(buflen, M_TEMP, M_WAITOK);
2619         vfs_smr_enter();
2620         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2621         VFS_SMR_ASSERT_NOT_ENTERED();
2622         if (error < 0) {
2623                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2624         }
2625         if (error == 0)
2626                 *freebuf = buf;
2627         else
2628                 free(buf, M_TEMP);
2629         return (error);
2630 }
2631
2632 static struct namecache *
2633 vn_dd_from_dst(struct vnode *vp)
2634 {
2635         struct namecache *ncp;
2636
2637         cache_assert_vnode_locked(vp);
2638         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2639                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2640                         return (ncp);
2641         }
2642         return (NULL);
2643 }
2644
/*
 * Resolve a single path component going upwards: prepend the name of *vp to
 * the partially built path and replace *vp with its parent directory.
 *
 * buf is filled from the end towards the start; *buflen is the offset of the
 * first byte already in use and, on a namecache hit, is moved down by the
 * length of the name.  No '/' separator is written here.
 *
 * *vp must be referenced on entry.  On success (0) that reference is dropped
 * and *vp points at the parent, which carries a reference of its own.  On
 * failure an errno value is returned and the reference has been released on
 * every path (vrele()/vput() below), so the caller must not release again.
 */
int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        /*
         * Prefer the v_cache_dd shortcut when it is a regular (non-"..")
         * entry; otherwise scan the list of entries naming this vnode.
         */
        ncp = (*vp)->v_cache_dd;
        if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
                KASSERT(ncp == vn_dd_from_dst(*vp),
                    ("%s: mismatch for dd entry (%p != %p)", __func__,
                    ncp, vn_dd_from_dst(*vp)));
        } else {
                ncp = vn_dd_from_dst(*vp);
        }
        if (ncp != NULL) {
                /* Namecache hit: the name has to fit in front of the path. */
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                /*
                 * Switch to the parent.  The parent is referenced while the
                 * vnode lock is still held; the child is only released after
                 * the lock has been dropped.
                 */
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        /*
         * Namecache miss: ask the filesystem.  VOP_VPTOCNP() is handed the
         * same buf/buflen pair; presumably it follows the same fill-from-the-
         * end convention (not visible here).
         */
        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
        /* Unlocks and releases the child regardless of the outcome. */
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (VN_IS_DOOMED(dvp)) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
2710
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.  The reference is consumed: it is released
 * on success as well as on every error path (either here or by vn_vptocnp()).
 *
 * Arguments:
 *   vp             - directory to start from
 *   rdir           - the walk stops once this vnode or rootvnode is reached
 *   buf, len       - buffer of *len usable bytes, filled from the end
 *   retbuf         - on success set to the first byte of the path in buf
 *   slash_prefixed - false: the buffer tail is unused and a NUL terminator
 *                    is placed at its very end; true: the caller already
 *                    stored a '/'-prefixed tail right behind buf + *len
 *   addend         - number of bytes the caller already stored, added to
 *                    the length reported back in *len
 *
 * Returns 0 on success with *len set to (*len on entry - unused bytes +
 * addend); otherwise ENOENT, ENOTDIR, ENOMEM or an error from vn_vptocnp().
 * *len is left untouched on failure.
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;
        size_t buflen;
        int error;

        VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
        VNPASS(vp->v_usecount > 0, vp);

        /* Work on a local copy so that *len is only updated on success. */
        buflen = *len;

        if (!slash_prefixed) {
                MPASS(*len >= 2);
                buflen--;
                buf[buflen] = '\0';
        }

        error = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if (VN_IS_DOOMED(vp) ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        /*
                         * Cross the mount point downwards: continue with the
                         * covered vnode.  No name is emitted for this step.
                         */
                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                /* On failure vn_vptocnp() has already dropped the reference. */
                error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
                if (error)
                        break;
                /* Need one more byte for the separator in front of the name. */
                if (buflen == 0) {
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = true;
        }
        if (error)
                return (error);
        /*
         * No component was emitted at all (vp was the root to begin with and
         * the caller supplied no tail): the path is just "/".
         */
        if (!slash_prefixed) {
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        *retbuf = buf + buflen;
        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        /* Convert "bytes still unused" into "bytes used", plus the tail. */
        *len -= buflen;
        *len += addend;
        return (0);
}
2821
2822 /*
2823  * Resolve an arbitrary vnode to a pathname.
2824  *
2825  * Note 2 caveats:
2826  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2827  *   resolve to a different path than the one used to find it
2828  * - namecache is not mandatory, meaning names are not guaranteed to be added
2829  *   (in which case resolving fails)
2830  */
/*
 * Record why the lockless reverse lookup gave up.  The "reason" is simply
 * the source line of the bail-out site, supplied by the wrapper macro.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

        reason[0] = line;
}
#define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
2838
2839 static int
2840 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2841     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2842 {
2843 #ifdef KDTRACE_HOOKS
2844         struct vnode *startvp = vp;
2845 #endif
2846         struct vnode *tvp;
2847         struct mount *mp;
2848         struct namecache *ncp;
2849         size_t orig_buflen;
2850         int reason;
2851         int error;
2852 #ifdef KDTRACE_HOOKS
2853         int i;
2854 #endif
2855         seqc_t vp_seqc, tvp_seqc;
2856         u_char nc_flag;
2857
2858         VFS_SMR_ASSERT_ENTERED();
2859
2860         if (!cache_fast_revlookup) {
2861                 vfs_smr_exit();
2862                 return (-1);
2863         }
2864
2865         orig_buflen = *buflen;
2866
2867         if (!slash_prefixed) {
2868                 MPASS(*buflen >= 2);
2869                 *buflen -= 1;
2870                 buf[*buflen] = '\0';
2871         }
2872
2873         if (vp == rdir || vp == rootvnode) {
2874                 if (!slash_prefixed) {
2875                         *buflen -= 1;
2876                         buf[*buflen] = '/';
2877                 }
2878                 goto out_ok;
2879         }
2880
2881 #ifdef KDTRACE_HOOKS
2882         i = 0;
2883 #endif
2884         error = -1;
2885         ncp = NULL; /* for sdt probe down below */
2886         vp_seqc = vn_seqc_read_any(vp);
2887         if (seqc_in_modify(vp_seqc)) {
2888                 cache_rev_failed(&reason);
2889                 goto out_abort;
2890         }
2891
2892         for (;;) {
2893 #ifdef KDTRACE_HOOKS
2894                 i++;
2895 #endif
2896                 if ((vp->v_vflag & VV_ROOT) != 0) {
2897                         mp = atomic_load_ptr(&vp->v_mount);
2898                         if (mp == NULL) {
2899                                 cache_rev_failed(&reason);
2900                                 goto out_abort;
2901                         }
2902                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2903                         tvp_seqc = vn_seqc_read_any(tvp);
2904                         if (seqc_in_modify(tvp_seqc)) {
2905                                 cache_rev_failed(&reason);
2906                                 goto out_abort;
2907                         }
2908                         if (!vn_seqc_consistent(vp, vp_seqc)) {
2909                                 cache_rev_failed(&reason);
2910                                 goto out_abort;
2911                         }
2912                         vp = tvp;
2913                         vp_seqc = tvp_seqc;
2914                         continue;
2915                 }
2916                 ncp = atomic_load_ptr(&vp->v_cache_dd);
2917                 if (ncp == NULL) {
2918                         cache_rev_failed(&reason);
2919                         goto out_abort;
2920                 }
2921                 nc_flag = atomic_load_char(&ncp->nc_flag);
2922                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2923                         cache_rev_failed(&reason);
2924                         goto out_abort;
2925                 }
2926                 if (!cache_ncp_canuse(ncp)) {
2927                         cache_rev_failed(&reason);
2928                         goto out_abort;
2929                 }
2930                 if (ncp->nc_nlen >= *buflen) {
2931                         cache_rev_failed(&reason);
2932                         error = ENOMEM;
2933                         goto out_abort;
2934                 }
2935                 *buflen -= ncp->nc_nlen;
2936                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2937                 *buflen -= 1;
2938                 buf[*buflen] = '/';
2939                 tvp = ncp->nc_dvp;
2940                 tvp_seqc = vn_seqc_read_any(tvp);
2941                 if (seqc_in_modify(tvp_seqc)) {
2942                         cache_rev_failed(&reason);
2943                         goto out_abort;
2944                 }
2945                 if (!vn_seqc_consistent(vp, vp_seqc)) {
2946                         cache_rev_failed(&reason);
2947                         goto out_abort;
2948                 }
2949                 vp = tvp;
2950                 vp_seqc = tvp_seqc;
2951                 if (vp == rdir || vp == rootvnode)
2952                         break;
2953         }
2954 out_ok:
2955         vfs_smr_exit();
2956         *retbuf = buf + *buflen;
2957         *buflen = orig_buflen - *buflen + addend;
2958         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2959         return (0);
2960
2961 out_abort:
2962         *buflen = orig_buflen;
2963         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2964         vfs_smr_exit();
2965         return (error);
2966 }
2967
2968 static int
2969 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2970     size_t *buflen)
2971 {
2972         size_t orig_buflen;
2973         bool slash_prefixed;
2974         int error;
2975
2976         if (*buflen < 2)
2977                 return (EINVAL);
2978
2979         orig_buflen = *buflen;
2980
2981         vref(vp);
2982         slash_prefixed = false;
2983         if (vp->v_type != VDIR) {
2984                 *buflen -= 1;
2985                 buf[*buflen] = '\0';
2986                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2987                 if (error)
2988                         return (error);
2989                 if (*buflen == 0) {
2990                         vrele(vp);
2991                         return (ENOMEM);
2992                 }
2993                 *buflen -= 1;
2994                 buf[*buflen] = '/';
2995                 slash_prefixed = true;
2996         }
2997
2998         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
2999             orig_buflen - *buflen));
3000 }
3001
3002 /*
3003  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3004  *
3005  * Since the namecache does not track handlings, the caller is expected to first
3006  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3007  *
3008  * Then we have 2 cases:
3009  * - if the found vnode is a directory, the path can be constructed just by
3010  *   fullowing names up the chain
3011  * - otherwise we populate the buffer with the saved name and start resolving
3012  *   from the parent
3013  */
3014 static int
3015 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3016     size_t *buflen)
3017 {
3018         char *buf, *tmpbuf;
3019         struct pwd *pwd;
3020         struct componentname *cnp;
3021         struct vnode *vp;
3022         size_t addend;
3023         int error;
3024         bool slash_prefixed;
3025         enum vtype type;
3026
3027         if (*buflen < 2)
3028                 return (EINVAL);
3029         if (*buflen > MAXPATHLEN)
3030                 *buflen = MAXPATHLEN;
3031
3032         slash_prefixed = false;
3033
3034         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3035
3036         addend = 0;
3037         vp = ndp->ni_vp;
3038         /*
3039          * Check for VBAD to work around the vp_crossmp bug in lookup().
3040          *
3041          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3042          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3043          * If the type is VDIR (like in this very case) we can skip looking
3044          * at ni_dvp in the first place. However, since vnodes get passed here
3045          * unlocked the target may transition to doomed state (type == VBAD)
3046          * before we get to evaluate the condition. If this happens, we will
3047          * populate part of the buffer and descend to vn_fullpath_dir with
3048          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3049          *
3050          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3051          * an address of a bit field, even if said field is sized to char.
3052          * Work around the problem by reading the value into a full-sized enum
3053          * and then re-reading it with atomic_load which will still prevent
3054          * the compiler from re-reading down the road.
3055          */
3056         type = vp->v_type;
3057         type = atomic_load_int(&type);
3058         if (type == VBAD) {
3059                 error = ENOENT;
3060                 goto out_bad;
3061         }
3062         if (type != VDIR) {
3063                 cnp = &ndp->ni_cnd;
3064                 addend = cnp->cn_namelen + 2;
3065                 if (*buflen < addend) {
3066                         error = ENOMEM;
3067                         goto out_bad;
3068                 }
3069                 *buflen -= addend;
3070                 tmpbuf = buf + *buflen;
3071                 tmpbuf[0] = '/';
3072                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3073                 tmpbuf[addend - 1] = '\0';
3074                 slash_prefixed = true;
3075                 vp = ndp->ni_dvp;
3076         }
3077
3078         vfs_smr_enter();
3079         pwd = pwd_get_smr();
3080         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3081             slash_prefixed, addend);
3082         VFS_SMR_ASSERT_NOT_ENTERED();
3083         if (error < 0) {
3084                 pwd = pwd_hold(curthread);
3085                 vref(vp);
3086                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3087                     slash_prefixed, addend);
3088                 pwd_drop(pwd);
3089                 if (error != 0)
3090                         goto out_bad;
3091         }
3092
3093         *freebuf = buf;
3094
3095         return (0);
3096 out_bad:
3097         free(buf, M_TEMP);
3098         return (error);
3099 }
3100
3101 struct vnode *
3102 vn_dir_dd_ino(struct vnode *vp)
3103 {
3104         struct namecache *ncp;
3105         struct vnode *ddvp;
3106         struct mtx *vlp;
3107         enum vgetstate vs;
3108
3109         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3110         vlp = VP2VNODELOCK(vp);
3111         mtx_lock(vlp);
3112         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3113                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3114                         continue;
3115                 ddvp = ncp->nc_dvp;
3116                 vs = vget_prep(ddvp);
3117                 mtx_unlock(vlp);
3118                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3119                         return (NULL);
3120                 return (ddvp);
3121         }
3122         mtx_unlock(vlp);
3123         return (NULL);
3124 }
3125
3126 int
3127 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3128 {
3129         struct namecache *ncp;
3130         struct mtx *vlp;
3131         int l;
3132
3133         vlp = VP2VNODELOCK(vp);
3134         mtx_lock(vlp);
3135         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3136                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3137                         break;
3138         if (ncp == NULL) {
3139                 mtx_unlock(vlp);
3140                 return (ENOENT);
3141         }
3142         l = min(ncp->nc_nlen, buflen - 1);
3143         memcpy(buf, ncp->nc_name, l);
3144         mtx_unlock(vlp);
3145         buf[l] = '\0';
3146         return (0);
3147 }
3148
/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 * NOTE(review): none of the code paths visible in this function returns
 * ENODEV itself; confirm whether that part of the contract still applies.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 *
 * The reference the caller passed in on vp is released with vrele() on
 * every path, failure or success.  On success the vnode obtained from the
 * re-lookup (identical to vp) is what the caller is left with.
 *
 * Errors: whatever vn_fullpath_global() or namei() return, ENAMETOOLONG
 * if the global path does not fit into pathlen bytes (terminator included),
 * ENOENT if the path no longer resolves to vp.
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
        struct nameidata nd;
        struct vnode *vp1;
        char *rpath, *fbuf;
        int error;

        ASSERT_VOP_ELOCKED(vp, __func__);

        /* Construct global filesystem path from vp. */
        VOP_UNLOCK(vp);
        error = vn_fullpath_global(vp, &rpath, &fbuf);

        if (error != 0) {
                /* No buffer was handed out, so there is nothing to free. */
                vrele(vp);
                return (error);
        }

        if (strlen(rpath) >= pathlen) {
                vrele(vp);
                error = ENAMETOOLONG;
                goto out;
        }

        /*
         * Re-lookup the vnode by path to detect a possible rename.
         * As a side effect, the vnode is relocked.
         * If vnode was renamed, return ENOENT.
         */
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
            UIO_SYSSPACE, path, td);
        error = namei(&nd);
        if (error != 0) {
                vrele(vp);
                goto out;
        }
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp1 = nd.ni_vp;
        vrele(vp);
        if (vp1 == vp)
                /* Fits: the length was checked against pathlen above. */
                strcpy(path, rpath);
        else {
                vput(vp1);
                error = ENOENT;
        }

out:
        free(fbuf, M_TEMP);
        return (error);
}
3211
3212 #ifdef DDB
3213 static void
3214 db_print_vpath(struct vnode *vp)
3215 {
3216
3217         while (vp != NULL) {
3218                 db_printf("%p: ", vp);
3219                 if (vp == rootvnode) {
3220                         db_printf("/");
3221                         vp = NULL;
3222                 } else {
3223                         if (vp->v_vflag & VV_ROOT) {
3224                                 db_printf("<mount point>");
3225                                 vp = vp->v_mount->mnt_vnodecovered;
3226                         } else {
3227                                 struct namecache *ncp;
3228                                 char *ncn;
3229                                 int i;
3230
3231                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3232                                 if (ncp != NULL) {
3233                                         ncn = ncp->nc_name;
3234                                         for (i = 0; i < ncp->nc_nlen; i++)
3235                                                 db_printf("%c", *ncn++);
3236                                         vp = ncp->nc_dvp;
3237                                 } else {
3238                                         vp = NULL;
3239                                 }
3240                         }
3241                 }
3242                 db_printf("\n");
3243         }
3244
3245         return;
3246 }
3247
3248 DB_SHOW_COMMAND(vpath, db_show_vpath)
3249 {
3250         struct vnode *vp;
3251
3252         if (!have_addr) {
3253                 db_printf("usage: show vpath <struct vnode *>\n");
3254                 return;
3255         }
3256
3257         vp = (struct vnode *)addr;
3258         db_print_vpath(vp);
3259 }
3260
3261 #endif
3262
/*
 * Run-time switch (vfs.cache_fast_lookup, read-write) for the fast path
 * lookup.  When false, cache_can_fplookup() aborts right away.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Value returned by the cache_fpl_aborted()/cache_fpl_partial() helpers.
 * cache_fpl_handled() asserts that a real result never equals it.
 */
#define CACHE_FPL_FAILED        -2020
3268
3269 static void
3270 cache_fpl_cleanup_cnp(struct componentname *cnp)
3271 {
3272
3273         uma_zfree(namei_zone, cnp->cn_pnbuf);
3274 #ifdef DIAGNOSTIC
3275         cnp->cn_pnbuf = NULL;
3276         cnp->cn_nameptr = NULL;
3277 #endif
3278 }
3279
3280 static void
3281 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3282 {
3283         struct componentname *cnp;
3284
3285         cnp = &ndp->ni_cnd;
3286         while (*(cnp->cn_nameptr) == '/') {
3287                 cnp->cn_nameptr++;
3288                 ndp->ni_pathlen--;
3289         }
3290
3291         *dpp = ndp->ni_rootdir;
3292 }
3293
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore().
 */
struct nameidata_saved {
        long cn_namelen;        /* copy of ni_cnd.cn_namelen */
        char *cn_nameptr;       /* copy of ni_cnd.cn_nameptr */
        size_t ni_pathlen;      /* copy of ni_pathlen */
        int cn_flags;           /* copy of ni_cnd.cn_flags */
};
3304
/*
 * State of a single fast path lookup.
 */
struct cache_fpl {
        struct nameidata *ndp;          /* lookup being serviced */
        struct componentname *cnp;      /* component name of the lookup */
        struct pwd *pwd;                /* not used in this part of the file */
        struct vnode *dvp;              /* presumably current directory vnode */
        struct vnode *tvp;              /* presumably current target vnode */
        seqc_t dvp_seqc;                /* presumably seqc sampled for dvp */
        seqc_t tvp_seqc;                /* presumably seqc sampled for tvp */
        struct nameidata_saved snd;     /* checkpointed nameidata fields */
        int line;                       /* source line which set status last */
        enum cache_fpl_status status:8; /* outcome, see cache_fpl_*_impl() */
        bool in_smr;                    /* tracked by cache_fpl_smr_*() macros */
};
3318
3319 static void
3320 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3321 {
3322
3323         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3324         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3325         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3326         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3327 }
3328
3329 static void
3330 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3331 {
3332
3333         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3334         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3335         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3336         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3337 }
3338
/*
 * Assertions tying the in_smr bookkeeping of the lookup state to the actual
 * SMR section state.  With INVARIANTS both are checked; otherwise the macros
 * expand to empty statements.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({                    \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        VFS_SMR_ASSERT_ENTERED();                               \
})
#define cache_fpl_smr_assert_not_entered(fpl) ({                \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        VFS_SMR_ASSERT_NOT_ENTERED();                           \
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3354
/*
 * Enter the VFS SMR section and note it in the lookup state.  Unlike
 * cache_fpl_smr_enter() this variant does not look at the previous value of
 * in_smr, so it can be used before the field has been initialized.
 */
#define cache_fpl_smr_enter_initial(fpl) ({                     \
        struct cache_fpl *_fpl = (fpl);                         \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Enter the VFS SMR section; the lookup state must say we are outside.
 */
#define cache_fpl_smr_enter(fpl) ({                             \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Leave the VFS SMR section; the lookup state must say we are inside.
 */
#define cache_fpl_smr_exit(fpl) ({                              \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        vfs_smr_exit();                                         \
        _fpl->in_smr = false;                                   \
})
3374
3375 static int
3376 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3377 {
3378
3379         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3380                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3381                     ("%s: converting to abort from %d at %d, set at %d\n",
3382                     __func__, fpl->status, line, fpl->line));
3383         }
3384         fpl->status = CACHE_FPL_STATUS_ABORTED;
3385         fpl->line = line;
3386         return (CACHE_FPL_FAILED);
3387 }
3388
3389 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3390
3391 static int
3392 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3393 {
3394
3395         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3396             ("%s: setting to partial at %d, but already set to %d at %d\n",
3397             __func__, line, fpl->status, fpl->line));
3398         cache_fpl_smr_assert_entered(fpl);
3399         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3400         fpl->line = line;
3401         return (CACHE_FPL_FAILED);
3402 }
3403
3404 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3405
3406 static int
3407 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3408 {
3409
3410         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3411             ("%s: setting to handled at %d, but already set to %d at %d\n",
3412             __func__, line, fpl->status, fpl->line));
3413         cache_fpl_smr_assert_not_entered(fpl);
3414         MPASS(error != CACHE_FPL_FAILED);
3415         fpl->status = CACHE_FPL_STATUS_HANDLED;
3416         fpl->line = line;
3417         return (error);
3418 }
3419
3420 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3421
/*
 * cn_flags bits a lookup may carry and still be eligible for the fast path;
 * cache_can_fplookup() aborts if any bit outside of this mask is set.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
        (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
         SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)

/*
 * cn_flags bits which are never part of the supported set above (enforced by
 * the static assertion below); presumably they are managed by the lookup
 * code itself rather than passed in by callers.
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
        (ISDOTDOT | MAKEENTRY | ISLASTCN)

_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
3431
3432 static bool
3433 cache_fpl_islastcn(struct nameidata *ndp)
3434 {
3435
3436         return (*ndp->ni_next == 0);
3437 }
3438
3439 static bool
3440 cache_fpl_isdotdot(struct componentname *cnp)
3441 {
3442
3443         if (cnp->cn_namelen == 2 &&
3444             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3445                 return (true);
3446         return (false);
3447 }
3448
3449 static bool
3450 cache_can_fplookup(struct cache_fpl *fpl)
3451 {
3452         struct nameidata *ndp;
3453         struct componentname *cnp;
3454         struct thread *td;
3455
3456         ndp = fpl->ndp;
3457         cnp = fpl->cnp;
3458         td = cnp->cn_thread;
3459
3460         if (!cache_fast_lookup) {
3461                 cache_fpl_aborted(fpl);
3462                 return (false);
3463         }
3464 #ifdef MAC
3465         if (mac_vnode_check_lookup_enabled()) {
3466                 cache_fpl_aborted(fpl);
3467                 return (false);
3468         }
3469 #endif
3470         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3471                 cache_fpl_aborted(fpl);
3472                 return (false);
3473         }
3474         if (ndp->ni_dirfd != AT_FDCWD) {
3475                 cache_fpl_aborted(fpl);
3476                 return (false);
3477         }
3478         if (IN_CAPABILITY_MODE(td)) {
3479                 cache_fpl_aborted(fpl);
3480                 return (false);
3481         }
3482         if (AUDITING_TD(td)) {
3483                 cache_fpl_aborted(fpl);
3484                 return (false);
3485         }
3486         if (ndp->ni_startdir != NULL) {
3487                 cache_fpl_aborted(fpl);
3488                 return (false);
3489         }
3490         return (true);
3491 }
3492
3493 static bool
3494 cache_fplookup_vnode_supported(struct vnode *vp)
3495 {
3496
3497         return (vp->v_type != VLNK);
3498 }
3499
/*
 * Move a negative entry to the hot list.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Instead we are going to
 * look up the entry again.
 *
 * fpl  - lookup state; must be within the SMR section on entry
 * oncp - the negative entry found by the caller
 * hash - hash of the looked up name, used to find the chain again
 *
 * Returns ENOENT with the status set to handled if the very same entry was
 * found again and is still a usable negative entry for the name. Otherwise
 * the lookup is marked as aborted and CACHE_FPL_FAILED is returned. The SMR
 * section is exited on all paths which got past vhold_smr().
 */
static int __noinline
cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
    uint32_t hash)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct neglist *neglist;
        struct negstate *negstate;
        struct vnode *dvp;
        u_char nc_flag;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        /*
         * Hold the directory so that it can still be compared against
         * after the SMR section is left. Failure to do so aborts the lookup
         * while still within the section.
         */
        if (!vhold_smr(dvp))
                return (cache_fpl_aborted(fpl));

        /*
         * The list is computed from the entry before leaving the section.
         */
        neglist = NCP2NEGLIST(oncp);
        cache_fpl_smr_exit(fpl);

        /*
         * Both locks are taken outside of the SMR section, the hot list
         * lock first.
         */
        mtx_lock(&ncneg_hot.nl_lock);
        mtx_lock(&neglist->nl_lock);
        /*
         * For hash iteration.
         */
        cache_fpl_smr_enter(fpl);

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (__predict_false(!cache_ncp_canuse(ncp))) {
                goto out_abort;
        }

        /*
         * Someone else may have promoted the entry in the meantime, in
         * which case there is nothing left to move.
         */
        negstate = NCP2NEGSTATE(ncp);
        if ((negstate->neg_flag & NEG_HOT) == 0) {
                numhotneg++;
                TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
                TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
                negstate->neg_flag |= NEG_HOT;
        }

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
        counter_u64_add(numneghits, 1);
        /*
         * Unwind in the opposite order of acquisition and report the
         * negative hit.
         */
        cache_fpl_smr_exit(fpl);
        mtx_unlock(&neglist->nl_lock);
        mtx_unlock(&ncneg_hot.nl_lock);
        vdrop(dvp);
        return (cache_fpl_handled(fpl, ENOENT));
out_abort:
        /*
         * Same unwinding as above, except the lookup is abandoned.
         */
        cache_fpl_smr_exit(fpl);
        mtx_unlock(&neglist->nl_lock);
        mtx_unlock(&ncneg_hot.nl_lock);
        vdrop(dvp);
        return (cache_fpl_aborted(fpl));
}
3596
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Called by cache_fplookup_impl() within the SMR section once the status was
 * set to partial. The SMR-protected fpl->dvp is converted into a real
 * reference and validated against its sequence counter, then the state
 * saved by the most recent cache_fpl_checkpoint() is restored (presumably
 * the parsing position of the component which could not be handled) and the
 * flags from CACHE_FPL_INTERNAL_CN_FLAGS are computed for regular lookup.
 *
 * Returns 0 on success, in which case ndp->ni_startdir holds the referenced
 * dvp. Otherwise the lookup is converted to aborted and CACHE_FPL_FAILED is
 * returned with no reference left behind. The SMR section is exited on all
 * paths.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        enum vgetstate dvs;
        struct vnode *dvp;
        struct pwd *pwd;
        seqc_t dvp_seqc;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        /*
         * Securing the vnode has to start within the SMR section.
         */
        dvs = vget_prep_smr(dvp);
        if (__predict_false(dvs == VGET_NONE)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_smr_exit(fpl);

        /*
         * With the reference in place make sure the vnode did not change
         * since its counter was sampled.
         */
        vget_finish_ref(dvp, dvs);
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        /*
         * Only continue if the thread still uses the pwd the lookup started
         * with.
         *
         * NOTE(review): fpl->pwd is compared by address after the SMR
         * section was exited. If the original object can be freed and its
         * memory reused in that window the comparison could match a
         * different pwd -- confirm against the pwd lifetime rules.
         *
         * The reference obtained here is not dropped on success; presumably
         * it is consumed by the caller -- verify.
         */
        pwd = pwd_hold(curthread);
        if (fpl->pwd != pwd) {
                vrele(dvp);
                pwd_drop(pwd);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_restore(fpl, &fpl->snd);

        /*
         * Hand the directory over as the starting point and set the flags
         * which are never accepted from the caller.
         */
        ndp->ni_startdir = dvp;
        cnp->cn_flags |= MAKEENTRY;
        if (cache_fpl_islastcn(ndp))
                cnp->cn_flags |= ISLASTCN;
        if (cache_fpl_isdotdot(cnp))
                cnp->cn_flags |= ISDOTDOT;

        return (0);
}
3647
3648 static int
3649 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3650 {
3651         struct componentname *cnp;
3652         struct vnode *tvp;
3653         seqc_t tvp_seqc;
3654         int error, lkflags;
3655
3656         cnp = fpl->cnp;
3657         tvp = fpl->tvp;
3658         tvp_seqc = fpl->tvp_seqc;
3659
3660         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3661                 lkflags = LK_SHARED;
3662                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3663                         lkflags = LK_EXCLUSIVE;
3664                 error = vget_finish(tvp, lkflags, tvs);
3665                 if (__predict_false(error != 0)) {
3666                         return (cache_fpl_aborted(fpl));
3667                 }
3668         } else {
3669                 vget_finish_ref(tvp, tvs);
3670         }
3671
3672         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3673                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3674                         vput(tvp);
3675                 else
3676                         vrele(tvp);
3677                 return (cache_fpl_aborted(fpl));
3678         }
3679
3680         return (cache_fpl_handled(fpl, 0));
3681 }
3682
3683 /*
3684  * They want to possibly modify the state of the namecache.
3685  *
3686  * Don't try to match the API contract, just leave.
3687  * TODO: this leaves scalability on the table
3688  */
3689 static int
3690 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3691 {
3692         struct componentname *cnp;
3693
3694         cnp = fpl->cnp;
3695         MPASS(cnp->cn_nameiop != LOOKUP);
3696         return (cache_fpl_partial(fpl));
3697 }
3698
3699 static int __noinline
3700 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3701 {
3702         struct componentname *cnp;
3703         enum vgetstate dvs, tvs;
3704         struct vnode *dvp, *tvp;
3705         seqc_t dvp_seqc, tvp_seqc;
3706         int error;
3707
3708         cnp = fpl->cnp;
3709         dvp = fpl->dvp;
3710         dvp_seqc = fpl->dvp_seqc;
3711         tvp = fpl->tvp;
3712         tvp_seqc = fpl->tvp_seqc;
3713
3714         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3715
3716         /*
3717          * This is less efficient than it can be for simplicity.
3718          */
3719         dvs = vget_prep_smr(dvp);
3720         if (__predict_false(dvs == VGET_NONE)) {
3721                 return (cache_fpl_aborted(fpl));
3722         }
3723         tvs = vget_prep_smr(tvp);
3724         if (__predict_false(tvs == VGET_NONE)) {
3725                 cache_fpl_smr_exit(fpl);
3726                 vget_abort(dvp, dvs);
3727                 return (cache_fpl_aborted(fpl));
3728         }
3729
3730         cache_fpl_smr_exit(fpl);
3731
3732         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3733                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3734                 if (__predict_false(error != 0)) {
3735                         vget_abort(tvp, tvs);
3736                         return (cache_fpl_aborted(fpl));
3737                 }
3738         } else {
3739                 vget_finish_ref(dvp, dvs);
3740         }
3741
3742         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3743                 vget_abort(tvp, tvs);
3744                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3745                         vput(dvp);
3746                 else
3747                         vrele(dvp);
3748                 return (cache_fpl_aborted(fpl));
3749         }
3750
3751         error = cache_fplookup_final_child(fpl, tvs);
3752         if (__predict_false(error != 0)) {
3753                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3754                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3755                         vput(dvp);
3756                 else
3757                         vrele(dvp);
3758                 return (error);
3759         }
3760
3761         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3762         return (0);
3763 }
3764
3765 static int
3766 cache_fplookup_final(struct cache_fpl *fpl)
3767 {
3768         struct componentname *cnp;
3769         enum vgetstate tvs;
3770         struct vnode *dvp, *tvp;
3771         seqc_t dvp_seqc, tvp_seqc;
3772
3773         cnp = fpl->cnp;
3774         dvp = fpl->dvp;
3775         dvp_seqc = fpl->dvp_seqc;
3776         tvp = fpl->tvp;
3777         tvp_seqc = fpl->tvp_seqc;
3778
3779         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3780
3781         if (cnp->cn_nameiop != LOOKUP) {
3782                 return (cache_fplookup_final_modifying(fpl));
3783         }
3784
3785         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3786                 return (cache_fplookup_final_withparent(fpl));
3787
3788         tvs = vget_prep_smr(tvp);
3789         if (__predict_false(tvs == VGET_NONE)) {
3790                 return (cache_fpl_partial(fpl));
3791         }
3792
3793         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3794                 cache_fpl_smr_exit(fpl);
3795                 vget_abort(tvp, tvs);
3796                 return (cache_fpl_aborted(fpl));
3797         }
3798
3799         cache_fpl_smr_exit(fpl);
3800         return (cache_fplookup_final_child(fpl, tvs));
3801 }
3802
3803 static int __noinline
3804 cache_fplookup_dot(struct cache_fpl *fpl)
3805 {
3806         struct vnode *dvp;
3807
3808         dvp = fpl->dvp;
3809
3810         fpl->tvp = dvp;
3811         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3812         if (seqc_in_modify(fpl->tvp_seqc)) {
3813                 return (cache_fpl_aborted(fpl));
3814         }
3815
3816         counter_u64_add(dothits, 1);
3817         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3818
3819         return (0);
3820 }
3821
/*
 * Resolve ".." relative to fpl->dvp.
 *
 * - if dvp is the root of any prison on the credential's prison chain, the
 *   root or top directory of the lookup or the global root vnode, ".."
 *   resolves to dvp itself
 * - going up from the root of a mounted filesystem (VV_ROOT) is not
 *   implemented and aborts the lookup
 * - otherwise the parent is taken from the entry found in dvp->v_cache_dd
 *
 * Returns 0 with fpl->tvp and fpl->tvp_seqc set on success. Otherwise
 * CACHE_FPL_FAILED is returned with the status set to aborted or partial.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp;
        struct prison *pr;
        u_char nc_flag;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;

        /*
         * XXX this is racy the same way regular lookup is
         *
         * pr is left non-NULL if dvp is the root of some prison on the
         * chain.
         */
        for (pr = cnp->cn_cred->cr_prison; pr != NULL;
            pr = pr->pr_parent)
                if (dvp == pr->pr_root)
                        break;

        /*
         * Can't go any higher, stay in place.
         */
        if (dvp == ndp->ni_rootdir ||
            dvp == ndp->ni_topdir ||
            dvp == rootvnode ||
            pr != NULL) {
                fpl->tvp = dvp;
                fpl->tvp_seqc = vn_seqc_read_any(dvp);
                if (seqc_in_modify(fpl->tvp_seqc)) {
                        return (cache_fpl_aborted(fpl));
                }
                return (0);
        }

        if ((dvp->v_vflag & VV_ROOT) != 0) {
                /*
                 * TODO
                 * The opposite of climb mount is needed here.
                 */
                return (cache_fpl_aborted(fpl));
        }

        /*
         * Nothing cached to find the parent with.
         */
        ncp = atomic_load_ptr(&dvp->v_cache_dd);
        if (ncp == NULL) {
                return (cache_fpl_aborted(fpl));
        }

        /*
         * A ".." entry names the parent in nc_vp (unless it is negative),
         * any other entry found this way names it in nc_dvp.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_ISDOTDOT) != 0) {
                if ((nc_flag & NCF_NEGATIVE) != 0)
                        return (cache_fpl_aborted(fpl));
                fpl->tvp = ncp->nc_vp;
        } else {
                fpl->tvp = ncp->nc_dvp;
        }

        /*
         * The entry is validated only after the fields were read.
         */
        if (__predict_false(!cache_ncp_canuse(ncp))) {
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(dotdothits, 1);
        return (0);
}
3890
/*
 * Find the vnode for the current path component within fpl->dvp.
 *
 * "." is handed off to cache_fplookup_dot(). Anything else is searched for
 * in the hash chain selected by the name and the directory.
 *
 * Returns:
 * - 0 with fpl->tvp and fpl->tvp_seqc set for a usable positive entry
 * - ENOENT with the status set to handled for a usable negative entry when
 *   the operation is LOOKUP (possibly after promoting it to the hot list)
 * - CACHE_FPL_FAILED with the status set to partial or aborted in all other
 *   cases
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct negstate *negstate;
        struct vnode *dvp, *tvp;
        u_char nc_flag;
        uint32_t hash;
        bool neg_hot;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
                return (cache_fplookup_dot(fpl));
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

        /*
         * ncp is left NULL if the chain has no entry for this name in this
         * directory.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        /*
         * If there is no entry we have to punt to the slow path to perform
         * actual lookup. Should there be nothing with this name a negative
         * entry will be created.
         */
        if (__predict_false(ncp == NULL)) {
                return (cache_fpl_partial(fpl));
        }

        /*
         * Both fields are read before the entry is validated with
         * cache_ncp_canuse() below.
         */
        tvp = atomic_load_ptr(&ncp->nc_vp);
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) != 0) {
                /*
                 * If they want to create an entry we need to replace this one.
                 */
                if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
                        return (cache_fpl_partial(fpl));
                }
                negstate = NCP2NEGSTATE(ncp);
                neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
                if (__predict_false(!cache_ncp_canuse(ncp))) {
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Whiteout entries are left for the slow path.
                 */
                if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Promotion requires locks and has its own accounting of
                 * the hit.
                 */
                if (!neg_hot) {
                        return (cache_fplookup_negative_promote(fpl, ncp, hash));
                }
                SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
                    ncp->nc_name);
                counter_u64_add(numneghits, 1);
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled(fpl, ENOENT));
        }

        if (__predict_false(!cache_ncp_canuse(ncp))) {
                return (cache_fpl_partial(fpl));
        }

        fpl->tvp = tvp;
        fpl->tvp_seqc = vn_seqc_read_any(tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        /*
         * Symlinks are followed by the slow path.
         */
        if (!cache_fplookup_vnode_supported(tvp)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(numposhits, 1);
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
        return (0);
}
3971
3972 static bool
3973 cache_fplookup_mp_supported(struct mount *mp)
3974 {
3975
3976         if (mp == NULL)
3977                 return (false);
3978         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3979                 return (false);
3980         return (true);
3981 }
3982
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * Returns 0 with fpl->tvp and fpl->tvp_seqc replaced by the root vnode of
 * the topmost mount and its sequence counter (or left unchanged if nothing
 * is mounted). On any failure the lookup is marked as partial and
 * CACHE_FPL_FAILED is returned with no mount left entered.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
        struct mount *mp, *prev_mp;
        struct vnode *vp;
        seqc_t vp_seqc;

        vp = fpl->tvp;
        vp_seqc = fpl->tvp_seqc;

        VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (mp == NULL)
                return (0);

        prev_mp = NULL;
        for (;;) {
                /*
                 * Enter the next mount before leaving the previous one so
                 * that there is always some protection in place.
                 */
                if (!vfs_op_thread_enter_crit(mp)) {
                        if (prev_mp != NULL)
                                vfs_op_thread_exit_crit(prev_mp);
                        return (cache_fpl_partial(fpl));
                }
                if (prev_mp != NULL)
                        vfs_op_thread_exit_crit(prev_mp);
                /*
                 * Verify the covered vnode did not change while the mount
                 * was being entered.
                 */
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                if (!cache_fplookup_mp_supported(mp)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Move on to the root vnode of the mount.
                 */
                vp = atomic_load_ptr(&mp->mnt_rootvnode);
                if (vp == NULL || VN_IS_DOOMED(vp)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                vp_seqc = vn_seqc_read_any(vp);
                if (seqc_in_modify(vp_seqc)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                /*
                 * Something may be mounted on top of the root vnode as
                 * well.
                 */
                prev_mp = mp;
                mp = atomic_load_ptr(&vp->v_mountedhere);
                if (mp == NULL)
                        break;
        }

        /*
         * The loop is only left with the last mount still entered.
         */
        vfs_op_thread_exit_crit(prev_mp);
        fpl->tvp = vp;
        fpl->tvp_seqc = vp_seqc;
        return (0);
}
4054
4055 static bool
4056 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4057 {
4058         struct mount *mp;
4059         struct vnode *vp;
4060
4061         vp = fpl->tvp;
4062
4063         /*
4064          * Hack: while this is a union, the pointer tends to be NULL so save on
4065          * a branch.
4066          */
4067         mp = atomic_load_ptr(&vp->v_mountedhere);
4068         if (mp == NULL)
4069                 return (false);
4070         if (vp->v_type == VDIR)
4071                 return (true);
4072         return (false);
4073 }
4074
/*
 * Parse the path.
 *
 * The code is mostly copy-pasted from regular lookup, see lookup().
 * The structure is maintained along with comments for easier maintenance.
 * Deduplicating the code will become feasible after fast path lookup
 * becomes more feature-complete.
 *
 * On entry cnp->cn_nameptr points at the component to process. On success
 * cnp->cn_namelen holds its length, ndp->ni_next points past it (with
 * redundant slashes skipped) and ndp->ni_pathlen is reduced accordingly.
 *
 * Returns:
 * - 0 on success
 * - ENAMETOOLONG with the status set to handled (and the SMR section exited)
 *   if the component is longer than NAME_MAX
 * - CACHE_FPL_FAILED with the status set to partial for trailing slashes and
 *   for an empty component
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        char *cp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        /*
         * Search a new directory.
         *
         * The last component of the filename is left accessible via
         * cnp->cn_nameptr for callers that need the name. Callers needing
         * the name set the SAVENAME flag. When done, they assume
         * responsibility for freeing the pathname buffer.
         */
        for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
                continue;
        cnp->cn_namelen = cp - cnp->cn_nameptr;
        if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled(fpl, ENAMETOOLONG));
        }
        ndp->ni_pathlen -= cnp->cn_namelen;
        KASSERT(ndp->ni_pathlen <= PATH_MAX,
            ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
        /*
         * This value is what remains in place if the trailing slash case
         * below bails out.
         */
        ndp->ni_next = cp;

        /*
         * Replace multiple slashes by a single slash and trailing slashes
         * by a null.  This must be done before VOP_LOOKUP() because some
         * fs's don't know about trailing slashes.  Remember if there were
         * trailing slashes to handle symlinks, existing non-directories
         * and non-existing files that won't be directories specially later.
         */
        while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
                cp++;
                ndp->ni_pathlen--;
                if (*cp == '\0') {
                        /*
                         * TODO
                         * Regular lookup performs the following:
                         * *ndp->ni_next = '\0';
                         * cnp->cn_flags |= TRAILINGSLASH;
                         *
                         * Which is problematic since it modifies data read
                         * from userspace. Then if fast path lookup was to
                         * abort we would have to either restore it or convey
                         * the flag. Since this is a corner case just ignore
                         * it for simplicity.
                         */
                        return (cache_fpl_partial(fpl));
                }
        }
        ndp->ni_next = cp;

        /*
         * Check for degenerate name (e.g. / or "")
         * which is a way of talking about a directory,
         * e.g. like "/." or ".".
         *
         * TODO
         * Another corner case handled by the regular lookup
         */
        if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
                return (cache_fpl_partial(fpl));
        }
        return (0);
}
4154
4155 static void
4156 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4157 {
4158         struct nameidata *ndp;
4159         struct componentname *cnp;
4160
4161         ndp = fpl->ndp;
4162         cnp = fpl->cnp;
4163
4164         cnp->cn_nameptr = ndp->ni_next;
4165         while (*cnp->cn_nameptr == '/') {
4166                 cnp->cn_nameptr++;
4167                 ndp->ni_pathlen--;
4168         }
4169 }
4170
4171 static int __noinline
4172 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4173 {
4174
4175         switch (error) {
4176         case EAGAIN:
4177                 /*
4178                  * Can happen when racing against vgone.
4179                  * */
4180         case EOPNOTSUPP:
4181                 cache_fpl_partial(fpl);
4182                 break;
4183         default:
4184                 /*
4185                  * See the API contract for VOP_FPLOOKUP_VEXEC.
4186                  */
4187                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4188                         error = cache_fpl_aborted(fpl);
4189                 } else {
4190                         cache_fpl_smr_exit(fpl);
4191                         cache_fpl_handled(fpl, error);
4192                 }
4193                 break;
4194         }
4195         return (error);
4196 }
4197
/*
 * Main loop of the fast path lookup, starting at dvp.
 *
 * Each iteration parses one path component, checks permission to search
 * fpl->dvp with VOP_FPLOOKUP_VEXEC, resolves the component to fpl->tvp
 * (crossing mount points if needed) and, unless it was the last one,
 * validates dvp against its sequence counter and makes tvp the new dvp.
 * Every routine which returns non-zero is expected to have set fpl->status,
 * which decides how the function finishes once the loop is left.
 *
 * Returns:
 * - the recorded error (0 included) if the lookup was handled; on success
 *   ndp->ni_dvp and ndp->ni_vp are set from fpl->dvp and fpl->tvp, on error
 *   both are cleared
 * - the result of cache_fplookup_partial_setup() if the lookup is to be
 *   continued by the slow path
 * - CACHE_FPL_FAILED if the lookup was aborted
 */
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct mount *mp;
        int error;

        error = CACHE_FPL_FAILED;
        ndp = fpl->ndp;
        cnp = fpl->cnp;

        /*
         * Save the initial state in case the very first component has to
         * be handed over to the slow path.
         */
        cache_fpl_checkpoint(fpl, &fpl->snd);

        fpl->dvp = dvp;
        fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
        if (seqc_in_modify(fpl->dvp_seqc)) {
                cache_fpl_aborted(fpl);
                goto out;
        }
        /*
         * The filesystem of the starting directory has to support the fast
         * path.
         */
        mp = atomic_load_ptr(&fpl->dvp->v_mount);
        if (!cache_fplookup_mp_supported(mp)) {
                cache_fpl_aborted(fpl);
                goto out;
        }

        VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

        for (;;) {
                error = cache_fplookup_parse(fpl);
                if (__predict_false(error != 0)) {
                        break;
                }

                VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

                error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
                if (__predict_false(error != 0)) {
                        error = cache_fplookup_failed_vexec(fpl, error);
                        break;
                }

                /*
                 * Note mount points are only climbed for regular
                 * components.
                 */
                if (__predict_false(cache_fpl_isdotdot(cnp))) {
                        error = cache_fplookup_dotdot(fpl);
                        if (__predict_false(error != 0)) {
                                break;
                        }
                } else {
                        error = cache_fplookup_next(fpl);
                        if (__predict_false(error != 0)) {
                                break;
                        }

                        VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                        if (cache_fplookup_need_climb_mount(fpl)) {
                                error = cache_fplookup_climb_mount(fpl);
                                if (__predict_false(error != 0)) {
                                        break;
                                }
                        }
                }

                VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

                if (cache_fpl_islastcn(ndp)) {
                        error = cache_fplookup_final(fpl);
                        break;
                }

                /*
                 * Validate the jump from dvp to tvp before making it.
                 */
                if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
                        error = cache_fpl_aborted(fpl);
                        break;
                }

                fpl->dvp = fpl->tvp;
                fpl->dvp_seqc = fpl->tvp_seqc;

                /*
                 * Save the state at the start of the next component so
                 * that the slow path can pick up from here if needed.
                 */
                cache_fplookup_parse_advance(fpl);
                cache_fpl_checkpoint(fpl, &fpl->snd);
        }
out:
        switch (fpl->status) {
        case CACHE_FPL_STATUS_UNSET:
                /*
                 * Everything which gets here is expected to have set the
                 * status.
                 */
                __assert_unreachable();
                break;
        case CACHE_FPL_STATUS_PARTIAL:
                cache_fpl_smr_assert_entered(fpl);
                return (cache_fplookup_partial_setup(fpl));
        case CACHE_FPL_STATUS_ABORTED:
                /*
                 * Routines which abort may or may not have exited the SMR
                 * section.
                 */
                if (fpl->in_smr)
                        cache_fpl_smr_exit(fpl);
                return (CACHE_FPL_FAILED);
        case CACHE_FPL_STATUS_HANDLED:
                MPASS(error != CACHE_FPL_FAILED);
                cache_fpl_smr_assert_not_entered(fpl);
                if (__predict_false(error != 0)) {
                        ndp->ni_dvp = NULL;
                        ndp->ni_vp = NULL;
                        cache_fpl_cleanup_cnp(cnp);
                        return (error);
                }
                ndp->ni_dvp = fpl->dvp;
                ndp->ni_vp = fpl->tvp;
                /*
                 * cnp is only left as is if the caller asked for the name
                 * to be kept.
                 */
                if (cnp->cn_flags & SAVENAME)
                        cnp->cn_flags |= HASBUF;
                else
                        cache_fpl_cleanup_cnp(cnp);
                return (error);
        }
}
4309
4310 /*
4311  * Fast path lookup protected with SMR and sequence counters.
4312  *
4313  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4314  *
4315  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4316  * outlined below.
4317  *
4318  * Traditional vnode lookup conceptually looks like this:
4319  *
4320  * vn_lock(current);
4321  * for (;;) {
4322  *      next = find();
4323  *      vn_lock(next);
4324  *      vn_unlock(current);
4325  *      current = next;
4326  *      if (last)
4327  *          break;
4328  * }
4329  * return (current);
4330  *
4331  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4332  * any modifications thanks to holding respective locks.
4333  *
4334  * The same guarantee can be provided with a combination of safe memory
4335  * reclamation and sequence counters instead. If all operations which affect
4336  * the relationship between the current vnode and the one we are looking for
4337  * also modify the counter, we can verify whether all the conditions held as
4338  * we made the jump. This includes things like permissions, mount points etc.
4339  * Counter modification is provided by enclosing relevant places in
4340  * vn_seqc_write_begin()/end() calls.
4341  *
4342  * Thus this translates to:
4343  *
4344  * vfs_smr_enter();
4345  * dvp_seqc = seqc_read_any(dvp);
4346  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4347  *     abort();
4348  * for (;;) {
4349  *      tvp = find();
4350  *      tvp_seqc = seqc_read_any(tvp);
4351  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4352  *          abort();
4353  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4354  *          abort();
4355  *      dvp = tvp; // we know nothing of importance has changed
4356  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4357  *      if (last)
4358  *          break;
4359  * }
4360  * vget(); // secure the vnode
4361  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4362  *          abort();
4363  * // at this point we know nothing has changed for any parent<->child pair
4364  * // as they were crossed during the lookup, meaning we matched the guarantee
4365  * // of the locked variant
4366  * return (tvp);
4367  *
4368  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4369  * - they are called while within vfs_smr protection which they must never exit
4370  * - EAGAIN can be returned to denote checking could not be performed, it is
4371  *   always valid to return it
4372  * - if the sequence counter has not changed the result must be valid
4373  * - if the sequence counter has changed both false positives and false negatives
4374  *   are permitted (since the result will be rejected later)
4375  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4376  *
4377  * Caveats to watch out for:
4378  * - vnodes are passed unlocked and unreferenced with nothing stopping
4379  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4380  *   to use atomic_load_ptr to fetch it.
4381  * - the aforementioned object can also get freed, meaning absent other means it
4382  *   should be protected with vfs_smr
4383  * - either safely checking permissions as they are modified or guaranteeing
4384  *   their stability is left to the routine
4385  */
4386 int
4387 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4388     struct pwd **pwdp)
4389 {
4390         struct cache_fpl fpl;
4391         struct pwd *pwd;
4392         struct vnode *dvp;
4393         struct componentname *cnp;
4394         struct nameidata_saved orig;
4395         int error;
4396
4397         MPASS(ndp->ni_lcf == 0);
4398
4399         fpl.status = CACHE_FPL_STATUS_UNSET;
4400         fpl.ndp = ndp;
4401         fpl.cnp = &ndp->ni_cnd;
4402         MPASS(curthread == fpl.cnp->cn_thread);
4403
4404         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4405                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4406
4407         if (!cache_can_fplookup(&fpl)) {
4408                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4409                 *status = fpl.status;
4410                 return (EOPNOTSUPP);
4411         }
4412
4413         cache_fpl_checkpoint(&fpl, &orig);
4414
4415         cache_fpl_smr_enter_initial(&fpl);
4416         pwd = pwd_get_smr();
4417         fpl.pwd = pwd;
4418         ndp->ni_rootdir = pwd->pwd_rdir;
4419         ndp->ni_topdir = pwd->pwd_jdir;
4420
4421         cnp = fpl.cnp;
4422         cnp->cn_nameptr = cnp->cn_pnbuf;
4423         if (cnp->cn_pnbuf[0] == '/') {
4424                 cache_fpl_handle_root(ndp, &dvp);
4425         } else {
4426                 MPASS(ndp->ni_dirfd == AT_FDCWD);
4427                 dvp = pwd->pwd_cdir;
4428         }
4429
4430         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4431
4432         error = cache_fplookup_impl(dvp, &fpl);
4433         cache_fpl_smr_assert_not_entered(&fpl);
4434         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4435
4436         *status = fpl.status;
4437         switch (fpl.status) {
4438         case CACHE_FPL_STATUS_UNSET:
4439                 __assert_unreachable();
4440                 break;
4441         case CACHE_FPL_STATUS_HANDLED:
4442                 SDT_PROBE3(vfs, namei, lookup, return, error,
4443                     (error == 0 ? ndp->ni_vp : NULL), true);
4444                 break;
4445         case CACHE_FPL_STATUS_PARTIAL:
4446                 *pwdp = fpl.pwd;
4447                 /*
4448                  * Status restored by cache_fplookup_partial_setup.
4449                  */
4450                 break;
4451         case CACHE_FPL_STATUS_ABORTED:
4452                 cache_fpl_restore(&fpl, &orig);
4453                 break;
4454         }
4455         return (error);
4456 }