sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  83     "Name cache");
  84
  85 SDT_PROVIDER_DECLARE(vfs);
  86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  87     "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  89     "struct vnode *");
  90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  91     "char *");
  92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  93     "const char *");
  94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  95     "struct namecache *", "int", "int");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  98     "char *", "struct vnode *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 101     "struct vnode *", "char *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 105     "struct vnode *", "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 107     "char *");
 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 109     "struct componentname *");
 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 111     "struct componentname *");
 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 113 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
 114 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 115 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 116 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 117     "struct vnode *");
 118 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 119     "char *");
 120 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 121     "char *");
 122
 123 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 124 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 125 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 126
 127 /*
 128  * This structure describes the elements in the cache of recent
 129  * names looked up by namei.
 130  */
 131 struct negstate {
 132         u_char neg_flag;
 133         u_char neg_hit;
 134 };
 135 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 136     "the state must fit in a union with a pointer without growing it");
 137
 138 struct  namecache {
 139         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 140         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 141         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 142         struct  vnode *nc_dvp;          /* vnode of parent of name */
 143         union {
 144                 struct  vnode *nu_vp;   /* vnode the name refers to */
 145                 struct  negstate nu_neg;/* negative entry state */
 146         } n_un;
 147         u_char  nc_flag;                /* flag bits */
 148         u_char  nc_nlen;                /* length of name */
 149         char    nc_name[0];             /* segment name + nul */
 150 };
 151
 152 /*
 153  * struct namecache_ts repeats struct namecache layout up to the
 154  * nc_nlen member.
 155  * struct namecache_ts is used in place of struct namecache when time(s) need
 156  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 157  * both a non-dotdot directory name plus dotdot for the directory's
 158  * parent.
 159  *
 160  * See below for alignment requirement.
 161  */
 162 struct  namecache_ts {
 163         struct  timespec nc_time;       /* timespec provided by fs */
 164         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 165         int     nc_ticks;               /* ticks value when entry was added */
 166         int     nc_pad;
 167         struct namecache nc_nc;
 168 };
 169
 170 TAILQ_HEAD(cache_freebatch, namecache);
 171
 172 /*
 173  * At least mips n32 performs 64-bit accesses to timespec as found
 174  * in namecache_ts and requires them to be aligned. Since others
 175  * may be in the same spot suffer a little bit and enforce the
 176  * alignment for everyone. Note this is a nop for 64-bit platforms.
 177  */
 178 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 179
 180 /*
 181  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 182  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 183  * smaller and the value was bumped to retain the total size, but it
 184  * was never re-evaluated for suitability. A simple test counting
 185  * lengths during package building shows that the value of 45 covers
 186  * about 86% of all added entries, reaching 99% at 65.
 187  *
 188  * Regardless of the above, use of dedicated zones instead of malloc may be
 189  * inducing additional waste. This may be hard to address as said zones are
 190  * tied to VFS SMR. Even if retaining them, the current split should be
 191  * re-evaluated.
 192  */
 193 #ifdef __LP64__
 194 #define CACHE_PATH_CUTOFF       45
 195 #define CACHE_LARGE_PAD         6
 196 #else
 197 #define CACHE_PATH_CUTOFF       41
 198 #define CACHE_LARGE_PAD         2
 199 #endif
 200
 201 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 202 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 203 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 204 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 205
 206 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 207 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 208 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 209 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 210
 211 #define nc_vp           n_un.nu_vp
 212 #define nc_neg          n_un.nu_neg
 213
 214 /*
 215  * Flags in namecache.nc_flag
 216  */
 217 #define NCF_WHITE       0x01
 218 #define NCF_ISDOTDOT    0x02
 219 #define NCF_TS          0x04
 220 #define NCF_DTS         0x08
 221 #define NCF_DVDROP      0x10
 222 #define NCF_NEGATIVE    0x20
 223 #define NCF_INVALID     0x40
 224 #define NCF_WIP         0x80
 225
 226 /*
 227  * Flags in negstate.neg_flag
 228  */
 229 #define NEG_HOT         0x01
 230
 231 static bool     cache_neg_evict_cond(u_long lnumcache);
 232
 233 /*
 234  * Mark an entry as invalid.
 235  *
 236  * This is called before it starts getting deconstructed.
 237  */
 238 static void
 239 cache_ncp_invalidate(struct namecache *ncp)
 240 {
 241
 242         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 243             ("%s: entry %p already invalid", __func__, ncp));
 244         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 245         atomic_thread_fence_rel();
 246 }
 247
 248 /*
 249  * Check whether the entry can be safely used.
 250  *
 251  * All places which elide locks are supposed to call this after they are
 252  * done with reading from an entry.
 253  */
 254 #define cache_ncp_canuse(ncp)   ({                                      \
 255         struct namecache *_ncp = (ncp);                                 \
 256         u_char _nc_flag;                                                \
 257                                                                         \
 258         atomic_thread_fence_acq();                                      \
 259         _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
 260         __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
 261 })
 262
 263 /*
 264  * Name caching works as follows:
 265  *
 266  * Names found by directory scans are retained in a cache
 267  * for future reference.  It is managed LRU, so frequently
 268  * used names will hang around.  Cache is indexed by hash value
 269  * obtained from (dvp, name) where dvp refers to the directory
 270  * containing name.
 271  *
 272  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 273  * exist) the vnode pointer will be NULL.
 274  *
 275  * Upon reaching the last segment of a path, if the reference
 276  * is for DELETE, or NOCACHE is set (rewrite), and the
 277  * name is located in the cache, it will be dropped.
 278  *
 279  * These locks are used (in the order in which they can be taken):
 280  * NAME         TYPE    ROLE
 281  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 282  * bucketlock   mtx     for access to given set of hash buckets
 283  * neglist      mtx     negative entry LRU management
 284  *
 285  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 286  * order is lower address first. Both are recursive.
 287  *
 288  * "." lookups are lockless.
 289  *
 290  * ".." and vnode -> name lookups require vnodelock.
 291  *
 292  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 293  *
 294  * Insertions and removals of entries require involved vnodes and bucketlocks
 295  * to be locked to provide safe operation against other threads modifying the
 296  * cache.
 297  *
 298  * Some lookups result in removal of the found entry (e.g. getting rid of a
 299  * negative entry with the intent to create a positive one), which poses a
 300  * problem when multiple threads reach the state. Similarly, two different
 301  * threads can purge two different vnodes and try to remove the same name.
 302  *
 303  * If the already held vnode lock is lower than the second required lock, we
 304  * can just take the other lock. However, in the opposite case, this could
 305  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 306  * the first node, locking everything in order and revalidating the state.
 307  */
 308
 309 VFS_SMR_DECLARE;
 310
 311 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 312     "Name cache parameters");
 313
 314 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 316     "Total namecache capacity");
 317
 318 u_int ncsizefactor = 2;
 319 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 320     "Size factor for namecache");
 321
 322 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 323 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 324     "Ratio of negative namecache entries");
 325
 326 /*
 327  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 328  *
 329  * Check cache_neg_evict_cond for details.
 330  */
 331 static u_int ncnegminpct = 3;
 332
 333 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 334 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 335     "Negative entry count above which automatic eviction is allowed");
 336
 337 /*
 338  * Structures associated with name caching.
 339  */
 340 #define NCHHASH(hash) \
 341         (&nchashtbl[(hash) & nchash])
 342 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 343 static u_long __read_mostly     nchash;                 /* size of hash table */
 344 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 345     "Size of namecache hash table");
 346 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 347 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 348
 349 struct nchstats nchstats;               /* cache effectiveness statistics */
 350
 351 static bool __read_frequently cache_fast_revlookup = true;
 352 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 353     &cache_fast_revlookup, 0, "");
 354
 355 static u_int __exclusive_cache_line neg_cycle;
 356
 357 #define ncneghash       3
 358 #define numneglists     (ncneghash + 1)
 359
 360 struct neglist {
 361         struct mtx              nl_evict_lock;
 362         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 363         TAILQ_HEAD(, namecache) nl_list;
 364         TAILQ_HEAD(, namecache) nl_hotlist;
 365         u_long                  nl_hotnum;
 366 } __aligned(CACHE_LINE_SIZE);
 367
 368 static struct neglist neglists[numneglists];
 369
 370 static inline struct neglist *
 371 NCP2NEGLIST(struct namecache *ncp)
 372 {
 373
 374         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 375 }
 376
 377 static inline struct negstate *
 378 NCP2NEGSTATE(struct namecache *ncp)
 379 {
 380
 381         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 382         return (&ncp->nc_neg);
 383 }
 384
 385 #define numbucketlocks (ncbuckethash + 1)
 386 static u_int __read_mostly  ncbuckethash;
 387 static struct mtx_padalign __read_mostly  *bucketlocks;
 388 #define HASH2BUCKETLOCK(hash) \
 389         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 390
 391 #define numvnodelocks (ncvnodehash + 1)
 392 static u_int __read_mostly  ncvnodehash;
 393 static struct mtx __read_mostly *vnodelocks;
 394 static inline struct mtx *
 395 VP2VNODELOCK(struct vnode *vp)
 396 {
 397
 398         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 399 }
 400
 401 static void
 402 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 403 {
 404         struct namecache_ts *ncp_ts;
 405
 406         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 407             (tsp == NULL && ticksp == NULL),
 408             ("No NCF_TS"));
 409
 410         if (tsp == NULL)
 411                 return;
 412
 413         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 414         *tsp = ncp_ts->nc_time;
 415         *ticksp = ncp_ts->nc_ticks;
 416 }
 417
 418 #ifdef DEBUG_CACHE
 419 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 420 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 421     "VFS namecache enabled");
 422 #endif
 423
 424 /* Export size information to userland */
 425 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 426     sizeof(struct namecache), "sizeof(struct namecache)");
 427
 428 /*
 429  * The new name cache statistics
 430  */
 431 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 432     "Name cache statistics");
 433
 434 #define STATNODE_ULONG(name, varname, descr)                                    \
 435         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 436 #define STATNODE_COUNTER(name, varname, descr)                                  \
 437         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 438         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 439             descr);
 440 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 441 STATNODE_ULONG(count, numcache, "Number of cache entries");
 442 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 443 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 444 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
 445 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
 446 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 447 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 448 STATNODE_COUNTER(posszaps, numposzaps,
 449     "Number of cache hits (positive) we do not want to cache");
 450 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 451 STATNODE_COUNTER(negzaps, numnegzaps,
 452     "Number of cache hits (negative) we do not want to cache");
 453 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 454 /* These count for vn_getcwd(), too. */
 455 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 456 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 457 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 458     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 459 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 460 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 461
 462 /*
 463  * Debug or developer statistics.
 464  */
 465 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 466     "Name cache debugging");
 467 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 468         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 469 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 470         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 471         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 472             descr);
 473 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 474     "Number of successful removals after relocking");
 475 static long zap_bucket_fail;
 476 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 477 static long zap_bucket_fail2;
 478 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 479 static long cache_lock_vnodes_cel_3_failures;
 480 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 481     "Number of times 3-way vnode locking failed");
 482
 483 static void cache_zap_locked(struct namecache *ncp);
 484 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 485     char **freebuf, size_t *buflen);
 486 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 487     char **retbuf, size_t *buflen, size_t addend);
 488 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 489     char **retbuf, size_t *buflen);
 490 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 491     char **retbuf, size_t *len, size_t addend);
 492
 493 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 494
 495 static inline void
 496 cache_assert_vlp_locked(struct mtx *vlp)
 497 {
 498
 499         if (vlp != NULL)
 500                 mtx_assert(vlp, MA_OWNED);
 501 }
 502
 503 static inline void
 504 cache_assert_vnode_locked(struct vnode *vp)
 505 {
 506         struct mtx *vlp;
 507
 508         vlp = VP2VNODELOCK(vp);
 509         cache_assert_vlp_locked(vlp);
 510 }
 511
 512 /*
 513  * Directory vnodes with entries are held for two reasons:
 514  * 1. make them less of a target for reclamation in vnlru
 515  * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided
 516  *
 517  * Note this preferably would not be done and it's a hold over from. It will be
 518  * feasible to eliminate altogether if all filesystems start supporting
 519  * lockless lookup.
 520  */
 521 static void
 522 cache_hold_vnode(struct vnode *vp)
 523 {
 524
 525         cache_assert_vnode_locked(vp);
 526         VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 527         vhold(vp);
 528         counter_u64_add(numcachehv, 1);
 529 }
 530
 531 static void
 532 cache_drop_vnode(struct vnode *vp)
 533 {
 534
 535         /*
 536          * Called after all locks are dropped, meaning we can't assert
 537          * on the state of v_cache_src.
 538          */
 539         vdrop(vp);
 540         counter_u64_add(numcachehv, -1);
 541 }
 542
 543 /*
 544  * UMA zones.
 545  */
 546 static uma_zone_t __read_mostly cache_zone_small;
 547 static uma_zone_t __read_mostly cache_zone_small_ts;
 548 static uma_zone_t __read_mostly cache_zone_large;
 549 static uma_zone_t __read_mostly cache_zone_large_ts;
 550
 551 static struct namecache *
 552 cache_alloc_uma(int len, bool ts)
 553 {
 554         struct namecache_ts *ncp_ts;
 555         struct namecache *ncp;
 556
 557         if (__predict_false(ts)) {
 558                 if (len <= CACHE_PATH_CUTOFF)
 559                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 560                 else
 561                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 562                 ncp = &ncp_ts->nc_nc;
 563         } else {
 564                 if (len <= CACHE_PATH_CUTOFF)
 565                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 566                 else
 567                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 568         }
 569         return (ncp);
 570 }
 571
 572 static void
 573 cache_free_uma(struct namecache *ncp)
 574 {
 575         struct namecache_ts *ncp_ts;
 576
 577         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 578                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 579                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 580                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 581                 else
 582                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 583         } else {
 584                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 585                         uma_zfree_smr(cache_zone_small, ncp);
 586                 else
 587                         uma_zfree_smr(cache_zone_large, ncp);
 588         }
 589 }
 590
 591 static struct namecache *
 592 cache_alloc(int len, bool ts)
 593 {
 594         u_long lnumcache;
 595
 596         /*
 597          * Avoid blowout in namecache entries.
 598          *
 599          * Bugs:
 600          * 1. filesystems may end up trying to add an already existing entry
 601          * (for example this can happen after a cache miss during concurrent
 602          * lookup), in which case we will call cache_neg_evict despite not
 603          * adding anything.
 604          * 2. the routine may fail to free anything and no provisions are made
 605          * to make it try harder (see the inside for failure modes)
 606          * 3. it only ever looks at negative entries.
 607          */
 608         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 609         if (cache_neg_evict_cond(lnumcache)) {
 610                 lnumcache = atomic_load_long(&numcache);
 611         }
 612         if (__predict_false(lnumcache >= ncsize)) {
 613                 atomic_subtract_long(&numcache, 1);
 614                 counter_u64_add(numdrops, 1);
 615                 return (NULL);
 616         }
 617         return (cache_alloc_uma(len, ts));
 618 }
 619
 620 static void
 621 cache_free(struct namecache *ncp)
 622 {
 623
 624         MPASS(ncp != NULL);
 625         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 626                 cache_drop_vnode(ncp->nc_dvp);
 627         }
 628         cache_free_uma(ncp);
 629         atomic_subtract_long(&numcache, 1);
 630 }
 631
 632 static void
 633 cache_free_batch(struct cache_freebatch *batch)
 634 {
 635         struct namecache *ncp, *nnp;
 636         int i;
 637
 638         i = 0;
 639         if (TAILQ_EMPTY(batch))
 640                 goto out;
 641         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 642                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 643                         cache_drop_vnode(ncp->nc_dvp);
 644                 }
 645                 cache_free_uma(ncp);
 646                 i++;
 647         }
 648         atomic_subtract_long(&numcache, i);
 649 out:
 650         SDT_PROBE1(vfs, namecache, purge, batch, i);
 651 }
 652
 653 /*
 654  * TODO: With the value stored we can do better than computing the hash based
 655  * on the address. The choice of FNV should also be revisited.
 656  */
 657 static void
 658 cache_prehash(struct vnode *vp)
 659 {
 660
 661         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 662 }
 663
 664 static uint32_t
 665 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 666 {
 667
 668         return (fnv_32_buf(name, len, dvp->v_nchash));
 669 }
 670
 671 static inline struct nchashhead *
 672 NCP2BUCKET(struct namecache *ncp)
 673 {
 674         uint32_t hash;
 675
 676         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 677         return (NCHHASH(hash));
 678 }
 679
 680 static inline struct mtx *
 681 NCP2BUCKETLOCK(struct namecache *ncp)
 682 {
 683         uint32_t hash;
 684
 685         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 686         return (HASH2BUCKETLOCK(hash));
 687 }
 688
 689 #ifdef INVARIANTS
 690 static void
 691 cache_assert_bucket_locked(struct namecache *ncp)
 692 {
 693         struct mtx *blp;
 694
 695         blp = NCP2BUCKETLOCK(ncp);
 696         mtx_assert(blp, MA_OWNED);
 697 }
 698
 699 static void
 700 cache_assert_bucket_unlocked(struct namecache *ncp)
 701 {
 702         struct mtx *blp;
 703
 704         blp = NCP2BUCKETLOCK(ncp);
 705         mtx_assert(blp, MA_NOTOWNED);
 706 }
 707 #else
 708 #define cache_assert_bucket_locked(x) do { } while (0)
 709 #define cache_assert_bucket_unlocked(x) do { } while (0)
 710 #endif
 711
 712 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 713 static void
 714 _cache_sort_vnodes(void **p1, void **p2)
 715 {
 716         void *tmp;
 717
 718         MPASS(*p1 != NULL || *p2 != NULL);
 719
 720         if (*p1 > *p2) {
 721                 tmp = *p2;
 722                 *p2 = *p1;
 723                 *p1 = tmp;
 724         }
 725 }
 726
 727 static void
 728 cache_lock_all_buckets(void)
 729 {
 730         u_int i;
 731
 732         for (i = 0; i < numbucketlocks; i++)
 733                 mtx_lock(&bucketlocks[i]);
 734 }
 735
 736 static void
 737 cache_unlock_all_buckets(void)
 738 {
 739         u_int i;
 740
 741         for (i = 0; i < numbucketlocks; i++)
 742                 mtx_unlock(&bucketlocks[i]);
 743 }
 744
 745 static void
 746 cache_lock_all_vnodes(void)
 747 {
 748         u_int i;
 749
 750         for (i = 0; i < numvnodelocks; i++)
 751                 mtx_lock(&vnodelocks[i]);
 752 }
 753
 754 static void
 755 cache_unlock_all_vnodes(void)
 756 {
 757         u_int i;
 758
 759         for (i = 0; i < numvnodelocks; i++)
 760                 mtx_unlock(&vnodelocks[i]);
 761 }
 762
 763 static int
 764 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 765 {
 766
 767         cache_sort_vnodes(&vlp1, &vlp2);
 768
 769         if (vlp1 != NULL) {
 770                 if (!mtx_trylock(vlp1))
 771                         return (EAGAIN);
 772         }
 773         if (!mtx_trylock(vlp2)) {
 774                 if (vlp1 != NULL)
 775                         mtx_unlock(vlp1);
 776                 return (EAGAIN);
 777         }
 778
 779         return (0);
 780 }
 781
 782 static void
 783 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 784 {
 785
 786         MPASS(vlp1 != NULL || vlp2 != NULL);
 787         MPASS(vlp1 <= vlp2);
 788
 789         if (vlp1 != NULL)
 790                 mtx_lock(vlp1);
 791         if (vlp2 != NULL)
 792                 mtx_lock(vlp2);
 793 }
 794
 795 static void
 796 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 797 {
 798
 799         MPASS(vlp1 != NULL || vlp2 != NULL);
 800
 801         if (vlp1 != NULL)
 802                 mtx_unlock(vlp1);
 803         if (vlp2 != NULL)
 804                 mtx_unlock(vlp2);
 805 }
 806
 807 static int
 808 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 809 {
 810         struct nchstats snap;
 811
 812         if (req->oldptr == NULL)
 813                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 814
 815         snap = nchstats;
 816         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 817         snap.ncs_neghits = counter_u64_fetch(numneghits);
 818         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 819             counter_u64_fetch(numnegzaps);
 820         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 821             counter_u64_fetch(nummiss);
 822
 823         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 824 }
 825 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 826     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 827     "VFS cache effectiveness statistics");
 828
 829 static void
 830 cache_recalc_neg_min(u_int val)
 831 {
 832
 833         neg_min = (ncsize * val) / 100;
 834 }
 835
 836 static int
 837 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 838 {
 839         u_int val;
 840         int error;
 841
 842         val = ncnegminpct;
 843         error = sysctl_handle_int(oidp, &val, 0, req);
 844         if (error != 0 || req->newptr == NULL)
 845                 return (error);
 846
 847         if (val == ncnegminpct)
 848                 return (0);
 849         if (val < 0 || val > 99)
 850                 return (EINVAL);
 851         ncnegminpct = val;
 852         cache_recalc_neg_min(val);
 853         return (0);
 854 }
 855
 856 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 857     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 858     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
 859
 860 #ifdef DIAGNOSTIC
 861 /*
 862  * Grab an atomic snapshot of the name cache hash chain lengths
 863  */
 864 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 865     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 866     "hash table stats");
 867
 868 static int
 869 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 870 {
 871         struct nchashhead *ncpp;
 872         struct namecache *ncp;
 873         int i, error, n_nchash, *cntbuf;
 874
 875 retry:
 876         n_nchash = nchash + 1;  /* nchash is max index, not count */
 877         if (req->oldptr == NULL)
 878                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 879         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 880         cache_lock_all_buckets();
 881         if (n_nchash != nchash + 1) {
 882                 cache_unlock_all_buckets();
 883                 free(cntbuf, M_TEMP);
 884                 goto retry;
 885         }
 886         /* Scan hash tables counting entries */
 887         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 888                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 889                         cntbuf[i]++;
 890         cache_unlock_all_buckets();
 891         for (error = 0, i = 0; i < n_nchash; i++)
 892                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 893                         break;
 894         free(cntbuf, M_TEMP);
 895         return (error);
 896 }
 897 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 898     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 899     "nchash chain lengths");
 900
 901 static int
 902 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 903 {
 904         int error;
 905         struct nchashhead *ncpp;
 906         struct namecache *ncp;
 907         int n_nchash;
 908         int count, maxlength, used, pct;
 909
 910         if (!req->oldptr)
 911                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 912
 913         cache_lock_all_buckets();
 914         n_nchash = nchash + 1;  /* nchash is max index, not count */
 915         used = 0;
 916         maxlength = 0;
 917
 918         /* Scan hash tables for applicable entries */
 919         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 920                 count = 0;
 921                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 922                         count++;
 923                 }
 924                 if (count)
 925                         used++;
 926                 if (maxlength < count)
 927                         maxlength = count;
 928         }
 929         n_nchash = nchash + 1;
 930         cache_unlock_all_buckets();
 931         pct = (used * 100) / (n_nchash / 100);
 932         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 933         if (error)
 934                 return (error);
 935         error = SYSCTL_OUT(req, &used, sizeof(used));
 936         if (error)
 937                 return (error);
 938         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 939         if (error)
 940                 return (error);
 941         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 942         if (error)
 943                 return (error);
 944         return (0);
 945 }
 946 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 947     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 948     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 949 #endif
 950
 951 /*
 952  * Negative entries management
 953  *
 954  * Various workloads create plenty of negative entries and barely use them
 955  * afterwards. Moreover malicious users can keep performing bogus lookups
 956  * adding even more entries. For example "make tinderbox" as of writing this
 957  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 958  * negative.
 959  *
 960  * As such, a rather aggressive eviction method is needed. The currently
 961  * employed method is a placeholder.
 962  *
 963  * Entries are split over numneglists separate lists, each of which is further
 964  * split into hot and cold entries. Entries get promoted after getting a hit.
 965  * Eviction happens on addition of new entry.
 966  */
 967 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 968     "Name cache negative entry statistics");
 969
 970 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 971     "Number of negative cache entries");
 972
 973 static COUNTER_U64_DEFINE_EARLY(neg_created);
 974 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 975     "Number of created negative entries");
 976
 977 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 978 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 979     "Number of evicted negative entries");
 980
 981 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 982 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 983     &neg_evict_skipped_empty,
 984     "Number of times evicting failed due to lack of entries");
 985
 986 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 987 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 988     &neg_evict_skipped_missed,
 989     "Number of times evicting failed due to target entry disappearing");
 990
 991 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 992 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 993     &neg_evict_skipped_contended,
 994     "Number of times evicting failed due to contention");
 995
 996 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 997     "Number of cache hits (negative)");
 998
 999 static int
1000 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1001 {
1002         int i, out;
1003
1004         out = 0;
1005         for (i = 0; i < numneglists; i++)
1006                 out += neglists[i].nl_hotnum;
1007
1008         return (SYSCTL_OUT(req, &out, sizeof(out)));
1009 }
1010 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1011     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1012     "Number of hot negative entries");
1013
1014 static void
1015 cache_neg_init(struct namecache *ncp)
1016 {
1017         struct negstate *ns;
1018
1019         ncp->nc_flag |= NCF_NEGATIVE;
1020         ns = NCP2NEGSTATE(ncp);
1021         ns->neg_flag = 0;
1022         ns->neg_hit = 0;
1023         counter_u64_add(neg_created, 1);
1024 }
1025
1026 #define CACHE_NEG_PROMOTION_THRESH 2
1027
1028 static bool
1029 cache_neg_hit_prep(struct namecache *ncp)
1030 {
1031         struct negstate *ns;
1032         u_char n;
1033
1034         ns = NCP2NEGSTATE(ncp);
1035         n = atomic_load_char(&ns->neg_hit);
1036         for (;;) {
1037                 if (n >= CACHE_NEG_PROMOTION_THRESH)
1038                         return (false);
1039                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1040                         break;
1041         }
1042         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1043 }
1044
1045 /*
1046  * Nothing to do here but it is provided for completeness as some
1047  * cache_neg_hit_prep callers may end up returning without even
1048  * trying to promote.
1049  */
1050 #define cache_neg_hit_abort(ncp)        do { } while (0)
1051
1052 static void
1053 cache_neg_hit_finish(struct namecache *ncp)
1054 {
1055
1056         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1057         counter_u64_add(numneghits, 1);
1058 }
1059
1060 /*
1061  * Move a negative entry to the hot list.
1062  */
1063 static void
1064 cache_neg_promote_locked(struct namecache *ncp)
1065 {
1066         struct neglist *nl;
1067         struct negstate *ns;
1068
1069         ns = NCP2NEGSTATE(ncp);
1070         nl = NCP2NEGLIST(ncp);
1071         mtx_assert(&nl->nl_lock, MA_OWNED);
1072         if ((ns->neg_flag & NEG_HOT) == 0) {
1073                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1074                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1075                 nl->nl_hotnum++;
1076                 ns->neg_flag |= NEG_HOT;
1077         }
1078 }
1079
1080 /*
1081  * Move a hot negative entry to the cold list.
1082  */
1083 static void
1084 cache_neg_demote_locked(struct namecache *ncp)
1085 {
1086         struct neglist *nl;
1087         struct negstate *ns;
1088
1089         ns = NCP2NEGSTATE(ncp);
1090         nl = NCP2NEGLIST(ncp);
1091         mtx_assert(&nl->nl_lock, MA_OWNED);
1092         MPASS(ns->neg_flag & NEG_HOT);
1093         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1094         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1095         nl->nl_hotnum--;
1096         ns->neg_flag &= ~NEG_HOT;
1097         atomic_store_char(&ns->neg_hit, 0);
1098 }
1099
1100 /*
1101  * Move a negative entry to the hot list if it matches the lookup.
1102  *
1103  * We have to take locks, but they may be contended and in the worst
1104  * case we may need to go off CPU. We don't want to spin within the
1105  * smr section and we can't block with it. Exiting the section means
1106  * the found entry could have been evicted. We are going to look it
1107  * up again.
1108  */
1109 static bool
1110 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1111     struct namecache *oncp, uint32_t hash)
1112 {
1113         struct namecache *ncp;
1114         struct neglist *nl;
1115         u_char nc_flag;
1116
1117         nl = NCP2NEGLIST(oncp);
1118
1119         mtx_lock(&nl->nl_lock);
1120         /*
1121          * For hash iteration.
1122          */
1123         vfs_smr_enter();
1124
1125         /*
1126          * Avoid all surprises by only succeeding if we got the same entry and
1127          * bailing completely otherwise.
1128          * XXX There are no provisions to keep the vnode around, meaning we may
1129          * end up promoting a negative entry for a *new* vnode and returning
1130          * ENOENT on its account. This is the error we want to return anyway
1131          * and promotion is harmless.
1132          *
1133          * In particular at this point there can be a new ncp which matches the
1134          * search but hashes to a different neglist.
1135          */
1136         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1137                 if (ncp == oncp)
1138                         break;
1139         }
1140
1141         /*
1142          * No match to begin with.
1143          */
1144         if (__predict_false(ncp == NULL)) {
1145                 goto out_abort;
1146         }
1147
1148         /*
1149          * The newly found entry may be something different...
1150          */
1151         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1152             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1153                 goto out_abort;
1154         }
1155
1156         /*
1157          * ... and not even negative.
1158          */
1159         nc_flag = atomic_load_char(&ncp->nc_flag);
1160         if ((nc_flag & NCF_NEGATIVE) == 0) {
1161                 goto out_abort;
1162         }
1163
1164         if (!cache_ncp_canuse(ncp)) {
1165                 goto out_abort;
1166         }
1167
1168         cache_neg_promote_locked(ncp);
1169         cache_neg_hit_finish(ncp);
1170         vfs_smr_exit();
1171         mtx_unlock(&nl->nl_lock);
1172         return (true);
1173 out_abort:
1174         vfs_smr_exit();
1175         mtx_unlock(&nl->nl_lock);
1176         return (false);
1177 }
1178
1179 static void
1180 cache_neg_promote(struct namecache *ncp)
1181 {
1182         struct neglist *nl;
1183
1184         nl = NCP2NEGLIST(ncp);
1185         mtx_lock(&nl->nl_lock);
1186         cache_neg_promote_locked(ncp);
1187         mtx_unlock(&nl->nl_lock);
1188 }
1189
1190 static void
1191 cache_neg_insert(struct namecache *ncp)
1192 {
1193         struct neglist *nl;
1194
1195         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1196         cache_assert_bucket_locked(ncp);
1197         nl = NCP2NEGLIST(ncp);
1198         mtx_lock(&nl->nl_lock);
1199         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1200         mtx_unlock(&nl->nl_lock);
1201         atomic_add_long(&numneg, 1);
1202 }
1203
1204 static void
1205 cache_neg_remove(struct namecache *ncp)
1206 {
1207         struct neglist *nl;
1208         struct negstate *ns;
1209
1210         cache_assert_bucket_locked(ncp);
1211         nl = NCP2NEGLIST(ncp);
1212         ns = NCP2NEGSTATE(ncp);
1213         mtx_lock(&nl->nl_lock);
1214         if ((ns->neg_flag & NEG_HOT) != 0) {
1215                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1216                 nl->nl_hotnum--;
1217         } else {
1218                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1219         }
1220         mtx_unlock(&nl->nl_lock);
1221         atomic_subtract_long(&numneg, 1);
1222 }
1223
1224 static struct neglist *
1225 cache_neg_evict_select_list(void)
1226 {
1227         struct neglist *nl;
1228         u_int c;
1229
1230         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1231         nl = &neglists[c % numneglists];
1232         if (!mtx_trylock(&nl->nl_evict_lock)) {
1233                 counter_u64_add(neg_evict_skipped_contended, 1);
1234                 return (NULL);
1235         }
1236         return (nl);
1237 }
1238
1239 static struct namecache *
1240 cache_neg_evict_select_entry(struct neglist *nl)
1241 {
1242         struct namecache *ncp, *lncp;
1243         struct negstate *ns, *lns;
1244         int i;
1245
1246         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1247         mtx_assert(&nl->nl_lock, MA_OWNED);
1248         ncp = TAILQ_FIRST(&nl->nl_list);
1249         if (ncp == NULL)
1250                 return (NULL);
1251         lncp = ncp;
1252         lns = NCP2NEGSTATE(lncp);
1253         for (i = 1; i < 4; i++) {
1254                 ncp = TAILQ_NEXT(ncp, nc_dst);
1255                 if (ncp == NULL)
1256                         break;
1257                 ns = NCP2NEGSTATE(ncp);
1258                 if (ns->neg_hit < lns->neg_hit) {
1259                         lncp = ncp;
1260                         lns = ns;
1261                 }
1262         }
1263         return (lncp);
1264 }
1265
1266 static bool
1267 cache_neg_evict(void)
1268 {
1269         struct namecache *ncp, *ncp2;
1270         struct neglist *nl;
1271         struct vnode *dvp;
1272         struct mtx *dvlp;
1273         struct mtx *blp;
1274         uint32_t hash;
1275         u_char nlen;
1276         bool evicted;
1277
1278         nl = cache_neg_evict_select_list();
1279         if (nl == NULL) {
1280                 return (false);
1281         }
1282
1283         mtx_lock(&nl->nl_lock);
1284         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1285         if (ncp != NULL) {
1286                 cache_neg_demote_locked(ncp);
1287         }
1288         ncp = cache_neg_evict_select_entry(nl);
1289         if (ncp == NULL) {
1290                 counter_u64_add(neg_evict_skipped_empty, 1);
1291                 mtx_unlock(&nl->nl_lock);
1292                 mtx_unlock(&nl->nl_evict_lock);
1293                 return (false);
1294         }
1295         nlen = ncp->nc_nlen;
1296         dvp = ncp->nc_dvp;
1297         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1298         dvlp = VP2VNODELOCK(dvp);
1299         blp = HASH2BUCKETLOCK(hash);
1300         mtx_unlock(&nl->nl_lock);
1301         mtx_unlock(&nl->nl_evict_lock);
1302         mtx_lock(dvlp);
1303         mtx_lock(blp);
1304         /*
1305          * Note that since all locks were dropped above, the entry may be
1306          * gone or reallocated to be something else.
1307          */
1308         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1309                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1310                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1311                         break;
1312         }
1313         if (ncp2 == NULL) {
1314                 counter_u64_add(neg_evict_skipped_missed, 1);
1315                 ncp = NULL;
1316                 evicted = false;
1317         } else {
1318                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1319                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1320                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1321                     ncp->nc_name);
1322                 cache_zap_locked(ncp);
1323                 counter_u64_add(neg_evicted, 1);
1324                 evicted = true;
1325         }
1326         mtx_unlock(blp);
1327         mtx_unlock(dvlp);
1328         if (ncp != NULL)
1329                 cache_free(ncp);
1330         return (evicted);
1331 }
1332
1333 /*
1334  * Maybe evict a negative entry to create more room.
1335  *
1336  * The ncnegfactor parameter limits what fraction of the total count
1337  * can comprise of negative entries. However, if the cache is just
1338  * warming up this leads to excessive evictions.  As such, ncnegminpct
1339  * (recomputed to neg_min) dictates whether the above should be
1340  * applied.
1341  *
1342  * Try evicting if the cache is close to full capacity regardless of
1343  * other considerations.
1344  */
1345 static bool
1346 cache_neg_evict_cond(u_long lnumcache)
1347 {
1348         u_long lnumneg;
1349
1350         if (ncsize - 1000 < lnumcache)
1351                 goto out_evict;
1352         lnumneg = atomic_load_long(&numneg);
1353         if (lnumneg < neg_min)
1354                 return (false);
1355         if (lnumneg * ncnegfactor < lnumcache)
1356                 return (false);
1357 out_evict:
1358         return (cache_neg_evict());
1359 }
1360
1361 /*
1362  * cache_zap_locked():
1363  *
1364  *   Removes a namecache entry from cache, whether it contains an actual
1365  *   pointer to a vnode or if it is just a negative cache entry.
1366  */
1367 static void
1368 cache_zap_locked(struct namecache *ncp)
1369 {
1370         struct nchashhead *ncpp;
1371
1372         if (!(ncp->nc_flag & NCF_NEGATIVE))
1373                 cache_assert_vnode_locked(ncp->nc_vp);
1374         cache_assert_vnode_locked(ncp->nc_dvp);
1375         cache_assert_bucket_locked(ncp);
1376
1377         cache_ncp_invalidate(ncp);
1378
1379         ncpp = NCP2BUCKET(ncp);
1380         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1381         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1382                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1383                     ncp->nc_name, ncp->nc_vp);
1384                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1385                 if (ncp == ncp->nc_vp->v_cache_dd) {
1386                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1387                         ncp->nc_vp->v_cache_dd = NULL;
1388                         vn_seqc_write_end(ncp->nc_vp);
1389                 }
1390         } else {
1391                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1392                     ncp->nc_name);
1393                 cache_neg_remove(ncp);
1394         }
1395         if (ncp->nc_flag & NCF_ISDOTDOT) {
1396                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1397                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1398                         ncp->nc_dvp->v_cache_dd = NULL;
1399                         vn_seqc_write_end(ncp->nc_dvp);
1400                 }
1401         } else {
1402                 LIST_REMOVE(ncp, nc_src);
1403                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1404                         ncp->nc_flag |= NCF_DVDROP;
1405                 }
1406         }
1407 }
1408
1409 static void
1410 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1411 {
1412         struct mtx *blp;
1413
1414         MPASS(ncp->nc_dvp == vp);
1415         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1416         cache_assert_vnode_locked(vp);
1417
1418         blp = NCP2BUCKETLOCK(ncp);
1419         mtx_lock(blp);
1420         cache_zap_locked(ncp);
1421         mtx_unlock(blp);
1422 }
1423
1424 static bool
1425 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1426     struct mtx **vlpp)
1427 {
1428         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1429         struct mtx *blp;
1430
1431         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1432         cache_assert_vnode_locked(vp);
1433
1434         if (ncp->nc_flag & NCF_NEGATIVE) {
1435                 if (*vlpp != NULL) {
1436                         mtx_unlock(*vlpp);
1437                         *vlpp = NULL;
1438                 }
1439                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1440                 return (true);
1441         }
1442
1443         pvlp = VP2VNODELOCK(vp);
1444         blp = NCP2BUCKETLOCK(ncp);
1445         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1446         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1447
1448         if (*vlpp == vlp1 || *vlpp == vlp2) {
1449                 to_unlock = *vlpp;
1450                 *vlpp = NULL;
1451         } else {
1452                 if (*vlpp != NULL) {
1453                         mtx_unlock(*vlpp);
1454                         *vlpp = NULL;
1455                 }
1456                 cache_sort_vnodes(&vlp1, &vlp2);
1457                 if (vlp1 == pvlp) {
1458                         mtx_lock(vlp2);
1459                         to_unlock = vlp2;
1460                 } else {
1461                         if (!mtx_trylock(vlp1))
1462                                 goto out_relock;
1463                         to_unlock = vlp1;
1464                 }
1465         }
1466         mtx_lock(blp);
1467         cache_zap_locked(ncp);
1468         mtx_unlock(blp);
1469         if (to_unlock != NULL)
1470                 mtx_unlock(to_unlock);
1471         return (true);
1472
1473 out_relock:
1474         mtx_unlock(vlp2);
1475         mtx_lock(vlp1);
1476         mtx_lock(vlp2);
1477         MPASS(*vlpp == NULL);
1478         *vlpp = vlp1;
1479         return (false);
1480 }
1481
1482 /*
1483  * If trylocking failed we can get here. We know enough to take all needed locks
1484  * in the right order and re-lookup the entry.
1485  */
1486 static int
1487 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1488     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1489     struct mtx *blp)
1490 {
1491         struct namecache *rncp;
1492
1493         cache_assert_bucket_unlocked(ncp);
1494
1495         cache_sort_vnodes(&dvlp, &vlp);
1496         cache_lock_vnodes(dvlp, vlp);
1497         mtx_lock(blp);
1498         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1499                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1500                     rncp->nc_nlen == cnp->cn_namelen &&
1501                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1502                         break;
1503         }
1504         if (rncp != NULL) {
1505                 cache_zap_locked(rncp);
1506                 mtx_unlock(blp);
1507                 cache_unlock_vnodes(dvlp, vlp);
1508                 counter_u64_add(zap_bucket_relock_success, 1);
1509                 return (0);
1510         }
1511
1512         mtx_unlock(blp);
1513         cache_unlock_vnodes(dvlp, vlp);
1514         return (EAGAIN);
1515 }
1516
1517 static int __noinline
1518 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1519     uint32_t hash, struct mtx *blp)
1520 {
1521         struct mtx *dvlp, *vlp;
1522         struct vnode *dvp;
1523
1524         cache_assert_bucket_locked(ncp);
1525
1526         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1527         vlp = NULL;
1528         if (!(ncp->nc_flag & NCF_NEGATIVE))
1529                 vlp = VP2VNODELOCK(ncp->nc_vp);
1530         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1531                 cache_zap_locked(ncp);
1532                 mtx_unlock(blp);
1533                 cache_unlock_vnodes(dvlp, vlp);
1534                 return (0);
1535         }
1536
1537         dvp = ncp->nc_dvp;
1538         mtx_unlock(blp);
1539         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1540 }
1541
1542 static __noinline int
1543 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1544 {
1545         struct namecache *ncp;
1546         struct mtx *blp;
1547         struct mtx *dvlp, *dvlp2;
1548         uint32_t hash;
1549         int error;
1550
1551         if (cnp->cn_namelen == 2 &&
1552             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1553                 dvlp = VP2VNODELOCK(dvp);
1554                 dvlp2 = NULL;
1555                 mtx_lock(dvlp);
1556 retry_dotdot:
1557                 ncp = dvp->v_cache_dd;
1558                 if (ncp == NULL) {
1559                         mtx_unlock(dvlp);
1560                         if (dvlp2 != NULL)
1561                                 mtx_unlock(dvlp2);
1562                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1563                         return (0);
1564                 }
1565                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1566                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1567                                 goto retry_dotdot;
1568                         MPASS(dvp->v_cache_dd == NULL);
1569                         mtx_unlock(dvlp);
1570                         if (dvlp2 != NULL)
1571                                 mtx_unlock(dvlp2);
1572                         cache_free(ncp);
1573                 } else {
1574                         vn_seqc_write_begin(dvp);
1575                         dvp->v_cache_dd = NULL;
1576                         vn_seqc_write_end(dvp);
1577                         mtx_unlock(dvlp);
1578                         if (dvlp2 != NULL)
1579                                 mtx_unlock(dvlp2);
1580                 }
1581                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1582                 return (1);
1583         }
1584
1585         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1586         blp = HASH2BUCKETLOCK(hash);
1587 retry:
1588         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1589                 goto out_no_entry;
1590
1591         mtx_lock(blp);
1592
1593         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1594                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1595                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1596                         break;
1597         }
1598
1599         if (ncp == NULL) {
1600                 mtx_unlock(blp);
1601                 goto out_no_entry;
1602         }
1603
1604         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1605         if (__predict_false(error != 0)) {
1606                 zap_bucket_fail++;
1607                 goto retry;
1608         }
1609         counter_u64_add(numposzaps, 1);
1610         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1611         cache_free(ncp);
1612         return (1);
1613 out_no_entry:
1614         counter_u64_add(nummisszap, 1);
1615         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1616         return (0);
1617 }
1618
1619 static int __noinline
1620 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1621     struct timespec *tsp, int *ticksp)
1622 {
1623         int ltype;
1624
1625         *vpp = dvp;
1626         counter_u64_add(dothits, 1);
1627         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1628         if (tsp != NULL)
1629                 timespecclear(tsp);
1630         if (ticksp != NULL)
1631                 *ticksp = ticks;
1632         vrefact(*vpp);
1633         /*
1634          * When we lookup "." we still can be asked to lock it
1635          * differently...
1636          */
1637         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1638         if (ltype != VOP_ISLOCKED(*vpp)) {
1639                 if (ltype == LK_EXCLUSIVE) {
1640                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1641                         if (VN_IS_DOOMED((*vpp))) {
1642                                 /* forced unmount */
1643                                 vrele(*vpp);
1644                                 *vpp = NULL;
1645                                 return (ENOENT);
1646                         }
1647                 } else
1648                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1649         }
1650         return (-1);
1651 }
1652
1653 static int __noinline
1654 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1655     struct timespec *tsp, int *ticksp)
1656 {
1657         struct namecache_ts *ncp_ts;
1658         struct namecache *ncp;
1659         struct mtx *dvlp;
1660         enum vgetstate vs;
1661         int error, ltype;
1662         bool whiteout;
1663
1664         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1665
1666         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1667                 cache_remove_cnp(dvp, cnp);
1668                 return (0);
1669         }
1670
1671         counter_u64_add(dotdothits, 1);
1672 retry:
1673         dvlp = VP2VNODELOCK(dvp);
1674         mtx_lock(dvlp);
1675         ncp = dvp->v_cache_dd;
1676         if (ncp == NULL) {
1677                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1678                 mtx_unlock(dvlp);
1679                 return (0);
1680         }
1681         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1682                 if (ncp->nc_flag & NCF_NEGATIVE)
1683                         *vpp = NULL;
1684                 else
1685                         *vpp = ncp->nc_vp;
1686         } else
1687                 *vpp = ncp->nc_dvp;
1688         if (*vpp == NULL)
1689                 goto negative_success;
1690         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1691         cache_out_ts(ncp, tsp, ticksp);
1692         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1693             NCF_DTS && tsp != NULL) {
1694                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1695                 *tsp = ncp_ts->nc_dotdottime;
1696         }
1697
1698         MPASS(dvp != *vpp);
1699         ltype = VOP_ISLOCKED(dvp);
1700         VOP_UNLOCK(dvp);
1701         vs = vget_prep(*vpp);
1702         mtx_unlock(dvlp);
1703         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1704         vn_lock(dvp, ltype | LK_RETRY);
1705         if (VN_IS_DOOMED(dvp)) {
1706                 if (error == 0)
1707                         vput(*vpp);
1708                 *vpp = NULL;
1709                 return (ENOENT);
1710         }
1711         if (error) {
1712                 *vpp = NULL;
1713                 goto retry;
1714         }
1715         return (-1);
1716 negative_success:
1717         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1718                 if (cnp->cn_flags & ISLASTCN) {
1719                         counter_u64_add(numnegzaps, 1);
1720                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1721                         mtx_unlock(dvlp);
1722                         cache_free(ncp);
1723                         return (0);
1724                 }
1725         }
1726
1727         whiteout = (ncp->nc_flag & NCF_WHITE);
1728         cache_out_ts(ncp, tsp, ticksp);
1729         if (cache_neg_hit_prep(ncp))
1730                 cache_neg_promote(ncp);
1731         else
1732                 cache_neg_hit_finish(ncp);
1733         mtx_unlock(dvlp);
1734         if (whiteout)
1735                 cnp->cn_flags |= ISWHITEOUT;
1736         return (ENOENT);
1737 }
1738
1739 /**
1740  * Lookup a name in the name cache
1741  *
1742  * # Arguments
1743  *
1744  * - dvp:       Parent directory in which to search.
1745  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1746  * - cnp:       Parameters of the name search.  The most interesting bits of
1747  *              the cn_flags field have the following meanings:
1748  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1749  *                      it up.
1750  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1751  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1752  *              or negative) lookup, tsp will be filled with any timespec that
1753  *              was stored when this cache entry was created.  However, it will
1754  *              be clear for "." entries.
1755  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1756  *              (positive or negative) lookup, it will contain the ticks value
1757  *              that was current when the cache entry was created, unless cnp
1758  *              was ".".
1759  *
1760  * Either both tsp and ticks have to be provided or neither of them.
1761  *
1762  * # Returns
1763  *
1764  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1765  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1766  *              to a forced unmount.  vpp will not be modified.  If the entry
1767  *              is a whiteout, then the ISWHITEOUT flag will be set in
1768  *              cnp->cn_flags.
1769  * - 0:         A cache miss.  vpp will not be modified.
1770  *
1771  * # Locking
1772  *
1773  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1774  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1775  * lock is not recursively acquired.
1776  */
1777 static int __noinline
1778 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1779     struct timespec *tsp, int *ticksp)
1780 {
1781         struct namecache *ncp;
1782         struct mtx *blp;
1783         uint32_t hash;
1784         enum vgetstate vs;
1785         int error;
1786         bool whiteout;
1787
1788         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1789         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1790
1791 retry:
1792         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1793         blp = HASH2BUCKETLOCK(hash);
1794         mtx_lock(blp);
1795
1796         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1797                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1798                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1799                         break;
1800         }
1801
1802         if (__predict_false(ncp == NULL)) {
1803                 mtx_unlock(blp);
1804                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1805                     NULL);
1806                 counter_u64_add(nummiss, 1);
1807                 return (0);
1808         }
1809
1810         if (ncp->nc_flag & NCF_NEGATIVE)
1811                 goto negative_success;
1812
1813         counter_u64_add(numposhits, 1);
1814         *vpp = ncp->nc_vp;
1815         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1816         cache_out_ts(ncp, tsp, ticksp);
1817         MPASS(dvp != *vpp);
1818         vs = vget_prep(*vpp);
1819         mtx_unlock(blp);
1820         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1821         if (error) {
1822                 *vpp = NULL;
1823                 goto retry;
1824         }
1825         return (-1);
1826 negative_success:
1827         /*
1828          * We don't get here with regular lookup apart from corner cases.
1829          */
1830         if (__predict_true(cnp->cn_nameiop == CREATE)) {
1831                 if (cnp->cn_flags & ISLASTCN) {
1832                         counter_u64_add(numnegzaps, 1);
1833                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1834                         if (__predict_false(error != 0)) {
1835                                 zap_bucket_fail2++;
1836                                 goto retry;
1837                         }
1838                         cache_free(ncp);
1839                         return (0);
1840                 }
1841         }
1842
1843         whiteout = (ncp->nc_flag & NCF_WHITE);
1844         cache_out_ts(ncp, tsp, ticksp);
1845         if (cache_neg_hit_prep(ncp))
1846                 cache_neg_promote(ncp);
1847         else
1848                 cache_neg_hit_finish(ncp);
1849         mtx_unlock(blp);
1850         if (whiteout)
1851                 cnp->cn_flags |= ISWHITEOUT;
1852         return (ENOENT);
1853 }
1854
1855 int
1856 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1857     struct timespec *tsp, int *ticksp)
1858 {
1859         struct namecache *ncp;
1860         uint32_t hash;
1861         enum vgetstate vs;
1862         int error;
1863         bool whiteout, neg_promote;
1864         u_short nc_flag;
1865
1866         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1867
1868 #ifdef DEBUG_CACHE
1869         if (__predict_false(!doingcache)) {
1870                 cnp->cn_flags &= ~MAKEENTRY;
1871                 return (0);
1872         }
1873 #endif
1874
1875         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1876                 if (cnp->cn_namelen == 1)
1877                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1878                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1879                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1880         }
1881
1882         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1883
1884         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1885                 cache_remove_cnp(dvp, cnp);
1886                 return (0);
1887         }
1888
1889         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1890         vfs_smr_enter();
1891
1892         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1893                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1894                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1895                         break;
1896         }
1897
1898         if (__predict_false(ncp == NULL)) {
1899                 vfs_smr_exit();
1900                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1901                     NULL);
1902                 counter_u64_add(nummiss, 1);
1903                 return (0);
1904         }
1905
1906         nc_flag = atomic_load_char(&ncp->nc_flag);
1907         if (nc_flag & NCF_NEGATIVE)
1908                 goto negative_success;
1909
1910         counter_u64_add(numposhits, 1);
1911         *vpp = ncp->nc_vp;
1912         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1913         cache_out_ts(ncp, tsp, ticksp);
1914         MPASS(dvp != *vpp);
1915         if (!cache_ncp_canuse(ncp)) {
1916                 vfs_smr_exit();
1917                 *vpp = NULL;
1918                 goto out_fallback;
1919         }
1920         vs = vget_prep_smr(*vpp);
1921         vfs_smr_exit();
1922         if (__predict_false(vs == VGET_NONE)) {
1923                 *vpp = NULL;
1924                 goto out_fallback;
1925         }
1926         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1927         if (error) {
1928                 *vpp = NULL;
1929                 goto out_fallback;
1930         }
1931         return (-1);
1932 negative_success:
1933         if (cnp->cn_nameiop == CREATE) {
1934                 if (cnp->cn_flags & ISLASTCN) {
1935                         vfs_smr_exit();
1936                         goto out_fallback;
1937                 }
1938         }
1939
1940         cache_out_ts(ncp, tsp, ticksp);
1941         whiteout = (ncp->nc_flag & NCF_WHITE);
1942         neg_promote = cache_neg_hit_prep(ncp);
1943         if (!cache_ncp_canuse(ncp)) {
1944                 cache_neg_hit_abort(ncp);
1945                 vfs_smr_exit();
1946                 goto out_fallback;
1947         }
1948         if (neg_promote) {
1949                 vfs_smr_exit();
1950                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1951                         goto out_fallback;
1952         } else {
1953                 cache_neg_hit_finish(ncp);
1954                 vfs_smr_exit();
1955         }
1956         if (whiteout)
1957                 cnp->cn_flags |= ISWHITEOUT;
1958         return (ENOENT);
1959 out_fallback:
1960         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1961 }
1962
1963 struct celockstate {
1964         struct mtx *vlp[3];
1965         struct mtx *blp[2];
1966 };
1967 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1968 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1969
1970 static inline void
1971 cache_celockstate_init(struct celockstate *cel)
1972 {
1973
1974         bzero(cel, sizeof(*cel));
1975 }
1976
1977 static void
1978 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1979     struct vnode *dvp)
1980 {
1981         struct mtx *vlp1, *vlp2;
1982
1983         MPASS(cel->vlp[0] == NULL);
1984         MPASS(cel->vlp[1] == NULL);
1985         MPASS(cel->vlp[2] == NULL);
1986
1987         MPASS(vp != NULL || dvp != NULL);
1988
1989         vlp1 = VP2VNODELOCK(vp);
1990         vlp2 = VP2VNODELOCK(dvp);
1991         cache_sort_vnodes(&vlp1, &vlp2);
1992
1993         if (vlp1 != NULL) {
1994                 mtx_lock(vlp1);
1995                 cel->vlp[0] = vlp1;
1996         }
1997         mtx_lock(vlp2);
1998         cel->vlp[1] = vlp2;
1999 }
2000
2001 static void
2002 cache_unlock_vnodes_cel(struct celockstate *cel)
2003 {
2004
2005         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2006
2007         if (cel->vlp[0] != NULL)
2008                 mtx_unlock(cel->vlp[0]);
2009         if (cel->vlp[1] != NULL)
2010                 mtx_unlock(cel->vlp[1]);
2011         if (cel->vlp[2] != NULL)
2012                 mtx_unlock(cel->vlp[2]);
2013 }
2014
2015 static bool
2016 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2017 {
2018         struct mtx *vlp;
2019         bool ret;
2020
2021         cache_assert_vlp_locked(cel->vlp[0]);
2022         cache_assert_vlp_locked(cel->vlp[1]);
2023         MPASS(cel->vlp[2] == NULL);
2024
2025         MPASS(vp != NULL);
2026         vlp = VP2VNODELOCK(vp);
2027
2028         ret = true;
2029         if (vlp >= cel->vlp[1]) {
2030                 mtx_lock(vlp);
2031         } else {
2032                 if (mtx_trylock(vlp))
2033                         goto out;
2034                 cache_lock_vnodes_cel_3_failures++;
2035                 cache_unlock_vnodes_cel(cel);
2036                 if (vlp < cel->vlp[0]) {
2037                         mtx_lock(vlp);
2038                         mtx_lock(cel->vlp[0]);
2039                         mtx_lock(cel->vlp[1]);
2040                 } else {
2041                         if (cel->vlp[0] != NULL)
2042                                 mtx_lock(cel->vlp[0]);
2043                         mtx_lock(vlp);
2044                         mtx_lock(cel->vlp[1]);
2045                 }
2046                 ret = false;
2047         }
2048 out:
2049         cel->vlp[2] = vlp;
2050         return (ret);
2051 }
2052
2053 static void
2054 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2055     struct mtx *blp2)
2056 {
2057
2058         MPASS(cel->blp[0] == NULL);
2059         MPASS(cel->blp[1] == NULL);
2060
2061         cache_sort_vnodes(&blp1, &blp2);
2062
2063         if (blp1 != NULL) {
2064                 mtx_lock(blp1);
2065                 cel->blp[0] = blp1;
2066         }
2067         mtx_lock(blp2);
2068         cel->blp[1] = blp2;
2069 }
2070
2071 static void
2072 cache_unlock_buckets_cel(struct celockstate *cel)
2073 {
2074
2075         if (cel->blp[0] != NULL)
2076                 mtx_unlock(cel->blp[0]);
2077         mtx_unlock(cel->blp[1]);
2078 }
2079
2080 /*
2081  * Lock part of the cache affected by the insertion.
2082  *
2083  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2084  * However, insertion can result in removal of an old entry. In this
2085  * case we have an additional vnode and bucketlock pair to lock.
2086  *
2087  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2088  * preserving the locking order (smaller address first).
2089  */
2090 static void
2091 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2092     uint32_t hash)
2093 {
2094         struct namecache *ncp;
2095         struct mtx *blps[2];
2096
2097         blps[0] = HASH2BUCKETLOCK(hash);
2098         for (;;) {
2099                 blps[1] = NULL;
2100                 cache_lock_vnodes_cel(cel, dvp, vp);
2101                 if (vp == NULL || vp->v_type != VDIR)
2102                         break;
2103                 ncp = vp->v_cache_dd;
2104                 if (ncp == NULL)
2105                         break;
2106                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2107                         break;
2108                 MPASS(ncp->nc_dvp == vp);
2109                 blps[1] = NCP2BUCKETLOCK(ncp);
2110                 if (ncp->nc_flag & NCF_NEGATIVE)
2111                         break;
2112                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2113                         break;
2114                 /*
2115                  * All vnodes got re-locked. Re-validate the state and if
2116                  * nothing changed we are done. Otherwise restart.
2117                  */
2118                 if (ncp == vp->v_cache_dd &&
2119                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2120                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2121                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2122                         break;
2123                 cache_unlock_vnodes_cel(cel);
2124                 cel->vlp[0] = NULL;
2125                 cel->vlp[1] = NULL;
2126                 cel->vlp[2] = NULL;
2127         }
2128         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2129 }
2130
2131 static void
2132 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2133     uint32_t hash)
2134 {
2135         struct namecache *ncp;
2136         struct mtx *blps[2];
2137
2138         blps[0] = HASH2BUCKETLOCK(hash);
2139         for (;;) {
2140                 blps[1] = NULL;
2141                 cache_lock_vnodes_cel(cel, dvp, vp);
2142                 ncp = dvp->v_cache_dd;
2143                 if (ncp == NULL)
2144                         break;
2145                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2146                         break;
2147                 MPASS(ncp->nc_dvp == dvp);
2148                 blps[1] = NCP2BUCKETLOCK(ncp);
2149                 if (ncp->nc_flag & NCF_NEGATIVE)
2150                         break;
2151                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2152                         break;
2153                 if (ncp == dvp->v_cache_dd &&
2154                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2155                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2156                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2157                         break;
2158                 cache_unlock_vnodes_cel(cel);
2159                 cel->vlp[0] = NULL;
2160                 cel->vlp[1] = NULL;
2161                 cel->vlp[2] = NULL;
2162         }
2163         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2164 }
2165
2166 static void
2167 cache_enter_unlock(struct celockstate *cel)
2168 {
2169
2170         cache_unlock_buckets_cel(cel);
2171         cache_unlock_vnodes_cel(cel);
2172 }
2173
2174 static void __noinline
2175 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2176     struct componentname *cnp)
2177 {
2178         struct celockstate cel;
2179         struct namecache *ncp;
2180         uint32_t hash;
2181         int len;
2182
2183         if (dvp->v_cache_dd == NULL)
2184                 return;
2185         len = cnp->cn_namelen;
2186         cache_celockstate_init(&cel);
2187         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2188         cache_enter_lock_dd(&cel, dvp, vp, hash);
2189         vn_seqc_write_begin(dvp);
2190         ncp = dvp->v_cache_dd;
2191         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2192                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2193                 cache_zap_locked(ncp);
2194         } else {
2195                 ncp = NULL;
2196         }
2197         dvp->v_cache_dd = NULL;
2198         vn_seqc_write_end(dvp);
2199         cache_enter_unlock(&cel);
2200         if (ncp != NULL)
2201                 cache_free(ncp);
2202 }
2203
2204 /*
2205  * Add an entry to the cache.
2206  */
2207 void
2208 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2209     struct timespec *tsp, struct timespec *dtsp)
2210 {
2211         struct celockstate cel;
2212         struct namecache *ncp, *n2, *ndd;
2213         struct namecache_ts *ncp_ts;
2214         struct nchashhead *ncpp;
2215         uint32_t hash;
2216         int flag;
2217         int len;
2218
2219         VNPASS(dvp != vp, dvp);
2220         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2221         VNPASS(dvp->v_type != VNON, dvp);
2222         if (vp != NULL) {
2223                 VNPASS(!VN_IS_DOOMED(vp), vp);
2224                 VNPASS(vp->v_type != VNON, vp);
2225         }
2226
2227 #ifdef DEBUG_CACHE
2228         if (__predict_false(!doingcache))
2229                 return;
2230 #endif
2231
2232         flag = 0;
2233         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2234                 if (cnp->cn_namelen == 1)
2235                         return;
2236                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2237                         cache_enter_dotdot_prep(dvp, vp, cnp);
2238                         flag = NCF_ISDOTDOT;
2239                 }
2240         }
2241
2242         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2243         if (ncp == NULL)
2244                 return;
2245
2246         cache_celockstate_init(&cel);
2247         ndd = NULL;
2248         ncp_ts = NULL;
2249
2250         /*
2251          * Calculate the hash key and setup as much of the new
2252          * namecache entry as possible before acquiring the lock.
2253          */
2254         ncp->nc_flag = flag | NCF_WIP;
2255         ncp->nc_vp = vp;
2256         if (vp == NULL)
2257                 cache_neg_init(ncp);
2258         ncp->nc_dvp = dvp;
2259         if (tsp != NULL) {
2260                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2261                 ncp_ts->nc_time = *tsp;
2262                 ncp_ts->nc_ticks = ticks;
2263                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2264                 if (dtsp != NULL) {
2265                         ncp_ts->nc_dotdottime = *dtsp;
2266                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2267                 }
2268         }
2269         len = ncp->nc_nlen = cnp->cn_namelen;
2270         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2271         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2272         ncp->nc_name[len] = '\0';
2273         cache_enter_lock(&cel, dvp, vp, hash);
2274
2275         /*
2276          * See if this vnode or negative entry is already in the cache
2277          * with this name.  This can happen with concurrent lookups of
2278          * the same path name.
2279          */
2280         ncpp = NCHHASH(hash);
2281         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2282                 if (n2->nc_dvp == dvp &&
2283                     n2->nc_nlen == cnp->cn_namelen &&
2284                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2285                         MPASS(cache_ncp_canuse(n2));
2286                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2287                                 KASSERT(vp == NULL,
2288                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2289                                     __func__, NULL, vp));
2290                         else
2291                                 KASSERT(n2->nc_vp == vp,
2292                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2293                                     __func__, n2->nc_vp, vp));
2294                         /*
2295                          * Entries are supposed to be immutable unless in the
2296                          * process of getting destroyed. Accommodating for
2297                          * changing timestamps is possible but not worth it.
2298                          * This should be harmless in terms of correctness, in
2299                          * the worst case resulting in an earlier expiration.
2300                          * Alternatively, the found entry can be replaced
2301                          * altogether.
2302                          */
2303                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2304 #if 0
2305                         if (tsp != NULL) {
2306                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2307                                     ("no NCF_TS"));
2308                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2309                                 n2_ts->nc_time = ncp_ts->nc_time;
2310                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2311                                 if (dtsp != NULL) {
2312                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2313                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2314                                 }
2315                         }
2316 #endif
2317                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2318                             vp);
2319                         goto out_unlock_free;
2320                 }
2321         }
2322
2323         if (flag == NCF_ISDOTDOT) {
2324                 /*
2325                  * See if we are trying to add .. entry, but some other lookup
2326                  * has populated v_cache_dd pointer already.
2327                  */
2328                 if (dvp->v_cache_dd != NULL)
2329                         goto out_unlock_free;
2330                 KASSERT(vp == NULL || vp->v_type == VDIR,
2331                     ("wrong vnode type %p", vp));
2332                 vn_seqc_write_begin(dvp);
2333                 dvp->v_cache_dd = ncp;
2334                 vn_seqc_write_end(dvp);
2335         }
2336
2337         if (vp != NULL) {
2338                 if (flag != NCF_ISDOTDOT) {
2339                         /*
2340                          * For this case, the cache entry maps both the
2341                          * directory name in it and the name ".." for the
2342                          * directory's parent.
2343                          */
2344                         vn_seqc_write_begin(vp);
2345                         if ((ndd = vp->v_cache_dd) != NULL) {
2346                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2347                                         cache_zap_locked(ndd);
2348                                 else
2349                                         ndd = NULL;
2350                         }
2351                         vp->v_cache_dd = ncp;
2352                         vn_seqc_write_end(vp);
2353                 } else if (vp->v_type != VDIR) {
2354                         if (vp->v_cache_dd != NULL) {
2355                                 vn_seqc_write_begin(vp);
2356                                 vp->v_cache_dd = NULL;
2357                                 vn_seqc_write_end(vp);
2358                         }
2359                 }
2360         }
2361
2362         if (flag != NCF_ISDOTDOT) {
2363                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2364                         cache_hold_vnode(dvp);
2365                 }
2366                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2367         }
2368
2369         /*
2370          * If the entry is "negative", we place it into the
2371          * "negative" cache queue, otherwise, we place it into the
2372          * destination vnode's cache entries queue.
2373          */
2374         if (vp != NULL) {
2375                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2376                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2377                     vp);
2378         } else {
2379                 if (cnp->cn_flags & ISWHITEOUT)
2380                         ncp->nc_flag |= NCF_WHITE;
2381                 cache_neg_insert(ncp);
2382                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2383                     ncp->nc_name);
2384         }
2385
2386         /*
2387          * Insert the new namecache entry into the appropriate chain
2388          * within the cache entries table.
2389          */
2390         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2391
2392         atomic_thread_fence_rel();
2393         /*
2394          * Mark the entry as fully constructed.
2395          * It is immutable past this point until its removal.
2396          */
2397         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2398
2399         cache_enter_unlock(&cel);
2400         if (ndd != NULL)
2401                 cache_free(ndd);
2402         return;
2403 out_unlock_free:
2404         cache_enter_unlock(&cel);
2405         cache_free(ncp);
2406         return;
2407 }
2408
2409 static u_int
2410 cache_roundup_2(u_int val)
2411 {
2412         u_int res;
2413
2414         for (res = 1; res <= val; res <<= 1)
2415                 continue;
2416
2417         return (res);
2418 }
2419
2420 static struct nchashhead *
2421 nchinittbl(u_long elements, u_long *hashmask)
2422 {
2423         struct nchashhead *hashtbl;
2424         u_long hashsize, i;
2425
2426         hashsize = cache_roundup_2(elements) / 2;
2427
2428         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2429         for (i = 0; i < hashsize; i++)
2430                 CK_SLIST_INIT(&hashtbl[i]);
2431         *hashmask = hashsize - 1;
2432         return (hashtbl);
2433 }
2434
2435 static void
2436 ncfreetbl(struct nchashhead *hashtbl)
2437 {
2438
2439         free(hashtbl, M_VFSCACHE);
2440 }
2441
2442 /*
2443  * Name cache initialization, from vfs_init() when we are booting
2444  */
2445 static void
2446 nchinit(void *dummy __unused)
2447 {
2448         u_int i;
2449
2450         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2451             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2452         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2453             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2454         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2455             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2456         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2457             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2458
2459         VFS_SMR_ZONE_SET(cache_zone_small);
2460         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2461         VFS_SMR_ZONE_SET(cache_zone_large);
2462         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2463
2464         ncsize = desiredvnodes * ncsizefactor;
2465         cache_recalc_neg_min(ncnegminpct);
2466         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2467         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2468         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2469                 ncbuckethash = 7;
2470         if (ncbuckethash > nchash)
2471                 ncbuckethash = nchash;
2472         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2473             M_WAITOK | M_ZERO);
2474         for (i = 0; i < numbucketlocks; i++)
2475                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2476         ncvnodehash = ncbuckethash;
2477         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2478             M_WAITOK | M_ZERO);
2479         for (i = 0; i < numvnodelocks; i++)
2480                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2481
2482         for (i = 0; i < numneglists; i++) {
2483                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2484                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2485                 TAILQ_INIT(&neglists[i].nl_list);
2486                 TAILQ_INIT(&neglists[i].nl_hotlist);
2487         }
2488 }
2489 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2490
2491 void
2492 cache_vnode_init(struct vnode *vp)
2493 {
2494
2495         LIST_INIT(&vp->v_cache_src);
2496         TAILQ_INIT(&vp->v_cache_dst);
2497         vp->v_cache_dd = NULL;
2498         cache_prehash(vp);
2499 }
2500
2501 void
2502 cache_changesize(u_long newmaxvnodes)
2503 {
2504         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2505         u_long new_nchash, old_nchash;
2506         struct namecache *ncp;
2507         uint32_t hash;
2508         u_long newncsize;
2509         int i;
2510
2511         newncsize = newmaxvnodes * ncsizefactor;
2512         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2513         if (newmaxvnodes < numbucketlocks)
2514                 newmaxvnodes = numbucketlocks;
2515
2516         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2517         /* If same hash table size, nothing to do */
2518         if (nchash == new_nchash) {
2519                 ncfreetbl(new_nchashtbl);
2520                 return;
2521         }
2522         /*
2523          * Move everything from the old hash table to the new table.
2524          * None of the namecache entries in the table can be removed
2525          * because to do so, they have to be removed from the hash table.
2526          */
2527         cache_lock_all_vnodes();
2528         cache_lock_all_buckets();
2529         old_nchashtbl = nchashtbl;
2530         old_nchash = nchash;
2531         nchashtbl = new_nchashtbl;
2532         nchash = new_nchash;
2533         for (i = 0; i <= old_nchash; i++) {
2534                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2535                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2536                             ncp->nc_dvp);
2537                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2538                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2539                 }
2540         }
2541         ncsize = newncsize;
2542         cache_recalc_neg_min(ncnegminpct);
2543         cache_unlock_all_buckets();
2544         cache_unlock_all_vnodes();
2545         ncfreetbl(old_nchashtbl);
2546 }
2547
2548 /*
2549  * Invalidate all entries from and to a particular vnode.
2550  */
2551 static void
2552 cache_purge_impl(struct vnode *vp)
2553 {
2554         struct cache_freebatch batch;
2555         struct namecache *ncp;
2556         struct mtx *vlp, *vlp2;
2557
2558         TAILQ_INIT(&batch);
2559         vlp = VP2VNODELOCK(vp);
2560         vlp2 = NULL;
2561         mtx_lock(vlp);
2562 retry:
2563         while (!LIST_EMPTY(&vp->v_cache_src)) {
2564                 ncp = LIST_FIRST(&vp->v_cache_src);
2565                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2566                         goto retry;
2567                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2568         }
2569         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2570                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2571                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2572                         goto retry;
2573                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2574         }
2575         ncp = vp->v_cache_dd;
2576         if (ncp != NULL) {
2577                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2578                    ("lost dotdot link"));
2579                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2580                         goto retry;
2581                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2582         }
2583         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2584         mtx_unlock(vlp);
2585         if (vlp2 != NULL)
2586                 mtx_unlock(vlp2);
2587         cache_free_batch(&batch);
2588 }
2589
2590 /*
2591  * Opportunistic check to see if there is anything to do.
2592  */
2593 static bool
2594 cache_has_entries(struct vnode *vp)
2595 {
2596
2597         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2598             vp->v_cache_dd == NULL)
2599                 return (false);
2600         return (true);
2601 }
2602
2603 void
2604 cache_purge(struct vnode *vp)
2605 {
2606
2607         SDT_PROBE1(vfs, namecache, purge, done, vp);
2608         if (!cache_has_entries(vp))
2609                 return;
2610         cache_purge_impl(vp);
2611 }
2612
2613 /*
2614  * Only to be used by vgone.
2615  */
2616 void
2617 cache_purge_vgone(struct vnode *vp)
2618 {
2619         struct mtx *vlp;
2620
2621         VNPASS(VN_IS_DOOMED(vp), vp);
2622         if (cache_has_entries(vp)) {
2623                 cache_purge_impl(vp);
2624                 return;
2625         }
2626
2627         /*
2628          * Serialize against a potential thread doing cache_purge.
2629          */
2630         vlp = VP2VNODELOCK(vp);
2631         mtx_wait_unlocked(vlp);
2632         if (cache_has_entries(vp)) {
2633                 cache_purge_impl(vp);
2634                 return;
2635         }
2636         return;
2637 }
2638
2639 /*
2640  * Invalidate all negative entries for a particular directory vnode.
2641  */
2642 void
2643 cache_purge_negative(struct vnode *vp)
2644 {
2645         struct cache_freebatch batch;
2646         struct namecache *ncp, *nnp;
2647         struct mtx *vlp;
2648
2649         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2650         if (LIST_EMPTY(&vp->v_cache_src))
2651                 return;
2652         TAILQ_INIT(&batch);
2653         vlp = VP2VNODELOCK(vp);
2654         mtx_lock(vlp);
2655         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2656                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2657                         continue;
2658                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2659                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2660         }
2661         mtx_unlock(vlp);
2662         cache_free_batch(&batch);
2663 }
2664
2665 /*
2666  * Entry points for modifying VOP operations.
2667  */
2668 void
2669 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2670     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2671 {
2672
2673         ASSERT_VOP_IN_SEQC(fdvp);
2674         ASSERT_VOP_IN_SEQC(fvp);
2675         ASSERT_VOP_IN_SEQC(tdvp);
2676         if (tvp != NULL)
2677                 ASSERT_VOP_IN_SEQC(tvp);
2678
2679         cache_purge(fvp);
2680         if (tvp != NULL) {
2681                 cache_purge(tvp);
2682                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2683                     ("%s: lingering negative entry", __func__));
2684         } else {
2685                 cache_remove_cnp(tdvp, tcnp);
2686         }
2687 }
2688
2689 void
2690 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
2691 {
2692
2693         ASSERT_VOP_IN_SEQC(dvp);
2694         ASSERT_VOP_IN_SEQC(vp);
2695         cache_purge(vp);
2696 }
2697
2698 #ifdef INVARIANTS
2699 /*
2700  * Validate that if an entry exists it matches.
2701  */
2702 void
2703 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2704 {
2705         struct namecache *ncp;
2706         struct mtx *blp;
2707         uint32_t hash;
2708
2709         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2710         if (CK_SLIST_EMPTY(NCHHASH(hash)))
2711                 return;
2712         blp = HASH2BUCKETLOCK(hash);
2713         mtx_lock(blp);
2714         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2715                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2716                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2717                         if (ncp->nc_vp != vp)
2718                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
2719                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
2720                                     ncp->nc_vp);
2721                 }
2722         }
2723         mtx_unlock(blp);
2724 }
2725 #endif
2726
2727 /*
2728  * Flush all entries referencing a particular filesystem.
2729  */
2730 void
2731 cache_purgevfs(struct mount *mp)
2732 {
2733         struct vnode *vp, *mvp;
2734
2735         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2736         /*
2737          * Somewhat wasteful iteration over all vnodes. Would be better to
2738          * support filtering and avoid the interlock to begin with.
2739          */
2740         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2741                 if (!cache_has_entries(vp)) {
2742                         VI_UNLOCK(vp);
2743                         continue;
2744                 }
2745                 vholdl(vp);
2746                 VI_UNLOCK(vp);
2747                 cache_purge(vp);
2748                 vdrop(vp);
2749         }
2750 }
2751
2752 /*
2753  * Perform canonical checks and cache lookup and pass on to filesystem
2754  * through the vop_cachedlookup only if needed.
2755  */
2756
2757 int
2758 vfs_cache_lookup(struct vop_lookup_args *ap)
2759 {
2760         struct vnode *dvp;
2761         int error;
2762         struct vnode **vpp = ap->a_vpp;
2763         struct componentname *cnp = ap->a_cnp;
2764         int flags = cnp->cn_flags;
2765
2766         *vpp = NULL;
2767         dvp = ap->a_dvp;
2768
2769         if (dvp->v_type != VDIR)
2770                 return (ENOTDIR);
2771
2772         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2773             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2774                 return (EROFS);
2775
2776         error = vn_dir_check_exec(dvp, cnp);
2777         if (error != 0)
2778                 return (error);
2779
2780         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2781         if (error == 0)
2782                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2783         if (error == -1)
2784                 return (0);
2785         return (error);
2786 }
2787
2788 /* Implementation of the getcwd syscall. */
2789 int
2790 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2791 {
2792         char *buf, *retbuf;
2793         size_t buflen;
2794         int error;
2795
2796         buflen = uap->buflen;
2797         if (__predict_false(buflen < 2))
2798                 return (EINVAL);
2799         if (buflen > MAXPATHLEN)
2800                 buflen = MAXPATHLEN;
2801
2802         buf = uma_zalloc(namei_zone, M_WAITOK);
2803         error = vn_getcwd(buf, &retbuf, &buflen);
2804         if (error == 0)
2805                 error = copyout(retbuf, uap->buf, buflen);
2806         uma_zfree(namei_zone, buf);
2807         return (error);
2808 }
2809
2810 int
2811 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2812 {
2813         struct pwd *pwd;
2814         int error;
2815
2816         vfs_smr_enter();
2817         pwd = pwd_get_smr();
2818         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2819             buflen, 0);
2820         VFS_SMR_ASSERT_NOT_ENTERED();
2821         if (error < 0) {
2822                 pwd = pwd_hold(curthread);
2823                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2824                     retbuf, buflen);
2825                 pwd_drop(pwd);
2826         }
2827
2828 #ifdef KTRACE
2829         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2830                 ktrnamei(*retbuf);
2831 #endif
2832         return (error);
2833 }
2834
2835 static int
2836 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2837     size_t size, int flags, enum uio_seg pathseg)
2838 {
2839         struct nameidata nd;
2840         char *retbuf, *freebuf;
2841         int error;
2842
2843         if (flags != 0)
2844                 return (EINVAL);
2845         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2846             pathseg, path, fd, &cap_fstat_rights, td);
2847         if ((error = namei(&nd)) != 0)
2848                 return (error);
2849         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2850         if (error == 0) {
2851                 error = copyout(retbuf, buf, size);
2852                 free(freebuf, M_TEMP);
2853         }
2854         NDFREE(&nd, 0);
2855         return (error);
2856 }
2857
2858 int
2859 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2860 {
2861
2862         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2863             uap->flags, UIO_USERSPACE));
2864 }
2865
2866 /*
2867  * Retrieve the full filesystem path that correspond to a vnode from the name
2868  * cache (if available)
2869  */
2870 int
2871 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2872 {
2873         struct pwd *pwd;
2874         char *buf;
2875         size_t buflen;
2876         int error;
2877
2878         if (__predict_false(vp == NULL))
2879                 return (EINVAL);
2880
2881         buflen = MAXPATHLEN;
2882         buf = malloc(buflen, M_TEMP, M_WAITOK);
2883         vfs_smr_enter();
2884         pwd = pwd_get_smr();
2885         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2886         VFS_SMR_ASSERT_NOT_ENTERED();
2887         if (error < 0) {
2888                 pwd = pwd_hold(curthread);
2889                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2890                 pwd_drop(pwd);
2891         }
2892         if (error == 0)
2893                 *freebuf = buf;
2894         else
2895                 free(buf, M_TEMP);
2896         return (error);
2897 }
2898
2899 /*
2900  * This function is similar to vn_fullpath, but it attempts to lookup the
2901  * pathname relative to the global root mount point.  This is required for the
2902  * auditing sub-system, as audited pathnames must be absolute, relative to the
2903  * global root mount point.
2904  */
2905 int
2906 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2907 {
2908         char *buf;
2909         size_t buflen;
2910         int error;
2911
2912         if (__predict_false(vp == NULL))
2913                 return (EINVAL);
2914         buflen = MAXPATHLEN;
2915         buf = malloc(buflen, M_TEMP, M_WAITOK);
2916         vfs_smr_enter();
2917         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2918         VFS_SMR_ASSERT_NOT_ENTERED();
2919         if (error < 0) {
2920                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2921         }
2922         if (error == 0)
2923                 *freebuf = buf;
2924         else
2925                 free(buf, M_TEMP);
2926         return (error);
2927 }
2928
2929 static struct namecache *
2930 vn_dd_from_dst(struct vnode *vp)
2931 {
2932         struct namecache *ncp;
2933
2934         cache_assert_vnode_locked(vp);
2935         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2936                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2937                         return (ncp);
2938         }
2939         return (NULL);
2940 }
2941
2942 int
2943 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2944 {
2945         struct vnode *dvp;
2946         struct namecache *ncp;
2947         struct mtx *vlp;
2948         int error;
2949
2950         vlp = VP2VNODELOCK(*vp);
2951         mtx_lock(vlp);
2952         ncp = (*vp)->v_cache_dd;
2953         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2954                 KASSERT(ncp == vn_dd_from_dst(*vp),
2955                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2956                     ncp, vn_dd_from_dst(*vp)));
2957         } else {
2958                 ncp = vn_dd_from_dst(*vp);
2959         }
2960         if (ncp != NULL) {
2961                 if (*buflen < ncp->nc_nlen) {
2962                         mtx_unlock(vlp);
2963                         vrele(*vp);
2964                         counter_u64_add(numfullpathfail4, 1);
2965                         error = ENOMEM;
2966                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2967                             vp, NULL);
2968                         return (error);
2969                 }
2970                 *buflen -= ncp->nc_nlen;
2971                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2972                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2973                     ncp->nc_name, vp);
2974                 dvp = *vp;
2975                 *vp = ncp->nc_dvp;
2976                 vref(*vp);
2977                 mtx_unlock(vlp);
2978                 vrele(dvp);
2979                 return (0);
2980         }
2981         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2982
2983         mtx_unlock(vlp);
2984         vn_lock(*vp, LK_SHARED | LK_RETRY);
2985         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2986         vput(*vp);
2987         if (error) {
2988                 counter_u64_add(numfullpathfail2, 1);
2989                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2990                 return (error);
2991         }
2992
2993         *vp = dvp;
2994         if (VN_IS_DOOMED(dvp)) {
2995                 /* forced unmount */
2996                 vrele(dvp);
2997                 error = ENOENT;
2998                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2999                 return (error);
3000         }
3001         /*
3002          * *vp has its use count incremented still.
3003          */
3004
3005         return (0);
3006 }
3007
3008 /*
3009  * Resolve a directory to a pathname.
3010  *
3011  * The name of the directory can always be found in the namecache or fetched
3012  * from the filesystem. There is also guaranteed to be only one parent, meaning
3013  * we can just follow vnodes up until we find the root.
3014  *
3015  * The vnode must be referenced.
3016  */
3017 static int
3018 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3019     size_t *len, size_t addend)
3020 {
3021 #ifdef KDTRACE_HOOKS
3022         struct vnode *startvp = vp;
3023 #endif
3024         struct vnode *vp1;
3025         size_t buflen;
3026         int error;
3027         bool slash_prefixed;
3028
3029         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
3030         VNPASS(vp->v_usecount > 0, vp);
3031
3032         buflen = *len;
3033
3034         slash_prefixed = true;
3035         if (addend == 0) {
3036                 MPASS(*len >= 2);
3037                 buflen--;
3038                 buf[buflen] = '\0';
3039                 slash_prefixed = false;
3040         }
3041
3042         error = 0;
3043
3044         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
3045         counter_u64_add(numfullpathcalls, 1);
3046         while (vp != rdir && vp != rootvnode) {
3047                 /*
3048                  * The vp vnode must be already fully constructed,
3049                  * since it is either found in namecache or obtained
3050                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
3051                  * without obtaining the vnode lock.
3052                  */
3053                 if ((vp->v_vflag & VV_ROOT) != 0) {
3054                         vn_lock(vp, LK_RETRY | LK_SHARED);
3055
3056                         /*
3057                          * With the vnode locked, check for races with
3058                          * unmount, forced or not.  Note that we
3059                          * already verified that vp is not equal to
3060                          * the root vnode, which means that
3061                          * mnt_vnodecovered can be NULL only for the
3062                          * case of unmount.
3063                          */
3064                         if (VN_IS_DOOMED(vp) ||
3065                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
3066                             vp1->v_mountedhere != vp->v_mount) {
3067                                 vput(vp);
3068                                 error = ENOENT;
3069                                 SDT_PROBE3(vfs, namecache, fullpath, return,
3070                                     error, vp, NULL);
3071                                 break;
3072                         }
3073
3074                         vref(vp1);
3075                         vput(vp);
3076                         vp = vp1;
3077                         continue;
3078                 }
3079                 if (vp->v_type != VDIR) {
3080                         vrele(vp);
3081                         counter_u64_add(numfullpathfail1, 1);
3082                         error = ENOTDIR;
3083                         SDT_PROBE3(vfs, namecache, fullpath, return,
3084                             error, vp, NULL);
3085                         break;
3086                 }
3087                 error = vn_vptocnp(&vp, buf, &buflen);
3088                 if (error)
3089                         break;
3090                 if (buflen == 0) {
3091                         vrele(vp);
3092                         error = ENOMEM;
3093                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3094                             startvp, NULL);
3095                         break;
3096                 }
3097                 buf[--buflen] = '/';
3098                 slash_prefixed = true;
3099         }
3100         if (error)
3101                 return (error);
3102         if (!slash_prefixed) {
3103                 if (buflen == 0) {
3104                         vrele(vp);
3105                         counter_u64_add(numfullpathfail4, 1);
3106                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3107                             startvp, NULL);
3108                         return (ENOMEM);
3109                 }
3110                 buf[--buflen] = '/';
3111         }
3112         counter_u64_add(numfullpathfound, 1);
3113         vrele(vp);
3114
3115         *retbuf = buf + buflen;
3116         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3117         *len -= buflen;
3118         *len += addend;
3119         return (0);
3120 }
3121
3122 /*
3123  * Resolve an arbitrary vnode to a pathname.
3124  *
3125  * Note 2 caveats:
3126  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3127  *   resolve to a different path than the one used to find it
3128  * - namecache is not mandatory, meaning names are not guaranteed to be added
3129  *   (in which case resolving fails)
3130  */
3131 static void __inline
3132 cache_rev_failed_impl(int *reason, int line)
3133 {
3134
3135         *reason = line;
3136 }
3137 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3138
3139 static int
3140 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3141     char **retbuf, size_t *buflen, size_t addend)
3142 {
3143 #ifdef KDTRACE_HOOKS
3144         struct vnode *startvp = vp;
3145 #endif
3146         struct vnode *tvp;
3147         struct mount *mp;
3148         struct namecache *ncp;
3149         size_t orig_buflen;
3150         int reason;
3151         int error;
3152 #ifdef KDTRACE_HOOKS
3153         int i;
3154 #endif
3155         seqc_t vp_seqc, tvp_seqc;
3156         u_char nc_flag;
3157
3158         VFS_SMR_ASSERT_ENTERED();
3159
3160         if (!cache_fast_revlookup) {
3161                 vfs_smr_exit();
3162                 return (-1);
3163         }
3164
3165         orig_buflen = *buflen;
3166
3167         if (addend == 0) {
3168                 MPASS(*buflen >= 2);
3169                 *buflen -= 1;
3170                 buf[*buflen] = '\0';
3171         }
3172
3173         if (vp == rdir || vp == rootvnode) {
3174                 if (addend == 0) {
3175                         *buflen -= 1;
3176                         buf[*buflen] = '/';
3177                 }
3178                 goto out_ok;
3179         }
3180
3181 #ifdef KDTRACE_HOOKS
3182         i = 0;
3183 #endif
3184         error = -1;
3185         ncp = NULL; /* for sdt probe down below */
3186         vp_seqc = vn_seqc_read_any(vp);
3187         if (seqc_in_modify(vp_seqc)) {
3188                 cache_rev_failed(&reason);
3189                 goto out_abort;
3190         }
3191
3192         for (;;) {
3193 #ifdef KDTRACE_HOOKS
3194                 i++;
3195 #endif
3196                 if ((vp->v_vflag & VV_ROOT) != 0) {
3197                         mp = atomic_load_ptr(&vp->v_mount);
3198                         if (mp == NULL) {
3199                                 cache_rev_failed(&reason);
3200                                 goto out_abort;
3201                         }
3202                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3203                         tvp_seqc = vn_seqc_read_any(tvp);
3204                         if (seqc_in_modify(tvp_seqc)) {
3205                                 cache_rev_failed(&reason);
3206                                 goto out_abort;
3207                         }
3208                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3209                                 cache_rev_failed(&reason);
3210                                 goto out_abort;
3211                         }
3212                         vp = tvp;
3213                         vp_seqc = tvp_seqc;
3214                         continue;
3215                 }
3216                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3217                 if (ncp == NULL) {
3218                         cache_rev_failed(&reason);
3219                         goto out_abort;
3220                 }
3221                 nc_flag = atomic_load_char(&ncp->nc_flag);
3222                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3223                         cache_rev_failed(&reason);
3224                         goto out_abort;
3225                 }
3226                 if (!cache_ncp_canuse(ncp)) {
3227                         cache_rev_failed(&reason);
3228                         goto out_abort;
3229                 }
3230                 if (ncp->nc_nlen >= *buflen) {
3231                         cache_rev_failed(&reason);
3232                         error = ENOMEM;
3233                         goto out_abort;
3234                 }
3235                 *buflen -= ncp->nc_nlen;
3236                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3237                 *buflen -= 1;
3238                 buf[*buflen] = '/';
3239                 tvp = ncp->nc_dvp;
3240                 tvp_seqc = vn_seqc_read_any(tvp);
3241                 if (seqc_in_modify(tvp_seqc)) {
3242                         cache_rev_failed(&reason);
3243                         goto out_abort;
3244                 }
3245                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3246                         cache_rev_failed(&reason);
3247                         goto out_abort;
3248                 }
3249                 vp = tvp;
3250                 vp_seqc = tvp_seqc;
3251                 if (vp == rdir || vp == rootvnode)
3252                         break;
3253         }
3254 out_ok:
3255         vfs_smr_exit();
3256         *retbuf = buf + *buflen;
3257         *buflen = orig_buflen - *buflen + addend;
3258         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3259         return (0);
3260
3261 out_abort:
3262         *buflen = orig_buflen;
3263         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3264         vfs_smr_exit();
3265         return (error);
3266 }
3267
3268 static int
3269 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3270     size_t *buflen)
3271 {
3272         size_t orig_buflen, addend;
3273         int error;
3274
3275         if (*buflen < 2)
3276                 return (EINVAL);
3277
3278         orig_buflen = *buflen;
3279
3280         vref(vp);
3281         addend = 0;
3282         if (vp->v_type != VDIR) {
3283                 *buflen -= 1;
3284                 buf[*buflen] = '\0';
3285                 error = vn_vptocnp(&vp, buf, buflen);
3286                 if (error)
3287                         return (error);
3288                 if (*buflen == 0) {
3289                         vrele(vp);
3290                         return (ENOMEM);
3291                 }
3292                 *buflen -= 1;
3293                 buf[*buflen] = '/';
3294                 addend = orig_buflen - *buflen;
3295         }
3296
3297         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3298 }
3299
3300 /*
3301  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3302  *
3303  * Since the namecache does not track hardlinks, the caller is expected to first
3304  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3305  *
3306  * Then we have 2 cases:
3307  * - if the found vnode is a directory, the path can be constructed just by
3308  *   following names up the chain
3309  * - otherwise we populate the buffer with the saved name and start resolving
3310  *   from the parent
3311  */
3312 static int
3313 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3314     size_t *buflen)
3315 {
3316         char *buf, *tmpbuf;
3317         struct pwd *pwd;
3318         struct componentname *cnp;
3319         struct vnode *vp;
3320         size_t addend;
3321         int error;
3322         enum vtype type;
3323
3324         if (*buflen < 2)
3325                 return (EINVAL);
3326         if (*buflen > MAXPATHLEN)
3327                 *buflen = MAXPATHLEN;
3328
3329         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3330
3331         addend = 0;
3332         vp = ndp->ni_vp;
3333         /*
3334          * Check for VBAD to work around the vp_crossmp bug in lookup().
3335          *
3336          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3337          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3338          * If the type is VDIR (like in this very case) we can skip looking
3339          * at ni_dvp in the first place. However, since vnodes get passed here
3340          * unlocked the target may transition to doomed state (type == VBAD)
3341          * before we get to evaluate the condition. If this happens, we will
3342          * populate part of the buffer and descend to vn_fullpath_dir with
3343          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3344          *
3345          * This should be atomic_load(&vp->v_type) but it is illegal to take
3346          * an address of a bit field, even if said field is sized to char.
3347          * Work around the problem by reading the value into a full-sized enum
3348          * and then re-reading it with atomic_load which will still prevent
3349          * the compiler from re-reading down the road.
3350          */
3351         type = vp->v_type;
3352         type = atomic_load_int(&type);
3353         if (type == VBAD) {
3354                 error = ENOENT;
3355                 goto out_bad;
3356         }
3357         if (type != VDIR) {
3358                 cnp = &ndp->ni_cnd;
3359                 addend = cnp->cn_namelen + 2;
3360                 if (*buflen < addend) {
3361                         error = ENOMEM;
3362                         goto out_bad;
3363                 }
3364                 *buflen -= addend;
3365                 tmpbuf = buf + *buflen;
3366                 tmpbuf[0] = '/';
3367                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3368                 tmpbuf[addend - 1] = '\0';
3369                 vp = ndp->ni_dvp;
3370         }
3371
3372         vfs_smr_enter();
3373         pwd = pwd_get_smr();
3374         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3375             addend);
3376         VFS_SMR_ASSERT_NOT_ENTERED();
3377         if (error < 0) {
3378                 pwd = pwd_hold(curthread);
3379                 vref(vp);
3380                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3381                     addend);
3382                 pwd_drop(pwd);
3383                 if (error != 0)
3384                         goto out_bad;
3385         }
3386
3387         *freebuf = buf;
3388
3389         return (0);
3390 out_bad:
3391         free(buf, M_TEMP);
3392         return (error);
3393 }
3394
3395 struct vnode *
3396 vn_dir_dd_ino(struct vnode *vp)
3397 {
3398         struct namecache *ncp;
3399         struct vnode *ddvp;
3400         struct mtx *vlp;
3401         enum vgetstate vs;
3402
3403         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3404         vlp = VP2VNODELOCK(vp);
3405         mtx_lock(vlp);
3406         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3407                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3408                         continue;
3409                 ddvp = ncp->nc_dvp;
3410                 vs = vget_prep(ddvp);
3411                 mtx_unlock(vlp);
3412                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3413                         return (NULL);
3414                 return (ddvp);
3415         }
3416         mtx_unlock(vlp);
3417         return (NULL);
3418 }
3419
3420 int
3421 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3422 {
3423         struct namecache *ncp;
3424         struct mtx *vlp;
3425         int l;
3426
3427         vlp = VP2VNODELOCK(vp);
3428         mtx_lock(vlp);
3429         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3430                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3431                         break;
3432         if (ncp == NULL) {
3433                 mtx_unlock(vlp);
3434                 return (ENOENT);
3435         }
3436         l = min(ncp->nc_nlen, buflen - 1);
3437         memcpy(buf, ncp->nc_name, l);
3438         mtx_unlock(vlp);
3439         buf[l] = '\0';
3440         return (0);
3441 }
3442
3443 /*
3444  * This function updates path string to vnode's full global path
3445  * and checks the size of the new path string against the pathlen argument.
3446  *
3447  * Requires a locked, referenced vnode.
3448  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3449  *
3450  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3451  * because it falls back to the ".." lookup if the namecache lookup fails.
3452  */
3453 int
3454 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3455     u_int pathlen)
3456 {
3457         struct nameidata nd;
3458         struct vnode *vp1;
3459         char *rpath, *fbuf;
3460         int error;
3461
3462         ASSERT_VOP_ELOCKED(vp, __func__);
3463
3464         /* Construct global filesystem path from vp. */
3465         VOP_UNLOCK(vp);
3466         error = vn_fullpath_global(vp, &rpath, &fbuf);
3467
3468         if (error != 0) {
3469                 vrele(vp);
3470                 return (error);
3471         }
3472
3473         if (strlen(rpath) >= pathlen) {
3474                 vrele(vp);
3475                 error = ENAMETOOLONG;
3476                 goto out;
3477         }
3478
3479         /*
3480          * Re-lookup the vnode by path to detect a possible rename.
3481          * As a side effect, the vnode is relocked.
3482          * If vnode was renamed, return ENOENT.
3483          */
3484         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3485             UIO_SYSSPACE, path, td);
3486         error = namei(&nd);
3487         if (error != 0) {
3488                 vrele(vp);
3489                 goto out;
3490         }
3491         NDFREE(&nd, NDF_ONLY_PNBUF);
3492         vp1 = nd.ni_vp;
3493         vrele(vp);
3494         if (vp1 == vp)
3495                 strcpy(path, rpath);
3496         else {
3497                 vput(vp1);
3498                 error = ENOENT;
3499         }
3500
3501 out:
3502         free(fbuf, M_TEMP);
3503         return (error);
3504 }
3505
3506 #ifdef DDB
3507 static void
3508 db_print_vpath(struct vnode *vp)
3509 {
3510
3511         while (vp != NULL) {
3512                 db_printf("%p: ", vp);
3513                 if (vp == rootvnode) {
3514                         db_printf("/");
3515                         vp = NULL;
3516                 } else {
3517                         if (vp->v_vflag & VV_ROOT) {
3518                                 db_printf("<mount point>");
3519                                 vp = vp->v_mount->mnt_vnodecovered;
3520                         } else {
3521                                 struct namecache *ncp;
3522                                 char *ncn;
3523                                 int i;
3524
3525                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3526                                 if (ncp != NULL) {
3527                                         ncn = ncp->nc_name;
3528                                         for (i = 0; i < ncp->nc_nlen; i++)
3529                                                 db_printf("%c", *ncn++);
3530                                         vp = ncp->nc_dvp;
3531                                 } else {
3532                                         vp = NULL;
3533                                 }
3534                         }
3535                 }
3536                 db_printf("\n");
3537         }
3538
3539         return;
3540 }
3541
3542 DB_SHOW_COMMAND(vpath, db_show_vpath)
3543 {
3544         struct vnode *vp;
3545
3546         if (!have_addr) {
3547                 db_printf("usage: show vpath <struct vnode *>\n");
3548                 return;
3549         }
3550
3551         vp = (struct vnode *)addr;
3552         db_print_vpath(vp);
3553 }
3554
3555 #endif
3556
3557 static bool __read_frequently cache_fast_lookup = true;
3558 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3559     &cache_fast_lookup, 0, "");
3560
3561 #define CACHE_FPL_FAILED        -2020
3562
3563 static void
3564 cache_fpl_cleanup_cnp(struct componentname *cnp)
3565 {
3566
3567         uma_zfree(namei_zone, cnp->cn_pnbuf);
3568 #ifdef DIAGNOSTIC
3569         cnp->cn_pnbuf = NULL;
3570         cnp->cn_nameptr = NULL;
3571 #endif
3572 }
3573
3574 static void
3575 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3576 {
3577         struct componentname *cnp;
3578
3579         cnp = &ndp->ni_cnd;
3580         while (*(cnp->cn_nameptr) == '/') {
3581                 cnp->cn_nameptr++;
3582                 ndp->ni_pathlen--;
3583         }
3584
3585         *dpp = ndp->ni_rootdir;
3586 }
3587
3588 /*
3589  * Components of nameidata (or objects it can point to) which may
3590  * need restoring in case fast path lookup fails.
3591  */
3592 struct nameidata_saved {
3593         long cn_namelen;
3594         char *cn_nameptr;
3595         size_t ni_pathlen;
3596         int cn_flags;
3597 };
3598
3599 struct cache_fpl {
3600         struct nameidata *ndp;
3601         struct componentname *cnp;
3602         struct pwd *pwd;
3603         struct vnode *dvp;
3604         struct vnode *tvp;
3605         seqc_t dvp_seqc;
3606         seqc_t tvp_seqc;
3607         struct nameidata_saved snd;
3608         int line;
3609         enum cache_fpl_status status:8;
3610         bool in_smr;
3611         bool fsearch;
3612 };
3613
3614 static void
3615 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3616 {
3617
3618         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3619         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3620         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3621         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3622 }
3623
3624 static void
3625 cache_fpl_restore_partial(struct cache_fpl *fpl, struct nameidata_saved *snd)
3626 {
3627
3628         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3629         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3630         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3631         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3632 }
3633
3634 static void
3635 cache_fpl_restore_abort(struct cache_fpl *fpl, struct nameidata_saved *snd)
3636 {
3637
3638         cache_fpl_restore_partial(fpl, snd);
3639         /*
3640          * It is 0 on entry by API contract.
3641          */
3642         fpl->ndp->ni_resflags = 0;
3643 }
3644
3645 #ifdef INVARIANTS
3646 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3647         struct cache_fpl *_fpl = (fpl);                         \
3648         MPASS(_fpl->in_smr == true);                            \
3649         VFS_SMR_ASSERT_ENTERED();                               \
3650 })
3651 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3652         struct cache_fpl *_fpl = (fpl);                         \
3653         MPASS(_fpl->in_smr == false);                           \
3654         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3655 })
3656 #else
3657 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3658 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3659 #endif
3660
3661 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3662         struct cache_fpl *_fpl = (fpl);                         \
3663         vfs_smr_enter();                                        \
3664         _fpl->in_smr = true;                                    \
3665 })
3666
3667 #define cache_fpl_smr_enter(fpl) ({                             \
3668         struct cache_fpl *_fpl = (fpl);                         \
3669         MPASS(_fpl->in_smr == false);                           \
3670         vfs_smr_enter();                                        \
3671         _fpl->in_smr = true;                                    \
3672 })
3673
3674 #define cache_fpl_smr_exit(fpl) ({                              \
3675         struct cache_fpl *_fpl = (fpl);                         \
3676         MPASS(_fpl->in_smr == true);                            \
3677         vfs_smr_exit();                                         \
3678         _fpl->in_smr = false;                                   \
3679 })
3680
3681 static int
3682 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3683 {
3684
3685         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3686                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3687                     ("%s: converting to abort from %d at %d, set at %d\n",
3688                     __func__, fpl->status, line, fpl->line));
3689         }
3690         fpl->status = CACHE_FPL_STATUS_ABORTED;
3691         fpl->line = line;
3692         return (CACHE_FPL_FAILED);
3693 }
3694
3695 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3696
3697 static int
3698 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3699 {
3700
3701         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3702             ("%s: setting to partial at %d, but already set to %d at %d\n",
3703             __func__, line, fpl->status, fpl->line));
3704         cache_fpl_smr_assert_entered(fpl);
3705         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3706         fpl->line = line;
3707         return (CACHE_FPL_FAILED);
3708 }
3709
3710 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3711
3712 static int
3713 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3714 {
3715
3716         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3717             ("%s: setting to handled at %d, but already set to %d at %d\n",
3718             __func__, line, fpl->status, fpl->line));
3719         cache_fpl_smr_assert_not_entered(fpl);
3720         MPASS(error != CACHE_FPL_FAILED);
3721         fpl->status = CACHE_FPL_STATUS_HANDLED;
3722         fpl->line = line;
3723         return (error);
3724 }
3725
3726 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3727
3728 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3729         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
3730          FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \
3731          NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3732
3733 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3734         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3735
3736 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3737     "supported and internal flags overlap");
3738
3739 static bool
3740 cache_fpl_islastcn(struct nameidata *ndp)
3741 {
3742
3743         return (*ndp->ni_next == 0);
3744 }
3745
3746 static bool
3747 cache_fpl_isdotdot(struct componentname *cnp)
3748 {
3749
3750         if (cnp->cn_namelen == 2 &&
3751             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3752                 return (true);
3753         return (false);
3754 }
3755
3756 static bool
3757 cache_can_fplookup(struct cache_fpl *fpl)
3758 {
3759         struct nameidata *ndp;
3760         struct componentname *cnp;
3761         struct thread *td;
3762
3763         ndp = fpl->ndp;
3764         cnp = fpl->cnp;
3765         td = cnp->cn_thread;
3766
3767         if (!cache_fast_lookup) {
3768                 cache_fpl_aborted(fpl);
3769                 return (false);
3770         }
3771 #ifdef MAC
3772         if (mac_vnode_check_lookup_enabled()) {
3773                 cache_fpl_aborted(fpl);
3774                 return (false);
3775         }
3776 #endif
3777         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3778                 cache_fpl_aborted(fpl);
3779                 return (false);
3780         }
3781         if (IN_CAPABILITY_MODE(td)) {
3782                 cache_fpl_aborted(fpl);
3783                 return (false);
3784         }
3785         if (AUDITING_TD(td)) {
3786                 cache_fpl_aborted(fpl);
3787                 return (false);
3788         }
3789         if (ndp->ni_startdir != NULL) {
3790                 cache_fpl_aborted(fpl);
3791                 return (false);
3792         }
3793         return (true);
3794 }
3795
3796 static int
3797 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3798 {
3799         struct nameidata *ndp;
3800         int error;
3801         bool fsearch;
3802
3803         ndp = fpl->ndp;
3804         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3805         if (__predict_false(error != 0)) {
3806                 cache_fpl_smr_exit(fpl);
3807                 return (cache_fpl_aborted(fpl));
3808         }
3809         fpl->fsearch = fsearch;
3810         return (0);
3811 }
3812
3813 static bool
3814 cache_fplookup_vnode_supported(struct vnode *vp)
3815 {
3816
3817         return (vp->v_type != VLNK);
3818 }
3819
3820 static int __noinline
3821 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3822     uint32_t hash)
3823 {
3824         struct componentname *cnp;
3825         struct vnode *dvp;
3826
3827         cnp = fpl->cnp;
3828         dvp = fpl->dvp;
3829
3830         cache_fpl_smr_exit(fpl);
3831         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3832                 return (cache_fpl_handled(fpl, ENOENT));
3833         else
3834                 return (cache_fpl_aborted(fpl));
3835 }
3836
3837 /*
3838  * The target vnode is not supported, prepare for the slow path to take over.
3839  */
3840 static int __noinline
3841 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3842 {
3843         struct nameidata *ndp;
3844         struct componentname *cnp;
3845         enum vgetstate dvs;
3846         struct vnode *dvp;
3847         struct pwd *pwd;
3848         seqc_t dvp_seqc;
3849
3850         ndp = fpl->ndp;
3851         cnp = fpl->cnp;
3852         pwd = fpl->pwd;
3853         dvp = fpl->dvp;
3854         dvp_seqc = fpl->dvp_seqc;
3855
3856         if (!pwd_hold_smr(pwd)) {
3857                 cache_fpl_smr_exit(fpl);
3858                 return (cache_fpl_aborted(fpl));
3859         }
3860
3861         dvs = vget_prep_smr(dvp);
3862         cache_fpl_smr_exit(fpl);
3863         if (__predict_false(dvs == VGET_NONE)) {
3864                 pwd_drop(pwd);
3865                 return (cache_fpl_aborted(fpl));
3866         }
3867
3868         vget_finish_ref(dvp, dvs);
3869         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3870                 vrele(dvp);
3871                 pwd_drop(pwd);
3872                 return (cache_fpl_aborted(fpl));
3873         }
3874
3875         cache_fpl_restore_partial(fpl, &fpl->snd);
3876
3877         ndp->ni_startdir = dvp;
3878         cnp->cn_flags |= MAKEENTRY;
3879         if (cache_fpl_islastcn(ndp))
3880                 cnp->cn_flags |= ISLASTCN;
3881         if (cache_fpl_isdotdot(cnp))
3882                 cnp->cn_flags |= ISDOTDOT;
3883
3884         return (0);
3885 }
3886
3887 static int
3888 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3889 {
3890         struct componentname *cnp;
3891         struct vnode *tvp;
3892         seqc_t tvp_seqc;
3893         int error, lkflags;
3894
3895         cnp = fpl->cnp;
3896         tvp = fpl->tvp;
3897         tvp_seqc = fpl->tvp_seqc;
3898
3899         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3900                 lkflags = LK_SHARED;
3901                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3902                         lkflags = LK_EXCLUSIVE;
3903                 error = vget_finish(tvp, lkflags, tvs);
3904                 if (__predict_false(error != 0)) {
3905                         return (cache_fpl_aborted(fpl));
3906                 }
3907         } else {
3908                 vget_finish_ref(tvp, tvs);
3909         }
3910
3911         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3912                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3913                         vput(tvp);
3914                 else
3915                         vrele(tvp);
3916                 return (cache_fpl_aborted(fpl));
3917         }
3918
3919         return (cache_fpl_handled(fpl, 0));
3920 }
3921
3922 /*
3923  * They want to possibly modify the state of the namecache.
3924  *
3925  * Don't try to match the API contract, just leave.
3926  * TODO: this leaves scalability on the table
3927  */
3928 static int
3929 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3930 {
3931         struct componentname *cnp;
3932
3933         cnp = fpl->cnp;
3934         MPASS(cnp->cn_nameiop != LOOKUP);
3935         return (cache_fpl_partial(fpl));
3936 }
3937
3938 static int __noinline
3939 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3940 {
3941         struct componentname *cnp;
3942         enum vgetstate dvs, tvs;
3943         struct vnode *dvp, *tvp;
3944         seqc_t dvp_seqc;
3945         int error;
3946
3947         cnp = fpl->cnp;
3948         dvp = fpl->dvp;
3949         dvp_seqc = fpl->dvp_seqc;
3950         tvp = fpl->tvp;
3951
3952         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3953
3954         /*
3955          * This is less efficient than it can be for simplicity.
3956          */
3957         dvs = vget_prep_smr(dvp);
3958         if (__predict_false(dvs == VGET_NONE)) {
3959                 return (cache_fpl_aborted(fpl));
3960         }
3961         tvs = vget_prep_smr(tvp);
3962         if (__predict_false(tvs == VGET_NONE)) {
3963                 cache_fpl_smr_exit(fpl);
3964                 vget_abort(dvp, dvs);
3965                 return (cache_fpl_aborted(fpl));
3966         }
3967
3968         cache_fpl_smr_exit(fpl);
3969
3970         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3971                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3972                 if (__predict_false(error != 0)) {
3973                         vget_abort(tvp, tvs);
3974                         return (cache_fpl_aborted(fpl));
3975                 }
3976         } else {
3977                 vget_finish_ref(dvp, dvs);
3978         }
3979
3980         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3981                 vget_abort(tvp, tvs);
3982                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3983                         vput(dvp);
3984                 else
3985                         vrele(dvp);
3986                 return (cache_fpl_aborted(fpl));
3987         }
3988
3989         error = cache_fplookup_final_child(fpl, tvs);
3990         if (__predict_false(error != 0)) {
3991                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3992                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3993                         vput(dvp);
3994                 else
3995                         vrele(dvp);
3996                 return (error);
3997         }
3998
3999         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4000         return (0);
4001 }
4002
4003 static int
4004 cache_fplookup_final(struct cache_fpl *fpl)
4005 {
4006         struct componentname *cnp;
4007         enum vgetstate tvs;
4008         struct vnode *dvp, *tvp;
4009         seqc_t dvp_seqc;
4010
4011         cnp = fpl->cnp;
4012         dvp = fpl->dvp;
4013         dvp_seqc = fpl->dvp_seqc;
4014         tvp = fpl->tvp;
4015
4016         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
4017
4018         if (cnp->cn_nameiop != LOOKUP) {
4019                 return (cache_fplookup_final_modifying(fpl));
4020         }
4021
4022         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4023                 return (cache_fplookup_final_withparent(fpl));
4024
4025         tvs = vget_prep_smr(tvp);
4026         if (__predict_false(tvs == VGET_NONE)) {
4027                 return (cache_fpl_partial(fpl));
4028         }
4029
4030         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4031                 cache_fpl_smr_exit(fpl);
4032                 vget_abort(tvp, tvs);
4033                 return (cache_fpl_aborted(fpl));
4034         }
4035
4036         cache_fpl_smr_exit(fpl);
4037         return (cache_fplookup_final_child(fpl, tvs));
4038 }
4039
4040 static int __noinline
4041 cache_fplookup_dot(struct cache_fpl *fpl)
4042 {
4043         struct vnode *dvp;
4044
4045         dvp = fpl->dvp;
4046
4047         fpl->tvp = dvp;
4048         fpl->tvp_seqc = vn_seqc_read_any(dvp);
4049         if (seqc_in_modify(fpl->tvp_seqc)) {
4050                 return (cache_fpl_aborted(fpl));
4051         }
4052
4053         counter_u64_add(dothits, 1);
4054         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
4055
4056         return (0);
4057 }
4058
4059 static int __noinline
4060 cache_fplookup_dotdot(struct cache_fpl *fpl)
4061 {
4062         struct nameidata *ndp;
4063         struct componentname *cnp;
4064         struct namecache *ncp;
4065         struct vnode *dvp;
4066         struct prison *pr;
4067         u_char nc_flag;
4068
4069         ndp = fpl->ndp;
4070         cnp = fpl->cnp;
4071         dvp = fpl->dvp;
4072
4073         /*
4074          * XXX this is racy the same way regular lookup is
4075          */
4076         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4077             pr = pr->pr_parent)
4078                 if (dvp == pr->pr_root)
4079                         break;
4080
4081         if (dvp == ndp->ni_rootdir ||
4082             dvp == ndp->ni_topdir ||
4083             dvp == rootvnode ||
4084             pr != NULL) {
4085                 fpl->tvp = dvp;
4086                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4087                 if (seqc_in_modify(fpl->tvp_seqc)) {
4088                         return (cache_fpl_aborted(fpl));
4089                 }
4090                 return (0);
4091         }
4092
4093         if ((dvp->v_vflag & VV_ROOT) != 0) {
4094                 /*
4095                  * TODO
4096                  * The opposite of climb mount is needed here.
4097                  */
4098                 return (cache_fpl_aborted(fpl));
4099         }
4100
4101         ncp = atomic_load_ptr(&dvp->v_cache_dd);
4102         if (ncp == NULL) {
4103                 return (cache_fpl_aborted(fpl));
4104         }
4105
4106         nc_flag = atomic_load_char(&ncp->nc_flag);
4107         if ((nc_flag & NCF_ISDOTDOT) != 0) {
4108                 if ((nc_flag & NCF_NEGATIVE) != 0)
4109                         return (cache_fpl_aborted(fpl));
4110                 fpl->tvp = ncp->nc_vp;
4111         } else {
4112                 fpl->tvp = ncp->nc_dvp;
4113         }
4114
4115         if (!cache_ncp_canuse(ncp)) {
4116                 return (cache_fpl_aborted(fpl));
4117         }
4118
4119         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4120         if (seqc_in_modify(fpl->tvp_seqc)) {
4121                 return (cache_fpl_partial(fpl));
4122         }
4123
4124         counter_u64_add(dotdothits, 1);
4125         return (0);
4126 }
4127
4128 static int __noinline
4129 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4130 {
4131         u_char nc_flag;
4132         bool neg_promote;
4133
4134         nc_flag = atomic_load_char(&ncp->nc_flag);
4135         MPASS((nc_flag & NCF_NEGATIVE) != 0);
4136         /*
4137          * If they want to create an entry we need to replace this one.
4138          */
4139         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4140                 /*
4141                  * TODO
4142                  * This should call something similar to
4143                  * cache_fplookup_final_modifying.
4144                  */
4145                 return (cache_fpl_partial(fpl));
4146         }
4147         neg_promote = cache_neg_hit_prep(ncp);
4148         if (!cache_ncp_canuse(ncp)) {
4149                 cache_neg_hit_abort(ncp);
4150                 return (cache_fpl_partial(fpl));
4151         }
4152         if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4153                 cache_neg_hit_abort(ncp);
4154                 return (cache_fpl_partial(fpl));
4155         }
4156         if (neg_promote) {
4157                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4158         }
4159         cache_neg_hit_finish(ncp);
4160         cache_fpl_smr_exit(fpl);
4161         return (cache_fpl_handled(fpl, ENOENT));
4162 }
4163
4164 static int
4165 cache_fplookup_next(struct cache_fpl *fpl)
4166 {
4167         struct componentname *cnp;
4168         struct namecache *ncp;
4169         struct vnode *dvp, *tvp;
4170         u_char nc_flag;
4171         uint32_t hash;
4172
4173         cnp = fpl->cnp;
4174         dvp = fpl->dvp;
4175
4176         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4177                 return (cache_fplookup_dot(fpl));
4178         }
4179
4180         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4181
4182         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4183                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4184                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4185                         break;
4186         }
4187
4188         /*
4189          * If there is no entry we have to punt to the slow path to perform
4190          * actual lookup. Should there be nothing with this name a negative
4191          * entry will be created.
4192          */
4193         if (__predict_false(ncp == NULL)) {
4194                 return (cache_fpl_partial(fpl));
4195         }
4196
4197         tvp = atomic_load_ptr(&ncp->nc_vp);
4198         nc_flag = atomic_load_char(&ncp->nc_flag);
4199         if ((nc_flag & NCF_NEGATIVE) != 0) {
4200                 return (cache_fplookup_neg(fpl, ncp, hash));
4201         }
4202
4203         if (!cache_ncp_canuse(ncp)) {
4204                 return (cache_fpl_partial(fpl));
4205         }
4206
4207         fpl->tvp = tvp;
4208         fpl->tvp_seqc = vn_seqc_read_any(tvp);
4209         if (seqc_in_modify(fpl->tvp_seqc)) {
4210                 return (cache_fpl_partial(fpl));
4211         }
4212
4213         if (!cache_fplookup_vnode_supported(tvp)) {
4214                 return (cache_fpl_partial(fpl));
4215         }
4216
4217         counter_u64_add(numposhits, 1);
4218         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4219         return (0);
4220 }
4221
4222 static bool
4223 cache_fplookup_mp_supported(struct mount *mp)
4224 {
4225
4226         if (mp == NULL)
4227                 return (false);
4228         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4229                 return (false);
4230         return (true);
4231 }
4232
4233 /*
4234  * Walk up the mount stack (if any).
4235  *
4236  * Correctness is provided in the following ways:
4237  * - all vnodes are protected from freeing with SMR
4238  * - struct mount objects are type stable making them always safe to access
4239  * - stability of the particular mount is provided by busying it
4240  * - relationship between the vnode which is mounted on and the mount is
4241  *   verified with the vnode sequence counter after busying
4242  * - association between root vnode of the mount and the mount is protected
4243  *   by busy
4244  *
4245  * From that point on we can read the sequence counter of the root vnode
4246  * and get the next mount on the stack (if any) using the same protection.
4247  *
4248  * By the end of successful walk we are guaranteed the reached state was
4249  * indeed present at least at some point which matches the regular lookup.
4250  */
4251 static int __noinline
4252 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4253 {
4254         struct mount *mp, *prev_mp;
4255         struct mount_pcpu *mpcpu, *prev_mpcpu;
4256         struct vnode *vp;
4257         seqc_t vp_seqc;
4258
4259         vp = fpl->tvp;
4260         vp_seqc = fpl->tvp_seqc;
4261
4262         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4263         mp = atomic_load_ptr(&vp->v_mountedhere);
4264         if (mp == NULL)
4265                 return (0);
4266
4267         prev_mp = NULL;
4268         for (;;) {
4269                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
4270                         if (prev_mp != NULL)
4271                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4272                         return (cache_fpl_partial(fpl));
4273                 }
4274                 if (prev_mp != NULL)
4275                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4276                 if (!vn_seqc_consistent(vp, vp_seqc)) {
4277                         vfs_op_thread_exit_crit(mp, mpcpu);
4278                         return (cache_fpl_partial(fpl));
4279                 }
4280                 if (!cache_fplookup_mp_supported(mp)) {
4281                         vfs_op_thread_exit_crit(mp, mpcpu);
4282                         return (cache_fpl_partial(fpl));
4283                 }
4284                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4285                 if (vp == NULL || VN_IS_DOOMED(vp)) {
4286                         vfs_op_thread_exit_crit(mp, mpcpu);
4287                         return (cache_fpl_partial(fpl));
4288                 }
4289                 vp_seqc = vn_seqc_read_any(vp);
4290                 if (seqc_in_modify(vp_seqc)) {
4291                         vfs_op_thread_exit_crit(mp, mpcpu);
4292                         return (cache_fpl_partial(fpl));
4293                 }
4294                 prev_mp = mp;
4295                 prev_mpcpu = mpcpu;
4296                 mp = atomic_load_ptr(&vp->v_mountedhere);
4297                 if (mp == NULL)
4298                         break;
4299         }
4300
4301         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
4302         fpl->tvp = vp;
4303         fpl->tvp_seqc = vp_seqc;
4304         return (0);
4305 }
4306
4307 static bool
4308 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4309 {
4310         struct mount *mp;
4311         struct vnode *vp;
4312
4313         vp = fpl->tvp;
4314
4315         /*
4316          * Hack: while this is a union, the pointer tends to be NULL so save on
4317          * a branch.
4318          */
4319         mp = atomic_load_ptr(&vp->v_mountedhere);
4320         if (mp == NULL)
4321                 return (false);
4322         if (vp->v_type == VDIR)
4323                 return (true);
4324         return (false);
4325 }
4326
4327 /*
4328  * Parse the path.
4329  *
4330  * The code was originally copy-pasted from regular lookup and despite
4331  * clean ups leaves performance on the table. Any modifications here
4332  * must take into account that in case off fallback the resulting
4333  * nameidata state has to be compatible with the original.
4334  */
4335 static int
4336 cache_fplookup_parse(struct cache_fpl *fpl)
4337 {
4338         struct nameidata *ndp;
4339         struct componentname *cnp;
4340         char *cp;
4341
4342         ndp = fpl->ndp;
4343         cnp = fpl->cnp;
4344
4345         /*
4346          * Search a new directory.
4347          *
4348          * The last component of the filename is left accessible via
4349          * cnp->cn_nameptr for callers that need the name. Callers needing
4350          * the name set the SAVENAME flag. When done, they assume
4351          * responsibility for freeing the pathname buffer.
4352          */
4353         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4354                 continue;
4355         cnp->cn_namelen = cp - cnp->cn_nameptr;
4356         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4357                 cache_fpl_smr_exit(fpl);
4358                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4359         }
4360         ndp->ni_pathlen -= cnp->cn_namelen;
4361         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4362             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4363         ndp->ni_next = cp;
4364
4365         /*
4366          * Replace multiple slashes by a single slash and trailing slashes
4367          * by a null.  This must be done before VOP_LOOKUP() because some
4368          * fs's don't know about trailing slashes.  Remember if there were
4369          * trailing slashes to handle symlinks, existing non-directories
4370          * and non-existing files that won't be directories specially later.
4371          */
4372         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4373                 cp++;
4374                 ndp->ni_pathlen--;
4375                 if (*cp == '\0') {
4376                         /*
4377                          * TODO
4378                          * Regular lookup performs the following:
4379                          * *ndp->ni_next = '\0';
4380                          * cnp->cn_flags |= TRAILINGSLASH;
4381                          *
4382                          * Which is problematic since it modifies data read
4383                          * from userspace. Then if fast path lookup was to
4384                          * abort we would have to either restore it or convey
4385                          * the flag. Since this is a corner case just ignore
4386                          * it for simplicity.
4387                          */
4388                         return (cache_fpl_partial(fpl));
4389                 }
4390         }
4391         ndp->ni_next = cp;
4392
4393         /*
4394          * Check for degenerate name (e.g. / or "")
4395          * which is a way of talking about a directory,
4396          * e.g. like "/." or ".".
4397          *
4398          * TODO
4399          * Another corner case handled by the regular lookup
4400          */
4401         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4402                 return (cache_fpl_partial(fpl));
4403         }
4404         return (0);
4405 }
4406
4407 static void
4408 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4409 {
4410         struct nameidata *ndp;
4411         struct componentname *cnp;
4412
4413         ndp = fpl->ndp;
4414         cnp = fpl->cnp;
4415
4416         cnp->cn_nameptr = ndp->ni_next;
4417         while (*cnp->cn_nameptr == '/') {
4418                 cnp->cn_nameptr++;
4419                 ndp->ni_pathlen--;
4420         }
4421 }
4422
4423 /*
4424  * See the API contract for VOP_FPLOOKUP_VEXEC.
4425  */
4426 static int __noinline
4427 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4428 {
4429         struct vnode *dvp;
4430         seqc_t dvp_seqc;
4431
4432         dvp = fpl->dvp;
4433         dvp_seqc = fpl->dvp_seqc;
4434
4435         /*
4436          * Hack: they may be looking up foo/bar, where foo is a
4437          * regular file. In such a case we need to turn ENOTDIR,
4438          * but we may happen to get here with a different error.
4439          */
4440         if (dvp->v_type != VDIR) {
4441                 /*
4442                  * The check here is predominantly to catch
4443                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4444                  * gets doomed past this point it is going to
4445                  * fail seqc verification.
4446                  */
4447                 if (VN_IS_DOOMED(dvp)) {
4448                         return (cache_fpl_aborted(fpl));
4449                 }
4450                 error = ENOTDIR;
4451         }
4452
4453         /*
4454          * Hack: handle O_SEARCH.
4455          *
4456          * Open Group Base Specifications Issue 7, 2018 edition states:
4457          * If the access mode of the open file description associated with the
4458          * file descriptor is not O_SEARCH, the function shall check whether
4459          * directory searches are permitted using the current permissions of
4460          * the directory underlying the file descriptor. If the access mode is
4461          * O_SEARCH, the function shall not perform the check.
4462          *
4463          * Regular lookup tests for the NOEXECCHECK flag for every path
4464          * component to decide whether to do the permission check. However,
4465          * since most lookups never have the flag (and when they do it is only
4466          * present for the first path component), lockless lookup only acts on
4467          * it if there is a permission problem. Here the flag is represented
4468          * with a boolean so that we don't have to clear it on the way out.
4469          *
4470          * For simplicity this always aborts.
4471          * TODO: check if this is the first lookup and ignore the permission
4472          * problem. Note the flag has to survive fallback (if it happens to be
4473          * performed).
4474          */
4475         if (fpl->fsearch) {
4476                 return (cache_fpl_aborted(fpl));
4477         }
4478
4479         switch (error) {
4480         case EAGAIN:
4481                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4482                         error = cache_fpl_aborted(fpl);
4483                 } else {
4484                         cache_fpl_partial(fpl);
4485                 }
4486                 break;
4487         default:
4488                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4489                         error = cache_fpl_aborted(fpl);
4490                 } else {
4491                         cache_fpl_smr_exit(fpl);
4492                         cache_fpl_handled(fpl, error);
4493                 }
4494                 break;
4495         }
4496         return (error);
4497 }
4498
4499 static int
4500 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4501 {
4502         struct nameidata *ndp;
4503         struct componentname *cnp;
4504         struct mount *mp;
4505         int error;
4506
4507         error = CACHE_FPL_FAILED;
4508         ndp = fpl->ndp;
4509         cnp = fpl->cnp;
4510
4511         cache_fpl_checkpoint(fpl, &fpl->snd);
4512
4513         fpl->dvp = dvp;
4514         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4515         if (seqc_in_modify(fpl->dvp_seqc)) {
4516                 cache_fpl_aborted(fpl);
4517                 goto out;
4518         }
4519         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4520         if (!cache_fplookup_mp_supported(mp)) {
4521                 cache_fpl_aborted(fpl);
4522                 goto out;
4523         }
4524
4525         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4526
4527         for (;;) {
4528                 error = cache_fplookup_parse(fpl);
4529                 if (__predict_false(error != 0)) {
4530                         break;
4531                 }
4532
4533                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4534
4535                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4536                 if (__predict_false(error != 0)) {
4537                         error = cache_fplookup_failed_vexec(fpl, error);
4538                         break;
4539                 }
4540
4541                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4542                         error = cache_fplookup_dotdot(fpl);
4543                         if (__predict_false(error != 0)) {
4544                                 break;
4545                         }
4546                 } else {
4547                         error = cache_fplookup_next(fpl);
4548                         if (__predict_false(error != 0)) {
4549                                 break;
4550                         }
4551
4552                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4553
4554                         if (cache_fplookup_need_climb_mount(fpl)) {
4555                                 error = cache_fplookup_climb_mount(fpl);
4556                                 if (__predict_false(error != 0)) {
4557                                         break;
4558                                 }
4559                         }
4560                 }
4561
4562                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4563
4564                 if (cache_fpl_islastcn(ndp)) {
4565                         error = cache_fplookup_final(fpl);
4566                         break;
4567                 }
4568
4569                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4570                         error = cache_fpl_aborted(fpl);
4571                         break;
4572                 }
4573
4574                 fpl->dvp = fpl->tvp;
4575                 fpl->dvp_seqc = fpl->tvp_seqc;
4576
4577                 cache_fplookup_parse_advance(fpl);
4578                 cache_fpl_checkpoint(fpl, &fpl->snd);
4579         }
4580 out:
4581         switch (fpl->status) {
4582         case CACHE_FPL_STATUS_UNSET:
4583                 __assert_unreachable();
4584                 break;
4585         case CACHE_FPL_STATUS_PARTIAL:
4586                 cache_fpl_smr_assert_entered(fpl);
4587                 return (cache_fplookup_partial_setup(fpl));
4588         case CACHE_FPL_STATUS_ABORTED:
4589                 if (fpl->in_smr)
4590                         cache_fpl_smr_exit(fpl);
4591                 return (CACHE_FPL_FAILED);
4592         case CACHE_FPL_STATUS_HANDLED:
4593                 MPASS(error != CACHE_FPL_FAILED);
4594                 cache_fpl_smr_assert_not_entered(fpl);
4595                 /*
4596                  * A common error is ENOENT.
4597                  */
4598                 if (error != 0) {
4599                         ndp->ni_dvp = NULL;
4600                         ndp->ni_vp = NULL;
4601                         cache_fpl_cleanup_cnp(cnp);
4602                         return (error);
4603                 }
4604                 ndp->ni_dvp = fpl->dvp;
4605                 ndp->ni_vp = fpl->tvp;
4606                 if (cnp->cn_flags & SAVENAME)
4607                         cnp->cn_flags |= HASBUF;
4608                 else
4609                         cache_fpl_cleanup_cnp(cnp);
4610                 return (error);
4611         }
4612         __assert_unreachable();
4613 }
4614
4615 /*
4616  * Fast path lookup protected with SMR and sequence counters.
4617  *
4618  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4619  *
4620  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4621  * outlined below.
4622  *
4623  * Traditional vnode lookup conceptually looks like this:
4624  *
4625  * vn_lock(current);
4626  * for (;;) {
4627  *      next = find();
4628  *      vn_lock(next);
4629  *      vn_unlock(current);
4630  *      current = next;
4631  *      if (last)
4632  *          break;
4633  * }
4634  * return (current);
4635  *
4636  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4637  * any modifications thanks to holding respective locks.
4638  *
4639  * The same guarantee can be provided with a combination of safe memory
4640  * reclamation and sequence counters instead. If all operations which affect
4641  * the relationship between the current vnode and the one we are looking for
4642  * also modify the counter, we can verify whether all the conditions held as
4643  * we made the jump. This includes things like permissions, mount points etc.
4644  * Counter modification is provided by enclosing relevant places in
4645  * vn_seqc_write_begin()/end() calls.
4646  *
4647  * Thus this translates to:
4648  *
4649  * vfs_smr_enter();
4650  * dvp_seqc = seqc_read_any(dvp);
4651  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4652  *     abort();
4653  * for (;;) {
4654  *      tvp = find();
4655  *      tvp_seqc = seqc_read_any(tvp);
4656  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4657  *          abort();
4658  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4659  *          abort();
4660  *      dvp = tvp; // we know nothing of importance has changed
4661  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4662  *      if (last)
4663  *          break;
4664  * }
4665  * vget(); // secure the vnode
4666  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4667  *          abort();
4668  * // at this point we know nothing has changed for any parent<->child pair
4669  * // as they were crossed during the lookup, meaning we matched the guarantee
4670  * // of the locked variant
4671  * return (tvp);
4672  *
4673  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4674  * - they are called while within vfs_smr protection which they must never exit
4675  * - EAGAIN can be returned to denote checking could not be performed, it is
4676  *   always valid to return it
4677  * - if the sequence counter has not changed the result must be valid
4678  * - if the sequence counter has changed both false positives and false negatives
4679  *   are permitted (since the result will be rejected later)
4680  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4681  *
4682  * Caveats to watch out for:
4683  * - vnodes are passed unlocked and unreferenced with nothing stopping
4684  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4685  *   to use atomic_load_ptr to fetch it.
4686  * - the aforementioned object can also get freed, meaning absent other means it
4687  *   should be protected with vfs_smr
4688  * - either safely checking permissions as they are modified or guaranteeing
4689  *   their stability is left to the routine
4690  */
4691 int
4692 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4693     struct pwd **pwdp)
4694 {
4695         struct cache_fpl fpl;
4696         struct pwd *pwd;
4697         struct vnode *dvp;
4698         struct componentname *cnp;
4699         struct nameidata_saved orig;
4700         int error;
4701
4702         MPASS(ndp->ni_lcf == 0);
4703
4704         fpl.status = CACHE_FPL_STATUS_UNSET;
4705         fpl.ndp = ndp;
4706         fpl.cnp = &ndp->ni_cnd;
4707         MPASS(curthread == fpl.cnp->cn_thread);
4708
4709         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4710                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4711
4712         if (!cache_can_fplookup(&fpl)) {
4713                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4714                 *status = fpl.status;
4715                 return (EOPNOTSUPP);
4716         }
4717
4718         cache_fpl_checkpoint(&fpl, &orig);
4719
4720         cache_fpl_smr_enter_initial(&fpl);
4721         fpl.fsearch = false;
4722         pwd = pwd_get_smr();
4723         fpl.pwd = pwd;
4724         ndp->ni_rootdir = pwd->pwd_rdir;
4725         ndp->ni_topdir = pwd->pwd_jdir;
4726
4727         cnp = fpl.cnp;
4728         cnp->cn_nameptr = cnp->cn_pnbuf;
4729         if (cnp->cn_pnbuf[0] == '/') {
4730                 cache_fpl_handle_root(ndp, &dvp);
4731                 ndp->ni_resflags |= NIRES_ABS;
4732         } else {
4733                 if (ndp->ni_dirfd == AT_FDCWD) {
4734                         dvp = pwd->pwd_cdir;
4735                 } else {
4736                         error = cache_fplookup_dirfd(&fpl, &dvp);
4737                         if (__predict_false(error != 0)) {
4738                                 goto out;
4739                         }
4740                 }
4741         }
4742
4743         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4744
4745         error = cache_fplookup_impl(dvp, &fpl);
4746 out:
4747         cache_fpl_smr_assert_not_entered(&fpl);
4748         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4749
4750         *status = fpl.status;
4751         switch (fpl.status) {
4752         case CACHE_FPL_STATUS_UNSET:
4753                 __assert_unreachable();
4754                 break;
4755         case CACHE_FPL_STATUS_HANDLED:
4756                 SDT_PROBE3(vfs, namei, lookup, return, error,
4757                     (error == 0 ? ndp->ni_vp : NULL), true);
4758                 break;
4759         case CACHE_FPL_STATUS_PARTIAL:
4760                 *pwdp = fpl.pwd;
4761                 /*
4762                  * Status restored by cache_fplookup_partial_setup.
4763                  */
4764                 break;
4765         case CACHE_FPL_STATUS_ABORTED:
4766                 cache_fpl_restore_abort(&fpl, &orig);
4767                 break;
4768         }
4769         return (error);
4770 }