sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70 #ifdef INVARIANTS
  71 #include <machine/_inttypes.h>
  72 #endif
  73
  74 #include <sys/capsicum.h>
  75
  76 #include <security/audit/audit.h>
  77 #include <security/mac/mac_framework.h>
  78
  79 #ifdef DDB
  80 #include <ddb/ddb.h>
  81 #endif
  82
  83 #include <vm/uma.h>
  84
/* Root of the vfs.cache sysctl subtree. */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");

/*
 * DTrace SDT probes.  The "vfs" provider is only declared here.  Each
 * SDT_PROBE_DEFINEn() names a probe as (provider, module, function, name)
 * followed by n strings giving the argument types.
 */
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");

/*
 * NOTE(review): the first argument type below is spelled "struct nameidata"
 * without a '*', unlike every other struct argument in this list -- confirm
 * against what the probe site actually passes.
 */
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
/* These two probes are only declared here; their definitions are not in view. */
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 130
/*
 * The structures below describe the elements in the cache of recent
 * names looked up by namei.
 */

/*
 * State carried by a negative entry.  It shares storage with the target
 * vnode pointer (union n_un in struct namecache), which is why its size
 * is checked against the size of a pointer right after the definition.
 */
struct negstate {
        u_char neg_flag;        /* NEG_* flags */
        u_char neg_hit;
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");
 141
/*
 * A single name cache entry.
 *
 * The entry is linked on three lists at once (nc_src, nc_dst, nc_hash).
 * The union holds either the target vnode or the negative entry state;
 * nu_neg is only accessed for entries with NCF_NEGATIVE set (NCP2NEGSTATE
 * asserts this).
 *
 * nc_name is a zero-length trailing array: the name bytes follow the fixed
 * part directly, so the allocation size must include room for them (see the
 * CACHE_ZONE_*_SIZE macros).
 */
struct  namecache {
        LIST_ENTRY(namecache) nc_src;   /* source vnode list */
        TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
        CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
        struct  vnode *nc_dvp;          /* vnode of parent of name */
        union {
                struct  vnode *nu_vp;   /* vnode the name refers to */
                struct  negstate nu_neg;/* negative entry state */
        } n_un;
        u_char  nc_flag;                /* flag bits */
        u_char  nc_nlen;                /* length of name */
        char    nc_name[0];             /* segment name + nul */
};
 155
/*
 * struct namecache_ts embeds struct namecache as its last member (nc_nc),
 * preceded by the timestamp fields.  Code holding a struct namecache pointer
 * with NCF_TS set recovers the enclosing structure with __containerof().
 *
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct  namecache_ts {
        struct  timespec nc_time;       /* timespec provided by fs */
        struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
        int     nc_ticks;               /* ticks value when entry was added */
        int     nc_pad;
        struct namecache nc_nc;         /* must stay last: nc_name trails it */
};
 173
/*
 * List head type for a batch of entries awaiting release; the entries are
 * chained through nc_dst (see cache_free_batch()).
 */
TAILQ_HEAD(cache_freebatch, namecache);

/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned. Since other platforms
 * may be in the same situation, accept a small cost and enforce the
 * alignment for everyone. Note this is a nop for 64-bit platforms.
 *
 * The value is used as "alignment - 1": the zone size assertions below
 * test divisibility by (CACHE_ZONE_ALIGNMENT + 1).
 */
#define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 183
/*
 * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
 * 4.4 BSD codebase. Later on struct namecache was tweaked to become
 * smaller and the value was bumped to retain the total size, but it
 * was never re-evaluated for suitability. A simple test counting
 * lengths during package building shows that the value of 45 covers
 * about 86% of all added entries, reaching 99% at 65.
 *
 * Regardless of the above, use of dedicated zones instead of malloc may be
 * inducing additional waste. This may be hard to address as said zones are
 * tied to VFS SMR. Even if retaining them, the current split should be
 * re-evaluated.
 *
 * Names of up to CACHE_PATH_CUTOFF bytes go to the "small" zones, longer
 * ones to the "large" zones (see cache_alloc_uma()).  CACHE_LARGE_PAD is
 * added to the large size only; together with the static assertions below
 * it keeps every zone size a multiple of (CACHE_ZONE_ALIGNMENT + 1).
 */
#ifdef __LP64__
#define CACHE_PATH_CUTOFF       45
#define CACHE_LARGE_PAD         6
#else
#define CACHE_PATH_CUTOFF       41
#define CACHE_LARGE_PAD         2
#endif

/*
 * Item sizes: fixed part of the entry, the longest name the zone accepts and
 * the terminating nul.  The _TS variants add everything that precedes nc_nc
 * in struct namecache_ts.
 */
#define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
#define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
#define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)

/* Every zone size must be a multiple of the enforced alignment. */
_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 214
/* Shorthands for the two arms of the n_un union in struct namecache. */
#define nc_vp           n_un.nu_vp
#define nc_neg          n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 *
 * Only the uses visible in this part of the file are annotated.
 */
#define NCF_WHITE       0x01    /* rejected by cache_fpl_neg_ncp_canuse() */
#define NCF_ISDOTDOT    0x02
#define NCF_TS          0x04    /* entry is embedded in a struct namecache_ts */
#define NCF_DTS         0x08
#define NCF_DVDROP      0x10    /* freeing the entry drops the hold on nc_dvp */
#define NCF_NEGATIVE    0x20    /* negative entry; n_un holds nu_neg */
#define NCF_INVALID     0x40    /* set by cache_ncp_invalidate() */
#define NCF_WIP         0x80    /* rejected by the *_canuse() checks */

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT         0x01

/* Defined later in the file; needed by cache_alloc(). */
static bool     cache_neg_evict_cond(u_long lnumcache);
 236
/*
 * Mark an entry as invalid.
 *
 * This is called before it starts getting deconstructed.
 *
 * The flag is published with an atomic store followed by a release fence.
 * Lock-eliding readers re-check it through cache_ncp_canuse(), which issues
 * an acquire fence before loading nc_flag.
 */
static void
cache_ncp_invalidate(struct namecache *ncp)
{

        /* Invalidating the same entry twice indicates a bug in the caller. */
        KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
            ("%s: entry %p already invalid", __func__, ncp));
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
        atomic_thread_fence_rel();
}
 251
/*
 * Check whether the entry can be safely used.
 *
 * All places which elide locks are supposed to call this after they are
 * done with reading from an entry.
 *
 * Evaluates to true when neither NCF_INVALID nor NCF_WIP is set.  The
 * acquire fence is issued before nc_flag is loaded, so the check must come
 * after the reads it is meant to validate.  The argument is evaluated once.
 */
#define cache_ncp_canuse(ncp)   ({                                      \
        struct namecache *_ncp = (ncp);                                 \
        u_char _nc_flag;                                                \
                                                                        \
        atomic_thread_fence_acq();                                      \
        _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
        __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);      \
})
 266
/*
 * Like cache_ncp_canuse() but also checks NCF_WHITE: evaluates to true only
 * when none of NCF_INVALID, NCF_WIP and NCF_WHITE is set.  Same fence-then-
 * load ordering; the argument is evaluated once.
 */
#define cache_fpl_neg_ncp_canuse(ncp)   ({                              \
        struct namecache *_ncp = (ncp);                                 \
        u_char _nc_flag;                                                \
                                                                        \
        atomic_thread_fence_acq();                                      \
        _nc_flag = atomic_load_char(&_ncp->nc_flag);                    \
        __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);  \
})
 278
 279 /*
 280  * Name caching works as follows:
 281  *
 282  * Names found by directory scans are retained in a cache
 283  * for future reference.  It is managed LRU, so frequently
 284  * used names will hang around.  Cache is indexed by hash value
 285  * obtained from (dvp, name) where dvp refers to the directory
 286  * containing name.
 287  *
 288  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 289  * exist) the vnode pointer will be NULL.
 290  *
 291  * Upon reaching the last segment of a path, if the reference
 292  * is for DELETE, or NOCACHE is set (rewrite), and the
 293  * name is located in the cache, it will be dropped.
 294  *
 295  * These locks are used (in the order in which they can be taken):
 296  * NAME         TYPE    ROLE
 297  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 298  * bucketlock   mtx     for access to given set of hash buckets
 299  * neglist      mtx     negative entry LRU management
 300  *
 301  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 302  * order is lower address first. Both are recursive.
 303  *
 304  * "." lookups are lockless.
 305  *
 306  * ".." and vnode -> name lookups require vnodelock.
 307  *
 308  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 309  *
 310  * Insertions and removals of entries require involved vnodes and bucketlocks
 311  * to be locked to provide safe operation against other threads modifying the
 312  * cache.
 313  *
 314  * Some lookups result in removal of the found entry (e.g. getting rid of a
 315  * negative entry with the intent to create a positive one), which poses a
 316  * problem when multiple threads reach the state. Similarly, two different
 317  * threads can purge two different vnodes and try to remove the same name.
 318  *
 319  * If the already held vnode lock is lower than the second required lock, we
 320  * can just take the other lock. However, in the opposite case, this could
 321  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 322  * the first node, locking everything in order and revalidating the state.
 323  */
 324
VFS_SMR_DECLARE;

/* vfs.cache.param: tunables and derived limits. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

/*
 * Capacity limit; cache_alloc() refuses to add an entry once numcache
 * reaches it.
 */
static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

/* Not static: visible outside this file. */
u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.
 */
static u_int ncnegminpct = 3;

/* Read-only export of the threshold derived from ncnegminpct. */
static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");
 352
/*
 * Structures associated with name caching.
 *
 * NCHHASH() maps a hash value to its chain head by masking it with nchash.
 * NOTE(review): nchash is therefore used as an index mask here even though
 * its comment and sysctl description call it the table size -- confirm
 * against the initialization code.
 */
#define NCHHASH(hash) \
        (&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly     nchash;                 /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
/* Global entry counts; numcache is updated with atomics in cache_alloc()/cache_free(). */
static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */

struct nchstats nchstats;               /* cache effectiveness statistics */

/* Boolean knobs exported as vfs.cache_fast_revlookup and vfs.cache_rename_add. */
static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

static bool __read_mostly cache_rename_add = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_rename_add, CTLFLAG_RW,
    &cache_rename_add, 0, "");

static u_int __exclusive_cache_line neg_cycle;
 374
 375 static u_int __exclusive_cache_line neg_cycle;
 376
 377 #define ncneghash       3
 378 #define numneglists     (ncneghash + 1)
 379
 380 struct neglist {
 381         struct mtx              nl_evict_lock;
 382         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 383         TAILQ_HEAD(, namecache) nl_list;
 384         TAILQ_HEAD(, namecache) nl_hotlist;
 385         u_long                  nl_hotnum;
 386 } __aligned(CACHE_LINE_SIZE);
 387
 388 static struct neglist neglists[numneglists];
 389
 390 static inline struct neglist *
 391 NCP2NEGLIST(struct namecache *ncp)
 392 {
 393
 394         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 395 }
 396
 397 static inline struct negstate *
 398 NCP2NEGSTATE(struct namecache *ncp)
 399 {
 400
 401         MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
 402         return (&ncp->nc_neg);
 403 }
 404
/*
 * Bucket locks.  ncbuckethash is the index mask applied to a hash value by
 * HASH2BUCKETLOCK(), so the number of locks is the mask plus one.  The array
 * elements are struct mtx_padalign; the macro casts the result to struct mtx *.
 */
#define numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define HASH2BUCKETLOCK(hash) \
        ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

/*
 * Vnode locks.  ncvnodehash is likewise the index mask used by
 * VP2VNODELOCK().
 */
#define numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
 414 static inline struct mtx *
 415 VP2VNODELOCK(struct vnode *vp)
 416 {
 417
 418         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 419 }
 420
 421 static void
 422 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 423 {
 424         struct namecache_ts *ncp_ts;
 425
 426         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 427             (tsp == NULL && ticksp == NULL),
 428             ("No NCF_TS"));
 429
 430         if (tsp == NULL)
 431                 return;
 432
 433         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 434         *tsp = ncp_ts->nc_time;
 435         *ticksp = ncp_ts->nc_ticks;
 436 }
 437
/* Debug-only switch, compiled in only when DEBUG_CACHE is defined. */
#ifdef DEBUG_CACHE
static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");
 447
/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

/*
 * STATNODE_ULONG exports an already defined u_long read-only under
 * vfs.cache.stats.  STATNODE_COUNTER additionally defines the static
 * counter it exports.  Both expansions already end in a semicolon.
 *
 * NOTE(review): the sysctl names "dotdothis" and "posszaps" below look like
 * misspellings of their variable names, but they are user-visible names --
 * confirm before renaming.
 */
#define STATNODE_ULONG(name, varname, descr)                                    \
        SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)                                  \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(posszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
 482
/*
 * Debug or developer statistics.
 *
 * DEBUGNODE_ULONG / DEBUGNODE_COUNTER mirror the STATNODE_* macros but
 * place the nodes under vfs.cache.debug.
 *
 * NOTE(review): the three plain counters below are declared long yet
 * exported through SYSCTL_ULONG -- confirm whether u_long was intended.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
#define DEBUGNODE_ULONG(name, varname, descr)                                   \
        SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)                                 \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
 503
/* Forward declarations for routines defined later in the file. */
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

/* malloc(9) type for allocations made by this file. */
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 515
 516 static inline void
 517 cache_assert_vlp_locked(struct mtx *vlp)
 518 {
 519
 520         if (vlp != NULL)
 521                 mtx_assert(vlp, MA_OWNED);
 522 }
 523
 524 static inline void
 525 cache_assert_vnode_locked(struct vnode *vp)
 526 {
 527         struct mtx *vlp;
 528
 529         vlp = VP2VNODELOCK(vp);
 530         cache_assert_vlp_locked(vlp);
 531 }
 532
/*
 * Directory vnodes with entries are held for two reasons:
 * 1. make them less of a target for reclamation in vnlru
 * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
 *
 * It will be feasible to stop doing it altogether if all filesystems start
 * supporting lockless lookup.
 *
 * cache_hold_vnode() takes the hold and bumps the numcachehv counter.  The
 * caller must own vp's vnodelock and vp must not have any entries on
 * v_cache_src yet; both conditions are asserted.
 */
static void
cache_hold_vnode(struct vnode *vp)
{

        cache_assert_vnode_locked(vp);
        VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
        vhold(vp);
        counter_u64_add(numcachehv, 1);
}
 550
/*
 * Counterpart of cache_hold_vnode(): release the hold on vp and decrement
 * the numcachehv counter.
 */
static void
cache_drop_vnode(struct vnode *vp)
{

        /*
         * Called after all locks are dropped, meaning we can't assert
         * on the state of v_cache_src.
         */
        vdrop(vp);
        counter_u64_add(numcachehv, -1);
}
 562
/*
 * UMA zones.
 *
 * Entries come from one of four zones, chosen by name length (small up to
 * CACHE_PATH_CUTOFF, large otherwise) and by whether the entry carries
 * timestamps (the _ts variants); see cache_alloc_uma().
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;
 570
 571 char *
 572 cache_symlink_alloc(size_t size, int flags)
 573 {
 574
 575         if (size < CACHE_ZONE_SMALL_SIZE) {
 576                 return (uma_zalloc_smr(cache_zone_small, flags));
 577         }
 578         if (size < CACHE_ZONE_LARGE_SIZE) {
 579                 return (uma_zalloc_smr(cache_zone_large, flags));
 580         }
 581         counter_u64_add(symlinktoobig, 1);
 582         SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
 583         return (NULL);
 584 }
 585
 586 void
 587 cache_symlink_free(char *string, size_t size)
 588 {
 589
 590         MPASS(string != NULL);
 591         KASSERT(size < CACHE_ZONE_LARGE_SIZE,
 592             ("%s: size %zu too big", __func__, size));
 593
 594         if (size < CACHE_ZONE_SMALL_SIZE) {
 595                 uma_zfree_smr(cache_zone_small, string);
 596                 return;
 597         }
 598         if (size < CACHE_ZONE_LARGE_SIZE) {
 599                 uma_zfree_smr(cache_zone_large, string);
 600                 return;
 601         }
 602         __assert_unreachable();
 603 }
 604
 605 static struct namecache *
 606 cache_alloc_uma(int len, bool ts)
 607 {
 608         struct namecache_ts *ncp_ts;
 609         struct namecache *ncp;
 610
 611         if (__predict_false(ts)) {
 612                 if (len <= CACHE_PATH_CUTOFF)
 613                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 614                 else
 615                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 616                 ncp = &ncp_ts->nc_nc;
 617         } else {
 618                 if (len <= CACHE_PATH_CUTOFF)
 619                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 620                 else
 621                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 622         }
 623         return (ncp);
 624 }
 625
 626 static void
 627 cache_free_uma(struct namecache *ncp)
 628 {
 629         struct namecache_ts *ncp_ts;
 630
 631         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 632                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 633                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 634                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 635                 else
 636                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 637         } else {
 638                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 639                         uma_zfree_smr(cache_zone_small, ncp);
 640                 else
 641                         uma_zfree_smr(cache_zone_large, ncp);
 642         }
 643 }
 644
 645 static struct namecache *
 646 cache_alloc(int len, bool ts)
 647 {
 648         u_long lnumcache;
 649
 650         /*
 651          * Avoid blowout in namecache entries.
 652          *
 653          * Bugs:
 654          * 1. filesystems may end up trying to add an already existing entry
 655          * (for example this can happen after a cache miss during concurrent
 656          * lookup), in which case we will call cache_neg_evict despite not
 657          * adding anything.
 658          * 2. the routine may fail to free anything and no provisions are made
 659          * to make it try harder (see the inside for failure modes)
 660          * 3. it only ever looks at negative entries.
 661          */
 662         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 663         if (cache_neg_evict_cond(lnumcache)) {
 664                 lnumcache = atomic_load_long(&numcache);
 665         }
 666         if (__predict_false(lnumcache >= ncsize)) {
 667                 atomic_subtract_long(&numcache, 1);
 668                 counter_u64_add(numdrops, 1);
 669                 return (NULL);
 670         }
 671         return (cache_alloc_uma(len, ts));
 672 }
 673
 674 static void
 675 cache_free(struct namecache *ncp)
 676 {
 677
 678         MPASS(ncp != NULL);
 679         if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 680                 cache_drop_vnode(ncp->nc_dvp);
 681         }
 682         cache_free_uma(ncp);
 683         atomic_subtract_long(&numcache, 1);
 684 }
 685
 686 static void
 687 cache_free_batch(struct cache_freebatch *batch)
 688 {
 689         struct namecache *ncp, *nnp;
 690         int i;
 691
 692         i = 0;
 693         if (TAILQ_EMPTY(batch))
 694                 goto out;
 695         TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) {
 696                 if ((ncp->nc_flag & NCF_DVDROP) != 0) {
 697                         cache_drop_vnode(ncp->nc_dvp);
 698                 }
 699                 cache_free_uma(ncp);
 700                 i++;
 701         }
 702         atomic_subtract_long(&numcache, i);
 703 out:
 704         SDT_PROBE1(vfs, namecache, purge, batch, i);
 705 }
 706
 707 /*
 708  * TODO: With the value stored we can do better than computing the hash based
 709  * on the address. The choice of FNV should also be revisited.
 710  */
 711 static void
 712 cache_prehash(struct vnode *vp)
 713 {
 714
 715         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 716 }
 717
 718 static uint32_t
 719 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 720 {
 721
 722         return (fnv_32_buf(name, len, dvp->v_nchash));
 723 }
 724
 725 static inline struct nchashhead *
 726 NCP2BUCKET(struct namecache *ncp)
 727 {
 728         uint32_t hash;
 729
 730         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 731         return (NCHHASH(hash));
 732 }
 733
 734 static inline struct mtx *
 735 NCP2BUCKETLOCK(struct namecache *ncp)
 736 {
 737         uint32_t hash;
 738
 739         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 740         return (HASH2BUCKETLOCK(hash));
 741 }
 742
 743 #ifdef INVARIANTS
 744 static void
 745 cache_assert_bucket_locked(struct namecache *ncp)
 746 {
 747         struct mtx *blp;
 748
 749         blp = NCP2BUCKETLOCK(ncp);
 750         mtx_assert(blp, MA_OWNED);
 751 }
 752
 753 static void
 754 cache_assert_bucket_unlocked(struct namecache *ncp)
 755 {
 756         struct mtx *blp;
 757
 758         blp = NCP2BUCKETLOCK(ncp);
 759         mtx_assert(blp, MA_NOTOWNED);
 760 }
 761 #else
 762 #define cache_assert_bucket_locked(x) do { } while (0)
 763 #define cache_assert_bucket_unlocked(x) do { } while (0)
 764 #endif
 765
 766 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 767 static void
 768 _cache_sort_vnodes(void **p1, void **p2)
 769 {
 770         void *tmp;
 771
 772         MPASS(*p1 != NULL || *p2 != NULL);
 773
 774         if (*p1 > *p2) {
 775                 tmp = *p2;
 776                 *p2 = *p1;
 777                 *p1 = tmp;
 778         }
 779 }
 780
 781 static void
 782 cache_lock_all_buckets(void)
 783 {
 784         u_int i;
 785
 786         for (i = 0; i < numbucketlocks; i++)
 787                 mtx_lock(&bucketlocks[i]);
 788 }
 789
 790 static void
 791 cache_unlock_all_buckets(void)
 792 {
 793         u_int i;
 794
 795         for (i = 0; i < numbucketlocks; i++)
 796                 mtx_unlock(&bucketlocks[i]);
 797 }
 798
 799 static void
 800 cache_lock_all_vnodes(void)
 801 {
 802         u_int i;
 803
 804         for (i = 0; i < numvnodelocks; i++)
 805                 mtx_lock(&vnodelocks[i]);
 806 }
 807
 808 static void
 809 cache_unlock_all_vnodes(void)
 810 {
 811         u_int i;
 812
 813         for (i = 0; i < numvnodelocks; i++)
 814                 mtx_unlock(&vnodelocks[i]);
 815 }
 816
 817 static int
 818 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 819 {
 820
 821         cache_sort_vnodes(&vlp1, &vlp2);
 822
 823         if (vlp1 != NULL) {
 824                 if (!mtx_trylock(vlp1))
 825                         return (EAGAIN);
 826         }
 827         if (!mtx_trylock(vlp2)) {
 828                 if (vlp1 != NULL)
 829                         mtx_unlock(vlp1);
 830                 return (EAGAIN);
 831         }
 832
 833         return (0);
 834 }
 835
 836 static void
 837 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 838 {
 839
 840         MPASS(vlp1 != NULL || vlp2 != NULL);
 841         MPASS(vlp1 <= vlp2);
 842
 843         if (vlp1 != NULL)
 844                 mtx_lock(vlp1);
 845         if (vlp2 != NULL)
 846                 mtx_lock(vlp2);
 847 }
 848
 849 static void
 850 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 851 {
 852
 853         MPASS(vlp1 != NULL || vlp2 != NULL);
 854
 855         if (vlp1 != NULL)
 856                 mtx_unlock(vlp1);
 857         if (vlp2 != NULL)
 858                 mtx_unlock(vlp2);
 859 }
 860
 861 static int
 862 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 863 {
 864         struct nchstats snap;
 865
 866         if (req->oldptr == NULL)
 867                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 868
 869         snap = nchstats;
 870         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 871         snap.ncs_neghits = counter_u64_fetch(numneghits);
 872         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 873             counter_u64_fetch(numnegzaps);
 874         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 875             counter_u64_fetch(nummiss);
 876
 877         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 878 }
 879 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 880     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 881     "VFS cache effectiveness statistics");
 882
 883 static void
 884 cache_recalc_neg_min(u_int val)
 885 {
 886
 887         neg_min = (ncsize * val) / 100;
 888 }
 889
 890 static int
 891 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 892 {
 893         u_int val;
 894         int error;
 895
 896         val = ncnegminpct;
 897         error = sysctl_handle_int(oidp, &val, 0, req);
 898         if (error != 0 || req->newptr == NULL)
 899                 return (error);
 900
 901         if (val == ncnegminpct)
 902                 return (0);
 903         if (val < 0 || val > 99)
 904                 return (EINVAL);
 905         ncnegminpct = val;
 906         cache_recalc_neg_min(val);
 907         return (0);
 908 }
 909
 910 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 911     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 912     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
 913
 914 #ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 *
 * The debug.hashstat node declared here is the parent of the name cache
 * hash table statistics sysctls which follow.
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
 921
 922 static int
 923 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 924 {
 925         struct nchashhead *ncpp;
 926         struct namecache *ncp;
 927         int i, error, n_nchash, *cntbuf;
 928
 929 retry:
 930         n_nchash = nchash + 1;  /* nchash is max index, not count */
 931         if (req->oldptr == NULL)
 932                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 933         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 934         cache_lock_all_buckets();
 935         if (n_nchash != nchash + 1) {
 936                 cache_unlock_all_buckets();
 937                 free(cntbuf, M_TEMP);
 938                 goto retry;
 939         }
 940         /* Scan hash tables counting entries */
 941         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 942                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 943                         cntbuf[i]++;
 944         cache_unlock_all_buckets();
 945         for (error = 0, i = 0; i < n_nchash; i++)
 946                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 947                         break;
 948         free(cntbuf, M_TEMP);
 949         return (error);
 950 }
 951 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 952     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 953     "nchash chain lengths");
 954
 955 static int
 956 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 957 {
 958         int error;
 959         struct nchashhead *ncpp;
 960         struct namecache *ncp;
 961         int n_nchash;
 962         int count, maxlength, used, pct;
 963
 964         if (!req->oldptr)
 965                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 966
 967         cache_lock_all_buckets();
 968         n_nchash = nchash + 1;  /* nchash is max index, not count */
 969         used = 0;
 970         maxlength = 0;
 971
 972         /* Scan hash tables for applicable entries */
 973         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 974                 count = 0;
 975                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 976                         count++;
 977                 }
 978                 if (count)
 979                         used++;
 980                 if (maxlength < count)
 981                         maxlength = count;
 982         }
 983         n_nchash = nchash + 1;
 984         cache_unlock_all_buckets();
 985         pct = (used * 100) / (n_nchash / 100);
 986         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 987         if (error)
 988                 return (error);
 989         error = SYSCTL_OUT(req, &used, sizeof(used));
 990         if (error)
 991                 return (error);
 992         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 993         if (error)
 994                 return (error);
 995         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 996         if (error)
 997                 return (error);
 998         return (0);
 999 }
1000 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
1001     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
1002     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
1003 #endif
1004
1005 /*
1006  * Negative entries management
1007  *
1008  * Various workloads create plenty of negative entries and barely use them
1009  * afterwards. Moreover malicious users can keep performing bogus lookups
1010  * adding even more entries. For example "make tinderbox" as of writing this
1011  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
1012  * negative.
1013  *
1014  * As such, a rather aggressive eviction method is needed. The currently
1015  * employed method is a placeholder.
1016  *
1017  * Entries are split over numneglists separate lists, each of which is further
1018  * split into hot and cold entries. Entries get promoted after getting a hit.
1019  * Eviction happens on addition of new entry.
1020  */
/* Parent node (vfs.cache.neg) for the negative entry statistics below. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

/* Exports numneg, which is adjusted on negative entry insertion and removal. */
SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

/* Bumped by cache_neg_init(). */
static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

/* Bumped by cache_neg_evict() after it zapped an entry. */
static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

/* Bumped by cache_neg_evict() when the selected list had no candidate. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

/* Bumped by cache_neg_evict() when the candidate was not found on re-lookup. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

/* Bumped when trylocking the eviction lock of the selected list fails. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

/* numneghits is defined elsewhere; it is bumped by cache_neg_hit_finish(). */
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");
1052
1053 static int
1054 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
1055 {
1056         int i, out;
1057
1058         out = 0;
1059         for (i = 0; i < numneglists; i++)
1060                 out += neglists[i].nl_hotnum;
1061
1062         return (SYSCTL_OUT(req, &out, sizeof(out)));
1063 }
1064 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
1065     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
1066     "Number of hot negative entries");
1067
1068 static void
1069 cache_neg_init(struct namecache *ncp)
1070 {
1071         struct negstate *ns;
1072
1073         ncp->nc_flag |= NCF_NEGATIVE;
1074         ns = NCP2NEGSTATE(ncp);
1075         ns->neg_flag = 0;
1076         ns->neg_hit = 0;
1077         counter_u64_add(neg_created, 1);
1078 }
1079
1080 #define CACHE_NEG_PROMOTION_THRESH 2
1081
1082 static bool
1083 cache_neg_hit_prep(struct namecache *ncp)
1084 {
1085         struct negstate *ns;
1086         u_char n;
1087
1088         ns = NCP2NEGSTATE(ncp);
1089         n = atomic_load_char(&ns->neg_hit);
1090         for (;;) {
1091                 if (n >= CACHE_NEG_PROMOTION_THRESH)
1092                         return (false);
1093                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
1094                         break;
1095         }
1096         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
1097 }
1098
/*
 * Counterpart of cache_neg_hit_prep for callers which end up returning
 * without even trying to promote the entry.
 *
 * There is nothing to undo, so this expands to an empty statement; it is
 * provided for completeness only.
 */
#define cache_neg_hit_abort(ncp)        do { } while (0)
1105
1106 static void
1107 cache_neg_hit_finish(struct namecache *ncp)
1108 {
1109
1110         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
1111         counter_u64_add(numneghits, 1);
1112 }
1113
1114 /*
1115  * Move a negative entry to the hot list.
1116  */
1117 static void
1118 cache_neg_promote_locked(struct namecache *ncp)
1119 {
1120         struct neglist *nl;
1121         struct negstate *ns;
1122
1123         ns = NCP2NEGSTATE(ncp);
1124         nl = NCP2NEGLIST(ncp);
1125         mtx_assert(&nl->nl_lock, MA_OWNED);
1126         if ((ns->neg_flag & NEG_HOT) == 0) {
1127                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1128                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
1129                 nl->nl_hotnum++;
1130                 ns->neg_flag |= NEG_HOT;
1131         }
1132 }
1133
1134 /*
1135  * Move a hot negative entry to the cold list.
1136  */
1137 static void
1138 cache_neg_demote_locked(struct namecache *ncp)
1139 {
1140         struct neglist *nl;
1141         struct negstate *ns;
1142
1143         ns = NCP2NEGSTATE(ncp);
1144         nl = NCP2NEGLIST(ncp);
1145         mtx_assert(&nl->nl_lock, MA_OWNED);
1146         MPASS(ns->neg_flag & NEG_HOT);
1147         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1148         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1149         nl->nl_hotnum--;
1150         ns->neg_flag &= ~NEG_HOT;
1151         atomic_store_char(&ns->neg_hit, 0);
1152 }
1153
/*
 * Move a negative entry to the hot list if it matches the lookup.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Exiting the section means
 * the found entry could have been evicted. We are going to look it
 * up again.
 *
 * dvp and cnp describe the lookup, oncp is the entry which was found
 * earlier and hash selects the chain which is searched for it.
 *
 * The function takes the neglist lock of oncp and enters the smr section
 * on its own; both are released before returning.
 *
 * Returns true if oncp is still on the chain, still matches the lookup, is
 * still negative and passes cache_ncp_canuse(), in which case it was
 * promoted and the hit was accounted for. Returns false otherwise, without
 * promoting anything.
 */
static bool
cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
    struct namecache *oncp, uint32_t hash)
{
        struct namecache *ncp;
        struct neglist *nl;
        u_char nc_flag;

        nl = NCP2NEGLIST(oncp);

        /* The lock is taken first, outside of the smr section. */
        mtx_lock(&nl->nl_lock);
        /*
         * For hash iteration.
         */
        vfs_smr_enter();

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         * XXX There are no provisions to keep the vnode around, meaning we may
         * end up promoting a negative entry for a *new* vnode and returning
         * ENOENT on its account. This is the error we want to return anyway
         * and promotion is harmless.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (!cache_ncp_canuse(ncp)) {
                goto out_abort;
        }

        /* Same entry, still valid: promote it and count the hit. */
        cache_neg_promote_locked(ncp);
        cache_neg_hit_finish(ncp);
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (true);
out_abort:
        vfs_smr_exit();
        mtx_unlock(&nl->nl_lock);
        return (false);
}
1232
1233 static void
1234 cache_neg_promote(struct namecache *ncp)
1235 {
1236         struct neglist *nl;
1237
1238         nl = NCP2NEGLIST(ncp);
1239         mtx_lock(&nl->nl_lock);
1240         cache_neg_promote_locked(ncp);
1241         mtx_unlock(&nl->nl_lock);
1242 }
1243
1244 static void
1245 cache_neg_insert(struct namecache *ncp)
1246 {
1247         struct neglist *nl;
1248
1249         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1250         cache_assert_bucket_locked(ncp);
1251         nl = NCP2NEGLIST(ncp);
1252         mtx_lock(&nl->nl_lock);
1253         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1254         mtx_unlock(&nl->nl_lock);
1255         atomic_add_long(&numneg, 1);
1256 }
1257
1258 static void
1259 cache_neg_remove(struct namecache *ncp)
1260 {
1261         struct neglist *nl;
1262         struct negstate *ns;
1263
1264         cache_assert_bucket_locked(ncp);
1265         nl = NCP2NEGLIST(ncp);
1266         ns = NCP2NEGSTATE(ncp);
1267         mtx_lock(&nl->nl_lock);
1268         if ((ns->neg_flag & NEG_HOT) != 0) {
1269                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1270                 nl->nl_hotnum--;
1271         } else {
1272                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1273         }
1274         mtx_unlock(&nl->nl_lock);
1275         atomic_subtract_long(&numneg, 1);
1276 }
1277
1278 static struct neglist *
1279 cache_neg_evict_select_list(void)
1280 {
1281         struct neglist *nl;
1282         u_int c;
1283
1284         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1285         nl = &neglists[c % numneglists];
1286         if (!mtx_trylock(&nl->nl_evict_lock)) {
1287                 counter_u64_add(neg_evict_skipped_contended, 1);
1288                 return (NULL);
1289         }
1290         return (nl);
1291 }
1292
1293 static struct namecache *
1294 cache_neg_evict_select_entry(struct neglist *nl)
1295 {
1296         struct namecache *ncp, *lncp;
1297         struct negstate *ns, *lns;
1298         int i;
1299
1300         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1301         mtx_assert(&nl->nl_lock, MA_OWNED);
1302         ncp = TAILQ_FIRST(&nl->nl_list);
1303         if (ncp == NULL)
1304                 return (NULL);
1305         lncp = ncp;
1306         lns = NCP2NEGSTATE(lncp);
1307         for (i = 1; i < 4; i++) {
1308                 ncp = TAILQ_NEXT(ncp, nc_dst);
1309                 if (ncp == NULL)
1310                         break;
1311                 ns = NCP2NEGSTATE(ncp);
1312                 if (ns->neg_hit < lns->neg_hit) {
1313                         lncp = ncp;
1314                         lns = ns;
1315                 }
1316         }
1317         return (lncp);
1318 }
1319
1320 static bool
1321 cache_neg_evict(void)
1322 {
1323         struct namecache *ncp, *ncp2;
1324         struct neglist *nl;
1325         struct vnode *dvp;
1326         struct mtx *dvlp;
1327         struct mtx *blp;
1328         uint32_t hash;
1329         u_char nlen;
1330         bool evicted;
1331
1332         nl = cache_neg_evict_select_list();
1333         if (nl == NULL) {
1334                 return (false);
1335         }
1336
1337         mtx_lock(&nl->nl_lock);
1338         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1339         if (ncp != NULL) {
1340                 cache_neg_demote_locked(ncp);
1341         }
1342         ncp = cache_neg_evict_select_entry(nl);
1343         if (ncp == NULL) {
1344                 counter_u64_add(neg_evict_skipped_empty, 1);
1345                 mtx_unlock(&nl->nl_lock);
1346                 mtx_unlock(&nl->nl_evict_lock);
1347                 return (false);
1348         }
1349         nlen = ncp->nc_nlen;
1350         dvp = ncp->nc_dvp;
1351         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1352         dvlp = VP2VNODELOCK(dvp);
1353         blp = HASH2BUCKETLOCK(hash);
1354         mtx_unlock(&nl->nl_lock);
1355         mtx_unlock(&nl->nl_evict_lock);
1356         mtx_lock(dvlp);
1357         mtx_lock(blp);
1358         /*
1359          * Note that since all locks were dropped above, the entry may be
1360          * gone or reallocated to be something else.
1361          */
1362         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1363                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1364                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1365                         break;
1366         }
1367         if (ncp2 == NULL) {
1368                 counter_u64_add(neg_evict_skipped_missed, 1);
1369                 ncp = NULL;
1370                 evicted = false;
1371         } else {
1372                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1373                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1374                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1375                     ncp->nc_name);
1376                 cache_zap_locked(ncp);
1377                 counter_u64_add(neg_evicted, 1);
1378                 evicted = true;
1379         }
1380         mtx_unlock(blp);
1381         mtx_unlock(dvlp);
1382         if (ncp != NULL)
1383                 cache_free(ncp);
1384         return (evicted);
1385 }
1386
1387 /*
1388  * Maybe evict a negative entry to create more room.
1389  *
1390  * The ncnegfactor parameter limits what fraction of the total count
1391  * can comprise of negative entries. However, if the cache is just
1392  * warming up this leads to excessive evictions.  As such, ncnegminpct
1393  * (recomputed to neg_min) dictates whether the above should be
1394  * applied.
1395  *
1396  * Try evicting if the cache is close to full capacity regardless of
1397  * other considerations.
1398  */
1399 static bool
1400 cache_neg_evict_cond(u_long lnumcache)
1401 {
1402         u_long lnumneg;
1403
1404         if (ncsize - 1000 < lnumcache)
1405                 goto out_evict;
1406         lnumneg = atomic_load_long(&numneg);
1407         if (lnumneg < neg_min)
1408                 return (false);
1409         if (lnumneg * ncnegfactor < lnumcache)
1410                 return (false);
1411 out_evict:
1412         return (cache_neg_evict());
1413 }
1414
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller has to hold the bucket lock of the entry, the vnode lock of
 *   the directory and, for positive entries, the vnode lock of the target.
 *
 *   The entry is only unlinked here, it is not freed; callers in this file
 *   follow up with cache_free() after dropping their locks.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
        struct nchashhead *ncpp;
        struct vnode *dvp, *vp;

        dvp = ncp->nc_dvp;
        vp = ncp->nc_vp;

        /* vp is only asserted on (and used below) for positive entries. */
        if (!(ncp->nc_flag & NCF_NEGATIVE))
                cache_assert_vnode_locked(vp);
        cache_assert_vnode_locked(dvp);
        cache_assert_bucket_locked(ncp);

        /* Invalidation happens before the entry is taken off the chain. */
        cache_ncp_invalidate(ncp);

        ncpp = NCP2BUCKET(ncp);
        CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
        if (!(ncp->nc_flag & NCF_NEGATIVE)) {
                SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
                TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
                /* Do not leave a dangling ".." shortcut in the target. */
                if (ncp == vp->v_cache_dd) {
                        atomic_store_ptr(&vp->v_cache_dd, NULL);
                }
        } else {
                SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
                /* Takes the neglist lock internally. */
                cache_neg_remove(ncp);
        }
        if (ncp->nc_flag & NCF_ISDOTDOT) {
                /* ".." entries are not linked on v_cache_src. */
                if (ncp == dvp->v_cache_dd) {
                        atomic_store_ptr(&dvp->v_cache_dd, NULL);
                }
        } else {
                LIST_REMOVE(ncp, nc_src);
                /*
                 * This was the last entry originating from dvp.
                 * NOTE(review): NCF_DVDROP is presumably acted upon when the
                 * entry is freed -- confirm against cache_free().
                 */
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        ncp->nc_flag |= NCF_DVDROP;
                }
        }
}
1460
1461 static void
1462 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1463 {
1464         struct mtx *blp;
1465
1466         MPASS(ncp->nc_dvp == vp);
1467         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1468         cache_assert_vnode_locked(vp);
1469
1470         blp = NCP2BUCKETLOCK(ncp);
1471         mtx_lock(blp);
1472         cache_zap_locked(ncp);
1473         mtx_unlock(blp);
1474 }
1475
/*
 * Zap an entry while holding the vnode lock of vp, which is either the
 * directory or the target of the entry.  The lock of vp remains held on
 * return.
 *
 * *vlpp is either NULL or an additional vnode lock held by the caller.
 *
 * Returns true if the entry was zapped; *vlpp is NULL then and the
 * additional lock, if any, has been released.
 *
 * Returns false if the second vnode lock could not be taken without
 * violating the lock order.  In that case the locks were reacquired in the
 * sorted order, *vlpp is set to the additional lock now held and nothing was
 * zapped.  Since the lock of vp was dropped in the process, the caller has
 * to revalidate its state and try again.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        /* Negative entries only need the lock of vp, which is already held. */
        if (ncp->nc_flag & NCF_NEGATIVE) {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                return (true);
        }

        pvlp = VP2VNODELOCK(vp);
        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);

        if (*vlpp == vlp1 || *vlpp == vlp2) {
                /* The extra lock handed in by the caller is one we need. */
                to_unlock = *vlpp;
                *vlpp = NULL;
        } else {
                /* The extra lock, if any, is of no use; get rid of it. */
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_sort_vnodes(&vlp1, &vlp2);
                if (vlp1 == pvlp) {
                        /* The held lock sorts first, blocking is fine. */
                        mtx_lock(vlp2);
                        to_unlock = vlp2;
                } else {
                        /*
                         * The missing lock sorts before the one we hold, so
                         * it can only be tried.
                         */
                        if (!mtx_trylock(vlp1))
                                goto out_relock;
                        to_unlock = vlp1;
                }
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        if (to_unlock != NULL)
                mtx_unlock(to_unlock);
        return (true);

out_relock:
        /* Drop the held lock and take both in the sorted order. */
        mtx_unlock(vlp2);
        mtx_lock(vlp1);
        mtx_lock(vlp2);
        MPASS(*vlpp == NULL);
        *vlpp = vlp1;
        return (false);
}
1533
1534 /*
1535  * If trylocking failed we can get here. We know enough to take all needed locks
1536  * in the right order and re-lookup the entry.
1537  */
1538 static int
1539 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1540     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1541     struct mtx *blp)
1542 {
1543         struct namecache *rncp;
1544
1545         cache_assert_bucket_unlocked(ncp);
1546
1547         cache_sort_vnodes(&dvlp, &vlp);
1548         cache_lock_vnodes(dvlp, vlp);
1549         mtx_lock(blp);
1550         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1551                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1552                     rncp->nc_nlen == cnp->cn_namelen &&
1553                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1554                         break;
1555         }
1556         if (rncp != NULL) {
1557                 cache_zap_locked(rncp);
1558                 mtx_unlock(blp);
1559                 cache_unlock_vnodes(dvlp, vlp);
1560                 counter_u64_add(zap_bucket_relock_success, 1);
1561                 return (0);
1562         }
1563
1564         mtx_unlock(blp);
1565         cache_unlock_vnodes(dvlp, vlp);
1566         return (EAGAIN);
1567 }
1568
1569 static int __noinline
1570 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1571     uint32_t hash, struct mtx *blp)
1572 {
1573         struct mtx *dvlp, *vlp;
1574         struct vnode *dvp;
1575
1576         cache_assert_bucket_locked(ncp);
1577
1578         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1579         vlp = NULL;
1580         if (!(ncp->nc_flag & NCF_NEGATIVE))
1581                 vlp = VP2VNODELOCK(ncp->nc_vp);
1582         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1583                 cache_zap_locked(ncp);
1584                 mtx_unlock(blp);
1585                 cache_unlock_vnodes(dvlp, vlp);
1586                 return (0);
1587         }
1588
1589         dvp = ncp->nc_dvp;
1590         mtx_unlock(blp);
1591         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1592 }
1593
/*
 * Remove the entry for the name in cnp from the directory dvp, if there is
 * one.
 *
 * ".." is special-cased: it is not looked up in the hash but found through
 * dvp->v_cache_dd.
 *
 * Returns 1 if an entry was found and removed (or, for "..", the v_cache_dd
 * shortcut was cleared) and 0 if there was nothing to remove.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        struct mtx *dvlp, *dvlp2;
        uint32_t hash;
        int error;

        if (cnp->cn_namelen == 2 &&
            cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
                dvlp = VP2VNODELOCK(dvp);
                /* Extra lock cache_zap_locked_vnode_kl2 may hand back. */
                dvlp2 = NULL;
                mtx_lock(dvlp);
retry_dotdot:
                ncp = dvp->v_cache_dd;
                if (ncp == NULL) {
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
                        return (0);
                }
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                        /*
                         * A failure means locks were reshuffled, so
                         * v_cache_dd has to be read again.
                         */
                        if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
                                goto retry_dotdot;
                        MPASS(dvp->v_cache_dd == NULL);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        cache_free(ncp);
                } else {
                        /*
                         * v_cache_dd points at an entry which is not a ".."
                         * one; only the shortcut is cleared, the entry
                         * itself stays.
                         */
                        atomic_store_ptr(&dvp->v_cache_dd, NULL);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                }
                SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
                return (1);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        blp = HASH2BUCKETLOCK(hash);
retry:
        /* Check for an empty chain before taking the bucket lock. */
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                goto out_no_entry;

        mtx_lock(blp);

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (ncp == NULL) {
                mtx_unlock(blp);
                goto out_no_entry;
        }

        /* The bucket lock is released by the callee in all cases. */
        error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
        if (__predict_false(error != 0)) {
                zap_bucket_fail++;
                goto retry;
        }
        counter_u64_add(numposzaps, 1);
        SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
        cache_free(ncp);
        return (1);
out_no_entry:
        counter_u64_add(nummisszap, 1);
        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
        return (0);
}
1668
1669 static int __noinline
1670 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1671     struct timespec *tsp, int *ticksp)
1672 {
1673         int ltype;
1674
1675         *vpp = dvp;
1676         counter_u64_add(dothits, 1);
1677         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1678         if (tsp != NULL)
1679                 timespecclear(tsp);
1680         if (ticksp != NULL)
1681                 *ticksp = ticks;
1682         vrefact(*vpp);
1683         /*
1684          * When we lookup "." we still can be asked to lock it
1685          * differently...
1686          */
1687         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1688         if (ltype != VOP_ISLOCKED(*vpp)) {
1689                 if (ltype == LK_EXCLUSIVE) {
1690                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1691                         if (VN_IS_DOOMED((*vpp))) {
1692                                 /* forced unmount */
1693                                 vrele(*vpp);
1694                                 *vpp = NULL;
1695                                 return (ENOENT);
1696                         }
1697                 } else
1698                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1699         }
1700         return (-1);
1701 }
1702
/*
 * Handle a lookup of ".." in dvp.
 *
 * Without MAKEENTRY in cnp->cn_flags nothing is looked up: cache_remove_cnp()
 * is called and 0 is returned.  Otherwise the entry is read from
 * dvp->v_cache_dd while holding the mutex returned by VP2VNODELOCK(dvp).
 *
 * Returns:
 *   0       v_cache_dd is empty, or a negative entry was zapped because this
 *           is the last component of a CREATE.
 *   -1      positive hit; *vpp was obtained through vget_prep() and
 *           vget_finish() using cnp->cn_lkflags.
 *   ENOENT  negative hit (ISWHITEOUT is set in cnp->cn_flags when the entry
 *           carries NCF_WHITE), or dvp was found doomed after being relocked;
 *           *vpp is NULL in both cases.
 */
1703 static int __noinline
1704 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1705     struct timespec *tsp, int *ticksp)
1706 {
1707         struct namecache_ts *ncp_ts;
1708         struct namecache *ncp;
1709         struct mtx *dvlp;
1710         enum vgetstate vs;
1711         int error, ltype;
1712         bool whiteout;
1713
1714         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1715
1716         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1717                 cache_remove_cnp(dvp, cnp);
1718                 return (0);
1719         }
1720
1721         counter_u64_add(dotdothits, 1);
1722 retry:
1723         dvlp = VP2VNODELOCK(dvp);
1724         mtx_lock(dvlp);
1725         ncp = dvp->v_cache_dd;
1726         if (ncp == NULL) {
1727                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1728                 mtx_unlock(dvlp);
1729                 return (0);
1730         }
             /*
              * An NCF_ISDOTDOT entry names the result in nc_vp (or is
              * negative); any other entry found through v_cache_dd is used
              * via its nc_dvp.
              */
1731         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1732                 if (ncp->nc_flag & NCF_NEGATIVE)
1733                         *vpp = NULL;
1734                 else
1735                         *vpp = ncp->nc_vp;
1736         } else
1737                 *vpp = ncp->nc_dvp;
1738         if (*vpp == NULL)
1739                 goto negative_success;
1740         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1741         cache_out_ts(ncp, tsp, ticksp);
             /*
              * For an entry that is not NCF_ISDOTDOT but has NCF_DTS set,
              * report nc_dotdottime in *tsp instead.
              */
1742         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1743             NCF_DTS && tsp != NULL) {
1744                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1745                 *tsp = ncp_ts->nc_dotdottime;
1746         }
1747
1748         MPASS(dvp != *vpp);
             /*
              * dvp is unlocked before the target is locked and relocked
              * afterwards with the lock type it was held with on entry;
              * hence the VN_IS_DOOMED() check once it is locked again.
              */
1749         ltype = VOP_ISLOCKED(dvp);
1750         VOP_UNLOCK(dvp);
1751         vs = vget_prep(*vpp);
1752         mtx_unlock(dvlp);
1753         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1754         vn_lock(dvp, ltype | LK_RETRY);
1755         if (VN_IS_DOOMED(dvp)) {
                     /* Release the target only if vget_finish() succeeded. */
1756                 if (error == 0)
1757                         vput(*vpp);
1758                 *vpp = NULL;
1759                 return (ENOENT);
1760         }
1761         if (error) {
                     /* vget_finish() failed: start the lookup over. */
1762                 *vpp = NULL;
1763                 goto retry;
1764         }
1765         return (-1);
1766 negative_success:
             /*
              * CREATE of the last component: zap the negative entry and
              * report a miss instead of a negative hit.
              */
1767         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1768                 if (cnp->cn_flags & ISLASTCN) {
1769                         counter_u64_add(numnegzaps, 1);
1770                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1771                         mtx_unlock(dvlp);
1772                         cache_free(ncp);
1773                         return (0);
1774                 }
1775         }
1776
             /* Sample NCF_WHITE while dvlp is still held. */
1777         whiteout = (ncp->nc_flag & NCF_WHITE);
1778         cache_out_ts(ncp, tsp, ticksp);
1779         if (cache_neg_hit_prep(ncp))
1780                 cache_neg_promote(ncp);
1781         else
1782                 cache_neg_hit_finish(ncp);
1783         mtx_unlock(dvlp);
1784         if (whiteout)
1785                 cnp->cn_flags |= ISWHITEOUT;
1786         return (ENOENT);
1787 }
1788
1789 /**
1790  * Lookup a name in the name cache
1791  *
1792  * # Arguments
1793  *
1794  * - dvp:       Parent directory in which to search.
1795  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1796  * - cnp:       Parameters of the name search.  The most interesting bits of
1797  *              the cn_flags field have the following meanings:
1798  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1799  *                      it up.
1800  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1801  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1802  *              or negative) lookup, tsp will be filled with any timespec that
1803  *              was stored when this cache entry was created.  However, it will
1804  *              be clear for "." entries.
1805  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
1806  *              (positive or negative) lookup, it will contain the ticks value
1807  *              that was current when the cache entry was created; for "."
1808  *              it receives the current ticks value instead.
1809  *
1810  * Either both tsp and ticksp have to be provided or neither of them.
1811  *
1812  * # Returns
1813  *
1814  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1815  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1816  *              to a forced unmount.  No vnode is returned; *vpp may have
1817  *              been set to NULL.  If the entry is a whiteout, then the
1818  *              ISWHITEOUT flag will be set in cnp->cn_flags.
1819  * - 0:         A cache miss.  No vnode is returned; *vpp may be set to NULL.
1820  *
1821  * # Locking
1822  *
1823  * On a cache hit, vpp will be returned locked and ref'd.  For .., dvp is
1824  * unlocked while vpp is locked and relocked before returning.  For . an
1825  * extra ref is taken, but the lock is not recursively acquired.
1826  */
/*
 * Locked variant of the regular (not "." or "..") lookup.
 *
 * Performs the same hash chain search as cache_lookup(), but while holding
 * the bucket lock for the hash rather than inside an SMR section.
 * cache_lookup() ends up here through its out_fallback label.
 *
 * Returns 0 on a miss (or after zapping a negative entry for a CREATE of
 * the last component), -1 on a positive hit with *vpp obtained through
 * vget_finish(), and ENOENT on a negative hit.
 */
1827 static int __noinline
1828 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1829     struct timespec *tsp, int *ticksp)
1830 {
1831         struct namecache *ncp;
1832         struct mtx *blp;
1833         uint32_t hash;
1834         enum vgetstate vs;
1835         int error;
1836         bool whiteout;
1837
1838         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1839         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1840
1841 retry:
1842         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1843         blp = HASH2BUCKETLOCK(hash);
1844         mtx_lock(blp);
1845
             /* Match on parent directory, name length and name bytes. */
1846         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1847                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1848                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1849                         break;
1850         }
1851
1852         if (__predict_false(ncp == NULL)) {
1853                 mtx_unlock(blp);
1854                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1855                     NULL);
1856                 counter_u64_add(nummiss, 1);
1857                 return (0);
1858         }
1859
1860         if (ncp->nc_flag & NCF_NEGATIVE)
1861                 goto negative_success;
1862
1863         counter_u64_add(numposhits, 1);
1864         *vpp = ncp->nc_vp;
1865         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1866         cache_out_ts(ncp, tsp, ticksp);
1867         MPASS(dvp != *vpp);
             /*
              * vget_prep() runs under the bucket lock; vget_finish() runs
              * after the lock has been dropped.
              */
1868         vs = vget_prep(*vpp);
1869         mtx_unlock(blp);
1870         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1871         if (error) {
                     /* Could not get the vnode: redo the whole lookup. */
1872                 *vpp = NULL;
1873                 goto retry;
1874         }
1875         return (-1);
1876 negative_success:
1877         /*
1878          * We don't get here with regular lookup apart from corner cases.
1879          */
1880         if (__predict_true(cnp->cn_nameiop == CREATE)) {
1881                 if (cnp->cn_flags & ISLASTCN) {
1882                         counter_u64_add(numnegzaps, 1);
1883                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
                             /*
                              * On failure the lookup is restarted from the
                              * retry label, which takes the bucket lock anew.
                              * NOTE(review): presumably
                              * cache_zap_locked_bucket() releases blp on both
                              * outcomes -- confirm against its definition.
                              */
1884                         if (__predict_false(error != 0)) {
1885                                 zap_bucket_fail2++;
1886                                 goto retry;
1887                         }
1888                         cache_free(ncp);
1889                         return (0);
1890                 }
1891         }
1892
1893         whiteout = (ncp->nc_flag & NCF_WHITE);
1894         cache_out_ts(ncp, tsp, ticksp);
1895         if (cache_neg_hit_prep(ncp))
1896                 cache_neg_promote(ncp);
1897         else
1898                 cache_neg_hit_finish(ncp);
1899         mtx_unlock(blp);
1900         if (whiteout)
1901                 cnp->cn_flags |= ISWHITEOUT;
1902         return (ENOENT);
1903 }
1904
/*
 * Look up a name in the name cache.
 *
 * "." and ".." are dispatched to cache_lookup_dot() and
 * cache_lookup_dotdot().  If neither MAKEENTRY nor NC_KEEPPOSENTRY is set
 * in cnp->cn_flags, cache_remove_cnp() is called and 0 is returned.
 * Anything else is first searched for inside an SMR section without taking
 * the bucket lock; whenever that attempt cannot be completed the lookup is
 * redone by cache_lookup_fallback().
 *
 * tsp and ticksp must be both NULL or both non-NULL (asserted below).
 *
 * Returns -1 on a positive hit, ENOENT on a negative hit and 0 on a miss.
 */
1905 int
1906 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1907     struct timespec *tsp, int *ticksp)
1908 {
1909         struct namecache *ncp;
1910         uint32_t hash;
1911         enum vgetstate vs;
1912         int error;
1913         bool whiteout, neg_promote;
1914         u_short nc_flag;
1915
1916         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1917
1918 #ifdef DEBUG_CACHE
1919         if (__predict_false(!doingcache)) {
1920                 cnp->cn_flags &= ~MAKEENTRY;
1921                 return (0);
1922         }
1923 #endif
1924
1925         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1926                 if (cnp->cn_namelen == 1)
1927                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1928                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1929                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1930         }
1931
1932         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1933
1934         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1935                 cache_remove_cnp(dvp, cnp);
1936                 return (0);
1937         }
1938
1939         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
             /* From here until vfs_smr_exit() no bucket lock is held. */
1940         vfs_smr_enter();
1941
1942         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1943                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1944                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1945                         break;
1946         }
1947
1948         if (__predict_false(ncp == NULL)) {
1949                 vfs_smr_exit();
1950                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1951                     NULL);
1952                 counter_u64_add(nummiss, 1);
1953                 return (0);
1954         }
1955
1956         nc_flag = atomic_load_char(&ncp->nc_flag);
1957         if (nc_flag & NCF_NEGATIVE)
1958                 goto negative_success;
1959
1960         counter_u64_add(numposhits, 1);
1961         *vpp = ncp->nc_vp;
1962         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1963         cache_out_ts(ncp, tsp, ticksp);
1964         MPASS(dvp != *vpp);
             /*
              * The fields were read without a lock; if cache_ncp_canuse()
              * rejects the entry, discard them and use the locked path.
              */
1965         if (!cache_ncp_canuse(ncp)) {
1966                 vfs_smr_exit();
1967                 *vpp = NULL;
1968                 goto out_fallback;
1969         }
1970         vs = vget_prep_smr(*vpp);
1971         vfs_smr_exit();
             /*
              * NOTE(review): VGET_NONE presumably means no reference could
              * be taken inside the SMR section -- confirm with
              * vget_prep_smr().
              */
1972         if (__predict_false(vs == VGET_NONE)) {
1973                 *vpp = NULL;
1974                 goto out_fallback;
1975         }
1976         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1977         if (error) {
1978                 *vpp = NULL;
1979                 goto out_fallback;
1980         }
1981         return (-1);
1982 negative_success:
             /*
              * CREATE of the last component is left to the fallback, which
              * zaps the negative entry under the bucket lock.
              */
1983         if (cnp->cn_nameiop == CREATE) {
1984                 if (cnp->cn_flags & ISLASTCN) {
1985                         vfs_smr_exit();
1986                         goto out_fallback;
1987                 }
1988         }
1989
1990         cache_out_ts(ncp, tsp, ticksp);
1991         whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
1992         neg_promote = cache_neg_hit_prep(ncp);
             /* Undo the hit preparation if the entry turns out unusable. */
1993         if (!cache_ncp_canuse(ncp)) {
1994                 cache_neg_hit_abort(ncp);
1995                 vfs_smr_exit();
1996                 goto out_fallback;
1997         }
1998         if (neg_promote) {
                     /*
                      * Promotion happens outside the SMR section, so
                      * cache_neg_promote_cond() is given dvp, cnp and hash
                      * in addition to ncp; a false return sends us to the
                      * fallback.
                      */
1999                 vfs_smr_exit();
2000                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
2001                         goto out_fallback;
2002         } else {
2003                 cache_neg_hit_finish(ncp);
2004                 vfs_smr_exit();
2005         }
2006         if (whiteout)
2007                 cnp->cn_flags |= ISWHITEOUT;
2008         return (ENOENT);
2009 out_fallback:
2010         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
2011 }
2012
/*
 * Locks held while entering a name into the cache: up to three vnode locks
 * (vlp) and up to two bucket locks (blp).  A NULL slot means no lock is
 * recorded there.
 */
2013 struct celockstate {
2014         struct mtx *vlp[3];
2015         struct mtx *blp[2];
2016 };
     /* The locking helpers index these arrays with constants; pin the sizes. */
2017 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
2018 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
2019
/*
 * Reset a celockstate by zeroing it, so that no lock is recorded in any
 * vlp[] or blp[] slot.
 */
2020 static inline void
2021 cache_celockstate_init(struct celockstate *cel)
2022 {
2023
2024         bzero(cel, sizeof(*cel));
2025 }
2026
/*
 * Take the vnode locks for a pair of vnodes and record them in cel->vlp[0]
 * and cel->vlp[1].  At least one of the vnodes must be non-NULL and no vnode
 * lock may be recorded in cel yet (both asserted).
 *
 * The two vnodes are treated symmetrically: the locks are ordered with
 * cache_sort_vnodes() before being taken, and the callers in this file pass
 * dvp first.  After sorting, the first lock may be NULL, in which case only
 * the second one is taken and vlp[0] stays NULL.
 * NOTE(review): this relies on VP2VNODELOCK() yielding NULL for a NULL vnode
 * and on cache_sort_vnodes() placing NULL/the lower address first -- confirm
 * against their definitions.
 */
2027 static void
2028 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
2029     struct vnode *dvp)
2030 {
2031         struct mtx *vlp1, *vlp2;
2032
2033         MPASS(cel->vlp[0] == NULL);
2034         MPASS(cel->vlp[1] == NULL);
2035         MPASS(cel->vlp[2] == NULL);
2036
2037         MPASS(vp != NULL || dvp != NULL);
2038
2039         vlp1 = VP2VNODELOCK(vp);
2040         vlp2 = VP2VNODELOCK(dvp);
2041         cache_sort_vnodes(&vlp1, &vlp2);
2042
2043         if (vlp1 != NULL) {
2044                 mtx_lock(vlp1);
2045                 cel->vlp[0] = vlp1;
2046         }
2047         mtx_lock(vlp2);
2048         cel->vlp[1] = vlp2;
2049 }
2050
/*
 * Release every vnode lock recorded in cel->vlp[].  At least one of the
 * first two slots must be set (asserted).  The slots themselves are not
 * cleared here; callers that go on to re-lock reset them on their own.
 */
2051 static void
2052 cache_unlock_vnodes_cel(struct celockstate *cel)
2053 {
2054
2055         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
2056
2057         if (cel->vlp[0] != NULL)
2058                 mtx_unlock(cel->vlp[0]);
2059         if (cel->vlp[1] != NULL)
2060                 mtx_unlock(cel->vlp[1]);
2061         if (cel->vlp[2] != NULL)
2062                 mtx_unlock(cel->vlp[2]);
2063 }
2064
/*
 * Take the vnode lock for vp as a third lock, on top of the ones already
 * recorded in cel->vlp[0] and cel->vlp[1], and record it in cel->vlp[2].
 *
 * If the new lock's address is not below vlp[1] it is simply locked.
 * Otherwise a trylock is attempted; when that fails, all held vnode locks
 * are released and the three locks are taken again in ascending address
 * order.
 *
 * Returns true if the previously held locks stayed held throughout, false
 * if they were dropped and re-taken.  In the latter case the caller has to
 * revalidate whatever it read under them.
 */
2065 static bool
2066 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
2067 {
2068         struct mtx *vlp;
2069         bool ret;
2070
2071         cache_assert_vlp_locked(cel->vlp[0]);
2072         cache_assert_vlp_locked(cel->vlp[1]);
2073         MPASS(cel->vlp[2] == NULL);
2074
2075         MPASS(vp != NULL);
2076         vlp = VP2VNODELOCK(vp);
2077
2078         ret = true;
2079         if (vlp >= cel->vlp[1]) {
2080                 mtx_lock(vlp);
2081         } else {
2082                 if (mtx_trylock(vlp))
2083                         goto out;
2084                 cache_lock_vnodes_cel_3_failures++;
                     /*
                      * vlp[2] is still NULL at this point, so only the first
                      * two locks are released.
                      */
2085                 cache_unlock_vnodes_cel(cel);
2086                 if (vlp < cel->vlp[0]) {
2087                         mtx_lock(vlp);
2088                         mtx_lock(cel->vlp[0]);
2089                         mtx_lock(cel->vlp[1]);
2090                 } else {
2091                         if (cel->vlp[0] != NULL)
2092                                 mtx_lock(cel->vlp[0]);
2093                         mtx_lock(vlp);
2094                         mtx_lock(cel->vlp[1]);
2095                 }
2096                 ret = false;
2097         }
2098 out:
2099         cel->vlp[2] = vlp;
2100         return (ret);
2101 }
2102
/*
 * Take one or two bucket locks and record them in cel->blp[].  No bucket
 * lock may be recorded in cel yet (asserted).
 *
 * The pair is ordered with cache_sort_vnodes() first.  After sorting, blp1
 * may be NULL, in which case only blp2 is taken; blp[1] is therefore always
 * set on return while blp[0] is set only when two locks were taken.
 */
2103 static void
2104 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
2105     struct mtx *blp2)
2106 {
2107
2108         MPASS(cel->blp[0] == NULL);
2109         MPASS(cel->blp[1] == NULL);
2110
2111         cache_sort_vnodes(&blp1, &blp2);
2112
2113         if (blp1 != NULL) {
2114                 mtx_lock(blp1);
2115                 cel->blp[0] = blp1;
2116         }
2117         mtx_lock(blp2);
2118         cel->blp[1] = blp2;
2119 }
2120
/*
 * Release the bucket locks recorded by cache_lock_buckets_cel(): blp[0] if
 * it was taken, and blp[1], which is always taken.
 */
2121 static void
2122 cache_unlock_buckets_cel(struct celockstate *cel)
2123 {
2124
2125         if (cel->blp[0] != NULL)
2126                 mtx_unlock(cel->blp[0]);
2127         mtx_unlock(cel->blp[1]);
2128 }
2129
2130 /*
2131  * Lock part of the cache affected by the insertion.
2132  *
2133  * This means vnodelocks for dvp, vp and the relevant bucketlock.
2134  * However, insertion can result in removal of an old entry. In this
2135  * case we have an additional vnode and bucketlock pair to lock.
2136  *
2137  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
2138  * preserving the locking order (smaller address first).
      *
      * The old entry in question is an NCF_ISDOTDOT entry hanging off
      * vp->v_cache_dd when vp is a directory.  All vnode locks are taken
      * first, in a loop that restarts if the state changed while locks had
      * to be dropped; the bucket locks are taken last.
2139  */
2140 static void
2141 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2142     uint32_t hash)
2143 {
2144         struct namecache *ncp;
2145         struct mtx *blps[2];
2146         u_char nc_flag;
2147
             /* Bucket of the entry being inserted. */
2148         blps[0] = HASH2BUCKETLOCK(hash);
2149         for (;;) {
2150                 blps[1] = NULL;
2151                 cache_lock_vnodes_cel(cel, dvp, vp);
                     /* Only a directory target can have a .. entry to replace. */
2152                 if (vp == NULL || vp->v_type != VDIR)
2153                         break;
2154                 ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
2155                 if (ncp == NULL)
2156                         break;
2157                 nc_flag = atomic_load_char(&ncp->nc_flag);
2158                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2159                         break;
2160                 MPASS(ncp->nc_dvp == vp);
                     /* Bucket of the old .. entry. */
2161                 blps[1] = NCP2BUCKETLOCK(ncp);
                     /* For a negative entry no third vnode lock is taken. */
2162                 if ((nc_flag & NCF_NEGATIVE) != 0)
2163                         break;
2164                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2165                         break;
2166                 /*
2167                  * All vnodes got re-locked. Re-validate the state and if
2168                  * nothing changed we are done. Otherwise restart.
2169                  */
2170                 if (ncp == vp->v_cache_dd &&
2171                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2172                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2173                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2174                         break;
                     /*
                      * Drop everything and clear the slots so that
                      * cache_lock_vnodes_cel() can start from scratch.
                      */
2175                 cache_unlock_vnodes_cel(cel);
2176                 cel->vlp[0] = NULL;
2177                 cel->vlp[1] = NULL;
2178                 cel->vlp[2] = NULL;
2179         }
2180         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2181 }
2182
/*
 * Variant of cache_enter_lock() used by cache_enter_dotdot_prep().
 *
 * The entry examined here is the one hanging off dvp->v_cache_dd rather
 * than vp->v_cache_dd, and there is no check of the vnode type.  Otherwise
 * the procedure is the same: lock the vnode locks for dvp and vp, add the
 * lock for the old .. entry's target if that entry is positive, revalidate
 * and restart if locks had to be dropped, and take the bucket locks last.
 */
2183 static void
2184 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2185     uint32_t hash)
2186 {
2187         struct namecache *ncp;
2188         struct mtx *blps[2];
2189         u_char nc_flag;
2190
2191         blps[0] = HASH2BUCKETLOCK(hash);
2192         for (;;) {
2193                 blps[1] = NULL;
2194                 cache_lock_vnodes_cel(cel, dvp, vp);
2195                 ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
2196                 if (ncp == NULL)
2197                         break;
2198                 nc_flag = atomic_load_char(&ncp->nc_flag);
2199                 if ((nc_flag & NCF_ISDOTDOT) == 0)
2200                         break;
2201                 MPASS(ncp->nc_dvp == dvp);
2202                 blps[1] = NCP2BUCKETLOCK(ncp);
2203                 if ((nc_flag & NCF_NEGATIVE) != 0)
2204                         break;
2205                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2206                         break;
                     /*
                      * Locks were dropped and re-taken: keep them only if the
                      * same entry, bucket and target lock still apply.
                      */
2207                 if (ncp == dvp->v_cache_dd &&
2208                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2209                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2210                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2211                         break;
2212                 cache_unlock_vnodes_cel(cel);
2213                 cel->vlp[0] = NULL;
2214                 cel->vlp[1] = NULL;
2215                 cel->vlp[2] = NULL;
2216         }
2217         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2218 }
2219
/*
 * Release everything recorded in cel: the bucket locks first, then the
 * vnode locks (the reverse of the order in which they were taken).
 */
2220 static void
2221 cache_enter_unlock(struct celockstate *cel)
2222 {
2223
2224         cache_unlock_buckets_cel(cel);
2225         cache_unlock_vnodes_cel(cel);
2226 }
2227
/*
 * Clear dvp->v_cache_dd ahead of entering a new ".." entry for dvp.
 *
 * Called from cache_enter_time() when the name being entered is "..".  If
 * v_cache_dd is already NULL nothing is done.  Otherwise the locks are
 * taken with cache_enter_lock_dd(), an NCF_ISDOTDOT entry found there is
 * zapped, and v_cache_dd is set to NULL whether or not an entry was zapped.
 * The zapped entry is freed only after the locks have been released.
 */
2228 static void __noinline
2229 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2230     struct componentname *cnp)
2231 {
2232         struct celockstate cel;
2233         struct namecache *ncp;
2234         uint32_t hash;
2235         int len;
2236
             /* Unlocked check: nothing to clear. */
2237         if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
2238                 return;
2239         len = cnp->cn_namelen;
2240         cache_celockstate_init(&cel);
2241         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2242         cache_enter_lock_dd(&cel, dvp, vp, hash);
             /* Re-read under the locks; the value may differ from the one above. */
2243         ncp = dvp->v_cache_dd;
2244         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2245                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2246                 cache_zap_locked(ncp);
2247         } else {
                     /* Not a .. entry: it is left alone and must not be freed. */
2248                 ncp = NULL;
2249         }
2250         atomic_store_ptr(&dvp->v_cache_dd, NULL);
2251         cache_enter_unlock(&cel);
2252         if (ncp != NULL)
2253                 cache_free(ncp);
2254 }
2255
2256 /*
2257  * Add an entry to the cache.
      *
      * - dvp:       Directory the name lives in.
      * - vp:        Vnode the name resolves to, or NULL to enter a negative
      *              entry.
      * - cnp:       Name being entered.  "." is never entered; ".." produces
      *              an NCF_ISDOTDOT entry published through dvp->v_cache_dd.
      *              For a negative entry, ISWHITEOUT in cn_flags is recorded
      *              as NCF_WHITE.
      * - tsp:       Optional timestamp stored with the entry (NCF_TS), along
      *              with the current ticks value.
      * - dtsp:      Optional second timestamp (NCF_DTS); only looked at when
      *              tsp is also provided.
      *
      * Nothing is entered if cache_alloc() returns NULL, if an entry with
      * the same parent and name already exists, or, for "..", if
      * dvp->v_cache_dd has been populated in the meantime.
2258  */
2259 void
2260 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2261     struct timespec *tsp, struct timespec *dtsp)
2262 {
2263         struct celockstate cel;
2264         struct namecache *ncp, *n2, *ndd;
2265         struct namecache_ts *ncp_ts;
2266         struct nchashhead *ncpp;
2267         uint32_t hash;
2268         int flag;
2269         int len;
2270
2271         KASSERT(cnp->cn_namelen <= NAME_MAX,
2272             ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
2273             NAME_MAX));
2274         VNPASS(dvp != vp, dvp);
2275         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2276         VNPASS(dvp->v_type != VNON, dvp);
2277         if (vp != NULL) {
2278                 VNPASS(!VN_IS_DOOMED(vp), vp);
2279                 VNPASS(vp->v_type != VNON, vp);
2280         }
2281
2282 #ifdef DEBUG_CACHE
2283         if (__predict_false(!doingcache))
2284                 return;
2285 #endif
2286
2287         flag = 0;
2288         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                     /* "." is not cached. */
2289                 if (cnp->cn_namelen == 1)
2290                         return;
2291                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2292                         cache_enter_dotdot_prep(dvp, vp, cnp);
2293                         flag = NCF_ISDOTDOT;
2294                 }
2295         }
2296
2297         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2298         if (ncp == NULL)
2299                 return;
2300
2301         cache_celockstate_init(&cel);
2302         ndd = NULL;
2303         ncp_ts = NULL;
2304
2305         /*
2306          * Calculate the hash key and setup as much of the new
2307          * namecache entry as possible before acquiring the lock.
2308          */
             /* NCF_WIP stays set until the entry is fully linked in. */
2309         ncp->nc_flag = flag | NCF_WIP;
2310         ncp->nc_vp = vp;
2311         if (vp == NULL)
2312                 cache_neg_init(ncp);
2313         ncp->nc_dvp = dvp;
2314         if (tsp != NULL) {
2315                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2316                 ncp_ts->nc_time = *tsp;
2317                 ncp_ts->nc_ticks = ticks;
2318                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2319                 if (dtsp != NULL) {
2320                         ncp_ts->nc_dotdottime = *dtsp;
2321                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2322                 }
2323         }
2324         len = ncp->nc_nlen = cnp->cn_namelen;
2325         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2326         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2327         ncp->nc_name[len] = '\0';
2328         cache_enter_lock(&cel, dvp, vp, hash);
2329
2330         /*
2331          * See if this vnode or negative entry is already in the cache
2332          * with this name.  This can happen with concurrent lookups of
2333          * the same path name.
2334          */
2335         ncpp = NCHHASH(hash);
2336         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2337                 if (n2->nc_dvp == dvp &&
2338                     n2->nc_nlen == cnp->cn_namelen &&
2339                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2340                         MPASS(cache_ncp_canuse(n2));
2341                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2342                                 KASSERT(vp == NULL,
2343                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2344                                     __func__, NULL, vp));
2345                         else
2346                                 KASSERT(n2->nc_vp == vp,
2347                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2348                                     __func__, n2->nc_vp, vp));
2349                         /*
2350                          * Entries are supposed to be immutable unless in the
2351                          * process of getting destroyed. Accommodating for
2352                          * changing timestamps is possible but not worth it.
2353                          * This should be harmless in terms of correctness, in
2354                          * the worst case resulting in an earlier expiration.
2355                          * Alternatively, the found entry can be replaced
2356                          * altogether.
2357                          */
2358                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2359 #if 0
2360                         if (tsp != NULL) {
2361                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2362                                     ("no NCF_TS"));
2363                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2364                                 n2_ts->nc_time = ncp_ts->nc_time;
2365                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2366                                 if (dtsp != NULL) {
2367                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2368                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2369                                 }
2370                         }
2371 #endif
2372                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2373                             vp);
                             /* Keep the existing entry; discard the new one. */
2374                         goto out_unlock_free;
2375                 }
2376         }
2377
2378         if (flag == NCF_ISDOTDOT) {
2379                 /*
2380                  * See if we are trying to add .. entry, but some other lookup
2381                  * has populated v_cache_dd pointer already.
2382                  */
2383                 if (dvp->v_cache_dd != NULL)
2384                         goto out_unlock_free;
2385                 KASSERT(vp == NULL || vp->v_type == VDIR,
2386                     ("wrong vnode type %p", vp));
                     /* Release fence before the pointer store that publishes ncp. */
2387                 atomic_thread_fence_rel();
2388                 atomic_store_ptr(&dvp->v_cache_dd, ncp);
2389         }
2390
2391         if (vp != NULL) {
2392                 if (flag != NCF_ISDOTDOT) {
2393                         /*
2394                          * For this case, the cache entry maps both the
2395                          * directory name in it and the name ".." for the
2396                          * directory's parent.
2397                          */
                             /*
                              * An old NCF_ISDOTDOT entry on vp is zapped here
                              * and freed after unlocking; any other entry is
                              * merely unhooked from v_cache_dd by the store
                              * below.
                              */
2398                         if ((ndd = vp->v_cache_dd) != NULL) {
2399                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2400                                         cache_zap_locked(ndd);
2401                                 else
2402                                         ndd = NULL;
2403                         }
2404                         atomic_thread_fence_rel();
2405                         atomic_store_ptr(&vp->v_cache_dd, ncp);
2406                 } else if (vp->v_type != VDIR) {
2407                         if (vp->v_cache_dd != NULL) {
2408                                 atomic_store_ptr(&vp->v_cache_dd, NULL);
2409                         }
2410                 }
2411         }
2412
2413         if (flag != NCF_ISDOTDOT) {
                     /* The first entry sourced from dvp takes a hold on it. */
2414                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2415                         cache_hold_vnode(dvp);
2416                 }
2417                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2418         }
2419
2420         /*
2421          * If the entry is "negative", we place it into the
2422          * "negative" cache queue, otherwise, we place it into the
2423          * destination vnode's cache entries queue.
2424          */
2425         if (vp != NULL) {
2426                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2427                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2428                     vp);
2429         } else {
2430                 if (cnp->cn_flags & ISWHITEOUT)
2431                         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
2432                 cache_neg_insert(ncp);
2433                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2434                     ncp->nc_name);
2435         }
2436
2437         /*
2438          * Insert the new namecache entry into the appropriate chain
2439          * within the cache entries table.
2440          */
2441         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2442
2443         atomic_thread_fence_rel();
2444         /*
2445          * Mark the entry as fully constructed.
2446          * It is immutable past this point until its removal.
2447          */
2448         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2449
2450         cache_enter_unlock(&cel);
             /* Free the replaced .. entry only after all locks are dropped. */
2451         if (ndd != NULL)
2452                 cache_free(ndd);
2453         return;
2454 out_unlock_free:
2455         cache_enter_unlock(&cel);
2456         cache_free(ncp);
2457         return;
2458 }
2459
/*
 * Return the smallest power of two strictly greater than val
 * (0 -> 1, 1 -> 2, 7 -> 8, 8 -> 16).
 *
 * When no such power of two fits in a u_int, i.e. val has its top bit set,
 * the result saturates at the largest representable power of two.  Without
 * that guard the shift below would wrap res to 0 and, as 0 <= val always
 * holds, the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
        /* Highest bit of u_int, computed without assuming its width. */
        const u_int maxpow2 = ~(~(u_int)0 >> 1);
        u_int res;

        if (val >= maxpow2)
                return (maxpow2);

        for (res = 1; res <= val; res <<= 1)
                continue;

        return (res);
}
2470
/*
 * Allocate a hash table sized for the given number of elements and
 * initialize every chain as empty.
 *
 * The number of chains is cache_roundup_2(elements) / 2, and *hashmask is
 * set to that number minus one.  The table is allocated with M_WAITOK and is
 * released with ncfreetbl().
 *
 * NOTE(review): elements is a u_long while cache_roundup_2() takes a u_int,
 * so a value above UINT_MAX is narrowed in the call -- confirm callers never
 * pass one.
 */
2471 static struct nchashhead *
2472 nchinittbl(u_long elements, u_long *hashmask)
2473 {
2474         struct nchashhead *hashtbl;
2475         u_long hashsize, i;
2476
2477         hashsize = cache_roundup_2(elements) / 2;
2478
2479         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2480         for (i = 0; i < hashsize; i++)
2481                 CK_SLIST_INIT(&hashtbl[i]);
2482         *hashmask = hashsize - 1;
2483         return (hashtbl);
2484 }
2485
/*
 * Free a hash table allocated by nchinittbl().  Only the table itself is
 * released; nothing is done about entries that may still be linked into it.
 */
2486 static void
2487 ncfreetbl(struct nchashhead *hashtbl)
2488 {
2489
2490         free(hashtbl, M_VFSCACHE);
2491 }
2492
2493 /*
2494  * Name cache initialization, from vfs_init() when we are booting
      *
      * Creates the four UMA zones entries are allocated from, sizes the cache
      * from desiredvnodes, allocates the hash table, and allocates and
      * initializes the bucket locks, vnode locks and negative-list locks.
      * Registered to run through the SYSINIT() that follows the function.
2495  */
2496 static void
2497 nchinit(void *dummy __unused)
2498 {
2499         u_int i;
2500
             /*
              * NOTE(review): judging by the size macros, "S"/"L" are the
              * small and large zones and the "TS" ones hold entries with
              * timestamps -- confirm against cache_alloc().
              */
2501         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2502             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2503         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2504             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2505         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2506             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2507         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2508             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2509
2510         VFS_SMR_ZONE_SET(cache_zone_small);
2511         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2512         VFS_SMR_ZONE_SET(cache_zone_large);
2513         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2514
2515         ncsize = desiredvnodes * ncsizefactor;
2516         cache_recalc_neg_min(ncnegminpct);
2517         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
             /*
              * ncbuckethash is derived from the CPU count, then clamped to
              * at least 7 and at most nchash.
              * NOTE(review): numbucketlocks and numvnodelocks are defined
              * elsewhere, presumably in terms of ncbuckethash and
              * ncvnodehash -- confirm.
              */
2518         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2519         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2520                 ncbuckethash = 7;
2521         if (ncbuckethash > nchash)
2522                 ncbuckethash = nchash;
2523         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2524             M_WAITOK | M_ZERO);
2525         for (i = 0; i < numbucketlocks; i++)
2526                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2527         ncvnodehash = ncbuckethash;
2528         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2529             M_WAITOK | M_ZERO);
2530         for (i = 0; i < numvnodelocks; i++)
2531                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2532
             /* Each negative list has two mutexes and two queues. */
2533         for (i = 0; i < numneglists; i++) {
2534                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2535                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2536                 TAILQ_INIT(&neglists[i].nl_list);
2537                 TAILQ_INIT(&neglists[i].nl_hotlist);
2538         }
2539 }
2540 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2541
2542 void
2543 cache_vnode_init(struct vnode *vp)
2544 {
2545
2546         LIST_INIT(&vp->v_cache_src);
2547         TAILQ_INIT(&vp->v_cache_dst);
2548         vp->v_cache_dd = NULL;
2549         cache_prehash(vp);
2550 }
2551
2552 void
2553 cache_changesize(u_long newmaxvnodes)
2554 {
2555         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2556         u_long new_nchash, old_nchash;
2557         struct namecache *ncp;
2558         uint32_t hash;
2559         u_long newncsize;
2560         int i;
2561
2562         newncsize = newmaxvnodes * ncsizefactor;
2563         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2564         if (newmaxvnodes < numbucketlocks)
2565                 newmaxvnodes = numbucketlocks;
2566
2567         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2568         /* If same hash table size, nothing to do */
2569         if (nchash == new_nchash) {
2570                 ncfreetbl(new_nchashtbl);
2571                 return;
2572         }
2573         /*
2574          * Move everything from the old hash table to the new table.
2575          * None of the namecache entries in the table can be removed
2576          * because to do so, they have to be removed from the hash table.
2577          */
2578         cache_lock_all_vnodes();
2579         cache_lock_all_buckets();
2580         old_nchashtbl = nchashtbl;
2581         old_nchash = nchash;
2582         nchashtbl = new_nchashtbl;
2583         nchash = new_nchash;
2584         for (i = 0; i <= old_nchash; i++) {
2585                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2586                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2587                             ncp->nc_dvp);
2588                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2589                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2590                 }
2591         }
2592         ncsize = newncsize;
2593         cache_recalc_neg_min(ncnegminpct);
2594         cache_unlock_all_buckets();
2595         cache_unlock_all_vnodes();
2596         ncfreetbl(old_nchashtbl);
2597 }
2598
2599 /*
2600  * Remove all entries from and to a particular vnode.
2601  */
2602 static void
2603 cache_purge_impl(struct vnode *vp)
2604 {
2605         struct cache_freebatch batch;
2606         struct namecache *ncp;
2607         struct mtx *vlp, *vlp2;
2608
2609         TAILQ_INIT(&batch);
2610         vlp = VP2VNODELOCK(vp);
2611         vlp2 = NULL;
2612         mtx_lock(vlp);
2613 retry:
2614         while (!LIST_EMPTY(&vp->v_cache_src)) {
2615                 ncp = LIST_FIRST(&vp->v_cache_src);
2616                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2617                         goto retry;
2618                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2619         }
2620         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2621                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2622                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2623                         goto retry;
2624                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2625         }
2626         ncp = vp->v_cache_dd;
2627         if (ncp != NULL) {
2628                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2629                    ("lost dotdot link"));
2630                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2631                         goto retry;
2632                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2633         }
2634         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2635         mtx_unlock(vlp);
2636         if (vlp2 != NULL)
2637                 mtx_unlock(vlp2);
2638         cache_free_batch(&batch);
2639 }
2640
2641 /*
2642  * Opportunistic check to see if there is anything to do.
2643  */
2644 static bool
2645 cache_has_entries(struct vnode *vp)
2646 {
2647
2648         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2649             atomic_load_ptr(&vp->v_cache_dd) == NULL)
2650                 return (false);
2651         return (true);
2652 }
2653
2654 void
2655 cache_purge(struct vnode *vp)
2656 {
2657
2658         SDT_PROBE1(vfs, namecache, purge, done, vp);
2659         if (!cache_has_entries(vp))
2660                 return;
2661         cache_purge_impl(vp);
2662 }
2663
/*
 * Only to be used by vgone.
 *
 * The vnode must already be doomed (asserted).  If the unlocked check sees
 * no entries, wait for the vnode lock to be released once and look again
 * before concluding there is nothing to purge.
 */
void
cache_purge_vgone(struct vnode *vp)
{
        struct mtx *vlp;

        VNPASS(VN_IS_DOOMED(vp), vp);
        if (!cache_has_entries(vp)) {
                /*
                 * Serialize against a potential thread doing cache_purge.
                 */
                vlp = VP2VNODELOCK(vp);
                mtx_wait_unlocked(vlp);
                if (!cache_has_entries(vp))
                        return;
        }
        cache_purge_impl(vp);
}
2689
2690 /*
2691  * Remove all negative entries for a particular directory vnode.
2692  */
2693 void
2694 cache_purge_negative(struct vnode *vp)
2695 {
2696         struct cache_freebatch batch;
2697         struct namecache *ncp, *nnp;
2698         struct mtx *vlp;
2699
2700         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2701         if (LIST_EMPTY(&vp->v_cache_src))
2702                 return;
2703         TAILQ_INIT(&batch);
2704         vlp = VP2VNODELOCK(vp);
2705         mtx_lock(vlp);
2706         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2707                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2708                         continue;
2709                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2710                 TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
2711         }
2712         mtx_unlock(vlp);
2713         cache_free_batch(&batch);
2714 }
2715
2716 /*
2717  * Entry points for modifying VOP operations.
2718  */
2719 void
2720 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2721     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2722 {
2723
2724         ASSERT_VOP_IN_SEQC(fdvp);
2725         ASSERT_VOP_IN_SEQC(fvp);
2726         ASSERT_VOP_IN_SEQC(tdvp);
2727         if (tvp != NULL)
2728                 ASSERT_VOP_IN_SEQC(tvp);
2729
2730         cache_purge(fvp);
2731         if (tvp != NULL) {
2732                 cache_purge(tvp);
2733                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2734                     ("%s: lingering negative entry", __func__));
2735         } else {
2736                 cache_remove_cnp(tdvp, tcnp);
2737         }
2738
2739         /*
2740          * TODO
2741          *
2742          * Historically renaming was always purging all revelang entries,
2743          * but that's quite wasteful. In particular turns out that in many cases
2744          * the target file is immediately accessed after rename, inducing a cache
2745          * miss.
2746          *
2747          * Recode this to reduce relocking and reuse the existing entry (if any)
2748          * instead of just removing it above and allocating a new one here.
2749          */
2750         if (cache_rename_add) {
2751                 cache_enter(tdvp, fvp, tcnp);
2752         }
2753 }
2754
/*
 * Name cache maintenance for removing directory vp from dvp: every entry
 * attached to vp is purged.  Both vnodes are only checked with
 * ASSERT_VOP_IN_SEQC(); dvp is not otherwise touched here.
 */
void
cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
{

        ASSERT_VOP_IN_SEQC(dvp);
        ASSERT_VOP_IN_SEQC(vp);
        cache_purge(vp);
}
2763
2764 #ifdef INVARIANTS
2765 /*
2766  * Validate that if an entry exists it matches.
2767  */
2768 void
2769 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
2770 {
2771         struct namecache *ncp;
2772         struct mtx *blp;
2773         uint32_t hash;
2774
2775         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
2776         if (CK_SLIST_EMPTY(NCHHASH(hash)))
2777                 return;
2778         blp = HASH2BUCKETLOCK(hash);
2779         mtx_lock(blp);
2780         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
2781                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
2782                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) {
2783                         if (ncp->nc_vp != vp)
2784                                 panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
2785                                     __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
2786                                     ncp->nc_vp);
2787                 }
2788         }
2789         mtx_unlock(blp);
2790 }
2791 #endif
2792
2793 /*
2794  * Flush all entries referencing a particular filesystem.
2795  */
2796 void
2797 cache_purgevfs(struct mount *mp)
2798 {
2799         struct vnode *vp, *mvp;
2800
2801         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2802         /*
2803          * Somewhat wasteful iteration over all vnodes. Would be better to
2804          * support filtering and avoid the interlock to begin with.
2805          */
2806         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2807                 if (!cache_has_entries(vp)) {
2808                         VI_UNLOCK(vp);
2809                         continue;
2810                 }
2811                 vholdl(vp);
2812                 VI_UNLOCK(vp);
2813                 cache_purge(vp);
2814                 vdrop(vp);
2815         }
2816 }
2817
2818 /*
2819  * Perform canonical checks and cache lookup and pass on to filesystem
2820  * through the vop_cachedlookup only if needed.
2821  */
2822
2823 int
2824 vfs_cache_lookup(struct vop_lookup_args *ap)
2825 {
2826         struct vnode *dvp;
2827         int error;
2828         struct vnode **vpp = ap->a_vpp;
2829         struct componentname *cnp = ap->a_cnp;
2830         int flags = cnp->cn_flags;
2831
2832         *vpp = NULL;
2833         dvp = ap->a_dvp;
2834
2835         if (dvp->v_type != VDIR)
2836                 return (ENOTDIR);
2837
2838         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2839             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2840                 return (EROFS);
2841
2842         error = vn_dir_check_exec(dvp, cnp);
2843         if (error != 0)
2844                 return (error);
2845
2846         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2847         if (error == 0)
2848                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2849         if (error == -1)
2850                 return (0);
2851         return (error);
2852 }
2853
2854 /* Implementation of the getcwd syscall. */
2855 int
2856 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2857 {
2858         char *buf, *retbuf;
2859         size_t buflen;
2860         int error;
2861
2862         buflen = uap->buflen;
2863         if (__predict_false(buflen < 2))
2864                 return (EINVAL);
2865         if (buflen > MAXPATHLEN)
2866                 buflen = MAXPATHLEN;
2867
2868         buf = uma_zalloc(namei_zone, M_WAITOK);
2869         error = vn_getcwd(buf, &retbuf, &buflen);
2870         if (error == 0)
2871                 error = copyout(retbuf, uap->buf, buflen);
2872         uma_zfree(namei_zone, buf);
2873         return (error);
2874 }
2875
2876 int
2877 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2878 {
2879         struct pwd *pwd;
2880         int error;
2881
2882         vfs_smr_enter();
2883         pwd = pwd_get_smr();
2884         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2885             buflen, 0);
2886         VFS_SMR_ASSERT_NOT_ENTERED();
2887         if (error < 0) {
2888                 pwd = pwd_hold(curthread);
2889                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2890                     retbuf, buflen);
2891                 pwd_drop(pwd);
2892         }
2893
2894 #ifdef KTRACE
2895         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2896                 ktrnamei(*retbuf);
2897 #endif
2898         return (error);
2899 }
2900
2901 static int
2902 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2903     size_t size, int flags, enum uio_seg pathseg)
2904 {
2905         struct nameidata nd;
2906         char *retbuf, *freebuf;
2907         int error;
2908
2909         if (flags != 0)
2910                 return (EINVAL);
2911         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2912             pathseg, path, fd, &cap_fstat_rights, td);
2913         if ((error = namei(&nd)) != 0)
2914                 return (error);
2915         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2916         if (error == 0) {
2917                 error = copyout(retbuf, buf, size);
2918                 free(freebuf, M_TEMP);
2919         }
2920         NDFREE(&nd, 0);
2921         return (error);
2922 }
2923
2924 int
2925 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2926 {
2927
2928         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2929             uap->flags, UIO_USERSPACE));
2930 }
2931
2932 /*
2933  * Retrieve the full filesystem path that correspond to a vnode from the name
2934  * cache (if available)
2935  */
2936 int
2937 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2938 {
2939         struct pwd *pwd;
2940         char *buf;
2941         size_t buflen;
2942         int error;
2943
2944         if (__predict_false(vp == NULL))
2945                 return (EINVAL);
2946
2947         buflen = MAXPATHLEN;
2948         buf = malloc(buflen, M_TEMP, M_WAITOK);
2949         vfs_smr_enter();
2950         pwd = pwd_get_smr();
2951         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2952         VFS_SMR_ASSERT_NOT_ENTERED();
2953         if (error < 0) {
2954                 pwd = pwd_hold(curthread);
2955                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2956                 pwd_drop(pwd);
2957         }
2958         if (error == 0)
2959                 *freebuf = buf;
2960         else
2961                 free(buf, M_TEMP);
2962         return (error);
2963 }
2964
2965 /*
2966  * This function is similar to vn_fullpath, but it attempts to lookup the
2967  * pathname relative to the global root mount point.  This is required for the
2968  * auditing sub-system, as audited pathnames must be absolute, relative to the
2969  * global root mount point.
2970  */
2971 int
2972 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2973 {
2974         char *buf;
2975         size_t buflen;
2976         int error;
2977
2978         if (__predict_false(vp == NULL))
2979                 return (EINVAL);
2980         buflen = MAXPATHLEN;
2981         buf = malloc(buflen, M_TEMP, M_WAITOK);
2982         vfs_smr_enter();
2983         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2984         VFS_SMR_ASSERT_NOT_ENTERED();
2985         if (error < 0) {
2986                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2987         }
2988         if (error == 0)
2989                 *freebuf = buf;
2990         else
2991                 free(buf, M_TEMP);
2992         return (error);
2993 }
2994
2995 static struct namecache *
2996 vn_dd_from_dst(struct vnode *vp)
2997 {
2998         struct namecache *ncp;
2999
3000         cache_assert_vnode_locked(vp);
3001         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
3002                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3003                         return (ncp);
3004         }
3005         return (NULL);
3006 }
3007
/*
 * Prepend the name of *vp to the part of buf ending at offset *buflen and
 * replace *vp with its parent directory.
 *
 * The name comes from a namecache entry found under the vnode lock; if there
 * is none the filesystem is asked through VOP_VPTOCNP().  *buflen is lowered
 * by the length of the name that was written.
 *
 * Reference handling as done below: the vnode passed in is released on every
 * path (vrele or vput).  On success the new *vp carries a reference, taken
 * here with vref() on a cache hit or, per the closing comment, left in place
 * by VOP_VPTOCNP() on a miss.  On error no referenced vnode is handed back.
 *
 * Returns 0 on success, ENOMEM if the cached name does not fit, ENOENT if
 * the parent obtained from the filesystem is doomed, or the error from
 * VOP_VPTOCNP().
 */
int
vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        /*
         * Prefer the entry recorded in v_cache_dd when it is not a ".."
         * entry; otherwise search the destination list.
         */
        ncp = (*vp)->v_cache_dd;
        if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
                KASSERT(ncp == vn_dd_from_dst(*vp),
                    ("%s: mismatch for dd entry (%p != %p)", __func__,
                    ncp, vn_dd_from_dst(*vp)));
        } else {
                ncp = vn_dd_from_dst(*vp);
        }
        if (ncp != NULL) {
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                /*
                 * Reference the parent before dropping the lock, release
                 * the child only afterwards.
                 */
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        /* Nothing cached, ask the filesystem with the vnode share-locked. */
        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (VN_IS_DOOMED(dvp)) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
3073
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.  The reference is consumed: it is released
 * on success as well as on every error path below.
 *
 * The path is assembled backwards from offset *len of buf.  With addend == 0
 * a NUL terminator is written first (so *len must be at least 2).  With a
 * non-zero addend nothing is written at the end and the result is treated as
 * already followed by a '/'-prefixed tail, presumably placed by the caller.
 *
 * On success 0 is returned, *retbuf points at the first byte of the path and
 * *len becomes the number of bytes from there to the original end, plus
 * addend.  Errors seen here: ENOENT (race with unmount), ENOTDIR, ENOMEM
 * (out of buffer space) and whatever vn_vptocnp() returns.
 */
static int
vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
    size_t *len, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;
        size_t buflen;
        int error;
        bool slash_prefixed;

        VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
        VNPASS(vp->v_usecount > 0, vp);

        buflen = *len;

        slash_prefixed = true;
        if (addend == 0) {
                MPASS(*len >= 2);
                buflen--;
                buf[buflen] = '\0';
                slash_prefixed = false;
        }

        error = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if (VN_IS_DOOMED(vp) ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        /* Cross the mount point: go on with the covered vnode. */
                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                /* On failure vn_vptocnp() has already released vp. */
                error = vn_vptocnp(&vp, buf, &buflen);
                if (error)
                        break;
                if (buflen == 0) {
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = true;
        }
        if (error)
                return (error);
        /* No component was emitted at all: the result is just "/". */
        if (!slash_prefixed) {
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        *retbuf = buf + buflen;
        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        *len -= buflen;
        *len += addend;
        return (0);
}
3187
3188 /*
3189  * Resolve an arbitrary vnode to a pathname.
3190  *
3191  * Note 2 caveats:
3192  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3193  *   resolve to a different path than the one used to find it
3194  * - namecache is not mandatory, meaning names are not guaranteed to be added
3195  *   (in which case resolving fails)
3196  */
/*
 * Record why a lockless reverse lookup was abandoned.  The value stored
 * through 'reason' is the source line of the call site, supplied by the
 * cache_rev_failed() wrapper below.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

        *reason = line;
}
#define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3204
/*
 * Lockless attempt at resolving vp to a pathname relative to rdir.
 *
 * Must be called inside a vfs SMR section (asserted); the section is left
 * before returning on every path.  No vnode references are taken.  The walk
 * follows v_cache_dd / nc_dvp (and mnt_vnodecovered at filesystem roots)
 * upwards, validating each step with the vnode sequence counters, until rdir
 * or rootvnode is reached.
 *
 * buf, *buflen and addend are used the same way as in vn_fullpath_dir(): the
 * path is built backwards from offset *buflen and a NUL terminator is only
 * written when addend is 0.
 *
 * Returns 0 on success with *retbuf pointing at the path and *buflen set to
 * the number of bytes produced plus addend.  Returns -1 if the fast path is
 * disabled or the walk had to be abandoned (callers in this file then fall
 * back to the locked variant), or ENOMEM if a name does not fit.  After an
 * abandoned walk *buflen is restored to its value on entry; the contents of
 * buf may have been modified nevertheless.
 */
static int
vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *tvp;
        struct mount *mp;
        struct namecache *ncp;
        size_t orig_buflen;
        int reason;
        int error;
#ifdef KDTRACE_HOOKS
        int i;
#endif
        seqc_t vp_seqc, tvp_seqc;
        u_char nc_flag;

        VFS_SMR_ASSERT_ENTERED();

        if (!cache_fast_revlookup) {
                vfs_smr_exit();
                return (-1);
        }

        orig_buflen = *buflen;

        if (addend == 0) {
                MPASS(*buflen >= 2);
                *buflen -= 1;
                buf[*buflen] = '\0';
        }

        /* Already at the top: the answer is "/" (or just the caller's tail). */
        if (vp == rdir || vp == rootvnode) {
                if (addend == 0) {
                        *buflen -= 1;
                        buf[*buflen] = '/';
                }
                goto out_ok;
        }

#ifdef KDTRACE_HOOKS
        i = 0;
#endif
        error = -1;
        ncp = NULL; /* for sdt probe down below */
        vp_seqc = vn_seqc_read_any(vp);
        if (seqc_in_modify(vp_seqc)) {
                cache_rev_failed(&reason);
                goto out_abort;
        }

        for (;;) {
#ifdef KDTRACE_HOOKS
                i++;
#endif
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        /*
                         * Root of a filesystem: continue from the vnode the
                         * mount covers, without emitting a component.
                         */
                        mp = atomic_load_ptr(&vp->v_mount);
                        if (mp == NULL) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
                        tvp_seqc = vn_seqc_read_any(tvp);
                        if (seqc_in_modify(tvp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        if (!vn_seqc_consistent(vp, vp_seqc)) {
                                cache_rev_failed(&reason);
                                goto out_abort;
                        }
                        vp = tvp;
                        vp_seqc = tvp_seqc;
                        continue;
                }
                ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
                if (ncp == NULL) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* A ".." entry does not name vp, give up on the fast path. */
                nc_flag = atomic_load_char(&ncp->nc_flag);
                if ((nc_flag & NCF_ISDOTDOT) != 0) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /* Room is needed for the name and the '/' in front of it. */
                if (ncp->nc_nlen >= *buflen) {
                        cache_rev_failed(&reason);
                        error = ENOMEM;
                        goto out_abort;
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                *buflen -= 1;
                buf[*buflen] = '/';
                tvp = ncp->nc_dvp;
                tvp_seqc = vn_seqc_read_any(tvp);
                if (seqc_in_modify(tvp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                /*
                 * Acquire fence provided by vn_seqc_read_any above.
                 */
                if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                if (!cache_ncp_canuse(ncp)) {
                        cache_rev_failed(&reason);
                        goto out_abort;
                }
                vp = tvp;
                vp_seqc = tvp_seqc;
                if (vp == rdir || vp == rootvnode)
                        break;
        }
out_ok:
        vfs_smr_exit();
        *retbuf = buf + *buflen;
        *buflen = orig_buflen - *buflen + addend;
        SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
        return (0);

out_abort:
        /* Hand the caller its original length back for the fallback. */
        *buflen = orig_buflen;
        SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
        vfs_smr_exit();
        return (error);
}
3340
3341 static int
3342 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3343     size_t *buflen)
3344 {
3345         size_t orig_buflen, addend;
3346         int error;
3347
3348         if (*buflen < 2)
3349                 return (EINVAL);
3350
3351         orig_buflen = *buflen;
3352
3353         vref(vp);
3354         addend = 0;
3355         if (vp->v_type != VDIR) {
3356                 *buflen -= 1;
3357                 buf[*buflen] = '\0';
3358                 error = vn_vptocnp(&vp, buf, buflen);
3359                 if (error)
3360                         return (error);
3361                 if (*buflen == 0) {
3362                         vrele(vp);
3363                         return (ENOMEM);
3364                 }
3365                 *buflen -= 1;
3366                 buf[*buflen] = '/';
3367                 addend = orig_buflen - *buflen;
3368         }
3369
3370         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3371 }
3372
3373 /*
3374  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3375  *
3376  * Since the namecache does not track hardlinks, the caller is expected to first
3377  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3378  *
3379  * Then we have 2 cases:
3380  * - if the found vnode is a directory, the path can be constructed just by
3381  *   following names up the chain
3382  * - otherwise we populate the buffer with the saved name and start resolving
3383  *   from the parent
3384  */
3385 static int
3386 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3387     size_t *buflen)
3388 {
3389         char *buf, *tmpbuf;
3390         struct pwd *pwd;
3391         struct componentname *cnp;
3392         struct vnode *vp;
3393         size_t addend;
3394         int error;
3395         enum vtype type;
3396
3397         if (*buflen < 2)
3398                 return (EINVAL);
3399         if (*buflen > MAXPATHLEN)
3400                 *buflen = MAXPATHLEN;
3401
3402         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3403
3404         addend = 0;
3405         vp = ndp->ni_vp;
3406         /*
3407          * Check for VBAD to work around the vp_crossmp bug in lookup().
3408          *
3409          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3410          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3411          * If the type is VDIR (like in this very case) we can skip looking
3412          * at ni_dvp in the first place. However, since vnodes get passed here
3413          * unlocked the target may transition to doomed state (type == VBAD)
3414          * before we get to evaluate the condition. If this happens, we will
3415          * populate part of the buffer and descend to vn_fullpath_dir with
3416          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3417          *
3418          * This should be atomic_load(&vp->v_type) but it is illegal to take
3419          * an address of a bit field, even if said field is sized to char.
3420          * Work around the problem by reading the value into a full-sized enum
3421          * and then re-reading it with atomic_load which will still prevent
3422          * the compiler from re-reading down the road.
3423          */
3424         type = vp->v_type;
3425         type = atomic_load_int(&type);
3426         if (type == VBAD) {
3427                 error = ENOENT;
3428                 goto out_bad;
3429         }
3430         if (type != VDIR) {
3431                 cnp = &ndp->ni_cnd;
3432                 addend = cnp->cn_namelen + 2;
3433                 if (*buflen < addend) {
3434                         error = ENOMEM;
3435                         goto out_bad;
3436                 }
3437                 *buflen -= addend;
3438                 tmpbuf = buf + *buflen;
3439                 tmpbuf[0] = '/';
3440                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3441                 tmpbuf[addend - 1] = '\0';
3442                 vp = ndp->ni_dvp;
3443         }
3444
3445         vfs_smr_enter();
3446         pwd = pwd_get_smr();
3447         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3448             addend);
3449         VFS_SMR_ASSERT_NOT_ENTERED();
3450         if (error < 0) {
3451                 pwd = pwd_hold(curthread);
3452                 vref(vp);
3453                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3454                     addend);
3455                 pwd_drop(pwd);
3456                 if (error != 0)
3457                         goto out_bad;
3458         }
3459
3460         *freebuf = buf;
3461
3462         return (0);
3463 out_bad:
3464         free(buf, M_TEMP);
3465         return (error);
3466 }
3467
3468 struct vnode *
3469 vn_dir_dd_ino(struct vnode *vp)
3470 {
3471         struct namecache *ncp;
3472         struct vnode *ddvp;
3473         struct mtx *vlp;
3474         enum vgetstate vs;
3475
3476         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3477         vlp = VP2VNODELOCK(vp);
3478         mtx_lock(vlp);
3479         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3480                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3481                         continue;
3482                 ddvp = ncp->nc_dvp;
3483                 vs = vget_prep(ddvp);
3484                 mtx_unlock(vlp);
3485                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3486                         return (NULL);
3487                 return (ddvp);
3488         }
3489         mtx_unlock(vlp);
3490         return (NULL);
3491 }
3492
3493 int
3494 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3495 {
3496         struct namecache *ncp;
3497         struct mtx *vlp;
3498         int l;
3499
3500         vlp = VP2VNODELOCK(vp);
3501         mtx_lock(vlp);
3502         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3503                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3504                         break;
3505         if (ncp == NULL) {
3506                 mtx_unlock(vlp);
3507                 return (ENOENT);
3508         }
3509         l = min(ncp->nc_nlen, buflen - 1);
3510         memcpy(buf, ncp->nc_name, l);
3511         mtx_unlock(vlp);
3512         buf[l] = '\0';
3513         return (0);
3514 }
3515
3516 /*
3517  * This function updates path string to vnode's full global path
3518  * and checks the size of the new path string against the pathlen argument.
3519  *
3520  * Requires a locked, referenced vnode.
3521  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3522  *
3523  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3524  * because it falls back to the ".." lookup if the namecache lookup fails.
3525  */
3526 int
3527 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3528     u_int pathlen)
3529 {
3530         struct nameidata nd;
3531         struct vnode *vp1;
3532         char *rpath, *fbuf;
3533         int error;
3534
3535         ASSERT_VOP_ELOCKED(vp, __func__);
3536
3537         /* Construct global filesystem path from vp. */
3538         VOP_UNLOCK(vp);
3539         error = vn_fullpath_global(vp, &rpath, &fbuf);
3540
3541         if (error != 0) {
3542                 vrele(vp);
3543                 return (error);
3544         }
3545
3546         if (strlen(rpath) >= pathlen) {
3547                 vrele(vp);
3548                 error = ENAMETOOLONG;
3549                 goto out;
3550         }
3551
3552         /*
3553          * Re-lookup the vnode by path to detect a possible rename.
3554          * As a side effect, the vnode is relocked.
3555          * If vnode was renamed, return ENOENT.
3556          */
3557         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3558             UIO_SYSSPACE, path, td);
3559         error = namei(&nd);
3560         if (error != 0) {
3561                 vrele(vp);
3562                 goto out;
3563         }
3564         NDFREE(&nd, NDF_ONLY_PNBUF);
3565         vp1 = nd.ni_vp;
3566         vrele(vp);
3567         if (vp1 == vp)
3568                 strcpy(path, rpath);
3569         else {
3570                 vput(vp1);
3571                 error = ENOENT;
3572         }
3573
3574 out:
3575         free(fbuf, M_TEMP);
3576         return (error);
3577 }
3578
3579 #ifdef DDB
3580 static void
3581 db_print_vpath(struct vnode *vp)
3582 {
3583
3584         while (vp != NULL) {
3585                 db_printf("%p: ", vp);
3586                 if (vp == rootvnode) {
3587                         db_printf("/");
3588                         vp = NULL;
3589                 } else {
3590                         if (vp->v_vflag & VV_ROOT) {
3591                                 db_printf("<mount point>");
3592                                 vp = vp->v_mount->mnt_vnodecovered;
3593                         } else {
3594                                 struct namecache *ncp;
3595                                 char *ncn;
3596                                 int i;
3597
3598                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3599                                 if (ncp != NULL) {
3600                                         ncn = ncp->nc_name;
3601                                         for (i = 0; i < ncp->nc_nlen; i++)
3602                                                 db_printf("%c", *ncn++);
3603                                         vp = ncp->nc_dvp;
3604                                 } else {
3605                                         vp = NULL;
3606                                 }
3607                         }
3608                 }
3609                 db_printf("\n");
3610         }
3611
3612         return;
3613 }
3614
3615 DB_SHOW_COMMAND(vpath, db_show_vpath)
3616 {
3617         struct vnode *vp;
3618
3619         if (!have_addr) {
3620                 db_printf("usage: show vpath <struct vnode *>\n");
3621                 return;
3622         }
3623
3624         vp = (struct vnode *)addr;
3625         db_print_vpath(vp);
3626 }
3627
3628 #endif
3629
/*
 * cache_fast_lookup is the administrative knob exported as the
 * vfs.cache_fast_lookup sysctl.  cache_fast_lookup_enabled is the effective
 * switch consulted by cache_can_fplookup(); it is derived from the knob and
 * the MAC hook state by cache_fast_lookup_enabled_recalc().
 */
3630 static int cache_fast_lookup = 1;
3631 static char __read_frequently cache_fast_lookup_enabled = true;
3632
/*
 * Value returned by the abort helpers when the fast path gives up.
 * cache_fpl_handled_error_impl() asserts that real errors never use it.
 */
3633 #define CACHE_FPL_FAILED        -2020
3634
3635 void
3636 cache_fast_lookup_enabled_recalc(void)
3637 {
3638         int lookup_flag;
3639         int mac_on;
3640
3641 #ifdef MAC
3642         mac_on = mac_vnode_check_lookup_enabled();
3643         mac_on |= mac_vnode_check_readlink_enabled();
3644 #else
3645         mac_on = 0;
3646 #endif
3647
3648         lookup_flag = atomic_load_int(&cache_fast_lookup);
3649         if (lookup_flag && !mac_on) {
3650                 atomic_store_char(&cache_fast_lookup_enabled, true);
3651         } else {
3652                 atomic_store_char(&cache_fast_lookup_enabled, false);
3653         }
3654 }
3655
3656 static int
3657 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
3658 {
3659         int error, old;
3660
3661         old = atomic_load_int(&cache_fast_lookup);
3662         error = sysctl_handle_int(oidp, arg1, arg2, req);
3663         if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup))
3664                 cache_fast_lookup_enabled_recalc();
3665         return (error);
3666 }
3667 SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
3668     &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
3669
3670 /*
3671  * Components of nameidata (or objects it can point to) which may
3672  * need restoring in case fast path lookup fails.
3673  */
/*
 * Snapshot taken by cache_fpl_checkpoint_outer() and written back by the
 * restore helpers when the fast path bails out.
 */
3674 struct nameidata_outer {
             /* Copy of ndp->ni_pathlen; written back by cache_fpl_restore_abort(). */
3675         size_t ni_pathlen;
             /* Copy of ndp->ni_cnd.cn_flags; written back by cache_fpl_restore_partial(). */
3676         int cn_flags;
3677 };
3678
/*
 * State recorded by cache_fpl_checkpoint().  It only has members on
 * INVARIANTS kernels, where it is used for consistency checks.
 */
3679 struct nameidata_saved {
3680 #ifdef INVARIANTS
             /* cn_nameptr at checkpoint time; compared in cache_fplookup_partial_setup(). */
3681         char *cn_nameptr;
             /* fpl->debug.ni_pathlen at checkpoint time. */
3682         size_t ni_pathlen;
3683 #endif
3684 };
3685
/*
 * INVARIANTS-only bookkeeping.  ni_pathlen is checked against the value
 * computed for ndp->ni_pathlen in cache_fplookup_partial_setup(); a
 * mismatch panics.
 */
3686 #ifdef INVARIANTS
3687 struct cache_fpl_debug {
3688         size_t ni_pathlen;
3689 };
3690 #endif
3691
/*
 * Working state of a single fast path lookup.
 */
3692 struct cache_fpl {
             /* The lookup request being served. */
3693         struct nameidata *ndp;
             /* Component name state (presumably &ndp->ni_cnd -- confirm). */
3694         struct componentname *cnp;
             /*
              * Used together with cn_nameptr to recompute ni_pathlen;
              * presumably points at the terminating NUL of the path -- confirm.
              */
3695         char *nulchar;
             /* Dereferenced in cache_fplookup_partial_setup() to take a hold. */
3696         struct pwd **pwd;
             /* Directory vnode and target vnode currently being worked on. */
3697         struct vnode *dvp;
3698         struct vnode *tvp;
             /* Sequence counter snapshots, validated with vn_seqc_consistent(). */
3699         seqc_t dvp_seqc;
3700         seqc_t tvp_seqc;
             /* Checkpoints used by the restore helpers. */
3701         struct nameidata_saved snd;
3702         struct nameidata_outer snd_outer;
             /* Source line of the most recent status transition. */
3703         int line;
3704         enum cache_fpl_status status:8;
             /* Kept in sync with the SMR section by the cache_fpl_smr_*() macros. */
3705         bool in_smr;
             /* Filled in from fgetvp_lookup_smr() by cache_fplookup_dirfd(). */
3706         bool fsearch;
             /* Set from SAVENAME/SAVESTART handling; cleared when an error is handled. */
3707         bool savename;
3708 #ifdef INVARIANTS
3709         struct cache_fpl_debug debug;
3710 #endif
3711 };
3712
/*
 * Forward declarations of helpers which are used before their definitions.
 */
3713 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
3714 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
3715 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
3716 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
3717 static int cache_fplookup_preparse(struct cache_fpl *fpl);
3718 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
3719 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
3720 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
3721 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
3722
3723 static void
3724 cache_fpl_cleanup_cnp(struct componentname *cnp)
3725 {
3726
3727         uma_zfree(namei_zone, cnp->cn_pnbuf);
3728 #ifdef DIAGNOSTIC
3729         cnp->cn_pnbuf = NULL;
3730         cnp->cn_nameptr = NULL;
3731 #endif
3732 }
3733
3734 static struct vnode *
3735 cache_fpl_handle_root(struct cache_fpl *fpl)
3736 {
3737         struct nameidata *ndp;
3738         struct componentname *cnp;
3739
3740         ndp = fpl->ndp;
3741         cnp = fpl->cnp;
3742
3743         MPASS(*(cnp->cn_nameptr) == '/');
3744         cnp->cn_nameptr++;
3745         cache_fpl_pathlen_dec(fpl);
3746
3747         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
3748                 do {
3749                         cnp->cn_nameptr++;
3750                         cache_fpl_pathlen_dec(fpl);
3751                 } while (*(cnp->cn_nameptr) == '/');
3752         }
3753
3754         return (ndp->ni_rootdir);
3755 }
3756
3757 static void
3758 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
3759 {
3760
3761         fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
3762         fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
3763 }
3764
/*
 * Record the current parsing position.  This only does anything on
 * INVARIANTS kernels, where the saved values back consistency checks.
 */
static void
cache_fpl_checkpoint(struct cache_fpl *fpl)
{

#ifdef INVARIANTS
	fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
	fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
#endif
}
3774
3775 static void
3776 cache_fpl_restore_partial(struct cache_fpl *fpl)
3777 {
3778
3779         fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
3780 #ifdef INVARIANTS
3781         fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
3782 #endif
3783 }
3784
3785 static void
3786 cache_fpl_restore_abort(struct cache_fpl *fpl)
3787 {
3788
3789         cache_fpl_restore_partial(fpl);
3790         /*
3791          * It is 0 on entry by API contract.
3792          */
3793         fpl->ndp->ni_resflags = 0;
3794         fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
3795         fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
3796 }
3797
/*
 * Debug-only assertions.  With INVARIANTS the SMR helpers verify that the
 * in_smr flag and the actual SMR state agree; without it all three compile
 * to nothing.
 */
3798 #ifdef INVARIANTS
/* Assert that the lookup is inside the SMR section. */
3799 #define cache_fpl_smr_assert_entered(fpl) ({                    \
3800         struct cache_fpl *_fpl = (fpl);                         \
3801         MPASS(_fpl->in_smr == true);                            \
3802         VFS_SMR_ASSERT_ENTERED();                               \
3803 })
/* Assert that the lookup is outside of the SMR section. */
3804 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
3805         struct cache_fpl *_fpl = (fpl);                         \
3806         MPASS(_fpl->in_smr == false);                           \
3807         VFS_SMR_ASSERT_NOT_ENTERED();                           \
3808 })
/*
 * Trip an assertion if the status is still CACHE_FPL_STATUS_UNSET; every
 * other listed status is accepted.
 */
3809 static void
3810 cache_fpl_assert_status(struct cache_fpl *fpl)
3811 {
3812
3813         switch (fpl->status) {
3814         case CACHE_FPL_STATUS_UNSET:
3815                 __assert_unreachable();
3816                 break;
3817         case CACHE_FPL_STATUS_DESTROYED:
3818         case CACHE_FPL_STATUS_ABORTED:
3819         case CACHE_FPL_STATUS_PARTIAL:
3820         case CACHE_FPL_STATUS_HANDLED:
3821                 break;
3822         }
3823 }
3824 #else
3825 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
3826 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
3827 #define cache_fpl_assert_status(fpl) do { } while (0)
3828 #endif
3829
/*
 * Wrappers around vfs_smr_enter()/vfs_smr_exit() which keep fpl->in_smr in
 * sync with the SMR state.
 *
 * This variant does not look at the previous value of in_smr.
 */
3830 #define cache_fpl_smr_enter_initial(fpl) ({                     \
3831         struct cache_fpl *_fpl = (fpl);                         \
3832         vfs_smr_enter();                                        \
3833         _fpl->in_smr = true;                                    \
3834 })
3835
/* Enter the SMR section; asserts that the lookup was outside of it. */
3836 #define cache_fpl_smr_enter(fpl) ({                             \
3837         struct cache_fpl *_fpl = (fpl);                         \
3838         MPASS(_fpl->in_smr == false);                           \
3839         vfs_smr_enter();                                        \
3840         _fpl->in_smr = true;                                    \
3841 })
3842
/* Leave the SMR section; asserts that the lookup was inside of it. */
3843 #define cache_fpl_smr_exit(fpl) ({                              \
3844         struct cache_fpl *_fpl = (fpl);                         \
3845         MPASS(_fpl->in_smr == true);                            \
3846         vfs_smr_exit();                                         \
3847         _fpl->in_smr = false;                                   \
3848 })
3849
3850 static int
3851 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
3852 {
3853
3854         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3855                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3856                     ("%s: converting to abort from %d at %d, set at %d\n",
3857                     __func__, fpl->status, line, fpl->line));
3858         }
3859         cache_fpl_smr_assert_not_entered(fpl);
3860         fpl->status = CACHE_FPL_STATUS_ABORTED;
3861         fpl->line = line;
3862         return (CACHE_FPL_FAILED);
3863 }
3864
3865 #define cache_fpl_aborted_early(x)      cache_fpl_aborted_early_impl((x), __LINE__)
3866
3867 static int __noinline
3868 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3869 {
3870         struct nameidata *ndp;
3871         struct componentname *cnp;
3872
3873         ndp = fpl->ndp;
3874         cnp = fpl->cnp;
3875
3876         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3877                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3878                     ("%s: converting to abort from %d at %d, set at %d\n",
3879                     __func__, fpl->status, line, fpl->line));
3880         }
3881         fpl->status = CACHE_FPL_STATUS_ABORTED;
3882         fpl->line = line;
3883         if (fpl->in_smr)
3884                 cache_fpl_smr_exit(fpl);
3885         cache_fpl_restore_abort(fpl);
3886         /*
3887          * Resolving symlinks overwrites data passed by the caller.
3888          * Let namei know.
3889          */
3890         if (ndp->ni_loopcnt > 0) {
3891                 fpl->status = CACHE_FPL_STATUS_DESTROYED;
3892                 cache_fpl_cleanup_cnp(cnp);
3893         }
3894         return (CACHE_FPL_FAILED);
3895 }
3896
3897 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3898
3899 static int __noinline
3900 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3901 {
3902
3903         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3904             ("%s: setting to partial at %d, but already set to %d at %d\n",
3905             __func__, line, fpl->status, fpl->line));
3906         cache_fpl_smr_assert_entered(fpl);
3907         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3908         fpl->line = line;
3909         return (cache_fplookup_partial_setup(fpl));
3910 }
3911
3912 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3913
3914 static int
3915 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
3916 {
3917
3918         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3919             ("%s: setting to handled at %d, but already set to %d at %d\n",
3920             __func__, line, fpl->status, fpl->line));
3921         cache_fpl_smr_assert_not_entered(fpl);
3922         fpl->status = CACHE_FPL_STATUS_HANDLED;
3923         fpl->line = line;
3924         return (0);
3925 }
3926
3927 #define cache_fpl_handled(x)    cache_fpl_handled_impl((x), __LINE__)
3928
3929 static int
3930 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
3931 {
3932
3933         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3934             ("%s: setting to handled at %d, but already set to %d at %d\n",
3935             __func__, line, fpl->status, fpl->line));
3936         MPASS(error != 0);
3937         MPASS(error != CACHE_FPL_FAILED);
3938         cache_fpl_smr_assert_not_entered(fpl);
3939         fpl->status = CACHE_FPL_STATUS_HANDLED;
3940         fpl->line = line;
3941         fpl->dvp = NULL;
3942         fpl->tvp = NULL;
3943         fpl->savename = false;
3944         return (error);
3945 }
3946
3947 #define cache_fpl_handled_error(x, e)   cache_fpl_handled_error_impl((x), (e), __LINE__)
3948
3949 static bool
3950 cache_fpl_terminated(struct cache_fpl *fpl)
3951 {
3952
3953         return (fpl->status != CACHE_FPL_STATUS_UNSET);
3954 }
3955
/*
 * Component name flags a request may carry and still be eligible for the
 * fast path; cache_can_fplookup() bails out if any other bit is set.
 */
3956 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3957         (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
3958          FAILIFEXISTS | FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | \
3959          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3960
/*
 * Flags the fast path code sets on its own while handing a lookup over
 * (see cache_fplookup_partial_setup()).
 */
3961 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3962         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3963
/* The two sets must stay disjoint. */
3964 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3965     "supported and internal flags overlap");
3966
3967 static bool
3968 cache_fpl_islastcn(struct nameidata *ndp)
3969 {
3970
3971         return (*ndp->ni_next == 0);
3972 }
3973
3974 static bool
3975 cache_fpl_isdotdot(struct componentname *cnp)
3976 {
3977
3978         if (cnp->cn_namelen == 2 &&
3979             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3980                 return (true);
3981         return (false);
3982 }
3983
3984 static bool
3985 cache_can_fplookup(struct cache_fpl *fpl)
3986 {
3987         struct nameidata *ndp;
3988         struct componentname *cnp;
3989         struct thread *td;
3990
3991         ndp = fpl->ndp;
3992         cnp = fpl->cnp;
3993         td = cnp->cn_thread;
3994
3995         if (!atomic_load_char(&cache_fast_lookup_enabled)) {
3996                 cache_fpl_aborted_early(fpl);
3997                 return (false);
3998         }
3999         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
4000                 cache_fpl_aborted_early(fpl);
4001                 return (false);
4002         }
4003         if (IN_CAPABILITY_MODE(td)) {
4004                 cache_fpl_aborted_early(fpl);
4005                 return (false);
4006         }
4007         if (AUDITING_TD(td)) {
4008                 cache_fpl_aborted_early(fpl);
4009                 return (false);
4010         }
4011         if (ndp->ni_startdir != NULL) {
4012                 cache_fpl_aborted_early(fpl);
4013                 return (false);
4014         }
4015         return (true);
4016 }
4017
4018 static int
4019 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
4020 {
4021         struct nameidata *ndp;
4022         int error;
4023         bool fsearch;
4024
4025         ndp = fpl->ndp;
4026         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
4027         if (__predict_false(error != 0)) {
4028                 return (cache_fpl_aborted(fpl));
4029         }
4030         fpl->fsearch = fsearch;
4031         return (0);
4032 }
4033
4034 static int __noinline
4035 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
4036     uint32_t hash)
4037 {
4038         struct componentname *cnp;
4039         struct vnode *dvp;
4040
4041         cnp = fpl->cnp;
4042         dvp = fpl->dvp;
4043
4044         cache_fpl_smr_exit(fpl);
4045         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
4046                 return (cache_fpl_handled_error(fpl, ENOENT));
4047         else
4048                 return (cache_fpl_aborted(fpl));
4049 }
4050
4051 /*
4052  * The target vnode is not supported, prepare for the slow path to take over.
4053  */
4054 static int __noinline
4055 cache_fplookup_partial_setup(struct cache_fpl *fpl)
4056 {
4057         struct nameidata *ndp;
4058         struct componentname *cnp;
4059         enum vgetstate dvs;
4060         struct vnode *dvp;
4061         struct pwd *pwd;
4062         seqc_t dvp_seqc;
4063
4064         ndp = fpl->ndp;
4065         cnp = fpl->cnp;
4066         pwd = *(fpl->pwd);
4067         dvp = fpl->dvp;
4068         dvp_seqc = fpl->dvp_seqc;
4069
4070         if (!pwd_hold_smr(pwd)) {
4071                 return (cache_fpl_aborted(fpl));
4072         }
4073
4074         /*
4075          * Note that seqc is checked before the vnode is locked, so by
4076          * the time regular lookup gets to it it may have moved.
4077          *
4078          * Ultimately this does not affect correctness, any lookup errors
4079          * are userspace racing with itself. It is guaranteed that any
4080          * path which ultimately gets found could also have been found
4081          * by regular lookup going all the way in absence of concurrent
4082          * modifications.
4083          */
4084         dvs = vget_prep_smr(dvp);
4085         cache_fpl_smr_exit(fpl);
4086         if (__predict_false(dvs == VGET_NONE)) {
4087                 pwd_drop(pwd);
4088                 return (cache_fpl_aborted(fpl));
4089         }
4090
4091         vget_finish_ref(dvp, dvs);
4092         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4093                 vrele(dvp);
4094                 pwd_drop(pwd);
4095                 return (cache_fpl_aborted(fpl));
4096         }
4097
4098         cache_fpl_restore_partial(fpl);
4099 #ifdef INVARIANTS
4100         if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
4101                 panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
4102                     cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
4103         }
4104 #endif
4105
4106         ndp->ni_startdir = dvp;
4107         cnp->cn_flags |= MAKEENTRY;
4108         if (cache_fpl_islastcn(ndp))
4109                 cnp->cn_flags |= ISLASTCN;
4110         if (cache_fpl_isdotdot(cnp))
4111                 cnp->cn_flags |= ISDOTDOT;
4112
4113         /*
4114          * Skip potential extra slashes parsing did not take care of.
4115          * cache_fplookup_skip_slashes explains the mechanism.
4116          */
4117         if (__predict_false(*(cnp->cn_nameptr) == '/')) {
4118                 do {
4119                         cnp->cn_nameptr++;
4120                         cache_fpl_pathlen_dec(fpl);
4121                 } while (*(cnp->cn_nameptr) == '/');
4122         }
4123
4124         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
4125 #ifdef INVARIANTS
4126         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4127                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4128                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4129                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4130         }
4131 #endif
4132         return (0);
4133 }
4134
4135 static int
4136 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
4137 {
4138         struct componentname *cnp;
4139         struct vnode *tvp;
4140         seqc_t tvp_seqc;
4141         int error, lkflags;
4142
4143         cnp = fpl->cnp;
4144         tvp = fpl->tvp;
4145         tvp_seqc = fpl->tvp_seqc;
4146
4147         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4148                 lkflags = LK_SHARED;
4149                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4150                         lkflags = LK_EXCLUSIVE;
4151                 error = vget_finish(tvp, lkflags, tvs);
4152                 if (__predict_false(error != 0)) {
4153                         return (cache_fpl_aborted(fpl));
4154                 }
4155         } else {
4156                 vget_finish_ref(tvp, tvs);
4157         }
4158
4159         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
4160                 if ((cnp->cn_flags & LOCKLEAF) != 0)
4161                         vput(tvp);
4162                 else
4163                         vrele(tvp);
4164                 return (cache_fpl_aborted(fpl));
4165         }
4166
4167         return (cache_fpl_handled(fpl));
4168 }
4169
4170 /*
4171  * They want to possibly modify the state of the namecache.
      *
      * Handles the last path component of a CREATE, DELETE or RENAME lookup.
      * The parent directory is referenced and exclusively locked, VOP_LOOKUP
      * is called on it and the resulting locks are adjusted according to
      * LOCKPARENT and LOCKLEAF.
      *
      * Returns through cache_fpl_handled(), cache_fpl_handled_error() or
      * cache_fpl_aborted(), which set the final status of the lookup.
4172  */
4173 static int __noinline
4174 cache_fplookup_final_modifying(struct cache_fpl *fpl)
4175 {
4176         struct nameidata *ndp;
4177         struct componentname *cnp;
4178         enum vgetstate dvs;
4179         struct vnode *dvp, *tvp;
4180         struct mount *mp;
4181         seqc_t dvp_seqc;
4182         int error;
4183         bool docache;
4184
4185         ndp = fpl->ndp;
4186         cnp = fpl->cnp;
4187         dvp = fpl->dvp;
4188         dvp_seqc = fpl->dvp_seqc;
4189
4190         MPASS(*(cnp->cn_nameptr) != '/');
4191         MPASS(cache_fpl_islastcn(ndp));
4192         if ((cnp->cn_flags & LOCKPARENT) == 0)
4193                 MPASS((cnp->cn_flags & WANTPARENT) != 0);
4194         MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
4195         MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
4196             cnp->cn_nameiop == RENAME);
4197         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4198         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4199
             /*
              * An entry may be requested (MAKEENTRY below) only when NOCACHE
              * is clear and the operation is neither DELETE nor RENAME.
              */
4200         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4201         if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
4202                 docache = false;
4203
4204         mp = atomic_load_ptr(&dvp->v_mount);
4205         if (__predict_false(mp == NULL)) {
4206                 return (cache_fpl_aborted(fpl));
4207         }
4208
4209         if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
4210                 cache_fpl_smr_exit(fpl);
4211                 /*
4212                  * Original code keeps not checking for CREATE which
4213                  * might be a bug. For now let the old lookup decide.
4214                  */
4215                 if (cnp->cn_nameiop == CREATE) {
4216                         return (cache_fpl_aborted(fpl));
4217                 }
4218                 return (cache_fpl_handled_error(fpl, EROFS));
4219         }
4220
             /* The target is already known to exist. */
4221         if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
4222                 cache_fpl_smr_exit(fpl);
4223                 return (cache_fpl_handled_error(fpl, EEXIST));
4224         }
4225
4226         /*
4227          * Secure access to dvp; check cache_fplookup_partial_setup for
4228          * reasoning.
4229          *
4230          * XXX At least UFS requires its lookup routine to be called for
4231          * the last path component, which leads to some level of complication
4232          * and inefficiency:
4233          * - the target routine always locks the target vnode, but our caller
4234          *   may not need it locked
4235          * - some of the VOP machinery asserts that the parent is locked, which
4236          *   once more may be not required
4237          *
4238          * TODO: add a flag for filesystems which don't need this.
4239          */
4240         dvs = vget_prep_smr(dvp);
4241         cache_fpl_smr_exit(fpl);
4242         if (__predict_false(dvs == VGET_NONE)) {
4243                 return (cache_fpl_aborted(fpl));
4244         }
4245
4246         vget_finish_ref(dvp, dvs);
4247         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4248                 vrele(dvp);
4249                 return (cache_fpl_aborted(fpl));
4250         }
4251
4252         error = vn_lock(dvp, LK_EXCLUSIVE);
4253         if (__predict_false(error != 0)) {
4254                 vrele(dvp);
4255                 return (cache_fpl_aborted(fpl));
4256         }
4257
             /*
              * From here on dvp is referenced and exclusively locked; every
              * exit path below either drops both with vput() or keeps them
              * as requested by the caller's flags.
              */
4258         tvp = NULL;
4259         cnp->cn_flags |= ISLASTCN;
4260         if (docache)
4261                 cnp->cn_flags |= MAKEENTRY;
4262         if (cache_fpl_isdotdot(cnp))
4263                 cnp->cn_flags |= ISDOTDOT;
4264         cnp->cn_lkflags = LK_EXCLUSIVE;
4265         error = VOP_LOOKUP(dvp, &tvp, cnp);
             /*
              * 0 and EJUSTRETURN continue below, ENOTDIR and ENOENT are the
              * final answer, anything else aborts the fast path.
              */
4266         switch (error) {
4267         case EJUSTRETURN:
4268         case 0:
4269                 break;
4270         case ENOTDIR:
4271         case ENOENT:
4272                 vput(dvp);
4273                 return (cache_fpl_handled_error(fpl, error));
4274         default:
4275                 vput(dvp);
4276                 return (cache_fpl_aborted(fpl));
4277         }
4278
4279         fpl->tvp = tvp;
4280         fpl->savename = (cnp->cn_flags & SAVENAME) != 0;
4281
             /* No target vnode was returned; only the parent is dealt with. */
4282         if (tvp == NULL) {
4283                 if ((cnp->cn_flags & SAVESTART) != 0) {
4284                         ndp->ni_startdir = dvp;
4285                         vrefact(ndp->ni_startdir);
4286                         cnp->cn_flags |= SAVENAME;
4287                         fpl->savename = true;
4288                 }
4289                 MPASS(error == EJUSTRETURN);
4290                 if ((cnp->cn_flags & LOCKPARENT) == 0) {
4291                         VOP_UNLOCK(dvp);
4292                 }
4293                 return (cache_fpl_handled(fpl));
4294         }
4295
4296         /*
4297          * There are very hairy corner cases concerning various flag combinations
4298          * and locking state. In particular here we only hold one lock instead of
4299          * two.
4300          *
4301          * Skip the complexity as it is of no significance for normal workloads.
4302          */
4303         if (__predict_false(tvp == dvp)) {
4304                 vput(dvp);
4305                 vrele(tvp);
4306                 return (cache_fpl_aborted(fpl));
4307         }
4308
4309         /*
4310          * If they want the symlink itself we are fine, but if they want to
4311          * follow it regular lookup has to be engaged.
4312          */
4313         if (tvp->v_type == VLNK) {
4314                 if ((cnp->cn_flags & FOLLOW) != 0) {
4315                         vput(dvp);
4316                         vput(tvp);
4317                         return (cache_fpl_aborted(fpl));
4318                 }
4319         }
4320
4321         /*
4322          * Since we expect this to be the terminal vnode it should almost never
4323          * be a mount point.
4324          */
4325         if (__predict_false(cache_fplookup_is_mp(fpl))) {
4326                 vput(dvp);
4327                 vput(tvp);
4328                 return (cache_fpl_aborted(fpl));
4329         }
4330
4331         if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
4332                 vput(dvp);
4333                 vput(tvp);
4334                 return (cache_fpl_handled_error(fpl, EEXIST));
4335         }
4336
             /* Drop the locks the caller did not ask for; references stay. */
4337         if ((cnp->cn_flags & LOCKLEAF) == 0) {
4338                 VOP_UNLOCK(tvp);
4339         }
4340
4341         if ((cnp->cn_flags & LOCKPARENT) == 0) {
4342                 VOP_UNLOCK(dvp);
4343         }
4344
4345         if ((cnp->cn_flags & SAVESTART) != 0) {
4346                 ndp->ni_startdir = dvp;
4347                 vrefact(ndp->ni_startdir);
4348                 cnp->cn_flags |= SAVENAME;
4349                 fpl->savename = true;
4350         }
4351
4352         return (cache_fpl_handled(fpl));
4353 }
4354
4355 static int __noinline
4356 cache_fplookup_modifying(struct cache_fpl *fpl)
4357 {
4358         struct nameidata *ndp;
4359
4360         ndp = fpl->ndp;
4361
4362         if (!cache_fpl_islastcn(ndp)) {
4363                 return (cache_fpl_partial(fpl));
4364         }
4365         return (cache_fplookup_final_modifying(fpl));
4366 }
4367
4368 static int __noinline
4369 cache_fplookup_final_withparent(struct cache_fpl *fpl)
4370 {
4371         struct componentname *cnp;
4372         enum vgetstate dvs, tvs;
4373         struct vnode *dvp, *tvp;
4374         seqc_t dvp_seqc;
4375         int error;
4376
4377         cnp = fpl->cnp;
4378         dvp = fpl->dvp;
4379         dvp_seqc = fpl->dvp_seqc;
4380         tvp = fpl->tvp;
4381
4382         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
4383
4384         /*
4385          * This is less efficient than it can be for simplicity.
4386          */
4387         dvs = vget_prep_smr(dvp);
4388         if (__predict_false(dvs == VGET_NONE)) {
4389                 return (cache_fpl_aborted(fpl));
4390         }
4391         tvs = vget_prep_smr(tvp);
4392         if (__predict_false(tvs == VGET_NONE)) {
4393                 cache_fpl_smr_exit(fpl);
4394                 vget_abort(dvp, dvs);
4395                 return (cache_fpl_aborted(fpl));
4396         }
4397
4398         cache_fpl_smr_exit(fpl);
4399
4400         if ((cnp->cn_flags & LOCKPARENT) != 0) {
4401                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
4402                 if (__predict_false(error != 0)) {
4403                         vget_abort(tvp, tvs);
4404                         return (cache_fpl_aborted(fpl));
4405                 }
4406         } else {
4407                 vget_finish_ref(dvp, dvs);
4408         }
4409
4410         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4411                 vget_abort(tvp, tvs);
4412                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4413                         vput(dvp);
4414                 else
4415                         vrele(dvp);
4416                 return (cache_fpl_aborted(fpl));
4417         }
4418
4419         error = cache_fplookup_final_child(fpl, tvs);
4420         if (__predict_false(error != 0)) {
4421                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
4422                 if ((cnp->cn_flags & LOCKPARENT) != 0)
4423                         vput(dvp);
4424                 else
4425                         vrele(dvp);
4426                 return (error);
4427         }
4428
4429         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
4430         return (0);
4431 }
4432
4433 static int
4434 cache_fplookup_final(struct cache_fpl *fpl)
4435 {
4436         struct componentname *cnp;
4437         enum vgetstate tvs;
4438         struct vnode *dvp, *tvp;
4439         seqc_t dvp_seqc;
4440
4441         cnp = fpl->cnp;
4442         dvp = fpl->dvp;
4443         dvp_seqc = fpl->dvp_seqc;
4444         tvp = fpl->tvp;
4445
4446         MPASS(*(cnp->cn_nameptr) != '/');
4447
4448         if (cnp->cn_nameiop != LOOKUP) {
4449                 return (cache_fplookup_final_modifying(fpl));
4450         }
4451
4452         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
4453                 return (cache_fplookup_final_withparent(fpl));
4454
4455         tvs = vget_prep_smr(tvp);
4456         if (__predict_false(tvs == VGET_NONE)) {
4457                 return (cache_fpl_partial(fpl));
4458         }
4459
4460         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4461                 cache_fpl_smr_exit(fpl);
4462                 vget_abort(tvp, tvs);
4463                 return (cache_fpl_aborted(fpl));
4464         }
4465
4466         cache_fpl_smr_exit(fpl);
4467         return (cache_fplookup_final_child(fpl, tvs));
4468 }
4469
4470 /*
4471  * Comment from locked lookup:
4472  * Check for degenerate name (e.g. / or "") which is a way of talking about a
4473  * directory, e.g. like "/." or ".".
4474  */
4475 static int __noinline
4476 cache_fplookup_degenerate(struct cache_fpl *fpl)
4477 {
4478         struct componentname *cnp;
4479         struct vnode *dvp;
4480         enum vgetstate dvs;
4481         int error, lkflags;
4482
4483         fpl->tvp = fpl->dvp;
4484         fpl->tvp_seqc = fpl->dvp_seqc;
4485
4486         cnp = fpl->cnp;
4487         dvp = fpl->dvp;
4488
4489         if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
4490                 cache_fpl_smr_exit(fpl);
4491                 return (cache_fpl_handled_error(fpl, EISDIR));
4492         }
4493
4494         MPASS((cnp->cn_flags & SAVESTART) == 0);
4495
4496         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
4497                 return (cache_fplookup_final_withparent(fpl));
4498         }
4499
4500         dvs = vget_prep_smr(dvp);
4501         cache_fpl_smr_exit(fpl);
4502         if (__predict_false(dvs == VGET_NONE)) {
4503                 return (cache_fpl_aborted(fpl));
4504         }
4505
4506         if ((cnp->cn_flags & LOCKLEAF) != 0) {
4507                 lkflags = LK_SHARED;
4508                 if ((cnp->cn_flags & LOCKSHARED) == 0)
4509                         lkflags = LK_EXCLUSIVE;
4510                 error = vget_finish(dvp, lkflags, dvs);
4511                 if (__predict_false(error != 0)) {
4512                         return (cache_fpl_aborted(fpl));
4513                 }
4514         } else {
4515                 vget_finish_ref(dvp, dvs);
4516         }
4517         return (cache_fpl_handled(fpl));
4518 }
4519
4520 static int __noinline
4521 cache_fplookup_noentry(struct cache_fpl *fpl)
4522 {
4523         struct nameidata *ndp;
4524         struct componentname *cnp;
4525         enum vgetstate dvs;
4526         struct vnode *dvp, *tvp;
4527         seqc_t dvp_seqc;
4528         int error;
4529         bool docache;
4530
4531         ndp = fpl->ndp;
4532         cnp = fpl->cnp;
4533         dvp = fpl->dvp;
4534         dvp_seqc = fpl->dvp_seqc;
4535
4536         MPASS(*(cnp->cn_nameptr) != '/');
4537         MPASS((cnp->cn_flags & MAKEENTRY) == 0);
4538         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
4539         MPASS(!cache_fpl_isdotdot(cnp));
4540
4541         /*
4542          * Hack: delayed name len checking.
4543          */
4544         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4545                 cache_fpl_smr_exit(fpl);
4546                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
4547         }
4548
4549         if (cnp->cn_nameiop != LOOKUP) {
4550                 fpl->tvp = NULL;
4551                 return (cache_fplookup_modifying(fpl));
4552         }
4553
4554         MPASS((cnp->cn_flags & SAVESTART) == 0);
4555
4556         /*
4557          * Only try to fill in the component if it is the last one,
4558          * otherwise not only there may be several to handle but the
4559          * walk may be complicated.
4560          */
4561         if (!cache_fpl_islastcn(ndp)) {
4562                 return (cache_fpl_partial(fpl));
4563         }
4564
4565         /*
4566          * Secure access to dvp; check cache_fplookup_partial_setup for
4567          * reasoning.
4568          */
4569         dvs = vget_prep_smr(dvp);
4570         cache_fpl_smr_exit(fpl);
4571         if (__predict_false(dvs == VGET_NONE)) {
4572                 return (cache_fpl_aborted(fpl));
4573         }
4574
4575         vget_finish_ref(dvp, dvs);
4576         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4577                 vrele(dvp);
4578                 return (cache_fpl_aborted(fpl));
4579         }
4580
4581         error = vn_lock(dvp, LK_SHARED);
4582         if (__predict_false(error != 0)) {
4583                 vrele(dvp);
4584                 return (cache_fpl_aborted(fpl));
4585         }
4586
4587         tvp = NULL;
4588         /*
4589          * TODO: provide variants which don't require locking either vnode.
4590          */
4591         cnp->cn_flags |= ISLASTCN;
4592         docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
4593         if (docache)
4594                 cnp->cn_flags |= MAKEENTRY;
4595         cnp->cn_lkflags = LK_SHARED;
4596         if ((cnp->cn_flags & LOCKSHARED) == 0) {
4597                 cnp->cn_lkflags = LK_EXCLUSIVE;
4598         }
4599         error = VOP_LOOKUP(dvp, &tvp, cnp);
4600         switch (error) {
4601         case EJUSTRETURN:
4602         case 0:
4603                 break;
4604         case ENOTDIR:
4605         case ENOENT:
4606                 vput(dvp);
4607                 return (cache_fpl_handled_error(fpl, error));
4608         default:
4609                 vput(dvp);
4610                 return (cache_fpl_aborted(fpl));
4611         }
4612
4613         fpl->tvp = tvp;
4614         if (!fpl->savename) {
4615                 MPASS((cnp->cn_flags & SAVENAME) == 0);
4616         }
4617
4618         if (tvp == NULL) {
4619                 MPASS(error == EJUSTRETURN);
4620                 if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4621                         vput(dvp);
4622                 } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4623                         VOP_UNLOCK(dvp);
4624                 }
4625                 return (cache_fpl_handled(fpl));
4626         }
4627
4628         if (tvp->v_type == VLNK) {
4629                 if ((cnp->cn_flags & FOLLOW) != 0) {
4630                         vput(dvp);
4631                         vput(tvp);
4632                         return (cache_fpl_aborted(fpl));
4633                 }
4634         }
4635
4636         if (__predict_false(cache_fplookup_is_mp(fpl))) {
4637                 vput(dvp);
4638                 vput(tvp);
4639                 return (cache_fpl_aborted(fpl));
4640         }
4641
4642         if ((cnp->cn_flags & LOCKLEAF) == 0) {
4643                 VOP_UNLOCK(tvp);
4644         }
4645
4646         if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
4647                 vput(dvp);
4648         } else if ((cnp->cn_flags & LOCKPARENT) == 0) {
4649                 VOP_UNLOCK(dvp);
4650         }
4651         return (cache_fpl_handled(fpl));
4652 }
4653
4654 static int __noinline
4655 cache_fplookup_dot(struct cache_fpl *fpl)
4656 {
4657         int error;
4658
4659         MPASS(!seqc_in_modify(fpl->dvp_seqc));
4660         /*
4661          * Just re-assign the value. seqc will be checked later for the first
4662          * non-dot path component in line and/or before deciding to return the
4663          * vnode.
4664          */
4665         fpl->tvp = fpl->dvp;
4666         fpl->tvp_seqc = fpl->dvp_seqc;
4667
4668         counter_u64_add(dothits, 1);
4669         SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
4670
4671         error = 0;
4672         if (cache_fplookup_is_mp(fpl)) {
4673                 error = cache_fplookup_cross_mount(fpl);
4674         }
4675         return (error);
4676 }
4677
4678 static int __noinline
4679 cache_fplookup_dotdot(struct cache_fpl *fpl)
4680 {
4681         struct nameidata *ndp;
4682         struct componentname *cnp;
4683         struct namecache *ncp;
4684         struct vnode *dvp;
4685         struct prison *pr;
4686         u_char nc_flag;
4687
4688         ndp = fpl->ndp;
4689         cnp = fpl->cnp;
4690         dvp = fpl->dvp;
4691
4692         MPASS(cache_fpl_isdotdot(cnp));
4693
4694         /*
4695          * XXX this is racy the same way regular lookup is
4696          */
4697         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
4698             pr = pr->pr_parent)
4699                 if (dvp == pr->pr_root)
4700                         break;
4701
4702         if (dvp == ndp->ni_rootdir ||
4703             dvp == ndp->ni_topdir ||
4704             dvp == rootvnode ||
4705             pr != NULL) {
4706                 fpl->tvp = dvp;
4707                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
4708                 if (seqc_in_modify(fpl->tvp_seqc)) {
4709                         return (cache_fpl_aborted(fpl));
4710                 }
4711                 return (0);
4712         }
4713
4714         if ((dvp->v_vflag & VV_ROOT) != 0) {
4715                 /*
4716                  * TODO
4717                  * The opposite of climb mount is needed here.
4718                  */
4719                 return (cache_fpl_partial(fpl));
4720         }
4721
4722         ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
4723         if (ncp == NULL) {
4724                 return (cache_fpl_aborted(fpl));
4725         }
4726
4727         nc_flag = atomic_load_char(&ncp->nc_flag);
4728         if ((nc_flag & NCF_ISDOTDOT) != 0) {
4729                 if ((nc_flag & NCF_NEGATIVE) != 0)
4730                         return (cache_fpl_aborted(fpl));
4731                 fpl->tvp = ncp->nc_vp;
4732         } else {
4733                 fpl->tvp = ncp->nc_dvp;
4734         }
4735
4736         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4737         if (seqc_in_modify(fpl->tvp_seqc)) {
4738                 return (cache_fpl_partial(fpl));
4739         }
4740
4741         /*
4742          * Acquire fence provided by vn_seqc_read_any above.
4743          */
4744         if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
4745                 return (cache_fpl_aborted(fpl));
4746         }
4747
4748         if (!cache_ncp_canuse(ncp)) {
4749                 return (cache_fpl_aborted(fpl));
4750         }
4751
4752         counter_u64_add(dotdothits, 1);
4753         return (0);
4754 }
4755
4756 static int __noinline
4757 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4758 {
4759         u_char nc_flag;
4760         bool neg_promote;
4761
4762         nc_flag = atomic_load_char(&ncp->nc_flag);
4763         MPASS((nc_flag & NCF_NEGATIVE) != 0);
4764         /*
4765          * If they want to create an entry we need to replace this one.
4766          */
4767         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4768                 fpl->tvp = NULL;
4769                 return (cache_fplookup_modifying(fpl));
4770         }
4771         neg_promote = cache_neg_hit_prep(ncp);
4772         if (!cache_fpl_neg_ncp_canuse(ncp)) {
4773                 cache_neg_hit_abort(ncp);
4774                 return (cache_fpl_partial(fpl));
4775         }
4776         if (neg_promote) {
4777                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4778         }
4779         cache_neg_hit_finish(ncp);
4780         cache_fpl_smr_exit(fpl);
4781         return (cache_fpl_handled_error(fpl, ENOENT));
4782 }
4783
4784 /*
4785  * Resolve a symlink. Called by filesystem-specific routines.
4786  *
4787  * Code flow is:
4788  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
4789  */
4790 int
4791 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
4792 {
4793         struct nameidata *ndp;
4794         struct componentname *cnp;
4795
4796         ndp = fpl->ndp;
4797         cnp = fpl->cnp;
4798
4799         if (__predict_false(len == 0)) {
4800                 return (ENOENT);
4801         }
4802
4803         ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
4804 #ifdef INVARIANTS
4805         if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
4806                 panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
4807                     __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
4808                     cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
4809         }
4810 #endif
4811
4812         if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
4813                 return (ENAMETOOLONG);
4814         }
4815
4816         if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
4817                 return (ELOOP);
4818         }
4819
4820         if (ndp->ni_pathlen > 1) {
4821                 bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
4822         } else {
4823                 cnp->cn_pnbuf[len] = '\0';
4824         }
4825         bcopy(string, cnp->cn_pnbuf, len);
4826
4827         ndp->ni_pathlen += len;
4828         cache_fpl_pathlen_add(fpl, len);
4829         cnp->cn_nameptr = cnp->cn_pnbuf;
4830         fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
4831
4832         return (0);
4833 }
4834
4835 static int __noinline
4836 cache_fplookup_symlink(struct cache_fpl *fpl)
4837 {
4838         struct mount *mp;
4839         struct nameidata *ndp;
4840         struct componentname *cnp;
4841         struct vnode *dvp, *tvp;
4842         int error;
4843
4844         ndp = fpl->ndp;
4845         cnp = fpl->cnp;
4846         dvp = fpl->dvp;
4847         tvp = fpl->tvp;
4848
4849         if (cache_fpl_islastcn(ndp)) {
4850                 if ((cnp->cn_flags & FOLLOW) == 0) {
4851                         return (cache_fplookup_final(fpl));
4852                 }
4853         }
4854
4855         mp = atomic_load_ptr(&dvp->v_mount);
4856         if (__predict_false(mp == NULL)) {
4857                 return (cache_fpl_aborted(fpl));
4858         }
4859
4860         /*
4861          * Note this check races against setting the flag just like regular
4862          * lookup.
4863          */
4864         if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
4865                 cache_fpl_smr_exit(fpl);
4866                 return (cache_fpl_handled_error(fpl, EACCES));
4867         }
4868
4869         error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
4870         if (__predict_false(error != 0)) {
4871                 switch (error) {
4872                 case EAGAIN:
4873                         return (cache_fpl_partial(fpl));
4874                 case ENOENT:
4875                 case ENAMETOOLONG:
4876                 case ELOOP:
4877                         cache_fpl_smr_exit(fpl);
4878                         return (cache_fpl_handled_error(fpl, error));
4879                 default:
4880                         return (cache_fpl_aborted(fpl));
4881                 }
4882         }
4883
4884         if (*(cnp->cn_nameptr) == '/') {
4885                 fpl->dvp = cache_fpl_handle_root(fpl);
4886                 fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4887                 if (seqc_in_modify(fpl->dvp_seqc)) {
4888                         return (cache_fpl_aborted(fpl));
4889                 }
4890         }
4891
4892         return (cache_fplookup_preparse(fpl));
4893 }
4894
4895 static int
4896 cache_fplookup_next(struct cache_fpl *fpl)
4897 {
4898         struct componentname *cnp;
4899         struct namecache *ncp;
4900         struct vnode *dvp, *tvp;
4901         u_char nc_flag;
4902         uint32_t hash;
4903         int error;
4904
4905         cnp = fpl->cnp;
4906         dvp = fpl->dvp;
4907
4908         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
4909                 if (cnp->cn_namelen == 1) {
4910                         return (cache_fplookup_dot(fpl));
4911                 }
4912                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
4913                         return (cache_fplookup_dotdot(fpl));
4914                 }
4915         }
4916
4917         MPASS(!cache_fpl_isdotdot(cnp));
4918
4919         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4920
4921         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4922                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4923                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4924                         break;
4925         }
4926
4927         if (__predict_false(ncp == NULL)) {
4928                 if (cnp->cn_nameptr[0] == '/') {
4929                         return (cache_fplookup_skip_slashes(fpl));
4930                 }
4931                 return (cache_fplookup_noentry(fpl));
4932         }
4933
4934         tvp = atomic_load_ptr(&ncp->nc_vp);
4935         nc_flag = atomic_load_char(&ncp->nc_flag);
4936         if ((nc_flag & NCF_NEGATIVE) != 0) {
4937                 return (cache_fplookup_neg(fpl, ncp, hash));
4938         }
4939
4940         if (!cache_ncp_canuse(ncp)) {
4941                 return (cache_fpl_partial(fpl));
4942         }
4943
4944         fpl->tvp = tvp;
4945         fpl->tvp_seqc = vn_seqc_read_any(tvp);
4946         if (seqc_in_modify(fpl->tvp_seqc)) {
4947                 return (cache_fpl_partial(fpl));
4948         }
4949
4950         counter_u64_add(numposhits, 1);
4951         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4952
4953         error = 0;
4954         if (cache_fplookup_is_mp(fpl)) {
4955                 error = cache_fplookup_cross_mount(fpl);
4956         }
4957         return (error);
4958 }
4959
4960 static bool
4961 cache_fplookup_mp_supported(struct mount *mp)
4962 {
4963
4964         MPASS(mp != NULL);
4965         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4966                 return (false);
4967         return (true);
4968 }
4969
4970 /*
4971  * Walk up the mount stack (if any).
4972  *
4973  * Correctness is provided in the following ways:
4974  * - all vnodes are protected from freeing with SMR
4975  * - struct mount objects are type stable making them always safe to access
4976  * - stability of the particular mount is provided by busying it
4977  * - relationship between the vnode which is mounted on and the mount is
4978  *   verified with the vnode sequence counter after busying
4979  * - association between root vnode of the mount and the mount is protected
4980  *   by busy
4981  *
4982  * From that point on we can read the sequence counter of the root vnode
4983  * and get the next mount on the stack (if any) using the same protection.
4984  *
4985  * By the end of successful walk we are guaranteed the reached state was
4986  * indeed present at least at some point which matches the regular lookup.
4987  */
4988 static int __noinline
4989 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4990 {
4991         struct mount *mp, *prev_mp;
4992         struct mount_pcpu *mpcpu, *prev_mpcpu;
4993         struct vnode *vp;
4994         seqc_t vp_seqc;
4995
4996         vp = fpl->tvp;
4997         vp_seqc = fpl->tvp_seqc;
4998
4999         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5000         mp = atomic_load_ptr(&vp->v_mountedhere);
5001         if (__predict_false(mp == NULL)) {
5002                 return (0);
5003         }
5004
5005         prev_mp = NULL;
5006         for (;;) {
5007                 if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5008                         if (prev_mp != NULL)
5009                                 vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5010                         return (cache_fpl_partial(fpl));
5011                 }
5012                 if (prev_mp != NULL)
5013                         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5014                 if (!vn_seqc_consistent(vp, vp_seqc)) {
5015                         vfs_op_thread_exit_crit(mp, mpcpu);
5016                         return (cache_fpl_partial(fpl));
5017                 }
5018                 if (!cache_fplookup_mp_supported(mp)) {
5019                         vfs_op_thread_exit_crit(mp, mpcpu);
5020                         return (cache_fpl_partial(fpl));
5021                 }
5022                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
5023                 if (vp == NULL) {
5024                         vfs_op_thread_exit_crit(mp, mpcpu);
5025                         return (cache_fpl_partial(fpl));
5026                 }
5027                 vp_seqc = vn_seqc_read_any(vp);
5028                 if (seqc_in_modify(vp_seqc)) {
5029                         vfs_op_thread_exit_crit(mp, mpcpu);
5030                         return (cache_fpl_partial(fpl));
5031                 }
5032                 prev_mp = mp;
5033                 prev_mpcpu = mpcpu;
5034                 mp = atomic_load_ptr(&vp->v_mountedhere);
5035                 if (mp == NULL)
5036                         break;
5037         }
5038
5039         vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
5040         fpl->tvp = vp;
5041         fpl->tvp_seqc = vp_seqc;
5042         return (0);
5043 }
5044
5045 static int __noinline
5046 cache_fplookup_cross_mount(struct cache_fpl *fpl)
5047 {
5048         struct mount *mp;
5049         struct mount_pcpu *mpcpu;
5050         struct vnode *vp;
5051         seqc_t vp_seqc;
5052
5053         vp = fpl->tvp;
5054         vp_seqc = fpl->tvp_seqc;
5055
5056         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
5057         mp = atomic_load_ptr(&vp->v_mountedhere);
5058         if (__predict_false(mp == NULL)) {
5059                 return (0);
5060         }
5061
5062         if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
5063                 return (cache_fpl_partial(fpl));
5064         }
5065         if (!vn_seqc_consistent(vp, vp_seqc)) {
5066                 vfs_op_thread_exit_crit(mp, mpcpu);
5067                 return (cache_fpl_partial(fpl));
5068         }
5069         if (!cache_fplookup_mp_supported(mp)) {
5070                 vfs_op_thread_exit_crit(mp, mpcpu);
5071                 return (cache_fpl_partial(fpl));
5072         }
5073         vp = atomic_load_ptr(&mp->mnt_rootvnode);
5074         if (__predict_false(vp == NULL)) {
5075                 vfs_op_thread_exit_crit(mp, mpcpu);
5076                 return (cache_fpl_partial(fpl));
5077         }
5078         vp_seqc = vn_seqc_read_any(vp);
5079         vfs_op_thread_exit_crit(mp, mpcpu);
5080         if (seqc_in_modify(vp_seqc)) {
5081                 return (cache_fpl_partial(fpl));
5082         }
5083         mp = atomic_load_ptr(&vp->v_mountedhere);
5084         if (__predict_false(mp != NULL)) {
5085                 /*
5086                  * There are possibly more mount points on top.
5087                  * Normally this does not happen so for simplicity just start
5088                  * over.
5089                  */
5090                 return (cache_fplookup_climb_mount(fpl));
5091         }
5092
5093         fpl->tvp = vp;
5094         fpl->tvp_seqc = vp_seqc;
5095         return (0);
5096 }
5097
5098 /*
5099  * Check if a vnode is mounted on.
5100  */
5101 static bool
5102 cache_fplookup_is_mp(struct cache_fpl *fpl)
5103 {
5104         struct vnode *vp;
5105
5106         vp = fpl->tvp;
5107         return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0);
5108 }
5109
5110 /*
5111  * Parse the path.
5112  *
5113  * The code was originally copy-pasted from regular lookup and despite
5114  * clean ups leaves performance on the table. Any modifications here
5115  * must take into account that in case of fallback the resulting
5116  * nameidata state has to be compatible with the original.
5117  */
5118
5119 /*
5120  * Debug ni_pathlen tracking.
5121  */
5122 #ifdef INVARIANTS
5123 static void
5124 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5125 {
5126
5127         fpl->debug.ni_pathlen += n;
5128         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5129             ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5130 }
5131
5132 static void
5133 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5134 {
5135
5136         fpl->debug.ni_pathlen -= n;
5137         KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
5138             ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
5139 }
5140
5141 static void
5142 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5143 {
5144
5145         cache_fpl_pathlen_add(fpl, 1);
5146 }
5147
5148 static void
5149 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5150 {
5151
5152         cache_fpl_pathlen_sub(fpl, 1);
5153 }
5154 #else
5155 static void
5156 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
5157 {
5158 }
5159
5160 static void
5161 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
5162 {
5163 }
5164
5165 static void
5166 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
5167 {
5168 }
5169
5170 static void
5171 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
5172 {
5173 }
5174 #endif
5175
5176 static int __always_inline
5177 cache_fplookup_preparse(struct cache_fpl *fpl)
5178 {
5179         struct componentname *cnp;
5180
5181         cnp = fpl->cnp;
5182
5183         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
5184                 return (cache_fplookup_degenerate(fpl));
5185         }
5186
5187         /*
5188          * By this point the shortest possible pathname is one character + nul
5189          * terminator, hence 2.
5190          */
5191         KASSERT(fpl->debug.ni_pathlen >= 2, ("%s: pathlen %zu\n", __func__,
5192             fpl->debug.ni_pathlen));
5193         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 2] == fpl->nulchar - 1,
5194             ("%s: mismatch on string (%p != %p) [%s]\n", __func__,
5195             &cnp->cn_nameptr[fpl->debug.ni_pathlen - 2], fpl->nulchar - 1,
5196             cnp->cn_pnbuf));
5197         if (__predict_false(*(fpl->nulchar - 1) == '/')) {
5198                 /*
5199                  * TODO
5200                  * Regular lookup performs the following:
5201                  * *ndp->ni_next = '\0';
5202                  * cnp->cn_flags |= TRAILINGSLASH;
5203                  *
5204                  * Which is problematic since it modifies data read
5205                  * from userspace. Then if fast path lookup was to
5206                  * abort we would have to either restore it or convey
5207                  * the flag. Since this is a corner case just ignore
5208                  * it for simplicity.
5209                  */
5210                 return (cache_fpl_aborted(fpl));
5211         }
5212         return (0);
5213 }
5214
5215 static void
5216 cache_fplookup_parse(struct cache_fpl *fpl)
5217 {
5218         struct nameidata *ndp;
5219         struct componentname *cnp;
5220         char *cp;
5221
5222         ndp = fpl->ndp;
5223         cnp = fpl->cnp;
5224
5225         /*
5226          * Find the end of this path component, it is either / or nul.
5227          *
5228          * Store / as a temporary sentinel so that we only have one character
5229          * to test for. Pathnames tend to be short so this should not be
5230          * resulting in cache misses.
5231          */
5232         KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
5233             ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
5234             __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
5235             fpl->nulchar, cnp->cn_pnbuf));
5236         KASSERT(*fpl->nulchar == '\0',
5237             ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
5238             cnp->cn_pnbuf));
5239         *fpl->nulchar = '/';
5240         for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
5241                 KASSERT(*cp != '\0',
5242                     ("%s: encountered unexpected nul; string [%s]\n", __func__,
5243                     cnp->cn_nameptr));
5244                 continue;
5245         }
5246         *fpl->nulchar = '\0';
5247
5248         cnp->cn_namelen = cp - cnp->cn_nameptr;
5249         cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
5250         /*
5251          * Hack: we have to check if the found path component's length exceeds
5252          * NAME_MAX. However, the condition is very rarely true and check can
5253          * be elided in the common case -- if an entry was found in the cache,
5254          * then it could not have been too long to begin with.
5255          */
5256         ndp->ni_next = cp;
5257
5258 #ifdef INVARIANTS
5259         /*
5260          * Code below is only here to assure compatibility with regular lookup.
5261          * It covers handling of trailing slashes and names like "/", both of
5262          * which can be taken care of upfront, which lockless lookup does
5263          * in cache_fplookup_preparse. Regular lookup performs these for each
5264          * path component.
5265          */
5266         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
5267                 cp++;
5268                 if (*cp == '\0') {
5269                         panic("%s: ran into TRAILINGSLASH handling from [%s]\n",
5270                             __func__, cnp->cn_pnbuf);
5271                 }
5272         }
5273
5274         if (cnp->cn_nameptr[0] == '\0') {
5275                 panic("%s: ran into degenerate name from [%s]\n", __func__, cnp->cn_pnbuf);
5276         }
5277 #endif
5278 }
5279
5280 static void
5281 cache_fplookup_parse_advance(struct cache_fpl *fpl)
5282 {
             /*
              * Move on to the next path component: resume from ndp->ni_next,
              * which is asserted to point at a '/', and step over that single
              * slash so that cn_nameptr ends up on the first character after it.
              */
5283         struct nameidata *ndp;
5284         struct componentname *cnp;
5285
5286         ndp = fpl->ndp;
5287         cnp = fpl->cnp;
5288
5289         cnp->cn_nameptr = ndp->ni_next;
5290         KASSERT(*(cnp->cn_nameptr) == '/',
5291             ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
5292             cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf));
             /* Skip the separator itself. */
5293         cnp->cn_nameptr++;
             /*
              * NOTE(review): presumably accounts for the one byte consumed above
              * in the remaining path length; the helper is not visible here.
              */
5294         cache_fpl_pathlen_dec(fpl);
5295 }
5296
5297 /*
5298  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
5299  *
5300  * Lockless lookup tries to elide checking for spurious slashes and should they
5301  * be present is guaranteed to fail to find an entry. In this case the caller
5302  * must check if the name starts with a slash and call this routine.  It is
5303  * going to fast forward across the spurious slashes and set the state up for
5304  * retry.
      *
      * On return cn_nameptr and ni_next both point at the last slash of the run,
      * tvp/tvp_seqc are copies of dvp/dvp_seqc, and the result is always 0.
5305  */
5306 static int __noinline
5307 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
5308 {
5309         struct nameidata *ndp;
5310         struct componentname *cnp;
5311
5312         ndp = fpl->ndp;
5313         cnp = fpl->cnp;
5314
             /* This routine expects to be entered with cn_nameptr on a slash. */
5315         MPASS(*(cnp->cn_nameptr) == '/');
             /*
              * Walk past every consecutive slash, calling cache_fpl_pathlen_dec
              * once per skipped character (presumably a one-byte adjustment of
              * the remaining path length; the helper is not visible here).
              */
5316         do {
5317                 cnp->cn_nameptr++;
5318                 cache_fpl_pathlen_dec(fpl);
5319         } while (*(cnp->cn_nameptr) == '/');
5320
5321         /*
5322          * Go back to one slash so that cache_fplookup_parse_advance has
5323          * something to skip.
              *
              * The loop above stopped on the first non-slash character, so
              * stepping back by one lands on the final slash of the run; the
              * path length adjustment is undone to match.
5324          */
5325         cnp->cn_nameptr--;
5326         cache_fpl_pathlen_inc(fpl);
5327
5328         /*
5329          * cache_fplookup_parse_advance starts from ndp->ni_next
5330          */
5331         ndp->ni_next = cnp->cn_nameptr;
5332
5333         /*
5334          * See cache_fplookup_dot.
              *
              * The target vnode is set to the current directory vnode together
              * with its sequence counter.
5335          */
5336         fpl->tvp = fpl->dvp;
5337         fpl->tvp_seqc = fpl->dvp_seqc;
5338
5339         return (0);
5340 }
5341
5342 /*
5343  * See the API contract for VOP_FPLOOKUP_VEXEC.
      *
      * Called by cache_fplookup_impl when VOP_FPLOOKUP_VEXEC returned a non-zero
      * error for the current directory vnode (fpl->dvp); 'error' is that result.
      * The checks below are performed in this order:
      * - component name longer than NAME_MAX: cache_fpl_smr_exit is called and
      *   ENAMETOOLONG is reported through cache_fpl_handled_error
      * - dvp is not a directory: the error is replaced with ENOTDIR
      * - fpl->fsearch is set: the lookup is aborted with cache_fpl_aborted
      * - EAGAIN: aborted if dvp's sequence counter no longer matches, otherwise
      *   cache_fpl_partial is called and EAGAIN itself is returned
      * - any other error: aborted if dvp's sequence counter no longer matches,
      *   otherwise cache_fpl_smr_exit and cache_fpl_handled_error are called and
      *   the error is returned
5344  */
5345 static int __noinline
5346 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
5347 {
5348         struct componentname *cnp;
5349         struct vnode *dvp;
5350         seqc_t dvp_seqc;
5351
5352         cnp = fpl->cnp;
5353         dvp = fpl->dvp;
5354         dvp_seqc = fpl->dvp_seqc;
5355
5356         /*
5357          * Hack: delayed name len checking.
              *
              * cache_fplookup_parse computes cn_namelen without comparing it
              * against NAME_MAX, so the comparison is done here.
5358          */
5359         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
5360                 cache_fpl_smr_exit(fpl);
5361                 return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
5362         }
5363
5364         /*
5365          * Hack: they may be looking up foo/bar, where foo is not a directory.
5366          * In such a case we need to return ENOTDIR, but we may happen to get
5367          * here with a different error.
              *
              * Because the substitution happens before the switch below, an
              * EAGAIN reported for a non-directory is handled by the default
              * case as ENOTDIR.
5368          */
5369         if (dvp->v_type != VDIR) {
5370                 error = ENOTDIR;
5371         }
5372
5373         /*
5374          * Hack: handle O_SEARCH.
5375          *
5376          * Open Group Base Specifications Issue 7, 2018 edition states:
5377          * <quote>
5378          * If the access mode of the open file description associated with the
5379          * file descriptor is not O_SEARCH, the function shall check whether
5380          * directory searches are permitted using the current permissions of
5381          * the directory underlying the file descriptor. If the access mode is
5382          * O_SEARCH, the function shall not perform the check.
5383          * </quote>
5384          *
5385          * Regular lookup tests for the NOEXECCHECK flag for every path
5386          * component to decide whether to do the permission check. However,
5387          * since most lookups never have the flag (and when they do it is only
5388          * present for the first path component), lockless lookup only acts on
5389          * it if there is a permission problem. Here the flag is represented
5390          * with a boolean so that we don't have to clear it on the way out.
5391          *
5392          * For simplicity this always aborts.
5393          * TODO: check if this is the first lookup and ignore the permission
5394          * problem. Note the flag has to survive fallback (if it happens to be
5395          * performed).
5396          */
5397         if (fpl->fsearch) {
5398                 return (cache_fpl_aborted(fpl));
5399         }
5400
             /*
              * In both cases the error is only acted upon if the directory's
              * sequence counter still matches the value sampled for it;
              * otherwise the lookup is aborted.
              */
5401         switch (error) {
5402         case EAGAIN:
5403                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5404                         error = cache_fpl_aborted(fpl);
5405                 } else {
                             /*
                              * The return value of cache_fpl_partial is not used;
                              * EAGAIN is what gets returned to the caller.
                              */
5406                         cache_fpl_partial(fpl);
5407                 }
5408                 break;
5409         default:
5410                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
5411                         error = cache_fpl_aborted(fpl);
5412                 } else {
                             /*
                              * The return value of cache_fpl_handled_error is
                              * not used; 'error' is returned as is.
                              */
5413                         cache_fpl_smr_exit(fpl);
5414                         cache_fpl_handled_error(fpl, error);
5415                 }
5416                 break;
5417         }
5418         return (error);
5419 }
5420
5421 static int
5422 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
5423 {
             /*
              * Main loop of the lockless lookup, starting at directory vnode dvp.
              *
              * Each iteration parses one path component, runs VOP_FPLOOKUP_VEXEC
              * on the current directory, finds the next vnode with
              * cache_fplookup_next and then either handles a symlink, finishes on
              * the last component, or makes the found vnode the new current
              * directory and advances to the next component.
              *
              * Returns the error produced by whichever step ended the lookup.
              */
5424         struct nameidata *ndp;
5425         struct componentname *cnp;
5426         struct mount *mp;
5427         int error;
5428
5429         ndp = fpl->ndp;
5430         cnp = fpl->cnp;
5431
5432         cache_fpl_checkpoint(fpl);
5433
5434         /*
5435          * The vnode at hand is almost always stable, skip checking for it.
5436          * Worst case this postpones the check towards the end of the iteration
5437          * of the main loop.
5438          */
5439         fpl->dvp = dvp;
5440         fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
5441
             /*
              * Abort if the starting vnode has no mount point or if
              * cache_fplookup_mp_supported rejects the mount point.
              */
5442         mp = atomic_load_ptr(&dvp->v_mount);
5443         if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
5444                 return (cache_fpl_aborted(fpl));
5445         }
5446
             /*
              * One-time path preprocessing; a comment in cache_fplookup_parse
              * states that trailing slashes and names like "/" are taken care of
              * there. The lookup may already be terminated at this point.
              */
5447         error = cache_fplookup_preparse(fpl);
5448         if (__predict_false(cache_fpl_terminated(fpl))) {
5449                 return (error);
5450         }
5451
5452         for (;;) {
                     /* Determine cn_namelen and ni_next for this component. */
5453                 cache_fplookup_parse(fpl);
5454
5455                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
5456                 if (__predict_false(error != 0)) {
5457                         error = cache_fplookup_failed_vexec(fpl, error);
5458                         break;
5459                 }
5460
5461                 error = cache_fplookup_next(fpl);
5462                 if (__predict_false(cache_fpl_terminated(fpl))) {
5463                         break;
5464                 }
5465
5466                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
5467
5468                 if (fpl->tvp->v_type == VLNK) {
5469                         error = cache_fplookup_symlink(fpl);
5470                         if (cache_fpl_terminated(fpl)) {
5471                                 break;
5472                         }
5473                 } else {
5474                         if (cache_fpl_islastcn(ndp)) {
5475                                 error = cache_fplookup_final(fpl);
5476                                 break;
5477                         }
5478
                             /*
                              * Validate the directory we are about to leave
                              * before the found vnode takes its place.
                              */
5479                         if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
5480                                 error = cache_fpl_aborted(fpl);
5481                                 break;
5482                         }
5483
                             /*
                              * The found vnode and the counter sampled for it
                              * become the current directory for the next
                              * iteration.
                              */
5484                         fpl->dvp = fpl->tvp;
5485                         fpl->dvp_seqc = fpl->tvp_seqc;
5486                         cache_fplookup_parse_advance(fpl);
5487                 }
5488
5489                 cache_fpl_checkpoint(fpl);
5490         }
5491
5492         return (error);
5493 }
5494
5495 /*
5496  * Fast path lookup protected with SMR and sequence counters.
5497  *
5498  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
5499  *
5500  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
5501  * outlined below.
5502  *
5503  * Traditional vnode lookup conceptually looks like this:
5504  *
5505  * vn_lock(current);
5506  * for (;;) {
5507  *      next = find();
5508  *      vn_lock(next);
5509  *      vn_unlock(current);
5510  *      current = next;
5511  *      if (last)
5512  *          break;
5513  * }
5514  * return (current);
5515  *
5516  * Each jump to the next vnode is safe memory-wise and atomic with respect to
5517  * any modifications thanks to holding respective locks.
5518  *
5519  * The same guarantee can be provided with a combination of safe memory
5520  * reclamation and sequence counters instead. If all operations which affect
5521  * the relationship between the current vnode and the one we are looking for
5522  * also modify the counter, we can verify whether all the conditions held as
5523  * we made the jump. This includes things like permissions, mount points etc.
5524  * Counter modification is provided by enclosing relevant places in
5525  * vn_seqc_write_begin()/end() calls.
5526  *
5527  * Thus this translates to:
5528  *
5529  * vfs_smr_enter();
5530  * dvp_seqc = seqc_read_any(dvp);
5531  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
5532  *     abort();
5533  * for (;;) {
5534  *      tvp = find();
5535  *      tvp_seqc = seqc_read_any(tvp);
5536  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
5537  *          abort();
5538  *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
5539  *          abort();
5540  *      dvp = tvp; // we know nothing of importance has changed
5541  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
5542  *      if (last)
5543  *          break;
5544  * }
5545  * vget(); // secure the vnode
5546  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
5547  *          abort();
5548  * // at this point we know nothing has changed for any parent<->child pair
5549  * // as they were crossed during the lookup, meaning we matched the guarantee
5550  * // of the locked variant
5551  * return (tvp);
5552  *
5553  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
5554  * - they are called while within vfs_smr protection which they must never exit
5555  * - EAGAIN can be returned to denote checking could not be performed, it is
5556  *   always valid to return it
5557  * - if the sequence counter has not changed the result must be valid
5558  * - if the sequence counter has changed both false positives and false negatives
5559  *   are permitted (since the result will be rejected later)
5560  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
5561  *
5562  * Caveats to watch out for:
5563  * - vnodes are passed unlocked and unreferenced with nothing stopping
5564  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
5565  *   to use atomic_load_ptr to fetch it.
5566  * - the aforementioned object can also get freed, meaning absent other means it
5567  *   should be protected with vfs_smr
5568  * - either safely checking permissions as they are modified or guaranteeing
5569  *   their stability is left to the routine
5570  */
5571 int
5572 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
5573     struct pwd **pwdp)
5574 {
             /*
              * Entry point of the lockless lookup for the path described by ndp.
              *
              * The resulting fpl.status is always stored in *status. If
              * cache_can_fplookup refuses the request, EOPNOTSUPP is returned
              * right away; otherwise *pwdp receives the pwd obtained from
              * pwd_get_smr() and the return value is the error produced by the
              * lookup. When the status is CACHE_FPL_STATUS_HANDLED, ndp->ni_dvp
              * and ndp->ni_vp are filled in from the lookup state.
              */
5575         struct cache_fpl fpl;
5576         struct pwd *pwd;
5577         struct vnode *dvp;
5578         struct componentname *cnp;
5579         int error;
5580
5581         fpl.status = CACHE_FPL_STATUS_UNSET;
5582         fpl.in_smr = false;
5583         fpl.ndp = ndp;
5584         fpl.cnp = cnp = &ndp->ni_cnd;
             /* Sanity checks on the state handed in by the caller. */
5585         MPASS(ndp->ni_lcf == 0);
5586         MPASS(curthread == cnp->cn_thread);
5587         KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
5588             ("%s: internal flags found in cn_flags %" PRIx64, __func__,
5589             cnp->cn_flags));
5590         if ((cnp->cn_flags & SAVESTART) != 0) {
5591                 MPASS(cnp->cn_nameiop != LOOKUP);
5592         }
5593         MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
5594
5595         if (__predict_false(!cache_can_fplookup(&fpl))) {
5596                 *status = fpl.status;
5597                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
5598                 return (EOPNOTSUPP);
5599         }
5600
5601         cache_fpl_checkpoint_outer(&fpl);
5602
5603         cache_fpl_smr_enter_initial(&fpl);
5604 #ifdef INVARIANTS
5605         fpl.debug.ni_pathlen = ndp->ni_pathlen;
5606 #endif
             /*
              * Address of the last byte covered by ni_pathlen;
              * cache_fplookup_parse asserts that it holds the terminating nul
              * and temporarily overwrites it with a '/' sentinel.
              */
5607         fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
5608         fpl.fsearch = false;
5609         fpl.savename = (cnp->cn_flags & SAVENAME) != 0;
5610         fpl.pwd = pwdp;
5611         pwd = pwd_get_smr();
5612         *(fpl.pwd) = pwd;
5613         ndp->ni_rootdir = pwd->pwd_rdir;
5614         ndp->ni_topdir = pwd->pwd_jdir;
5615
             /*
              * Pick the starting directory: cache_fpl_handle_root for a path
              * beginning with '/', the current directory for AT_FDCWD, and
              * whatever cache_fplookup_dirfd provides otherwise.
              */
5616         if (cnp->cn_pnbuf[0] == '/') {
5617                 dvp = cache_fpl_handle_root(&fpl);
5618                 MPASS(ndp->ni_resflags == 0);
5619                 ndp->ni_resflags = NIRES_ABS;
5620         } else {
5621                 if (ndp->ni_dirfd == AT_FDCWD) {
5622                         dvp = pwd->pwd_cdir;
5623                 } else {
5624                         error = cache_fplookup_dirfd(&fpl, &dvp);
5625                         if (__predict_false(error != 0)) {
                                     /* Skip the lookup proper. */
5626                                 goto out;
5627                         }
5628                 }
5629         }
5630
5631         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
5632         error = cache_fplookup_impl(dvp, &fpl);
5633 out:
5634         cache_fpl_smr_assert_not_entered(&fpl);
5635         cache_fpl_assert_status(&fpl);
5636         *status = fpl.status;
5637         if (SDT_PROBES_ENABLED()) {
5638                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
5639                 if (fpl.status == CACHE_FPL_STATUS_HANDLED)
5640                         SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
5641                             ndp);
5642         }
5643
             /*
              * Results are only exported to ndp when the lookup was fully
              * handled here. For a handled error the asserts below expect
              * both vnodes to be NULL and savename to be false.
              */
5644         if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
5645                 MPASS(error != CACHE_FPL_FAILED);
5646                 if (error != 0) {
5647                         MPASS(fpl.dvp == NULL);
5648                         MPASS(fpl.tvp == NULL);
5649                         MPASS(fpl.savename == false);
5650                 }
5651                 ndp->ni_dvp = fpl.dvp;
5652                 ndp->ni_vp = fpl.tvp;
                     /*
                      * With savename set the HASBUF flag is raised on cn_flags;
                      * otherwise cache_fpl_cleanup_cnp is called on cnp.
                      */
5653                 if (fpl.savename) {
5654                         cnp->cn_flags |= HASBUF;
5655                 } else {
5656                         cache_fpl_cleanup_cnp(cnp);
5657                 }
5658         }
5659         return (error);
5660 }