sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 SDT_PROVIDER_DECLARE(vfs);
  83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  84     "struct vnode *");
  85 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  86     "struct vnode *");
  87 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  88     "char *");
  89 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  90     "const char *");
  91 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  92     "struct namecache *", "int", "int");
  93 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  94 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  95     "char *", "struct vnode *");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  98     "struct vnode *", "char *");
  99 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 100     "struct vnode *");
 101 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 102     "struct vnode *", "char *");
 103 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 104     "char *");
 105 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 106     "struct componentname *");
 107 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 108     "struct componentname *");
 109 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 110 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 111 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 112 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 113     "struct vnode *");
 114 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 115     "char *");
 116 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 117     "char *");
 118
 119 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 120 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 121 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 122
 123 /*
 124  * This structure describes the elements in the cache of recent
 125  * names looked up by namei.
 126  */
 127 struct negstate {
 128         u_char neg_flag;
 129 };
 130 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 131     "the state must fit in a union with a pointer without growing it");
 132
 133 struct  namecache {
 134         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 135         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 136         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 137         struct  vnode *nc_dvp;          /* vnode of parent of name */
 138         union {
 139                 struct  vnode *nu_vp;   /* vnode the name refers to */
 140                 struct  negstate nu_neg;/* negative entry state */
 141         } n_un;
 142         u_char  nc_flag;                /* flag bits */
 143         u_char  nc_nlen;                /* length of name */
 144         char    nc_name[0];             /* segment name + nul */
 145 };
 146
 147 /*
 148  * struct namecache_ts repeats struct namecache layout up to the
 149  * nc_nlen member.
 150  * struct namecache_ts is used in place of struct namecache when time(s) need
 151  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 152  * both a non-dotdot directory name plus dotdot for the directory's
 153  * parent.
 154  *
 155  * See below for alignment requirement.
 156  */
 157 struct  namecache_ts {
 158         struct  timespec nc_time;       /* timespec provided by fs */
 159         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 160         int     nc_ticks;               /* ticks value when entry was added */
 161         struct namecache nc_nc;
 162 };
 163
 164 /*
 165  * At least mips n32 performs 64-bit accesses to timespec as found
 166  * in namecache_ts and requires them to be aligned. Other platforms may
 167  * have the same requirement, so accept the small cost and enforce the
 168  * alignment for everyone. Note this is a nop for 64-bit platforms.
 169  */
 170 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 171 #define CACHE_PATH_CUTOFF       39
 172
 173 #define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
 174 #define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
 175 #define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
 176 #define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)
 177
 178 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 179 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 180 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 181 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 182
 183 #define nc_vp           n_un.nu_vp
 184 #define nc_neg          n_un.nu_neg
 185
 186 /*
 187  * Flags in namecache.nc_flag
 188  */
 189 #define NCF_WHITE       0x01
 190 #define NCF_ISDOTDOT    0x02
 191 #define NCF_TS          0x04
 192 #define NCF_DTS         0x08
 193 #define NCF_DVDROP      0x10
 194 #define NCF_NEGATIVE    0x20
 195 #define NCF_INVALID     0x40
 196 #define NCF_WIP         0x80
 197
 198 /*
 199  * Flags in negstate.neg_flag
 200  */
 201 #define NEG_HOT         0x01
 202
 203 /*
 204  * Mark an entry as invalid.
 205  *
 206  * This is called before it starts getting deconstructed.
 207  */
 208 static void
 209 cache_ncp_invalidate(struct namecache *ncp)
 210 {
 211
 212         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 213             ("%s: entry %p already invalid", __func__, ncp));
 214         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 215         atomic_thread_fence_rel();
 216 }
 217
 218 /*
 219  * Check whether the entry can be safely used.
 220  *
 221  * All places which elide locks are supposed to call this after they are
 222  * done with reading from an entry.
 223  */
 224 static bool
 225 cache_ncp_canuse(struct namecache *ncp)
 226 {
 227
 228         atomic_thread_fence_acq();
 229         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 230 }
 231
 232 /*
 233  * Name caching works as follows:
 234  *
 235  * Names found by directory scans are retained in a cache
 236  * for future reference.  It is managed LRU, so frequently
 237  * used names will hang around.  Cache is indexed by hash value
 238  * obtained from (dvp, name) where dvp refers to the directory
 239  * containing name.
 240  *
 241  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 242  * exist) the vnode pointer will be NULL.
 243  *
 244  * Upon reaching the last segment of a path, if the reference
 245  * is for DELETE, or NOCACHE is set (rewrite), and the
 246  * name is located in the cache, it will be dropped.
 247  *
 248  * These locks are used (in the order in which they can be taken):
 249  * NAME         TYPE    ROLE
 250  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 251  * bucketlock   mtx     for access to given set of hash buckets
 252  * neglist      mtx     negative entry LRU management
 253  *
 254  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 255  * order is lower address first. Both are recursive.
 256  *
 257  * "." lookups are lockless.
 258  *
 259  * ".." and vnode -> name lookups require vnodelock.
 260  *
 261  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 262  *
 263  * Insertions and removals of entries require involved vnodes and bucketlocks
 264  * to be locked to provide safe operation against other threads modifying the
 265  * cache.
 266  *
 267  * Some lookups result in removal of the found entry (e.g. getting rid of a
 268  * negative entry with the intent to create a positive one), which poses a
 269  * problem when multiple threads reach the state. Similarly, two different
 270  * threads can purge two different vnodes and try to remove the same name.
 271  *
 272  * If the already held vnode lock is lower than the second required lock, we
 273  * can just take the other lock. However, in the opposite case, this could
 274  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 275  * the first node, locking everything in order and revalidating the state.
 276  */
 277
 278 VFS_SMR_DECLARE;
 279
 280 /*
 281  * Structures associated with name caching.
 282  */
 283 #define NCHHASH(hash) \
 284         (&nchashtbl[(hash) & nchash])
 285 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 286 static u_long __read_mostly     nchash;                 /* size of hash table */
 287 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 288     "Size of namecache hash table");
 289 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 290 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 291     "Ratio of negative namecache entries");
 292 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 293 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 294 u_int ncsizefactor = 2;
 295 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 296     "Size factor for namecache");
 297 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 298
 299 struct nchstats nchstats;               /* cache effectiveness statistics */
 300
 301 static bool __read_frequently cache_fast_revlookup = true;
 302 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 303     &cache_fast_revlookup, 0, "");
 304
 305 static u_int __exclusive_cache_line neg_cycle;
 306
 307 #define ncneghash       3
 308 #define numneglists     (ncneghash + 1)
 309
 310 struct neglist {
 311         struct mtx              nl_evict_lock;
 312         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 313         TAILQ_HEAD(, namecache) nl_list;
 314         TAILQ_HEAD(, namecache) nl_hotlist;
 315         u_long                  nl_hotnum;
 316 } __aligned(CACHE_LINE_SIZE);
 317
 318 static struct neglist neglists[numneglists];
 319
 320 static inline struct neglist *
 321 NCP2NEGLIST(struct namecache *ncp)
 322 {
 323
 324         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 325 }
 326
 327 static inline struct negstate *
 328 NCP2NEGSTATE(struct namecache *ncp)
 329 {
 330
 331         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 332         return (&ncp->nc_neg);
 333 }
 334
 335 #define numbucketlocks (ncbuckethash + 1)
 336 static u_int __read_mostly  ncbuckethash;
 337 static struct mtx_padalign __read_mostly  *bucketlocks;
 338 #define HASH2BUCKETLOCK(hash) \
 339         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 340
 341 #define numvnodelocks (ncvnodehash + 1)
 342 static u_int __read_mostly  ncvnodehash;
 343 static struct mtx __read_mostly *vnodelocks;
 344 static inline struct mtx *
 345 VP2VNODELOCK(struct vnode *vp)
 346 {
 347
 348         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 349 }
 350
 351 /*
 352  * UMA zones for the VFS cache.
 353  *
 354  * The small cache is used for entries with short names, which are the
 355  * most common.  The large cache is used for entries which are too big to
 356  * fit in the small cache.
 357  */
 358 static uma_zone_t __read_mostly cache_zone_small;
 359 static uma_zone_t __read_mostly cache_zone_small_ts;
 360 static uma_zone_t __read_mostly cache_zone_large;
 361 static uma_zone_t __read_mostly cache_zone_large_ts;
 362
 363 static struct namecache *
 364 cache_alloc(int len, int ts)
 365 {
 366         struct namecache_ts *ncp_ts;
 367         struct namecache *ncp;
 368
 369         if (__predict_false(ts)) {
 370                 if (len <= CACHE_PATH_CUTOFF)
 371                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 372                 else
 373                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 374                 ncp = &ncp_ts->nc_nc;
 375         } else {
 376                 if (len <= CACHE_PATH_CUTOFF)
 377                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 378                 else
 379                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 380         }
 381         return (ncp);
 382 }
 383
 384 static void
 385 cache_free(struct namecache *ncp)
 386 {
 387         struct namecache_ts *ncp_ts;
 388
 389         MPASS(ncp != NULL);
 390         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 391                 vdrop(ncp->nc_dvp);
 392         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 393                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 394                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 395                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 396                 else
 397                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 398         } else {
 399                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 400                         uma_zfree_smr(cache_zone_small, ncp);
 401                 else
 402                         uma_zfree_smr(cache_zone_large, ncp);
 403         }
 404 }
 405
 406 static void
 407 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 408 {
 409         struct namecache_ts *ncp_ts;
 410
 411         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 412             (tsp == NULL && ticksp == NULL),
 413             ("No NCF_TS"));
 414
 415         if (tsp == NULL)
 416                 return;
 417
 418         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 419         *tsp = ncp_ts->nc_time;
 420         *ticksp = ncp_ts->nc_ticks;
 421 }
 422
 423 #ifdef DEBUG_CACHE
 424 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 425 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 426     "VFS namecache enabled");
 427 #endif
 428
 429 /* Export size information to userland */
 430 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 431     sizeof(struct namecache), "sizeof(struct namecache)");
 432
 433 /*
 434  * The new name cache statistics
 435  */
 436 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 437     "Name cache statistics");
 438 #define STATNODE_ULONG(name, descr)                                     \
 439         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 440 #define STATNODE_COUNTER(name, descr)                                   \
 441         static COUNTER_U64_DEFINE_EARLY(name);                          \
 442         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 443             descr);
 444 STATNODE_ULONG(numneg, "Number of negative cache entries");
 445 STATNODE_ULONG(numcache, "Number of cache entries");
 446 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 447 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 448 STATNODE_COUNTER(dothits, "Number of '.' hits");
 449 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 450 STATNODE_COUNTER(nummiss, "Number of cache misses");
 451 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 452 STATNODE_COUNTER(numposzaps,
 453     "Number of cache hits (positive) we do not want to cache");
 454 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 455 STATNODE_COUNTER(numnegzaps,
 456     "Number of cache hits (negative) we do not want to cache");
 457 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 458 /* These count for vn_getcwd(), too. */
 459 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 460 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 461 STATNODE_COUNTER(numfullpathfail2,
 462     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 463 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 464 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 465 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
 466     "Number of successful removals after relocking");
 467 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
 468     "Number of times zap_and_exit failed to lock");
 469 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
 470     "Number of times zap_and_exit failed to lock");
 471 static long cache_lock_vnodes_cel_3_failures;
 472 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
 473     "Number of times 3-way vnode locking failed");
 474
 475 static void cache_zap_locked(struct namecache *ncp);
 476 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 477     char **freebuf, size_t *buflen);
 478 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 479     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
 480 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 481     char **retbuf, size_t *buflen);
 482 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 483     char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 484
 485 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 486
 487 static inline void
 488 cache_assert_vlp_locked(struct mtx *vlp)
 489 {
 490
 491         if (vlp != NULL)
 492                 mtx_assert(vlp, MA_OWNED);
 493 }
 494
 495 static inline void
 496 cache_assert_vnode_locked(struct vnode *vp)
 497 {
 498         struct mtx *vlp;
 499
 500         vlp = VP2VNODELOCK(vp);
 501         cache_assert_vlp_locked(vlp);
 502 }
 503
 504 /*
 505  * TODO: With the value stored we can do better than computing the hash based
 506  * on the address. The choice of FNV should also be revisited.
 507  */
 508 static void
 509 cache_prehash(struct vnode *vp)
 510 {
 511
 512         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 513 }
 514
 515 static uint32_t
 516 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 517 {
 518
 519         return (fnv_32_buf(name, len, dvp->v_nchash));
 520 }
 521
 522 static inline struct nchashhead *
 523 NCP2BUCKET(struct namecache *ncp)
 524 {
 525         uint32_t hash;
 526
 527         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 528         return (NCHHASH(hash));
 529 }
 530
 531 static inline struct mtx *
 532 NCP2BUCKETLOCK(struct namecache *ncp)
 533 {
 534         uint32_t hash;
 535
 536         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 537         return (HASH2BUCKETLOCK(hash));
 538 }
 539
 540 #ifdef INVARIANTS
 541 static void
 542 cache_assert_bucket_locked(struct namecache *ncp)
 543 {
 544         struct mtx *blp;
 545
 546         blp = NCP2BUCKETLOCK(ncp);
 547         mtx_assert(blp, MA_OWNED);
 548 }
 549
 550 static void
 551 cache_assert_bucket_unlocked(struct namecache *ncp)
 552 {
 553         struct mtx *blp;
 554
 555         blp = NCP2BUCKETLOCK(ncp);
 556         mtx_assert(blp, MA_NOTOWNED);
 557 }
 558 #else
 559 #define cache_assert_bucket_locked(x) do { } while (0)
 560 #define cache_assert_bucket_unlocked(x) do { } while (0)
 561 #endif
 562
 563 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 564 static void
 565 _cache_sort_vnodes(void **p1, void **p2)
 566 {
 567         void *tmp;
 568
 569         MPASS(*p1 != NULL || *p2 != NULL);
 570
 571         if (*p1 > *p2) {
 572                 tmp = *p2;
 573                 *p2 = *p1;
 574                 *p1 = tmp;
 575         }
 576 }
 577
 578 static void
 579 cache_lock_all_buckets(void)
 580 {
 581         u_int i;
 582
 583         for (i = 0; i < numbucketlocks; i++)
 584                 mtx_lock(&bucketlocks[i]);
 585 }
 586
 587 static void
 588 cache_unlock_all_buckets(void)
 589 {
 590         u_int i;
 591
 592         for (i = 0; i < numbucketlocks; i++)
 593                 mtx_unlock(&bucketlocks[i]);
 594 }
 595
 596 static void
 597 cache_lock_all_vnodes(void)
 598 {
 599         u_int i;
 600
 601         for (i = 0; i < numvnodelocks; i++)
 602                 mtx_lock(&vnodelocks[i]);
 603 }
 604
 605 static void
 606 cache_unlock_all_vnodes(void)
 607 {
 608         u_int i;
 609
 610         for (i = 0; i < numvnodelocks; i++)
 611                 mtx_unlock(&vnodelocks[i]);
 612 }
 613
 614 static int
 615 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 616 {
 617
 618         cache_sort_vnodes(&vlp1, &vlp2);
 619
 620         if (vlp1 != NULL) {
 621                 if (!mtx_trylock(vlp1))
 622                         return (EAGAIN);
 623         }
 624         if (!mtx_trylock(vlp2)) {
 625                 if (vlp1 != NULL)
 626                         mtx_unlock(vlp1);
 627                 return (EAGAIN);
 628         }
 629
 630         return (0);
 631 }
 632
 633 static void
 634 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 635 {
 636
 637         MPASS(vlp1 != NULL || vlp2 != NULL);
 638         MPASS(vlp1 <= vlp2);
 639
 640         if (vlp1 != NULL)
 641                 mtx_lock(vlp1);
 642         if (vlp2 != NULL)
 643                 mtx_lock(vlp2);
 644 }
 645
 646 static void
 647 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 648 {
 649
 650         MPASS(vlp1 != NULL || vlp2 != NULL);
 651
 652         if (vlp1 != NULL)
 653                 mtx_unlock(vlp1);
 654         if (vlp2 != NULL)
 655                 mtx_unlock(vlp2);
 656 }
 657
 658 static int
 659 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 660 {
 661         struct nchstats snap;
 662
 663         if (req->oldptr == NULL)
 664                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 665
 666         snap = nchstats;
 667         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 668         snap.ncs_neghits = counter_u64_fetch(numneghits);
 669         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 670             counter_u64_fetch(numnegzaps);
 671         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 672             counter_u64_fetch(nummiss);
 673
 674         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 675 }
 676 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 677     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 678     "VFS cache effectiveness statistics");
 679
 680 #ifdef DIAGNOSTIC
 681 /*
 682  * Grab an atomic snapshot of the name cache hash chain lengths
 683  */
 684 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 685     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 686     "hash table stats");
 687
 688 static int
 689 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 690 {
 691         struct nchashhead *ncpp;
 692         struct namecache *ncp;
 693         int i, error, n_nchash, *cntbuf;
 694
 695 retry:
 696         n_nchash = nchash + 1;  /* nchash is max index, not count */
 697         if (req->oldptr == NULL)
 698                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 699         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 700         cache_lock_all_buckets();
 701         if (n_nchash != nchash + 1) {
 702                 cache_unlock_all_buckets();
 703                 free(cntbuf, M_TEMP);
 704                 goto retry;
 705         }
 706         /* Scan hash tables counting entries */
 707         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 708                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 709                         cntbuf[i]++;
 710         cache_unlock_all_buckets();
 711         for (error = 0, i = 0; i < n_nchash; i++)
 712                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 713                         break;
 714         free(cntbuf, M_TEMP);
 715         return (error);
 716 }
 717 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 718     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 719     "nchash chain lengths");
 720
 721 static int
 722 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 723 {
 724         int error;
 725         struct nchashhead *ncpp;
 726         struct namecache *ncp;
 727         int n_nchash;
 728         int count, maxlength, used, pct;
 729
 730         if (!req->oldptr)
 731                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 732
 733         cache_lock_all_buckets();
 734         n_nchash = nchash + 1;  /* nchash is max index, not count */
 735         used = 0;
 736         maxlength = 0;
 737
 738         /* Scan hash tables for applicable entries */
 739         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 740                 count = 0;
 741                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 742                         count++;
 743                 }
 744                 if (count)
 745                         used++;
 746                 if (maxlength < count)
 747                         maxlength = count;
 748         }
 749         n_nchash = nchash + 1;
 750         cache_unlock_all_buckets();
 751         pct = (used * 100) / (n_nchash / 100);
 752         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 753         if (error)
 754                 return (error);
 755         error = SYSCTL_OUT(req, &used, sizeof(used));
 756         if (error)
 757                 return (error);
 758         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 759         if (error)
 760                 return (error);
 761         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 762         if (error)
 763                 return (error);
 764         return (0);
 765 }
 766 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 767     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 768     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 769 #endif
 770
 771 /*
 772  * Negative entries management
 773  *
 774  * Various workloads create plenty of negative entries and barely use them
 775  * afterwards. Moreover malicious users can keep performing bogus lookups
 776  * adding even more entries. For example "make tinderbox" as of writing this
 777  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 778  * negative.
 779  *
 780  * As such, a rather aggressive eviction method is needed. The currently
 781  * employed method is a placeholder.
 782  *
 783  * Entries are split over numneglists separate lists, each of which is further
 784  * split into hot and cold entries. Entries get promoted after getting a hit.
 785  * Eviction happens on addition of new entry.
 786  */
 787 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 788     "Name cache negative entry statistics");
 789
 790 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 791     "Number of negative cache entries");
 792
 793 static COUNTER_U64_DEFINE_EARLY(neg_created);
 794 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 795     "Number of created negative entries");
 796
 797 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 798 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 799     "Number of evicted negative entries");
 800
 801 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 802 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 803     &neg_evict_skipped_empty,
 804     "Number of times evicting failed due to lack of entries");
 805
 806 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 807 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 808     &neg_evict_skipped_missed,
 809     "Number of times evicting failed due to target entry disappearing");
 810
 811 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 812 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 813     &neg_evict_skipped_contended,
 814     "Number of times evicting failed due to contention");
 815
 816 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 817     "Number of cache hits (negative)");
 818
 819 static int
 820 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 821 {
 822         int i, out;
 823
 824         out = 0;
 825         for (i = 0; i < numneglists; i++)
 826                 out += neglists[i].nl_hotnum;
 827
 828         return (SYSCTL_OUT(req, &out, sizeof(out)));
 829 }
 830 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 831     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 832     "Number of hot negative entries");
 833
 834 static void
 835 cache_neg_init(struct namecache *ncp)
 836 {
 837         struct negstate *ns;
 838
 839         ncp->nc_flag |= NCF_NEGATIVE;
 840         ns = NCP2NEGSTATE(ncp);
 841         ns->neg_flag = 0;
 842         counter_u64_add(neg_created, 1);
 843 }
 844
 845 /*
 846  * Move a negative entry to the hot list.
 847  */
 848 static void
 849 cache_neg_promote(struct namecache *ncp)
 850 {
 851         struct neglist *nl;
 852         struct negstate *ns;
 853
 854         ns = NCP2NEGSTATE(ncp);
 855         nl = NCP2NEGLIST(ncp);
 856         mtx_assert(&nl->nl_lock, MA_OWNED);
 857         if ((ns->neg_flag & NEG_HOT) == 0) {
 858                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 859                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 860                 nl->nl_hotnum++;
 861                 ns->neg_flag |= NEG_HOT;
 862         }
 863 }
 864
 865 /*
 866  * Move a negative entry to the hot list if it matches the lookup.
 867  *
 868  * We have to take locks, but they may be contended and in the worst
 869  * case we may need to go off CPU. We don't want to spin within the
 870  * smr section and we can't block with it. Exiting the section means
 871  * the found entry could have been evicted. We are going to look it
 872  * up again.
      *
      * The caller passes the entry it found earlier (oncp) and the hash it was
      * found under. The hash chain is scanned for that very pointer and the
      * entry is re-validated against dvp and cnp before it is promoted.
      *
      * Returns true if the entry passed all checks; the negative hit is then
      * accounted for here. Returns false if any check failed, in which case
      * nothing was promoted or counted.
 873  */
 874 static bool
 875 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
 876     struct namecache *oncp, uint32_t hash)
 877 {
 878         struct namecache *ncp;
 879         struct neglist *nl;
 880         u_char nc_flag;
 881
 882         nl = NCP2NEGLIST(oncp);
 883
             /*
              * The neglist lock is taken first, outside of the smr section,
              * so that any waiting for it happens before the section starts.
              */
 884         mtx_lock(&nl->nl_lock);
 885         /*
 886          * For hash iteration.
 887          */
 888         vfs_smr_enter();
 889
 890         /*
 891          * Avoid all surprises by only succeeding if we got the same entry and
 892          * bailing completely otherwise.
 893          * XXX There are no provisions to keep the vnode around, meaning we may
 894          * end up promoting a negative entry for a *new* vnode and returning
 895          * ENOENT on its account. This is the error we want to return anyway
 896          * and promotion is harmless.
 897          *
 898          * In particular at this point there can be a new ncp which matches the
 899          * search but hashes to a different neglist.
 900          */
 901         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 902                 if (ncp == oncp)
 903                         break;
 904         }
 905
 906         /*
 907          * No match to begin with.
 908          */
 909         if (__predict_false(ncp == NULL)) {
 910                 goto out_abort;
 911         }
 912
 913         /*
 914          * The newly found entry may be something different...
 915          */
 916         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 917             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
 918                 goto out_abort;
 919         }
 920
 921         /*
 922          * ... and not even negative.
 923          */
 924         nc_flag = atomic_load_char(&ncp->nc_flag);
 925         if ((nc_flag & NCF_NEGATIVE) == 0) {
 926                 goto out_abort;
 927         }
 928
 929         if (__predict_false(!cache_ncp_canuse(ncp))) {
 930                 goto out_abort;
 931         }
 932
 933         cache_neg_promote(ncp);
 934
 935         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
 936         counter_u64_add(numneghits, 1);
 937         vfs_smr_exit();
 938         mtx_unlock(&nl->nl_lock);
 939         return (true);
 940 out_abort:
             /* Common failure exit: leave the smr section, then drop the lock. */
 941         vfs_smr_exit();
 942         mtx_unlock(&nl->nl_lock);
 943         return (false);
 944 }
 945
 946 static void
 947 cache_neg_hit(struct namecache *ncp)
 948 {
 949         struct neglist *nl;
 950         struct negstate *ns;
 951
 952         ns = NCP2NEGSTATE(ncp);
 953         if ((ns->neg_flag & NEG_HOT) != 0)
 954                 return;
 955         nl = NCP2NEGLIST(ncp);
 956         mtx_lock(&nl->nl_lock);
 957         cache_neg_promote(ncp);
 958         mtx_unlock(&nl->nl_lock);
 959 }
 960
 961 static void
 962 cache_neg_insert(struct namecache *ncp)
 963 {
 964         struct neglist *nl;
 965
 966         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 967         cache_assert_bucket_locked(ncp);
 968         nl = NCP2NEGLIST(ncp);
 969         mtx_lock(&nl->nl_lock);
 970         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 971         mtx_unlock(&nl->nl_lock);
 972         atomic_add_long(&numneg, 1);
 973 }
 974
 975 static void
 976 cache_neg_remove(struct namecache *ncp)
 977 {
 978         struct neglist *nl;
 979         struct negstate *ns;
 980
 981         cache_assert_bucket_locked(ncp);
 982         nl = NCP2NEGLIST(ncp);
 983         ns = NCP2NEGSTATE(ncp);
 984         mtx_lock(&nl->nl_lock);
 985         if ((ns->neg_flag & NEG_HOT) != 0) {
 986                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 987                 nl->nl_hotnum--;
 988         } else {
 989                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 990         }
 991         mtx_unlock(&nl->nl_lock);
 992         atomic_subtract_long(&numneg, 1);
 993 }
 994
 995 static struct neglist *
 996 cache_neg_evict_select(void)
 997 {
 998         struct neglist *nl;
 999         u_int c;
1000
1001         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1002         nl = &neglists[c % numneglists];
1003         if (!mtx_trylock(&nl->nl_evict_lock)) {
1004                 counter_u64_add(neg_evict_skipped_contended, 1);
1005                 return (NULL);
1006         }
1007         return (nl);
1008 }
1009
/*
 * Try to evict one negative entry.
 *
 * A neglist is picked by cache_neg_evict_select(); nothing is done if that
 * fails. The first hot entry of the list (if any) is demoted to the tail of
 * the cold list, then the first cold entry is chosen as the victim. Both
 * neglist locks are dropped before the vnode and bucket locks are taken, so
 * the victim is searched for again in its hash chain and only zapped and
 * freed if it is still found there.
 */
1010 static void
1011 cache_neg_evict(void)
1012 {
1013         struct namecache *ncp, *ncp2;
1014         struct neglist *nl;
1015         struct negstate *ns;
1016         struct vnode *dvp;
1017         struct mtx *dvlp;
1018         struct mtx *blp;
1019         uint32_t hash;
1020         u_char nlen;
1021
1022         nl = cache_neg_evict_select();
1023         if (nl == NULL) {
1024                 return;
1025         }
1026
1027         mtx_lock(&nl->nl_lock);
             /* Demote one hot entry on every eviction attempt. */
1028         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1029         if (ncp != NULL) {
1030                 ns = NCP2NEGSTATE(ncp);
1031                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1032                 TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1033                 nl->nl_hotnum--;
1034                 ns->neg_flag &= ~NEG_HOT;
1035         }
             /* The victim is the head of the cold list. */
1036         ncp = TAILQ_FIRST(&nl->nl_list);
1037         if (ncp == NULL) {
1038                 counter_u64_add(neg_evict_skipped_empty, 1);
1039                 mtx_unlock(&nl->nl_lock);
1040                 mtx_unlock(&nl->nl_evict_lock);
1041                 return;
1042         }
             /* NOTE(review): ns is not read again after this assignment. */
1043         ns = NCP2NEGSTATE(ncp);
             /*
              * Record everything needed to find the entry again once the
              * neglist locks are gone.
              */
1044         nlen = ncp->nc_nlen;
1045         dvp = ncp->nc_dvp;
1046         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1047         dvlp = VP2VNODELOCK(dvp);
1048         blp = HASH2BUCKETLOCK(hash);
1049         mtx_unlock(&nl->nl_lock);
1050         mtx_unlock(&nl->nl_evict_lock);
1051         mtx_lock(dvlp);
1052         mtx_lock(blp);
1053         /*
1054          * Note that since all locks were dropped above, the entry may be
1055          * gone or reallocated to be something else.
1056          */
1057         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1058                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1059                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1060                         break;
1061         }
1062         if (ncp2 == NULL) {
1063                 counter_u64_add(neg_evict_skipped_missed, 1);
                     /* Nothing to free below. */
1064                 ncp = NULL;
1065         } else {
1066                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1067                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1068                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1069                     ncp->nc_name);
1070                 cache_zap_locked(ncp);
1071                 counter_u64_add(neg_evicted, 1);
1072         }
1073         mtx_unlock(blp);
1074         mtx_unlock(dvlp);
             /* The entry is freed only after all locks have been released. */
1075         if (ncp != NULL)
1076                 cache_free(ncp);
1077 }
1078
1079 /*
1080  * cache_zap_locked():
1081  *
1082  *   Removes a namecache entry from cache, whether it contains an actual
1083  *   pointer to a vnode or if it is just a negative cache entry.
      *
      *   The caller must hold the vnode lock of the directory, the vnode lock
      *   of the target for positive entries, and the bucket lock of the entry;
      *   all three are asserted. The entry is only unlinked here; the callers
      *   visible in this file pass it to cache_free() after dropping their
      *   locks.
1084  */
1085 static void
1086 cache_zap_locked(struct namecache *ncp)
1087 {
1088         struct nchashhead *ncpp;
1089
1090         if (!(ncp->nc_flag & NCF_NEGATIVE))
1091                 cache_assert_vnode_locked(ncp->nc_vp);
1092         cache_assert_vnode_locked(ncp->nc_dvp);
1093         cache_assert_bucket_locked(ncp);
1094
             /* Invalidate the entry before it is unlinked from the hash chain. */
1095         cache_ncp_invalidate(ncp);
1096
1097         ncpp = NCP2BUCKET(ncp);
1098         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1099         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1100                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1101                     ncp->nc_name, ncp->nc_vp);
                     /* Positive entry: unhook it from the target vnode. */
1102                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1103                 if (ncp == ncp->nc_vp->v_cache_dd) {
1104                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1105                         ncp->nc_vp->v_cache_dd = NULL;
1106                         vn_seqc_write_end(ncp->nc_vp);
1107                 }
1108         } else {
1109                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1110                     ncp->nc_name);
                     /* Negative entry: take it off its neglist instead. */
1111                 cache_neg_remove(ncp);
1112         }
1113         if (ncp->nc_flag & NCF_ISDOTDOT) {
                     /* ".." entries are reachable only through v_cache_dd. */
1114                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1115                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1116                         ncp->nc_dvp->v_cache_dd = NULL;
1117                         vn_seqc_write_end(ncp->nc_dvp);
1118                 }
1119         } else {
1120                 LIST_REMOVE(ncp, nc_src);
                     /*
                      * This was the last entry on the directory's source list.
                      * NOTE(review): NCF_DVDROP is presumably acted upon when
                      * the entry is freed -- confirm against cache_free().
                      */
1121                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1122                         ncp->nc_flag |= NCF_DVDROP;
1123                         counter_u64_add(numcachehv, -1);
1124                 }
1125         }
1126         atomic_subtract_long(&numcache, 1);
1127 }
1128
1129 static void
1130 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1131 {
1132         struct mtx *blp;
1133
1134         MPASS(ncp->nc_dvp == vp);
1135         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1136         cache_assert_vnode_locked(vp);
1137
1138         blp = NCP2BUCKETLOCK(ncp);
1139         mtx_lock(blp);
1140         cache_zap_locked(ncp);
1141         mtx_unlock(blp);
1142 }
1143
/*
 * Zap ncp while the vnode lock of vp is held by the caller. vp has to be
 * either the directory or the target vnode of the entry.
 *
 * *vlpp is an additional vnode lock the caller may already hold (for example
 * from a previous failed attempt), or NULL.
 *
 * Returns true if the entry was zapped; *vlpp is NULL in that case and the
 * lock of vp is left for the caller to release. Returns false if the other
 * vnode lock could not be taken without violating the lock order; both vnode
 * locks were then re-taken in sorted order, *vlpp is set to the first of
 * them, and the caller is expected to re-validate the entry and try again
 * (see cache_remove_cnp()).
 */
1144 static bool
1145 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1146     struct mtx **vlpp)
1147 {
1148         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1149         struct mtx *blp;
1150
1151         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1152         cache_assert_vnode_locked(vp);
1153
             /* Negative entries need no second vnode lock. */
1154         if (ncp->nc_flag & NCF_NEGATIVE) {
1155                 if (*vlpp != NULL) {
1156                         mtx_unlock(*vlpp);
1157                         *vlpp = NULL;
1158                 }
1159                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1160                 return (true);
1161         }
1162
1163         pvlp = VP2VNODELOCK(vp);
1164         blp = NCP2BUCKETLOCK(ncp);
1165         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1166         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1167
1168         if (*vlpp == vlp1 || *vlpp == vlp2) {
                     /* The extra lock handed in is one this entry needs. */
1169                 to_unlock = *vlpp;
1170                 *vlpp = NULL;
1171         } else {
1172                 if (*vlpp != NULL) {
1173                         mtx_unlock(*vlpp);
1174                         *vlpp = NULL;
1175                 }
1176                 cache_sort_vnodes(&vlp1, &vlp2);
1177                 if (vlp1 == pvlp) {
                             /* The held lock sorts first: blocking is fine. */
1178                         mtx_lock(vlp2);
1179                         to_unlock = vlp2;
1180                 } else {
                             /* The held lock sorts last: only try the other. */
1181                         if (!mtx_trylock(vlp1))
1182                                 goto out_relock;
1183                         to_unlock = vlp1;
1184                 }
1185         }
1186         mtx_lock(blp);
1187         cache_zap_locked(ncp);
1188         mtx_unlock(blp);
1189         if (to_unlock != NULL)
1190                 mtx_unlock(to_unlock);
1191         return (true);
1192
1193 out_relock:
             /* Drop the held lock and take both again in sorted order. */
1194         mtx_unlock(vlp2);
1195         mtx_lock(vlp1);
1196         mtx_lock(vlp2);
1197         MPASS(*vlpp == NULL);
1198         *vlpp = vlp1;
1199         return (false);
1200 }
1201
1202 /*
1203  * If trylocking failed we can get here. We know enough to take all needed locks
1204  * in the right order and re-lookup the entry.
      *
      * Called with the bucket unlocked. dvlp and vlp are the vnode locks computed
      * by the caller (vlp is NULL for a negative entry), blp is the bucket lock
      * for hash. Since no lock was held in between, ncp is only trusted if the
      * very same pointer is still found on the hash chain and still matches dvp
      * and the name in cnp.
      *
      * Returns 0 if the entry was found and zapped, EAGAIN otherwise. All locks
      * taken here are released before returning. The entry is not freed.
1205  */
1206 static int
1207 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1208     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1209     struct mtx *blp)
1210 {
1211         struct namecache *rncp;
1212
1213         cache_assert_bucket_unlocked(ncp);
1214
             /* Vnode locks first (sorted), then the bucket lock. */
1215         cache_sort_vnodes(&dvlp, &vlp);
1216         cache_lock_vnodes(dvlp, vlp);
1217         mtx_lock(blp);
1218         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1219                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1220                     rncp->nc_nlen == cnp->cn_namelen &&
1221                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1222                         break;
1223         }
1224         if (rncp != NULL) {
1225                 cache_zap_locked(rncp);
1226                 mtx_unlock(blp);
1227                 cache_unlock_vnodes(dvlp, vlp);
1228                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1229                 return (0);
1230         }
1231
1232         mtx_unlock(blp);
1233         cache_unlock_vnodes(dvlp, vlp);
1234         return (EAGAIN);
1235 }
1236
1237 static int __noinline
1238 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1239     uint32_t hash, struct mtx *blp)
1240 {
1241         struct mtx *dvlp, *vlp;
1242         struct vnode *dvp;
1243
1244         cache_assert_bucket_locked(ncp);
1245
1246         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1247         vlp = NULL;
1248         if (!(ncp->nc_flag & NCF_NEGATIVE))
1249                 vlp = VP2VNODELOCK(ncp->nc_vp);
1250         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1251                 cache_zap_locked(ncp);
1252                 mtx_unlock(blp);
1253                 cache_unlock_vnodes(dvlp, vlp);
1254                 return (0);
1255         }
1256
1257         dvp = ncp->nc_dvp;
1258         mtx_unlock(blp);
1259         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1260 }
1261
/*
 * Remove the cache entry for the name described by cnp in directory dvp.
 *
 * ".." is handled through dvp->v_cache_dd: a real ".." entry is zapped and
 * freed, while any other entry found there is left alone and only the
 * v_cache_dd pointer is cleared. All other names are looked up in their hash
 * chain under the bucket lock and zapped.
 *
 * Returns 1 if something was removed and 0 if no matching entry was found.
 */
1262 static __noinline int
1263 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1264 {
1265         struct namecache *ncp;
1266         struct mtx *blp;
1267         struct mtx *dvlp, *dvlp2;
1268         uint32_t hash;
1269         int error;
1270
1271         if (cnp->cn_namelen == 2 &&
1272             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1273                 dvlp = VP2VNODELOCK(dvp);
1274                 dvlp2 = NULL;
1275                 mtx_lock(dvlp);
1276 retry_dotdot:
                     /*
                      * Re-read on every pass: the previous attempt may have
                      * returned with different locks held and the entry may
                      * have changed.
                      */
1277                 ncp = dvp->v_cache_dd;
1278                 if (ncp == NULL) {
1279                         mtx_unlock(dvlp);
1280                         if (dvlp2 != NULL)
1281                                 mtx_unlock(dvlp2);
1282                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1283                         return (0);
1284                 }
1285                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1286                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1287                                 goto retry_dotdot;
1288                         MPASS(dvp->v_cache_dd == NULL);
1289                         mtx_unlock(dvlp);
1290                         if (dvlp2 != NULL)
1291                                 mtx_unlock(dvlp2);
1292                         cache_free(ncp);
1293                 } else {
                             /* Not a ".." entry: only forget the pointer. */
1294                         vn_seqc_write_begin(dvp);
1295                         dvp->v_cache_dd = NULL;
1296                         vn_seqc_write_end(dvp);
1297                         mtx_unlock(dvlp);
1298                         if (dvlp2 != NULL)
1299                                 mtx_unlock(dvlp2);
1300                 }
1301                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1302                 return (1);
1303         }
1304
1305         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1306         blp = HASH2BUCKETLOCK(hash);
1307 retry:
             /* Empty chain: report a miss without taking the bucket lock. */
1308         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1309                 goto out_no_entry;
1310
1311         mtx_lock(blp);
1312
1313         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1314                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1315                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1316                         break;
1317         }
1318
1319         if (ncp == NULL) {
1320                 mtx_unlock(blp);
1321                 goto out_no_entry;
1322         }
1323
             /* The bucket lock is released by the callee on every path. */
1324         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1325         if (__predict_false(error != 0)) {
1326                 zap_and_exit_bucket_fail++;
1327                 goto retry;
1328         }
1329         counter_u64_add(numposzaps, 1);
1330         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1331         cache_free(ncp);
1332         return (1);
1333 out_no_entry:
1334         counter_u64_add(nummisszap, 1);
1335         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1336         return (0);
1337 }
1338
1339 static int __noinline
1340 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1341     struct timespec *tsp, int *ticksp)
1342 {
1343         int ltype;
1344
1345         *vpp = dvp;
1346         counter_u64_add(dothits, 1);
1347         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1348         if (tsp != NULL)
1349                 timespecclear(tsp);
1350         if (ticksp != NULL)
1351                 *ticksp = ticks;
1352         vrefact(*vpp);
1353         /*
1354          * When we lookup "." we still can be asked to lock it
1355          * differently...
1356          */
1357         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1358         if (ltype != VOP_ISLOCKED(*vpp)) {
1359                 if (ltype == LK_EXCLUSIVE) {
1360                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1361                         if (VN_IS_DOOMED((*vpp))) {
1362                                 /* forced unmount */
1363                                 vrele(*vpp);
1364                                 *vpp = NULL;
1365                                 return (ENOENT);
1366                         }
1367                 } else
1368                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1369         }
1370         return (-1);
1371 }
1372
/*
 * Handle a lookup of ".." by way of dvp->v_cache_dd.
 *
 * Without MAKEENTRY the cached entry is removed instead and 0 is returned.
 * Otherwise the return value is -1 for a positive hit (*vpp obtained with
 * vget_finish() using cnp->cn_lkflags), ENOENT for a negative hit or when dvp
 * was found doomed after relocking it, and 0 for a miss.
 *
 * On the positive path dvp is unlocked around vget_finish() and relocked
 * with the lock type it had before.
 */
1373 static int __noinline
1374 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1375     struct timespec *tsp, int *ticksp)
1376 {
1377         struct namecache_ts *ncp_ts;
1378         struct namecache *ncp;
1379         struct mtx *dvlp;
1380         enum vgetstate vs;
1381         int error, ltype;
1382         bool whiteout;
1383
1384         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1385
1386         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1387                 cache_remove_cnp(dvp, cnp);
1388                 return (0);
1389         }
1390
1391         counter_u64_add(dotdothits, 1);
1392 retry:
1393         dvlp = VP2VNODELOCK(dvp);
1394         mtx_lock(dvlp);
1395         ncp = dvp->v_cache_dd;
1396         if (ncp == NULL) {
1397                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1398                 mtx_unlock(dvlp);
1399                 return (0);
1400         }
             /*
              * v_cache_dd holds either a real ".." entry, whose target is the
              * parent (or nothing if negative), or some other entry whose
              * directory is then used as the parent.
              */
1401         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1402                 if (ncp->nc_flag & NCF_NEGATIVE)
1403                         *vpp = NULL;
1404                 else
1405                         *vpp = ncp->nc_vp;
1406         } else
1407                 *vpp = ncp->nc_dvp;
1408         if (*vpp == NULL)
1409                 goto negative_success;
1410         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1411         cache_out_ts(ncp, tsp, ticksp);
             /*
              * For a timestamped entry which is not a ".." entry the dedicated
              * nc_dotdottime overrides the timestamp reported above.
              */
1412         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1413             NCF_DTS && tsp != NULL) {
1414                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1415                 *tsp = ncp_ts->nc_dotdottime;
1416         }
1417
1418         MPASS(dvp != *vpp);
             /* Remember the lock type of dvp so it can be restored below. */
1419         ltype = VOP_ISLOCKED(dvp);
1420         VOP_UNLOCK(dvp);
1421         vs = vget_prep(*vpp);
1422         mtx_unlock(dvlp);
1423         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1424         vn_lock(dvp, ltype | LK_RETRY);
1425         if (VN_IS_DOOMED(dvp)) {
                     /* dvp went away while unlocked: release what we got. */
1426                 if (error == 0)
1427                         vput(*vpp);
1428                 *vpp = NULL;
1429                 return (ENOENT);
1430         }
1431         if (error) {
1432                 *vpp = NULL;
1433                 goto retry;
1434         }
1435         return (-1);
1436 negative_success:
             /* Reached with dvlp held and ncp being a negative ".." entry. */
1437         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1438                 if (cnp->cn_flags & ISLASTCN) {
                             /* Zap the entry and report a miss instead. */
1439                         counter_u64_add(numnegzaps, 1);
1440                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1441                         mtx_unlock(dvlp);
1442                         cache_free(ncp);
1443                         return (0);
1444                 }
1445         }
1446
1447         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1448         cache_out_ts(ncp, tsp, ticksp);
1449         counter_u64_add(numneghits, 1);
             /* Sample the flag while the entry is still protected by dvlp. */
1450         whiteout = (ncp->nc_flag & NCF_WHITE);
1451         cache_neg_hit(ncp);
1452         mtx_unlock(dvlp);
1453         if (whiteout)
1454                 cnp->cn_flags |= ISWHITEOUT;
1455         return (ENOENT);
1456 }
1457
1458 /**
1459  * Lookup a name in the name cache
      *
      * NOTE: this block describes the contract of cache_lookup(). The function
      * which directly follows it, cache_lookup_fallback(), is the bucket-locked
      * slow path that cache_lookup() falls back to; it asserts that MAKEENTRY is
      * set and ISDOTDOT is clear, and its return value is passed through
      * unchanged.
1460  *
1461  * # Arguments
1462  *
1463  * - dvp:       Parent directory in which to search.
1464  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1465  * - cnp:       Parameters of the name search.  The most interesting bits of
1466  *              the cn_flags field have the following meanings:
1467  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1468  *                      it up.
1469  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1470  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1471  *              or negative) lookup, tsp will be filled with any timespec that
1472  *              was stored when this cache entry was created.  However, it will
1473  *              be clear for "." entries.
1474  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
1475  *              (positive or negative) lookup, it will contain the ticks value
1476  *              that was current when the cache entry was created, unless cnp
1477  *              was ".".
1478  *
1479  * Either both tsp and ticksp have to be provided or neither of them.
1480  *
1481  * # Returns
1482  *
1483  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1484  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1485  *              to a forced unmount.  vpp will not be modified.  If the entry
1486  *              is a whiteout, then the ISWHITEOUT flag will be set in
1487  *              cnp->cn_flags.
1488  * - 0:         A cache miss.  vpp will not be modified.
      *
      * NOTE(review): several paths store NULL into *vpp after a failed attempt
      * to obtain the vnode and may still end up returning 0 or ENOENT, so "will
      * not be modified" does not hold literally -- confirm what callers rely on.
1489  *
1490  * # Locking
1491  *
1492  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1493  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1494  * lock is not recursively acquired.
1495  */
1496 static int __noinline
1497 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1498     struct timespec *tsp, int *ticksp)
1499 {
1500         struct namecache *ncp;
1501         struct mtx *blp;
1502         uint32_t hash;
1503         enum vgetstate vs;
1504         int error;
1505         bool whiteout;
1506
1507         MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1508
1509 retry:
1510         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1511         blp = HASH2BUCKETLOCK(hash);
1512         mtx_lock(blp);
1513
1514         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1515                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1516                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1517                         break;
1518         }
1519
1520         if (__predict_false(ncp == NULL)) {
1521                 mtx_unlock(blp);
1522                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1523                     NULL);
1524                 counter_u64_add(nummiss, 1);
1525                 return (0);
1526         }
1527
1528         if (ncp->nc_flag & NCF_NEGATIVE)
1529                 goto negative_success;
1530
1531         counter_u64_add(numposhits, 1);
1532         *vpp = ncp->nc_vp;
1533         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1534         cache_out_ts(ncp, tsp, ticksp);
1535         MPASS(dvp != *vpp);
             /*
              * vget_prep() runs while the bucket lock is still held; the
              * vnode is then obtained by vget_finish() with the bucket
              * unlocked.
              */
1536         vs = vget_prep(*vpp);
1537         mtx_unlock(blp);
1538         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1539         if (error) {
1540                 *vpp = NULL;
1541                 goto retry;
1542         }
1543         return (-1);
1544 negative_success:
             /* Reached with the bucket lock held. */
1545         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1546                 if (cnp->cn_flags & ISLASTCN) {
                             /*
                              * Zap the negative entry and report a miss. The
                              * callee releases the bucket lock on all paths.
                              */
1547                         counter_u64_add(numnegzaps, 1);
1548                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1549                         if (__predict_false(error != 0)) {
1550                                 zap_and_exit_bucket_fail2++;
1551                                 goto retry;
1552                         }
1553                         cache_free(ncp);
1554                         return (0);
1555                 }
1556         }
1557
1558         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1559         cache_out_ts(ncp, tsp, ticksp);
1560         counter_u64_add(numneghits, 1);
             /* Sample the flag before the bucket lock is dropped. */
1561         whiteout = (ncp->nc_flag & NCF_WHITE);
1562         cache_neg_hit(ncp);
1563         mtx_unlock(blp);
1564         if (whiteout)
1565                 cnp->cn_flags |= ISWHITEOUT;
1566         return (ENOENT);
1567 }
1568
/*
 * Name cache lookup entry point.
 *
 * "." and ".." are dispatched to dedicated helpers and lookups without
 * MAKEENTRY remove the matching entry instead. Everything else is first
 * attempted without locks inside a vfs_smr section; whenever the entry found
 * that way cannot be used, the lookup is redone by cache_lookup_fallback()
 * under the bucket lock.
 *
 * Arguments and return values are described in the comment block preceding
 * cache_lookup_fallback().
 */
1569 int
1570 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1571     struct timespec *tsp, int *ticksp)
1572 {
1573         struct namecache *ncp;
1574         struct negstate *ns;
1575         uint32_t hash;
1576         enum vgetstate vs;
1577         int error;
1578         bool whiteout, neg_hot;
             /*
              * NOTE(review): filled from atomic_load_char() below while
              * cache_neg_promote_cond() uses u_char for the same purpose; the
              * wider type looks harmless but inconsistent.
              */
1579         u_short nc_flag;
1580
1581         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1582
1583 #ifdef DEBUG_CACHE
1584         if (__predict_false(!doingcache)) {
1585                 cnp->cn_flags &= ~MAKEENTRY;
1586                 return (0);
1587         }
1588 #endif
1589
1590         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1591                 if (cnp->cn_namelen == 1)
1592                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1593                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1594                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1595         }
1596
1597         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1598
1599         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1600                 cache_remove_cnp(dvp, cnp);
1601                 return (0);
1602         }
1603
1604         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
             /* Lockless part: the chain is walked inside the smr section. */
1605         vfs_smr_enter();
1606
1607         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1608                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1609                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1610                         break;
1611         }
1612
1613         if (__predict_false(ncp == NULL)) {
1614                 vfs_smr_exit();
1615                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1616                     NULL);
1617                 counter_u64_add(nummiss, 1);
1618                 return (0);
1619         }
1620
1621         nc_flag = atomic_load_char(&ncp->nc_flag);
1622         if (nc_flag & NCF_NEGATIVE)
1623                 goto negative_success;
1624
1625         counter_u64_add(numposhits, 1);
1626         *vpp = ncp->nc_vp;
1627         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1628         cache_out_ts(ncp, tsp, ticksp);
1629         MPASS(dvp != *vpp);
             /*
              * The fields were read without a lock; they are only acted upon
              * if the entry is still usable after they were read.
              */
1630         if (!cache_ncp_canuse(ncp)) {
1631                 vfs_smr_exit();
1632                 *vpp = NULL;
1633                 goto out_fallback;
1634         }
1635         vs = vget_prep_smr(*vpp);
1636         vfs_smr_exit();
1637         if (__predict_false(vs == VGET_NONE)) {
1638                 *vpp = NULL;
1639                 goto out_fallback;
1640         }
1641         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1642         if (error) {
1643                 *vpp = NULL;
1644                 goto out_fallback;
1645         }
1646         return (-1);
1647 negative_success:
             /* Still inside the smr section here. */
1648         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1649                 if (cnp->cn_flags & ISLASTCN) {
                             /* The entry has to be zapped; do it with locks. */
1650                         vfs_smr_exit();
1651                         goto out_fallback;
1652                 }
1653         }
1654
1655         cache_out_ts(ncp, tsp, ticksp);
1656         whiteout = (ncp->nc_flag & NCF_WHITE);
1657         ns = NCP2NEGSTATE(ncp);
1658         neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
             /* Validate the entry after everything was read from it. */
1659         if (__predict_false(!cache_ncp_canuse(ncp))) {
1660                 vfs_smr_exit();
1661                 goto out_fallback;
1662         }
1663         if (!neg_hot) {
                     /*
                      * Promotion needs the neglist lock, so the section is left
                      * first; the callee looks the entry up again and counts
                      * the hit itself.
                      */
1664                 vfs_smr_exit();
1665                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1666                         goto out_fallback;
1667         } else {
1668                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1669                 counter_u64_add(numneghits, 1);
1670                 vfs_smr_exit();
1671         }
1672         if (whiteout)
1673                 cnp->cn_flags |= ISWHITEOUT;
1674         return (ENOENT);
1675 out_fallback:
             /* Always reached with the smr section already exited. */
1676         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1677 }
1678
/*
 * Bookkeeping for a set of locks held at once: up to three vnode locks and
 * up to two bucket locks. An unused slot is NULL.
 *
 * cache_lock_vnodes_cel() fills vlp[0] and vlp[1]; vlp[2] is reserved for a
 * third vnode lock taken later by cache_lock_vnodes_cel_3().
 * NOTE(review): the users of blp[] are outside of this section -- presumably
 * the bucket locks of the entries involved; confirm against the callers.
 */
1679 struct celockstate {
1680         struct mtx *vlp[3];
1681         struct mtx *blp[2];
1682 };
     /* The helpers index the arrays with literal slot numbers; pin the sizes. */
1683 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1684 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1685
1686 static inline void
1687 cache_celockstate_init(struct celockstate *cel)
1688 {
1689
1690         bzero(cel, sizeof(*cel));
1691 }
1692
1693 static void
1694 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1695     struct vnode *dvp)
1696 {
1697         struct mtx *vlp1, *vlp2;
1698
1699         MPASS(cel->vlp[0] == NULL);
1700         MPASS(cel->vlp[1] == NULL);
1701         MPASS(cel->vlp[2] == NULL);
1702
1703         MPASS(vp != NULL || dvp != NULL);
1704
1705         vlp1 = VP2VNODELOCK(vp);
1706         vlp2 = VP2VNODELOCK(dvp);
1707         cache_sort_vnodes(&vlp1, &vlp2);
1708
1709         if (vlp1 != NULL) {
1710                 mtx_lock(vlp1);
1711                 cel->vlp[0] = vlp1;
1712         }
1713         mtx_lock(vlp2);
1714         cel->vlp[1] = vlp2;
1715 }
1716
1717 static void
1718 cache_unlock_vnodes_cel(struct celockstate *cel)
1719 {
1720
1721         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1722
1723         if (cel->vlp[0] != NULL)
1724                 mtx_unlock(cel->vlp[0]);
1725         if (cel->vlp[1] != NULL)
1726                 mtx_unlock(cel->vlp[1]);
1727         if (cel->vlp[2] != NULL)
1728                 mtx_unlock(cel->vlp[2]);
1729 }
1730
1731 static bool
1732 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1733 {
1734         struct mtx *vlp;
1735         bool ret;
1736
1737         cache_assert_vlp_locked(cel->vlp[0]);
1738         cache_assert_vlp_locked(cel->vlp[1]);
1739         MPASS(cel->vlp[2] == NULL);
1740
1741         MPASS(vp != NULL);
1742         vlp = VP2VNODELOCK(vp);
1743
1744         ret = true;
1745         if (vlp >= cel->vlp[1]) {
1746                 mtx_lock(vlp);
1747         } else {
1748                 if (mtx_trylock(vlp))
1749                         goto out;
1750                 cache_lock_vnodes_cel_3_failures++;
1751                 cache_unlock_vnodes_cel(cel);
1752                 if (vlp < cel->vlp[0]) {
1753                         mtx_lock(vlp);
1754                         mtx_lock(cel->vlp[0]);
1755                         mtx_lock(cel->vlp[1]);
1756                 } else {
1757                         if (cel->vlp[0] != NULL)
1758                                 mtx_lock(cel->vlp[0]);
1759                         mtx_lock(vlp);
1760                         mtx_lock(cel->vlp[1]);
1761                 }
1762                 ret = false;
1763         }
1764 out:
1765         cel->vlp[2] = vlp;
1766         return (ret);
1767 }
1768
1769 static void
1770 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1771     struct mtx *blp2)
1772 {
1773
1774         MPASS(cel->blp[0] == NULL);
1775         MPASS(cel->blp[1] == NULL);
1776
1777         cache_sort_vnodes(&blp1, &blp2);
1778
1779         if (blp1 != NULL) {
1780                 mtx_lock(blp1);
1781                 cel->blp[0] = blp1;
1782         }
1783         mtx_lock(blp2);
1784         cel->blp[1] = blp2;
1785 }
1786
1787 static void
1788 cache_unlock_buckets_cel(struct celockstate *cel)
1789 {
1790
1791         if (cel->blp[0] != NULL)
1792                 mtx_unlock(cel->blp[0]);
1793         mtx_unlock(cel->blp[1]);
1794 }
1795
1796 /*
1797  * Lock part of the cache affected by the insertion.
1798  *
1799  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1800  * However, insertion can result in removal of an old entry. In this
1801  * case we have an additional vnode and bucketlock pair to lock.
1802  *
1803  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1804  * preserving the locking order (smaller address first).
1805  */
1806 static void
1807 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1808     uint32_t hash)
1809 {
1810         struct namecache *ncp;
1811         struct mtx *blps[2];
1812
1813         blps[0] = HASH2BUCKETLOCK(hash);
1814         for (;;) {
1815                 blps[1] = NULL;
1816                 cache_lock_vnodes_cel(cel, dvp, vp);
1817                 if (vp == NULL || vp->v_type != VDIR)
1818                         break;
1819                 ncp = vp->v_cache_dd;
1820                 if (ncp == NULL)
1821                         break;
1822                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1823                         break;
1824                 MPASS(ncp->nc_dvp == vp);
1825                 blps[1] = NCP2BUCKETLOCK(ncp);
1826                 if (ncp->nc_flag & NCF_NEGATIVE)
1827                         break;
1828                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1829                         break;
1830                 /*
1831                  * All vnodes got re-locked. Re-validate the state and if
1832                  * nothing changed we are done. Otherwise restart.
1833                  */
1834                 if (ncp == vp->v_cache_dd &&
1835                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1836                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1837                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1838                         break;
1839                 cache_unlock_vnodes_cel(cel);
1840                 cel->vlp[0] = NULL;
1841                 cel->vlp[1] = NULL;
1842                 cel->vlp[2] = NULL;
1843         }
1844         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1845 }
1846
1847 static void
1848 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1849     uint32_t hash)
1850 {
1851         struct namecache *ncp;
1852         struct mtx *blps[2];
1853
1854         blps[0] = HASH2BUCKETLOCK(hash);
1855         for (;;) {
1856                 blps[1] = NULL;
1857                 cache_lock_vnodes_cel(cel, dvp, vp);
1858                 ncp = dvp->v_cache_dd;
1859                 if (ncp == NULL)
1860                         break;
1861                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1862                         break;
1863                 MPASS(ncp->nc_dvp == dvp);
1864                 blps[1] = NCP2BUCKETLOCK(ncp);
1865                 if (ncp->nc_flag & NCF_NEGATIVE)
1866                         break;
1867                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1868                         break;
1869                 if (ncp == dvp->v_cache_dd &&
1870                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1871                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1872                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1873                         break;
1874                 cache_unlock_vnodes_cel(cel);
1875                 cel->vlp[0] = NULL;
1876                 cel->vlp[1] = NULL;
1877                 cel->vlp[2] = NULL;
1878         }
1879         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1880 }
1881
/*
 * Undo cache_enter_lock() or cache_enter_lock_dd(): bucket locks go first,
 * vnode locks second.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
1889
1890 static void __noinline
1891 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1892     struct componentname *cnp)
1893 {
1894         struct celockstate cel;
1895         struct namecache *ncp;
1896         uint32_t hash;
1897         int len;
1898
1899         if (dvp->v_cache_dd == NULL)
1900                 return;
1901         len = cnp->cn_namelen;
1902         cache_celockstate_init(&cel);
1903         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1904         cache_enter_lock_dd(&cel, dvp, vp, hash);
1905         vn_seqc_write_begin(dvp);
1906         ncp = dvp->v_cache_dd;
1907         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1908                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1909                 cache_zap_locked(ncp);
1910         } else {
1911                 ncp = NULL;
1912         }
1913         dvp->v_cache_dd = NULL;
1914         vn_seqc_write_end(dvp);
1915         cache_enter_unlock(&cel);
1916         if (ncp != NULL)
1917                 cache_free(ncp);
1918 }
1919
1920 /*
1921  * Add an entry to the cache.
1922  */
1923 void
1924 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1925     struct timespec *tsp, struct timespec *dtsp)
1926 {
1927         struct celockstate cel;
1928         struct namecache *ncp, *n2, *ndd;
1929         struct namecache_ts *ncp_ts;
1930         struct nchashhead *ncpp;
1931         uint32_t hash;
1932         int flag;
1933         int len;
1934         u_long lnumcache;
1935
1936         VNPASS(!VN_IS_DOOMED(dvp), dvp);
1937         VNPASS(dvp->v_type != VNON, dvp);
1938         if (vp != NULL) {
1939                 VNPASS(!VN_IS_DOOMED(vp), vp);
1940                 VNPASS(vp->v_type != VNON, vp);
1941         }
1942
1943 #ifdef DEBUG_CACHE
1944         if (__predict_false(!doingcache))
1945                 return;
1946 #endif
1947
1948         flag = 0;
1949         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1950                 if (cnp->cn_namelen == 1)
1951                         return;
1952                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1953                         cache_enter_dotdot_prep(dvp, vp, cnp);
1954                         flag = NCF_ISDOTDOT;
1955                 }
1956         }
1957
1958         /*
1959          * Avoid blowout in namecache entries.
1960          *
1961          * Bugs:
1962          * 1. filesystems may end up tryng to add an already existing entry
1963          * (for example this can happen after a cache miss during concurrent
1964          * lookup), in which case we will call cache_neg_evict despite not
1965          * adding anything.
1966          * 2. the routine may fail to free anything and no provisions are made
1967          * to make it try harder (see the inside for failure modes)
1968          * 3. it only ever looks at negative entries.
1969          */
1970         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1971         if (numneg * ncnegfactor > lnumcache) {
1972                 cache_neg_evict();
1973                 lnumcache = atomic_load_long(&numcache);
1974         }
1975         if (__predict_false(lnumcache >= ncsize)) {
1976                 atomic_subtract_long(&numcache, 1);
1977                 counter_u64_add(numdrops, 1);
1978                 return;
1979         }
1980
1981         cache_celockstate_init(&cel);
1982         ndd = NULL;
1983         ncp_ts = NULL;
1984
1985         /*
1986          * Calculate the hash key and setup as much of the new
1987          * namecache entry as possible before acquiring the lock.
1988          */
1989         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1990         ncp->nc_flag = flag | NCF_WIP;
1991         ncp->nc_vp = vp;
1992         if (vp == NULL)
1993                 cache_neg_init(ncp);
1994         ncp->nc_dvp = dvp;
1995         if (tsp != NULL) {
1996                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1997                 ncp_ts->nc_time = *tsp;
1998                 ncp_ts->nc_ticks = ticks;
1999                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2000                 if (dtsp != NULL) {
2001                         ncp_ts->nc_dotdottime = *dtsp;
2002                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2003                 }
2004         }
2005         len = ncp->nc_nlen = cnp->cn_namelen;
2006         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2007         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2008         ncp->nc_name[len] = '\0';
2009         cache_enter_lock(&cel, dvp, vp, hash);
2010
2011         /*
2012          * See if this vnode or negative entry is already in the cache
2013          * with this name.  This can happen with concurrent lookups of
2014          * the same path name.
2015          */
2016         ncpp = NCHHASH(hash);
2017         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2018                 if (n2->nc_dvp == dvp &&
2019                     n2->nc_nlen == cnp->cn_namelen &&
2020                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2021                         MPASS(cache_ncp_canuse(n2));
2022                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2023                                 KASSERT(vp == NULL,
2024                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2025                                     __func__, NULL, vp));
2026                         else
2027                                 KASSERT(n2->nc_vp == vp,
2028                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2029                                     __func__, n2->nc_vp, vp));
2030                         /*
2031                          * Entries are supposed to be immutable unless in the
2032                          * process of getting destroyed. Accommodating for
2033                          * changing timestamps is possible but not worth it.
2034                          * This should be harmless in terms of correctness, in
2035                          * the worst case resulting in an earlier expiration.
2036                          * Alternatively, the found entry can be replaced
2037                          * altogether.
2038                          */
2039                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2040 #if 0
2041                         if (tsp != NULL) {
2042                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2043                                     ("no NCF_TS"));
2044                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2045                                 n2_ts->nc_time = ncp_ts->nc_time;
2046                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2047                                 if (dtsp != NULL) {
2048                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2049                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2050                                 }
2051                         }
2052 #endif
2053                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2054                             vp);
2055                         goto out_unlock_free;
2056                 }
2057         }
2058
2059         if (flag == NCF_ISDOTDOT) {
2060                 /*
2061                  * See if we are trying to add .. entry, but some other lookup
2062                  * has populated v_cache_dd pointer already.
2063                  */
2064                 if (dvp->v_cache_dd != NULL)
2065                         goto out_unlock_free;
2066                 KASSERT(vp == NULL || vp->v_type == VDIR,
2067                     ("wrong vnode type %p", vp));
2068                 vn_seqc_write_begin(dvp);
2069                 dvp->v_cache_dd = ncp;
2070                 vn_seqc_write_end(dvp);
2071         }
2072
2073         if (vp != NULL) {
2074                 if (flag != NCF_ISDOTDOT) {
2075                         /*
2076                          * For this case, the cache entry maps both the
2077                          * directory name in it and the name ".." for the
2078                          * directory's parent.
2079                          */
2080                         vn_seqc_write_begin(vp);
2081                         if ((ndd = vp->v_cache_dd) != NULL) {
2082                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2083                                         cache_zap_locked(ndd);
2084                                 else
2085                                         ndd = NULL;
2086                         }
2087                         vp->v_cache_dd = ncp;
2088                         vn_seqc_write_end(vp);
2089                 } else if (vp->v_type != VDIR) {
2090                         if (vp->v_cache_dd != NULL) {
2091                                 vn_seqc_write_begin(vp);
2092                                 vp->v_cache_dd = NULL;
2093                                 vn_seqc_write_end(vp);
2094                         }
2095                 }
2096         }
2097
2098         if (flag != NCF_ISDOTDOT) {
2099                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2100                         vhold(dvp);
2101                         counter_u64_add(numcachehv, 1);
2102                 }
2103                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2104         }
2105
2106         /*
2107          * If the entry is "negative", we place it into the
2108          * "negative" cache queue, otherwise, we place it into the
2109          * destination vnode's cache entries queue.
2110          */
2111         if (vp != NULL) {
2112                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2113                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2114                     vp);
2115         } else {
2116                 if (cnp->cn_flags & ISWHITEOUT)
2117                         ncp->nc_flag |= NCF_WHITE;
2118                 cache_neg_insert(ncp);
2119                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2120                     ncp->nc_name);
2121         }
2122
2123         /*
2124          * Insert the new namecache entry into the appropriate chain
2125          * within the cache entries table.
2126          */
2127         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2128
2129         atomic_thread_fence_rel();
2130         /*
2131          * Mark the entry as fully constructed.
2132          * It is immutable past this point until its removal.
2133          */
2134         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2135
2136         cache_enter_unlock(&cel);
2137         if (ndd != NULL)
2138                 cache_free(ndd);
2139         return;
2140 out_unlock_free:
2141         cache_enter_unlock(&cel);
2142         atomic_subtract_long(&numcache, 1);
2143         cache_free(ncp);
2144         return;
2145 }
2146
/*
 * Return the smallest power of two which is strictly greater than val,
 * e.g. 0 -> 1, 3 -> 4, 4 -> 8.
 *
 * If that value does not fit in u_int (val has the top bit set), the
 * largest representable power of two is returned instead.  Without the
 * guard the shift would wrap res to 0 and the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
	const u_int below_top = ~(u_int)0 >> 1;
	u_int res;

	for (res = 1; res <= val; res <<= 1) {
		/* res is the top bit already; one more shift would wrap. */
		if (res > below_top)
			break;
	}

	return (res);
}
2157
2158 static struct nchashhead *
2159 nchinittbl(u_long elements, u_long *hashmask)
2160 {
2161         struct nchashhead *hashtbl;
2162         u_long hashsize, i;
2163
2164         hashsize = cache_roundup_2(elements) / 2;
2165
2166         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2167         for (i = 0; i < hashsize; i++)
2168                 CK_SLIST_INIT(&hashtbl[i]);
2169         *hashmask = hashsize - 1;
2170         return (hashtbl);
2171 }
2172
2173 static void
2174 ncfreetbl(struct nchashhead *hashtbl)
2175 {
2176
2177         free(hashtbl, M_VFSCACHE);
2178 }
2179
2180 /*
2181  * Name cache initialization, from vfs_init() when we are booting
2182  */
2183 static void
2184 nchinit(void *dummy __unused)
2185 {
2186         u_int i;
2187
2188         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2189             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2190         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2191             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2192         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2193             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2194         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2195             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2196
2197         VFS_SMR_ZONE_SET(cache_zone_small);
2198         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2199         VFS_SMR_ZONE_SET(cache_zone_large);
2200         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2201
2202         ncsize = desiredvnodes * ncsizefactor;
2203         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2204         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2205         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2206                 ncbuckethash = 7;
2207         if (ncbuckethash > nchash)
2208                 ncbuckethash = nchash;
2209         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2210             M_WAITOK | M_ZERO);
2211         for (i = 0; i < numbucketlocks; i++)
2212                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2213         ncvnodehash = ncbuckethash;
2214         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2215             M_WAITOK | M_ZERO);
2216         for (i = 0; i < numvnodelocks; i++)
2217                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2218
2219         for (i = 0; i < numneglists; i++) {
2220                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2221                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2222                 TAILQ_INIT(&neglists[i].nl_list);
2223                 TAILQ_INIT(&neglists[i].nl_hotlist);
2224         }
2225 }
2226 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2227
2228 void
2229 cache_vnode_init(struct vnode *vp)
2230 {
2231
2232         LIST_INIT(&vp->v_cache_src);
2233         TAILQ_INIT(&vp->v_cache_dst);
2234         vp->v_cache_dd = NULL;
2235         cache_prehash(vp);
2236 }
2237
2238 void
2239 cache_changesize(u_long newmaxvnodes)
2240 {
2241         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2242         u_long new_nchash, old_nchash;
2243         struct namecache *ncp;
2244         uint32_t hash;
2245         u_long newncsize;
2246         int i;
2247
2248         newncsize = newmaxvnodes * ncsizefactor;
2249         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2250         if (newmaxvnodes < numbucketlocks)
2251                 newmaxvnodes = numbucketlocks;
2252
2253         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2254         /* If same hash table size, nothing to do */
2255         if (nchash == new_nchash) {
2256                 ncfreetbl(new_nchashtbl);
2257                 return;
2258         }
2259         /*
2260          * Move everything from the old hash table to the new table.
2261          * None of the namecache entries in the table can be removed
2262          * because to do so, they have to be removed from the hash table.
2263          */
2264         cache_lock_all_vnodes();
2265         cache_lock_all_buckets();
2266         old_nchashtbl = nchashtbl;
2267         old_nchash = nchash;
2268         nchashtbl = new_nchashtbl;
2269         nchash = new_nchash;
2270         for (i = 0; i <= old_nchash; i++) {
2271                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2272                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2273                             ncp->nc_dvp);
2274                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2275                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2276                 }
2277         }
2278         ncsize = newncsize;
2279         cache_unlock_all_buckets();
2280         cache_unlock_all_vnodes();
2281         ncfreetbl(old_nchashtbl);
2282 }
2283
2284 /*
2285  * Invalidate all entries from and to a particular vnode.
2286  */
2287 static void
2288 cache_purge_impl(struct vnode *vp)
2289 {
2290         TAILQ_HEAD(, namecache) ncps;
2291         struct namecache *ncp, *nnp;
2292         struct mtx *vlp, *vlp2;
2293
2294         TAILQ_INIT(&ncps);
2295         vlp = VP2VNODELOCK(vp);
2296         vlp2 = NULL;
2297         mtx_lock(vlp);
2298 retry:
2299         while (!LIST_EMPTY(&vp->v_cache_src)) {
2300                 ncp = LIST_FIRST(&vp->v_cache_src);
2301                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2302                         goto retry;
2303                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2304         }
2305         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2306                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2307                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2308                         goto retry;
2309                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2310         }
2311         ncp = vp->v_cache_dd;
2312         if (ncp != NULL) {
2313                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2314                    ("lost dotdot link"));
2315                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2316                         goto retry;
2317                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2318         }
2319         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2320         mtx_unlock(vlp);
2321         if (vlp2 != NULL)
2322                 mtx_unlock(vlp2);
2323         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2324                 cache_free(ncp);
2325         }
2326 }
2327
2328 /*
2329  * Opportunistic check to see if there is anything to do.
2330  */
2331 static bool
2332 cache_has_entries(struct vnode *vp)
2333 {
2334
2335         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2336             vp->v_cache_dd == NULL)
2337                 return (false);
2338         return (true);
2339 }
2340
2341 void
2342 cache_purge(struct vnode *vp)
2343 {
2344
2345         SDT_PROBE1(vfs, namecache, purge, done, vp);
2346         if (!cache_has_entries(vp))
2347                 return;
2348         cache_purge_impl(vp);
2349 }
2350
/*
 * Purge variant which is only to be used by vgone; the vnode is asserted to
 * be doomed.
 *
 * If the vnode appears to have no entries, wait for the vnode lock to be
 * free before looking again, so as to serialize against a potential thread
 * doing cache_purge.
 */
void
cache_purge_vgone(struct vnode *vp)
{
	struct mtx *vlp;

	VNPASS(VN_IS_DOOMED(vp), vp);
	if (!cache_has_entries(vp)) {
		vlp = VP2VNODELOCK(vp);
		mtx_wait_unlocked(vlp);
		if (!cache_has_entries(vp))
			return;
	}
	cache_purge_impl(vp);
}
2376
2377 /*
2378  * Invalidate all negative entries for a particular directory vnode.
2379  */
2380 void
2381 cache_purge_negative(struct vnode *vp)
2382 {
2383         TAILQ_HEAD(, namecache) ncps;
2384         struct namecache *ncp, *nnp;
2385         struct mtx *vlp;
2386
2387         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2388         if (LIST_EMPTY(&vp->v_cache_src))
2389                 return;
2390         TAILQ_INIT(&ncps);
2391         vlp = VP2VNODELOCK(vp);
2392         mtx_lock(vlp);
2393         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2394                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2395                         continue;
2396                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2397                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2398         }
2399         mtx_unlock(vlp);
2400         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2401                 cache_free(ncp);
2402         }
2403 }
2404
2405 void
2406 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2407     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2408 {
2409
2410         ASSERT_VOP_IN_SEQC(fdvp);
2411         ASSERT_VOP_IN_SEQC(fvp);
2412         ASSERT_VOP_IN_SEQC(tdvp);
2413         if (tvp != NULL)
2414                 ASSERT_VOP_IN_SEQC(tvp);
2415
2416         cache_purge(fvp);
2417         if (tvp != NULL) {
2418                 cache_purge(tvp);
2419                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2420                     ("%s: lingering negative entry", __func__));
2421         } else {
2422                 cache_remove_cnp(tdvp, tcnp);
2423         }
2424 }
2425
2426 /*
2427  * Flush all entries referencing a particular filesystem.
2428  */
2429 void
2430 cache_purgevfs(struct mount *mp)
2431 {
2432         struct vnode *vp, *mvp;
2433
2434         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2435         /*
2436          * Somewhat wasteful iteration over all vnodes. Would be better to
2437          * support filtering and avoid the interlock to begin with.
2438          */
2439         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2440                 if (!cache_has_entries(vp)) {
2441                         VI_UNLOCK(vp);
2442                         continue;
2443                 }
2444                 vholdl(vp);
2445                 VI_UNLOCK(vp);
2446                 cache_purge(vp);
2447                 vdrop(vp);
2448         }
2449 }
2450
2451 /*
2452  * Perform canonical checks and cache lookup and pass on to filesystem
2453  * through the vop_cachedlookup only if needed.
2454  */
2455
2456 int
2457 vfs_cache_lookup(struct vop_lookup_args *ap)
2458 {
2459         struct vnode *dvp;
2460         int error;
2461         struct vnode **vpp = ap->a_vpp;
2462         struct componentname *cnp = ap->a_cnp;
2463         int flags = cnp->cn_flags;
2464
2465         *vpp = NULL;
2466         dvp = ap->a_dvp;
2467
2468         if (dvp->v_type != VDIR)
2469                 return (ENOTDIR);
2470
2471         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2472             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2473                 return (EROFS);
2474
2475         error = vn_dir_check_exec(dvp, cnp);
2476         if (error != 0)
2477                 return (error);
2478
2479         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2480         if (error == 0)
2481                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2482         if (error == -1)
2483                 return (0);
2484         return (error);
2485 }
2486
2487 /* Implementation of the getcwd syscall. */
2488 int
2489 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2490 {
2491         char *buf, *retbuf;
2492         size_t buflen;
2493         int error;
2494
2495         buflen = uap->buflen;
2496         if (__predict_false(buflen < 2))
2497                 return (EINVAL);
2498         if (buflen > MAXPATHLEN)
2499                 buflen = MAXPATHLEN;
2500
2501         buf = uma_zalloc(namei_zone, M_WAITOK);
2502         error = vn_getcwd(buf, &retbuf, &buflen);
2503         if (error == 0)
2504                 error = copyout(retbuf, uap->buf, buflen);
2505         uma_zfree(namei_zone, buf);
2506         return (error);
2507 }
2508
2509 int
2510 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2511 {
2512         struct pwd *pwd;
2513         int error;
2514
2515         vfs_smr_enter();
2516         pwd = pwd_get_smr();
2517         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2518             buflen, false, 0);
2519         VFS_SMR_ASSERT_NOT_ENTERED();
2520         if (error < 0) {
2521                 pwd = pwd_hold(curthread);
2522                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2523                     retbuf, buflen);
2524                 pwd_drop(pwd);
2525         }
2526
2527 #ifdef KTRACE
2528         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2529                 ktrnamei(*retbuf);
2530 #endif
2531         return (error);
2532 }
2533
2534 static int
2535 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2536     size_t size, int flags, enum uio_seg pathseg)
2537 {
2538         struct nameidata nd;
2539         char *retbuf, *freebuf;
2540         int error;
2541
2542         if (flags != 0)
2543                 return (EINVAL);
2544         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2545             pathseg, path, fd, &cap_fstat_rights, td);
2546         if ((error = namei(&nd)) != 0)
2547                 return (error);
2548         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2549         if (error == 0) {
2550                 error = copyout(retbuf, buf, size);
2551                 free(freebuf, M_TEMP);
2552         }
2553         NDFREE(&nd, 0);
2554         return (error);
2555 }
2556
2557 int
2558 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2559 {
2560
2561         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2562             uap->flags, UIO_USERSPACE));
2563 }
2564
2565 /*
2566  * Retrieve the full filesystem path that correspond to a vnode from the name
2567  * cache (if available)
2568  */
2569 int
2570 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2571 {
2572         struct pwd *pwd;
2573         char *buf;
2574         size_t buflen;
2575         int error;
2576
2577         if (__predict_false(vp == NULL))
2578                 return (EINVAL);
2579
2580         buflen = MAXPATHLEN;
2581         buf = malloc(buflen, M_TEMP, M_WAITOK);
2582         vfs_smr_enter();
2583         pwd = pwd_get_smr();
2584         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2585         VFS_SMR_ASSERT_NOT_ENTERED();
2586         if (error < 0) {
2587                 pwd = pwd_hold(curthread);
2588                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2589                 pwd_drop(pwd);
2590         }
2591         if (error == 0)
2592                 *freebuf = buf;
2593         else
2594                 free(buf, M_TEMP);
2595         return (error);
2596 }
2597
2598 /*
2599  * This function is similar to vn_fullpath, but it attempts to lookup the
2600  * pathname relative to the global root mount point.  This is required for the
2601  * auditing sub-system, as audited pathnames must be absolute, relative to the
2602  * global root mount point.
2603  */
2604 int
2605 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2606 {
2607         char *buf;
2608         size_t buflen;
2609         int error;
2610
2611         if (__predict_false(vp == NULL))
2612                 return (EINVAL);
2613         buflen = MAXPATHLEN;
2614         buf = malloc(buflen, M_TEMP, M_WAITOK);
2615         vfs_smr_enter();
2616         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2617         VFS_SMR_ASSERT_NOT_ENTERED();
2618         if (error < 0) {
2619                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2620         }
2621         if (error == 0)
2622                 *freebuf = buf;
2623         else
2624                 free(buf, M_TEMP);
2625         return (error);
2626 }
2627
2628 static struct namecache *
2629 vn_dd_from_dst(struct vnode *vp)
2630 {
2631         struct namecache *ncp;
2632
2633         cache_assert_vnode_locked(vp);
2634         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2635                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2636                         return (ncp);
2637         }
2638         return (NULL);
2639 }
2640
2641 int
2642 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2643 {
2644         struct vnode *dvp;
2645         struct namecache *ncp;
2646         struct mtx *vlp;
2647         int error;
2648
2649         vlp = VP2VNODELOCK(*vp);
2650         mtx_lock(vlp);
2651         ncp = (*vp)->v_cache_dd;
2652         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2653                 KASSERT(ncp == vn_dd_from_dst(*vp),
2654                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2655                     ncp, vn_dd_from_dst(*vp)));
2656         } else {
2657                 ncp = vn_dd_from_dst(*vp);
2658         }
2659         if (ncp != NULL) {
2660                 if (*buflen < ncp->nc_nlen) {
2661                         mtx_unlock(vlp);
2662                         vrele(*vp);
2663                         counter_u64_add(numfullpathfail4, 1);
2664                         error = ENOMEM;
2665                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2666                             vp, NULL);
2667                         return (error);
2668                 }
2669                 *buflen -= ncp->nc_nlen;
2670                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2671                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2672                     ncp->nc_name, vp);
2673                 dvp = *vp;
2674                 *vp = ncp->nc_dvp;
2675                 vref(*vp);
2676                 mtx_unlock(vlp);
2677                 vrele(dvp);
2678                 return (0);
2679         }
2680         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2681
2682         mtx_unlock(vlp);
2683         vn_lock(*vp, LK_SHARED | LK_RETRY);
2684         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2685         vput(*vp);
2686         if (error) {
2687                 counter_u64_add(numfullpathfail2, 1);
2688                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2689                 return (error);
2690         }
2691
2692         *vp = dvp;
2693         if (VN_IS_DOOMED(dvp)) {
2694                 /* forced unmount */
2695                 vrele(dvp);
2696                 error = ENOENT;
2697                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2698                 return (error);
2699         }
2700         /*
2701          * *vp has its use count incremented still.
2702          */
2703
2704         return (0);
2705 }
2706
2707 /*
2708  * Resolve a directory to a pathname.
2709  *
2710  * The name of the directory can always be found in the namecache or fetched
2711  * from the filesystem. There is also guaranteed to be only one parent, meaning
2712  * we can just follow vnodes up until we find the root.
2713  *
2714  * The vnode must be referenced.
2715  */
2716 static int
2717 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2718     size_t *len, bool slash_prefixed, size_t addend)
2719 {
2720 #ifdef KDTRACE_HOOKS
2721         struct vnode *startvp = vp;
2722 #endif
2723         struct vnode *vp1;
2724         size_t buflen;
2725         int error;
2726
2727         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2728         VNPASS(vp->v_usecount > 0, vp);
2729
2730         buflen = *len;
2731
2732         if (!slash_prefixed) {
2733                 MPASS(*len >= 2);
2734                 buflen--;
2735                 buf[buflen] = '\0';
2736         }
2737
2738         error = 0;
2739
2740         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2741         counter_u64_add(numfullpathcalls, 1);
2742         while (vp != rdir && vp != rootvnode) {
2743                 /*
2744                  * The vp vnode must be already fully constructed,
2745                  * since it is either found in namecache or obtained
2746                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2747                  * without obtaining the vnode lock.
2748                  */
2749                 if ((vp->v_vflag & VV_ROOT) != 0) {
2750                         vn_lock(vp, LK_RETRY | LK_SHARED);
2751
2752                         /*
2753                          * With the vnode locked, check for races with
2754                          * unmount, forced or not.  Note that we
2755                          * already verified that vp is not equal to
2756                          * the root vnode, which means that
2757                          * mnt_vnodecovered can be NULL only for the
2758                          * case of unmount.
2759                          */
2760                         if (VN_IS_DOOMED(vp) ||
2761                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2762                             vp1->v_mountedhere != vp->v_mount) {
2763                                 vput(vp);
2764                                 error = ENOENT;
2765                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2766                                     error, vp, NULL);
2767                                 break;
2768                         }
2769
2770                         vref(vp1);
2771                         vput(vp);
2772                         vp = vp1;
2773                         continue;
2774                 }
2775                 if (vp->v_type != VDIR) {
2776                         vrele(vp);
2777                         counter_u64_add(numfullpathfail1, 1);
2778                         error = ENOTDIR;
2779                         SDT_PROBE3(vfs, namecache, fullpath, return,
2780                             error, vp, NULL);
2781                         break;
2782                 }
2783                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2784                 if (error)
2785                         break;
2786                 if (buflen == 0) {
2787                         vrele(vp);
2788                         error = ENOMEM;
2789                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2790                             startvp, NULL);
2791                         break;
2792                 }
2793                 buf[--buflen] = '/';
2794                 slash_prefixed = true;
2795         }
2796         if (error)
2797                 return (error);
2798         if (!slash_prefixed) {
2799                 if (buflen == 0) {
2800                         vrele(vp);
2801                         counter_u64_add(numfullpathfail4, 1);
2802                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2803                             startvp, NULL);
2804                         return (ENOMEM);
2805                 }
2806                 buf[--buflen] = '/';
2807         }
2808         counter_u64_add(numfullpathfound, 1);
2809         vrele(vp);
2810
2811         *retbuf = buf + buflen;
2812         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2813         *len -= buflen;
2814         *len += addend;
2815         return (0);
2816 }
2817
2818 /*
2819  * Resolve an arbitrary vnode to a pathname.
2820  *
2821  * Note 2 caveats:
2822  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2823  *   resolve to a different path than the one used to find it
2824  * - namecache is not mandatory, meaning names are not guaranteed to be added
2825  *   (in which case resolving fails)
2826  */
/*
 * Record why a lockless reverse lookup gave up.  The "reason" is simply the
 * source line of the call site, supplied by the cache_rev_failed() wrapper.
 */
static void __inline
cache_rev_failed_impl(int *reason, int line)
{
	reason[0] = line;
}
#define	cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
2834
2835 static int
2836 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
2837     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
2838 {
2839 #ifdef KDTRACE_HOOKS
2840         struct vnode *startvp = vp;
2841 #endif
2842         struct vnode *tvp;
2843         struct mount *mp;
2844         struct namecache *ncp;
2845         size_t orig_buflen;
2846         int reason;
2847         int error;
2848 #ifdef KDTRACE_HOOKS
2849         int i;
2850 #endif
2851         seqc_t vp_seqc, tvp_seqc;
2852         u_char nc_flag;
2853
2854         VFS_SMR_ASSERT_ENTERED();
2855
2856         if (!cache_fast_revlookup) {
2857                 vfs_smr_exit();
2858                 return (-1);
2859         }
2860
2861         orig_buflen = *buflen;
2862
2863         if (!slash_prefixed) {
2864                 MPASS(*buflen >= 2);
2865                 *buflen -= 1;
2866                 buf[*buflen] = '\0';
2867         }
2868
2869         if (vp == rdir || vp == rootvnode) {
2870                 if (!slash_prefixed) {
2871                         *buflen -= 1;
2872                         buf[*buflen] = '/';
2873                 }
2874                 goto out_ok;
2875         }
2876
2877 #ifdef KDTRACE_HOOKS
2878         i = 0;
2879 #endif
2880         error = -1;
2881         ncp = NULL; /* for sdt probe down below */
2882         vp_seqc = vn_seqc_read_any(vp);
2883         if (seqc_in_modify(vp_seqc)) {
2884                 cache_rev_failed(&reason);
2885                 goto out_abort;
2886         }
2887
2888         for (;;) {
2889 #ifdef KDTRACE_HOOKS
2890                 i++;
2891 #endif
2892                 if ((vp->v_vflag & VV_ROOT) != 0) {
2893                         mp = atomic_load_ptr(&vp->v_mount);
2894                         if (mp == NULL) {
2895                                 cache_rev_failed(&reason);
2896                                 goto out_abort;
2897                         }
2898                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
2899                         tvp_seqc = vn_seqc_read_any(tvp);
2900                         if (seqc_in_modify(tvp_seqc)) {
2901                                 cache_rev_failed(&reason);
2902                                 goto out_abort;
2903                         }
2904                         if (!vn_seqc_consistent(vp, vp_seqc)) {
2905                                 cache_rev_failed(&reason);
2906                                 goto out_abort;
2907                         }
2908                         vp = tvp;
2909                         vp_seqc = tvp_seqc;
2910                         continue;
2911                 }
2912                 ncp = atomic_load_ptr(&vp->v_cache_dd);
2913                 if (ncp == NULL) {
2914                         cache_rev_failed(&reason);
2915                         goto out_abort;
2916                 }
2917                 nc_flag = atomic_load_char(&ncp->nc_flag);
2918                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
2919                         cache_rev_failed(&reason);
2920                         goto out_abort;
2921                 }
2922                 if (!cache_ncp_canuse(ncp)) {
2923                         cache_rev_failed(&reason);
2924                         goto out_abort;
2925                 }
2926                 if (ncp->nc_nlen >= *buflen) {
2927                         cache_rev_failed(&reason);
2928                         error = ENOMEM;
2929                         goto out_abort;
2930                 }
2931                 *buflen -= ncp->nc_nlen;
2932                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2933                 *buflen -= 1;
2934                 buf[*buflen] = '/';
2935                 tvp = ncp->nc_dvp;
2936                 tvp_seqc = vn_seqc_read_any(tvp);
2937                 if (seqc_in_modify(tvp_seqc)) {
2938                         cache_rev_failed(&reason);
2939                         goto out_abort;
2940                 }
2941                 if (!vn_seqc_consistent(vp, vp_seqc)) {
2942                         cache_rev_failed(&reason);
2943                         goto out_abort;
2944                 }
2945                 vp = tvp;
2946                 vp_seqc = tvp_seqc;
2947                 if (vp == rdir || vp == rootvnode)
2948                         break;
2949         }
2950 out_ok:
2951         vfs_smr_exit();
2952         *retbuf = buf + *buflen;
2953         *buflen = orig_buflen - *buflen + addend;
2954         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
2955         return (0);
2956
2957 out_abort:
2958         *buflen = orig_buflen;
2959         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
2960         vfs_smr_exit();
2961         return (error);
2962 }
2963
2964 static int
2965 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2966     size_t *buflen)
2967 {
2968         size_t orig_buflen;
2969         bool slash_prefixed;
2970         int error;
2971
2972         if (*buflen < 2)
2973                 return (EINVAL);
2974
2975         orig_buflen = *buflen;
2976
2977         vref(vp);
2978         slash_prefixed = false;
2979         if (vp->v_type != VDIR) {
2980                 *buflen -= 1;
2981                 buf[*buflen] = '\0';
2982                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
2983                 if (error)
2984                         return (error);
2985                 if (*buflen == 0) {
2986                         vrele(vp);
2987                         return (ENOMEM);
2988                 }
2989                 *buflen -= 1;
2990                 buf[*buflen] = '/';
2991                 slash_prefixed = true;
2992         }
2993
2994         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
2995             orig_buflen - *buflen));
2996 }
2997
2998 /*
2999  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3000  *
3001  * Since the namecache does not track handlings, the caller is expected to first
3002  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3003  *
3004  * Then we have 2 cases:
3005  * - if the found vnode is a directory, the path can be constructed just by
3006  *   fullowing names up the chain
3007  * - otherwise we populate the buffer with the saved name and start resolving
3008  *   from the parent
3009  */
3010 static int
3011 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3012     size_t *buflen)
3013 {
3014         char *buf, *tmpbuf;
3015         struct pwd *pwd;
3016         struct componentname *cnp;
3017         struct vnode *vp;
3018         size_t addend;
3019         int error;
3020         bool slash_prefixed;
3021         enum vtype type;
3022
3023         if (*buflen < 2)
3024                 return (EINVAL);
3025         if (*buflen > MAXPATHLEN)
3026                 *buflen = MAXPATHLEN;
3027
3028         slash_prefixed = false;
3029
3030         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3031
3032         addend = 0;
3033         vp = ndp->ni_vp;
3034         /*
3035          * Check for VBAD to work around the vp_crossmp bug in lookup().
3036          *
3037          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3038          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3039          * If the type is VDIR (like in this very case) we can skip looking
3040          * at ni_dvp in the first place. However, since vnodes get passed here
3041          * unlocked the target may transition to doomed state (type == VBAD)
3042          * before we get to evaluate the condition. If this happens, we will
3043          * populate part of the buffer and descend to vn_fullpath_dir with
3044          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3045          *
3046          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3047          * an address of a bit field, even if said field is sized to char.
3048          * Work around the problem by reading the value into a full-sized enum
3049          * and then re-reading it with atomic_load which will still prevent
3050          * the compiler from re-reading down the road.
3051          */
3052         type = vp->v_type;
3053         type = atomic_load_int(&type);
3054         if (type == VBAD) {
3055                 error = ENOENT;
3056                 goto out_bad;
3057         }
3058         if (type != VDIR) {
3059                 cnp = &ndp->ni_cnd;
3060                 addend = cnp->cn_namelen + 2;
3061                 if (*buflen < addend) {
3062                         error = ENOMEM;
3063                         goto out_bad;
3064                 }
3065                 *buflen -= addend;
3066                 tmpbuf = buf + *buflen;
3067                 tmpbuf[0] = '/';
3068                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3069                 tmpbuf[addend - 1] = '\0';
3070                 slash_prefixed = true;
3071                 vp = ndp->ni_dvp;
3072         }
3073
3074         vfs_smr_enter();
3075         pwd = pwd_get_smr();
3076         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3077             slash_prefixed, addend);
3078         VFS_SMR_ASSERT_NOT_ENTERED();
3079         if (error < 0) {
3080                 pwd = pwd_hold(curthread);
3081                 vref(vp);
3082                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3083                     slash_prefixed, addend);
3084                 pwd_drop(pwd);
3085                 if (error != 0)
3086                         goto out_bad;
3087         }
3088
3089         *freebuf = buf;
3090
3091         return (0);
3092 out_bad:
3093         free(buf, M_TEMP);
3094         return (error);
3095 }
3096
3097 struct vnode *
3098 vn_dir_dd_ino(struct vnode *vp)
3099 {
3100         struct namecache *ncp;
3101         struct vnode *ddvp;
3102         struct mtx *vlp;
3103         enum vgetstate vs;
3104
3105         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3106         vlp = VP2VNODELOCK(vp);
3107         mtx_lock(vlp);
3108         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3109                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3110                         continue;
3111                 ddvp = ncp->nc_dvp;
3112                 vs = vget_prep(ddvp);
3113                 mtx_unlock(vlp);
3114                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3115                         return (NULL);
3116                 return (ddvp);
3117         }
3118         mtx_unlock(vlp);
3119         return (NULL);
3120 }
3121
3122 int
3123 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3124 {
3125         struct namecache *ncp;
3126         struct mtx *vlp;
3127         int l;
3128
3129         vlp = VP2VNODELOCK(vp);
3130         mtx_lock(vlp);
3131         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3132                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3133                         break;
3134         if (ncp == NULL) {
3135                 mtx_unlock(vlp);
3136                 return (ENOENT);
3137         }
3138         l = min(ncp->nc_nlen, buflen - 1);
3139         memcpy(buf, ncp->nc_name, l);
3140         mtx_unlock(vlp);
3141         buf[l] = '\0';
3142         return (0);
3143 }
3144
3145 /*
3146  * This function updates path string to vnode's full global path
3147  * and checks the size of the new path string against the pathlen argument.
3148  *
3149  * Requires a locked, referenced vnode.
3150  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3151  *
3152  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3153  * because it falls back to the ".." lookup if the namecache lookup fails.
3154  */
3155 int
3156 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3157     u_int pathlen)
3158 {
3159         struct nameidata nd;
3160         struct vnode *vp1;
3161         char *rpath, *fbuf;
3162         int error;
3163
3164         ASSERT_VOP_ELOCKED(vp, __func__);
3165
3166         /* Construct global filesystem path from vp. */
3167         VOP_UNLOCK(vp);
3168         error = vn_fullpath_global(vp, &rpath, &fbuf);
3169
3170         if (error != 0) {
3171                 vrele(vp);
3172                 return (error);
3173         }
3174
3175         if (strlen(rpath) >= pathlen) {
3176                 vrele(vp);
3177                 error = ENAMETOOLONG;
3178                 goto out;
3179         }
3180
3181         /*
3182          * Re-lookup the vnode by path to detect a possible rename.
3183          * As a side effect, the vnode is relocked.
3184          * If vnode was renamed, return ENOENT.
3185          */
3186         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3187             UIO_SYSSPACE, path, td);
3188         error = namei(&nd);
3189         if (error != 0) {
3190                 vrele(vp);
3191                 goto out;
3192         }
3193         NDFREE(&nd, NDF_ONLY_PNBUF);
3194         vp1 = nd.ni_vp;
3195         vrele(vp);
3196         if (vp1 == vp)
3197                 strcpy(path, rpath);
3198         else {
3199                 vput(vp1);
3200                 error = ENOENT;
3201         }
3202
3203 out:
3204         free(fbuf, M_TEMP);
3205         return (error);
3206 }
3207
3208 #ifdef DDB
3209 static void
3210 db_print_vpath(struct vnode *vp)
3211 {
3212
3213         while (vp != NULL) {
3214                 db_printf("%p: ", vp);
3215                 if (vp == rootvnode) {
3216                         db_printf("/");
3217                         vp = NULL;
3218                 } else {
3219                         if (vp->v_vflag & VV_ROOT) {
3220                                 db_printf("<mount point>");
3221                                 vp = vp->v_mount->mnt_vnodecovered;
3222                         } else {
3223                                 struct namecache *ncp;
3224                                 char *ncn;
3225                                 int i;
3226
3227                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3228                                 if (ncp != NULL) {
3229                                         ncn = ncp->nc_name;
3230                                         for (i = 0; i < ncp->nc_nlen; i++)
3231                                                 db_printf("%c", *ncn++);
3232                                         vp = ncp->nc_dvp;
3233                                 } else {
3234                                         vp = NULL;
3235                                 }
3236                         }
3237                 }
3238                 db_printf("\n");
3239         }
3240
3241         return;
3242 }
3243
3244 DB_SHOW_COMMAND(vpath, db_show_vpath)
3245 {
3246         struct vnode *vp;
3247
3248         if (!have_addr) {
3249                 db_printf("usage: show vpath <struct vnode *>\n");
3250                 return;
3251         }
3252
3253         vp = (struct vnode *)addr;
3254         db_print_vpath(vp);
3255 }
3256
3257 #endif
3258
3259 static bool __read_frequently cache_fast_lookup = true;
3260 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3261     &cache_fast_lookup, 0, "");
3262
/*
 * Internal return value of the fast path lookup helpers meaning "not
 * completed here" (see cache_fpl_aborted_impl() and
 * cache_fpl_partial_impl()).  Parenthesized so the negative constant stays a
 * single operand wherever the macro is expanded.
 */
#define	CACHE_FPL_FAILED	(-2020)
3264
3265 static void
3266 cache_fpl_cleanup_cnp(struct componentname *cnp)
3267 {
3268
3269         uma_zfree(namei_zone, cnp->cn_pnbuf);
3270 #ifdef DIAGNOSTIC
3271         cnp->cn_pnbuf = NULL;
3272         cnp->cn_nameptr = NULL;
3273 #endif
3274 }
3275
3276 static void
3277 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3278 {
3279         struct componentname *cnp;
3280
3281         cnp = &ndp->ni_cnd;
3282         while (*(cnp->cn_nameptr) == '/') {
3283                 cnp->cn_nameptr++;
3284                 ndp->ni_pathlen--;
3285         }
3286
3287         *dpp = ndp->ni_rootdir;
3288 }
3289
/*
 * Snapshot of the nameidata/componentname fields which may need to be put
 * back if the fast path lookup fails.  Filled in by cache_fpl_checkpoint()
 * and applied by cache_fpl_restore().
 */
struct nameidata_saved {
	long	cn_namelen;	/* copy of ni_cnd.cn_namelen */
	char	*cn_nameptr;	/* copy of ni_cnd.cn_nameptr */
	size_t	ni_pathlen;	/* copy of ni_pathlen */
	int	cn_flags;	/* copy of ni_cnd.cn_flags */
};
3300
3301 struct cache_fpl {
3302         struct nameidata *ndp;
3303         struct componentname *cnp;
3304         struct pwd *pwd;
3305         struct vnode *dvp;
3306         struct vnode *tvp;
3307         seqc_t dvp_seqc;
3308         seqc_t tvp_seqc;
3309         struct nameidata_saved snd;
3310         int line;
3311         enum cache_fpl_status status:8;
3312         bool in_smr;
3313         bool fsearch;
3314 };
3315
3316 static void
3317 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3318 {
3319
3320         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3321         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3322         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3323         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3324 }
3325
3326 static void
3327 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3328 {
3329
3330         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3331         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3332         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3333         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3334 }
3335
/*
 * Debug checks that the in_smr flag tracked in struct cache_fpl agrees with
 * the real SMR state.  They compile to nothing without INVARIANTS.
 */
#ifdef INVARIANTS
#define	cache_fpl_smr_assert_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);			\
	MPASS(_fpl->in_smr == true);			\
	VFS_SMR_ASSERT_ENTERED();			\
})
#define	cache_fpl_smr_assert_not_entered(fpl) ({	\
	struct cache_fpl *_fpl = (fpl);			\
	MPASS(_fpl->in_smr == false);			\
	VFS_SMR_ASSERT_NOT_ENTERED();			\
})
#else
#define	cache_fpl_smr_assert_entered(fpl)	do { } while (0)
#define	cache_fpl_smr_assert_not_entered(fpl)	do { } while (0)
#endif
3351
/*
 * Wrappers around vfs_smr_enter()/vfs_smr_exit() which keep fpl->in_smr up
 * to date.  The "initial" variant does not assert the previous value of the
 * flag; the other two assert that the section is being toggled.
 */
#define	cache_fpl_smr_enter_initial(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);			\
	vfs_smr_enter();				\
	_fpl->in_smr = true;				\
})

#define	cache_fpl_smr_enter(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);			\
	MPASS(_fpl->in_smr == false);			\
	vfs_smr_enter();				\
	_fpl->in_smr = true;				\
})

#define	cache_fpl_smr_exit(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);			\
	MPASS(_fpl->in_smr == true);			\
	vfs_smr_exit();					\
	_fpl->in_smr = false;				\
})
3371
3372 static int
3373 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3374 {
3375
3376         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3377                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3378                     ("%s: converting to abort from %d at %d, set at %d\n",
3379                     __func__, fpl->status, line, fpl->line));
3380         }
3381         fpl->status = CACHE_FPL_STATUS_ABORTED;
3382         fpl->line = line;
3383         return (CACHE_FPL_FAILED);
3384 }
3385
3386 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3387
3388 static int
3389 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3390 {
3391
3392         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3393             ("%s: setting to partial at %d, but already set to %d at %d\n",
3394             __func__, line, fpl->status, fpl->line));
3395         cache_fpl_smr_assert_entered(fpl);
3396         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3397         fpl->line = line;
3398         return (CACHE_FPL_FAILED);
3399 }
3400
3401 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3402
3403 static int
3404 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3405 {
3406
3407         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3408             ("%s: setting to handled at %d, but already set to %d at %d\n",
3409             __func__, line, fpl->status, fpl->line));
3410         cache_fpl_smr_assert_not_entered(fpl);
3411         MPASS(error != CACHE_FPL_FAILED);
3412         fpl->status = CACHE_FPL_STATUS_HANDLED;
3413         fpl->line = line;
3414         return (error);
3415 }
3416
3417 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3418
3419 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3420         (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3421          SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3422
3423 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3424         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3425
3426 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3427     "supported and internal flags overlap");
3428
3429 static bool
3430 cache_fpl_islastcn(struct nameidata *ndp)
3431 {
3432
3433         return (*ndp->ni_next == 0);
3434 }
3435
3436 static bool
3437 cache_fpl_isdotdot(struct componentname *cnp)
3438 {
3439
3440         if (cnp->cn_namelen == 2 &&
3441             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3442                 return (true);
3443         return (false);
3444 }
3445
3446 static bool
3447 cache_can_fplookup(struct cache_fpl *fpl)
3448 {
3449         struct nameidata *ndp;
3450         struct componentname *cnp;
3451         struct thread *td;
3452
3453         ndp = fpl->ndp;
3454         cnp = fpl->cnp;
3455         td = cnp->cn_thread;
3456
3457         if (!cache_fast_lookup) {
3458                 cache_fpl_aborted(fpl);
3459                 return (false);
3460         }
3461 #ifdef MAC
3462         if (mac_vnode_check_lookup_enabled()) {
3463                 cache_fpl_aborted(fpl);
3464                 return (false);
3465         }
3466 #endif
3467         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3468                 cache_fpl_aborted(fpl);
3469                 return (false);
3470         }
3471         if (IN_CAPABILITY_MODE(td)) {
3472                 cache_fpl_aborted(fpl);
3473                 return (false);
3474         }
3475         if (AUDITING_TD(td)) {
3476                 cache_fpl_aborted(fpl);
3477                 return (false);
3478         }
3479         if (ndp->ni_startdir != NULL) {
3480                 cache_fpl_aborted(fpl);
3481                 return (false);
3482         }
3483         return (true);
3484 }
3485
3486 static int
3487 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3488 {
3489         struct nameidata *ndp;
3490         int error;
3491         bool fsearch;
3492
3493         ndp = fpl->ndp;
3494         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3495         if (__predict_false(error != 0)) {
3496                 cache_fpl_smr_exit(fpl);
3497                 return (cache_fpl_aborted(fpl));
3498         }
3499         fpl->fsearch = fsearch;
3500         return (0);
3501 }
3502
3503 static bool
3504 cache_fplookup_vnode_supported(struct vnode *vp)
3505 {
3506
3507         return (vp->v_type != VLNK);
3508 }
3509
3510 static int __noinline
3511 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3512     uint32_t hash)
3513 {
3514         struct componentname *cnp;
3515         struct vnode *dvp;
3516
3517         cnp = fpl->cnp;
3518         dvp = fpl->dvp;
3519
3520         cache_fpl_smr_exit(fpl);
3521         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3522                 return (cache_fpl_handled(fpl, ENOENT));
3523         else
3524                 return (cache_fpl_aborted(fpl));
3525 }
3526
3527 /*
3528  * The target vnode is not supported, prepare for the slow path to take over.
3529  */
3530 static int __noinline
3531 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3532 {
3533         struct nameidata *ndp;
3534         struct componentname *cnp;
3535         enum vgetstate dvs;
3536         struct vnode *dvp;
3537         struct pwd *pwd;
3538         seqc_t dvp_seqc;
3539
3540         ndp = fpl->ndp;
3541         cnp = fpl->cnp;
3542         pwd = fpl->pwd;
3543         dvp = fpl->dvp;
3544         dvp_seqc = fpl->dvp_seqc;
3545
3546         if (!pwd_hold_smr(pwd)) {
3547                 cache_fpl_smr_exit(fpl);
3548                 return (cache_fpl_aborted(fpl));
3549         }
3550
3551         dvs = vget_prep_smr(dvp);
3552         cache_fpl_smr_exit(fpl);
3553         if (__predict_false(dvs == VGET_NONE)) {
3554                 pwd_drop(pwd);
3555                 return (cache_fpl_aborted(fpl));
3556         }
3557
3558         vget_finish_ref(dvp, dvs);
3559         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3560                 vrele(dvp);
3561                 pwd_drop(pwd);
3562                 return (cache_fpl_aborted(fpl));
3563         }
3564
3565         cache_fpl_restore(fpl, &fpl->snd);
3566
3567         ndp->ni_startdir = dvp;
3568         cnp->cn_flags |= MAKEENTRY;
3569         if (cache_fpl_islastcn(ndp))
3570                 cnp->cn_flags |= ISLASTCN;
3571         if (cache_fpl_isdotdot(cnp))
3572                 cnp->cn_flags |= ISDOTDOT;
3573
3574         return (0);
3575 }
3576
3577 static int
3578 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3579 {
3580         struct componentname *cnp;
3581         struct vnode *tvp;
3582         seqc_t tvp_seqc;
3583         int error, lkflags;
3584
3585         cnp = fpl->cnp;
3586         tvp = fpl->tvp;
3587         tvp_seqc = fpl->tvp_seqc;
3588
3589         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3590                 lkflags = LK_SHARED;
3591                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3592                         lkflags = LK_EXCLUSIVE;
3593                 error = vget_finish(tvp, lkflags, tvs);
3594                 if (__predict_false(error != 0)) {
3595                         return (cache_fpl_aborted(fpl));
3596                 }
3597         } else {
3598                 vget_finish_ref(tvp, tvs);
3599         }
3600
3601         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3602                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3603                         vput(tvp);
3604                 else
3605                         vrele(tvp);
3606                 return (cache_fpl_aborted(fpl));
3607         }
3608
3609         return (cache_fpl_handled(fpl, 0));
3610 }
3611
3612 /*
3613  * They want to possibly modify the state of the namecache.
3614  *
3615  * Don't try to match the API contract, just leave.
3616  * TODO: this leaves scalability on the table
3617  */
3618 static int
3619 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3620 {
3621         struct componentname *cnp;
3622
3623         cnp = fpl->cnp;
3624         MPASS(cnp->cn_nameiop != LOOKUP);
3625         return (cache_fpl_partial(fpl));
3626 }
3627
3628 static int __noinline
3629 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3630 {
3631         struct componentname *cnp;
3632         enum vgetstate dvs, tvs;
3633         struct vnode *dvp, *tvp;
3634         seqc_t dvp_seqc;
3635         int error;
3636
3637         cnp = fpl->cnp;
3638         dvp = fpl->dvp;
3639         dvp_seqc = fpl->dvp_seqc;
3640         tvp = fpl->tvp;
3641
3642         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3643
3644         /*
3645          * This is less efficient than it can be for simplicity.
3646          */
3647         dvs = vget_prep_smr(dvp);
3648         if (__predict_false(dvs == VGET_NONE)) {
3649                 return (cache_fpl_aborted(fpl));
3650         }
3651         tvs = vget_prep_smr(tvp);
3652         if (__predict_false(tvs == VGET_NONE)) {
3653                 cache_fpl_smr_exit(fpl);
3654                 vget_abort(dvp, dvs);
3655                 return (cache_fpl_aborted(fpl));
3656         }
3657
3658         cache_fpl_smr_exit(fpl);
3659
3660         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3661                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3662                 if (__predict_false(error != 0)) {
3663                         vget_abort(tvp, tvs);
3664                         return (cache_fpl_aborted(fpl));
3665                 }
3666         } else {
3667                 vget_finish_ref(dvp, dvs);
3668         }
3669
3670         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3671                 vget_abort(tvp, tvs);
3672                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3673                         vput(dvp);
3674                 else
3675                         vrele(dvp);
3676                 return (cache_fpl_aborted(fpl));
3677         }
3678
3679         error = cache_fplookup_final_child(fpl, tvs);
3680         if (__predict_false(error != 0)) {
3681                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3682                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3683                         vput(dvp);
3684                 else
3685                         vrele(dvp);
3686                 return (error);
3687         }
3688
3689         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3690         return (0);
3691 }
3692
3693 static int
3694 cache_fplookup_final(struct cache_fpl *fpl)
3695 {
3696         struct componentname *cnp;
3697         enum vgetstate tvs;
3698         struct vnode *dvp, *tvp;
3699         seqc_t dvp_seqc;
3700
3701         cnp = fpl->cnp;
3702         dvp = fpl->dvp;
3703         dvp_seqc = fpl->dvp_seqc;
3704         tvp = fpl->tvp;
3705
3706         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3707
3708         if (cnp->cn_nameiop != LOOKUP) {
3709                 return (cache_fplookup_final_modifying(fpl));
3710         }
3711
3712         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3713                 return (cache_fplookup_final_withparent(fpl));
3714
3715         tvs = vget_prep_smr(tvp);
3716         if (__predict_false(tvs == VGET_NONE)) {
3717                 return (cache_fpl_partial(fpl));
3718         }
3719
3720         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3721                 cache_fpl_smr_exit(fpl);
3722                 vget_abort(tvp, tvs);
3723                 return (cache_fpl_aborted(fpl));
3724         }
3725
3726         cache_fpl_smr_exit(fpl);
3727         return (cache_fplookup_final_child(fpl, tvs));
3728 }
3729
3730 static int __noinline
3731 cache_fplookup_dot(struct cache_fpl *fpl)
3732 {
3733         struct vnode *dvp;
3734
3735         dvp = fpl->dvp;
3736
3737         fpl->tvp = dvp;
3738         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3739         if (seqc_in_modify(fpl->tvp_seqc)) {
3740                 return (cache_fpl_aborted(fpl));
3741         }
3742
3743         counter_u64_add(dothits, 1);
3744         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3745
3746         return (0);
3747 }
3748
3749 static int __noinline
3750 cache_fplookup_dotdot(struct cache_fpl *fpl)
3751 {
3752         struct nameidata *ndp;
3753         struct componentname *cnp;
3754         struct namecache *ncp;
3755         struct vnode *dvp;
3756         struct prison *pr;
3757         u_char nc_flag;
3758
3759         ndp = fpl->ndp;
3760         cnp = fpl->cnp;
3761         dvp = fpl->dvp;
3762
3763         /*
3764          * XXX this is racy the same way regular lookup is
3765          */
3766         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3767             pr = pr->pr_parent)
3768                 if (dvp == pr->pr_root)
3769                         break;
3770
3771         if (dvp == ndp->ni_rootdir ||
3772             dvp == ndp->ni_topdir ||
3773             dvp == rootvnode ||
3774             pr != NULL) {
3775                 fpl->tvp = dvp;
3776                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3777                 if (seqc_in_modify(fpl->tvp_seqc)) {
3778                         return (cache_fpl_aborted(fpl));
3779                 }
3780                 return (0);
3781         }
3782
3783         if ((dvp->v_vflag & VV_ROOT) != 0) {
3784                 /*
3785                  * TODO
3786                  * The opposite of climb mount is needed here.
3787                  */
3788                 return (cache_fpl_aborted(fpl));
3789         }
3790
3791         ncp = atomic_load_ptr(&dvp->v_cache_dd);
3792         if (ncp == NULL) {
3793                 return (cache_fpl_aborted(fpl));
3794         }
3795
3796         nc_flag = atomic_load_char(&ncp->nc_flag);
3797         if ((nc_flag & NCF_ISDOTDOT) != 0) {
3798                 if ((nc_flag & NCF_NEGATIVE) != 0)
3799                         return (cache_fpl_aborted(fpl));
3800                 fpl->tvp = ncp->nc_vp;
3801         } else {
3802                 fpl->tvp = ncp->nc_dvp;
3803         }
3804
3805         if (__predict_false(!cache_ncp_canuse(ncp))) {
3806                 return (cache_fpl_aborted(fpl));
3807         }
3808
3809         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3810         if (seqc_in_modify(fpl->tvp_seqc)) {
3811                 return (cache_fpl_partial(fpl));
3812         }
3813
3814         counter_u64_add(dotdothits, 1);
3815         return (0);
3816 }
3817
3818 static int
3819 cache_fplookup_next(struct cache_fpl *fpl)
3820 {
3821         struct componentname *cnp;
3822         struct namecache *ncp;
3823         struct negstate *ns;
3824         struct vnode *dvp, *tvp;
3825         u_char nc_flag;
3826         uint32_t hash;
3827         bool neg_hot;
3828
3829         cnp = fpl->cnp;
3830         dvp = fpl->dvp;
3831
3832         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3833                 return (cache_fplookup_dot(fpl));
3834         }
3835
3836         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3837
3838         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3839                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3840                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3841                         break;
3842         }
3843
3844         /*
3845          * If there is no entry we have to punt to the slow path to perform
3846          * actual lookup. Should there be nothing with this name a negative
3847          * entry will be created.
3848          */
3849         if (__predict_false(ncp == NULL)) {
3850                 return (cache_fpl_partial(fpl));
3851         }
3852
3853         tvp = atomic_load_ptr(&ncp->nc_vp);
3854         nc_flag = atomic_load_char(&ncp->nc_flag);
3855         if ((nc_flag & NCF_NEGATIVE) != 0) {
3856                 /*
3857                  * If they want to create an entry we need to replace this one.
3858                  */
3859                 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3860                         return (cache_fpl_partial(fpl));
3861                 }
3862                 ns = NCP2NEGSTATE(ncp);
3863                 neg_hot = ((ns->neg_flag & NEG_HOT) != 0);
3864                 if (__predict_false(!cache_ncp_canuse(ncp))) {
3865                         return (cache_fpl_partial(fpl));
3866                 }
3867                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3868                         return (cache_fpl_partial(fpl));
3869                 }
3870                 if (!neg_hot) {
3871                         return (cache_fplookup_negative_promote(fpl, ncp, hash));
3872                 }
3873                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3874                     ncp->nc_name);
3875                 counter_u64_add(numneghits, 1);
3876                 cache_fpl_smr_exit(fpl);
3877                 return (cache_fpl_handled(fpl, ENOENT));
3878         }
3879
3880         if (__predict_false(!cache_ncp_canuse(ncp))) {
3881                 return (cache_fpl_partial(fpl));
3882         }
3883
3884         fpl->tvp = tvp;
3885         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3886         if (seqc_in_modify(fpl->tvp_seqc)) {
3887                 return (cache_fpl_partial(fpl));
3888         }
3889
3890         if (!cache_fplookup_vnode_supported(tvp)) {
3891                 return (cache_fpl_partial(fpl));
3892         }
3893
3894         counter_u64_add(numposhits, 1);
3895         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3896         return (0);
3897 }
3898
3899 static bool
3900 cache_fplookup_mp_supported(struct mount *mp)
3901 {
3902
3903         if (mp == NULL)
3904                 return (false);
3905         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3906                 return (false);
3907         return (true);
3908 }
3909
3910 /*
3911  * Walk up the mount stack (if any).
3912  *
3913  * Correctness is provided in the following ways:
3914  * - all vnodes are protected from freeing with SMR
3915  * - struct mount objects are type stable making them always safe to access
3916  * - stability of the particular mount is provided by busying it
3917  * - relationship between the vnode which is mounted on and the mount is
3918  *   verified with the vnode sequence counter after busying
3919  * - association between root vnode of the mount and the mount is protected
3920  *   by busy
3921  *
3922  * From that point on we can read the sequence counter of the root vnode
3923  * and get the next mount on the stack (if any) using the same protection.
3924  *
3925  * By the end of successful walk we are guaranteed the reached state was
3926  * indeed present at least at some point which matches the regular lookup.
3927  */
3928 static int __noinline
3929 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3930 {
3931         struct mount *mp, *prev_mp;
3932         struct vnode *vp;
3933         seqc_t vp_seqc;
3934
3935         vp = fpl->tvp;
3936         vp_seqc = fpl->tvp_seqc;
3937
3938         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3939         mp = atomic_load_ptr(&vp->v_mountedhere);
3940         if (mp == NULL)
3941                 return (0);
3942
3943         prev_mp = NULL;
3944         for (;;) {
3945                 if (!vfs_op_thread_enter_crit(mp)) {
3946                         if (prev_mp != NULL)
3947                                 vfs_op_thread_exit_crit(prev_mp);
3948                         return (cache_fpl_partial(fpl));
3949                 }
3950                 if (prev_mp != NULL)
3951                         vfs_op_thread_exit_crit(prev_mp);
3952                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3953                         vfs_op_thread_exit_crit(mp);
3954                         return (cache_fpl_partial(fpl));
3955                 }
3956                 if (!cache_fplookup_mp_supported(mp)) {
3957                         vfs_op_thread_exit_crit(mp);
3958                         return (cache_fpl_partial(fpl));
3959                 }
3960                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3961                 if (vp == NULL || VN_IS_DOOMED(vp)) {
3962                         vfs_op_thread_exit_crit(mp);
3963                         return (cache_fpl_partial(fpl));
3964                 }
3965                 vp_seqc = vn_seqc_read_any(vp);
3966                 if (seqc_in_modify(vp_seqc)) {
3967                         vfs_op_thread_exit_crit(mp);
3968                         return (cache_fpl_partial(fpl));
3969                 }
3970                 prev_mp = mp;
3971                 mp = atomic_load_ptr(&vp->v_mountedhere);
3972                 if (mp == NULL)
3973                         break;
3974         }
3975
3976         vfs_op_thread_exit_crit(prev_mp);
3977         fpl->tvp = vp;
3978         fpl->tvp_seqc = vp_seqc;
3979         return (0);
3980 }
3981
3982 static bool
3983 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3984 {
3985         struct mount *mp;
3986         struct vnode *vp;
3987
3988         vp = fpl->tvp;
3989
3990         /*
3991          * Hack: while this is a union, the pointer tends to be NULL so save on
3992          * a branch.
3993          */
3994         mp = atomic_load_ptr(&vp->v_mountedhere);
3995         if (mp == NULL)
3996                 return (false);
3997         if (vp->v_type == VDIR)
3998                 return (true);
3999         return (false);
4000 }
4001
4002 /*
4003  * Parse the path.
4004  *
4005  * The code was originally copy-pasted from regular lookup and despite
4006  * clean ups leaves performance on the table. Any modifications here
4007  * must take into account that in case off fallback the resulting
4008  * nameidata state has to be compatible with the original.
4009  */
4010 static int
4011 cache_fplookup_parse(struct cache_fpl *fpl)
4012 {
4013         struct nameidata *ndp;
4014         struct componentname *cnp;
4015         char *cp;
4016
4017         ndp = fpl->ndp;
4018         cnp = fpl->cnp;
4019
4020         /*
4021          * Search a new directory.
4022          *
4023          * The last component of the filename is left accessible via
4024          * cnp->cn_nameptr for callers that need the name. Callers needing
4025          * the name set the SAVENAME flag. When done, they assume
4026          * responsibility for freeing the pathname buffer.
4027          */
4028         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4029                 continue;
4030         cnp->cn_namelen = cp - cnp->cn_nameptr;
4031         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4032                 cache_fpl_smr_exit(fpl);
4033                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4034         }
4035         ndp->ni_pathlen -= cnp->cn_namelen;
4036         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4037             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4038         ndp->ni_next = cp;
4039
4040         /*
4041          * Replace multiple slashes by a single slash and trailing slashes
4042          * by a null.  This must be done before VOP_LOOKUP() because some
4043          * fs's don't know about trailing slashes.  Remember if there were
4044          * trailing slashes to handle symlinks, existing non-directories
4045          * and non-existing files that won't be directories specially later.
4046          */
4047         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4048                 cp++;
4049                 ndp->ni_pathlen--;
4050                 if (*cp == '\0') {
4051                         /*
4052                          * TODO
4053                          * Regular lookup performs the following:
4054                          * *ndp->ni_next = '\0';
4055                          * cnp->cn_flags |= TRAILINGSLASH;
4056                          *
4057                          * Which is problematic since it modifies data read
4058                          * from userspace. Then if fast path lookup was to
4059                          * abort we would have to either restore it or convey
4060                          * the flag. Since this is a corner case just ignore
4061                          * it for simplicity.
4062                          */
4063                         return (cache_fpl_partial(fpl));
4064                 }
4065         }
4066         ndp->ni_next = cp;
4067
4068         /*
4069          * Check for degenerate name (e.g. / or "")
4070          * which is a way of talking about a directory,
4071          * e.g. like "/." or ".".
4072          *
4073          * TODO
4074          * Another corner case handled by the regular lookup
4075          */
4076         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4077                 return (cache_fpl_partial(fpl));
4078         }
4079         return (0);
4080 }
4081
4082 static void
4083 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4084 {
4085         struct nameidata *ndp;
4086         struct componentname *cnp;
4087
4088         ndp = fpl->ndp;
4089         cnp = fpl->cnp;
4090
4091         cnp->cn_nameptr = ndp->ni_next;
4092         while (*cnp->cn_nameptr == '/') {
4093                 cnp->cn_nameptr++;
4094                 ndp->ni_pathlen--;
4095         }
4096 }
4097
4098 /*
4099  * See the API contract for VOP_FPLOOKUP_VEXEC.
4100  */
4101 static int __noinline
4102 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4103 {
4104         struct componentname *cnp;
4105         struct vnode *dvp;
4106         seqc_t dvp_seqc;
4107
4108         cnp = fpl->cnp;
4109         dvp = fpl->dvp;
4110         dvp_seqc = fpl->dvp_seqc;
4111
4112         /*
4113          * Hack: they may be looking up foo/bar, where foo is a
4114          * regular file. In such a case we need to turn ENOTDIR,
4115          * but we may happen to get here with a different error.
4116          */
4117         if (dvp->v_type != VDIR) {
4118                 /*
4119                  * The check here is predominantly to catch
4120                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4121                  * gets doomed past this point it is going to
4122                  * fail seqc verification.
4123                  */
4124                 if (VN_IS_DOOMED(dvp)) {
4125                         return (cache_fpl_aborted(fpl));
4126                 }
4127                 error = ENOTDIR;
4128         }
4129
4130         /*
4131          * Hack: handle O_SEARCH.
4132          *
4133          * Open Group Base Specifications Issue 7, 2018 edition states:
4134          * If the access mode of the open file description associated with the
4135          * file descriptor is not O_SEARCH, the function shall check whether
4136          * directory searches are permitted using the current permissions of
4137          * the directory underlying the file descriptor. If the access mode is
4138          * O_SEARCH, the function shall not perform the check.
4139          *
4140          * Regular lookup tests for the NOEXECCHECK flag for every path
4141          * component to decide whether to do the permission check. However,
4142          * since most lookups never have the flag (and when they do it is only
4143          * present for the first path component), lockless lookup only acts on
4144          * it if there is a permission problem. Here the flag is represented
4145          * with a boolean so that we don't have to clear it on the way out.
4146          *
4147          * For simplicity this always aborts.
4148          * TODO: check if this is the first lookup and ignore the permission
4149          * problem. Note the flag has to survive fallback (if it happens to be
4150          * performed).
4151          */
4152         if (fpl->fsearch) {
4153                 return (cache_fpl_aborted(fpl));
4154         }
4155
4156         switch (error) {
4157         case EAGAIN:
4158                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4159                         error = cache_fpl_aborted(fpl);
4160                 } else {
4161                         cache_fpl_partial(fpl);
4162                 }
4163                 break;
4164         default:
4165                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4166                         error = cache_fpl_aborted(fpl);
4167                 } else {
4168                         cache_fpl_smr_exit(fpl);
4169                         cache_fpl_handled(fpl, error);
4170                 }
4171                 break;
4172         }
4173         return (error);
4174 }
4175
4176 static int
4177 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4178 {
4179         struct nameidata *ndp;
4180         struct componentname *cnp;
4181         struct mount *mp;
4182         int error;
4183
4184         error = CACHE_FPL_FAILED;
4185         ndp = fpl->ndp;
4186         cnp = fpl->cnp;
4187
4188         cache_fpl_checkpoint(fpl, &fpl->snd);
4189
4190         fpl->dvp = dvp;
4191         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4192         if (seqc_in_modify(fpl->dvp_seqc)) {
4193                 cache_fpl_aborted(fpl);
4194                 goto out;
4195         }
4196         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4197         if (!cache_fplookup_mp_supported(mp)) {
4198                 cache_fpl_aborted(fpl);
4199                 goto out;
4200         }
4201
4202         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4203
4204         for (;;) {
4205                 error = cache_fplookup_parse(fpl);
4206                 if (__predict_false(error != 0)) {
4207                         break;
4208                 }
4209
4210                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4211
4212                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4213                 if (__predict_false(error != 0)) {
4214                         error = cache_fplookup_failed_vexec(fpl, error);
4215                         break;
4216                 }
4217
4218                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4219                         error = cache_fplookup_dotdot(fpl);
4220                         if (__predict_false(error != 0)) {
4221                                 break;
4222                         }
4223                 } else {
4224                         error = cache_fplookup_next(fpl);
4225                         if (__predict_false(error != 0)) {
4226                                 break;
4227                         }
4228
4229                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4230
4231                         if (cache_fplookup_need_climb_mount(fpl)) {
4232                                 error = cache_fplookup_climb_mount(fpl);
4233                                 if (__predict_false(error != 0)) {
4234                                         break;
4235                                 }
4236                         }
4237                 }
4238
4239                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4240
4241                 if (cache_fpl_islastcn(ndp)) {
4242                         error = cache_fplookup_final(fpl);
4243                         break;
4244                 }
4245
4246                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4247                         error = cache_fpl_aborted(fpl);
4248                         break;
4249                 }
4250
4251                 fpl->dvp = fpl->tvp;
4252                 fpl->dvp_seqc = fpl->tvp_seqc;
4253
4254                 cache_fplookup_parse_advance(fpl);
4255                 cache_fpl_checkpoint(fpl, &fpl->snd);
4256         }
4257 out:
4258         switch (fpl->status) {
4259         case CACHE_FPL_STATUS_UNSET:
4260                 __assert_unreachable();
4261                 break;
4262         case CACHE_FPL_STATUS_PARTIAL:
4263                 cache_fpl_smr_assert_entered(fpl);
4264                 return (cache_fplookup_partial_setup(fpl));
4265         case CACHE_FPL_STATUS_ABORTED:
4266                 if (fpl->in_smr)
4267                         cache_fpl_smr_exit(fpl);
4268                 return (CACHE_FPL_FAILED);
4269         case CACHE_FPL_STATUS_HANDLED:
4270                 MPASS(error != CACHE_FPL_FAILED);
4271                 cache_fpl_smr_assert_not_entered(fpl);
4272                 if (__predict_false(error != 0)) {
4273                         ndp->ni_dvp = NULL;
4274                         ndp->ni_vp = NULL;
4275                         cache_fpl_cleanup_cnp(cnp);
4276                         return (error);
4277                 }
4278                 ndp->ni_dvp = fpl->dvp;
4279                 ndp->ni_vp = fpl->tvp;
4280                 if (cnp->cn_flags & SAVENAME)
4281                         cnp->cn_flags |= HASBUF;
4282                 else
4283                         cache_fpl_cleanup_cnp(cnp);
4284                 return (error);
4285         }
4286 }
4287
4288 /*
4289  * Fast path lookup protected with SMR and sequence counters.
4290  *
4291  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4292  *
4293  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4294  * outlined below.
4295  *
4296  * Traditional vnode lookup conceptually looks like this:
4297  *
4298  * vn_lock(current);
4299  * for (;;) {
4300  *      next = find();
4301  *      vn_lock(next);
4302  *      vn_unlock(current);
4303  *      current = next;
4304  *      if (last)
4305  *          break;
4306  * }
4307  * return (current);
4308  *
4309  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4310  * any modifications thanks to holding respective locks.
4311  *
4312  * The same guarantee can be provided with a combination of safe memory
4313  * reclamation and sequence counters instead. If all operations which affect
4314  * the relationship between the current vnode and the one we are looking for
4315  * also modify the counter, we can verify whether all the conditions held as
4316  * we made the jump. This includes things like permissions, mount points etc.
4317  * Counter modification is provided by enclosing relevant places in
4318  * vn_seqc_write_begin()/end() calls.
4319  *
4320  * Thus this translates to:
4321  *
4322  * vfs_smr_enter();
4323  * dvp_seqc = seqc_read_any(dvp);
4324  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4325  *     abort();
4326  * for (;;) {
4327  *      tvp = find();
4328  *      tvp_seqc = seqc_read_any(tvp);
4329  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4330  *          abort();
4331  *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4332  *          abort();
4333  *      dvp = tvp; // we know nothing of importance has changed
4334  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4335  *      if (last)
4336  *          break;
4337  * }
4338  * vget(); // secure the vnode
4339  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4340  *          abort();
4341  * // at this point we know nothing has changed for any parent<->child pair
4342  * // as they were crossed during the lookup, meaning we matched the guarantee
4343  * // of the locked variant
4344  * return (tvp);
4345  *
4346  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4347  * - they are called while within vfs_smr protection which they must never exit
4348  * - EAGAIN can be returned to denote checking could not be performed, it is
4349  *   always valid to return it
4350  * - if the sequence counter has not changed the result must be valid
4351  * - if the sequence counter has changed both false positives and false negatives
4352  *   are permitted (since the result will be rejected later)
4353  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4354  *
4355  * Caveats to watch out for:
4356  * - vnodes are passed unlocked and unreferenced with nothing stopping
4357  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4358  *   to use atomic_load_ptr to fetch it.
4359  * - the aforementioned object can also get freed, meaning absent other means it
4360  *   should be protected with vfs_smr
4361  * - either safely checking permissions as they are modified or guaranteeing
4362  *   their stability is left to the routine
4363  */
/*
 * Entry point of the lockless fast path lookup.
 *
 * ndp    - lookup state; its embedded componentname (ni_cnd) supplies the
 *          path buffer and flags, and ni_rootdir/ni_topdir are filled in here.
 * status - out: always set to the final fpl.status before returning.
 * pwdp   - out: written only when the final status is
 *          CACHE_FPL_STATUS_PARTIAL, in which case it receives the pwd
 *          obtained with pwd_get_smr().
 *
 * Returns EOPNOTSUPP when cache_can_fplookup() rejects the request, otherwise
 * whatever cache_fplookup_dirfd() or cache_fplookup_impl() produced.
 */
4364 int
4365 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4366     struct pwd **pwdp)
4367 {
4368         struct cache_fpl fpl;
4369         struct pwd *pwd;
4370         struct vnode *dvp;
4371         struct componentname *cnp;
4372         struct nameidata_saved orig;
4373         int error;
4374
             /* The caller must hand in a nameidata with ni_lcf cleared. */
4375         MPASS(ndp->ni_lcf == 0);
4376
             /* Start with no verdict; the status is inspected at the end. */
4377         fpl.status = CACHE_FPL_STATUS_UNSET;
4378         fpl.ndp = ndp;
4379         fpl.cnp = &ndp->ni_cnd;
4380         MPASS(curthread == fpl.cnp->cn_thread);
4381
             /* SAVESTART is only expected for operations other than LOOKUP. */
4382         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4383                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4384
             /*
              * Bail out before touching any state if the request is not
              * eligible. fpl.line and fpl.status are read right below, so
              * cache_can_fplookup() is presumably what sets them on
              * rejection -- NOTE(review): confirm against its definition.
              */
4385         if (!cache_can_fplookup(&fpl)) {
4386                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4387                 *status = fpl.status;
4388                 return (EOPNOTSUPP);
4389         }
4390
             /*
              * Record state in 'orig'; it is handed to cache_fpl_restore()
              * below if the lookup ends up aborted.
              */
4391         cache_fpl_checkpoint(&fpl, &orig);
4392
4393         cache_fpl_smr_enter_initial(&fpl);
4394         fpl.fsearch = false;
             /* Fetch the pwd and publish its root and jail directories in ndp. */
4395         pwd = pwd_get_smr();
4396         fpl.pwd = pwd;
4397         ndp->ni_rootdir = pwd->pwd_rdir;
4398         ndp->ni_topdir = pwd->pwd_jdir;
4399
             /*
              * Pick the starting directory: the root handler for absolute
              * paths, the current directory for AT_FDCWD, or whatever the
              * directory descriptor resolves to.
              */
4400         cnp = fpl.cnp;
4401         cnp->cn_nameptr = cnp->cn_pnbuf;
4402         if (cnp->cn_pnbuf[0] == '/') {
4403                 cache_fpl_handle_root(ndp, &dvp);
4404         } else {
4405                 if (ndp->ni_dirfd == AT_FDCWD) {
4406                         dvp = pwd->pwd_cdir;
4407                 } else {
4408                         error = cache_fplookup_dirfd(&fpl, &dvp);
4409                         if (__predict_false(error != 0)) {
                                     /*
                                      * dvp is not used past 'out'; the
                                      * assertion there requires the SMR
                                      * section to have been left already.
                                      */
4410                                 goto out;
4411                         }
4412                 }
4413         }
4414
4415         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4416
4417         error = cache_fplookup_impl(dvp, &fpl);
4418 out:
             /* Every path reaching this point must have left the SMR section. */
4419         cache_fpl_smr_assert_not_entered(&fpl);
4420         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4421
             /* Report the verdict to the caller, then act on it. */
4422         *status = fpl.status;
4423         switch (fpl.status) {
4424         case CACHE_FPL_STATUS_UNSET:
                     /* A verdict must have been recorded by now. */
4425                 __assert_unreachable();
4426                 break;
4427         case CACHE_FPL_STATUS_HANDLED:
                     /* The resulting vnode is only reported on success. */
4428                 SDT_PROBE3(vfs, namei, lookup, return, error,
4429                     (error == 0 ? ndp->ni_vp : NULL), true);
4430                 break;
4431         case CACHE_FPL_STATUS_PARTIAL:
                     /* Hand the pwd fetched above over to the caller. */
4432                 *pwdp = fpl.pwd;
4433                 /*
4434                  * Status restored by cache_fplookup_partial_setup.
4435                  */
4436                 break;
4437         case CACHE_FPL_STATUS_ABORTED:
                     /* Put back the state captured by cache_fpl_checkpoint(). */
4438                 cache_fpl_restore(&fpl, &orig);
4439                 break;
4440         }
4441         return (error);
4442 }