sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/rwlock.h>
  59 #include <sys/seqc.h>
  60 #include <sys/sdt.h>
  61 #include <sys/smr.h>
  62 #include <sys/smp.h>
  63 #include <sys/syscallsubr.h>
  64 #include <sys/sysctl.h>
  65 #include <sys/sysproto.h>
  66 #include <sys/vnode.h>
  67 #include <ck_queue.h>
  68 #ifdef KTRACE
  69 #include <sys/ktrace.h>
  70 #endif
  71
  72 #include <sys/capsicum.h>
  73
  74 #include <security/audit/audit.h>
  75 #include <security/mac/mac_framework.h>
  76
  77 #ifdef DDB
  78 #include <ddb/ddb.h>
  79 #endif
  80
  81 #include <vm/uma.h>
  82
   83 SDT_PROVIDER_DECLARE(vfs);
      /*
       * Statically defined tracing probes for the name cache, all under the
       * "vfs" provider and the "namecache" module.  The third and fourth macro
       * arguments name the probe (e.g. lookup/hit, zap/done); the trailing
       * strings give the C types of the probe arguments.
       */
   84 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
   85     "struct vnode *");
   86 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
   87     "char *");
   88 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
   89 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
   90     "char *", "struct vnode *");
   91 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
   92 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
   93     "struct vnode *", "char *");
   94 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
   95     "struct vnode *");
   96 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
   97     "struct vnode *", "char *");
   98 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
   99     "char *");
  100 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
  101 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
  102 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
  103 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
  104     "struct vnode *");
  105 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
  106     "char *");
  107 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
  108     "char *");
  109
      /*
       * Probe for the "fplookup" module, plus declarations of the namei
       * lookup probes, which are only declared (not defined) here.
       * NOTE(review): presumably the namei probes are defined in the namei
       * code -- confirm.
       */
  110 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
  111 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
  112 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 113
  114 /*
  115  * These structures describe the elements in the cache of recent
  116  * names looked up by namei.
       *
       * struct negstate is the per-entry state of a negative entry.  It shares
       * storage with the target vnode pointer (see n_un in struct namecache),
       * which is why it must not be larger than a pointer.
  117  */
  118 struct negstate {
  119         u_char neg_flag;        /* NEG_* flags */
  120 };
  121 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
  122     "the state must fit in a union with a pointer without growing it");
  123
      /*
       * A single cache entry.  The name is stored inline, directly after the
       * fixed part of the structure (nc_name is a zero-length trailing array),
       * so the allocation size depends on the name length; see the
       * CACHE_ZONE_*_SIZE macros and cache_alloc().
       */
  124 struct  namecache {
  125         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  126         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list; links negative entries on a neglist */
  127         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
  128         struct  vnode *nc_dvp;          /* vnode of parent of name */
  129         union {
  130                 struct  vnode *nu_vp;   /* vnode the name refers to */
  131                 struct  negstate nu_neg;/* negative entry state */
  132         } n_un;
  133         u_char  nc_flag;                /* flag bits (NCF_*) */
  134         u_char  nc_nlen;                /* length of name */
  135         char    nc_name[0];             /* segment name + nul */
  136 };
 137
  138 /*
  139  * struct namecache_ts embeds struct namecache as its last member (nc_nc),
  140  * preceded by the timestamp fields.
  141  * struct namecache_ts is used in place of struct namecache when time(s) need
  142  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  143  * both a non-dotdot directory name plus dotdot for the directory's
  144  * parent.
       *
       * cache_alloc() hands out a pointer to the embedded nc_nc; code which
       * needs the timestamps gets back to the container with __containerof()
       * (see cache_free() and cache_out_ts()).
  145  *
  146  * See below for alignment requirement.
  147  */
  148 struct  namecache_ts {
  149         struct  timespec nc_time;       /* timespec provided by fs */
  150         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  151         int     nc_ticks;               /* ticks value when entry was added */
  152         struct namecache nc_nc;         /* the entry proper; must stay last, the name follows it */
  153 };
 154
  155 /*
  156  * At least mips n32 performs 64-bit accesses to timespec as found
  157  * in namecache_ts and requires them to be aligned. Since others
  158  * may be in the same spot suffer a little bit and enforce the
  159  * alignment for everyone. Note this is a nop for 64-bit platforms.
  160  */
  161 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
      /* Longest name (excluding the nul) which is still served by the small zones. */
  162 #define CACHE_PATH_CUTOFF       39
  163
      /*
       * Item sizes: the fixed header (with or without timestamps) plus room for
       * the longest name the size class accepts and its terminating nul.
       */
  164 #define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
  165 #define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
  166 #define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
  167 #define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)
  168
      /*
       * Every item size must be a multiple of the alignment.
       * NOTE(review): the "+ 1" suggests CACHE_ZONE_ALIGNMENT is a mask
       * (alignment - 1) -- confirm against UMA_ALIGNOF.
       */
  169 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  170 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  171 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  172 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  173
      /* Shorthand accessors for the members of the n_un union in struct namecache. */
  174 #define nc_vp           n_un.nu_vp
  175 #define nc_neg          n_un.nu_neg
 176
  177 /*
  178  * Flags in namecache.nc_flag
  179  */
  180 #define NCF_WHITE       0x01
  181 #define NCF_ISDOTDOT    0x02
  182 #define NCF_TS          0x04    /* entry is embedded in a struct namecache_ts */
  183 #define NCF_DTS         0x08
  184 #define NCF_DVDROP      0x10    /* cache_free() calls vdrop() on nc_dvp */
  185 #define NCF_NEGATIVE    0x20    /* negative entry; n_un holds nu_neg */
  186 #define NCF_INVALID     0x40    /* set by cache_ncp_invalidate() */
  187 #define NCF_WIP         0x80    /* like NCF_INVALID, makes cache_ncp_canuse() fail */
  188
  189 /*
  190  * Flags in negstate.neg_flag
  191  */
  192 #define NEG_HOT         0x01    /* entry sits on the ncneg_hot list */
 193
 194 /*
 195  * Mark an entry as invalid.
 196  *
 197  * This is called before it starts getting deconstructed.
 198  */
 199 static void
 200 cache_ncp_invalidate(struct namecache *ncp)
 201 {
 202
 203         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 204             ("%s: entry %p already invalid", __func__, ncp));
 205         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 206         atomic_thread_fence_rel();
 207 }
 208
 209 /*
 210  * Check whether the entry can be safely used.
 211  *
 212  * All places which elide locks are supposed to call this after they are
 213  * done with reading from an entry.
 214  */
 215 static bool
 216 cache_ncp_canuse(struct namecache *ncp)
 217 {
 218
 219         atomic_thread_fence_acq();
 220         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 221 }
 222
 223 /*
 224  * Name caching works as follows:
 225  *
 226  * Names found by directory scans are retained in a cache
 227  * for future reference.  It is managed LRU, so frequently
 228  * used names will hang around.  Cache is indexed by hash value
 229  * obtained from (dvp, name) where dvp refers to the directory
 230  * containing name.
 231  *
  232  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  233  * exist) it is flagged with NCF_NEGATIVE and carries no target vnode.
 234  *
 235  * Upon reaching the last segment of a path, if the reference
 236  * is for DELETE, or NOCACHE is set (rewrite), and the
 237  * name is located in the cache, it will be dropped.
 238  *
 239  * These locks are used (in the order in which they can be taken):
 240  * NAME         TYPE    ROLE
 241  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 242  * bucketlock   rwlock  for access to given set of hash buckets
 243  * neglist      mtx     negative entry LRU management
 244  *
 245  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 246  * shrinking the LRU list.
 247  *
 248  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 249  * order is lower address first. Both are recursive.
 250  *
 251  * "." lookups are lockless.
 252  *
 253  * ".." and vnode -> name lookups require vnodelock.
 254  *
 255  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 256  *
 257  * Insertions and removals of entries require involved vnodes and bucketlocks
 258  * to be write-locked to prevent other threads from seeing the entry.
 259  *
 260  * Some lookups result in removal of the found entry (e.g. getting rid of a
 261  * negative entry with the intent to create a positive one), which poses a
 262  * problem when multiple threads reach the state. Similarly, two different
 263  * threads can purge two different vnodes and try to remove the same name.
 264  *
 265  * If the already held vnode lock is lower than the second required lock, we
 266  * can just take the other lock. However, in the opposite case, this could
 267  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 268  * the first node, locking everything in order and revalidating the state.
 269  */
 270
 271 VFS_SMR_DECLARE;
 272
  273 /*
  274  * Structures associated with name caching.
       *
       * NCHHASH() maps a hash value to its chain head by masking it with
       * nchash, i.e. nchash holds the highest valid index, not the number of
       * buckets.
  275  */
  276 #define NCHHASH(hash) \
  277         (&nchashtbl[(hash) & nchash])
  278 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
  279 static u_long __read_mostly     nchash;                 /* hash table mask (bucket count - 1) */
  280 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  281     "Size of namecache hash table");
  282 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
  283 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  284     "Ratio of negative namecache entries");
  285 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
  286 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
  287 u_int ncsizefactor = 2;         /* not static: visible outside this file */
  288 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
  289     "Size factor for namecache");
  290 static u_int __read_mostly      ncpurgeminvnodes;
  291 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
  292     "Number of vnodes below which purgevfs ignores the request");
  293 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
  294
  295 struct nchstats nchstats;               /* cache effectiveness statistics */
 296
  297 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
  298
      /*
       * A list of negative entries together with the mutex protecting it.
       * Entries are linked through their nc_dst member.
       */
  299 struct neglist {
  300         struct mtx              nl_lock;
  301         TAILQ_HEAD(, namecache) nl_list;
  302 } __aligned(CACHE_LINE_SIZE);
  303
      /*
       * neglists is the array of numneglists cold lists, indexed by
       * NCP2NEGLIST().  ncneg_hot is the single hot list and numhotneg counts
       * the entries currently on it.
       */
  304 static struct neglist __read_mostly     *neglists;
  305 static struct neglist ncneg_hot;
  306 static u_long numhotneg;
 307
 308 #define ncneghash       3
 309 #define numneglists     (ncneghash + 1)
 310 static inline struct neglist *
 311 NCP2NEGLIST(struct namecache *ncp)
 312 {
 313
 314         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 315 }
 316
 317 static inline struct negstate *
 318 NCP2NEGSTATE(struct namecache *ncp)
 319 {
 320
 321         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 322         return (&ncp->nc_neg);
 323 }
 324
  325 #define numbucketlocks (ncbuckethash + 1)
      /*
       * Bucket locks: an array of numbucketlocks rwlocks.  A hash value is
       * mapped to its lock by masking it with ncbuckethash, so several hash
       * chains can share one lock.
       */
  326 static u_int __read_mostly  ncbuckethash;
  327 static struct rwlock_padalign __read_mostly  *bucketlocks;
  328 #define HASH2BUCKETLOCK(hash) \
  329         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
 330
 331 #define numvnodelocks (ncvnodehash + 1)
 332 static u_int __read_mostly  ncvnodehash;
 333 static struct mtx __read_mostly *vnodelocks;
 334 static inline struct mtx *
 335 VP2VNODELOCK(struct vnode *vp)
 336 {
 337
 338         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 339 }
 340
  341 /*
  342  * UMA zones for the VFS cache.
  343  *
  344  * The small cache is used for entries with short names, which are the
  345  * most common.  The large cache is used for entries which are too big to
  346  * fit in the small cache.
       *
       * cache_alloc() picks a small zone for names of up to CACHE_PATH_CUTOFF
       * characters and a large zone otherwise.  The _ts variants serve entries
       * allocated as struct namecache_ts.
  347  */
  348 static uma_zone_t __read_mostly cache_zone_small;
  349 static uma_zone_t __read_mostly cache_zone_small_ts;
  350 static uma_zone_t __read_mostly cache_zone_large;
  351 static uma_zone_t __read_mostly cache_zone_large_ts;
 352
 353 static struct namecache *
 354 cache_alloc(int len, int ts)
 355 {
 356         struct namecache_ts *ncp_ts;
 357         struct namecache *ncp;
 358
 359         if (__predict_false(ts)) {
 360                 if (len <= CACHE_PATH_CUTOFF)
 361                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 362                 else
 363                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 364                 ncp = &ncp_ts->nc_nc;
 365         } else {
 366                 if (len <= CACHE_PATH_CUTOFF)
 367                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 368                 else
 369                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 370         }
 371         return (ncp);
 372 }
 373
 374 static void
 375 cache_free(struct namecache *ncp)
 376 {
 377         struct namecache_ts *ncp_ts;
 378
 379         if (ncp == NULL)
 380                 return;
 381         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 382                 vdrop(ncp->nc_dvp);
 383         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 384                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 385                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 386                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 387                 else
 388                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 389         } else {
 390                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 391                         uma_zfree_smr(cache_zone_small, ncp);
 392                 else
 393                         uma_zfree_smr(cache_zone_large, ncp);
 394         }
 395 }
 396
 397 static void
 398 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 399 {
 400         struct namecache_ts *ncp_ts;
 401
 402         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 403             (tsp == NULL && ticksp == NULL),
 404             ("No NCF_TS"));
 405
 406         if (tsp == NULL && ticksp == NULL)
 407                 return;
 408
 409         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 410         if (tsp != NULL)
 411                 *tsp = ncp_ts->nc_time;
 412         if (ticksp != NULL)
 413                 *ticksp = ncp_ts->nc_ticks;
 414 }
 415
  416 #ifdef DEBUG_CACHE
      /*
       * Only built with DEBUG_CACHE: a flag, writable through the
       * debug.vfscache sysctl, which defaults to 1.
       */
  417 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
  418 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
  419     "VFS namecache enabled");
  420 #endif
  421
  422 /* Export size information to userland */
  423 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
  424     sizeof(struct namecache), "sizeof(struct namecache)");
 425
  426 /*
  427  * The new name cache statistics
       *
       * Everything below is exported read-only under the vfs.cache sysctl node.
       * STATNODE_ULONG() exports an already declared variable of that name,
       * STATNODE_COUNTER() both defines a counter(9) counter of that name and
       * exports it.
       *
       * NOTE(review): zap_and_exit_bucket_fail, zap_and_exit_bucket_fail2 and
       * cache_lock_vnodes_cel_3_failures are declared as (signed) long but are
       * exported with SYSCTL_ULONG; the two zap_and_exit_bucket_fail* nodes
       * also share the same description.
  428  */
  429 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  430     "Name cache statistics");
  431 #define STATNODE_ULONG(name, descr)                                     \
  432         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
  433 #define STATNODE_COUNTER(name, descr)                                   \
  434         static COUNTER_U64_DEFINE_EARLY(name);                          \
  435         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
  436             descr);
  437 STATNODE_ULONG(numneg, "Number of negative cache entries");
  438 STATNODE_ULONG(numcache, "Number of cache entries");
  439 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
  440 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
  441 STATNODE_COUNTER(dothits, "Number of '.' hits");
  442 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
  443 STATNODE_COUNTER(nummiss, "Number of cache misses");
  444 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
  445 STATNODE_COUNTER(numposzaps,
  446     "Number of cache hits (positive) we do not want to cache");
  447 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
  448 STATNODE_COUNTER(numnegzaps,
  449     "Number of cache hits (negative) we do not want to cache");
  450 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
  451 /* These count for vn_getcwd(), too. */
  452 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
  453 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  454 STATNODE_COUNTER(numfullpathfail2,
  455     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  456 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  457 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
  458 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
  459     "Number of successful removals after relocking");
  460 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
  461     "Number of times zap_and_exit failed to lock");
  462 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
  463     "Number of times zap_and_exit failed to lock");
  464 static long cache_lock_vnodes_cel_3_failures;
  465 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
  466     "Number of times 3-way vnode locking failed");
  467 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
  468 STATNODE_COUNTER(numneg_evicted,
  469     "Number of negative entries evicted when adding a new entry");
  470 STATNODE_COUNTER(shrinking_skipped,
  471     "Number of times shrinking was already in progress");
 472
  473 static void cache_zap_locked(struct namecache *ncp);
      /* Forward declarations of the file-local vnode-to-path helpers. */
  474 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
  475     char **freebuf, size_t *buflen);
  476 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
  477     char *buf, char **retbuf, size_t *buflen);
  478 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
  479     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
  480
      /* malloc(9) type "vfscache", private to this file. */
  481 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 482
 483 static int cache_yield;
 484 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
 485     "Number of times cache called yield");
 486
 487 static void __noinline
 488 cache_maybe_yield(void)
 489 {
 490
 491         if (should_yield()) {
 492                 cache_yield++;
 493                 kern_yield(PRI_USER);
 494         }
 495 }
 496
 497 static inline void
 498 cache_assert_vlp_locked(struct mtx *vlp)
 499 {
 500
 501         if (vlp != NULL)
 502                 mtx_assert(vlp, MA_OWNED);
 503 }
 504
 505 static inline void
 506 cache_assert_vnode_locked(struct vnode *vp)
 507 {
 508         struct mtx *vlp;
 509
 510         vlp = VP2VNODELOCK(vp);
 511         cache_assert_vlp_locked(vlp);
 512 }
 513
 514 /*
 515  * TODO: With the value stored we can do better than computing the hash based
 516  * on the address and the choice of FNV should also be revisisted.
 517  */
 518 static void
 519 cache_prehash(struct vnode *vp)
 520 {
 521
 522         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 523 }
 524
 525 static uint32_t
 526 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 527 {
 528
 529         return (fnv_32_buf(name, len, dvp->v_nchash));
 530 }
 531
 532 static inline struct nchashhead *
 533 NCP2BUCKET(struct namecache *ncp)
 534 {
 535         uint32_t hash;
 536
 537         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 538         return (NCHHASH(hash));
 539 }
 540
 541 static inline struct rwlock *
 542 NCP2BUCKETLOCK(struct namecache *ncp)
 543 {
 544         uint32_t hash;
 545
 546         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 547         return (HASH2BUCKETLOCK(hash));
 548 }
 549
 550 #ifdef INVARIANTS
 551 static void
 552 cache_assert_bucket_locked(struct namecache *ncp, int mode)
 553 {
 554         struct rwlock *blp;
 555
 556         blp = NCP2BUCKETLOCK(ncp);
 557         rw_assert(blp, mode);
 558 }
 559 #else
 560 #define cache_assert_bucket_locked(x, y) do { } while (0)
 561 #endif
 562
 563 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 564 static void
 565 _cache_sort_vnodes(void **p1, void **p2)
 566 {
 567         void *tmp;
 568
 569         MPASS(*p1 != NULL || *p2 != NULL);
 570
 571         if (*p1 > *p2) {
 572                 tmp = *p2;
 573                 *p2 = *p1;
 574                 *p1 = tmp;
 575         }
 576 }
 577
 578 static void
 579 cache_lock_all_buckets(void)
 580 {
 581         u_int i;
 582
 583         for (i = 0; i < numbucketlocks; i++)
 584                 rw_wlock(&bucketlocks[i]);
 585 }
 586
 587 static void
 588 cache_unlock_all_buckets(void)
 589 {
 590         u_int i;
 591
 592         for (i = 0; i < numbucketlocks; i++)
 593                 rw_wunlock(&bucketlocks[i]);
 594 }
 595
 596 static void
 597 cache_lock_all_vnodes(void)
 598 {
 599         u_int i;
 600
 601         for (i = 0; i < numvnodelocks; i++)
 602                 mtx_lock(&vnodelocks[i]);
 603 }
 604
 605 static void
 606 cache_unlock_all_vnodes(void)
 607 {
 608         u_int i;
 609
 610         for (i = 0; i < numvnodelocks; i++)
 611                 mtx_unlock(&vnodelocks[i]);
 612 }
 613
 614 static int
 615 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 616 {
 617
 618         cache_sort_vnodes(&vlp1, &vlp2);
 619
 620         if (vlp1 != NULL) {
 621                 if (!mtx_trylock(vlp1))
 622                         return (EAGAIN);
 623         }
 624         if (!mtx_trylock(vlp2)) {
 625                 if (vlp1 != NULL)
 626                         mtx_unlock(vlp1);
 627                 return (EAGAIN);
 628         }
 629
 630         return (0);
 631 }
 632
 633 static void
 634 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 635 {
 636
 637         MPASS(vlp1 != NULL || vlp2 != NULL);
 638         MPASS(vlp1 <= vlp2);
 639
 640         if (vlp1 != NULL)
 641                 mtx_lock(vlp1);
 642         if (vlp2 != NULL)
 643                 mtx_lock(vlp2);
 644 }
 645
 646 static void
 647 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 648 {
 649
 650         MPASS(vlp1 != NULL || vlp2 != NULL);
 651
 652         if (vlp1 != NULL)
 653                 mtx_unlock(vlp1);
 654         if (vlp2 != NULL)
 655                 mtx_unlock(vlp2);
 656 }
 657
 658 static int
 659 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 660 {
 661         struct nchstats snap;
 662
 663         if (req->oldptr == NULL)
 664                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 665
 666         snap = nchstats;
 667         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 668         snap.ncs_neghits = counter_u64_fetch(numneghits);
 669         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 670             counter_u64_fetch(numnegzaps);
 671         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 672             counter_u64_fetch(nummiss);
 673
 674         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 675 }
 676 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 677     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 678     "VFS cache effectiveness statistics");
 679
 680 #ifdef DIAGNOSTIC
 681 /*
 682  * Grab an atomic snapshot of the name cache hash chain lengths
 683  */
 684 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 685     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 686     "hash table stats");
 687
 688 static int
 689 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 690 {
 691         struct nchashhead *ncpp;
 692         struct namecache *ncp;
 693         int i, error, n_nchash, *cntbuf;
 694
 695 retry:
 696         n_nchash = nchash + 1;  /* nchash is max index, not count */
 697         if (req->oldptr == NULL)
 698                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 699         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 700         cache_lock_all_buckets();
 701         if (n_nchash != nchash + 1) {
 702                 cache_unlock_all_buckets();
 703                 free(cntbuf, M_TEMP);
 704                 goto retry;
 705         }
 706         /* Scan hash tables counting entries */
 707         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 708                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 709                         cntbuf[i]++;
 710         cache_unlock_all_buckets();
 711         for (error = 0, i = 0; i < n_nchash; i++)
 712                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 713                         break;
 714         free(cntbuf, M_TEMP);
 715         return (error);
 716 }
 717 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 718     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 719     "nchash chain lengths");
 720
 721 static int
 722 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 723 {
 724         int error;
 725         struct nchashhead *ncpp;
 726         struct namecache *ncp;
 727         int n_nchash;
 728         int count, maxlength, used, pct;
 729
 730         if (!req->oldptr)
 731                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 732
 733         cache_lock_all_buckets();
 734         n_nchash = nchash + 1;  /* nchash is max index, not count */
 735         used = 0;
 736         maxlength = 0;
 737
 738         /* Scan hash tables for applicable entries */
 739         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 740                 count = 0;
 741                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 742                         count++;
 743                 }
 744                 if (count)
 745                         used++;
 746                 if (maxlength < count)
 747                         maxlength = count;
 748         }
 749         n_nchash = nchash + 1;
 750         cache_unlock_all_buckets();
 751         pct = (used * 100) / (n_nchash / 100);
 752         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 753         if (error)
 754                 return (error);
 755         error = SYSCTL_OUT(req, &used, sizeof(used));
 756         if (error)
 757                 return (error);
 758         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 759         if (error)
 760                 return (error);
 761         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 762         if (error)
 763                 return (error);
 764         return (0);
 765 }
 766 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 767     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 768     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 769 #endif
 770
 771 /*
 772  * Negative entries management
 773  *
 774  * A variation of LRU scheme is used. New entries are hashed into one of
 775  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 776  *
 777  * The shrinker will demote hot list head and evict from the cold list in a
 778  * round-robin manner.
 779  */
 780 static void
 781 cache_negative_init(struct namecache *ncp)
 782 {
 783         struct negstate *negstate;
 784
 785         ncp->nc_flag |= NCF_NEGATIVE;
 786         negstate = NCP2NEGSTATE(ncp);
 787         negstate->neg_flag = 0;
 788 }
 789
 790 static void
 791 cache_negative_hit(struct namecache *ncp)
 792 {
 793         struct neglist *neglist;
 794         struct negstate *negstate;
 795
 796         negstate = NCP2NEGSTATE(ncp);
 797         if ((negstate->neg_flag & NEG_HOT) != 0)
 798                 return;
 799         neglist = NCP2NEGLIST(ncp);
 800         mtx_lock(&ncneg_hot.nl_lock);
 801         mtx_lock(&neglist->nl_lock);
 802         if ((negstate->neg_flag & NEG_HOT) == 0) {
 803                 numhotneg++;
 804                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 805                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 806                 negstate->neg_flag |= NEG_HOT;
 807         }
 808         mtx_unlock(&neglist->nl_lock);
 809         mtx_unlock(&ncneg_hot.nl_lock);
 810 }
 811
 812 static void
 813 cache_negative_insert(struct namecache *ncp)
 814 {
 815         struct neglist *neglist;
 816
 817         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 818         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 819         neglist = NCP2NEGLIST(ncp);
 820         mtx_lock(&neglist->nl_lock);
 821         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 822         mtx_unlock(&neglist->nl_lock);
 823         atomic_add_rel_long(&numneg, 1);
 824 }
 825
/*
 * Unlink a negative entry from whichever list it is currently on (its own
 * neglist or the global hot list) and subtract it from numneg.
 *
 * The caller must hold the entry's bucket lock for writing.  NEG_HOT is first
 * sampled without any list lock, the lock suggested by that sample is taken,
 * and the flag is then rechecked; if the sample turned out to be stale the
 * other lock is picked up as well.  Whenever both locks end up held, the
 * hot-list lock was acquired before the neglist lock.
 */
static void
cache_negative_remove(struct namecache *ncp)
{
	struct neglist *neglist;
	struct negstate *negstate;
	/* Track which locks were taken so exactly those are released below. */
	bool hot_locked = false;
	bool list_locked = false;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);
	neglist = NCP2NEGLIST(ncp);
	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		hot_locked = true;
		mtx_lock(&ncneg_hot.nl_lock);
		/*
		 * The entry stopped being hot before the hot-list lock was
		 * obtained, so it now lives on its neglist; lock that too.
		 */
		if ((negstate->neg_flag & NEG_HOT) == 0) {
			list_locked = true;
			mtx_lock(&neglist->nl_lock);
		}
	} else {
		list_locked = true;
		mtx_lock(&neglist->nl_lock);
		/*
		 * We may be racing against promotion in lockless lookup.
		 */
		if ((negstate->neg_flag & NEG_HOT) != 0) {
			/*
			 * Drop and retake the neglist lock so that the
			 * hot-list lock is acquired first.
			 */
			mtx_unlock(&neglist->nl_lock);
			hot_locked = true;
			mtx_lock(&ncneg_hot.nl_lock);
			mtx_lock(&neglist->nl_lock);
		}
	}
	/* Final decision, made with the lock of the owning list held. */
	if ((negstate->neg_flag & NEG_HOT) != 0) {
		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		numhotneg--;
	} else {
		mtx_assert(&neglist->nl_lock, MA_OWNED);
		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
	}
	if (list_locked)
		mtx_unlock(&neglist->nl_lock);
	if (hot_locked)
		mtx_unlock(&ncneg_hot.nl_lock);
	atomic_subtract_rel_long(&numneg, 1);
}
 871
 872 static void
 873 cache_negative_shrink_select(struct namecache **ncpp,
 874     struct neglist **neglistpp)
 875 {
 876         struct neglist *neglist;
 877         struct namecache *ncp;
 878         static u_int cycle;
 879         u_int i;
 880
 881         *ncpp = ncp = NULL;
 882
 883         for (i = 0; i < numneglists; i++) {
 884                 neglist = &neglists[(cycle + i) % numneglists];
 885                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 886                         continue;
 887                 mtx_lock(&neglist->nl_lock);
 888                 ncp = TAILQ_FIRST(&neglist->nl_list);
 889                 if (ncp != NULL)
 890                         break;
 891                 mtx_unlock(&neglist->nl_lock);
 892         }
 893
 894         *neglistpp = neglist;
 895         *ncpp = ncp;
 896         cycle++;
 897 }
 898
/*
 * Try to evict a single negative entry.
 *
 * Only one thread runs the selection at a time: if ncneg_shrink_lock already
 * has an owner or cannot be try-locked, shrinking_skipped is bumped and the
 * function returns without doing anything.
 *
 * The head of the hot list, if any, is first demoted to the tail of its
 * regular neglist.  A victim is then chosen with
 * cache_negative_shrink_select() and, after the required vnode and bucket
 * locks have been taken and the choice revalidated, zapped and freed.
 */
static void
cache_negative_zap_one(void)
{
	struct namecache *ncp, *ncp2;
	struct neglist *neglist;
	struct negstate *negstate;
	struct mtx *dvlp;
	struct rwlock *blp;

	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
	    !mtx_trylock(&ncneg_shrink_lock)) {
		counter_u64_add(shrinking_skipped, 1);
		return;
	}

	/* Demote the oldest hot entry back to its regular neglist. */
	mtx_lock(&ncneg_hot.nl_lock);
	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
	if (ncp != NULL) {
		neglist = NCP2NEGLIST(ncp);
		negstate = NCP2NEGSTATE(ncp);
		mtx_lock(&neglist->nl_lock);
		MPASS((negstate->neg_flag & NEG_HOT) != 0);
		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
		negstate->neg_flag &= ~NEG_HOT;
		numhotneg--;
		mtx_unlock(&neglist->nl_lock);
	}
	mtx_unlock(&ncneg_hot.nl_lock);

	/* On a non-NULL result the selected neglist's lock is still held. */
	cache_negative_shrink_select(&ncp, &neglist);

	mtx_unlock(&ncneg_shrink_lock);
	if (ncp == NULL)
		return;

	MPASS(ncp->nc_flag & NCF_NEGATIVE);
	/*
	 * Compute the locks the zap needs while the neglist lock is still
	 * held, then drop the neglist lock before acquiring them.
	 */
	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	blp = NCP2BUCKETLOCK(ncp);
	mtx_unlock(&neglist->nl_lock);
	mtx_lock(dvlp);
	rw_wlock(blp);
	/*
	 * Enter SMR to safely check the negative list.
	 * Even if the found pointer matches, the entry may now be reallocated
	 * and used by a different vnode.
	 */
	vfs_smr_enter();
	ncp2 = TAILQ_FIRST(&neglist->nl_list);
	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
	    blp != NCP2BUCKETLOCK(ncp2)) {
		/* The list head changed or maps to other locks; give up. */
		vfs_smr_exit();
		ncp = NULL;
	} else {
		vfs_smr_exit();
		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		cache_zap_locked(ncp);
		counter_u64_add(numneg_evicted, 1);
	}
	rw_wunlock(blp);
	mtx_unlock(dvlp);
	/* ncp is NULL here when revalidation failed. */
	cache_free(ncp);
}
 963
/*
 * cache_zap_locked():
 *
 *   Removes a namecache entry from cache, whether it contains an actual
 *   pointer to a vnode or if it is just a negative cache entry.
 *
 *   The caller must hold the vnode lock of nc_dvp, the vnode lock of nc_vp
 *   for positive entries, and the entry's bucket lock for writing; all three
 *   are asserted below.  The entry is only unlinked here, not freed.
 */
static void
cache_zap_locked(struct namecache *ncp)
{
	struct nchashhead *ncpp;

	if (!(ncp->nc_flag & NCF_NEGATIVE))
		cache_assert_vnode_locked(ncp->nc_vp);
	cache_assert_vnode_locked(ncp->nc_dvp);
	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);

	cache_ncp_invalidate(ncp);

	/* Unhook from the hash chain. */
	ncpp = NCP2BUCKET(ncp);
	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
		    ncp->nc_name, ncp->nc_vp);
		/* Positive entry: drop it from the target vnode's list. */
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
		if (ncp == ncp->nc_vp->v_cache_dd) {
			/* Clear the cached ".." link inside a seqc write. */
			vn_seqc_write_begin_unheld(ncp->nc_vp);
			ncp->nc_vp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_vp);
		}
	} else {
		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
		    ncp->nc_name);
		/* Negative entry: drop it from its neglist or the hot list. */
		cache_negative_remove(ncp);
	}
	if (ncp->nc_flag & NCF_ISDOTDOT) {
		/* ".." entries are not linked on the directory's src list. */
		if (ncp == ncp->nc_dvp->v_cache_dd) {
			vn_seqc_write_begin_unheld(ncp->nc_dvp);
			ncp->nc_dvp->v_cache_dd = NULL;
			vn_seqc_write_end(ncp->nc_dvp);
		}
	} else {
		LIST_REMOVE(ncp, nc_src);
		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
			/*
			 * This was the directory's last outgoing entry.
			 * NOTE(review): NCF_DVDROP presumably makes the later
			 * free drop a hold on nc_dvp -- confirm in cache_free().
			 */
			ncp->nc_flag |= NCF_DVDROP;
			counter_u64_add(numcachehv, -1);
		}
	}
	atomic_subtract_rel_long(&numcache, 1);
}
1016
1017 static void
1018 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1019 {
1020         struct rwlock *blp;
1021
1022         MPASS(ncp->nc_dvp == vp);
1023         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1024         cache_assert_vnode_locked(vp);
1025
1026         blp = NCP2BUCKETLOCK(ncp);
1027         rw_wlock(blp);
1028         cache_zap_locked(ncp);
1029         rw_wunlock(blp);
1030 }
1031
/*
 * Zap an entry while keeping the vnode lock of vp held across the call.
 *
 * vp must be the entry's nc_dvp or nc_vp and its vnode lock must be held.
 * *vlpp is either NULL or a second vnode lock the caller already holds.
 *
 * Returns true when the entry was zapped; *vlpp is then NULL and any lock it
 * referred to has been released.  Returns false when the other vnode lock
 * could not be try-locked: the entry is left untouched, both vnode locks
 * have been taken in sorted order, and *vlpp is set to the lower-ordered one
 * so that the caller can retry.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	if (ncp->nc_flag & NCF_NEGATIVE) {
		/* Negative entries need no second vnode lock. */
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		return (true);
	}

	pvlp = VP2VNODELOCK(vp);
	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);

	if (*vlpp == vlp1 || *vlpp == vlp2) {
		/* The extra lock the caller holds is one we need; reuse it. */
		to_unlock = *vlpp;
		*vlpp = NULL;
	} else {
		if (*vlpp != NULL) {
			mtx_unlock(*vlpp);
			*vlpp = NULL;
		}
		cache_sort_vnodes(&vlp1, &vlp2);
		if (vlp1 == pvlp) {
			/* Held lock sorts first; blocking on vlp2 is in order. */
			mtx_lock(vlp2);
			to_unlock = vlp2;
		} else {
			/* vlp1 sorts before the held lock; only try-lock it. */
			if (!mtx_trylock(vlp1))
				goto out_relock;
			to_unlock = vlp1;
		}
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	if (to_unlock != NULL)
		mtx_unlock(to_unlock);
	return (true);

out_relock:
	/*
	 * NOTE(review): on this path vlp1 != pvlp, so the lock held on entry
	 * is presumably vlp2; it is dropped and both are retaken in order.
	 */
	mtx_unlock(vlp2);
	mtx_lock(vlp1);
	mtx_lock(vlp2);
	MPASS(*vlpp == NULL);
	*vlpp = vlp1;
	return (false);
}
1089
/*
 * Zap an entry given that the vnode lock of vp is held, releasing that lock
 * before returning.
 *
 * vp must be the entry's nc_dvp or nc_vp.  Returns 0 when the entry was
 * zapped, or EAGAIN when the other vnode lock sorts before the held one and
 * could not be try-locked; the entry is left in place in that case.  The
 * vnode lock of vp is released on every path.
 */
static int __noinline
cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
{
	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
	struct rwlock *blp;
	int error = 0;

	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
	cache_assert_vnode_locked(vp);

	pvlp = VP2VNODELOCK(vp);
	if (ncp->nc_flag & NCF_NEGATIVE) {
		/* Negative entries only need the vnode lock already held. */
		cache_zap_negative_locked_vnode_kl(ncp, vp);
		goto out;
	}

	blp = NCP2BUCKETLOCK(ncp);
	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
	vlp2 = VP2VNODELOCK(ncp->nc_vp);
	cache_sort_vnodes(&vlp1, &vlp2);
	if (vlp1 == pvlp) {
		/* Held lock sorts first; blocking on vlp2 is in order. */
		mtx_lock(vlp2);
		to_unlock = vlp2;
	} else {
		/* vlp1 sorts before the held lock; only try-lock it. */
		if (!mtx_trylock(vlp1)) {
			error = EAGAIN;
			goto out;
		}
		to_unlock = vlp1;
	}
	rw_wlock(blp);
	cache_zap_locked(ncp);
	rw_wunlock(blp);
	mtx_unlock(to_unlock);
out:
	mtx_unlock(pvlp);
	return (error);
}
1128
1129 /*
1130  * If trylocking failed we can get here. We know enough to take all needed locks
1131  * in the right order and re-lookup the entry.
1132  */
1133 static int
1134 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1135     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1136     struct rwlock *blp)
1137 {
1138         struct namecache *rncp;
1139
1140         cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1141
1142         cache_sort_vnodes(&dvlp, &vlp);
1143         cache_lock_vnodes(dvlp, vlp);
1144         rw_wlock(blp);
1145         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1146                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1147                     rncp->nc_nlen == cnp->cn_namelen &&
1148                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1149                         break;
1150         }
1151         if (rncp != NULL) {
1152                 cache_zap_locked(rncp);
1153                 rw_wunlock(blp);
1154                 cache_unlock_vnodes(dvlp, vlp);
1155                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1156                 return (0);
1157         }
1158
1159         rw_wunlock(blp);
1160         cache_unlock_vnodes(dvlp, vlp);
1161         return (EAGAIN);
1162 }
1163
1164 static int __noinline
1165 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1166     uint32_t hash, struct rwlock *blp)
1167 {
1168         struct mtx *dvlp, *vlp;
1169         struct vnode *dvp;
1170
1171         cache_assert_bucket_locked(ncp, RA_WLOCKED);
1172
1173         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1174         vlp = NULL;
1175         if (!(ncp->nc_flag & NCF_NEGATIVE))
1176                 vlp = VP2VNODELOCK(ncp->nc_vp);
1177         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1178                 cache_zap_locked(ncp);
1179                 rw_wunlock(blp);
1180                 cache_unlock_vnodes(dvlp, vlp);
1181                 return (0);
1182         }
1183
1184         dvp = ncp->nc_dvp;
1185         rw_wunlock(blp);
1186         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1187 }
1188
1189 static int __noinline
1190 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1191     uint32_t hash, struct rwlock *blp)
1192 {
1193         struct mtx *dvlp, *vlp;
1194         struct vnode *dvp;
1195
1196         cache_assert_bucket_locked(ncp, RA_RLOCKED);
1197
1198         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1199         vlp = NULL;
1200         if (!(ncp->nc_flag & NCF_NEGATIVE))
1201                 vlp = VP2VNODELOCK(ncp->nc_vp);
1202         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1203                 rw_runlock(blp);
1204                 rw_wlock(blp);
1205                 cache_zap_locked(ncp);
1206                 rw_wunlock(blp);
1207                 cache_unlock_vnodes(dvlp, vlp);
1208                 return (0);
1209         }
1210
1211         dvp = ncp->nc_dvp;
1212         rw_runlock(blp);
1213         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1214 }
1215
/*
 * Zap an entry whose bucket is write-locked, keeping the bucket lock held.
 *
 * *vlpp1 and *vlpp2 describe vnode locks the caller may already hold (either
 * may be NULL).
 *
 * Returns 0 when the entry was zapped: the bucket lock is still held, the
 * vnode locks used for the zap have been released, and both *vlpp1 and
 * *vlpp2 are NULL.
 *
 * Returns EAGAIN when the vnode locks could not be try-locked: the entry was
 * NOT zapped, the bucket lock was dropped and retaken, and the entry's vnode
 * locks are now held and recorded in *vlpp1/*vlpp2.  Since the bucket was
 * unlocked in between, the caller presumably has to look the entry up again
 * before retrying.
 */
static int
cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
    struct mtx **vlpp1, struct mtx **vlpp2)
{
	struct mtx *dvlp, *vlp;

	cache_assert_bucket_locked(ncp, RA_WLOCKED);

	dvlp = VP2VNODELOCK(ncp->nc_dvp);
	vlp = NULL;
	if (!(ncp->nc_flag & NCF_NEGATIVE))
		vlp = VP2VNODELOCK(ncp->nc_vp);
	/*
	 * NOTE(review): after sorting, dvlp/vlp are in lock order rather than
	 * "directory"/"target" order; a NULL lock presumably ends up in dvlp,
	 * which matches the NULL check before mtx_lock(*vlpp1) below.
	 */
	cache_sort_vnodes(&dvlp, &vlp);

	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
		/* The caller already holds exactly the locks required. */
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		*vlpp1 = NULL;
		*vlpp2 = NULL;
		return (0);
	}

	/* Wrong set of locks held; release them and start over. */
	if (*vlpp1 != NULL)
		mtx_unlock(*vlpp1);
	if (*vlpp2 != NULL)
		mtx_unlock(*vlpp2);
	*vlpp1 = NULL;
	*vlpp2 = NULL;

	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
		cache_zap_locked(ncp);
		cache_unlock_vnodes(dvlp, vlp);
		return (0);
	}

	/*
	 * Try-lock failed.  Drop the bucket lock, block on the vnode locks in
	 * sorted order, then retake the bucket lock.
	 */
	rw_wunlock(blp);
	*vlpp1 = dvlp;
	*vlpp2 = vlp;
	if (*vlpp1 != NULL)
		mtx_lock(*vlpp1);
	mtx_lock(*vlpp2);
	rw_wlock(blp);
	return (EAGAIN);
}
1260
1261 static void
1262 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1263 {
1264
1265         if (blp != NULL) {
1266                 rw_runlock(blp);
1267         } else {
1268                 mtx_unlock(vlp);
1269         }
1270 }
1271
1272 static int __noinline
1273 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1274     struct timespec *tsp, int *ticksp)
1275 {
1276         int ltype;
1277
1278         *vpp = dvp;
1279         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1280                         dvp, cnp->cn_nameptr);
1281         counter_u64_add(dothits, 1);
1282         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1283         if (tsp != NULL)
1284                 timespecclear(tsp);
1285         if (ticksp != NULL)
1286                 *ticksp = ticks;
1287         vrefact(*vpp);
1288         /*
1289          * When we lookup "." we still can be asked to lock it
1290          * differently...
1291          */
1292         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1293         if (ltype != VOP_ISLOCKED(*vpp)) {
1294                 if (ltype == LK_EXCLUSIVE) {
1295                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1296                         if (VN_IS_DOOMED((*vpp))) {
1297                                 /* forced unmount */
1298                                 vrele(*vpp);
1299                                 *vpp = NULL;
1300                                 return (ENOENT);
1301                         }
1302                 } else
1303                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1304         }
1305         return (-1);
1306 }
1307
/*
 * Lookup variant used when MAKEENTRY is clear: instead of returning a hit,
 * purge whatever entry the cache holds for the name.
 *
 * For ".." the directory's v_cache_dd link is cleared, and the entry it
 * pointed at is zapped and freed when it is an NCF_ISDOTDOT entry.  For any
 * other name the matching hash-chain entry, if present, is zapped and freed.
 *
 * Always returns 0 (reported to the caller as a cache miss).  vpp, tsp and
 * ticksp are not used here.
 */
static __noinline int
cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct timespec *tsp, int *ticksp)
{
	struct namecache *ncp;
	struct rwlock *blp;
	struct mtx *dvlp, *dvlp2;
	uint32_t hash;
	int error;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		/* Second vnode lock possibly left held by a failed zap. */
		dvlp2 = NULL;
		mtx_lock(dvlp);
retry_dotdot:
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_dvp != dvp)
				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
			/*
			 * On failure the helper has relocked the vnodes and
			 * stored the extra lock in dvlp2; v_cache_dd is
			 * re-read because it may have changed meanwhile.
			 */
			if (!cache_zap_locked_vnode_kl2(ncp,
			    dvp, &dvlp2))
				goto retry_dotdot;
			MPASS(dvp->v_cache_dd == NULL);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
			cache_free(ncp);
		} else {
			/*
			 * v_cache_dd refers to a regular entry; only the
			 * link is cleared, the entry itself stays cached.
			 */
			vn_seqc_write_begin(dvp);
			dvp->v_cache_dd = NULL;
			vn_seqc_write_end(dvp);
			mtx_unlock(dvlp);
			if (dvlp2 != NULL)
				mtx_unlock(dvlp2);
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	blp = HASH2BUCKETLOCK(hash);
retry:
	/* Unlocked emptiness check avoids taking the bucket lock. */
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		goto out_no_entry;

	rw_wlock(blp);

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == NULL) {
		rw_wunlock(blp);
		goto out_no_entry;
	}

	/* The bucket lock is released by the callee on every path. */
	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail++;
		cache_maybe_yield();
		goto retry;
	}
	counter_u64_add(numposzaps, 1);
	cache_free(ncp);
	return (0);
out_no_entry:
	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
	counter_u64_add(nummisszap, 1);
	return (0);
}
1390
/**
 * Lookup a name in the name cache
 *
 * # Arguments
 *
 * - dvp:	Parent directory in which to search.
 * - vpp:	Return argument.  Will contain desired vnode on cache hit.
 * - cnp:	Parameters of the name search.  The most interesting bits of
 *   		the cn_flags field have the following meanings:
 *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
 *   			it up.
 *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
 * - tsp:	Return storage for cache timestamp.  On a successful (positive
 *   		or negative) lookup, tsp will be filled with any timespec that
 *   		was stored when this cache entry was created.  However, it will
 *   		be clear for "." entries.
 * - ticksp:	Return storage for alternate cache timestamp.  On a successful
 *   		(positive or negative) lookup, it will contain the ticks value
 *   		that was current when the cache entry was created, unless cnp
 *   		was ".".
 *
 * # Returns
 *
 * - -1:	A positive cache hit.  vpp will contain the desired vnode.
 * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
 *   		to a forced unmount.  vpp will not be modified.  If the entry
 *   		is a whiteout, then the ISWHITEOUT flag will be set in
 *   		cnp->cn_flags.
 * - 0:		A cache miss.  vpp will not be modified.
 *
 * # Locking
 *
 * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
 * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
 * lock is not recursively acquired.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
	struct namecache_ts *ncp_ts;
	struct namecache *ncp;
	struct negstate *negstate;
	struct rwlock *blp;
	struct mtx *dvlp;
	uint32_t hash;
	enum vgetstate vs;
	int error, ltype;
	bool try_smr, doing_smr, whiteout;

#ifdef DEBUG_CACHE
	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}
#endif

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));

	if ((cnp->cn_flags & MAKEENTRY) == 0)
		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

	/*
	 * The first attempt is lockless (SMR) unless this is a CREATE: a
	 * CREATE that hits a negative entry has to zap it, and that path
	 * asserts that SMR is not in use.
	 */
	try_smr = true;
	if (cnp->cn_nameiop == CREATE)
		try_smr = false;
retry:
	/*
	 * At most one of blp/dvlp is set below; together with doing_smr they
	 * record what the current attempt is protected by.
	 */
	doing_smr = false;
	blp = NULL;
	dvlp = NULL;
	error = 0;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
		/* ".." is resolved via v_cache_dd under dvp's vnode lock. */
		counter_u64_add(dotdothits, 1);
		dvlp = VP2VNODELOCK(dvp);
		mtx_lock(dvlp);
		ncp = dvp->v_cache_dd;
		if (ncp == NULL) {
			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
			    "..", NULL);
			mtx_unlock(dvlp);
			return (0);
		}
		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
			if (ncp->nc_flag & NCF_NEGATIVE)
				*vpp = NULL;
			else
				*vpp = ncp->nc_vp;
		} else
			*vpp = ncp->nc_dvp;
		/* Return failure if negative entry was found. */
		if (*vpp == NULL)
			goto negative_success;
		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
		    dvp, cnp->cn_nameptr, *vpp);
		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
		    *vpp);
		cache_out_ts(ncp, tsp, ticksp);
		/*
		 * For a timestamped entry that is not itself a ".." entry,
		 * report the separately stored dotdot time instead.
		 */
		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
		    NCF_DTS && tsp != NULL) {
			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
			*tsp = ncp_ts->nc_dotdottime;
		}
		goto success;
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
retry_hashed:
	if (try_smr) {
		vfs_smr_enter();
		doing_smr = true;
		/* Any retry from here on uses the bucket lock instead. */
		try_smr = false;
	} else {
		blp = HASH2BUCKETLOCK(hash);
		rw_rlock(blp);
	}

	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (__predict_false(ncp == NULL)) {
		if (doing_smr)
			vfs_smr_exit();
		else
			rw_runlock(blp);
		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
		    NULL);
		counter_u64_add(nummiss, 1);
		return (0);
	}

	if (ncp->nc_flag & NCF_NEGATIVE)
		goto negative_success;

	/* We found a "positive" match, return the vnode */
	counter_u64_add(numposhits, 1);
	*vpp = ncp->nc_vp;
	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
	    dvp, cnp->cn_nameptr, *vpp, ncp);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
	    *vpp);
	cache_out_ts(ncp, tsp, ticksp);
success:
	/*
	 * On success we return a locked and ref'd vnode as per the lookup
	 * protocol.
	 */
	MPASS(dvp != *vpp);
	ltype = 0;	/* silence gcc warning */
	if (cnp->cn_flags & ISDOTDOT) {
		/*
		 * Remember how dvp was locked and unlock it before locking
		 * the target; it is relocked the same way further down.
		 */
		ltype = VOP_ISLOCKED(dvp);
		VOP_UNLOCK(dvp);
	}
	if (doing_smr) {
		/*
		 * NOTE(review): cache_ncp_canuse() presumably reports whether
		 * the entry found locklessly is still valid -- confirm.
		 */
		if (!cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			*vpp = NULL;
			goto retry;
		}
		vs = vget_prep_smr(*vpp);
		vfs_smr_exit();
		if (__predict_false(vs == VGET_NONE)) {
			*vpp = NULL;
			goto retry;
		}
	} else {
		vs = vget_prep(*vpp);
		cache_lookup_unlock(blp, dvlp);
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
	if (cnp->cn_flags & ISDOTDOT) {
		vn_lock(dvp, ltype | LK_RETRY);
		if (VN_IS_DOOMED(dvp)) {
			/* dvp went away while it was unlocked. */
			if (error == 0)
				vput(*vpp);
			*vpp = NULL;
			return (ENOENT);
		}
	}
	if (error) {
		*vpp = NULL;
		goto retry;
	}
	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
	}
	return (-1);

negative_success:
	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		MPASS(!doing_smr);
		counter_u64_add(numnegzaps, 1);
		goto zap_and_exit;
	}

	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
	cache_out_ts(ncp, tsp, ticksp);
	counter_u64_add(numneghits, 1);
	/* Sample the whiteout bit while the entry is still protected. */
	whiteout = (ncp->nc_flag & NCF_WHITE);

	if (doing_smr) {
		/*
		 * We need to take locks to promote an entry.
		 *
		 * So the lockless path only completes for entries that are
		 * already hot and still usable; everything else is retried
		 * under the bucket lock.
		 */
		negstate = NCP2NEGSTATE(ncp);
		if ((negstate->neg_flag & NEG_HOT) == 0 ||
		    !cache_ncp_canuse(ncp)) {
			vfs_smr_exit();
			doing_smr = false;
			goto retry_hashed;
		}
		vfs_smr_exit();
	} else {
		cache_negative_hit(ncp);
		cache_lookup_unlock(blp, dvlp);
	}
	if (whiteout)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);

zap_and_exit:
	MPASS(!doing_smr);
	/*
	 * blp is set when the entry was found through the hash chain and
	 * NULL when it was found through v_cache_dd under dvp's vnode lock.
	 * Both helpers release the lock the lookup was holding.
	 */
	if (blp != NULL)
		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
	else
		error = cache_zap_locked_vnode(ncp, dvp);
	if (__predict_false(error != 0)) {
		zap_and_exit_bucket_fail2++;
		cache_maybe_yield();
		goto retry;
	}
	cache_free(ncp);
	return (0);
}
1631
/*
 * Set of locks held while inserting an entry into the cache.  Unused slots
 * are NULL.
 */
struct celockstate {
	struct mtx *vlp[3];	/* vnode locks */
	struct rwlock *blp[2];	/* bucket locks */
};
/* The locking helpers index these arrays with fixed slot numbers. */
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1638
1639 static inline void
1640 cache_celockstate_init(struct celockstate *cel)
1641 {
1642
1643         bzero(cel, sizeof(*cel));
1644 }
1645
1646 static void
1647 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1648     struct vnode *dvp)
1649 {
1650         struct mtx *vlp1, *vlp2;
1651
1652         MPASS(cel->vlp[0] == NULL);
1653         MPASS(cel->vlp[1] == NULL);
1654         MPASS(cel->vlp[2] == NULL);
1655
1656         MPASS(vp != NULL || dvp != NULL);
1657
1658         vlp1 = VP2VNODELOCK(vp);
1659         vlp2 = VP2VNODELOCK(dvp);
1660         cache_sort_vnodes(&vlp1, &vlp2);
1661
1662         if (vlp1 != NULL) {
1663                 mtx_lock(vlp1);
1664                 cel->vlp[0] = vlp1;
1665         }
1666         mtx_lock(vlp2);
1667         cel->vlp[1] = vlp2;
1668 }
1669
1670 static void
1671 cache_unlock_vnodes_cel(struct celockstate *cel)
1672 {
1673
1674         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1675
1676         if (cel->vlp[0] != NULL)
1677                 mtx_unlock(cel->vlp[0]);
1678         if (cel->vlp[1] != NULL)
1679                 mtx_unlock(cel->vlp[1]);
1680         if (cel->vlp[2] != NULL)
1681                 mtx_unlock(cel->vlp[2]);
1682 }
1683
/*
 * Add the vnode lock of vp as a third lock to an already locked celockstate
 * and record it in cel->vlp[2].
 *
 * Returns true when the lock was obtained without releasing the locks that
 * were already held.  Returns false when those locks had to be dropped and
 * all three retaken in address order; anything the caller derived while
 * holding the original locks may be stale in that case.  Either way, all
 * recorded locks are held on return.
 */
static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
	struct mtx *vlp;
	bool ret;

	cache_assert_vlp_locked(cel->vlp[0]);
	cache_assert_vlp_locked(cel->vlp[1]);
	MPASS(cel->vlp[2] == NULL);

	MPASS(vp != NULL);
	vlp = VP2VNODELOCK(vp);

	ret = true;
	if (vlp >= cel->vlp[1]) {
		/* Not lower than the highest lock held; blocking is safe. */
		mtx_lock(vlp);
	} else {
		/* Out of order: try first, otherwise back off and relock. */
		if (mtx_trylock(vlp))
			goto out;
		cache_lock_vnodes_cel_3_failures++;
		/* cel->vlp[2] is still NULL, so only slots 0 and 1 drop. */
		cache_unlock_vnodes_cel(cel);
		if (vlp < cel->vlp[0]) {
			mtx_lock(vlp);
			mtx_lock(cel->vlp[0]);
			mtx_lock(cel->vlp[1]);
		} else {
			if (cel->vlp[0] != NULL)
				mtx_lock(cel->vlp[0]);
			mtx_lock(vlp);
			mtx_lock(cel->vlp[1]);
		}
		ret = false;
	}
out:
	cel->vlp[2] = vlp;
	return (ret);
}
1721
1722 static void
1723 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1724     struct rwlock *blp2)
1725 {
1726
1727         MPASS(cel->blp[0] == NULL);
1728         MPASS(cel->blp[1] == NULL);
1729
1730         cache_sort_vnodes(&blp1, &blp2);
1731
1732         if (blp1 != NULL) {
1733                 rw_wlock(blp1);
1734                 cel->blp[0] = blp1;
1735         }
1736         rw_wlock(blp2);
1737         cel->blp[1] = blp2;
1738 }
1739
1740 static void
1741 cache_unlock_buckets_cel(struct celockstate *cel)
1742 {
1743
1744         if (cel->blp[0] != NULL)
1745                 rw_wunlock(cel->blp[0]);
1746         rw_wunlock(cel->blp[1]);
1747 }
1748
/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry: when vp is a
 * directory whose v_cache_dd points at a ".." entry, that entry's target
 * vnode and bucketlock have to be locked as well. If the old entry is
 * negative there is no third vnode to lock, only the extra bucketlock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 *
 * On return all locks taken are recorded in cel; release them with
 * cache_enter_unlock().
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct rwlock *blps[2];

        /* Bucket the new entry will be inserted into. */
        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                /* Only a directory can carry a ".." entry to be replaced. */
                if (vp == NULL || vp->v_type != VDIR)
                        break;
                ncp = vp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == vp);
                /* The old ".." entry lives in this (second) bucket. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                /* true: third lock taken without dropping the other two. */
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * All vnodes got re-locked. Re-validate the state and if
                 * nothing changed we are done. Otherwise restart.
                 */
                if (ncp == vp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                /* State changed: drop all vnode locks and start over. */
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        /* Bucket locks are taken last, after all vnode locks. */
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1800
/*
 * Variant of cache_enter_lock() used when the entry about to be removed is
 * the ".." entry hanging off dvp (rather than off vp).
 *
 * Locks the vnodelocks for dvp and vp, the vnodelock of the target of dvp's
 * existing ".." entry if that entry is positive, and the bucketlocks for
 * 'hash' and for that existing entry.  All locks taken are recorded in cel.
 */
static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct rwlock *blps[2];

        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                ncp = dvp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == dvp);
                /* Bucket holding the existing ".." entry. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                /* A negative entry has no target vnode to lock. */
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                /* true: third lock taken without dropping the other two. */
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * The vnode locks were dropped and re-taken.  If the state
                 * examined above is unchanged we are done, otherwise
                 * release everything and retry.
                 */
                if (ncp == dvp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
1835
/*
 * Release everything recorded in cel by cache_enter_lock() or
 * cache_enter_lock_dd(): bucket locks first, then vnode locks (the reverse
 * of the order in which they were taken).
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

        cache_unlock_buckets_cel(cel);
        cache_unlock_vnodes_cel(cel);
}
1843
1844 static void __noinline
1845 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1846     struct componentname *cnp)
1847 {
1848         struct celockstate cel;
1849         struct namecache *ncp;
1850         uint32_t hash;
1851         int len;
1852
1853         if (dvp->v_cache_dd == NULL)
1854                 return;
1855         len = cnp->cn_namelen;
1856         cache_celockstate_init(&cel);
1857         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1858         cache_enter_lock_dd(&cel, dvp, vp, hash);
1859         vn_seqc_write_begin(dvp);
1860         ncp = dvp->v_cache_dd;
1861         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1862                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1863                 cache_zap_locked(ncp);
1864         } else {
1865                 ncp = NULL;
1866         }
1867         dvp->v_cache_dd = NULL;
1868         vn_seqc_write_end(dvp);
1869         cache_enter_unlock(&cel);
1870         cache_free(ncp);
1871 }
1872
/*
 * Add an entry to the cache.
 *
 * dvp is the directory, cnp the name looked up in it, and vp the vnode the
 * name resolves to; vp == NULL creates a negative entry.  When tsp is not
 * NULL the entry carries timestamps (*tsp, and *dtsp if dtsp is not NULL).
 *
 * The entry is silently not added when the name is ".", when the cache has
 * reached ncsize entries, when an entry for (dvp, name) already exists (its
 * timestamps are refreshed instead), or when a ".." entry is requested but
 * dvp->v_cache_dd got populated in the meantime.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
        struct celockstate cel;
        struct namecache *ncp, *n2, *ndd;
        struct namecache_ts *ncp_ts, *n2_ts;
        struct nchashhead *ncpp;
        uint32_t hash;
        int flag;
        int len;
        u_long lnumcache;

        CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
        VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
            ("cache_enter: Adding a doomed vnode"));
        VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
            ("cache_enter: Doomed vnode used as src"));

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache))
                return;
#endif

        flag = 0;
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                /* "." is never cached. */
                if (cnp->cn_namelen == 1)
                        return;
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        /* Get rid of whatever dvp->v_cache_dd refers to. */
                        cache_enter_dotdot_prep(dvp, vp, cnp);
                        flag = NCF_ISDOTDOT;
                }
        }

        /*
         * Avoid blowout in namecache entries.
         *
         * The slot is reserved up front; every exit path below that does
         * not insert the entry gives it back.
         */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (__predict_false(lnumcache >= ncsize)) {
                atomic_add_long(&numcache, -1);
                counter_u64_add(numdrops, 1);
                return;
        }

        cache_celockstate_init(&cel);
        ndd = NULL;
        ncp_ts = NULL;

        /*
         * Calculate the hash key and setup as much of the new
         * namecache entry as possible before acquiring the lock.
         */
        ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
        /* NCF_WIP stays set until the entry is fully linked in. */
        ncp->nc_flag = flag | NCF_WIP;
        ncp->nc_vp = vp;
        if (vp == NULL)
                cache_negative_init(ncp);
        ncp->nc_dvp = dvp;
        if (tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                ncp_ts->nc_time = *tsp;
                ncp_ts->nc_ticks = ticks;
                ncp_ts->nc_nc.nc_flag |= NCF_TS;
                if (dtsp != NULL) {
                        ncp_ts->nc_dotdottime = *dtsp;
                        ncp_ts->nc_nc.nc_flag |= NCF_DTS;
                }
        }
        len = ncp->nc_nlen = cnp->cn_namelen;
        hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
        memcpy(ncp->nc_name, cnp->cn_nameptr, len);
        ncp->nc_name[len] = '\0';
        cache_enter_lock(&cel, dvp, vp, hash);

        /*
         * See if this vnode or negative entry is already in the cache
         * with this name.  This can happen with concurrent lookups of
         * the same path name.
         */
        ncpp = NCHHASH(hash);
        CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
                if (n2->nc_dvp == dvp &&
                    n2->nc_nlen == cnp->cn_namelen &&
                    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
                        /*
                         * Keep the existing entry, but refresh its
                         * timestamps from the values we were given.
                         */
                        if (tsp != NULL) {
                                KASSERT((n2->nc_flag & NCF_TS) != 0,
                                    ("no NCF_TS"));
                                n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
                                n2_ts->nc_time = ncp_ts->nc_time;
                                n2_ts->nc_ticks = ncp_ts->nc_ticks;
                                if (dtsp != NULL) {
                                        n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
                                        n2_ts->nc_nc.nc_flag |= NCF_DTS;
                                }
                        }
                        goto out_unlock_free;
                }
        }

        if (flag == NCF_ISDOTDOT) {
                /*
                 * See if we are trying to add .. entry, but some other lookup
                 * has populated v_cache_dd pointer already.
                 */
                if (dvp->v_cache_dd != NULL)
                        goto out_unlock_free;
                KASSERT(vp == NULL || vp->v_type == VDIR,
                    ("wrong vnode type %p", vp));
                vn_seqc_write_begin(dvp);
                dvp->v_cache_dd = ncp;
                vn_seqc_write_end(dvp);
        }

        if (vp != NULL) {
                if (vp->v_type == VDIR) {
                        if (flag != NCF_ISDOTDOT) {
                                /*
                                 * For this case, the cache entry maps both the
                                 * directory name in it and the name ".." for the
                                 * directory's parent.
                                 */
                                vn_seqc_write_begin(vp);
                                /*
                                 * A ".." entry previously hanging off vp is
                                 * zapped now and freed (via ndd) once the
                                 * locks are dropped.
                                 */
                                if ((ndd = vp->v_cache_dd) != NULL) {
                                        if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
                                                cache_zap_locked(ndd);
                                        else
                                                ndd = NULL;
                                }
                                vp->v_cache_dd = ncp;
                                vn_seqc_write_end(vp);
                        }
                } else {
                        /* Non-directories do not keep a v_cache_dd link. */
                        if (vp->v_cache_dd != NULL) {
                                vn_seqc_write_begin(vp);
                                vp->v_cache_dd = NULL;
                                vn_seqc_write_end(vp);
                        }
                }
        }

        if (flag != NCF_ISDOTDOT) {
                /* The first entry sourced from dvp puts a hold on it. */
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        vhold(dvp);
                        counter_u64_add(numcachehv, 1);
                }
                LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
        }

        /*
         * If the entry is "negative", we place it into the
         * "negative" cache queue, otherwise, we place it into the
         * destination vnode's cache entries queue.
         */
        if (vp != NULL) {
                TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
                SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
                    vp);
        } else {
                if (cnp->cn_flags & ISWHITEOUT)
                        ncp->nc_flag |= NCF_WHITE;
                cache_negative_insert(ncp);
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    ncp->nc_name);
        }

        /*
         * Insert the new namecache entry into the appropriate chain
         * within the cache entries table.
         */
        CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

        /* Order all of the stores above before the NCF_WIP clear below. */
        atomic_thread_fence_rel();
        /*
         * Mark the entry as fully constructed.
         * It is immutable past this point until its removal.
         */
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

        cache_enter_unlock(&cel);
        /* Trim one negative entry if they exceed their share of the cache. */
        if (numneg * ncnegfactor > lnumcache)
                cache_negative_zap_one();
        cache_free(ndd);
        return;
out_unlock_free:
        /* Entry not inserted: undo the numcache reservation and free it. */
        cache_enter_unlock(&cel);
        atomic_add_long(&numcache, -1);
        cache_free(ncp);
        return;
}
2065
/*
 * Return the smallest power of two strictly greater than val.
 *
 * When no such value fits in u_int (val has its top bit set), the largest
 * representable power of two is returned instead.  Without this clamp the
 * shift would wrap res to 0 and the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
        /* Highest power of two representable in u_int. */
        const u_int top = ~(~(u_int)0 >> 1);
        u_int res;

        for (res = 1; res <= val && res != top; res <<= 1)
                continue;

        return (res);
}
2076
2077 static struct nchashhead *
2078 nchinittbl(u_long elements, u_long *hashmask)
2079 {
2080         struct nchashhead *hashtbl;
2081         u_long hashsize, i;
2082
2083         hashsize = cache_roundup_2(elements) / 2;
2084
2085         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2086         for (i = 0; i < hashsize; i++)
2087                 CK_SLIST_INIT(&hashtbl[i]);
2088         *hashmask = hashsize - 1;
2089         return (hashtbl);
2090 }
2091
/*
 * Free a hash table allocated by nchinittbl().  Only the bucket array is
 * released; entries still linked into it are not touched.
 */
static void
ncfreetbl(struct nchashhead *hashtbl)
{

        free(hashtbl, M_VFSCACHE);
}
2098
/*
 * Name cache initialization, from vfs_init() when we are booting
 *
 * Creates the four entry zones (small/large, with and without timestamps),
 * sizes the hash table from desiredvnodes, and allocates and initializes the
 * bucket locks, vnode locks and negative-entry lists.
 */
static void
nchinit(void *dummy __unused)
{
        u_int i;

        cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);

        VFS_SMR_ZONE_SET(cache_zone_small);
        VFS_SMR_ZONE_SET(cache_zone_small_ts);
        VFS_SMR_ZONE_SET(cache_zone_large);
        VFS_SMR_ZONE_SET(cache_zone_large_ts);

        ncsize = desiredvnodes * ncsizefactor;
        nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
        /*
         * Bucket lock mask: scales with the square of the CPU count, is at
         * least 7 and never exceeds the hash table mask.
         */
        ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
        if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
                ncbuckethash = 7;
        if (ncbuckethash > nchash)
                ncbuckethash = nchash;
        /*
         * NOTE(review): numbucketlocks / numvnodelocks are not assigned in
         * this function; presumably they are derived from ncbuckethash /
         * ncvnodehash, so the masks must be set before they are used.
         */
        bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numbucketlocks; i++)
                rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
        ncvnodehash = ncbuckethash;
        vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numvnodelocks; i++)
                mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
        /* Threshold consulted by cache_purgevfs() for non-forced purges. */
        ncpurgeminvnodes = numbucketlocks * 2;

        neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numneglists; i++) {
                mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
                TAILQ_INIT(&neglists[i].nl_list);
        }
        mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
        TAILQ_INIT(&ncneg_hot.nl_list);

        mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2151
2152 void
2153 cache_vnode_init(struct vnode *vp)
2154 {
2155
2156         LIST_INIT(&vp->v_cache_src);
2157         TAILQ_INIT(&vp->v_cache_dst);
2158         vp->v_cache_dd = NULL;
2159         cache_prehash(vp);
2160 }
2161
2162 void
2163 cache_changesize(u_long newmaxvnodes)
2164 {
2165         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2166         u_long new_nchash, old_nchash;
2167         struct namecache *ncp;
2168         uint32_t hash;
2169         u_long newncsize;
2170         int i;
2171
2172         newncsize = newmaxvnodes * ncsizefactor;
2173         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2174         if (newmaxvnodes < numbucketlocks)
2175                 newmaxvnodes = numbucketlocks;
2176
2177         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2178         /* If same hash table size, nothing to do */
2179         if (nchash == new_nchash) {
2180                 ncfreetbl(new_nchashtbl);
2181                 return;
2182         }
2183         /*
2184          * Move everything from the old hash table to the new table.
2185          * None of the namecache entries in the table can be removed
2186          * because to do so, they have to be removed from the hash table.
2187          */
2188         cache_lock_all_vnodes();
2189         cache_lock_all_buckets();
2190         old_nchashtbl = nchashtbl;
2191         old_nchash = nchash;
2192         nchashtbl = new_nchashtbl;
2193         nchash = new_nchash;
2194         for (i = 0; i <= old_nchash; i++) {
2195                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2196                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2197                             ncp->nc_dvp);
2198                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2199                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2200                 }
2201         }
2202         ncsize = newncsize;
2203         cache_unlock_all_buckets();
2204         cache_unlock_all_vnodes();
2205         ncfreetbl(old_nchashtbl);
2206 }
2207
/*
 * Invalidate all entries from and to a particular vnode.
 *
 * Must be called with vp's vnode lock held; the lock is released before
 * returning.  Zapped entries are collected on a local list (linked through
 * nc_dst) and only freed after the locks have been dropped.
 */
static void
cache_purge_impl(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp, *vlp2;

        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        /*
         * Second vnode lock that cache_zap_locked_vnode_kl2() may hand back
         * through its last argument; it is kept across iterations and
         * released at the end.
         */
        vlp2 = NULL;
        mtx_assert(vlp, MA_OWNED);
retry:
        /* Entries for names inside vp (vp is the directory). */
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                /* A false return means the entry was not zapped: rescan. */
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        /* Entries that resolve to vp. */
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        /* Whatever is still referenced by v_cache_dd must be a ".." entry. */
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                   ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        /* Free outside of the locks. */
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
2251
2252 void
2253 cache_purge(struct vnode *vp)
2254 {
2255         struct mtx *vlp;
2256
2257         SDT_PROBE1(vfs, namecache, purge, done, vp);
2258         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2259             vp->v_cache_dd == NULL)
2260                 return;
2261         vlp = VP2VNODELOCK(vp);
2262         mtx_lock(vlp);
2263         cache_purge_impl(vp);
2264 }
2265
2266 /*
2267  * Only to be used by vgone.
2268  */
2269 void
2270 cache_purge_vgone(struct vnode *vp)
2271 {
2272         struct mtx *vlp;
2273
2274         VNPASS(VN_IS_DOOMED(vp), vp);
2275         vlp = VP2VNODELOCK(vp);
2276         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2277             vp->v_cache_dd == NULL)) {
2278                 mtx_lock(vlp);
2279                 cache_purge_impl(vp);
2280                 mtx_assert(vlp, MA_NOTOWNED);
2281                 return;
2282         }
2283
2284         /*
2285          * All the NULL pointer state we found above may be transient.
2286          * Serialize against a possible thread doing cache_purge.
2287          */
2288         mtx_wait_unlocked(vlp);
2289         if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2290             vp->v_cache_dd == NULL)) {
2291                 mtx_lock(vlp);
2292                 cache_purge_impl(vp);
2293                 mtx_assert(vlp, MA_NOTOWNED);
2294                 return;
2295         }
2296         return;
2297 }
2298
2299 /*
2300  * Invalidate all negative entries for a particular directory vnode.
2301  */
2302 void
2303 cache_purge_negative(struct vnode *vp)
2304 {
2305         TAILQ_HEAD(, namecache) ncps;
2306         struct namecache *ncp, *nnp;
2307         struct mtx *vlp;
2308
2309         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2310         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2311         if (LIST_EMPTY(&vp->v_cache_src))
2312                 return;
2313         TAILQ_INIT(&ncps);
2314         vlp = VP2VNODELOCK(vp);
2315         mtx_lock(vlp);
2316         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2317                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2318                         continue;
2319                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2320                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2321         }
2322         mtx_unlock(vlp);
2323         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2324                 cache_free(ncp);
2325         }
2326 }
2327
/*
 * Flush all entries referencing a particular filesystem.
 *
 * Entries whose directory vnode belongs to mp are removed.  Unless 'force'
 * is set, nothing is done for a mount whose mnt_nvnodelistsize does not
 * exceed ncpurgeminvnodes.  Zapped entries are freed after all locks have
 * been released.
 */
void
cache_purgevfs(struct mount *mp, bool force)
{
        TAILQ_HEAD(, namecache) ncps;
        struct mtx *vlp1, *vlp2;
        struct rwlock *blp;
        struct nchashhead *bucket;
        struct namecache *ncp, *nnp;
        u_long i, j, n_nchash;
        int error;

        /* Scan hash tables for applicable entries */
        SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
        if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
                return;
        TAILQ_INIT(&ncps);
        /* nchash is a mask; the table has nchash + 1 chains. */
        n_nchash = nchash + 1;
        /*
         * Vnode locks that cache_zap_wlocked_bucket_kl() may leave held
         * through its last two arguments; released after the scan.
         */
        vlp1 = vlp2 = NULL;
        for (i = 0; i < numbucketlocks; i++) {
                blp = (struct rwlock *)&bucketlocks[i];
                rw_wlock(blp);
                /*
                 * Visit chains i, i + numbucketlocks, ... under bucket lock
                 * i (the per-entry assertion below checks that this lock
                 * covers them).
                 */
                for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
                        bucket = &nchashtbl[j];
                        CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
                                cache_assert_bucket_locked(ncp, RA_WLOCKED);
                                if (ncp->nc_dvp->v_mount != mp)
                                        continue;
                                error = cache_zap_wlocked_bucket_kl(ncp, blp,
                                    &vlp1, &vlp2);
                                /* Not zapped: rescan this chain from its head. */
                                if (error != 0)
                                        goto retry;
                                TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
                        }
                }
                rw_wunlock(blp);
                /* Only offer to yield while no vnode lock is being held. */
                if (vlp1 == NULL && vlp2 == NULL)
                        cache_maybe_yield();
        }
        if (vlp1 != NULL)
                mtx_unlock(vlp1);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);

        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
2379
2380 /*
2381  * Perform canonical checks and cache lookup and pass on to filesystem
2382  * through the vop_cachedlookup only if needed.
2383  */
2384
2385 int
2386 vfs_cache_lookup(struct vop_lookup_args *ap)
2387 {
2388         struct vnode *dvp;
2389         int error;
2390         struct vnode **vpp = ap->a_vpp;
2391         struct componentname *cnp = ap->a_cnp;
2392         int flags = cnp->cn_flags;
2393
2394         *vpp = NULL;
2395         dvp = ap->a_dvp;
2396
2397         if (dvp->v_type != VDIR)
2398                 return (ENOTDIR);
2399
2400         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2401             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2402                 return (EROFS);
2403
2404         error = vn_dir_check_exec(dvp, cnp);
2405         if (error != 0)
2406                 return (error);
2407
2408         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2409         if (error == 0)
2410                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2411         if (error == -1)
2412                 return (0);
2413         return (error);
2414 }
2415
2416 /* Implementation of the getcwd syscall. */
2417 int
2418 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2419 {
2420         char *buf, *retbuf;
2421         size_t buflen;
2422         int error;
2423
2424         buflen = uap->buflen;
2425         if (__predict_false(buflen < 2))
2426                 return (EINVAL);
2427         if (buflen > MAXPATHLEN)
2428                 buflen = MAXPATHLEN;
2429
2430         buf = uma_zalloc(namei_zone, M_WAITOK);
2431         error = vn_getcwd(td, buf, &retbuf, &buflen);
2432         if (error == 0)
2433                 error = copyout(retbuf, uap->buf, buflen);
2434         uma_zfree(namei_zone, buf);
2435         return (error);
2436 }
2437
2438 int
2439 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2440 {
2441         struct pwd *pwd;
2442         int error;
2443
2444         pwd = pwd_hold(td);
2445         error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2446         pwd_drop(pwd);
2447
2448 #ifdef KTRACE
2449         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2450                 ktrnamei(*retbuf);
2451 #endif
2452         return (error);
2453 }
2454
2455 static int
2456 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2457     size_t size, int flags, enum uio_seg pathseg)
2458 {
2459         struct nameidata nd;
2460         char *retbuf, *freebuf;
2461         int error;
2462
2463         if (flags != 0)
2464                 return (EINVAL);
2465         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2466             pathseg, path, fd, &cap_fstat_rights, td);
2467         if ((error = namei(&nd)) != 0)
2468                 return (error);
2469         error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2470         if (error == 0) {
2471                 error = copyout(retbuf, buf, size);
2472                 free(freebuf, M_TEMP);
2473         }
2474         NDFREE(&nd, 0);
2475         return (error);
2476 }
2477
/*
 * __realpathat syscall entry point: forwards the user-supplied arguments to
 * kern___realpathat(), with the path taken from user space.
 */
int
sys___realpathat(struct thread *td, struct __realpathat_args *uap)
{

        return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
            uap->flags, UIO_USERSPACE));
}
2485
2486 /*
2487  * Retrieve the full filesystem path that correspond to a vnode from the name
2488  * cache (if available)
2489  */
2490 int
2491 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2492 {
2493         struct pwd *pwd;
2494         char *buf;
2495         size_t buflen;
2496         int error;
2497
2498         if (__predict_false(vn == NULL))
2499                 return (EINVAL);
2500
2501         buflen = MAXPATHLEN;
2502         buf = malloc(buflen, M_TEMP, M_WAITOK);
2503         pwd = pwd_hold(td);
2504         error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2505         pwd_drop(pwd);
2506
2507         if (!error)
2508                 *freebuf = buf;
2509         else
2510                 free(buf, M_TEMP);
2511         return (error);
2512 }
2513
2514 /*
2515  * This function is similar to vn_fullpath, but it attempts to lookup the
2516  * pathname relative to the global root mount point.  This is required for the
2517  * auditing sub-system, as audited pathnames must be absolute, relative to the
2518  * global root mount point.
2519  */
2520 int
2521 vn_fullpath_global(struct thread *td, struct vnode *vn,
2522     char **retbuf, char **freebuf)
2523 {
2524         char *buf;
2525         size_t buflen;
2526         int error;
2527
2528         if (__predict_false(vn == NULL))
2529                 return (EINVAL);
2530         buflen = MAXPATHLEN;
2531         buf = malloc(buflen, M_TEMP, M_WAITOK);
2532         error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2533         if (!error)
2534                 *freebuf = buf;
2535         else
2536                 free(buf, M_TEMP);
2537         return (error);
2538 }
2539
/*
 * Step one level up from *vp: prepend the name of *vp to the path being
 * built at the end of buf and replace *vp with its parent directory.
 *
 * *buflen is the offset of the start of the path built so far; the name is
 * stored right before it and *buflen is lowered accordingly.  The name
 * cache is tried first, then the filesystem through VOP_VPTOCNP().
 *
 * The reference the caller holds on *vp is consumed.  On success *vp is the
 * referenced parent; on failure no reference is left with the caller.
 */
int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
{
        struct vnode *dvp;
        struct namecache *ncp;
        struct mtx *vlp;
        int error;

        vlp = VP2VNODELOCK(*vp);
        mtx_lock(vlp);
        /* Pick the first entry resolving to *vp that is not a ".." entry. */
        TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
        }
        if (ncp != NULL) {
                /* Not enough room left in front of the path. */
                if (*buflen < ncp->nc_nlen) {
                        mtx_unlock(vlp);
                        vrele(*vp);
                        counter_u64_add(numfullpathfail4, 1);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            vp, NULL);
                        return (error);
                }
                *buflen -= ncp->nc_nlen;
                memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
                SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
                    ncp->nc_name, vp);
                /*
                 * Reference the parent while the vnode lock is still held,
                 * and release the child only after the lock is dropped.
                 */
                dvp = *vp;
                *vp = ncp->nc_dvp;
                vref(*vp);
                mtx_unlock(vlp);
                vrele(dvp);
                return (0);
        }
        SDT_PROBE1(vfs, namecache, fullpath, miss, vp);

        /* Cache miss: ask the filesystem with the vnode locked shared. */
        mtx_unlock(vlp);
        vn_lock(*vp, LK_SHARED | LK_RETRY);
        error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
        /* Unlocks *vp and drops the caller's reference. */
        vput(*vp);
        if (error) {
                counter_u64_add(numfullpathfail2, 1);
                SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
                return (error);
        }

        *vp = dvp;
        if (VN_IS_DOOMED(dvp)) {
                /* forced unmount */
                vrele(dvp);
                error = ENOENT;
                SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
                return (error);
        }
        /*
         * *vp has its use count incremented still.
         */

        return (0);
}
2601
/*
 * Resolve a directory to a pathname.
 *
 * The name of the directory can always be found in the namecache or fetched
 * from the filesystem. There is also guaranteed to be only one parent, meaning
 * we can just follow vnodes up until we find the root.
 *
 * The vnode must be referenced.  The reference is consumed: every exit path,
 * successful or not, releases it (directly or through vn_vptocnp()).
 *
 * Arguments:
 * - td: thread whose credentials (td_ucred) are used for the lookups
 * - vp: directory to start from
 * - rdir: the walk stops once this vnode (or rootvnode) is reached
 * - buf: buffer filled from the back towards the front
 * - retbuf: on success set to the first byte of the path inside buf
 * - len: on entry the number of unused bytes at the front of buf; on success
 *   the number of bytes consumed by this function plus addend
 * - slash_prefixed: true if the caller already stored a '/'-prefixed,
 *   NUL-terminated tail at buf + *len; if false the terminator is stored here
 * - addend: number of bytes the caller used up on its own, added to *len
 *
 * Returns 0 on success and an errno value otherwise (ENOENT on races with
 * unmount, ENOTDIR if a non-directory is found on the way up, ENOMEM if the
 * buffer is too small, or an error from vn_vptocnp()).
 */
static int
vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
    char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
{
#ifdef KDTRACE_HOOKS
        struct vnode *startvp = vp;
#endif
        struct vnode *vp1;
        size_t buflen;
        int error;

        VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
        VNPASS(vp->v_usecount > 0, vp);

        buflen = *len;

        if (!slash_prefixed) {
                /* Nothing was stored yet, so terminate the string first. */
                MPASS(*len >= 2);
                buflen--;
                buf[buflen] = '\0';
        }

        error = 0;

        SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
        counter_u64_add(numfullpathcalls, 1);
        while (vp != rdir && vp != rootvnode) {
                /*
                 * The vp vnode must be already fully constructed,
                 * since it is either found in namecache or obtained
                 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
                 * without obtaining the vnode lock.
                 */
                if ((vp->v_vflag & VV_ROOT) != 0) {
                        vn_lock(vp, LK_RETRY | LK_SHARED);

                        /*
                         * With the vnode locked, check for races with
                         * unmount, forced or not.  Note that we
                         * already verified that vp is not equal to
                         * the root vnode, which means that
                         * mnt_vnodecovered can be NULL only for the
                         * case of unmount.
                         */
                        if (VN_IS_DOOMED(vp) ||
                            (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
                            vp1->v_mountedhere != vp->v_mount) {
                                vput(vp);
                                error = ENOENT;
                                SDT_PROBE3(vfs, namecache, fullpath, return,
                                    error, vp, NULL);
                                break;
                        }

                        /*
                         * Cross the mount point: continue from the covered
                         * vnode, which contributes no name of its own.
                         */
                        vref(vp1);
                        vput(vp);
                        vp = vp1;
                        continue;
                }
                if (vp->v_type != VDIR) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail1, 1);
                        error = ENOTDIR;
                        SDT_PROBE3(vfs, namecache, fullpath, return,
                            error, vp, NULL);
                        break;
                }
                /* On failure vn_vptocnp() has already released vp. */
                error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
                if (error)
                        break;
                if (buflen == 0) {
                        /* No room left for the separator. */
                        vrele(vp);
                        error = ENOMEM;
                        SDT_PROBE3(vfs, namecache, fullpath, return, error,
                            startvp, NULL);
                        break;
                }
                buf[--buflen] = '/';
                slash_prefixed = true;
        }
        if (error)
                return (error);
        if (!slash_prefixed) {
                /* The loop added no component: the path is just "/". */
                if (buflen == 0) {
                        vrele(vp);
                        counter_u64_add(numfullpathfail4, 1);
                        SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
                            startvp, NULL);
                        return (ENOMEM);
                }
                buf[--buflen] = '/';
        }
        counter_u64_add(numfullpathfound, 1);
        vrele(vp);

        *retbuf = buf + buflen;
        SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        /* Convert "bytes left" into "bytes used", caller's share included. */
        *len -= buflen;
        *len += addend;
        return (0);
}
2712
2713 /*
2714  * Resolve an arbitrary vnode to a pathname.
2715  *
2716  * Note 2 caveats:
2717  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2718  *   resolve to a different path than the one used to find it
2719  * - namecache is not mandatory, meaning names are not guaranteed to be added
2720  *   (in which case resolving fails)
2721  */
2722 static int
2723 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2724     char *buf, char **retbuf, size_t *buflen)
2725 {
2726         size_t orig_buflen;
2727         bool slash_prefixed;
2728         int error;
2729
2730         if (*buflen < 2)
2731                 return (EINVAL);
2732
2733         orig_buflen = *buflen;
2734
2735         vref(vp);
2736         slash_prefixed = false;
2737         if (vp->v_type != VDIR) {
2738                 *buflen -= 1;
2739                 buf[*buflen] = '\0';
2740                 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2741                 if (error)
2742                         return (error);
2743                 if (*buflen == 0) {
2744                         vrele(vp);
2745                         return (ENOMEM);
2746                 }
2747                 *buflen -= 1;
2748                 buf[*buflen] = '/';
2749                 slash_prefixed = true;
2750         }
2751
2752         return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2753             orig_buflen - *buflen));
2754 }
2755
2756 /*
2757  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2758  *
2759  * Since the namecache does not track handlings, the caller is expected to first
2760  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2761  *
2762  * Then we have 2 cases:
2763  * - if the found vnode is a directory, the path can be constructed just by
2764  *   fullowing names up the chain
2765  * - otherwise we populate the buffer with the saved name and start resolving
2766  *   from the parent
2767  */
2768 static int
2769 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2770     char **freebuf, size_t *buflen)
2771 {
2772         char *buf, *tmpbuf;
2773         struct pwd *pwd;
2774         struct componentname *cnp;
2775         struct vnode *vp;
2776         size_t addend;
2777         int error;
2778         bool slash_prefixed;
2779
2780         if (*buflen < 2)
2781                 return (EINVAL);
2782         if (*buflen > MAXPATHLEN)
2783                 *buflen = MAXPATHLEN;
2784
2785         slash_prefixed = false;
2786
2787         buf = malloc(*buflen, M_TEMP, M_WAITOK);
2788         pwd = pwd_hold(td);
2789
2790         addend = 0;
2791         vp = ndp->ni_vp;
2792         if (vp->v_type != VDIR) {
2793                 cnp = &ndp->ni_cnd;
2794                 addend = cnp->cn_namelen + 2;
2795                 if (*buflen < addend) {
2796                         error = ENOMEM;
2797                         goto out_bad;
2798                 }
2799                 *buflen -= addend;
2800                 tmpbuf = buf + *buflen;
2801                 tmpbuf[0] = '/';
2802                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2803                 tmpbuf[addend - 1] = '\0';
2804                 slash_prefixed = true;
2805                 vp = ndp->ni_dvp;
2806         }
2807
2808         vref(vp);
2809         error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2810             slash_prefixed, addend);
2811         if (error != 0)
2812                 goto out_bad;
2813
2814         pwd_drop(pwd);
2815         *freebuf = buf;
2816
2817         return (0);
2818 out_bad:
2819         pwd_drop(pwd);
2820         free(buf, M_TEMP);
2821         return (error);
2822 }
2823
2824 struct vnode *
2825 vn_dir_dd_ino(struct vnode *vp)
2826 {
2827         struct namecache *ncp;
2828         struct vnode *ddvp;
2829         struct mtx *vlp;
2830         enum vgetstate vs;
2831
2832         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2833         vlp = VP2VNODELOCK(vp);
2834         mtx_lock(vlp);
2835         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2836                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2837                         continue;
2838                 ddvp = ncp->nc_dvp;
2839                 vs = vget_prep(ddvp);
2840                 mtx_unlock(vlp);
2841                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2842                         return (NULL);
2843                 return (ddvp);
2844         }
2845         mtx_unlock(vlp);
2846         return (NULL);
2847 }
2848
2849 int
2850 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2851 {
2852         struct namecache *ncp;
2853         struct mtx *vlp;
2854         int l;
2855
2856         vlp = VP2VNODELOCK(vp);
2857         mtx_lock(vlp);
2858         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2859                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2860                         break;
2861         if (ncp == NULL) {
2862                 mtx_unlock(vlp);
2863                 return (ENOENT);
2864         }
2865         l = min(ncp->nc_nlen, buflen - 1);
2866         memcpy(buf, ncp->nc_name, l);
2867         mtx_unlock(vlp);
2868         buf[l] = '\0';
2869         return (0);
2870 }
2871
/*
 * This function updates path string to vnode's full global path
 * and checks the size of the new path string against the pathlen argument.
 *
 * Requires a locked, referenced vnode.
 * Vnode is re-locked on success or ENODEV, otherwise unlocked.
 * NOTE(review): nothing in this function generates ENODEV itself; presumably
 * it can only originate from a callee -- confirm.
 *
 * On every failure path the caller's reference on vp is released with
 * vrele().  On success that reference is released as well, and the locked
 * vnode returned by namei() (the same vnode) takes its place.
 *
 * If vp is a directory, the call to vn_fullpath_global() always succeeds
 * because it falls back to the ".." lookup if the namecache lookup fails.
 *
 * Arguments:
 * - td: calling thread
 * - vp: exclusively locked, referenced vnode which path is expected to name
 * - path: in: pathname to re-check; out (success only): the global path
 * - pathlen: size of the path buffer
 *
 * Returns 0 on success, ENAMETOOLONG if the global path together with its
 * terminator does not fit in pathlen bytes, ENOENT if path now resolves to
 * a different vnode, or the error reported by vn_fullpath_global()/namei().
 */
int
vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
    u_int pathlen)
{
        struct nameidata nd;
        struct vnode *vp1;
        char *rpath, *fbuf;
        int error;

        ASSERT_VOP_ELOCKED(vp, __func__);

        /* Construct global filesystem path from vp. */
        VOP_UNLOCK(vp);
        error = vn_fullpath_global(td, vp, &rpath, &fbuf);

        if (error != 0) {
                /* Nothing was allocated for us, so there is no fbuf to free. */
                vrele(vp);
                return (error);
        }

        if (strlen(rpath) >= pathlen) {
                vrele(vp);
                error = ENAMETOOLONG;
                goto out;
        }

        /*
         * Re-lookup the vnode by path to detect a possible rename.
         * As a side effect, the vnode is relocked.
         * If vnode was renamed, return ENOENT.
         */
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
            UIO_SYSSPACE, path, td);
        error = namei(&nd);
        if (error != 0) {
                vrele(vp);
                goto out;
        }
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vp1 = nd.ni_vp;
        vrele(vp);
        if (vp1 == vp)
                /* Cannot overflow: the length was checked against pathlen. */
                strcpy(path, rpath);
        else {
                vput(vp1);
                error = ENOENT;
        }

out:
        free(fbuf, M_TEMP);
        return (error);
}
2934
2935 #ifdef DDB
2936 static void
2937 db_print_vpath(struct vnode *vp)
2938 {
2939
2940         while (vp != NULL) {
2941                 db_printf("%p: ", vp);
2942                 if (vp == rootvnode) {
2943                         db_printf("/");
2944                         vp = NULL;
2945                 } else {
2946                         if (vp->v_vflag & VV_ROOT) {
2947                                 db_printf("<mount point>");
2948                                 vp = vp->v_mount->mnt_vnodecovered;
2949                         } else {
2950                                 struct namecache *ncp;
2951                                 char *ncn;
2952                                 int i;
2953
2954                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2955                                 if (ncp != NULL) {
2956                                         ncn = ncp->nc_name;
2957                                         for (i = 0; i < ncp->nc_nlen; i++)
2958                                                 db_printf("%c", *ncn++);
2959                                         vp = ncp->nc_dvp;
2960                                 } else {
2961                                         vp = NULL;
2962                                 }
2963                         }
2964                 }
2965                 db_printf("\n");
2966         }
2967
2968         return;
2969 }
2970
2971 DB_SHOW_COMMAND(vpath, db_show_vpath)
2972 {
2973         struct vnode *vp;
2974
2975         if (!have_addr) {
2976                 db_printf("usage: show vpath <struct vnode *>\n");
2977                 return;
2978         }
2979
2980         vp = (struct vnode *)addr;
2981         db_print_vpath(vp);
2982 }
2983
2984 #endif
2985
/*
 * Run-time switch (vfs.cache_fast_lookup) for the fast path lookup below.
 * When it is false cache_can_fplookup() aborts the fast path right away.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Value returned by the fast path routines when they abort or only partially
 * complete the lookup; cache_fpl_handled_impl() asserts that it is never
 * used as a real lookup result.
 */
#define CACHE_FPL_FAILED        -2020
2991
2992 static void
2993 cache_fpl_cleanup_cnp(struct componentname *cnp)
2994 {
2995
2996         uma_zfree(namei_zone, cnp->cn_pnbuf);
2997 #ifdef DIAGNOSTIC
2998         cnp->cn_pnbuf = NULL;
2999         cnp->cn_nameptr = NULL;
3000 #endif
3001 }
3002
3003 static void
3004 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3005 {
3006         struct componentname *cnp;
3007
3008         cnp = &ndp->ni_cnd;
3009         while (*(cnp->cn_nameptr) == '/') {
3010                 cnp->cn_nameptr++;
3011                 ndp->ni_pathlen--;
3012         }
3013
3014         *dpp = ndp->ni_rootdir;
3015 }
3016
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore().
 */
struct nameidata_saved {
        long cn_namelen;        /* copy of ni_cnd.cn_namelen */
        char *cn_nameptr;       /* copy of ni_cnd.cn_nameptr */
        size_t ni_pathlen;      /* copy of ni_pathlen */
        int cn_flags;           /* copy of ni_cnd.cn_flags */
};
3027
/*
 * State of one fast path lookup, passed between the cache_fpl* and
 * cache_fplookup* routines.
 */
struct cache_fpl {
        struct nameidata *ndp;          /* lookup being performed */
        struct componentname *cnp;      /* its componentname; presumably
                                           &ndp->ni_cnd -- confirm */
        struct pwd *pwd;                /* pwd the lookup started with */
        struct vnode *dvp;              /* directory of current component */
        struct vnode *tvp;              /* vnode the component resolved to */
        seqc_t dvp_seqc;                /* seqc snapshot for dvp */
        seqc_t tvp_seqc;                /* seqc snapshot for tvp */
        struct nameidata_saved snd;     /* state to restore for slow path */
        int line;                       /* source line which set status */
        enum cache_fpl_status status:8; /* outcome, see cache_fpl_*_impl */
        bool in_smr;                    /* inside the vfs SMR section? */
};
3041
3042 static void
3043 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3044 {
3045
3046         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3047         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3048         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3049         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3050 }
3051
3052 static void
3053 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3054 {
3055
3056         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3057         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3058         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3059         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3060 }
3061
/*
 * Wrappers around the vfs SMR section which mirror the current state in
 * fpl->in_smr, so that it can be inspected and asserted on later.
 *
 * The assert macros check both the recorded flag and the real SMR state;
 * they expand to nothing without INVARIANTS.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({                    \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        VFS_SMR_ASSERT_ENTERED();                               \
})
#define cache_fpl_smr_assert_not_entered(fpl) ({                \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        VFS_SMR_ASSERT_NOT_ENTERED();                           \
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif

/*
 * First entry into the SMR section.  Unlike cache_fpl_smr_enter() this does
 * not look at the previous value of in_smr.
 */
#define cache_fpl_smr_enter_initial(fpl) ({                     \
        struct cache_fpl *_fpl = (fpl);                         \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/* Enter the SMR section; must not already be inside it. */
#define cache_fpl_smr_enter(fpl) ({                             \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/* Leave the SMR section; must currently be inside it. */
#define cache_fpl_smr_exit(fpl) ({                              \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        vfs_smr_exit();                                         \
        _fpl->in_smr = false;                                   \
})
3097
3098 static int
3099 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3100 {
3101
3102         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3103                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3104                     ("%s: converting to abort from %d at %d, set at %d\n",
3105                     __func__, fpl->status, line, fpl->line));
3106         }
3107         fpl->status = CACHE_FPL_STATUS_ABORTED;
3108         fpl->line = line;
3109         return (CACHE_FPL_FAILED);
3110 }
3111
3112 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3113
3114 static int
3115 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3116 {
3117
3118         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3119             ("%s: setting to partial at %d, but already set to %d at %d\n",
3120             __func__, line, fpl->status, fpl->line));
3121         cache_fpl_smr_assert_entered(fpl);
3122         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3123         fpl->line = line;
3124         return (CACHE_FPL_FAILED);
3125 }
3126
3127 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3128
3129 static int
3130 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3131 {
3132
3133         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3134             ("%s: setting to handled at %d, but already set to %d at %d\n",
3135             __func__, line, fpl->status, fpl->line));
3136         cache_fpl_smr_assert_not_entered(fpl);
3137         MPASS(error != CACHE_FPL_FAILED);
3138         fpl->status = CACHE_FPL_STATUS_HANDLED;
3139         fpl->line = line;
3140         return (error);
3141 }
3142
3143 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3144
/*
 * cn_flags which a caller may pass and still have the fast path attempted;
 * cache_can_fplookup() aborts if any other bit is set.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
        (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
         SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)

/*
 * cn_flags which are not accepted from the caller but set by the lookup
 * code itself, see cache_fplookup_partial_setup().
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
        (ISDOTDOT | MAKEENTRY | ISLASTCN)

/* A flag must not be both caller-supplied and internally managed. */
_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
3154
3155 static bool
3156 cache_fpl_islastcn(struct nameidata *ndp)
3157 {
3158
3159         return (*ndp->ni_next == 0);
3160 }
3161
3162 static bool
3163 cache_fpl_isdotdot(struct componentname *cnp)
3164 {
3165
3166         if (cnp->cn_namelen == 2 &&
3167             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3168                 return (true);
3169         return (false);
3170 }
3171
/*
 * Decide whether the fast path lookup may be attempted at all.
 *
 * Returns true if it may.  Otherwise the lookup is marked as aborted and
 * false is returned.  Every check aborts on its own so that the line number
 * recorded by cache_fpl_aborted() identifies the failed condition.
 */
static bool
cache_can_fplookup(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct thread *td;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        td = cnp->cn_thread;

        /* Disabled by the administrator (vfs.cache_fast_lookup). */
        if (!cache_fast_lookup) {
                cache_fpl_aborted(fpl);
                return (false);
        }
#ifdef MAC
        /* MAC lookup checks are enabled. */
        if (mac_vnode_check_lookup_enabled()) {
                cache_fpl_aborted(fpl);
                return (false);
        }
#endif
        /* The caller asked for something outside of the supported set. */
        if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
                cache_fpl_aborted(fpl);
                return (false);
        }
        /* Only lookups relative to AT_FDCWD are handled. */
        if (ndp->ni_dirfd != AT_FDCWD) {
                cache_fpl_aborted(fpl);
                return (false);
        }
        /* Threads in capability mode take the slow path. */
        if (IN_CAPABILITY_MODE(td)) {
                cache_fpl_aborted(fpl);
                return (false);
        }
        /* So do threads which are being audited. */
        if (AUDITING_TD(td)) {
                cache_fpl_aborted(fpl);
                return (false);
        }
        /* An explicit starting directory is not handled either. */
        if (ndp->ni_startdir != NULL) {
                cache_fpl_aborted(fpl);
                return (false);
        }
        return (true);
}
3215
3216 static bool
3217 cache_fplookup_vnode_supported(struct vnode *vp)
3218 {
3219
3220         return (vp->v_type != VLNK);
3221 }
3222
/*
 * Move a negative entry to the hot list.
 *
 * We have to take locks, but they may be contended and in the worst
 * case we may need to go off CPU. We don't want to spin within the
 * smr section and we can't block with it. Instead we are going to
 * look up the entry again.
 *
 * Called within the SMR section with oncp being the negative entry found
 * for the current component and hash the hash value selecting its chain.
 * The SMR section has been left on return, unless the initial vhold_smr()
 * failed.
 *
 * Returns the result of cache_fpl_handled(fpl, ENOENT) if the very same
 * entry was found again and is still a usable negative entry, and the
 * result of cache_fpl_aborted() in every other case.
 */
static int __noinline
cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
    uint32_t hash)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct neglist *neglist;
        struct negstate *negstate;
        struct vnode *dvp;
        u_char nc_flag;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        /*
         * Hold the directory before leaving the SMR section; the hold is
         * dropped with vdrop() on both ways out below.
         */
        if (!vhold_smr(dvp))
                return (cache_fpl_aborted(fpl));

        /* Determine the list while oncp can still be dereferenced. */
        neglist = NCP2NEGLIST(oncp);
        cache_fpl_smr_exit(fpl);

        /* The hot list lock is taken before the per-entry list lock. */
        mtx_lock(&ncneg_hot.nl_lock);
        mtx_lock(&neglist->nl_lock);
        /*
         * For hash iteration.
         */
        cache_fpl_smr_enter(fpl);

        /*
         * Avoid all surprises by only succeeding if we got the same entry and
         * bailing completely otherwise.
         *
         * In particular at this point there can be a new ncp which matches the
         * search but hashes to a different neglist.
         */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp == oncp)
                        break;
        }

        /*
         * No match to begin with.
         */
        if (__predict_false(ncp == NULL)) {
                goto out_abort;
        }

        /*
         * The newly found entry may be something different...
         */
        if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
            !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
                goto out_abort;
        }

        /*
         * ... and not even negative.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) == 0) {
                goto out_abort;
        }

        if (__predict_false(!cache_ncp_canuse(ncp))) {
                goto out_abort;
        }

        /* Promote the entry unless it is marked hot already. */
        negstate = NCP2NEGSTATE(ncp);
        if ((negstate->neg_flag & NEG_HOT) == 0) {
                numhotneg++;
                TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
                TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
                negstate->neg_flag |= NEG_HOT;
        }

        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
        counter_u64_add(numneghits, 1);
        /* Unwind in reverse order of acquisition. */
        cache_fpl_smr_exit(fpl);
        mtx_unlock(&neglist->nl_lock);
        mtx_unlock(&ncneg_hot.nl_lock);
        vdrop(dvp);
        return (cache_fpl_handled(fpl, ENOENT));
out_abort:
        cache_fpl_smr_exit(fpl);
        mtx_unlock(&neglist->nl_lock);
        mtx_unlock(&ncneg_hot.nl_lock);
        vdrop(dvp);
        return (cache_fpl_aborted(fpl));
}
3319
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Called within the SMR section, which is left on every path.  On success
 * (return value 0) the directory reached so far is referenced and stored in
 * ndp->ni_startdir, the nameidata is rewound to the state saved in fpl->snd
 * and the internal cn_flags are set up for the current component.
 *
 * The lookup is aborted if the directory cannot be referenced, if it changed
 * in the meantime or if the thread no longer uses the pwd the lookup started
 * with.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        enum vgetstate dvs;
        struct vnode *dvp;
        struct pwd *pwd;
        seqc_t dvp_seqc;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        dvs = vget_prep_smr(dvp);
        if (__predict_false(dvs == VGET_NONE)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_smr_exit(fpl);

        /* Turn the prepared state into a real reference, no vnode lock. */
        vget_finish_ref(dvp, dvs);
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        /*
         * NOTE(review): on success the pwd acquired here is not dropped in
         * this function; presumably the caller takes it over -- confirm.
         */
        pwd = pwd_hold(curthread);
        if (fpl->pwd != pwd) {
                vrele(dvp);
                pwd_drop(pwd);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_restore(fpl, &fpl->snd);

        ndp->ni_startdir = dvp;
        cnp->cn_flags |= MAKEENTRY;
        if (cache_fpl_islastcn(ndp))
                cnp->cn_flags |= ISLASTCN;
        if (cache_fpl_isdotdot(cnp))
                cnp->cn_flags |= ISDOTDOT;

        return (0);
}
3370
3371 static int
3372 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3373 {
3374         struct componentname *cnp;
3375         struct vnode *tvp;
3376         seqc_t tvp_seqc;
3377         int error, lkflags;
3378
3379         cnp = fpl->cnp;
3380         tvp = fpl->tvp;
3381         tvp_seqc = fpl->tvp_seqc;
3382
3383         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3384                 lkflags = LK_SHARED;
3385                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3386                         lkflags = LK_EXCLUSIVE;
3387                 error = vget_finish(tvp, lkflags, tvs);
3388                 if (__predict_false(error != 0)) {
3389                         return (cache_fpl_aborted(fpl));
3390                 }
3391         } else {
3392                 vget_finish_ref(tvp, tvs);
3393         }
3394
3395         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3396                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3397                         vput(tvp);
3398                 else
3399                         vrele(tvp);
3400                 return (cache_fpl_aborted(fpl));
3401         }
3402
3403         return (cache_fpl_handled(fpl, 0));
3404 }
3405
3406 /*
3407  * They want to possibly modify the state of the namecache.
3408  *
3409  * Don't try to match the API contract, just leave.
3410  * TODO: this leaves scalability on the table
3411  */
3412 static int
3413 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3414 {
3415         struct componentname *cnp;
3416
3417         cnp = fpl->cnp;
3418         MPASS(cnp->cn_nameiop != LOOKUP);
3419         return (cache_fpl_partial(fpl));
3420 }
3421
3422 static int __noinline
3423 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3424 {
3425         struct componentname *cnp;
3426         enum vgetstate dvs, tvs;
3427         struct vnode *dvp, *tvp;
3428         seqc_t dvp_seqc, tvp_seqc;
3429         int error;
3430
3431         cnp = fpl->cnp;
3432         dvp = fpl->dvp;
3433         dvp_seqc = fpl->dvp_seqc;
3434         tvp = fpl->tvp;
3435         tvp_seqc = fpl->tvp_seqc;
3436
3437         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3438
3439         /*
3440          * This is less efficient than it can be for simplicity.
3441          */
3442         dvs = vget_prep_smr(dvp);
3443         if (__predict_false(dvs == VGET_NONE)) {
3444                 return (cache_fpl_aborted(fpl));
3445         }
3446         tvs = vget_prep_smr(tvp);
3447         if (__predict_false(tvs == VGET_NONE)) {
3448                 cache_fpl_smr_exit(fpl);
3449                 vget_abort(dvp, dvs);
3450                 return (cache_fpl_aborted(fpl));
3451         }
3452
3453         cache_fpl_smr_exit(fpl);
3454
3455         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3456                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3457                 if (__predict_false(error != 0)) {
3458                         vget_abort(tvp, tvs);
3459                         return (cache_fpl_aborted(fpl));
3460                 }
3461         } else {
3462                 vget_finish_ref(dvp, dvs);
3463         }
3464
3465         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3466                 vget_abort(tvp, tvs);
3467                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3468                         vput(dvp);
3469                 else
3470                         vrele(dvp);
3471                 return (cache_fpl_aborted(fpl));
3472         }
3473
3474         error = cache_fplookup_final_child(fpl, tvs);
3475         if (__predict_false(error != 0)) {
3476                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3477                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3478                         vput(dvp);
3479                 else
3480                         vrele(dvp);
3481                 return (error);
3482         }
3483
3484         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3485         return (0);
3486 }
3487
3488 static int
3489 cache_fplookup_final(struct cache_fpl *fpl)
3490 {
3491         struct componentname *cnp;
3492         enum vgetstate tvs;
3493         struct vnode *dvp, *tvp;
3494         seqc_t dvp_seqc, tvp_seqc;
3495
3496         cnp = fpl->cnp;
3497         dvp = fpl->dvp;
3498         dvp_seqc = fpl->dvp_seqc;
3499         tvp = fpl->tvp;
3500         tvp_seqc = fpl->tvp_seqc;
3501
3502         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3503
3504         if (cnp->cn_nameiop != LOOKUP) {
3505                 return (cache_fplookup_final_modifying(fpl));
3506         }
3507
3508         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3509                 return (cache_fplookup_final_withparent(fpl));
3510
3511         tvs = vget_prep_smr(tvp);
3512         if (__predict_false(tvs == VGET_NONE)) {
3513                 return (cache_fpl_partial(fpl));
3514         }
3515
3516         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3517                 cache_fpl_smr_exit(fpl);
3518                 vget_abort(tvp, tvs);
3519                 return (cache_fpl_aborted(fpl));
3520         }
3521
3522         cache_fpl_smr_exit(fpl);
3523         return (cache_fplookup_final_child(fpl, tvs));
3524 }
3525
3526 static int __noinline
3527 cache_fplookup_dot(struct cache_fpl *fpl)
3528 {
3529         struct vnode *dvp;
3530
3531         dvp = fpl->dvp;
3532
3533         fpl->tvp = dvp;
3534         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3535         if (seqc_in_modify(fpl->tvp_seqc)) {
3536                 return (cache_fpl_aborted(fpl));
3537         }
3538
3539         counter_u64_add(dothits, 1);
3540         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3541
3542         return (0);
3543 }
3544
3545 static int __noinline
3546 cache_fplookup_dotdot(struct cache_fpl *fpl)
3547 {
3548         struct nameidata *ndp;
3549         struct componentname *cnp;
3550         struct namecache *ncp;
3551         struct vnode *dvp;
3552         struct prison *pr;
3553         u_char nc_flag;
3554
3555         ndp = fpl->ndp;
3556         cnp = fpl->cnp;
3557         dvp = fpl->dvp;
3558
3559         /*
3560          * XXX this is racy the same way regular lookup is
3561          */
3562         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3563             pr = pr->pr_parent)
3564                 if (dvp == pr->pr_root)
3565                         break;
3566
3567         if (dvp == ndp->ni_rootdir ||
3568             dvp == ndp->ni_topdir ||
3569             dvp == rootvnode ||
3570             pr != NULL) {
3571                 fpl->tvp = dvp;
3572                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3573                 if (seqc_in_modify(fpl->tvp_seqc)) {
3574                         return (cache_fpl_aborted(fpl));
3575                 }
3576                 return (0);
3577         }
3578
3579         if ((dvp->v_vflag & VV_ROOT) != 0) {
3580                 /*
3581                  * TODO
3582                  * The opposite of climb mount is needed here.
3583                  */
3584                 return (cache_fpl_aborted(fpl));
3585         }
3586
3587         ncp = atomic_load_ptr(&dvp->v_cache_dd);
3588         if (ncp == NULL) {
3589                 return (cache_fpl_aborted(fpl));
3590         }
3591
3592         nc_flag = atomic_load_char(&ncp->nc_flag);
3593         if ((nc_flag & NCF_ISDOTDOT) != 0) {
3594                 if ((nc_flag & NCF_NEGATIVE) != 0)
3595                         return (cache_fpl_aborted(fpl));
3596                 fpl->tvp = ncp->nc_vp;
3597         } else {
3598                 fpl->tvp = ncp->nc_dvp;
3599         }
3600
3601         if (__predict_false(!cache_ncp_canuse(ncp))) {
3602                 return (cache_fpl_aborted(fpl));
3603         }
3604
3605         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
3606         if (seqc_in_modify(fpl->tvp_seqc)) {
3607                 return (cache_fpl_partial(fpl));
3608         }
3609
3610         counter_u64_add(dotdothits, 1);
3611         return (0);
3612 }
3613
3614 static int
3615 cache_fplookup_next(struct cache_fpl *fpl)
3616 {
3617         struct componentname *cnp;
3618         struct namecache *ncp;
3619         struct negstate *negstate;
3620         struct vnode *dvp, *tvp;
3621         u_char nc_flag;
3622         uint32_t hash;
3623         bool neg_hot;
3624
3625         cnp = fpl->cnp;
3626         dvp = fpl->dvp;
3627
3628         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3629                 return (cache_fplookup_dot(fpl));
3630         }
3631
3632         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3633
3634         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3635                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3636                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3637                         break;
3638         }
3639
3640         /*
3641          * If there is no entry we have to punt to the slow path to perform
3642          * actual lookup. Should there be nothing with this name a negative
3643          * entry will be created.
3644          */
3645         if (__predict_false(ncp == NULL)) {
3646                 return (cache_fpl_partial(fpl));
3647         }
3648
3649         tvp = atomic_load_ptr(&ncp->nc_vp);
3650         nc_flag = atomic_load_char(&ncp->nc_flag);
3651         if ((nc_flag & NCF_NEGATIVE) != 0) {
3652                 /*
3653                  * If they want to create an entry we need to replace this one.
3654                  */
3655                 if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
3656                         return (cache_fpl_partial(fpl));
3657                 }
3658                 negstate = NCP2NEGSTATE(ncp);
3659                 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3660                 if (__predict_false(!cache_ncp_canuse(ncp))) {
3661                         return (cache_fpl_partial(fpl));
3662                 }
3663                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3664                         return (cache_fpl_partial(fpl));
3665                 }
3666                 if (!neg_hot) {
3667                         return (cache_fplookup_negative_promote(fpl, ncp, hash));
3668                 }
3669                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3670                     ncp->nc_name);
3671                 counter_u64_add(numneghits, 1);
3672                 cache_fpl_smr_exit(fpl);
3673                 return (cache_fpl_handled(fpl, ENOENT));
3674         }
3675
3676         if (__predict_false(!cache_ncp_canuse(ncp))) {
3677                 return (cache_fpl_partial(fpl));
3678         }
3679
3680         fpl->tvp = tvp;
3681         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3682         if (seqc_in_modify(fpl->tvp_seqc)) {
3683                 return (cache_fpl_partial(fpl));
3684         }
3685
3686         if (!cache_fplookup_vnode_supported(tvp)) {
3687                 return (cache_fpl_partial(fpl));
3688         }
3689
3690         counter_u64_add(numposhits, 1);
3691         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3692         return (0);
3693 }
3694
3695 static bool
3696 cache_fplookup_mp_supported(struct mount *mp)
3697 {
3698
3699         if (mp == NULL)
3700                 return (false);
3701         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3702                 return (false);
3703         return (true);
3704 }
3705
3706 /*
3707  * Walk up the mount stack (if any).
3708  *
3709  * Correctness is provided in the following ways:
3710  * - all vnodes are protected from freeing with SMR
3711  * - struct mount objects are type stable making them always safe to access
3712  * - stability of the particular mount is provided by busying it
3713  * - relationship between the vnode which is mounted on and the mount is
3714  *   verified with the vnode sequence counter after busying
3715  * - association between root vnode of the mount and the mount is protected
3716  *   by busy
3717  *
3718  * From that point on we can read the sequence counter of the root vnode
3719  * and get the next mount on the stack (if any) using the same protection.
3720  *
3721  * By the end of successful walk we are guaranteed the reached state was
3722  * indeed present at least at some point which matches the regular lookup.
3723  */
3724 static int __noinline
3725 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3726 {
3727         struct mount *mp, *prev_mp;
3728         struct vnode *vp;
3729         seqc_t vp_seqc;
3730
3731         vp = fpl->tvp;
3732         vp_seqc = fpl->tvp_seqc;
3733
3734         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
3735         mp = atomic_load_ptr(&vp->v_mountedhere);
3736         if (mp == NULL)
3737                 return (0);
3738
3739         prev_mp = NULL;
3740         for (;;) {
3741                 if (!vfs_op_thread_enter_crit(mp)) {
3742                         if (prev_mp != NULL)
3743                                 vfs_op_thread_exit_crit(prev_mp);
3744                         return (cache_fpl_partial(fpl));
3745                 }
3746                 if (prev_mp != NULL)
3747                         vfs_op_thread_exit_crit(prev_mp);
3748                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3749                         vfs_op_thread_exit_crit(mp);
3750                         return (cache_fpl_partial(fpl));
3751                 }
3752                 if (!cache_fplookup_mp_supported(mp)) {
3753                         vfs_op_thread_exit_crit(mp);
3754                         return (cache_fpl_partial(fpl));
3755                 }
3756                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3757                 if (vp == NULL || VN_IS_DOOMED(vp)) {
3758                         vfs_op_thread_exit_crit(mp);
3759                         return (cache_fpl_partial(fpl));
3760                 }
3761                 vp_seqc = vn_seqc_read_any(vp);
3762                 if (seqc_in_modify(vp_seqc)) {
3763                         vfs_op_thread_exit_crit(mp);
3764                         return (cache_fpl_partial(fpl));
3765                 }
3766                 prev_mp = mp;
3767                 mp = atomic_load_ptr(&vp->v_mountedhere);
3768                 if (mp == NULL)
3769                         break;
3770         }
3771
3772         vfs_op_thread_exit_crit(prev_mp);
3773         fpl->tvp = vp;
3774         fpl->tvp_seqc = vp_seqc;
3775         return (0);
3776 }
3777
3778 static bool
3779 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
3780 {
3781         struct mount *mp;
3782         struct vnode *vp;
3783
3784         vp = fpl->tvp;
3785
3786         /*
3787          * Hack: while this is a union, the pointer tends to be NULL so save on
3788          * a branch.
3789          */
3790         mp = atomic_load_ptr(&vp->v_mountedhere);
3791         if (mp == NULL)
3792                 return (false);
3793         if (vp->v_type == VDIR)
3794                 return (true);
3795         return (false);
3796 }
3797
3798 /*
3799  * Parse the path.
3800  *
3801  * The code is mostly copy-pasted from regular lookup, see lookup().
3802  * The structure is maintained along with comments for easier maintenance.
3803  * Deduplicating the code will become feasible after fast path lookup
3804  * becomes more feature-complete.
3805  */
3806 static int
3807 cache_fplookup_parse(struct cache_fpl *fpl)
3808 {
3809         struct nameidata *ndp;
3810         struct componentname *cnp;
3811         char *cp;
3812
3813         ndp = fpl->ndp;
3814         cnp = fpl->cnp;
3815
3816         /*
3817          * Search a new directory.
3818          *
3819          * The last component of the filename is left accessible via
3820          * cnp->cn_nameptr for callers that need the name. Callers needing
3821          * the name set the SAVENAME flag. When done, they assume
3822          * responsibility for freeing the pathname buffer.
3823          */
3824         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3825                 continue;
3826         cnp->cn_namelen = cp - cnp->cn_nameptr;
3827         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
3828                 cache_fpl_smr_exit(fpl);
3829                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
3830         }
3831         ndp->ni_pathlen -= cnp->cn_namelen;
3832         KASSERT(ndp->ni_pathlen <= PATH_MAX,
3833             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3834         ndp->ni_next = cp;
3835
3836         /*
3837          * Replace multiple slashes by a single slash and trailing slashes
3838          * by a null.  This must be done before VOP_LOOKUP() because some
3839          * fs's don't know about trailing slashes.  Remember if there were
3840          * trailing slashes to handle symlinks, existing non-directories
3841          * and non-existing files that won't be directories specially later.
3842          */
3843         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3844                 cp++;
3845                 ndp->ni_pathlen--;
3846                 if (*cp == '\0') {
3847                         /*
3848                          * TODO
3849                          * Regular lookup performs the following:
3850                          * *ndp->ni_next = '\0';
3851                          * cnp->cn_flags |= TRAILINGSLASH;
3852                          *
3853                          * Which is problematic since it modifies data read
3854                          * from userspace. Then if fast path lookup was to
3855                          * abort we would have to either restore it or convey
3856                          * the flag. Since this is a corner case just ignore
3857                          * it for simplicity.
3858                          */
3859                         return (cache_fpl_partial(fpl));
3860                 }
3861         }
3862         ndp->ni_next = cp;
3863
3864         /*
3865          * Check for degenerate name (e.g. / or "")
3866          * which is a way of talking about a directory,
3867          * e.g. like "/." or ".".
3868          *
3869          * TODO
3870          * Another corner case handled by the regular lookup
3871          */
3872         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3873                 return (cache_fpl_partial(fpl));
3874         }
3875         return (0);
3876 }
3877
3878 static void
3879 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3880 {
3881         struct nameidata *ndp;
3882         struct componentname *cnp;
3883
3884         ndp = fpl->ndp;
3885         cnp = fpl->cnp;
3886
3887         cnp->cn_nameptr = ndp->ni_next;
3888         while (*cnp->cn_nameptr == '/') {
3889                 cnp->cn_nameptr++;
3890                 ndp->ni_pathlen--;
3891         }
3892 }
3893
3894 static int __noinline
3895 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
3896 {
3897
3898         switch (error) {
3899         case EAGAIN:
3900                 /*
3901                  * Can happen when racing against vgone.
3902                  * */
3903         case EOPNOTSUPP:
3904                 cache_fpl_partial(fpl);
3905                 break;
3906         default:
3907                 /*
3908                  * See the API contract for VOP_FPLOOKUP_VEXEC.
3909                  */
3910                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3911                         error = cache_fpl_aborted(fpl);
3912                 } else {
3913                         cache_fpl_smr_exit(fpl);
3914                         cache_fpl_handled(fpl, error);
3915                 }
3916                 break;
3917         }
3918         return (error);
3919 }
3920
3921 static int
3922 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3923 {
3924         struct nameidata *ndp;
3925         struct componentname *cnp;
3926         struct mount *mp;
3927         int error;
3928
3929         error = CACHE_FPL_FAILED;
3930         ndp = fpl->ndp;
3931         cnp = fpl->cnp;
3932
3933         cache_fpl_checkpoint(fpl, &fpl->snd);
3934
3935         fpl->dvp = dvp;
3936         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3937         if (seqc_in_modify(fpl->dvp_seqc)) {
3938                 cache_fpl_aborted(fpl);
3939                 goto out;
3940         }
3941         mp = atomic_load_ptr(&fpl->dvp->v_mount);
3942         if (!cache_fplookup_mp_supported(mp)) {
3943                 cache_fpl_aborted(fpl);
3944                 goto out;
3945         }
3946
3947         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3948
3949         for (;;) {
3950                 error = cache_fplookup_parse(fpl);
3951                 if (__predict_false(error != 0)) {
3952                         break;
3953                 }
3954
3955                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3956
3957                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
3958                 if (__predict_false(error != 0)) {
3959                         error = cache_fplookup_failed_vexec(fpl, error);
3960                         break;
3961                 }
3962
3963                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
3964                         error = cache_fplookup_dotdot(fpl);
3965                         if (__predict_false(error != 0)) {
3966                                 break;
3967                         }
3968                 } else {
3969                         error = cache_fplookup_next(fpl);
3970                         if (__predict_false(error != 0)) {
3971                                 break;
3972                         }
3973
3974                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3975
3976                         if (cache_fplookup_need_climb_mount(fpl)) {
3977                                 error = cache_fplookup_climb_mount(fpl);
3978                                 if (__predict_false(error != 0)) {
3979                                         break;
3980                                 }
3981                         }
3982                 }
3983
3984                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3985
3986                 if (cache_fpl_islastcn(ndp)) {
3987                         error = cache_fplookup_final(fpl);
3988                         break;
3989                 }
3990
3991                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3992                         error = cache_fpl_aborted(fpl);
3993                         break;
3994                 }
3995
3996                 fpl->dvp = fpl->tvp;
3997                 fpl->dvp_seqc = fpl->tvp_seqc;
3998
3999                 cache_fplookup_parse_advance(fpl);
4000                 cache_fpl_checkpoint(fpl, &fpl->snd);
4001         }
4002 out:
4003         switch (fpl->status) {
4004         case CACHE_FPL_STATUS_UNSET:
4005                 __assert_unreachable();
4006                 break;
4007         case CACHE_FPL_STATUS_PARTIAL:
4008                 cache_fpl_smr_assert_entered(fpl);
4009                 return (cache_fplookup_partial_setup(fpl));
4010         case CACHE_FPL_STATUS_ABORTED:
4011                 if (fpl->in_smr)
4012                         cache_fpl_smr_exit(fpl);
4013                 return (CACHE_FPL_FAILED);
4014         case CACHE_FPL_STATUS_HANDLED:
4015                 MPASS(error != CACHE_FPL_FAILED);
4016                 cache_fpl_smr_assert_not_entered(fpl);
4017                 if (__predict_false(error != 0)) {
4018                         ndp->ni_dvp = NULL;
4019                         ndp->ni_vp = NULL;
4020                         cache_fpl_cleanup_cnp(cnp);
4021                         return (error);
4022                 }
4023                 ndp->ni_dvp = fpl->dvp;
4024                 ndp->ni_vp = fpl->tvp;
4025                 if (cnp->cn_flags & SAVENAME)
4026                         cnp->cn_flags |= HASBUF;
4027                 else
4028                         cache_fpl_cleanup_cnp(cnp);
4029                 return (error);
4030         }
4031 }
4032
4033 /*
4034  * Fast path lookup protected with SMR and sequence counters.
4035  *
4036  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4037  *
4038  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4039  * outlined below.
4040  *
4041  * Traditional vnode lookup conceptually looks like this:
4042  *
4043  * vn_lock(current);
4044  * for (;;) {
4045  *      next = find();
4046  *      vn_lock(next);
4047  *      vn_unlock(current);
4048  *      current = next;
4049  *      if (last)
4050  *          break;
4051  * }
4052  * return (current);
4053  *
4054  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4055  * any modifications thanks to holding respective locks.
4056  *
4057  * The same guarantee can be provided with a combination of safe memory
4058  * reclamation and sequence counters instead. If all operations which affect
4059  * the relationship between the current vnode and the one we are looking for
4060  * also modify the counter, we can verify whether all the conditions held as
4061  * we made the jump. This includes things like permissions, mount points etc.
4062  * Counter modification is provided by enclosing relevant places in
4063  * vn_seqc_write_begin()/end() calls.
4064  *
4065  * Thus this translates to:
4066  *
4067  * vfs_smr_enter();
4068  * dvp_seqc = seqc_read_any(dvp);
4069  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4070  *     abort();
4071  * for (;;) {
4072  *      tvp = find();
4073  *      tvp_seqc = seqc_read_any(tvp);
4074  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4075  *          abort();
4076  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4077  *          abort();
4078  *      dvp = tvp; // we know nothing of importance has changed
4079  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4080  *      if (last)
4081  *          break;
4082  * }
4083  * vget(); // secure the vnode
4084  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4085  *          abort();
4086  * // at this point we know nothing has changed for any parent<->child pair
4087  * // as they were crossed during the lookup, meaning we matched the guarantee
4088  * // of the locked variant
4089  * return (tvp);
4090  *
4091  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4092  * - they are called while within vfs_smr protection which they must never exit
4093  * - EAGAIN can be returned to denote checking could not be performed, it is
4094  *   always valid to return it
4095  * - if the sequence counter has not changed the result must be valid
4096  * - if the sequence counter has changed both false positives and false negatives
4097  *   are permitted (since the result will be rejected later)
4098  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4099  *
4100  * Caveats to watch out for:
4101  * - vnodes are passed unlocked and unreferenced with nothing stopping
4102  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4103  *   to use atomic_load_ptr to fetch it.
4104  * - the aforementioned object can also get freed, meaning absent other means it
4105  *   should be protected with vfs_smr
4106  * - either safely checking permissions as they are modified or guaranteeing
4107  *   their stability is left to the routine
4108  */
4109 int
4110 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4111     struct pwd **pwdp)
4112 {
4113         struct cache_fpl fpl;
4114         struct pwd *pwd;
4115         struct vnode *dvp;
4116         struct componentname *cnp;
4117         struct nameidata_saved orig;
4118         int error;
4119
4120         MPASS(ndp->ni_lcf == 0);
4121
4122         fpl.status = CACHE_FPL_STATUS_UNSET;
4123         fpl.ndp = ndp;
4124         fpl.cnp = &ndp->ni_cnd;
4125         MPASS(curthread == fpl.cnp->cn_thread);
4126
4127         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4128                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4129
4130         if (!cache_can_fplookup(&fpl)) {
4131                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4132                 *status = fpl.status;
4133                 return (EOPNOTSUPP);
4134         }
4135
4136         cache_fpl_checkpoint(&fpl, &orig);
4137
4138         cache_fpl_smr_enter_initial(&fpl);
4139         pwd = pwd_get_smr();
4140         fpl.pwd = pwd;
4141         ndp->ni_rootdir = pwd->pwd_rdir;
4142         ndp->ni_topdir = pwd->pwd_jdir;
4143
4144         cnp = fpl.cnp;
4145         cnp->cn_nameptr = cnp->cn_pnbuf;
4146         if (cnp->cn_pnbuf[0] == '/') {
4147                 cache_fpl_handle_root(ndp, &dvp);
4148         } else {
4149                 MPASS(ndp->ni_dirfd == AT_FDCWD);
4150                 dvp = pwd->pwd_cdir;
4151         }
4152
4153         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4154
4155         error = cache_fplookup_impl(dvp, &fpl);
4156         cache_fpl_smr_assert_not_entered(&fpl);
4157         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4158
4159         *status = fpl.status;
4160         switch (fpl.status) {
4161         case CACHE_FPL_STATUS_UNSET:
4162                 __assert_unreachable();
4163                 break;
4164         case CACHE_FPL_STATUS_HANDLED:
4165                 SDT_PROBE3(vfs, namei, lookup, return, error,
4166                     (error == 0 ? ndp->ni_vp : NULL), true);
4167                 break;
4168         case CACHE_FPL_STATUS_PARTIAL:
4169                 *pwdp = fpl.pwd;
4170                 /*
4171                  * Status restored by cache_fplookup_partial_setup.
4172                  */
4173                 break;
4174         case CACHE_FPL_STATUS_ABORTED:
4175                 cache_fpl_restore(&fpl, &orig);
4176                 break;
4177         }
4178         return (error);
4179 }