sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/mount.h>
  55 #include <sys/namei.h>
  56 #include <sys/proc.h>
  57 #include <sys/rwlock.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 SDT_PROVIDER_DECLARE(vfs);
  83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  84     "struct vnode *");
  85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  86     "char *");
  87 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  89     "char *", "struct vnode *");
  90 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  91 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  92     "struct vnode *", "char *");
  93 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
  94     "struct vnode *");
  95 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
  96     "struct vnode *", "char *");
  97 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
  98     "char *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 100 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 101 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 105     "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
 107     "char *");
 108
 109 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 110 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 111 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 112
 113 /*
 114  * This structure describes the elements in the cache of recent
 115  * names looked up by namei.
 116  */
 117 struct negstate {
 118         u_char neg_flag;
 119 };
 120 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 121     "the state must fit in a union with a pointer without growing it");
 122
 123 struct  namecache {
 124         CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
 125         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 126         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 127         struct  vnode *nc_dvp;          /* vnode of parent of name */
 128         union {
 129                 struct  vnode *nu_vp;   /* vnode the name refers to */
 130                 struct  negstate nu_neg;/* negative entry state */
 131         } n_un;
 132         u_char  nc_flag;                /* flag bits */
 133         u_char  nc_nlen;                /* length of name */
 134         char    nc_name[0];             /* segment name + nul */
 135 };
 136
 137 /*
 138  * struct namecache_ts repeats struct namecache layout up to the
 139  * nc_nlen member.
 140  * struct namecache_ts is used in place of struct namecache when time(s) need
 141  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 142  * both a non-dotdot directory name plus dotdot for the directory's
 143  * parent.
 144  */
 145 struct  namecache_ts {
 146         struct  timespec nc_time;       /* timespec provided by fs */
 147         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 148         int     nc_ticks;               /* ticks value when entry was added */
 149         struct namecache nc_nc;
 150 };
 151
 152 #define nc_vp           n_un.nu_vp
 153 #define nc_neg          n_un.nu_neg
 154
 155 /*
 156  * Flags in namecache.nc_flag
 157  */
 158 #define NCF_WHITE       0x01
 159 #define NCF_ISDOTDOT    0x02
 160 #define NCF_TS          0x04
 161 #define NCF_DTS         0x08
 162 #define NCF_DVDROP      0x10
 163 #define NCF_NEGATIVE    0x20
 164 #define NCF_INVALID     0x40
 165
 166 /*
 167  * Flags in negstate.neg_flag
 168  */
 169 #define NEG_HOT         0x01
 170
 171 /*
 172  * Mark an entry as invalid.
 173  *
 174  * This is called before it starts getting deconstructed.
 175  */
 176 static void
 177 cache_ncp_invalidate(struct namecache *ncp)
 178 {
 179
 180         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 181             ("%s: entry %p already invalid", __func__, ncp));
 182         ncp->nc_flag |= NCF_INVALID;
 183         atomic_thread_fence_rel();
 184 }
 185
 186 /*
 187  * Verify validity of an entry.
 188  *
 189  * All places which elide locks are supposed to call this after they are
 190  * done with reading from an entry.
 191  */
 192 static bool
 193 cache_ncp_invalid(struct namecache *ncp)
 194 {
 195
 196         atomic_thread_fence_acq();
 197         return ((ncp->nc_flag & NCF_INVALID) != 0);
 198 }
 199
 200 /*
 201  * Name caching works as follows:
 202  *
 203  * Names found by directory scans are retained in a cache
 204  * for future reference.  It is managed LRU, so frequently
 205  * used names will hang around.  Cache is indexed by hash value
 206  * obtained from (dvp, name) where dvp refers to the directory
 207  * containing name.
 208  *
 209  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 210  * exist) the vnode pointer will be NULL.
 211  *
 212  * Upon reaching the last segment of a path, if the reference
 213  * is for DELETE, or NOCACHE is set (rewrite), and the
 214  * name is located in the cache, it will be dropped.
 215  *
 216  * These locks are used (in the order in which they can be taken):
 217  * NAME         TYPE    ROLE
 218  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 219  * bucketlock   rwlock  for access to given set of hash buckets
 220  * neglist      mtx     negative entry LRU management
 221  *
 222  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 223  * shrinking the LRU list.
 224  *
 225  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 226  * order is lower address first. Both are recursive.
 227  *
 228  * "." lookups are lockless.
 229  *
 230  * ".." and vnode -> name lookups require vnodelock.
 231  *
 232  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 233  *
 234  * Insertions and removals of entries require involved vnodes and bucketlocks
 235  * to be write-locked to prevent other threads from seeing the entry.
 236  *
 237  * Some lookups result in removal of the found entry (e.g. getting rid of a
 238  * negative entry with the intent to create a positive one), which poses a
 239  * problem when multiple threads reach the state. Similarly, two different
 240  * threads can purge two different vnodes and try to remove the same name.
 241  *
 242  * If the already held vnode lock is lower than the second required lock, we
 243  * can just take the other lock. However, in the opposite case, this could
 244  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 245  * the first node, locking everything in order and revalidating the state.
 246  */
 247
 248 VFS_SMR_DECLARE;
 249
 250 /*
 251  * Structures associated with name caching.
 252  */
 253 #define NCHHASH(hash) \
 254         (&nchashtbl[(hash) & nchash])
 255 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 256 static u_long __read_mostly     nchash;                 /* size of hash table */
 257 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 258     "Size of namecache hash table");
 259 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 260 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 261     "Ratio of negative namecache entries");
 262 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 263 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 264 u_int ncsizefactor = 2;
 265 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 266     "Size factor for namecache");
 267 static u_int __read_mostly      ncpurgeminvnodes;
 268 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
 269     "Number of vnodes below which purgevfs ignores the request");
 270 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 271
 272 struct nchstats nchstats;               /* cache effectiveness statistics */
 273
 274 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
 275
 276 struct neglist {
 277         struct mtx              nl_lock;
 278         TAILQ_HEAD(, namecache) nl_list;
 279 } __aligned(CACHE_LINE_SIZE);
 280
 281 static struct neglist __read_mostly     *neglists;
 282 static struct neglist ncneg_hot;
 283 static u_long numhotneg;
 284
 285 #define numneglists (ncneghash + 1)
 286 static u_int __read_mostly      ncneghash;
 287 static inline struct neglist *
 288 NCP2NEGLIST(struct namecache *ncp)
 289 {
 290
 291         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 292 }
 293
 294 static inline struct negstate *
 295 NCP2NEGSTATE(struct namecache *ncp)
 296 {
 297
 298         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 299         return (&ncp->nc_neg);
 300 }
 301
 302 #define numbucketlocks (ncbuckethash + 1)
 303 static u_int __read_mostly  ncbuckethash;
 304 static struct rwlock_padalign __read_mostly  *bucketlocks;
 305 #define HASH2BUCKETLOCK(hash) \
 306         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
 307
 308 #define numvnodelocks (ncvnodehash + 1)
 309 static u_int __read_mostly  ncvnodehash;
 310 static struct mtx __read_mostly *vnodelocks;
 311 static inline struct mtx *
 312 VP2VNODELOCK(struct vnode *vp)
 313 {
 314
 315         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 316 }
 317
 318 /*
 319  * UMA zones for the VFS cache.
 320  *
 321  * The small cache is used for entries with short names, which are the
 322  * most common.  The large cache is used for entries which are too big to
 323  * fit in the small cache.
 324  */
 325 static uma_zone_t __read_mostly cache_zone_small;
 326 static uma_zone_t __read_mostly cache_zone_small_ts;
 327 static uma_zone_t __read_mostly cache_zone_large;
 328 static uma_zone_t __read_mostly cache_zone_large_ts;
 329
 330 #define CACHE_PATH_CUTOFF       35
 331
 332 static struct namecache *
 333 cache_alloc(int len, int ts)
 334 {
 335         struct namecache_ts *ncp_ts;
 336         struct namecache *ncp;
 337
 338         if (__predict_false(ts)) {
 339                 if (len <= CACHE_PATH_CUTOFF)
 340                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 341                 else
 342                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 343                 ncp = &ncp_ts->nc_nc;
 344         } else {
 345                 if (len <= CACHE_PATH_CUTOFF)
 346                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 347                 else
 348                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 349         }
 350         return (ncp);
 351 }
 352
 353 static void
 354 cache_free(struct namecache *ncp)
 355 {
 356         struct namecache_ts *ncp_ts;
 357
 358         if (ncp == NULL)
 359                 return;
 360         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 361                 vdrop(ncp->nc_dvp);
 362         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 363                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 364                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 365                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 366                 else
 367                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 368         } else {
 369                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 370                         uma_zfree_smr(cache_zone_small, ncp);
 371                 else
 372                         uma_zfree_smr(cache_zone_large, ncp);
 373         }
 374 }
 375
 376 static void
 377 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 378 {
 379         struct namecache_ts *ncp_ts;
 380
 381         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 382             (tsp == NULL && ticksp == NULL),
 383             ("No NCF_TS"));
 384
 385         if (tsp == NULL && ticksp == NULL)
 386                 return;
 387
 388         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 389         if (tsp != NULL)
 390                 *tsp = ncp_ts->nc_time;
 391         if (ticksp != NULL)
 392                 *ticksp = ncp_ts->nc_ticks;
 393 }
 394
 395 #ifdef DEBUG_CACHE
 396 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 397 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 398     "VFS namecache enabled");
 399 #endif
 400
 401 /* Export size information to userland */
 402 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 403     sizeof(struct namecache), "sizeof(struct namecache)");
 404
 405 /*
 406  * The new name cache statistics
 407  */
 408 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 409     "Name cache statistics");
 410 #define STATNODE_ULONG(name, descr)                                     \
 411         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 412 #define STATNODE_COUNTER(name, descr)                                   \
 413         static COUNTER_U64_DEFINE_EARLY(name);                          \
 414         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 415             descr);
 416 STATNODE_ULONG(numneg, "Number of negative cache entries");
 417 STATNODE_ULONG(numcache, "Number of cache entries");
 418 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 419 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 420 STATNODE_COUNTER(dothits, "Number of '.' hits");
 421 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 422 STATNODE_COUNTER(nummiss, "Number of cache misses");
 423 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 424 STATNODE_COUNTER(numposzaps,
 425     "Number of cache hits (positive) we do not want to cache");
 426 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 427 STATNODE_COUNTER(numnegzaps,
 428     "Number of cache hits (negative) we do not want to cache");
 429 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 430 /* These count for vn_getcwd(), too. */
 431 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 432 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 433 STATNODE_COUNTER(numfullpathfail2,
 434     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 435 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 436 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 437 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
 438     "Number of successful removals after relocking");
 439 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
 440     "Number of times zap_and_exit failed to lock");
 441 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
 442     "Number of times zap_and_exit failed to lock");
 443 static long cache_lock_vnodes_cel_3_failures;
 444 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
 445     "Number of times 3-way vnode locking failed");
 446 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
 447 STATNODE_COUNTER(numneg_evicted,
 448     "Number of negative entries evicted when adding a new entry");
 449 STATNODE_COUNTER(shrinking_skipped,
 450     "Number of times shrinking was already in progress");
 451
 452 static void cache_zap_locked(struct namecache *ncp);
 453 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
 454     char **freebuf, size_t *buflen);
 455 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
 456     char *buf, char **retbuf, size_t *buflen);
 457 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
 458     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 459
 460 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 461
 462 static int cache_yield;
 463 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
 464     "Number of times cache called yield");
 465
 466 static void __noinline
 467 cache_maybe_yield(void)
 468 {
 469
 470         if (should_yield()) {
 471                 cache_yield++;
 472                 kern_yield(PRI_USER);
 473         }
 474 }
 475
 476 static inline void
 477 cache_assert_vlp_locked(struct mtx *vlp)
 478 {
 479
 480         if (vlp != NULL)
 481                 mtx_assert(vlp, MA_OWNED);
 482 }
 483
 484 static inline void
 485 cache_assert_vnode_locked(struct vnode *vp)
 486 {
 487         struct mtx *vlp;
 488
 489         vlp = VP2VNODELOCK(vp);
 490         cache_assert_vlp_locked(vlp);
 491 }
 492
 493 static uint32_t
 494 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 495 {
 496         uint32_t hash;
 497
 498         hash = fnv_32_buf(name, len, FNV1_32_INIT);
 499         hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 500         return (hash);
 501 }
 502
 503 static inline struct rwlock *
 504 NCP2BUCKETLOCK(struct namecache *ncp)
 505 {
 506         uint32_t hash;
 507
 508         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 509         return (HASH2BUCKETLOCK(hash));
 510 }
 511
 512 #ifdef INVARIANTS
 513 static void
 514 cache_assert_bucket_locked(struct namecache *ncp, int mode)
 515 {
 516         struct rwlock *blp;
 517
 518         blp = NCP2BUCKETLOCK(ncp);
 519         rw_assert(blp, mode);
 520 }
 521 #else
 522 #define cache_assert_bucket_locked(x, y) do { } while (0)
 523 #endif
 524
 525 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 526 static void
 527 _cache_sort_vnodes(void **p1, void **p2)
 528 {
 529         void *tmp;
 530
 531         MPASS(*p1 != NULL || *p2 != NULL);
 532
 533         if (*p1 > *p2) {
 534                 tmp = *p2;
 535                 *p2 = *p1;
 536                 *p1 = tmp;
 537         }
 538 }
 539
 540 static void
 541 cache_lock_all_buckets(void)
 542 {
 543         u_int i;
 544
 545         for (i = 0; i < numbucketlocks; i++)
 546                 rw_wlock(&bucketlocks[i]);
 547 }
 548
 549 static void
 550 cache_unlock_all_buckets(void)
 551 {
 552         u_int i;
 553
 554         for (i = 0; i < numbucketlocks; i++)
 555                 rw_wunlock(&bucketlocks[i]);
 556 }
 557
 558 static void
 559 cache_lock_all_vnodes(void)
 560 {
 561         u_int i;
 562
 563         for (i = 0; i < numvnodelocks; i++)
 564                 mtx_lock(&vnodelocks[i]);
 565 }
 566
 567 static void
 568 cache_unlock_all_vnodes(void)
 569 {
 570         u_int i;
 571
 572         for (i = 0; i < numvnodelocks; i++)
 573                 mtx_unlock(&vnodelocks[i]);
 574 }
 575
 576 static int
 577 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 578 {
 579
 580         cache_sort_vnodes(&vlp1, &vlp2);
 581
 582         if (vlp1 != NULL) {
 583                 if (!mtx_trylock(vlp1))
 584                         return (EAGAIN);
 585         }
 586         if (!mtx_trylock(vlp2)) {
 587                 if (vlp1 != NULL)
 588                         mtx_unlock(vlp1);
 589                 return (EAGAIN);
 590         }
 591
 592         return (0);
 593 }
 594
 595 static void
 596 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 597 {
 598
 599         MPASS(vlp1 != NULL || vlp2 != NULL);
 600         MPASS(vlp1 <= vlp2);
 601
 602         if (vlp1 != NULL)
 603                 mtx_lock(vlp1);
 604         if (vlp2 != NULL)
 605                 mtx_lock(vlp2);
 606 }
 607
 608 static void
 609 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 610 {
 611
 612         MPASS(vlp1 != NULL || vlp2 != NULL);
 613
 614         if (vlp1 != NULL)
 615                 mtx_unlock(vlp1);
 616         if (vlp2 != NULL)
 617                 mtx_unlock(vlp2);
 618 }
 619
 620 static int
 621 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 622 {
 623         struct nchstats snap;
 624
 625         if (req->oldptr == NULL)
 626                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 627
 628         snap = nchstats;
 629         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 630         snap.ncs_neghits = counter_u64_fetch(numneghits);
 631         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 632             counter_u64_fetch(numnegzaps);
 633         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 634             counter_u64_fetch(nummiss);
 635
 636         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 637 }
 638 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 639     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 640     "VFS cache effectiveness statistics");
 641
 642 #ifdef DIAGNOSTIC
 643 /*
 644  * Grab an atomic snapshot of the name cache hash chain lengths
 645  */
 646 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 647     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 648     "hash table stats");
 649
 650 static int
 651 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 652 {
 653         struct nchashhead *ncpp;
 654         struct namecache *ncp;
 655         int i, error, n_nchash, *cntbuf;
 656
 657 retry:
 658         n_nchash = nchash + 1;  /* nchash is max index, not count */
 659         if (req->oldptr == NULL)
 660                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 661         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 662         cache_lock_all_buckets();
 663         if (n_nchash != nchash + 1) {
 664                 cache_unlock_all_buckets();
 665                 free(cntbuf, M_TEMP);
 666                 goto retry;
 667         }
 668         /* Scan hash tables counting entries */
 669         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 670                 CK_LIST_FOREACH(ncp, ncpp, nc_hash)
 671                         cntbuf[i]++;
 672         cache_unlock_all_buckets();
 673         for (error = 0, i = 0; i < n_nchash; i++)
 674                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 675                         break;
 676         free(cntbuf, M_TEMP);
 677         return (error);
 678 }
 679 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 680     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 681     "nchash chain lengths");
 682
 683 static int
 684 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 685 {
 686         int error;
 687         struct nchashhead *ncpp;
 688         struct namecache *ncp;
 689         int n_nchash;
 690         int count, maxlength, used, pct;
 691
 692         if (!req->oldptr)
 693                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 694
 695         cache_lock_all_buckets();
 696         n_nchash = nchash + 1;  /* nchash is max index, not count */
 697         used = 0;
 698         maxlength = 0;
 699
 700         /* Scan hash tables for applicable entries */
 701         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 702                 count = 0;
 703                 CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
 704                         count++;
 705                 }
 706                 if (count)
 707                         used++;
 708                 if (maxlength < count)
 709                         maxlength = count;
 710         }
 711         n_nchash = nchash + 1;
 712         cache_unlock_all_buckets();
 713         pct = (used * 100) / (n_nchash / 100);
 714         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 715         if (error)
 716                 return (error);
 717         error = SYSCTL_OUT(req, &used, sizeof(used));
 718         if (error)
 719                 return (error);
 720         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 721         if (error)
 722                 return (error);
 723         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 724         if (error)
 725                 return (error);
 726         return (0);
 727 }
 728 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 729     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 730     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 731 #endif
 732
 733 /*
 734  * Negative entries management
 735  *
 736  * A variation of LRU scheme is used. New entries are hashed into one of
 737  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 738  *
 739  * The shrinker will demote hot list head and evict from the cold list in a
 740  * round-robin manner.
 741  */
 742 static void
 743 cache_negative_init(struct namecache *ncp)
 744 {
 745         struct negstate *negstate;
 746
 747         ncp->nc_flag |= NCF_NEGATIVE;
 748         negstate = NCP2NEGSTATE(ncp);
 749         negstate->neg_flag = 0;
 750 }
 751
 752 static void
 753 cache_negative_hit(struct namecache *ncp)
 754 {
 755         struct neglist *neglist;
 756         struct negstate *negstate;
 757
 758         negstate = NCP2NEGSTATE(ncp);
 759         if ((negstate->neg_flag & NEG_HOT) != 0)
 760                 return;
 761         neglist = NCP2NEGLIST(ncp);
 762         mtx_lock(&ncneg_hot.nl_lock);
 763         mtx_lock(&neglist->nl_lock);
 764         if ((negstate->neg_flag & NEG_HOT) == 0) {
 765                 numhotneg++;
 766                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 767                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 768                 negstate->neg_flag |= NEG_HOT;
 769         }
 770         mtx_unlock(&neglist->nl_lock);
 771         mtx_unlock(&ncneg_hot.nl_lock);
 772 }
 773
 774 static void
 775 cache_negative_insert(struct namecache *ncp)
 776 {
 777         struct neglist *neglist;
 778
 779         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 780         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 781         neglist = NCP2NEGLIST(ncp);
 782         mtx_lock(&neglist->nl_lock);
 783         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 784         mtx_unlock(&neglist->nl_lock);
 785         atomic_add_rel_long(&numneg, 1);
 786 }
 787
 788 static void
 789 cache_negative_remove(struct namecache *ncp)
 790 {
 791         struct neglist *neglist;
 792         struct negstate *negstate;
 793         bool hot_locked = false;
 794         bool list_locked = false;
 795
 796         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 797         neglist = NCP2NEGLIST(ncp);
 798         negstate = NCP2NEGSTATE(ncp);
 799         if ((negstate->neg_flag & NEG_HOT) != 0) {
 800                 hot_locked = true;
 801                 mtx_lock(&ncneg_hot.nl_lock);
 802                 if ((negstate->neg_flag & NEG_HOT) == 0) {
 803                         list_locked = true;
 804                         mtx_lock(&neglist->nl_lock);
 805                 }
 806         } else {
 807                 list_locked = true;
 808                 mtx_lock(&neglist->nl_lock);
 809                 /*
 810                  * We may be racing against promotion in lockless lookup.
 811                  */
 812                 if ((negstate->neg_flag & NEG_HOT) != 0) {
 813                         mtx_unlock(&neglist->nl_lock);
 814                         hot_locked = true;
 815                         mtx_lock(&ncneg_hot.nl_lock);
 816                         mtx_lock(&neglist->nl_lock);
 817                 }
 818         }
 819         if ((negstate->neg_flag & NEG_HOT) != 0) {
 820                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
 821                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 822                 numhotneg--;
 823         } else {
 824                 mtx_assert(&neglist->nl_lock, MA_OWNED);
 825                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 826         }
 827         if (list_locked)
 828                 mtx_unlock(&neglist->nl_lock);
 829         if (hot_locked)
 830                 mtx_unlock(&ncneg_hot.nl_lock);
 831         atomic_subtract_rel_long(&numneg, 1);
 832 }
 833
 834 static void
 835 cache_negative_shrink_select(struct namecache **ncpp,
 836     struct neglist **neglistpp)
 837 {
 838         struct neglist *neglist;
 839         struct namecache *ncp;
 840         static u_int cycle;
 841         u_int i;
 842
 843         *ncpp = ncp = NULL;
 844
 845         for (i = 0; i < numneglists; i++) {
 846                 neglist = &neglists[(cycle + i) % numneglists];
 847                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 848                         continue;
 849                 mtx_lock(&neglist->nl_lock);
 850                 ncp = TAILQ_FIRST(&neglist->nl_list);
 851                 if (ncp != NULL)
 852                         break;
 853                 mtx_unlock(&neglist->nl_lock);
 854         }
 855
 856         *neglistpp = neglist;
 857         *ncpp = ncp;
 858         cycle++;
 859 }
 860
 861 static void
 862 cache_negative_zap_one(void)
 863 {
 864         struct namecache *ncp, *ncp2;
 865         struct neglist *neglist;
 866         struct negstate *negstate;
 867         struct mtx *dvlp;
 868         struct rwlock *blp;
 869
 870         if (mtx_owner(&ncneg_shrink_lock) != NULL ||
 871             !mtx_trylock(&ncneg_shrink_lock)) {
 872                 counter_u64_add(shrinking_skipped, 1);
 873                 return;
 874         }
 875
 876         mtx_lock(&ncneg_hot.nl_lock);
 877         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
 878         if (ncp != NULL) {
 879                 neglist = NCP2NEGLIST(ncp);
 880                 negstate = NCP2NEGSTATE(ncp);
 881                 mtx_lock(&neglist->nl_lock);
 882                 MPASS((negstate->neg_flag & NEG_HOT) != 0);
 883                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 884                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 885                 negstate->neg_flag &= ~NEG_HOT;
 886                 numhotneg--;
 887                 mtx_unlock(&neglist->nl_lock);
 888         }
 889         mtx_unlock(&ncneg_hot.nl_lock);
 890
 891         cache_negative_shrink_select(&ncp, &neglist);
 892
 893         mtx_unlock(&ncneg_shrink_lock);
 894         if (ncp == NULL)
 895                 return;
 896
 897         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 898         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 899         blp = NCP2BUCKETLOCK(ncp);
 900         mtx_unlock(&neglist->nl_lock);
 901         mtx_lock(dvlp);
 902         rw_wlock(blp);
 903         /*
 904          * Enter SMR to safely check the negative list.
 905          * Even if the found pointer matches, the entry may now be reallocated
 906          * and used by a different vnode.
 907          */
 908         vfs_smr_enter();
 909         ncp2 = TAILQ_FIRST(&neglist->nl_list);
 910         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
 911             blp != NCP2BUCKETLOCK(ncp2)) {
 912                 vfs_smr_exit();
 913                 ncp = NULL;
 914         } else {
 915                 vfs_smr_exit();
 916                 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 917                     ncp->nc_name);
 918                 cache_zap_locked(ncp);
 919                 counter_u64_add(numneg_evicted, 1);
 920         }
 921         rw_wunlock(blp);
 922         mtx_unlock(dvlp);
 923         cache_free(ncp);
 924 }
 925
 926 /*
 927  * cache_zap_locked():
 928  *
 929  *   Removes a namecache entry from cache, whether it contains an actual
 930  *   pointer to a vnode or if it is just a negative cache entry.
 931  */
 932 static void
 933 cache_zap_locked(struct namecache *ncp)
 934 {
 935
 936         if (!(ncp->nc_flag & NCF_NEGATIVE))
 937                 cache_assert_vnode_locked(ncp->nc_vp);
 938         cache_assert_vnode_locked(ncp->nc_dvp);
 939         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 940
 941         CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
 942             (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
 943
 944         cache_ncp_invalidate(ncp);
 945
 946         CK_LIST_REMOVE(ncp, nc_hash);
 947         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 948                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 949                     ncp->nc_name, ncp->nc_vp);
 950                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 951                 if (ncp == ncp->nc_vp->v_cache_dd)
 952                         ncp->nc_vp->v_cache_dd = NULL;
 953         } else {
 954                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 955                     ncp->nc_name);
 956                 cache_negative_remove(ncp);
 957         }
 958         if (ncp->nc_flag & NCF_ISDOTDOT) {
 959                 if (ncp == ncp->nc_dvp->v_cache_dd)
 960                         ncp->nc_dvp->v_cache_dd = NULL;
 961         } else {
 962                 LIST_REMOVE(ncp, nc_src);
 963                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 964                         ncp->nc_flag |= NCF_DVDROP;
 965                         counter_u64_add(numcachehv, -1);
 966                 }
 967         }
 968         atomic_subtract_rel_long(&numcache, 1);
 969 }
 970
 971 static void
 972 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 973 {
 974         struct rwlock *blp;
 975
 976         MPASS(ncp->nc_dvp == vp);
 977         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 978         cache_assert_vnode_locked(vp);
 979
 980         blp = NCP2BUCKETLOCK(ncp);
 981         rw_wlock(blp);
 982         cache_zap_locked(ncp);
 983         rw_wunlock(blp);
 984 }
 985
 986 static bool
 987 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
 988     struct mtx **vlpp)
 989 {
 990         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 991         struct rwlock *blp;
 992
 993         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 994         cache_assert_vnode_locked(vp);
 995
 996         if (ncp->nc_flag & NCF_NEGATIVE) {
 997                 if (*vlpp != NULL) {
 998                         mtx_unlock(*vlpp);
 999                         *vlpp = NULL;
1000                 }
1001                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1002                 return (true);
1003         }
1004
1005         pvlp = VP2VNODELOCK(vp);
1006         blp = NCP2BUCKETLOCK(ncp);
1007         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1008         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1009
1010         if (*vlpp == vlp1 || *vlpp == vlp2) {
1011                 to_unlock = *vlpp;
1012                 *vlpp = NULL;
1013         } else {
1014                 if (*vlpp != NULL) {
1015                         mtx_unlock(*vlpp);
1016                         *vlpp = NULL;
1017                 }
1018                 cache_sort_vnodes(&vlp1, &vlp2);
1019                 if (vlp1 == pvlp) {
1020                         mtx_lock(vlp2);
1021                         to_unlock = vlp2;
1022                 } else {
1023                         if (!mtx_trylock(vlp1))
1024                                 goto out_relock;
1025                         to_unlock = vlp1;
1026                 }
1027         }
1028         rw_wlock(blp);
1029         cache_zap_locked(ncp);
1030         rw_wunlock(blp);
1031         if (to_unlock != NULL)
1032                 mtx_unlock(to_unlock);
1033         return (true);
1034
1035 out_relock:
1036         mtx_unlock(vlp2);
1037         mtx_lock(vlp1);
1038         mtx_lock(vlp2);
1039         MPASS(*vlpp == NULL);
1040         *vlpp = vlp1;
1041         return (false);
1042 }
1043
1044 static int __noinline
1045 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1046 {
1047         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1048         struct rwlock *blp;
1049         int error = 0;
1050
1051         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1052         cache_assert_vnode_locked(vp);
1053
1054         pvlp = VP2VNODELOCK(vp);
1055         if (ncp->nc_flag & NCF_NEGATIVE) {
1056                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1057                 goto out;
1058         }
1059
1060         blp = NCP2BUCKETLOCK(ncp);
1061         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1062         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1063         cache_sort_vnodes(&vlp1, &vlp2);
1064         if (vlp1 == pvlp) {
1065                 mtx_lock(vlp2);
1066                 to_unlock = vlp2;
1067         } else {
1068                 if (!mtx_trylock(vlp1)) {
1069                         error = EAGAIN;
1070                         goto out;
1071                 }
1072                 to_unlock = vlp1;
1073         }
1074         rw_wlock(blp);
1075         cache_zap_locked(ncp);
1076         rw_wunlock(blp);
1077         mtx_unlock(to_unlock);
1078 out:
1079         mtx_unlock(pvlp);
1080         return (error);
1081 }
1082
1083 /*
1084  * If trylocking failed we can get here. We know enough to take all needed locks
1085  * in the right order and re-lookup the entry.
1086  */
1087 static int
1088 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1089     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1090     struct rwlock *blp)
1091 {
1092         struct namecache *rncp;
1093
1094         cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1095
1096         cache_sort_vnodes(&dvlp, &vlp);
1097         cache_lock_vnodes(dvlp, vlp);
1098         rw_wlock(blp);
1099         CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1100                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1101                     rncp->nc_nlen == cnp->cn_namelen &&
1102                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1103                         break;
1104         }
1105         if (rncp != NULL) {
1106                 cache_zap_locked(rncp);
1107                 rw_wunlock(blp);
1108                 cache_unlock_vnodes(dvlp, vlp);
1109                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1110                 return (0);
1111         }
1112
1113         rw_wunlock(blp);
1114         cache_unlock_vnodes(dvlp, vlp);
1115         return (EAGAIN);
1116 }
1117
1118 static int __noinline
1119 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1120     uint32_t hash, struct rwlock *blp)
1121 {
1122         struct mtx *dvlp, *vlp;
1123         struct vnode *dvp;
1124
1125         cache_assert_bucket_locked(ncp, RA_WLOCKED);
1126
1127         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1128         vlp = NULL;
1129         if (!(ncp->nc_flag & NCF_NEGATIVE))
1130                 vlp = VP2VNODELOCK(ncp->nc_vp);
1131         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1132                 cache_zap_locked(ncp);
1133                 rw_wunlock(blp);
1134                 cache_unlock_vnodes(dvlp, vlp);
1135                 return (0);
1136         }
1137
1138         dvp = ncp->nc_dvp;
1139         rw_wunlock(blp);
1140         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1141 }
1142
1143 static int __noinline
1144 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1145     uint32_t hash, struct rwlock *blp)
1146 {
1147         struct mtx *dvlp, *vlp;
1148         struct vnode *dvp;
1149
1150         cache_assert_bucket_locked(ncp, RA_RLOCKED);
1151
1152         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1153         vlp = NULL;
1154         if (!(ncp->nc_flag & NCF_NEGATIVE))
1155                 vlp = VP2VNODELOCK(ncp->nc_vp);
1156         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1157                 rw_runlock(blp);
1158                 rw_wlock(blp);
1159                 cache_zap_locked(ncp);
1160                 rw_wunlock(blp);
1161                 cache_unlock_vnodes(dvlp, vlp);
1162                 return (0);
1163         }
1164
1165         dvp = ncp->nc_dvp;
1166         rw_runlock(blp);
1167         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1168 }
1169
1170 static int
1171 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1172     struct mtx **vlpp1, struct mtx **vlpp2)
1173 {
1174         struct mtx *dvlp, *vlp;
1175
1176         cache_assert_bucket_locked(ncp, RA_WLOCKED);
1177
1178         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1179         vlp = NULL;
1180         if (!(ncp->nc_flag & NCF_NEGATIVE))
1181                 vlp = VP2VNODELOCK(ncp->nc_vp);
1182         cache_sort_vnodes(&dvlp, &vlp);
1183
1184         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1185                 cache_zap_locked(ncp);
1186                 cache_unlock_vnodes(dvlp, vlp);
1187                 *vlpp1 = NULL;
1188                 *vlpp2 = NULL;
1189                 return (0);
1190         }
1191
1192         if (*vlpp1 != NULL)
1193                 mtx_unlock(*vlpp1);
1194         if (*vlpp2 != NULL)
1195                 mtx_unlock(*vlpp2);
1196         *vlpp1 = NULL;
1197         *vlpp2 = NULL;
1198
1199         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1200                 cache_zap_locked(ncp);
1201                 cache_unlock_vnodes(dvlp, vlp);
1202                 return (0);
1203         }
1204
1205         rw_wunlock(blp);
1206         *vlpp1 = dvlp;
1207         *vlpp2 = vlp;
1208         if (*vlpp1 != NULL)
1209                 mtx_lock(*vlpp1);
1210         mtx_lock(*vlpp2);
1211         rw_wlock(blp);
1212         return (EAGAIN);
1213 }
1214
1215 static void
1216 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1217 {
1218
1219         if (blp != NULL) {
1220                 rw_runlock(blp);
1221         } else {
1222                 mtx_unlock(vlp);
1223         }
1224 }
1225
1226 static int __noinline
1227 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1228     struct timespec *tsp, int *ticksp)
1229 {
1230         int ltype;
1231
1232         *vpp = dvp;
1233         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1234                         dvp, cnp->cn_nameptr);
1235         counter_u64_add(dothits, 1);
1236         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1237         if (tsp != NULL)
1238                 timespecclear(tsp);
1239         if (ticksp != NULL)
1240                 *ticksp = ticks;
1241         vrefact(*vpp);
1242         /*
1243          * When we lookup "." we still can be asked to lock it
1244          * differently...
1245          */
1246         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1247         if (ltype != VOP_ISLOCKED(*vpp)) {
1248                 if (ltype == LK_EXCLUSIVE) {
1249                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1250                         if (VN_IS_DOOMED((*vpp))) {
1251                                 /* forced unmount */
1252                                 vrele(*vpp);
1253                                 *vpp = NULL;
1254                                 return (ENOENT);
1255                         }
1256                 } else
1257                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1258         }
1259         return (-1);
1260 }
1261
1262 static __noinline int
1263 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1264     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1265 {
1266         struct namecache *ncp;
1267         struct rwlock *blp;
1268         struct mtx *dvlp, *dvlp2;
1269         uint32_t hash;
1270         int error;
1271
1272         if (cnp->cn_namelen == 2 &&
1273             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1274                 counter_u64_add(dotdothits, 1);
1275                 dvlp = VP2VNODELOCK(dvp);
1276                 dvlp2 = NULL;
1277                 mtx_lock(dvlp);
1278 retry_dotdot:
1279                 ncp = dvp->v_cache_dd;
1280                 if (ncp == NULL) {
1281                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1282                             "..", NULL);
1283                         mtx_unlock(dvlp);
1284                         if (dvlp2 != NULL)
1285                                 mtx_unlock(dvlp2);
1286                         return (0);
1287                 }
1288                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1289                         if (ncp->nc_dvp != dvp)
1290                                 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1291                         if (!cache_zap_locked_vnode_kl2(ncp,
1292                             dvp, &dvlp2))
1293                                 goto retry_dotdot;
1294                         MPASS(dvp->v_cache_dd == NULL);
1295                         mtx_unlock(dvlp);
1296                         if (dvlp2 != NULL)
1297                                 mtx_unlock(dvlp2);
1298                         cache_free(ncp);
1299                 } else {
1300                         dvp->v_cache_dd = NULL;
1301                         mtx_unlock(dvlp);
1302                         if (dvlp2 != NULL)
1303                                 mtx_unlock(dvlp2);
1304                 }
1305                 return (0);
1306         }
1307
1308         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1309         blp = HASH2BUCKETLOCK(hash);
1310 retry:
1311         if (CK_LIST_EMPTY(NCHHASH(hash)))
1312                 goto out_no_entry;
1313
1314         rw_wlock(blp);
1315
1316         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1317                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1318                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1319                         break;
1320         }
1321
1322         /* We failed to find an entry */
1323         if (ncp == NULL) {
1324                 rw_wunlock(blp);
1325                 goto out_no_entry;
1326         }
1327
1328         error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1329         if (__predict_false(error != 0)) {
1330                 zap_and_exit_bucket_fail++;
1331                 cache_maybe_yield();
1332                 goto retry;
1333         }
1334         counter_u64_add(numposzaps, 1);
1335         cache_free(ncp);
1336         return (0);
1337 out_no_entry:
1338         SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1339         counter_u64_add(nummisszap, 1);
1340         return (0);
1341 }
1342
1343 /**
1344  * Lookup a name in the name cache
1345  *
1346  * # Arguments
1347  *
1348  * - dvp:       Parent directory in which to search.
1349  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1350  * - cnp:       Parameters of the name search.  The most interesting bits of
1351  *              the cn_flags field have the following meanings:
1352  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1353  *                      it up.
1354  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1355  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1356  *              or negative) lookup, tsp will be filled with any timespec that
1357  *              was stored when this cache entry was created.  However, it will
1358  *              be clear for "." entries.
1359  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1360  *              (positive or negative) lookup, it will contain the ticks value
1361  *              that was current when the cache entry was created, unless cnp
1362  *              was ".".
1363  *
1364  * # Returns
1365  *
1366  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1367  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1368  *              to a forced unmount.  vpp will not be modified.  If the entry
1369  *              is a whiteout, then the ISWHITEOUT flag will be set in
1370  *              cnp->cn_flags.
1371  * - 0:         A cache miss.  vpp will not be modified.
1372  *
1373  * # Locking
1374  *
1375  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1376  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1377  * lock is not recursively acquired.
1378  */
1379 int
1380 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1381     struct timespec *tsp, int *ticksp)
1382 {
1383         struct namecache_ts *ncp_ts;
1384         struct namecache *ncp;
1385         struct negstate *negstate;
1386         struct rwlock *blp;
1387         struct mtx *dvlp;
1388         uint32_t hash;
1389         enum vgetstate vs;
1390         int error, ltype;
1391         bool try_smr, doing_smr, whiteout;
1392
1393 #ifdef DEBUG_CACHE
1394         if (__predict_false(!doingcache)) {
1395                 cnp->cn_flags &= ~MAKEENTRY;
1396                 return (0);
1397         }
1398 #endif
1399
1400         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1401                 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1402
1403         if ((cnp->cn_flags & MAKEENTRY) == 0)
1404                 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1405
1406         try_smr = true;
1407         if (cnp->cn_nameiop == CREATE)
1408                 try_smr = false;
1409 retry:
1410         doing_smr = false;
1411         blp = NULL;
1412         dvlp = NULL;
1413         error = 0;
1414         if (cnp->cn_namelen == 2 &&
1415             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1416                 counter_u64_add(dotdothits, 1);
1417                 dvlp = VP2VNODELOCK(dvp);
1418                 mtx_lock(dvlp);
1419                 ncp = dvp->v_cache_dd;
1420                 if (ncp == NULL) {
1421                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1422                             "..", NULL);
1423                         mtx_unlock(dvlp);
1424                         return (0);
1425                 }
1426                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1427                         if (ncp->nc_flag & NCF_NEGATIVE)
1428                                 *vpp = NULL;
1429                         else
1430                                 *vpp = ncp->nc_vp;
1431                 } else
1432                         *vpp = ncp->nc_dvp;
1433                 /* Return failure if negative entry was found. */
1434                 if (*vpp == NULL)
1435                         goto negative_success;
1436                 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1437                     dvp, cnp->cn_nameptr, *vpp);
1438                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1439                     *vpp);
1440                 cache_out_ts(ncp, tsp, ticksp);
1441                 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1442                     NCF_DTS && tsp != NULL) {
1443                         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1444                         *tsp = ncp_ts->nc_dotdottime;
1445                 }
1446                 goto success;
1447         }
1448
1449         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1450 retry_hashed:
1451         if (try_smr) {
1452                 vfs_smr_enter();
1453                 doing_smr = true;
1454                 try_smr = false;
1455         } else {
1456                 blp = HASH2BUCKETLOCK(hash);
1457                 rw_rlock(blp);
1458         }
1459
1460         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1461                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1462                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1463                         break;
1464         }
1465
1466         /* We failed to find an entry */
1467         if (__predict_false(ncp == NULL)) {
1468                 if (doing_smr)
1469                         vfs_smr_exit();
1470                 else
1471                         rw_runlock(blp);
1472                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1473                     NULL);
1474                 counter_u64_add(nummiss, 1);
1475                 return (0);
1476         }
1477
1478         if (ncp->nc_flag & NCF_NEGATIVE)
1479                 goto negative_success;
1480
1481         /* We found a "positive" match, return the vnode */
1482         counter_u64_add(numposhits, 1);
1483         *vpp = ncp->nc_vp;
1484         CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1485             dvp, cnp->cn_nameptr, *vpp, ncp);
1486         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1487             *vpp);
1488         cache_out_ts(ncp, tsp, ticksp);
1489 success:
1490         /*
1491          * On success we return a locked and ref'd vnode as per the lookup
1492          * protocol.
1493          */
1494         MPASS(dvp != *vpp);
1495         ltype = 0;      /* silence gcc warning */
1496         if (cnp->cn_flags & ISDOTDOT) {
1497                 ltype = VOP_ISLOCKED(dvp);
1498                 VOP_UNLOCK(dvp);
1499         }
1500         if (doing_smr) {
1501                 if (cache_ncp_invalid(ncp)) {
1502                         vfs_smr_exit();
1503                         *vpp = NULL;
1504                         goto retry;
1505                 }
1506                 vs = vget_prep_smr(*vpp);
1507                 vfs_smr_exit();
1508                 if (vs == VGET_NONE) {
1509                         *vpp = NULL;
1510                         goto retry;
1511                 }
1512         } else {
1513                 vs = vget_prep(*vpp);
1514                 cache_lookup_unlock(blp, dvlp);
1515         }
1516         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1517         if (cnp->cn_flags & ISDOTDOT) {
1518                 vn_lock(dvp, ltype | LK_RETRY);
1519                 if (VN_IS_DOOMED(dvp)) {
1520                         if (error == 0)
1521                                 vput(*vpp);
1522                         *vpp = NULL;
1523                         return (ENOENT);
1524                 }
1525         }
1526         if (error) {
1527                 *vpp = NULL;
1528                 goto retry;
1529         }
1530         if ((cnp->cn_flags & ISLASTCN) &&
1531             (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1532                 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1533         }
1534         return (-1);
1535
1536 negative_success:
1537         /* We found a negative match, and want to create it, so purge */
1538         if (cnp->cn_nameiop == CREATE) {
1539                 MPASS(!doing_smr);
1540                 counter_u64_add(numnegzaps, 1);
1541                 goto zap_and_exit;
1542         }
1543
1544         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1545         cache_out_ts(ncp, tsp, ticksp);
1546         counter_u64_add(numneghits, 1);
1547         whiteout = (ncp->nc_flag & NCF_WHITE);
1548
1549         if (doing_smr) {
1550                 /*
1551                  * We need to take locks to promote an entry.
1552                  */
1553                 negstate = NCP2NEGSTATE(ncp);
1554                 if ((negstate->neg_flag & NEG_HOT) == 0 ||
1555                     cache_ncp_invalid(ncp)) {
1556                         vfs_smr_exit();
1557                         doing_smr = false;
1558                         goto retry_hashed;
1559                 }
1560                 vfs_smr_exit();
1561         } else {
1562                 cache_negative_hit(ncp);
1563                 cache_lookup_unlock(blp, dvlp);
1564         }
1565         if (whiteout)
1566                 cnp->cn_flags |= ISWHITEOUT;
1567         return (ENOENT);
1568
1569 zap_and_exit:
1570         MPASS(!doing_smr);
1571         if (blp != NULL)
1572                 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1573         else
1574                 error = cache_zap_locked_vnode(ncp, dvp);
1575         if (__predict_false(error != 0)) {
1576                 zap_and_exit_bucket_fail2++;
1577                 cache_maybe_yield();
1578                 goto retry;
1579         }
1580         cache_free(ncp);
1581         return (0);
1582 }
1583
1584 struct celockstate {
1585         struct mtx *vlp[3];
1586         struct rwlock *blp[2];
1587 };
1588 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1589 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1590
1591 static inline void
1592 cache_celockstate_init(struct celockstate *cel)
1593 {
1594
1595         bzero(cel, sizeof(*cel));
1596 }
1597
1598 static void
1599 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1600     struct vnode *dvp)
1601 {
1602         struct mtx *vlp1, *vlp2;
1603
1604         MPASS(cel->vlp[0] == NULL);
1605         MPASS(cel->vlp[1] == NULL);
1606         MPASS(cel->vlp[2] == NULL);
1607
1608         MPASS(vp != NULL || dvp != NULL);
1609
1610         vlp1 = VP2VNODELOCK(vp);
1611         vlp2 = VP2VNODELOCK(dvp);
1612         cache_sort_vnodes(&vlp1, &vlp2);
1613
1614         if (vlp1 != NULL) {
1615                 mtx_lock(vlp1);
1616                 cel->vlp[0] = vlp1;
1617         }
1618         mtx_lock(vlp2);
1619         cel->vlp[1] = vlp2;
1620 }
1621
1622 static void
1623 cache_unlock_vnodes_cel(struct celockstate *cel)
1624 {
1625
1626         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1627
1628         if (cel->vlp[0] != NULL)
1629                 mtx_unlock(cel->vlp[0]);
1630         if (cel->vlp[1] != NULL)
1631                 mtx_unlock(cel->vlp[1]);
1632         if (cel->vlp[2] != NULL)
1633                 mtx_unlock(cel->vlp[2]);
1634 }
1635
1636 static bool
1637 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1638 {
1639         struct mtx *vlp;
1640         bool ret;
1641
1642         cache_assert_vlp_locked(cel->vlp[0]);
1643         cache_assert_vlp_locked(cel->vlp[1]);
1644         MPASS(cel->vlp[2] == NULL);
1645
1646         MPASS(vp != NULL);
1647         vlp = VP2VNODELOCK(vp);
1648
1649         ret = true;
1650         if (vlp >= cel->vlp[1]) {
1651                 mtx_lock(vlp);
1652         } else {
1653                 if (mtx_trylock(vlp))
1654                         goto out;
1655                 cache_lock_vnodes_cel_3_failures++;
1656                 cache_unlock_vnodes_cel(cel);
1657                 if (vlp < cel->vlp[0]) {
1658                         mtx_lock(vlp);
1659                         mtx_lock(cel->vlp[0]);
1660                         mtx_lock(cel->vlp[1]);
1661                 } else {
1662                         if (cel->vlp[0] != NULL)
1663                                 mtx_lock(cel->vlp[0]);
1664                         mtx_lock(vlp);
1665                         mtx_lock(cel->vlp[1]);
1666                 }
1667                 ret = false;
1668         }
1669 out:
1670         cel->vlp[2] = vlp;
1671         return (ret);
1672 }
1673
1674 static void
1675 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1676     struct rwlock *blp2)
1677 {
1678
1679         MPASS(cel->blp[0] == NULL);
1680         MPASS(cel->blp[1] == NULL);
1681
1682         cache_sort_vnodes(&blp1, &blp2);
1683
1684         if (blp1 != NULL) {
1685                 rw_wlock(blp1);
1686                 cel->blp[0] = blp1;
1687         }
1688         rw_wlock(blp2);
1689         cel->blp[1] = blp2;
1690 }
1691
1692 static void
1693 cache_unlock_buckets_cel(struct celockstate *cel)
1694 {
1695
1696         if (cel->blp[0] != NULL)
1697                 rw_wunlock(cel->blp[0]);
1698         rw_wunlock(cel->blp[1]);
1699 }
1700
1701 /*
1702  * Lock part of the cache affected by the insertion.
1703  *
1704  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1705  * However, insertion can result in removal of an old entry. In this
1706  * case we have an additional vnode and bucketlock pair to lock. If the
1707  * entry is negative, ncelock is locked instead of the vnode.
1708  *
1709  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1710  * preserving the locking order (smaller address first).
1711  */
1712 static void
1713 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1714     uint32_t hash)
1715 {
1716         struct namecache *ncp;
1717         struct rwlock *blps[2];
1718
1719         blps[0] = HASH2BUCKETLOCK(hash);
1720         for (;;) {
1721                 blps[1] = NULL;
1722                 cache_lock_vnodes_cel(cel, dvp, vp);
1723                 if (vp == NULL || vp->v_type != VDIR)
1724                         break;
1725                 ncp = vp->v_cache_dd;
1726                 if (ncp == NULL)
1727                         break;
1728                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1729                         break;
1730                 MPASS(ncp->nc_dvp == vp);
1731                 blps[1] = NCP2BUCKETLOCK(ncp);
1732                 if (ncp->nc_flag & NCF_NEGATIVE)
1733                         break;
1734                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1735                         break;
1736                 /*
1737                  * All vnodes got re-locked. Re-validate the state and if
1738                  * nothing changed we are done. Otherwise restart.
1739                  */
1740                 if (ncp == vp->v_cache_dd &&
1741                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1742                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1743                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1744                         break;
1745                 cache_unlock_vnodes_cel(cel);
1746                 cel->vlp[0] = NULL;
1747                 cel->vlp[1] = NULL;
1748                 cel->vlp[2] = NULL;
1749         }
1750         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1751 }
1752
1753 static void
1754 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1755     uint32_t hash)
1756 {
1757         struct namecache *ncp;
1758         struct rwlock *blps[2];
1759
1760         blps[0] = HASH2BUCKETLOCK(hash);
1761         for (;;) {
1762                 blps[1] = NULL;
1763                 cache_lock_vnodes_cel(cel, dvp, vp);
1764                 ncp = dvp->v_cache_dd;
1765                 if (ncp == NULL)
1766                         break;
1767                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1768                         break;
1769                 MPASS(ncp->nc_dvp == dvp);
1770                 blps[1] = NCP2BUCKETLOCK(ncp);
1771                 if (ncp->nc_flag & NCF_NEGATIVE)
1772                         break;
1773                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1774                         break;
1775                 if (ncp == dvp->v_cache_dd &&
1776                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1777                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1778                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1779                         break;
1780                 cache_unlock_vnodes_cel(cel);
1781                 cel->vlp[0] = NULL;
1782                 cel->vlp[1] = NULL;
1783                 cel->vlp[2] = NULL;
1784         }
1785         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1786 }
1787
1788 static void
1789 cache_enter_unlock(struct celockstate *cel)
1790 {
1791
1792         cache_unlock_buckets_cel(cel);
1793         cache_unlock_vnodes_cel(cel);
1794 }
1795
1796 static void __noinline
1797 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1798     struct componentname *cnp)
1799 {
1800         struct celockstate cel;
1801         struct namecache *ncp;
1802         uint32_t hash;
1803         int len;
1804
1805         if (dvp->v_cache_dd == NULL)
1806                 return;
1807         len = cnp->cn_namelen;
1808         cache_celockstate_init(&cel);
1809         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1810         cache_enter_lock_dd(&cel, dvp, vp, hash);
1811         ncp = dvp->v_cache_dd;
1812         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1813                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1814                 cache_zap_locked(ncp);
1815         } else {
1816                 ncp = NULL;
1817         }
1818         dvp->v_cache_dd = NULL;
1819         cache_enter_unlock(&cel);
1820         cache_free(ncp);
1821 }
1822
1823 /*
1824  * Add an entry to the cache.
1825  */
1826 void
1827 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1828     struct timespec *tsp, struct timespec *dtsp)
1829 {
1830         struct celockstate cel;
1831         struct namecache *ncp, *n2, *ndd;
1832         struct namecache_ts *ncp_ts, *n2_ts;
1833         struct nchashhead *ncpp;
1834         uint32_t hash;
1835         int flag;
1836         int len;
1837         u_long lnumcache;
1838
1839         CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1840         VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1841             ("cache_enter: Adding a doomed vnode"));
1842         VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1843             ("cache_enter: Doomed vnode used as src"));
1844
1845 #ifdef DEBUG_CACHE
1846         if (__predict_false(!doingcache))
1847                 return;
1848 #endif
1849
1850         flag = 0;
1851         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1852                 if (cnp->cn_namelen == 1)
1853                         return;
1854                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1855                         cache_enter_dotdot_prep(dvp, vp, cnp);
1856                         flag = NCF_ISDOTDOT;
1857                 }
1858         }
1859
1860         /*
1861          * Avoid blowout in namecache entries.
1862          */
1863         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1864         if (__predict_false(lnumcache >= ncsize)) {
1865                 atomic_add_long(&numcache, -1);
1866                 counter_u64_add(numdrops, 1);
1867                 return;
1868         }
1869
1870         cache_celockstate_init(&cel);
1871         ndd = NULL;
1872         ncp_ts = NULL;
1873
1874         /*
1875          * Calculate the hash key and setup as much of the new
1876          * namecache entry as possible before acquiring the lock.
1877          */
1878         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1879         ncp->nc_flag = flag;
1880         ncp->nc_vp = vp;
1881         if (vp == NULL)
1882                 cache_negative_init(ncp);
1883         ncp->nc_dvp = dvp;
1884         if (tsp != NULL) {
1885                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1886                 ncp_ts->nc_time = *tsp;
1887                 ncp_ts->nc_ticks = ticks;
1888                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1889                 if (dtsp != NULL) {
1890                         ncp_ts->nc_dotdottime = *dtsp;
1891                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1892                 }
1893         }
1894         len = ncp->nc_nlen = cnp->cn_namelen;
1895         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1896         strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1897         cache_enter_lock(&cel, dvp, vp, hash);
1898
1899         /*
1900          * See if this vnode or negative entry is already in the cache
1901          * with this name.  This can happen with concurrent lookups of
1902          * the same path name.
1903          */
1904         ncpp = NCHHASH(hash);
1905         CK_LIST_FOREACH(n2, ncpp, nc_hash) {
1906                 if (n2->nc_dvp == dvp &&
1907                     n2->nc_nlen == cnp->cn_namelen &&
1908                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1909                         if (tsp != NULL) {
1910                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
1911                                     ("no NCF_TS"));
1912                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1913                                 n2_ts->nc_time = ncp_ts->nc_time;
1914                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
1915                                 if (dtsp != NULL) {
1916                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1917                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
1918                                 }
1919                         }
1920                         goto out_unlock_free;
1921                 }
1922         }
1923
1924         if (flag == NCF_ISDOTDOT) {
1925                 /*
1926                  * See if we are trying to add .. entry, but some other lookup
1927                  * has populated v_cache_dd pointer already.
1928                  */
1929                 if (dvp->v_cache_dd != NULL)
1930                         goto out_unlock_free;
1931                 KASSERT(vp == NULL || vp->v_type == VDIR,
1932                     ("wrong vnode type %p", vp));
1933                 dvp->v_cache_dd = ncp;
1934         }
1935
1936         if (vp != NULL) {
1937                 if (vp->v_type == VDIR) {
1938                         if (flag != NCF_ISDOTDOT) {
1939                                 /*
1940                                  * For this case, the cache entry maps both the
1941                                  * directory name in it and the name ".." for the
1942                                  * directory's parent.
1943                                  */
1944                                 if ((ndd = vp->v_cache_dd) != NULL) {
1945                                         if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1946                                                 cache_zap_locked(ndd);
1947                                         else
1948                                                 ndd = NULL;
1949                                 }
1950                                 vp->v_cache_dd = ncp;
1951                         }
1952                 } else {
1953                         vp->v_cache_dd = NULL;
1954                 }
1955         }
1956
1957         if (flag != NCF_ISDOTDOT) {
1958                 if (LIST_EMPTY(&dvp->v_cache_src)) {
1959                         vhold(dvp);
1960                         counter_u64_add(numcachehv, 1);
1961                 }
1962                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1963         }
1964
1965         /*
1966          * If the entry is "negative", we place it into the
1967          * "negative" cache queue, otherwise, we place it into the
1968          * destination vnode's cache entries queue.
1969          */
1970         if (vp != NULL) {
1971                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1972                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1973                     vp);
1974         } else {
1975                 if (cnp->cn_flags & ISWHITEOUT)
1976                         ncp->nc_flag |= NCF_WHITE;
1977                 cache_negative_insert(ncp);
1978                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1979                     ncp->nc_name);
1980         }
1981
1982         atomic_thread_fence_rel();
1983         /*
1984          * Insert the new namecache entry into the appropriate chain
1985          * within the cache entries table.
1986          */
1987         CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1988
1989         cache_enter_unlock(&cel);
1990         if (numneg * ncnegfactor > lnumcache)
1991                 cache_negative_zap_one();
1992         cache_free(ndd);
1993         return;
1994 out_unlock_free:
1995         cache_enter_unlock(&cel);
1996         cache_free(ncp);
1997         return;
1998 }
1999
2000 static u_int
2001 cache_roundup_2(u_int val)
2002 {
2003         u_int res;
2004
2005         for (res = 1; res <= val; res <<= 1)
2006                 continue;
2007
2008         return (res);
2009 }
2010
2011 /*
2012  * Name cache initialization, from vfs_init() when we are booting
2013  */
2014 static void
2015 nchinit(void *dummy __unused)
2016 {
2017         u_int i;
2018
2019         cache_zone_small = uma_zcreate("S VFS Cache",
2020             sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2021             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2022             UMA_ZONE_ZINIT);
2023         cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2024             sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2025             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2026             UMA_ZONE_ZINIT);
2027         cache_zone_large = uma_zcreate("L VFS Cache",
2028             sizeof(struct namecache) + NAME_MAX + 1,
2029             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2030             UMA_ZONE_ZINIT);
2031         cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2032             sizeof(struct namecache_ts) + NAME_MAX + 1,
2033             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2034             UMA_ZONE_ZINIT);
2035
2036         VFS_SMR_ZONE_SET(cache_zone_small);
2037         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2038         VFS_SMR_ZONE_SET(cache_zone_large);
2039         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2040
2041         ncsize = desiredvnodes * ncsizefactor;
2042         nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
2043         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2044         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2045                 ncbuckethash = 7;
2046         if (ncbuckethash > nchash)
2047                 ncbuckethash = nchash;
2048         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2049             M_WAITOK | M_ZERO);
2050         for (i = 0; i < numbucketlocks; i++)
2051                 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2052         ncvnodehash = ncbuckethash;
2053         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2054             M_WAITOK | M_ZERO);
2055         for (i = 0; i < numvnodelocks; i++)
2056                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2057         ncpurgeminvnodes = numbucketlocks * 2;
2058
2059         ncneghash = 3;
2060         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2061             M_WAITOK | M_ZERO);
2062         for (i = 0; i < numneglists; i++) {
2063                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2064                 TAILQ_INIT(&neglists[i].nl_list);
2065         }
2066         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2067         TAILQ_INIT(&ncneg_hot.nl_list);
2068
2069         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2070 }
2071 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2072
2073 void
2074 cache_changesize(u_long newmaxvnodes)
2075 {
2076         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2077         u_long new_nchash, old_nchash;
2078         struct namecache *ncp;
2079         uint32_t hash;
2080         u_long newncsize;
2081         int i;
2082
2083         newncsize = newmaxvnodes * ncsizefactor;
2084         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2085         if (newmaxvnodes < numbucketlocks)
2086                 newmaxvnodes = numbucketlocks;
2087
2088         new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2089         /* If same hash table size, nothing to do */
2090         if (nchash == new_nchash) {
2091                 free(new_nchashtbl, M_VFSCACHE);
2092                 return;
2093         }
2094         /*
2095          * Move everything from the old hash table to the new table.
2096          * None of the namecache entries in the table can be removed
2097          * because to do so, they have to be removed from the hash table.
2098          */
2099         cache_lock_all_vnodes();
2100         cache_lock_all_buckets();
2101         old_nchashtbl = nchashtbl;
2102         old_nchash = nchash;
2103         nchashtbl = new_nchashtbl;
2104         nchash = new_nchash;
2105         for (i = 0; i <= old_nchash; i++) {
2106                 while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2107                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2108                             ncp->nc_dvp);
2109                         CK_LIST_REMOVE(ncp, nc_hash);
2110                         CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2111                 }
2112         }
2113         ncsize = newncsize;
2114         cache_unlock_all_buckets();
2115         cache_unlock_all_vnodes();
2116         free(old_nchashtbl, M_VFSCACHE);
2117 }
2118
2119 /*
2120  * Invalidate all entries from and to a particular vnode.
2121  */
2122 void
2123 cache_purge(struct vnode *vp)
2124 {
2125         TAILQ_HEAD(, namecache) ncps;
2126         struct namecache *ncp, *nnp;
2127         struct mtx *vlp, *vlp2;
2128
2129         CTR1(KTR_VFS, "cache_purge(%p)", vp);
2130         SDT_PROBE1(vfs, namecache, purge, done, vp);
2131         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2132             vp->v_cache_dd == NULL)
2133                 return;
2134         TAILQ_INIT(&ncps);
2135         vlp = VP2VNODELOCK(vp);
2136         vlp2 = NULL;
2137         mtx_lock(vlp);
2138 retry:
2139         while (!LIST_EMPTY(&vp->v_cache_src)) {
2140                 ncp = LIST_FIRST(&vp->v_cache_src);
2141                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2142                         goto retry;
2143                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2144         }
2145         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2146                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2147                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2148                         goto retry;
2149                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2150         }
2151         ncp = vp->v_cache_dd;
2152         if (ncp != NULL) {
2153                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2154                    ("lost dotdot link"));
2155                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2156                         goto retry;
2157                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2158         }
2159         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2160         mtx_unlock(vlp);
2161         if (vlp2 != NULL)
2162                 mtx_unlock(vlp2);
2163         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2164                 cache_free(ncp);
2165         }
2166 }
2167
2168 /*
2169  * Invalidate all negative entries for a particular directory vnode.
2170  */
2171 void
2172 cache_purge_negative(struct vnode *vp)
2173 {
2174         TAILQ_HEAD(, namecache) ncps;
2175         struct namecache *ncp, *nnp;
2176         struct mtx *vlp;
2177
2178         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2179         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2180         if (LIST_EMPTY(&vp->v_cache_src))
2181                 return;
2182         TAILQ_INIT(&ncps);
2183         vlp = VP2VNODELOCK(vp);
2184         mtx_lock(vlp);
2185         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2186                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2187                         continue;
2188                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2189                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2190         }
2191         mtx_unlock(vlp);
2192         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2193                 cache_free(ncp);
2194         }
2195 }
2196
2197 /*
2198  * Flush all entries referencing a particular filesystem.
2199  */
2200 void
2201 cache_purgevfs(struct mount *mp, bool force)
2202 {
2203         TAILQ_HEAD(, namecache) ncps;
2204         struct mtx *vlp1, *vlp2;
2205         struct rwlock *blp;
2206         struct nchashhead *bucket;
2207         struct namecache *ncp, *nnp;
2208         u_long i, j, n_nchash;
2209         int error;
2210
2211         /* Scan hash tables for applicable entries */
2212         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2213         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2214                 return;
2215         TAILQ_INIT(&ncps);
2216         n_nchash = nchash + 1;
2217         vlp1 = vlp2 = NULL;
2218         for (i = 0; i < numbucketlocks; i++) {
2219                 blp = (struct rwlock *)&bucketlocks[i];
2220                 rw_wlock(blp);
2221                 for (j = i; j < n_nchash; j += numbucketlocks) {
2222 retry:
2223                         bucket = &nchashtbl[j];
2224                         CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2225                                 cache_assert_bucket_locked(ncp, RA_WLOCKED);
2226                                 if (ncp->nc_dvp->v_mount != mp)
2227                                         continue;
2228                                 error = cache_zap_wlocked_bucket_kl(ncp, blp,
2229                                     &vlp1, &vlp2);
2230                                 if (error != 0)
2231                                         goto retry;
2232                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2233                         }
2234                 }
2235                 rw_wunlock(blp);
2236                 if (vlp1 == NULL && vlp2 == NULL)
2237                         cache_maybe_yield();
2238         }
2239         if (vlp1 != NULL)
2240                 mtx_unlock(vlp1);
2241         if (vlp2 != NULL)
2242                 mtx_unlock(vlp2);
2243
2244         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2245                 cache_free(ncp);
2246         }
2247 }
2248
2249 /*
2250  * Perform canonical checks and cache lookup and pass on to filesystem
2251  * through the vop_cachedlookup only if needed.
2252  */
2253
2254 int
2255 vfs_cache_lookup(struct vop_lookup_args *ap)
2256 {
2257         struct vnode *dvp;
2258         int error;
2259         struct vnode **vpp = ap->a_vpp;
2260         struct componentname *cnp = ap->a_cnp;
2261         int flags = cnp->cn_flags;
2262
2263         *vpp = NULL;
2264         dvp = ap->a_dvp;
2265
2266         if (dvp->v_type != VDIR)
2267                 return (ENOTDIR);
2268
2269         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2270             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2271                 return (EROFS);
2272
2273         error = vn_dir_check_exec(dvp, cnp);
2274         if (error != 0)
2275                 return (error);
2276
2277         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2278         if (error == 0)
2279                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2280         if (error == -1)
2281                 return (0);
2282         return (error);
2283 }
2284
2285 /* Implementation of the getcwd syscall. */
2286 int
2287 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2288 {
2289         char *buf, *retbuf;
2290         size_t buflen;
2291         int error;
2292
2293         buflen = uap->buflen;
2294         if (__predict_false(buflen < 2))
2295                 return (EINVAL);
2296         if (buflen > MAXPATHLEN)
2297                 buflen = MAXPATHLEN;
2298
2299         buf = malloc(buflen, M_TEMP, M_WAITOK);
2300         error = vn_getcwd(td, buf, &retbuf, &buflen);
2301         if (error == 0)
2302                 error = copyout(retbuf, uap->buf, buflen);
2303         free(buf, M_TEMP);
2304         return (error);
2305 }
2306
2307 int
2308 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2309 {
2310         struct pwd *pwd;
2311         int error;
2312
2313         pwd = pwd_hold(td);
2314         error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2315         pwd_drop(pwd);
2316
2317 #ifdef KTRACE
2318         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2319                 ktrnamei(*retbuf);
2320 #endif
2321         return (error);
2322 }
2323
2324 static int
2325 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2326     size_t size, int flags, enum uio_seg pathseg)
2327 {
2328         struct nameidata nd;
2329         char *retbuf, *freebuf;
2330         int error;
2331
2332         if (flags != 0)
2333                 return (EINVAL);
2334         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2335             pathseg, path, fd, &cap_fstat_rights, td);
2336         if ((error = namei(&nd)) != 0)
2337                 return (error);
2338         error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2339         if (error == 0) {
2340                 error = copyout(retbuf, buf, size);
2341                 free(freebuf, M_TEMP);
2342         }
2343         NDFREE(&nd, 0);
2344         return (error);
2345 }
2346
2347 int
2348 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2349 {
2350
2351         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2352             uap->flags, UIO_USERSPACE));
2353 }
2354
2355 /*
2356  * Retrieve the full filesystem path that correspond to a vnode from the name
2357  * cache (if available)
2358  */
2359 int
2360 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2361 {
2362         struct pwd *pwd;
2363         char *buf;
2364         size_t buflen;
2365         int error;
2366
2367         if (__predict_false(vn == NULL))
2368                 return (EINVAL);
2369
2370         buflen = MAXPATHLEN;
2371         buf = malloc(buflen, M_TEMP, M_WAITOK);
2372         pwd = pwd_hold(td);
2373         error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2374         pwd_drop(pwd);
2375
2376         if (!error)
2377                 *freebuf = buf;
2378         else
2379                 free(buf, M_TEMP);
2380         return (error);
2381 }
2382
2383 /*
2384  * This function is similar to vn_fullpath, but it attempts to lookup the
2385  * pathname relative to the global root mount point.  This is required for the
2386  * auditing sub-system, as audited pathnames must be absolute, relative to the
2387  * global root mount point.
2388  */
2389 int
2390 vn_fullpath_global(struct thread *td, struct vnode *vn,
2391     char **retbuf, char **freebuf)
2392 {
2393         char *buf;
2394         size_t buflen;
2395         int error;
2396
2397         if (__predict_false(vn == NULL))
2398                 return (EINVAL);
2399         buflen = MAXPATHLEN;
2400         buf = malloc(buflen, M_TEMP, M_WAITOK);
2401         error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2402         if (!error)
2403                 *freebuf = buf;
2404         else
2405                 free(buf, M_TEMP);
2406         return (error);
2407 }
2408
2409 int
2410 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2411 {
2412         struct vnode *dvp;
2413         struct namecache *ncp;
2414         struct mtx *vlp;
2415         int error;
2416
2417         vlp = VP2VNODELOCK(*vp);
2418         mtx_lock(vlp);
2419         TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2420                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2421                         break;
2422         }
2423         if (ncp != NULL) {
2424                 if (*buflen < ncp->nc_nlen) {
2425                         mtx_unlock(vlp);
2426                         vrele(*vp);
2427                         counter_u64_add(numfullpathfail4, 1);
2428                         error = ENOMEM;
2429                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2430                             vp, NULL);
2431                         return (error);
2432                 }
2433                 *buflen -= ncp->nc_nlen;
2434                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2435                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2436                     ncp->nc_name, vp);
2437                 dvp = *vp;
2438                 *vp = ncp->nc_dvp;
2439                 vref(*vp);
2440                 mtx_unlock(vlp);
2441                 vrele(dvp);
2442                 return (0);
2443         }
2444         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2445
2446         mtx_unlock(vlp);
2447         vn_lock(*vp, LK_SHARED | LK_RETRY);
2448         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2449         vput(*vp);
2450         if (error) {
2451                 counter_u64_add(numfullpathfail2, 1);
2452                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2453                 return (error);
2454         }
2455
2456         *vp = dvp;
2457         if (VN_IS_DOOMED(dvp)) {
2458                 /* forced unmount */
2459                 vrele(dvp);
2460                 error = ENOENT;
2461                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2462                 return (error);
2463         }
2464         /*
2465          * *vp has its use count incremented still.
2466          */
2467
2468         return (0);
2469 }
2470
2471 /*
2472  * Resolve a directory to a pathname.
2473  *
2474  * The name of the directory can always be found in the namecache or fetched
2475  * from the filesystem. There is also guaranteed to be only one parent, meaning
2476  * we can just follow vnodes up until we find the root.
2477  *
2478  * The vnode must be referenced.
2479  */
2480 static int
2481 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2482     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2483 {
2484 #ifdef KDTRACE_HOOKS
2485         struct vnode *startvp = vp;
2486 #endif
2487         struct vnode *vp1;
2488         size_t buflen;
2489         int error;
2490
2491         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2492         VNPASS(vp->v_usecount > 0, vp);
2493
2494         buflen = *len;
2495
2496         if (!slash_prefixed) {
2497                 MPASS(*len >= 2);
2498                 buflen--;
2499                 buf[buflen] = '\0';
2500         }
2501
2502         error = 0;
2503
2504         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2505         counter_u64_add(numfullpathcalls, 1);
2506         while (vp != rdir && vp != rootvnode) {
2507                 /*
2508                  * The vp vnode must be already fully constructed,
2509                  * since it is either found in namecache or obtained
2510                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2511                  * without obtaining the vnode lock.
2512                  */
2513                 if ((vp->v_vflag & VV_ROOT) != 0) {
2514                         vn_lock(vp, LK_RETRY | LK_SHARED);
2515
2516                         /*
2517                          * With the vnode locked, check for races with
2518                          * unmount, forced or not.  Note that we
2519                          * already verified that vp is not equal to
2520                          * the root vnode, which means that
2521                          * mnt_vnodecovered can be NULL only for the
2522                          * case of unmount.
2523                          */
2524                         if (VN_IS_DOOMED(vp) ||
2525                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2526                             vp1->v_mountedhere != vp->v_mount) {
2527                                 vput(vp);
2528                                 error = ENOENT;
2529                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2530                                     error, vp, NULL);
2531                                 break;
2532                         }
2533
2534                         vref(vp1);
2535                         vput(vp);
2536                         vp = vp1;
2537                         continue;
2538                 }
2539                 if (vp->v_type != VDIR) {
2540                         vrele(vp);
2541                         counter_u64_add(numfullpathfail1, 1);
2542                         error = ENOTDIR;
2543                         SDT_PROBE3(vfs, namecache, fullpath, return,
2544                             error, vp, NULL);
2545                         break;
2546                 }
2547                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2548                 if (error)
2549                         break;
2550                 if (buflen == 0) {
2551                         vrele(vp);
2552                         error = ENOMEM;
2553                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2554                             startvp, NULL);
2555                         break;
2556                 }
2557                 buf[--buflen] = '/';
2558                 slash_prefixed = true;
2559         }
2560         if (error)
2561                 return (error);
2562         if (!slash_prefixed) {
2563                 if (buflen == 0) {
2564                         vrele(vp);
2565                         counter_u64_add(numfullpathfail4, 1);
2566                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2567                             startvp, NULL);
2568                         return (ENOMEM);
2569                 }
2570                 buf[--buflen] = '/';
2571         }
2572         counter_u64_add(numfullpathfound, 1);
2573         vrele(vp);
2574
2575         *retbuf = buf + buflen;
2576         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2577         *len -= buflen;
2578         *len += addend;
2579         return (0);
2580 }
2581
2582 /*
2583  * Resolve an arbitrary vnode to a pathname.
2584  *
2585  * Note 2 caveats:
2586  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2587  *   resolve to a different path than the one used to find it
2588  * - namecache is not mandatory, meaning names are not guaranteed to be added
2589  *   (in which case resolving fails)
2590  */
2591 static int
2592 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2593     char *buf, char **retbuf, size_t *buflen)
2594 {
2595         size_t orig_buflen;
2596         bool slash_prefixed;
2597         int error;
2598
2599         if (*buflen < 2)
2600                 return (EINVAL);
2601
2602         orig_buflen = *buflen;
2603
2604         vref(vp);
2605         slash_prefixed = false;
2606         if (vp->v_type != VDIR) {
2607                 *buflen -= 1;
2608                 buf[*buflen] = '\0';
2609                 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2610                 if (error)
2611                         return (error);
2612                 if (*buflen == 0) {
2613                         vrele(vp);
2614                         return (ENOMEM);
2615                 }
2616                 *buflen -= 1;
2617                 buf[*buflen] = '/';
2618                 slash_prefixed = true;
2619         }
2620
2621         return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2622             orig_buflen - *buflen));
2623 }
2624
2625 /*
2626  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2627  *
2628  * Since the namecache does not track handlings, the caller is expected to first
2629  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2630  *
2631  * Then we have 2 cases:
2632  * - if the found vnode is a directory, the path can be constructed just by
2633  *   fullowing names up the chain
2634  * - otherwise we populate the buffer with the saved name and start resolving
2635  *   from the parent
2636  */
2637 static int
2638 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2639     char **freebuf, size_t *buflen)
2640 {
2641         char *buf, *tmpbuf;
2642         struct pwd *pwd;
2643         struct componentname *cnp;
2644         struct vnode *vp;
2645         size_t addend;
2646         int error;
2647         bool slash_prefixed;
2648
2649         if (*buflen < 2)
2650                 return (EINVAL);
2651         if (*buflen > MAXPATHLEN)
2652                 *buflen = MAXPATHLEN;
2653
2654         slash_prefixed = false;
2655
2656         buf = malloc(*buflen, M_TEMP, M_WAITOK);
2657         pwd = pwd_hold(td);
2658
2659         addend = 0;
2660         vp = ndp->ni_vp;
2661         if (vp->v_type != VDIR) {
2662                 cnp = &ndp->ni_cnd;
2663                 addend = cnp->cn_namelen + 2;
2664                 if (*buflen < addend) {
2665                         error = ENOMEM;
2666                         goto out_bad;
2667                 }
2668                 *buflen -= addend;
2669                 tmpbuf = buf + *buflen;
2670                 tmpbuf[0] = '/';
2671                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2672                 tmpbuf[addend - 1] = '\0';
2673                 slash_prefixed = true;
2674                 vp = ndp->ni_dvp;
2675         }
2676
2677         vref(vp);
2678         error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2679             slash_prefixed, addend);
2680         if (error != 0)
2681                 goto out_bad;
2682
2683         pwd_drop(pwd);
2684         *freebuf = buf;
2685
2686         return (0);
2687 out_bad:
2688         pwd_drop(pwd);
2689         free(buf, M_TEMP);
2690         return (error);
2691 }
2692
2693 struct vnode *
2694 vn_dir_dd_ino(struct vnode *vp)
2695 {
2696         struct namecache *ncp;
2697         struct vnode *ddvp;
2698         struct mtx *vlp;
2699         enum vgetstate vs;
2700
2701         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2702         vlp = VP2VNODELOCK(vp);
2703         mtx_lock(vlp);
2704         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2705                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2706                         continue;
2707                 ddvp = ncp->nc_dvp;
2708                 vs = vget_prep(ddvp);
2709                 mtx_unlock(vlp);
2710                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2711                         return (NULL);
2712                 return (ddvp);
2713         }
2714         mtx_unlock(vlp);
2715         return (NULL);
2716 }
2717
2718 int
2719 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2720 {
2721         struct namecache *ncp;
2722         struct mtx *vlp;
2723         int l;
2724
2725         vlp = VP2VNODELOCK(vp);
2726         mtx_lock(vlp);
2727         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2728                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2729                         break;
2730         if (ncp == NULL) {
2731                 mtx_unlock(vlp);
2732                 return (ENOENT);
2733         }
2734         l = min(ncp->nc_nlen, buflen - 1);
2735         memcpy(buf, ncp->nc_name, l);
2736         mtx_unlock(vlp);
2737         buf[l] = '\0';
2738         return (0);
2739 }
2740
2741 /*
2742  * This function updates path string to vnode's full global path
2743  * and checks the size of the new path string against the pathlen argument.
2744  *
2745  * Requires a locked, referenced vnode.
2746  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2747  *
2748  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2749  * because it falls back to the ".." lookup if the namecache lookup fails.
2750  */
2751 int
2752 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2753     u_int pathlen)
2754 {
2755         struct nameidata nd;
2756         struct vnode *vp1;
2757         char *rpath, *fbuf;
2758         int error;
2759
2760         ASSERT_VOP_ELOCKED(vp, __func__);
2761
2762         /* Construct global filesystem path from vp. */
2763         VOP_UNLOCK(vp);
2764         error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2765
2766         if (error != 0) {
2767                 vrele(vp);
2768                 return (error);
2769         }
2770
2771         if (strlen(rpath) >= pathlen) {
2772                 vrele(vp);
2773                 error = ENAMETOOLONG;
2774                 goto out;
2775         }
2776
2777         /*
2778          * Re-lookup the vnode by path to detect a possible rename.
2779          * As a side effect, the vnode is relocked.
2780          * If vnode was renamed, return ENOENT.
2781          */
2782         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2783             UIO_SYSSPACE, path, td);
2784         error = namei(&nd);
2785         if (error != 0) {
2786                 vrele(vp);
2787                 goto out;
2788         }
2789         NDFREE(&nd, NDF_ONLY_PNBUF);
2790         vp1 = nd.ni_vp;
2791         vrele(vp);
2792         if (vp1 == vp)
2793                 strcpy(path, rpath);
2794         else {
2795                 vput(vp1);
2796                 error = ENOENT;
2797         }
2798
2799 out:
2800         free(fbuf, M_TEMP);
2801         return (error);
2802 }
2803
2804 #ifdef DDB
2805 static void
2806 db_print_vpath(struct vnode *vp)
2807 {
2808
2809         while (vp != NULL) {
2810                 db_printf("%p: ", vp);
2811                 if (vp == rootvnode) {
2812                         db_printf("/");
2813                         vp = NULL;
2814                 } else {
2815                         if (vp->v_vflag & VV_ROOT) {
2816                                 db_printf("<mount point>");
2817                                 vp = vp->v_mount->mnt_vnodecovered;
2818                         } else {
2819                                 struct namecache *ncp;
2820                                 char *ncn;
2821                                 int i;
2822
2823                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2824                                 if (ncp != NULL) {
2825                                         ncn = ncp->nc_name;
2826                                         for (i = 0; i < ncp->nc_nlen; i++)
2827                                                 db_printf("%c", *ncn++);
2828                                         vp = ncp->nc_dvp;
2829                                 } else {
2830                                         vp = NULL;
2831                                 }
2832                         }
2833                 }
2834                 db_printf("\n");
2835         }
2836
2837         return;
2838 }
2839
2840 DB_SHOW_COMMAND(vpath, db_show_vpath)
2841 {
2842         struct vnode *vp;
2843
2844         if (!have_addr) {
2845                 db_printf("usage: show vpath <struct vnode *>\n");
2846                 return;
2847         }
2848
2849         vp = (struct vnode *)addr;
2850         db_print_vpath(vp);
2851 }
2852
2853 #endif
2854
2855 extern uma_zone_t namei_zone;
2856
2857 static bool __read_frequently cache_fast_lookup = true;
2858 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2859     &cache_fast_lookup, 0, "");
2860
2861 #define CACHE_FPL_FAILED        -2020
2862
2863 static void
2864 cache_fpl_cleanup_cnp(struct componentname *cnp)
2865 {
2866
2867         uma_zfree(namei_zone, cnp->cn_pnbuf);
2868 #ifdef DIAGNOSTIC
2869         cnp->cn_pnbuf = NULL;
2870         cnp->cn_nameptr = NULL;
2871 #endif
2872 }
2873
2874 static void
2875 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
2876 {
2877         struct componentname *cnp;
2878
2879         cnp = &ndp->ni_cnd;
2880         while (*(cnp->cn_nameptr) == '/') {
2881                 cnp->cn_nameptr++;
2882                 ndp->ni_pathlen--;
2883         }
2884
2885         *dpp = ndp->ni_rootdir;
2886 }
2887
2888 /*
2889  * Components of nameidata (or objects it can point to) which may
2890  * need restoring in case fast path lookup fails.
2891  */
2892 struct nameidata_saved {
2893         int cn_flags;
2894         long cn_namelen;
2895         char *cn_nameptr;
2896         size_t ni_pathlen;
2897 };
2898
2899 struct cache_fpl {
2900         int line;
2901         enum cache_fpl_status status;
2902         bool in_smr;
2903         struct nameidata *ndp;
2904         struct nameidata_saved snd;
2905         struct componentname *cnp;
2906         struct vnode *dvp;
2907         seqc_t dvp_seqc;
2908         struct vnode *tvp;
2909         seqc_t tvp_seqc;
2910         struct pwd *pwd;
2911 };
2912
2913 static void
2914 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
2915 {
2916
2917         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
2918         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
2919         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
2920         snd->ni_pathlen = fpl->ndp->ni_pathlen;
2921 }
2922
2923 static void
2924 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
2925 {
2926
2927         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
2928         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
2929         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
2930         fpl->ndp->ni_pathlen = snd->ni_pathlen;
2931 }
2932
2933 #ifdef INVARIANTS
2934 #define cache_fpl_smr_assert_entered(fpl) ({                    \
2935         struct cache_fpl *_fpl = (fpl);                         \
2936         MPASS(_fpl->in_smr == true);                            \
2937         VFS_SMR_ASSERT_ENTERED();                               \
2938 })
2939 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
2940         struct cache_fpl *_fpl = (fpl);                         \
2941         MPASS(_fpl->in_smr == false);                           \
2942         VFS_SMR_ASSERT_NOT_ENTERED();                           \
2943 })
2944 #else
2945 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
2946 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
2947 #endif
2948
2949 #define cache_fpl_smr_enter(fpl) ({                             \
2950         struct cache_fpl *_fpl = (fpl);                         \
2951         MPASS(_fpl->in_smr == false);                           \
2952         vfs_smr_enter();                                        \
2953         _fpl->in_smr = true;                                    \
2954 })
2955
2956 #define cache_fpl_smr_exit(fpl) ({                              \
2957         struct cache_fpl *_fpl = (fpl);                         \
2958         MPASS(_fpl->in_smr == true);                            \
2959         vfs_smr_exit();                                         \
2960         _fpl->in_smr = false;                                   \
2961 })
2962
2963 static int
2964 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
2965 {
2966
2967         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
2968                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
2969                     ("%s: converting to abort from %d at %d, set at %d\n",
2970                     __func__, fpl->status, line, fpl->line));
2971         }
2972         fpl->status = CACHE_FPL_STATUS_ABORTED;
2973         fpl->line = line;
2974         return (CACHE_FPL_FAILED);
2975 }
2976
2977 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
2978
2979 static int
2980 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
2981 {
2982
2983         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2984             ("%s: setting to partial at %d, but already set to %d at %d\n",
2985             __func__, line, fpl->status, fpl->line));
2986         cache_fpl_smr_assert_entered(fpl);
2987         fpl->status = CACHE_FPL_STATUS_PARTIAL;
2988         fpl->line = line;
2989         return (CACHE_FPL_FAILED);
2990 }
2991
2992 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
2993
2994 static int
2995 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
2996 {
2997
2998         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2999             ("%s: setting to handled at %d, but already set to %d at %d\n",
3000             __func__, line, fpl->status, fpl->line));
3001         cache_fpl_smr_assert_not_entered(fpl);
3002         MPASS(error != CACHE_FPL_FAILED);
3003         fpl->status = CACHE_FPL_STATUS_HANDLED;
3004         fpl->line = line;
3005         return (error);
3006 }
3007
3008 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3009
3010 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3011         (LOCKLEAF | LOCKPARENT | WANTPARENT | FOLLOW | LOCKSHARED | SAVENAME | \
3012          ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2)
3013
3014 static bool
3015 cache_can_fplookup(struct cache_fpl *fpl)
3016 {
3017         struct nameidata *ndp;
3018         struct componentname *cnp;
3019         struct thread *td;
3020
3021         ndp = fpl->ndp;
3022         cnp = fpl->cnp;
3023         td = cnp->cn_thread;
3024
3025         if (!cache_fast_lookup) {
3026                 cache_fpl_aborted(fpl);
3027                 return (false);
3028         }
3029 #ifdef MAC
3030         if (mac_vnode_check_lookup_enabled()) {
3031                 cache_fpl_aborted(fpl);
3032                 return (false);
3033         }
3034 #endif
3035         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3036                 cache_fpl_aborted(fpl);
3037                 return (false);
3038         }
3039         if (cnp->cn_nameiop != LOOKUP) {
3040                 cache_fpl_aborted(fpl);
3041                 return (false);
3042         }
3043         if (ndp->ni_dirfd != AT_FDCWD) {
3044                 cache_fpl_aborted(fpl);
3045                 return (false);
3046         }
3047         if (IN_CAPABILITY_MODE(td)) {
3048                 cache_fpl_aborted(fpl);
3049                 return (false);
3050         }
3051         if (AUDITING_TD(td)) {
3052                 cache_fpl_aborted(fpl);
3053                 return (false);
3054         }
3055         if (ndp->ni_startdir != NULL) {
3056                 cache_fpl_aborted(fpl);
3057                 return (false);
3058         }
3059         return (true);
3060 }
3061
3062 static bool
3063 cache_fplookup_vnode_supported(struct vnode *vp)
3064 {
3065
3066         return (vp->v_type != VLNK);
3067 }
3068
3069 /*
3070  * Move a negative entry to the hot list.
3071  *
3072  * We have to take locks, but they may be contended and in the worst
3073  * case we may need to go off CPU. We don't want to spin within the
3074  * smr section and we can't block with it. Instead we are going to
3075  * look up the entry again.
3076  */
3077 static int __noinline
3078 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3079     uint32_t hash)
3080 {
3081         struct componentname *cnp;
3082         struct namecache *ncp;
3083         struct neglist *neglist;
3084         struct negstate *negstate;
3085         struct vnode *dvp;
3086         u_char nc_flag;
3087
3088         cnp = fpl->cnp;
3089         dvp = fpl->dvp;
3090
3091         if (!vhold_smr(dvp))
3092                 return (cache_fpl_aborted(fpl));
3093
3094         neglist = NCP2NEGLIST(oncp);
3095         cache_fpl_smr_exit(fpl);
3096
3097         mtx_lock(&ncneg_hot.nl_lock);
3098         mtx_lock(&neglist->nl_lock);
3099         /*
3100          * For hash iteration.
3101          */
3102         cache_fpl_smr_enter(fpl);
3103
3104         /*
3105          * Avoid all surprises by only succeeding if we got the same entry and
3106          * bailing completely otherwise.
3107          *
3108          * In particular at this point there can be a new ncp which matches the
3109          * search but hashes to a different neglist.
3110          */
3111         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3112                 if (ncp == oncp)
3113                         break;
3114         }
3115
3116         /*
3117          * No match to begin with.
3118          */
3119         if (__predict_false(ncp == NULL)) {
3120                 goto out_abort;
3121         }
3122
3123         /*
3124          * The newly found entry may be something different...
3125          */
3126         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3127             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
3128                 goto out_abort;
3129         }
3130
3131         /*
3132          * ... and not even negative.
3133          */
3134         nc_flag = atomic_load_char(&ncp->nc_flag);
3135         if ((nc_flag & NCF_NEGATIVE) == 0) {
3136                 goto out_abort;
3137         }
3138
3139         if (__predict_false(cache_ncp_invalid(ncp))) {
3140                 goto out_abort;
3141         }
3142
3143         negstate = NCP2NEGSTATE(ncp);
3144         if ((negstate->neg_flag & NEG_HOT) == 0) {
3145                 numhotneg++;
3146                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
3147                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
3148                 negstate->neg_flag |= NEG_HOT;
3149         }
3150
3151         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
3152         counter_u64_add(numneghits, 1);
3153         cache_fpl_smr_exit(fpl);
3154         mtx_unlock(&neglist->nl_lock);
3155         mtx_unlock(&ncneg_hot.nl_lock);
3156         vdrop(dvp);
3157         return (cache_fpl_handled(fpl, ENOENT));
3158 out_abort:
3159         cache_fpl_smr_exit(fpl);
3160         mtx_unlock(&neglist->nl_lock);
3161         mtx_unlock(&ncneg_hot.nl_lock);
3162         vdrop(dvp);
3163         return (cache_fpl_aborted(fpl));
3164 }
3165
3166 /*
3167  * The target vnode is not supported, prepare for the slow path to take over.
3168  */
3169 static int
3170 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3171 {
3172         struct componentname *cnp;
3173         enum vgetstate dvs;
3174         struct vnode *dvp;
3175         struct pwd *pwd;
3176         seqc_t dvp_seqc;
3177
3178         cnp = fpl->cnp;
3179         dvp = fpl->dvp;
3180         dvp_seqc = fpl->dvp_seqc;
3181
3182         dvs = vget_prep_smr(dvp);
3183         if (dvs == VGET_NONE) {
3184                 cache_fpl_smr_exit(fpl);
3185                 return (cache_fpl_aborted(fpl));
3186         }
3187
3188         cache_fpl_smr_exit(fpl);
3189
3190         vget_finish_ref(dvp, dvs);
3191         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3192                 vrele(dvp);
3193                 return (cache_fpl_aborted(fpl));
3194         }
3195
3196         pwd = pwd_hold(curthread);
3197         if (fpl->pwd != pwd) {
3198                 vrele(dvp);
3199                 pwd_drop(pwd);
3200                 return (cache_fpl_aborted(fpl));
3201         }
3202
3203         fpl->ndp->ni_startdir = dvp;
3204         return (0);
3205 }
3206
3207 static int
3208 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3209 {
3210         struct componentname *cnp;
3211         struct vnode *tvp;
3212         seqc_t tvp_seqc;
3213         int error;
3214
3215         cnp = fpl->cnp;
3216         tvp = fpl->tvp;
3217         tvp_seqc = fpl->tvp_seqc;
3218
3219         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3220                 error = vget_finish(tvp, cnp->cn_lkflags, tvs);
3221                 if (error != 0) {
3222                         return (cache_fpl_aborted(fpl));
3223                 }
3224         } else {
3225                 vget_finish_ref(tvp, tvs);
3226         }
3227
3228         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3229                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3230                         vput(tvp);
3231                 else
3232                         vrele(tvp);
3233                 return (cache_fpl_aborted(fpl));
3234         }
3235
3236         return (cache_fpl_handled(fpl, 0));
3237 }
3238
3239 static int __noinline
3240 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3241 {
3242         enum vgetstate dvs, tvs;
3243         struct componentname *cnp;
3244         struct vnode *dvp, *tvp;
3245         seqc_t dvp_seqc, tvp_seqc;
3246         int error;
3247
3248         cnp = fpl->cnp;
3249         dvp = fpl->dvp;
3250         dvp_seqc = fpl->dvp_seqc;
3251         tvp = fpl->tvp;
3252         tvp_seqc = fpl->tvp_seqc;
3253
3254         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3255
3256         /*
3257          * This is less efficient than it can be for simplicity.
3258          */
3259         dvs = vget_prep_smr(dvp);
3260         if (dvs == VGET_NONE) {
3261                 return (cache_fpl_aborted(fpl));
3262         }
3263         tvs = vget_prep_smr(tvp);
3264         if (tvs == VGET_NONE) {
3265                 cache_fpl_smr_exit(fpl);
3266                 vget_abort(dvp, dvs);
3267                 return (cache_fpl_aborted(fpl));
3268         }
3269
3270         cache_fpl_smr_exit(fpl);
3271
3272         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3273                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3274                 if (error != 0) {
3275                         vget_abort(tvp, tvs);
3276                         return (cache_fpl_aborted(fpl));
3277                 }
3278         } else {
3279                 vget_finish_ref(dvp, dvs);
3280         }
3281
3282         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3283                 vget_abort(tvp, tvs);
3284                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3285                         vput(dvp);
3286                 else
3287                         vrele(dvp);
3288                 cache_fpl_aborted(fpl);
3289                 return (error);
3290         }
3291
3292         error = cache_fplookup_final_child(fpl, tvs);
3293         if (error != 0) {
3294                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3295                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3296                         vput(dvp);
3297                 else
3298                         vrele(dvp);
3299                 return (error);
3300         }
3301
3302         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3303         return (0);
3304 }
3305
3306 static int
3307 cache_fplookup_final(struct cache_fpl *fpl)
3308 {
3309         struct componentname *cnp;
3310         enum vgetstate tvs;
3311         struct vnode *dvp, *tvp;
3312         seqc_t dvp_seqc, tvp_seqc;
3313
3314         cnp = fpl->cnp;
3315         dvp = fpl->dvp;
3316         dvp_seqc = fpl->dvp_seqc;
3317         tvp = fpl->tvp;
3318         tvp_seqc = fpl->tvp_seqc;
3319
3320         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3321
3322         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3323                 return (cache_fplookup_final_withparent(fpl));
3324
3325         tvs = vget_prep_smr(tvp);
3326         if (tvs == VGET_NONE) {
3327                 return (cache_fpl_partial(fpl));
3328         }
3329
3330         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3331                 cache_fpl_smr_exit(fpl);
3332                 vget_abort(tvp, tvs);
3333                 return (cache_fpl_aborted(fpl));
3334         }
3335
3336         cache_fpl_smr_exit(fpl);
3337         return (cache_fplookup_final_child(fpl, tvs));
3338 }
3339
3340 static int
3341 cache_fplookup_next(struct cache_fpl *fpl)
3342 {
3343         struct componentname *cnp;
3344         struct namecache *ncp;
3345         struct negstate *negstate;
3346         struct vnode *dvp, *tvp;
3347         u_char nc_flag;
3348         uint32_t hash;
3349         bool neg_hot;
3350
3351         cnp = fpl->cnp;
3352         dvp = fpl->dvp;
3353
3354         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3355                 fpl->tvp = dvp;
3356                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3357                 if (seqc_in_modify(fpl->tvp_seqc)) {
3358                         return (cache_fpl_aborted(fpl));
3359                 }
3360                 return (0);
3361         }
3362
3363         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3364
3365         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3366                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3367                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3368                         break;
3369         }
3370
3371         /*
3372          * If there is no entry we have to punt to the slow path to perform
3373          * actual lookup. Should there be nothing with this name a negative
3374          * entry will be created.
3375          */
3376         if (__predict_false(ncp == NULL)) {
3377                 return (cache_fpl_partial(fpl));
3378         }
3379
3380         tvp = atomic_load_ptr(&ncp->nc_vp);
3381         nc_flag = atomic_load_char(&ncp->nc_flag);
3382         if ((nc_flag & NCF_NEGATIVE) != 0) {
3383                 negstate = NCP2NEGSTATE(ncp);
3384                 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3385                 if (__predict_false(cache_ncp_invalid(ncp))) {
3386                         return (cache_fpl_partial(fpl));
3387                 }
3388                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3389                         return (cache_fpl_partial(fpl));
3390                 }
3391                 if (!neg_hot) {
3392                         return (cache_fplookup_negative_promote(fpl, ncp, hash));
3393                 }
3394                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3395                     ncp->nc_name);
3396                 counter_u64_add(numneghits, 1);
3397                 cache_fpl_smr_exit(fpl);
3398                 return (cache_fpl_handled(fpl, ENOENT));
3399         }
3400
3401         if (__predict_false(cache_ncp_invalid(ncp))) {
3402                 return (cache_fpl_partial(fpl));
3403         }
3404
3405         fpl->tvp = tvp;
3406         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3407         if (seqc_in_modify(fpl->tvp_seqc)) {
3408                 return (cache_fpl_partial(fpl));
3409         }
3410
3411         if (!cache_fplookup_vnode_supported(tvp)) {
3412                 return (cache_fpl_partial(fpl));
3413         }
3414
3415         counter_u64_add(numposhits, 1);
3416         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3417         return (0);
3418 }
3419
3420 static bool
3421 cache_fplookup_mp_supported(struct mount *mp)
3422 {
3423
3424         if (mp == NULL)
3425                 return (false);
3426         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3427                 return (false);
3428         if ((mp->mnt_flag & MNT_UNION) != 0)
3429                 return (false);
3430         return (true);
3431 }
3432
3433 /*
3434  * Walk up the mount stack (if any).
3435  *
3436  * Correctness is provided in the following ways:
3437  * - all vnodes are protected from freeing with SMR
3438  * - struct mount objects are type stable making them always safe to access
3439  * - stability of the particular mount is provided by busying it
3440  * - relationship between the vnode which is mounted on and the mount is
3441  *   verified with the vnode sequence counter after busying
3442  * - association between root vnode of the mount and the mount is protected
3443  *   by busy
3444  *
3445  * From that point on we can read the sequence counter of the root vnode
3446  * and get the next mount on the stack (if any) using the same protection.
3447  *
3448  * By the end of successful walk we are guaranteed the reached state was
3449  * indeed present at least at some point which matches the regular lookup.
3450  */
3451 static int
3452 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3453 {
3454         struct mount *mp, *prev_mp;
3455         struct vnode *vp;
3456         seqc_t vp_seqc;
3457
3458         vp = fpl->tvp;
3459         vp_seqc = fpl->tvp_seqc;
3460         if (vp->v_type != VDIR)
3461                 return (0);
3462
3463         mp = atomic_load_ptr(&vp->v_mountedhere);
3464         if (mp == NULL)
3465                 return (0);
3466
3467         prev_mp = NULL;
3468         for (;;) {
3469                 if (!vfs_op_thread_enter(mp)) {
3470                         if (prev_mp != NULL)
3471                                 vfs_op_thread_exit(prev_mp);
3472                         return (cache_fpl_partial(fpl));
3473                 }
3474                 if (prev_mp != NULL)
3475                         vfs_op_thread_exit(prev_mp);
3476                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3477                         vfs_op_thread_exit(mp);
3478                         return (cache_fpl_partial(fpl));
3479                 }
3480                 if (!cache_fplookup_mp_supported(mp)) {
3481                         vfs_op_thread_exit(mp);
3482                         return (cache_fpl_partial(fpl));
3483                 }
3484                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3485                 if (vp == NULL || VN_IS_DOOMED(vp)) {
3486                         vfs_op_thread_exit(mp);
3487                         return (cache_fpl_partial(fpl));
3488                 }
3489                 vp_seqc = vn_seqc_read_any(vp);
3490                 if (seqc_in_modify(vp_seqc)) {
3491                         vfs_op_thread_exit(mp);
3492                         return (cache_fpl_partial(fpl));
3493                 }
3494                 prev_mp = mp;
3495                 mp = atomic_load_ptr(&vp->v_mountedhere);
3496                 if (mp == NULL)
3497                         break;
3498         }
3499
3500         vfs_op_thread_exit(prev_mp);
3501         fpl->tvp = vp;
3502         fpl->tvp_seqc = vp_seqc;
3503         return (0);
3504 }
3505
3506 /*
3507  * Parse the path.
3508  *
3509  * The code is mostly copy-pasted from regular lookup, see lookup().
3510  * The structure is maintained along with comments for easier maintenance.
3511  * Deduplicating the code will become feasible after fast path lookup
3512  * becomes more feature-complete.
3513  */
3514 static int
3515 cache_fplookup_parse(struct cache_fpl *fpl)
3516 {
3517         struct nameidata *ndp;
3518         struct componentname *cnp;
3519         char *cp;
3520         char *prev_ni_next;             /* saved ndp->ni_next */
3521         size_t prev_ni_pathlen;         /* saved ndp->ni_pathlen */
3522
3523         ndp = fpl->ndp;
3524         cnp = fpl->cnp;
3525
3526         /*
3527          * Search a new directory.
3528          *
3529          * The last component of the filename is left accessible via
3530          * cnp->cn_nameptr for callers that need the name. Callers needing
3531          * the name set the SAVENAME flag. When done, they assume
3532          * responsibility for freeing the pathname buffer.
3533          */
3534         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3535                 continue;
3536         cnp->cn_namelen = cp - cnp->cn_nameptr;
3537         if (cnp->cn_namelen > NAME_MAX) {
3538                 cache_fpl_smr_exit(fpl);
3539                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
3540         }
3541         prev_ni_pathlen = ndp->ni_pathlen;
3542         ndp->ni_pathlen -= cnp->cn_namelen;
3543         KASSERT(ndp->ni_pathlen <= PATH_MAX,
3544             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3545         prev_ni_next = ndp->ni_next;
3546         ndp->ni_next = cp;
3547
3548         /*
3549          * Replace multiple slashes by a single slash and trailing slashes
3550          * by a null.  This must be done before VOP_LOOKUP() because some
3551          * fs's don't know about trailing slashes.  Remember if there were
3552          * trailing slashes to handle symlinks, existing non-directories
3553          * and non-existing files that won't be directories specially later.
3554          */
3555         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3556                 cp++;
3557                 ndp->ni_pathlen--;
3558                 if (*cp == '\0') {
3559                         /*
3560                          * TODO
3561                          * Regular lookup performs the following:
3562                          * *ndp->ni_next = '\0';
3563                          * cnp->cn_flags |= TRAILINGSLASH;
3564                          *
3565                          * Which is problematic since it modifies data read
3566                          * from userspace. Then if fast path lookup was to
3567                          * abort we would have to either restore it or convey
3568                          * the flag. Since this is a corner case just ignore
3569                          * it for simplicity.
3570                          */
3571                         return (cache_fpl_partial(fpl));
3572                 }
3573         }
3574         ndp->ni_next = cp;
3575
3576         cnp->cn_flags |= MAKEENTRY;
3577
3578         if (cnp->cn_namelen == 2 &&
3579             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3580                 cnp->cn_flags |= ISDOTDOT;
3581         else
3582                 cnp->cn_flags &= ~ISDOTDOT;
3583         if (*ndp->ni_next == 0)
3584                 cnp->cn_flags |= ISLASTCN;
3585         else
3586                 cnp->cn_flags &= ~ISLASTCN;
3587
3588         /*
3589          * Check for degenerate name (e.g. / or "")
3590          * which is a way of talking about a directory,
3591          * e.g. like "/." or ".".
3592          *
3593          * TODO
3594          * Another corner case handled by the regular lookup
3595          */
3596         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3597                 return (cache_fpl_partial(fpl));
3598         }
3599         return (0);
3600 }
3601
3602 static void
3603 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3604 {
3605         struct nameidata *ndp;
3606         struct componentname *cnp;
3607
3608         ndp = fpl->ndp;
3609         cnp = fpl->cnp;
3610
3611         cnp->cn_nameptr = ndp->ni_next;
3612         while (*cnp->cn_nameptr == '/') {
3613                 cnp->cn_nameptr++;
3614                 ndp->ni_pathlen--;
3615         }
3616 }
3617
3618 static int
3619 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3620 {
3621         struct nameidata *ndp;
3622         struct componentname *cnp;
3623         struct mount *mp;
3624         int error;
3625
3626         error = CACHE_FPL_FAILED;
3627         ndp = fpl->ndp;
3628         ndp->ni_lcf = 0;
3629         cnp = fpl->cnp;
3630         cnp->cn_lkflags = LK_SHARED;
3631         if ((cnp->cn_flags & LOCKSHARED) == 0)
3632                 cnp->cn_lkflags = LK_EXCLUSIVE;
3633
3634         cache_fpl_checkpoint(fpl, &fpl->snd);
3635
3636         fpl->dvp = dvp;
3637         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3638         if (seqc_in_modify(fpl->dvp_seqc)) {
3639                 cache_fpl_aborted(fpl);
3640                 goto out;
3641         }
3642         mp = atomic_load_ptr(&fpl->dvp->v_mount);
3643         if (!cache_fplookup_mp_supported(mp)) {
3644                 cache_fpl_aborted(fpl);
3645                 goto out;
3646         }
3647
3648         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3649
3650         for (;;) {
3651                 error = cache_fplookup_parse(fpl);
3652                 if (__predict_false(error != 0)) {
3653                         break;
3654                 }
3655
3656                 if (cnp->cn_flags & ISDOTDOT) {
3657                         error = cache_fpl_partial(fpl);
3658                         break;
3659                 }
3660
3661                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3662
3663                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
3664                 if (__predict_false(error != 0)) {
3665                         switch (error) {
3666                         case EAGAIN:
3667                         case EOPNOTSUPP: /* can happen when racing against vgone */
3668                                 cache_fpl_partial(fpl);
3669                                 break;
3670                         default:
3671                                 /*
3672                                  * See the API contract for VOP_FPLOOKUP_VEXEC.
3673                                  */
3674                                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3675                                         error = cache_fpl_aborted(fpl);
3676                                 } else {
3677                                         cache_fpl_smr_exit(fpl);
3678                                         cache_fpl_handled(fpl, error);
3679                                 }
3680                                 break;
3681                         }
3682                         break;
3683                 }
3684
3685                 error = cache_fplookup_next(fpl);
3686                 if (__predict_false(error != 0)) {
3687                         break;
3688                 }
3689
3690                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3691
3692                 error = cache_fplookup_climb_mount(fpl);
3693                 if (__predict_false(error != 0)) {
3694                         break;
3695                 }
3696
3697                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3698
3699                 if (cnp->cn_flags & ISLASTCN) {
3700                         error = cache_fplookup_final(fpl);
3701                         break;
3702                 }
3703
3704                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3705                         error = cache_fpl_aborted(fpl);
3706                         break;
3707                 }
3708
3709                 fpl->dvp = fpl->tvp;
3710                 fpl->dvp_seqc = fpl->tvp_seqc;
3711
3712                 cache_fplookup_parse_advance(fpl);
3713                 cache_fpl_checkpoint(fpl, &fpl->snd);
3714         }
3715 out:
3716         switch (fpl->status) {
3717         case CACHE_FPL_STATUS_UNSET:
3718                 __assert_unreachable();
3719                 break;
3720         case CACHE_FPL_STATUS_PARTIAL:
3721                 cache_fpl_smr_assert_entered(fpl);
3722                 return (cache_fplookup_partial_setup(fpl));
3723         case CACHE_FPL_STATUS_ABORTED:
3724                 if (fpl->in_smr)
3725                         cache_fpl_smr_exit(fpl);
3726                 return (CACHE_FPL_FAILED);
3727         case CACHE_FPL_STATUS_HANDLED:
3728                 cache_fpl_smr_assert_not_entered(fpl);
3729                 if (__predict_false(error != 0)) {
3730                         ndp->ni_dvp = NULL;
3731                         ndp->ni_vp = NULL;
3732                         cache_fpl_cleanup_cnp(cnp);
3733                         return (error);
3734                 }
3735                 ndp->ni_dvp = fpl->dvp;
3736                 ndp->ni_vp = fpl->tvp;
3737                 if (cnp->cn_flags & SAVENAME)
3738                         cnp->cn_flags |= HASBUF;
3739                 else
3740                         cache_fpl_cleanup_cnp(cnp);
3741                 return (error);
3742         }
3743 }
3744
3745 /*
3746  * Fast path lookup protected with SMR and sequence counters.
3747  *
3748  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
3749  *
3750  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
3751  * outlined below.
3752  *
3753  * Traditional vnode lookup conceptually looks like this:
3754  *
3755  * vn_lock(current);
3756  * for (;;) {
3757  *      next = find();
3758  *      vn_lock(next);
3759  *      vn_unlock(current);
3760  *      current = next;
3761  *      if (last)
3762  *          break;
3763  * }
3764  * return (current);
3765  *
3766  * Each jump to the next vnode is safe memory-wise and atomic with respect to
3767  * any modifications thanks to holding respective locks.
3768  *
3769  * The same guarantee can be provided with a combination of safe memory
3770  * reclamation and sequence counters instead. If all operations which affect
3771  * the relationship between the current vnode and the one we are looking for
3772  * also modify the counter, we can verify whether all the conditions held as
3773  * we made the jump. This includes things like permissions, mount points etc.
3774  * Counter modification is provided by enclosing relevant places in
3775  * vn_seqc_write_begin()/end() calls.
3776  *
3777  * Thus this translates to:
3778  *
3779  * vfs_smr_enter();
3780  * dvp_seqc = seqc_read_any(dvp);
3781  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
3782  *     abort();
3783  * for (;;) {
3784  *      tvp = find();
3785  *      tvp_seqc = seqc_read_any(tvp);
3786  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
3787  *          abort();
3788  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
3789  *          abort();
3790  *      dvp = tvp; // we know nothing of importance has changed
3791  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
3792  *      if (last)
3793  *          break;
3794  * }
3795  * vget(); // secure the vnode
3796  * if (!seqc_consistent(tvp, tvp_seqc) // final check
3797  *          abort();
3798  * // at this point we know nothing has changed for any parent<->child pair
3799  * // as they were crossed during the lookup, meaning we matched the guarantee
3800  * // of the locked variant
3801  * return (tvp);
3802  *
3803  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
3804  * - they are called while within vfs_smr protection which they must never exit
3805  * - EAGAIN can be returned to denote checking could not be performed, it is
3806  *   always valid to return it
3807  * - if the sequence counter has not changed the result must be valid
3808  * - if the sequence counter has changed both false positives and false negatives
3809  *   are permitted (since the result will be rejected later)
3810  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
3811  *
3812  * Caveats to watch out for:
3813  * - vnodes are passed unlocked and unreferenced with nothing stopping
3814  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
3815  *   to use atomic_load_ptr to fetch it.
3816  * - the aforementioned object can also get freed, meaning absent other means it
3817  *   should be protected with vfs_smr
3818  * - either safely checking permissions as they are modified or guaranteeing
3819  *   their stability is left to the routine
3820  */
3821 int
3822 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
3823     struct pwd **pwdp)
3824 {
3825         struct cache_fpl fpl;
3826         struct pwd *pwd;
3827         struct vnode *dvp;
3828         struct componentname *cnp;
3829         struct nameidata_saved orig;
3830         int error;
3831
3832         *status = CACHE_FPL_STATUS_UNSET;
3833         bzero(&fpl, sizeof(fpl));
3834         fpl.status = CACHE_FPL_STATUS_UNSET;
3835         fpl.ndp = ndp;
3836         fpl.cnp = &ndp->ni_cnd;
3837         MPASS(curthread == fpl.cnp->cn_thread);
3838
3839         if (!cache_can_fplookup(&fpl)) {
3840                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3841                 *status = fpl.status;
3842                 return (EOPNOTSUPP);
3843         }
3844
3845         cache_fpl_checkpoint(&fpl, &orig);
3846
3847         cache_fpl_smr_enter(&fpl);
3848         pwd = pwd_get_smr();
3849         fpl.pwd = pwd;
3850         ndp->ni_rootdir = pwd->pwd_rdir;
3851         ndp->ni_topdir = pwd->pwd_jdir;
3852
3853         cnp = fpl.cnp;
3854         cnp->cn_nameptr = cnp->cn_pnbuf;
3855         if (cnp->cn_pnbuf[0] == '/') {
3856                 cache_fpl_handle_root(ndp, &dvp);
3857         } else {
3858                 MPASS(ndp->ni_dirfd == AT_FDCWD);
3859                 dvp = pwd->pwd_cdir;
3860         }
3861
3862         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
3863
3864         error = cache_fplookup_impl(dvp, &fpl);
3865         cache_fpl_smr_assert_not_entered(&fpl);
3866         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3867
3868         *status = fpl.status;
3869         switch (fpl.status) {
3870         case CACHE_FPL_STATUS_UNSET:
3871                 __assert_unreachable();
3872                 break;
3873         case CACHE_FPL_STATUS_HANDLED:
3874                 SDT_PROBE3(vfs, namei, lookup, return, error,
3875                     (error == 0 ? ndp->ni_vp : NULL), true);
3876                 break;
3877         case CACHE_FPL_STATUS_PARTIAL:
3878                 *pwdp = fpl.pwd;
3879                 cache_fpl_restore(&fpl, &fpl.snd);
3880                 break;
3881         case CACHE_FPL_STATUS_ABORTED:
3882                 cache_fpl_restore(&fpl, &orig);
3883                 break;
3884         }
3885         return (error);
3886 }