sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/mount.h>
  55 #include <sys/namei.h>
  56 #include <sys/proc.h>
  57 #include <sys/rwlock.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 SDT_PROVIDER_DECLARE(vfs);
  83 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  84     "struct vnode *");
  85 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  86     "char *");
  87 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  89     "char *", "struct vnode *");
  90 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
  91 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
  92     "struct vnode *", "char *");
  93 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
  94     "struct vnode *");
  95 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
  96     "struct vnode *", "char *");
  97 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
  98     "char *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 100 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 101 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 105     "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
 107     "char *");
 108
 109 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 110 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 111 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 112
 113 /*
 114  * This structure describes the elements in the cache of recent
 115  * names looked up by namei.
 116  */
 117 struct negstate {
 118         u_char neg_flag;
 119 };
 120 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 121     "the state must fit in a union with a pointer without growing it");
 122
 123 struct  namecache {
 124         CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
 125         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 126         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 127         struct  vnode *nc_dvp;          /* vnode of parent of name */
 128         union {
 129                 struct  vnode *nu_vp;   /* vnode the name refers to */
 130                 struct  negstate nu_neg;/* negative entry state */
 131         } n_un;
 132         u_char  nc_flag;                /* flag bits */
 133         u_char  nc_nlen;                /* length of name */
 134         char    nc_name[0];             /* segment name + nul */
 135 };
 136
 137 /*
 138  * struct namecache_ts repeats struct namecache layout up to the
 139  * nc_nlen member.
 140  * struct namecache_ts is used in place of struct namecache when time(s) need
 141  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 142  * both a non-dotdot directory name plus dotdot for the directory's
 143  * parent.
 144  */
 145 struct  namecache_ts {
 146         struct  timespec nc_time;       /* timespec provided by fs */
 147         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 148         int     nc_ticks;               /* ticks value when entry was added */
 149         struct namecache nc_nc;
 150 };
 151
 152 #define nc_vp           n_un.nu_vp
 153 #define nc_neg          n_un.nu_neg
 154
 155 /*
 156  * Flags in namecache.nc_flag
 157  */
 158 #define NCF_WHITE       0x01
 159 #define NCF_ISDOTDOT    0x02
 160 #define NCF_TS          0x04
 161 #define NCF_DTS         0x08
 162 #define NCF_DVDROP      0x10
 163 #define NCF_NEGATIVE    0x20
 164 #define NCF_INVALID     0x40
 165
 166 /*
 167  * Flags in negstate.neg_flag
 168  */
 169 #define NEG_HOT         0x01
 170
 171 /*
 172  * Mark an entry as invalid.
 173  *
 174  * This is called before it starts getting deconstructed.
 175  */
 176 static void
 177 cache_ncp_invalidate(struct namecache *ncp)
 178 {
 179
 180         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 181             ("%s: entry %p already invalid", __func__, ncp));
 182         ncp->nc_flag |= NCF_INVALID;
 183         atomic_thread_fence_rel();
 184 }
 185
 186 /*
 187  * Verify validity of an entry.
 188  *
 189  * All places which elide locks are supposed to call this after they are
 190  * done with reading from an entry.
 191  */
 192 static bool
 193 cache_ncp_invalid(struct namecache *ncp)
 194 {
 195
 196         atomic_thread_fence_acq();
 197         return ((ncp->nc_flag & NCF_INVALID) != 0);
 198 }
 199
 200 /*
 201  * Name caching works as follows:
 202  *
 203  * Names found by directory scans are retained in a cache
 204  * for future reference.  It is managed LRU, so frequently
 205  * used names will hang around.  Cache is indexed by hash value
 206  * obtained from (dvp, name) where dvp refers to the directory
 207  * containing name.
 208  *
 209  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 210  * exist) the vnode pointer will be NULL.
 211  *
 212  * Upon reaching the last segment of a path, if the reference
 213  * is for DELETE, or NOCACHE is set (rewrite), and the
 214  * name is located in the cache, it will be dropped.
 215  *
 216  * These locks are used (in the order in which they can be taken):
 217  * NAME         TYPE    ROLE
 218  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 219  * bucketlock   rwlock  for access to given set of hash buckets
 220  * neglist      mtx     negative entry LRU management
 221  *
 222  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
 223  * shrinking the LRU list.
 224  *
 225  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 226  * order is lower address first. Both are recursive.
 227  *
 228  * "." lookups are lockless.
 229  *
 230  * ".." and vnode -> name lookups require vnodelock.
 231  *
 232  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 233  *
 234  * Insertions and removals of entries require involved vnodes and bucketlocks
 235  * to be write-locked to prevent other threads from seeing the entry.
 236  *
 237  * Some lookups result in removal of the found entry (e.g. getting rid of a
 238  * negative entry with the intent to create a positive one), which poses a
 239  * problem when multiple threads reach the state. Similarly, two different
 240  * threads can purge two different vnodes and try to remove the same name.
 241  *
 242  * If the already held vnode lock is lower than the second required lock, we
 243  * can just take the other lock. However, in the opposite case, this could
 244  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 245  * the first node, locking everything in order and revalidating the state.
 246  */
 247
 248 VFS_SMR_DECLARE;
 249
 250 /*
 251  * Structures associated with name caching.
 252  */
 253 #define NCHHASH(hash) \
 254         (&nchashtbl[(hash) & nchash])
 255 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 256 static u_long __read_mostly     nchash;                 /* size of hash table */
 257 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 258     "Size of namecache hash table");
 259 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 260 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 261     "Ratio of negative namecache entries");
 262 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 263 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 264 u_int ncsizefactor = 2;
 265 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 266     "Size factor for namecache");
 267 static u_int __read_mostly      ncpurgeminvnodes;
 268 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
 269     "Number of vnodes below which purgevfs ignores the request");
 270 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 271
 272 struct nchstats nchstats;               /* cache effectiveness statistics */
 273
 274 static struct mtx __exclusive_cache_line        ncneg_shrink_lock;
 275
 276 struct neglist {
 277         struct mtx              nl_lock;
 278         TAILQ_HEAD(, namecache) nl_list;
 279 } __aligned(CACHE_LINE_SIZE);
 280
 281 static struct neglist __read_mostly     *neglists;
 282 static struct neglist ncneg_hot;
 283 static u_long numhotneg;
 284
 285 #define numneglists (ncneghash + 1)
 286 static u_int __read_mostly      ncneghash;
 287 static inline struct neglist *
 288 NCP2NEGLIST(struct namecache *ncp)
 289 {
 290
 291         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 292 }
 293
 294 static inline struct negstate *
 295 NCP2NEGSTATE(struct namecache *ncp)
 296 {
 297
 298         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 299         return (&ncp->nc_neg);
 300 }
 301
 302 #define numbucketlocks (ncbuckethash + 1)
 303 static u_int __read_mostly  ncbuckethash;
 304 static struct rwlock_padalign __read_mostly  *bucketlocks;
 305 #define HASH2BUCKETLOCK(hash) \
 306         ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
 307
 308 #define numvnodelocks (ncvnodehash + 1)
 309 static u_int __read_mostly  ncvnodehash;
 310 static struct mtx __read_mostly *vnodelocks;
 311 static inline struct mtx *
 312 VP2VNODELOCK(struct vnode *vp)
 313 {
 314
 315         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 316 }
 317
 318 /*
 319  * UMA zones for the VFS cache.
 320  *
 321  * The small cache is used for entries with short names, which are the
 322  * most common.  The large cache is used for entries which are too big to
 323  * fit in the small cache.
 324  */
 325 static uma_zone_t __read_mostly cache_zone_small;
 326 static uma_zone_t __read_mostly cache_zone_small_ts;
 327 static uma_zone_t __read_mostly cache_zone_large;
 328 static uma_zone_t __read_mostly cache_zone_large_ts;
 329
 330 #define CACHE_PATH_CUTOFF       35
 331
 332 static struct namecache *
 333 cache_alloc(int len, int ts)
 334 {
 335         struct namecache_ts *ncp_ts;
 336         struct namecache *ncp;
 337
 338         if (__predict_false(ts)) {
 339                 if (len <= CACHE_PATH_CUTOFF)
 340                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 341                 else
 342                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 343                 ncp = &ncp_ts->nc_nc;
 344         } else {
 345                 if (len <= CACHE_PATH_CUTOFF)
 346                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 347                 else
 348                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 349         }
 350         return (ncp);
 351 }
 352
 353 static void
 354 cache_free(struct namecache *ncp)
 355 {
 356         struct namecache_ts *ncp_ts;
 357
 358         if (ncp == NULL)
 359                 return;
 360         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 361                 vdrop(ncp->nc_dvp);
 362         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 363                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 364                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 365                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 366                 else
 367                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 368         } else {
 369                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 370                         uma_zfree_smr(cache_zone_small, ncp);
 371                 else
 372                         uma_zfree_smr(cache_zone_large, ncp);
 373         }
 374 }
 375
 376 static void
 377 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 378 {
 379         struct namecache_ts *ncp_ts;
 380
 381         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 382             (tsp == NULL && ticksp == NULL),
 383             ("No NCF_TS"));
 384
 385         if (tsp == NULL && ticksp == NULL)
 386                 return;
 387
 388         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 389         if (tsp != NULL)
 390                 *tsp = ncp_ts->nc_time;
 391         if (ticksp != NULL)
 392                 *ticksp = ncp_ts->nc_ticks;
 393 }
 394
 395 #ifdef DEBUG_CACHE
 396 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 397 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 398     "VFS namecache enabled");
 399 #endif
 400
 401 /* Export size information to userland */
 402 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 403     sizeof(struct namecache), "sizeof(struct namecache)");
 404
 405 /*
 406  * The new name cache statistics
 407  */
 408 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 409     "Name cache statistics");
 410 #define STATNODE_ULONG(name, descr)                                     \
 411         SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 412 #define STATNODE_COUNTER(name, descr)                                   \
 413         static COUNTER_U64_DEFINE_EARLY(name);                          \
 414         SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
 415             descr);
 416 STATNODE_ULONG(numneg, "Number of negative cache entries");
 417 STATNODE_ULONG(numcache, "Number of cache entries");
 418 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
 419 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
 420 STATNODE_COUNTER(dothits, "Number of '.' hits");
 421 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 422 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
 423 STATNODE_COUNTER(nummiss, "Number of cache misses");
 424 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 425 STATNODE_COUNTER(numposzaps,
 426     "Number of cache hits (positive) we do not want to cache");
 427 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 428 STATNODE_COUNTER(numnegzaps,
 429     "Number of cache hits (negative) we do not want to cache");
 430 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 431 /* These count for vn_getcwd(), too. */
 432 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 433 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 434 STATNODE_COUNTER(numfullpathfail2,
 435     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 436 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 437 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 438 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
 439     "Number of successful removals after relocking");
 440 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
 441     "Number of times zap_and_exit failed to lock");
 442 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
 443     "Number of times zap_and_exit failed to lock");
 444 static long cache_lock_vnodes_cel_3_failures;
 445 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
 446     "Number of times 3-way vnode locking failed");
 447 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
 448 STATNODE_COUNTER(numneg_evicted,
 449     "Number of negative entries evicted when adding a new entry");
 450 STATNODE_COUNTER(shrinking_skipped,
 451     "Number of times shrinking was already in progress");
 452
 453 static void cache_zap_locked(struct namecache *ncp);
 454 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
 455     char **freebuf, size_t *buflen);
 456 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
 457     char *buf, char **retbuf, size_t *buflen);
 458 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
 459     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 460
 461 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 462
 463 static int cache_yield;
 464 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
 465     "Number of times cache called yield");
 466
 467 static void __noinline
 468 cache_maybe_yield(void)
 469 {
 470
 471         if (should_yield()) {
 472                 cache_yield++;
 473                 kern_yield(PRI_USER);
 474         }
 475 }
 476
 477 static inline void
 478 cache_assert_vlp_locked(struct mtx *vlp)
 479 {
 480
 481         if (vlp != NULL)
 482                 mtx_assert(vlp, MA_OWNED);
 483 }
 484
 485 static inline void
 486 cache_assert_vnode_locked(struct vnode *vp)
 487 {
 488         struct mtx *vlp;
 489
 490         vlp = VP2VNODELOCK(vp);
 491         cache_assert_vlp_locked(vlp);
 492 }
 493
 494 static uint32_t
 495 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 496 {
 497         uint32_t hash;
 498
 499         hash = fnv_32_buf(name, len, FNV1_32_INIT);
 500         hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 501         return (hash);
 502 }
 503
 504 static inline struct rwlock *
 505 NCP2BUCKETLOCK(struct namecache *ncp)
 506 {
 507         uint32_t hash;
 508
 509         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 510         return (HASH2BUCKETLOCK(hash));
 511 }
 512
 513 #ifdef INVARIANTS
 514 static void
 515 cache_assert_bucket_locked(struct namecache *ncp, int mode)
 516 {
 517         struct rwlock *blp;
 518
 519         blp = NCP2BUCKETLOCK(ncp);
 520         rw_assert(blp, mode);
 521 }
 522 #else
 523 #define cache_assert_bucket_locked(x, y) do { } while (0)
 524 #endif
 525
 526 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 527 static void
 528 _cache_sort_vnodes(void **p1, void **p2)
 529 {
 530         void *tmp;
 531
 532         MPASS(*p1 != NULL || *p2 != NULL);
 533
 534         if (*p1 > *p2) {
 535                 tmp = *p2;
 536                 *p2 = *p1;
 537                 *p1 = tmp;
 538         }
 539 }
 540
 541 static void
 542 cache_lock_all_buckets(void)
 543 {
 544         u_int i;
 545
 546         for (i = 0; i < numbucketlocks; i++)
 547                 rw_wlock(&bucketlocks[i]);
 548 }
 549
 550 static void
 551 cache_unlock_all_buckets(void)
 552 {
 553         u_int i;
 554
 555         for (i = 0; i < numbucketlocks; i++)
 556                 rw_wunlock(&bucketlocks[i]);
 557 }
 558
 559 static void
 560 cache_lock_all_vnodes(void)
 561 {
 562         u_int i;
 563
 564         for (i = 0; i < numvnodelocks; i++)
 565                 mtx_lock(&vnodelocks[i]);
 566 }
 567
 568 static void
 569 cache_unlock_all_vnodes(void)
 570 {
 571         u_int i;
 572
 573         for (i = 0; i < numvnodelocks; i++)
 574                 mtx_unlock(&vnodelocks[i]);
 575 }
 576
 577 static int
 578 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 579 {
 580
 581         cache_sort_vnodes(&vlp1, &vlp2);
 582
 583         if (vlp1 != NULL) {
 584                 if (!mtx_trylock(vlp1))
 585                         return (EAGAIN);
 586         }
 587         if (!mtx_trylock(vlp2)) {
 588                 if (vlp1 != NULL)
 589                         mtx_unlock(vlp1);
 590                 return (EAGAIN);
 591         }
 592
 593         return (0);
 594 }
 595
 596 static void
 597 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 598 {
 599
 600         MPASS(vlp1 != NULL || vlp2 != NULL);
 601         MPASS(vlp1 <= vlp2);
 602
 603         if (vlp1 != NULL)
 604                 mtx_lock(vlp1);
 605         if (vlp2 != NULL)
 606                 mtx_lock(vlp2);
 607 }
 608
 609 static void
 610 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 611 {
 612
 613         MPASS(vlp1 != NULL || vlp2 != NULL);
 614
 615         if (vlp1 != NULL)
 616                 mtx_unlock(vlp1);
 617         if (vlp2 != NULL)
 618                 mtx_unlock(vlp2);
 619 }
 620
 621 static int
 622 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 623 {
 624         struct nchstats snap;
 625
 626         if (req->oldptr == NULL)
 627                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 628
 629         snap = nchstats;
 630         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 631         snap.ncs_neghits = counter_u64_fetch(numneghits);
 632         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 633             counter_u64_fetch(numnegzaps);
 634         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 635             counter_u64_fetch(nummiss);
 636
 637         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 638 }
 639 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 640     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 641     "VFS cache effectiveness statistics");
 642
 643 #ifdef DIAGNOSTIC
 644 /*
 645  * Grab an atomic snapshot of the name cache hash chain lengths
 646  */
 647 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 648     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 649     "hash table stats");
 650
 651 static int
 652 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 653 {
 654         struct nchashhead *ncpp;
 655         struct namecache *ncp;
 656         int i, error, n_nchash, *cntbuf;
 657
 658 retry:
 659         n_nchash = nchash + 1;  /* nchash is max index, not count */
 660         if (req->oldptr == NULL)
 661                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 662         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 663         cache_lock_all_buckets();
 664         if (n_nchash != nchash + 1) {
 665                 cache_unlock_all_buckets();
 666                 free(cntbuf, M_TEMP);
 667                 goto retry;
 668         }
 669         /* Scan hash tables counting entries */
 670         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 671                 CK_LIST_FOREACH(ncp, ncpp, nc_hash)
 672                         cntbuf[i]++;
 673         cache_unlock_all_buckets();
 674         for (error = 0, i = 0; i < n_nchash; i++)
 675                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 676                         break;
 677         free(cntbuf, M_TEMP);
 678         return (error);
 679 }
 680 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 681     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 682     "nchash chain lengths");
 683
 684 static int
 685 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 686 {
 687         int error;
 688         struct nchashhead *ncpp;
 689         struct namecache *ncp;
 690         int n_nchash;
 691         int count, maxlength, used, pct;
 692
 693         if (!req->oldptr)
 694                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 695
 696         cache_lock_all_buckets();
 697         n_nchash = nchash + 1;  /* nchash is max index, not count */
 698         used = 0;
 699         maxlength = 0;
 700
 701         /* Scan hash tables for applicable entries */
 702         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 703                 count = 0;
 704                 CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
 705                         count++;
 706                 }
 707                 if (count)
 708                         used++;
 709                 if (maxlength < count)
 710                         maxlength = count;
 711         }
 712         n_nchash = nchash + 1;
 713         cache_unlock_all_buckets();
 714         pct = (used * 100) / (n_nchash / 100);
 715         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 716         if (error)
 717                 return (error);
 718         error = SYSCTL_OUT(req, &used, sizeof(used));
 719         if (error)
 720                 return (error);
 721         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 722         if (error)
 723                 return (error);
 724         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 725         if (error)
 726                 return (error);
 727         return (0);
 728 }
 729 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 730     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 731     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 732 #endif
 733
 734 /*
 735  * Negative entries management
 736  *
 737  * A variation of LRU scheme is used. New entries are hashed into one of
 738  * numneglists cold lists. Entries get promoted to the hot list on first hit.
 739  *
 740  * The shrinker will demote hot list head and evict from the cold list in a
 741  * round-robin manner.
 742  */
 743 static void
 744 cache_negative_init(struct namecache *ncp)
 745 {
 746         struct negstate *negstate;
 747
 748         ncp->nc_flag |= NCF_NEGATIVE;
 749         negstate = NCP2NEGSTATE(ncp);
 750         negstate->neg_flag = 0;
 751 }
 752
 753 static void
 754 cache_negative_hit(struct namecache *ncp)
 755 {
 756         struct neglist *neglist;
 757         struct negstate *negstate;
 758
 759         negstate = NCP2NEGSTATE(ncp);
 760         if ((negstate->neg_flag & NEG_HOT) != 0)
 761                 return;
 762         neglist = NCP2NEGLIST(ncp);
 763         mtx_lock(&ncneg_hot.nl_lock);
 764         mtx_lock(&neglist->nl_lock);
 765         if ((negstate->neg_flag & NEG_HOT) == 0) {
 766                 numhotneg++;
 767                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 768                 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
 769                 negstate->neg_flag |= NEG_HOT;
 770         }
 771         mtx_unlock(&neglist->nl_lock);
 772         mtx_unlock(&ncneg_hot.nl_lock);
 773 }
 774
 775 static void
 776 cache_negative_insert(struct namecache *ncp)
 777 {
 778         struct neglist *neglist;
 779
 780         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 781         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 782         neglist = NCP2NEGLIST(ncp);
 783         mtx_lock(&neglist->nl_lock);
 784         TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 785         mtx_unlock(&neglist->nl_lock);
 786         atomic_add_rel_long(&numneg, 1);
 787 }
 788
 789 static void
 790 cache_negative_remove(struct namecache *ncp)
 791 {
 792         struct neglist *neglist;
 793         struct negstate *negstate;
 794         bool hot_locked = false;
 795         bool list_locked = false;
 796
 797         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 798         neglist = NCP2NEGLIST(ncp);
 799         negstate = NCP2NEGSTATE(ncp);
 800         if ((negstate->neg_flag & NEG_HOT) != 0) {
 801                 hot_locked = true;
 802                 mtx_lock(&ncneg_hot.nl_lock);
 803                 if ((negstate->neg_flag & NEG_HOT) == 0) {
 804                         list_locked = true;
 805                         mtx_lock(&neglist->nl_lock);
 806                 }
 807         } else {
 808                 list_locked = true;
 809                 mtx_lock(&neglist->nl_lock);
 810         }
 811         if ((negstate->neg_flag & NEG_HOT) != 0) {
 812                 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
 813                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 814                 numhotneg--;
 815         } else {
 816                 mtx_assert(&neglist->nl_lock, MA_OWNED);
 817                 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
 818         }
 819         if (list_locked)
 820                 mtx_unlock(&neglist->nl_lock);
 821         if (hot_locked)
 822                 mtx_unlock(&ncneg_hot.nl_lock);
 823         atomic_subtract_rel_long(&numneg, 1);
 824 }
 825
 826 static void
 827 cache_negative_shrink_select(struct namecache **ncpp,
 828     struct neglist **neglistpp)
 829 {
 830         struct neglist *neglist;
 831         struct namecache *ncp;
 832         static u_int cycle;
 833         u_int i;
 834
 835         *ncpp = ncp = NULL;
 836
 837         for (i = 0; i < numneglists; i++) {
 838                 neglist = &neglists[(cycle + i) % numneglists];
 839                 if (TAILQ_FIRST(&neglist->nl_list) == NULL)
 840                         continue;
 841                 mtx_lock(&neglist->nl_lock);
 842                 ncp = TAILQ_FIRST(&neglist->nl_list);
 843                 if (ncp != NULL)
 844                         break;
 845                 mtx_unlock(&neglist->nl_lock);
 846         }
 847
 848         *neglistpp = neglist;
 849         *ncpp = ncp;
 850         cycle++;
 851 }
 852
 853 static void
 854 cache_negative_zap_one(void)
 855 {
 856         struct namecache *ncp, *ncp2;
 857         struct neglist *neglist;
 858         struct negstate *negstate;
 859         struct mtx *dvlp;
 860         struct rwlock *blp;
 861
 862         if (mtx_owner(&ncneg_shrink_lock) != NULL ||
 863             !mtx_trylock(&ncneg_shrink_lock)) {
 864                 counter_u64_add(shrinking_skipped, 1);
 865                 return;
 866         }
 867
 868         mtx_lock(&ncneg_hot.nl_lock);
 869         ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
 870         if (ncp != NULL) {
 871                 neglist = NCP2NEGLIST(ncp);
 872                 negstate = NCP2NEGSTATE(ncp);
 873                 mtx_lock(&neglist->nl_lock);
 874                 MPASS((negstate->neg_flag & NEG_HOT) != 0);
 875                 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
 876                 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
 877                 negstate->neg_flag &= ~NEG_HOT;
 878                 numhotneg--;
 879                 mtx_unlock(&neglist->nl_lock);
 880         }
 881         mtx_unlock(&ncneg_hot.nl_lock);
 882
 883         cache_negative_shrink_select(&ncp, &neglist);
 884
 885         mtx_unlock(&ncneg_shrink_lock);
 886         if (ncp == NULL)
 887                 return;
 888
 889         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 890         dvlp = VP2VNODELOCK(ncp->nc_dvp);
 891         blp = NCP2BUCKETLOCK(ncp);
 892         mtx_unlock(&neglist->nl_lock);
 893         mtx_lock(dvlp);
 894         rw_wlock(blp);
 895         /*
 896          * Enter SMR to safely check the negative list.
 897          * Even if the found pointer matches, the entry may now be reallocated
 898          * and used by a different vnode.
 899          */
 900         vfs_smr_enter();
 901         ncp2 = TAILQ_FIRST(&neglist->nl_list);
 902         if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
 903             blp != NCP2BUCKETLOCK(ncp2)) {
 904                 vfs_smr_exit();
 905                 ncp = NULL;
 906         } else {
 907                 vfs_smr_exit();
 908                 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
 909                     ncp->nc_name);
 910                 cache_zap_locked(ncp);
 911                 counter_u64_add(numneg_evicted, 1);
 912         }
 913         rw_wunlock(blp);
 914         mtx_unlock(dvlp);
 915         cache_free(ncp);
 916 }
 917
 918 /*
 919  * cache_zap_locked():
 920  *
 921  *   Removes a namecache entry from cache, whether it contains an actual
 922  *   pointer to a vnode or if it is just a negative cache entry.
 923  */
 924 static void
 925 cache_zap_locked(struct namecache *ncp)
 926 {
 927
 928         if (!(ncp->nc_flag & NCF_NEGATIVE))
 929                 cache_assert_vnode_locked(ncp->nc_vp);
 930         cache_assert_vnode_locked(ncp->nc_dvp);
 931         cache_assert_bucket_locked(ncp, RA_WLOCKED);
 932
 933         CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
 934             (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
 935
 936         cache_ncp_invalidate(ncp);
 937
 938         CK_LIST_REMOVE(ncp, nc_hash);
 939         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 940                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 941                     ncp->nc_name, ncp->nc_vp);
 942                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 943                 if (ncp == ncp->nc_vp->v_cache_dd)
 944                         ncp->nc_vp->v_cache_dd = NULL;
 945         } else {
 946                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 947                     ncp->nc_name);
 948                 cache_negative_remove(ncp);
 949         }
 950         if (ncp->nc_flag & NCF_ISDOTDOT) {
 951                 if (ncp == ncp->nc_dvp->v_cache_dd)
 952                         ncp->nc_dvp->v_cache_dd = NULL;
 953         } else {
 954                 LIST_REMOVE(ncp, nc_src);
 955                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 956                         ncp->nc_flag |= NCF_DVDROP;
 957                         counter_u64_add(numcachehv, -1);
 958                 }
 959         }
 960         atomic_subtract_rel_long(&numcache, 1);
 961 }
 962
 963 static void
 964 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 965 {
 966         struct rwlock *blp;
 967
 968         MPASS(ncp->nc_dvp == vp);
 969         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 970         cache_assert_vnode_locked(vp);
 971
 972         blp = NCP2BUCKETLOCK(ncp);
 973         rw_wlock(blp);
 974         cache_zap_locked(ncp);
 975         rw_wunlock(blp);
 976 }
 977
 978 static bool
 979 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
 980     struct mtx **vlpp)
 981 {
 982         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 983         struct rwlock *blp;
 984
 985         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 986         cache_assert_vnode_locked(vp);
 987
 988         if (ncp->nc_flag & NCF_NEGATIVE) {
 989                 if (*vlpp != NULL) {
 990                         mtx_unlock(*vlpp);
 991                         *vlpp = NULL;
 992                 }
 993                 cache_zap_negative_locked_vnode_kl(ncp, vp);
 994                 return (true);
 995         }
 996
 997         pvlp = VP2VNODELOCK(vp);
 998         blp = NCP2BUCKETLOCK(ncp);
 999         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1000         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1001
1002         if (*vlpp == vlp1 || *vlpp == vlp2) {
1003                 to_unlock = *vlpp;
1004                 *vlpp = NULL;
1005         } else {
1006                 if (*vlpp != NULL) {
1007                         mtx_unlock(*vlpp);
1008                         *vlpp = NULL;
1009                 }
1010                 cache_sort_vnodes(&vlp1, &vlp2);
1011                 if (vlp1 == pvlp) {
1012                         mtx_lock(vlp2);
1013                         to_unlock = vlp2;
1014                 } else {
1015                         if (!mtx_trylock(vlp1))
1016                                 goto out_relock;
1017                         to_unlock = vlp1;
1018                 }
1019         }
1020         rw_wlock(blp);
1021         cache_zap_locked(ncp);
1022         rw_wunlock(blp);
1023         if (to_unlock != NULL)
1024                 mtx_unlock(to_unlock);
1025         return (true);
1026
1027 out_relock:
1028         mtx_unlock(vlp2);
1029         mtx_lock(vlp1);
1030         mtx_lock(vlp2);
1031         MPASS(*vlpp == NULL);
1032         *vlpp = vlp1;
1033         return (false);
1034 }
1035
1036 static int __noinline
1037 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1038 {
1039         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1040         struct rwlock *blp;
1041         int error = 0;
1042
1043         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1044         cache_assert_vnode_locked(vp);
1045
1046         pvlp = VP2VNODELOCK(vp);
1047         if (ncp->nc_flag & NCF_NEGATIVE) {
1048                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1049                 goto out;
1050         }
1051
1052         blp = NCP2BUCKETLOCK(ncp);
1053         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1054         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1055         cache_sort_vnodes(&vlp1, &vlp2);
1056         if (vlp1 == pvlp) {
1057                 mtx_lock(vlp2);
1058                 to_unlock = vlp2;
1059         } else {
1060                 if (!mtx_trylock(vlp1)) {
1061                         error = EAGAIN;
1062                         goto out;
1063                 }
1064                 to_unlock = vlp1;
1065         }
1066         rw_wlock(blp);
1067         cache_zap_locked(ncp);
1068         rw_wunlock(blp);
1069         mtx_unlock(to_unlock);
1070 out:
1071         mtx_unlock(pvlp);
1072         return (error);
1073 }
1074
1075 /*
1076  * If trylocking failed we can get here. We know enough to take all needed locks
1077  * in the right order and re-lookup the entry.
1078  */
1079 static int
1080 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1081     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1082     struct rwlock *blp)
1083 {
1084         struct namecache *rncp;
1085
1086         cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1087
1088         cache_sort_vnodes(&dvlp, &vlp);
1089         cache_lock_vnodes(dvlp, vlp);
1090         rw_wlock(blp);
1091         CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1092                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1093                     rncp->nc_nlen == cnp->cn_namelen &&
1094                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1095                         break;
1096         }
1097         if (rncp != NULL) {
1098                 cache_zap_locked(rncp);
1099                 rw_wunlock(blp);
1100                 cache_unlock_vnodes(dvlp, vlp);
1101                 counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1102                 return (0);
1103         }
1104
1105         rw_wunlock(blp);
1106         cache_unlock_vnodes(dvlp, vlp);
1107         return (EAGAIN);
1108 }
1109
1110 static int __noinline
1111 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1112     uint32_t hash, struct rwlock *blp)
1113 {
1114         struct mtx *dvlp, *vlp;
1115         struct vnode *dvp;
1116
1117         cache_assert_bucket_locked(ncp, RA_WLOCKED);
1118
1119         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1120         vlp = NULL;
1121         if (!(ncp->nc_flag & NCF_NEGATIVE))
1122                 vlp = VP2VNODELOCK(ncp->nc_vp);
1123         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1124                 cache_zap_locked(ncp);
1125                 rw_wunlock(blp);
1126                 cache_unlock_vnodes(dvlp, vlp);
1127                 return (0);
1128         }
1129
1130         dvp = ncp->nc_dvp;
1131         rw_wunlock(blp);
1132         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1133 }
1134
1135 static int __noinline
1136 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1137     uint32_t hash, struct rwlock *blp)
1138 {
1139         struct mtx *dvlp, *vlp;
1140         struct vnode *dvp;
1141
1142         cache_assert_bucket_locked(ncp, RA_RLOCKED);
1143
1144         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1145         vlp = NULL;
1146         if (!(ncp->nc_flag & NCF_NEGATIVE))
1147                 vlp = VP2VNODELOCK(ncp->nc_vp);
1148         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1149                 rw_runlock(blp);
1150                 rw_wlock(blp);
1151                 cache_zap_locked(ncp);
1152                 rw_wunlock(blp);
1153                 cache_unlock_vnodes(dvlp, vlp);
1154                 return (0);
1155         }
1156
1157         dvp = ncp->nc_dvp;
1158         rw_runlock(blp);
1159         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1160 }
1161
1162 static int
1163 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1164     struct mtx **vlpp1, struct mtx **vlpp2)
1165 {
1166         struct mtx *dvlp, *vlp;
1167
1168         cache_assert_bucket_locked(ncp, RA_WLOCKED);
1169
1170         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1171         vlp = NULL;
1172         if (!(ncp->nc_flag & NCF_NEGATIVE))
1173                 vlp = VP2VNODELOCK(ncp->nc_vp);
1174         cache_sort_vnodes(&dvlp, &vlp);
1175
1176         if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1177                 cache_zap_locked(ncp);
1178                 cache_unlock_vnodes(dvlp, vlp);
1179                 *vlpp1 = NULL;
1180                 *vlpp2 = NULL;
1181                 return (0);
1182         }
1183
1184         if (*vlpp1 != NULL)
1185                 mtx_unlock(*vlpp1);
1186         if (*vlpp2 != NULL)
1187                 mtx_unlock(*vlpp2);
1188         *vlpp1 = NULL;
1189         *vlpp2 = NULL;
1190
1191         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1192                 cache_zap_locked(ncp);
1193                 cache_unlock_vnodes(dvlp, vlp);
1194                 return (0);
1195         }
1196
1197         rw_wunlock(blp);
1198         *vlpp1 = dvlp;
1199         *vlpp2 = vlp;
1200         if (*vlpp1 != NULL)
1201                 mtx_lock(*vlpp1);
1202         mtx_lock(*vlpp2);
1203         rw_wlock(blp);
1204         return (EAGAIN);
1205 }
1206
1207 static void
1208 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1209 {
1210
1211         if (blp != NULL) {
1212                 rw_runlock(blp);
1213         } else {
1214                 mtx_unlock(vlp);
1215         }
1216 }
1217
1218 static int __noinline
1219 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1220     struct timespec *tsp, int *ticksp)
1221 {
1222         int ltype;
1223
1224         *vpp = dvp;
1225         CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1226                         dvp, cnp->cn_nameptr);
1227         counter_u64_add(dothits, 1);
1228         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1229         if (tsp != NULL)
1230                 timespecclear(tsp);
1231         if (ticksp != NULL)
1232                 *ticksp = ticks;
1233         vrefact(*vpp);
1234         /*
1235          * When we lookup "." we still can be asked to lock it
1236          * differently...
1237          */
1238         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1239         if (ltype != VOP_ISLOCKED(*vpp)) {
1240                 if (ltype == LK_EXCLUSIVE) {
1241                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1242                         if (VN_IS_DOOMED((*vpp))) {
1243                                 /* forced unmount */
1244                                 vrele(*vpp);
1245                                 *vpp = NULL;
1246                                 return (ENOENT);
1247                         }
1248                 } else
1249                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1250         }
1251         return (-1);
1252 }
1253
1254 static __noinline int
1255 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1256     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1257 {
1258         struct namecache *ncp;
1259         struct rwlock *blp;
1260         struct mtx *dvlp, *dvlp2;
1261         uint32_t hash;
1262         int error;
1263
1264         if (cnp->cn_namelen == 2 &&
1265             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1266                 counter_u64_add(dotdothits, 1);
1267                 dvlp = VP2VNODELOCK(dvp);
1268                 dvlp2 = NULL;
1269                 mtx_lock(dvlp);
1270 retry_dotdot:
1271                 ncp = dvp->v_cache_dd;
1272                 if (ncp == NULL) {
1273                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1274                             "..", NULL);
1275                         mtx_unlock(dvlp);
1276                         if (dvlp2 != NULL)
1277                                 mtx_unlock(dvlp2);
1278                         return (0);
1279                 }
1280                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1281                         if (ncp->nc_dvp != dvp)
1282                                 panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1283                         if (!cache_zap_locked_vnode_kl2(ncp,
1284                             dvp, &dvlp2))
1285                                 goto retry_dotdot;
1286                         MPASS(dvp->v_cache_dd == NULL);
1287                         mtx_unlock(dvlp);
1288                         if (dvlp2 != NULL)
1289                                 mtx_unlock(dvlp2);
1290                         cache_free(ncp);
1291                 } else {
1292                         dvp->v_cache_dd = NULL;
1293                         mtx_unlock(dvlp);
1294                         if (dvlp2 != NULL)
1295                                 mtx_unlock(dvlp2);
1296                 }
1297                 return (0);
1298         }
1299
1300         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1301         blp = HASH2BUCKETLOCK(hash);
1302 retry:
1303         if (CK_LIST_EMPTY(NCHHASH(hash)))
1304                 goto out_no_entry;
1305
1306         rw_wlock(blp);
1307
1308         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1309                 counter_u64_add(numchecks, 1);
1310                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1311                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1312                         break;
1313         }
1314
1315         /* We failed to find an entry */
1316         if (ncp == NULL) {
1317                 rw_wunlock(blp);
1318                 goto out_no_entry;
1319         }
1320
1321         error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1322         if (__predict_false(error != 0)) {
1323                 zap_and_exit_bucket_fail++;
1324                 cache_maybe_yield();
1325                 goto retry;
1326         }
1327         counter_u64_add(numposzaps, 1);
1328         cache_free(ncp);
1329         return (0);
1330 out_no_entry:
1331         SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1332         counter_u64_add(nummisszap, 1);
1333         return (0);
1334 }
1335
1336 /**
1337  * Lookup a name in the name cache
1338  *
1339  * # Arguments
1340  *
1341  * - dvp:       Parent directory in which to search.
1342  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1343  * - cnp:       Parameters of the name search.  The most interesting bits of
1344  *              the cn_flags field have the following meanings:
1345  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1346  *                      it up.
1347  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1348  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1349  *              or negative) lookup, tsp will be filled with any timespec that
1350  *              was stored when this cache entry was created.  However, it will
1351  *              be clear for "." entries.
1352  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1353  *              (positive or negative) lookup, it will contain the ticks value
1354  *              that was current when the cache entry was created, unless cnp
1355  *              was ".".
1356  *
1357  * # Returns
1358  *
1359  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1360  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1361  *              to a forced unmount.  vpp will not be modified.  If the entry
1362  *              is a whiteout, then the ISWHITEOUT flag will be set in
1363  *              cnp->cn_flags.
1364  * - 0:         A cache miss.  vpp will not be modified.
1365  *
1366  * # Locking
1367  *
1368  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1369  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1370  * lock is not recursively acquired.
1371  */
1372 int
1373 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1374     struct timespec *tsp, int *ticksp)
1375 {
1376         struct namecache_ts *ncp_ts;
1377         struct namecache *ncp;
1378         struct negstate *negstate;
1379         struct rwlock *blp;
1380         struct mtx *dvlp;
1381         uint32_t hash;
1382         enum vgetstate vs;
1383         int error, ltype;
1384         bool try_smr, doing_smr, whiteout;
1385
1386 #ifdef DEBUG_CACHE
1387         if (__predict_false(!doingcache)) {
1388                 cnp->cn_flags &= ~MAKEENTRY;
1389                 return (0);
1390         }
1391 #endif
1392
1393         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1394                 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1395
1396         if ((cnp->cn_flags & MAKEENTRY) == 0)
1397                 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1398
1399         try_smr = true;
1400         if (cnp->cn_nameiop == CREATE)
1401                 try_smr = false;
1402 retry:
1403         doing_smr = false;
1404         blp = NULL;
1405         dvlp = NULL;
1406         error = 0;
1407         if (cnp->cn_namelen == 2 &&
1408             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1409                 counter_u64_add(dotdothits, 1);
1410                 dvlp = VP2VNODELOCK(dvp);
1411                 mtx_lock(dvlp);
1412                 ncp = dvp->v_cache_dd;
1413                 if (ncp == NULL) {
1414                         SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1415                             "..", NULL);
1416                         mtx_unlock(dvlp);
1417                         return (0);
1418                 }
1419                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1420                         if (ncp->nc_flag & NCF_NEGATIVE)
1421                                 *vpp = NULL;
1422                         else
1423                                 *vpp = ncp->nc_vp;
1424                 } else
1425                         *vpp = ncp->nc_dvp;
1426                 /* Return failure if negative entry was found. */
1427                 if (*vpp == NULL)
1428                         goto negative_success;
1429                 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1430                     dvp, cnp->cn_nameptr, *vpp);
1431                 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1432                     *vpp);
1433                 cache_out_ts(ncp, tsp, ticksp);
1434                 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1435                     NCF_DTS && tsp != NULL) {
1436                         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1437                         *tsp = ncp_ts->nc_dotdottime;
1438                 }
1439                 goto success;
1440         }
1441
1442         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1443 retry_hashed:
1444         if (try_smr) {
1445                 vfs_smr_enter();
1446                 doing_smr = true;
1447                 try_smr = false;
1448         } else {
1449                 blp = HASH2BUCKETLOCK(hash);
1450                 rw_rlock(blp);
1451         }
1452
1453         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1454                 counter_u64_add(numchecks, 1);
1455                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1456                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1457                         break;
1458         }
1459
1460         /* We failed to find an entry */
1461         if (__predict_false(ncp == NULL)) {
1462                 if (doing_smr)
1463                         vfs_smr_exit();
1464                 else
1465                         rw_runlock(blp);
1466                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1467                     NULL);
1468                 counter_u64_add(nummiss, 1);
1469                 return (0);
1470         }
1471
1472         if (ncp->nc_flag & NCF_NEGATIVE)
1473                 goto negative_success;
1474
1475         /* We found a "positive" match, return the vnode */
1476         counter_u64_add(numposhits, 1);
1477         *vpp = ncp->nc_vp;
1478         CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1479             dvp, cnp->cn_nameptr, *vpp, ncp);
1480         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1481             *vpp);
1482         cache_out_ts(ncp, tsp, ticksp);
1483 success:
1484         /*
1485          * On success we return a locked and ref'd vnode as per the lookup
1486          * protocol.
1487          */
1488         MPASS(dvp != *vpp);
1489         ltype = 0;      /* silence gcc warning */
1490         if (cnp->cn_flags & ISDOTDOT) {
1491                 ltype = VOP_ISLOCKED(dvp);
1492                 VOP_UNLOCK(dvp);
1493         }
1494         if (doing_smr) {
1495                 if (cache_ncp_invalid(ncp)) {
1496                         vfs_smr_exit();
1497                         *vpp = NULL;
1498                         goto retry;
1499                 }
1500                 vs = vget_prep_smr(*vpp);
1501                 vfs_smr_exit();
1502                 if (vs == VGET_NONE) {
1503                         *vpp = NULL;
1504                         goto retry;
1505                 }
1506         } else {
1507                 vs = vget_prep(*vpp);
1508                 cache_lookup_unlock(blp, dvlp);
1509         }
1510         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1511         if (cnp->cn_flags & ISDOTDOT) {
1512                 vn_lock(dvp, ltype | LK_RETRY);
1513                 if (VN_IS_DOOMED(dvp)) {
1514                         if (error == 0)
1515                                 vput(*vpp);
1516                         *vpp = NULL;
1517                         return (ENOENT);
1518                 }
1519         }
1520         if (error) {
1521                 *vpp = NULL;
1522                 goto retry;
1523         }
1524         if ((cnp->cn_flags & ISLASTCN) &&
1525             (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1526                 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1527         }
1528         return (-1);
1529
1530 negative_success:
1531         /* We found a negative match, and want to create it, so purge */
1532         if (cnp->cn_nameiop == CREATE) {
1533                 MPASS(!doing_smr);
1534                 counter_u64_add(numnegzaps, 1);
1535                 goto zap_and_exit;
1536         }
1537
1538         SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1539         cache_out_ts(ncp, tsp, ticksp);
1540         counter_u64_add(numneghits, 1);
1541         whiteout = (ncp->nc_flag & NCF_WHITE);
1542
1543         if (doing_smr) {
1544                 /*
1545                  * We need to take locks to promote an entry.
1546                  */
1547                 negstate = NCP2NEGSTATE(ncp);
1548                 if ((negstate->neg_flag & NEG_HOT) == 0 ||
1549                     cache_ncp_invalid(ncp)) {
1550                         vfs_smr_exit();
1551                         doing_smr = false;
1552                         goto retry_hashed;
1553                 }
1554                 vfs_smr_exit();
1555         } else {
1556                 cache_negative_hit(ncp);
1557                 cache_lookup_unlock(blp, dvlp);
1558         }
1559         if (whiteout)
1560                 cnp->cn_flags |= ISWHITEOUT;
1561         return (ENOENT);
1562
1563 zap_and_exit:
1564         MPASS(!doing_smr);
1565         if (blp != NULL)
1566                 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1567         else
1568                 error = cache_zap_locked_vnode(ncp, dvp);
1569         if (__predict_false(error != 0)) {
1570                 zap_and_exit_bucket_fail2++;
1571                 cache_maybe_yield();
1572                 goto retry;
1573         }
1574         cache_free(ncp);
1575         return (0);
1576 }
1577
1578 struct celockstate {
1579         struct mtx *vlp[3];
1580         struct rwlock *blp[2];
1581 };
1582 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1583 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1584
1585 static inline void
1586 cache_celockstate_init(struct celockstate *cel)
1587 {
1588
1589         bzero(cel, sizeof(*cel));
1590 }
1591
1592 static void
1593 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1594     struct vnode *dvp)
1595 {
1596         struct mtx *vlp1, *vlp2;
1597
1598         MPASS(cel->vlp[0] == NULL);
1599         MPASS(cel->vlp[1] == NULL);
1600         MPASS(cel->vlp[2] == NULL);
1601
1602         MPASS(vp != NULL || dvp != NULL);
1603
1604         vlp1 = VP2VNODELOCK(vp);
1605         vlp2 = VP2VNODELOCK(dvp);
1606         cache_sort_vnodes(&vlp1, &vlp2);
1607
1608         if (vlp1 != NULL) {
1609                 mtx_lock(vlp1);
1610                 cel->vlp[0] = vlp1;
1611         }
1612         mtx_lock(vlp2);
1613         cel->vlp[1] = vlp2;
1614 }
1615
1616 static void
1617 cache_unlock_vnodes_cel(struct celockstate *cel)
1618 {
1619
1620         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1621
1622         if (cel->vlp[0] != NULL)
1623                 mtx_unlock(cel->vlp[0]);
1624         if (cel->vlp[1] != NULL)
1625                 mtx_unlock(cel->vlp[1]);
1626         if (cel->vlp[2] != NULL)
1627                 mtx_unlock(cel->vlp[2]);
1628 }
1629
1630 static bool
1631 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1632 {
1633         struct mtx *vlp;
1634         bool ret;
1635
1636         cache_assert_vlp_locked(cel->vlp[0]);
1637         cache_assert_vlp_locked(cel->vlp[1]);
1638         MPASS(cel->vlp[2] == NULL);
1639
1640         MPASS(vp != NULL);
1641         vlp = VP2VNODELOCK(vp);
1642
1643         ret = true;
1644         if (vlp >= cel->vlp[1]) {
1645                 mtx_lock(vlp);
1646         } else {
1647                 if (mtx_trylock(vlp))
1648                         goto out;
1649                 cache_lock_vnodes_cel_3_failures++;
1650                 cache_unlock_vnodes_cel(cel);
1651                 if (vlp < cel->vlp[0]) {
1652                         mtx_lock(vlp);
1653                         mtx_lock(cel->vlp[0]);
1654                         mtx_lock(cel->vlp[1]);
1655                 } else {
1656                         if (cel->vlp[0] != NULL)
1657                                 mtx_lock(cel->vlp[0]);
1658                         mtx_lock(vlp);
1659                         mtx_lock(cel->vlp[1]);
1660                 }
1661                 ret = false;
1662         }
1663 out:
1664         cel->vlp[2] = vlp;
1665         return (ret);
1666 }
1667
1668 static void
1669 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1670     struct rwlock *blp2)
1671 {
1672
1673         MPASS(cel->blp[0] == NULL);
1674         MPASS(cel->blp[1] == NULL);
1675
1676         cache_sort_vnodes(&blp1, &blp2);
1677
1678         if (blp1 != NULL) {
1679                 rw_wlock(blp1);
1680                 cel->blp[0] = blp1;
1681         }
1682         rw_wlock(blp2);
1683         cel->blp[1] = blp2;
1684 }
1685
1686 static void
1687 cache_unlock_buckets_cel(struct celockstate *cel)
1688 {
1689
1690         if (cel->blp[0] != NULL)
1691                 rw_wunlock(cel->blp[0]);
1692         rw_wunlock(cel->blp[1]);
1693 }
1694
1695 /*
1696  * Lock part of the cache affected by the insertion.
1697  *
1698  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1699  * However, insertion can result in removal of an old entry. In this
1700  * case we have an additional vnode and bucketlock pair to lock. If the
1701  * entry is negative, ncelock is locked instead of the vnode.
1702  *
1703  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1704  * preserving the locking order (smaller address first).
1705  */
1706 static void
1707 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1708     uint32_t hash)
1709 {
1710         struct namecache *ncp;
1711         struct rwlock *blps[2];
1712
1713         blps[0] = HASH2BUCKETLOCK(hash);
1714         for (;;) {
1715                 blps[1] = NULL;
1716                 cache_lock_vnodes_cel(cel, dvp, vp);
1717                 if (vp == NULL || vp->v_type != VDIR)
1718                         break;
1719                 ncp = vp->v_cache_dd;
1720                 if (ncp == NULL)
1721                         break;
1722                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1723                         break;
1724                 MPASS(ncp->nc_dvp == vp);
1725                 blps[1] = NCP2BUCKETLOCK(ncp);
1726                 if (ncp->nc_flag & NCF_NEGATIVE)
1727                         break;
1728                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1729                         break;
1730                 /*
1731                  * All vnodes got re-locked. Re-validate the state and if
1732                  * nothing changed we are done. Otherwise restart.
1733                  */
1734                 if (ncp == vp->v_cache_dd &&
1735                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1736                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1737                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1738                         break;
1739                 cache_unlock_vnodes_cel(cel);
1740                 cel->vlp[0] = NULL;
1741                 cel->vlp[1] = NULL;
1742                 cel->vlp[2] = NULL;
1743         }
1744         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1745 }
1746
1747 static void
1748 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1749     uint32_t hash)
1750 {
1751         struct namecache *ncp;
1752         struct rwlock *blps[2];
1753
1754         blps[0] = HASH2BUCKETLOCK(hash);
1755         for (;;) {
1756                 blps[1] = NULL;
1757                 cache_lock_vnodes_cel(cel, dvp, vp);
1758                 ncp = dvp->v_cache_dd;
1759                 if (ncp == NULL)
1760                         break;
1761                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1762                         break;
1763                 MPASS(ncp->nc_dvp == dvp);
1764                 blps[1] = NCP2BUCKETLOCK(ncp);
1765                 if (ncp->nc_flag & NCF_NEGATIVE)
1766                         break;
1767                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1768                         break;
1769                 if (ncp == dvp->v_cache_dd &&
1770                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1771                     blps[1] == NCP2BUCKETLOCK(ncp) &&
1772                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1773                         break;
1774                 cache_unlock_vnodes_cel(cel);
1775                 cel->vlp[0] = NULL;
1776                 cel->vlp[1] = NULL;
1777                 cel->vlp[2] = NULL;
1778         }
1779         cache_lock_buckets_cel(cel, blps[0], blps[1]);
1780 }
1781
1782 static void
1783 cache_enter_unlock(struct celockstate *cel)
1784 {
1785
1786         cache_unlock_buckets_cel(cel);
1787         cache_unlock_vnodes_cel(cel);
1788 }
1789
1790 static void __noinline
1791 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1792     struct componentname *cnp)
1793 {
1794         struct celockstate cel;
1795         struct namecache *ncp;
1796         uint32_t hash;
1797         int len;
1798
1799         if (dvp->v_cache_dd == NULL)
1800                 return;
1801         len = cnp->cn_namelen;
1802         cache_celockstate_init(&cel);
1803         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1804         cache_enter_lock_dd(&cel, dvp, vp, hash);
1805         ncp = dvp->v_cache_dd;
1806         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1807                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1808                 cache_zap_locked(ncp);
1809         } else {
1810                 ncp = NULL;
1811         }
1812         dvp->v_cache_dd = NULL;
1813         cache_enter_unlock(&cel);
1814         cache_free(ncp);
1815 }
1816
1817 /*
1818  * Add an entry to the cache.
1819  */
1820 void
1821 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1822     struct timespec *tsp, struct timespec *dtsp)
1823 {
1824         struct celockstate cel;
1825         struct namecache *ncp, *n2, *ndd;
1826         struct namecache_ts *ncp_ts, *n2_ts;
1827         struct nchashhead *ncpp;
1828         uint32_t hash;
1829         int flag;
1830         int len;
1831         u_long lnumcache;
1832
1833         CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1834         VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1835             ("cache_enter: Adding a doomed vnode"));
1836         VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1837             ("cache_enter: Doomed vnode used as src"));
1838
1839 #ifdef DEBUG_CACHE
1840         if (__predict_false(!doingcache))
1841                 return;
1842 #endif
1843
1844         flag = 0;
1845         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1846                 if (cnp->cn_namelen == 1)
1847                         return;
1848                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1849                         cache_enter_dotdot_prep(dvp, vp, cnp);
1850                         flag = NCF_ISDOTDOT;
1851                 }
1852         }
1853
1854         /*
1855          * Avoid blowout in namecache entries.
1856          */
1857         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1858         if (__predict_false(lnumcache >= ncsize)) {
1859                 atomic_add_long(&numcache, -1);
1860                 counter_u64_add(numdrops, 1);
1861                 return;
1862         }
1863
1864         cache_celockstate_init(&cel);
1865         ndd = NULL;
1866         ncp_ts = NULL;
1867
1868         /*
1869          * Calculate the hash key and setup as much of the new
1870          * namecache entry as possible before acquiring the lock.
1871          */
1872         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1873         ncp->nc_flag = flag;
1874         ncp->nc_vp = vp;
1875         if (vp == NULL)
1876                 cache_negative_init(ncp);
1877         ncp->nc_dvp = dvp;
1878         if (tsp != NULL) {
1879                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1880                 ncp_ts->nc_time = *tsp;
1881                 ncp_ts->nc_ticks = ticks;
1882                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
1883                 if (dtsp != NULL) {
1884                         ncp_ts->nc_dotdottime = *dtsp;
1885                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1886                 }
1887         }
1888         len = ncp->nc_nlen = cnp->cn_namelen;
1889         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1890         strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1891         cache_enter_lock(&cel, dvp, vp, hash);
1892
1893         /*
1894          * See if this vnode or negative entry is already in the cache
1895          * with this name.  This can happen with concurrent lookups of
1896          * the same path name.
1897          */
1898         ncpp = NCHHASH(hash);
1899         CK_LIST_FOREACH(n2, ncpp, nc_hash) {
1900                 if (n2->nc_dvp == dvp &&
1901                     n2->nc_nlen == cnp->cn_namelen &&
1902                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1903                         if (tsp != NULL) {
1904                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
1905                                     ("no NCF_TS"));
1906                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1907                                 n2_ts->nc_time = ncp_ts->nc_time;
1908                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
1909                                 if (dtsp != NULL) {
1910                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1911                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
1912                                 }
1913                         }
1914                         goto out_unlock_free;
1915                 }
1916         }
1917
1918         if (flag == NCF_ISDOTDOT) {
1919                 /*
1920                  * See if we are trying to add .. entry, but some other lookup
1921                  * has populated v_cache_dd pointer already.
1922                  */
1923                 if (dvp->v_cache_dd != NULL)
1924                         goto out_unlock_free;
1925                 KASSERT(vp == NULL || vp->v_type == VDIR,
1926                     ("wrong vnode type %p", vp));
1927                 dvp->v_cache_dd = ncp;
1928         }
1929
1930         if (vp != NULL) {
1931                 if (vp->v_type == VDIR) {
1932                         if (flag != NCF_ISDOTDOT) {
1933                                 /*
1934                                  * For this case, the cache entry maps both the
1935                                  * directory name in it and the name ".." for the
1936                                  * directory's parent.
1937                                  */
1938                                 if ((ndd = vp->v_cache_dd) != NULL) {
1939                                         if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1940                                                 cache_zap_locked(ndd);
1941                                         else
1942                                                 ndd = NULL;
1943                                 }
1944                                 vp->v_cache_dd = ncp;
1945                         }
1946                 } else {
1947                         vp->v_cache_dd = NULL;
1948                 }
1949         }
1950
1951         if (flag != NCF_ISDOTDOT) {
1952                 if (LIST_EMPTY(&dvp->v_cache_src)) {
1953                         vhold(dvp);
1954                         counter_u64_add(numcachehv, 1);
1955                 }
1956                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1957         }
1958
1959         /*
1960          * If the entry is "negative", we place it into the
1961          * "negative" cache queue, otherwise, we place it into the
1962          * destination vnode's cache entries queue.
1963          */
1964         if (vp != NULL) {
1965                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1966                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1967                     vp);
1968         } else {
1969                 if (cnp->cn_flags & ISWHITEOUT)
1970                         ncp->nc_flag |= NCF_WHITE;
1971                 cache_negative_insert(ncp);
1972                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1973                     ncp->nc_name);
1974         }
1975
1976         atomic_thread_fence_rel();
1977         /*
1978          * Insert the new namecache entry into the appropriate chain
1979          * within the cache entries table.
1980          */
1981         CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1982
1983         cache_enter_unlock(&cel);
1984         if (numneg * ncnegfactor > lnumcache)
1985                 cache_negative_zap_one();
1986         cache_free(ndd);
1987         return;
1988 out_unlock_free:
1989         cache_enter_unlock(&cel);
1990         cache_free(ncp);
1991         return;
1992 }
1993
1994 static u_int
1995 cache_roundup_2(u_int val)
1996 {
1997         u_int res;
1998
1999         for (res = 1; res <= val; res <<= 1)
2000                 continue;
2001
2002         return (res);
2003 }
2004
2005 /*
2006  * Name cache initialization, from vfs_init() when we are booting
2007  */
2008 static void
2009 nchinit(void *dummy __unused)
2010 {
2011         u_int i;
2012
2013         cache_zone_small = uma_zcreate("S VFS Cache",
2014             sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2015             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2016             UMA_ZONE_ZINIT);
2017         cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2018             sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2019             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2020             UMA_ZONE_ZINIT);
2021         cache_zone_large = uma_zcreate("L VFS Cache",
2022             sizeof(struct namecache) + NAME_MAX + 1,
2023             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2024             UMA_ZONE_ZINIT);
2025         cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2026             sizeof(struct namecache_ts) + NAME_MAX + 1,
2027             NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2028             UMA_ZONE_ZINIT);
2029
2030         VFS_SMR_ZONE_SET(cache_zone_small);
2031         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2032         VFS_SMR_ZONE_SET(cache_zone_large);
2033         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2034
2035         ncsize = desiredvnodes * ncsizefactor;
2036         nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
2037         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2038         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2039                 ncbuckethash = 7;
2040         if (ncbuckethash > nchash)
2041                 ncbuckethash = nchash;
2042         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2043             M_WAITOK | M_ZERO);
2044         for (i = 0; i < numbucketlocks; i++)
2045                 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2046         ncvnodehash = ncbuckethash;
2047         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2048             M_WAITOK | M_ZERO);
2049         for (i = 0; i < numvnodelocks; i++)
2050                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2051         ncpurgeminvnodes = numbucketlocks * 2;
2052
2053         ncneghash = 3;
2054         neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2055             M_WAITOK | M_ZERO);
2056         for (i = 0; i < numneglists; i++) {
2057                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2058                 TAILQ_INIT(&neglists[i].nl_list);
2059         }
2060         mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2061         TAILQ_INIT(&ncneg_hot.nl_list);
2062
2063         mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2064 }
2065 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2066
2067 void
2068 cache_changesize(u_long newmaxvnodes)
2069 {
2070         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2071         u_long new_nchash, old_nchash;
2072         struct namecache *ncp;
2073         uint32_t hash;
2074         u_long newncsize;
2075         int i;
2076
2077         newncsize = newmaxvnodes * ncsizefactor;
2078         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2079         if (newmaxvnodes < numbucketlocks)
2080                 newmaxvnodes = numbucketlocks;
2081
2082         new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2083         /* If same hash table size, nothing to do */
2084         if (nchash == new_nchash) {
2085                 free(new_nchashtbl, M_VFSCACHE);
2086                 return;
2087         }
2088         /*
2089          * Move everything from the old hash table to the new table.
2090          * None of the namecache entries in the table can be removed
2091          * because to do so, they have to be removed from the hash table.
2092          */
2093         cache_lock_all_vnodes();
2094         cache_lock_all_buckets();
2095         old_nchashtbl = nchashtbl;
2096         old_nchash = nchash;
2097         nchashtbl = new_nchashtbl;
2098         nchash = new_nchash;
2099         for (i = 0; i <= old_nchash; i++) {
2100                 while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2101                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2102                             ncp->nc_dvp);
2103                         CK_LIST_REMOVE(ncp, nc_hash);
2104                         CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2105                 }
2106         }
2107         ncsize = newncsize;
2108         cache_unlock_all_buckets();
2109         cache_unlock_all_vnodes();
2110         free(old_nchashtbl, M_VFSCACHE);
2111 }
2112
2113 /*
2114  * Invalidate all entries from and to a particular vnode.
2115  */
2116 void
2117 cache_purge(struct vnode *vp)
2118 {
2119         TAILQ_HEAD(, namecache) ncps;
2120         struct namecache *ncp, *nnp;
2121         struct mtx *vlp, *vlp2;
2122
2123         CTR1(KTR_VFS, "cache_purge(%p)", vp);
2124         SDT_PROBE1(vfs, namecache, purge, done, vp);
2125         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2126             vp->v_cache_dd == NULL)
2127                 return;
2128         TAILQ_INIT(&ncps);
2129         vlp = VP2VNODELOCK(vp);
2130         vlp2 = NULL;
2131         mtx_lock(vlp);
2132 retry:
2133         while (!LIST_EMPTY(&vp->v_cache_src)) {
2134                 ncp = LIST_FIRST(&vp->v_cache_src);
2135                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2136                         goto retry;
2137                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2138         }
2139         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2140                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2141                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2142                         goto retry;
2143                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2144         }
2145         ncp = vp->v_cache_dd;
2146         if (ncp != NULL) {
2147                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2148                    ("lost dotdot link"));
2149                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2150                         goto retry;
2151                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2152         }
2153         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2154         mtx_unlock(vlp);
2155         if (vlp2 != NULL)
2156                 mtx_unlock(vlp2);
2157         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2158                 cache_free(ncp);
2159         }
2160 }
2161
2162 /*
2163  * Invalidate all negative entries for a particular directory vnode.
2164  */
2165 void
2166 cache_purge_negative(struct vnode *vp)
2167 {
2168         TAILQ_HEAD(, namecache) ncps;
2169         struct namecache *ncp, *nnp;
2170         struct mtx *vlp;
2171
2172         CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2173         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2174         if (LIST_EMPTY(&vp->v_cache_src))
2175                 return;
2176         TAILQ_INIT(&ncps);
2177         vlp = VP2VNODELOCK(vp);
2178         mtx_lock(vlp);
2179         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2180                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2181                         continue;
2182                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2183                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2184         }
2185         mtx_unlock(vlp);
2186         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2187                 cache_free(ncp);
2188         }
2189 }
2190
2191 /*
2192  * Flush all entries referencing a particular filesystem.
2193  */
2194 void
2195 cache_purgevfs(struct mount *mp, bool force)
2196 {
2197         TAILQ_HEAD(, namecache) ncps;
2198         struct mtx *vlp1, *vlp2;
2199         struct rwlock *blp;
2200         struct nchashhead *bucket;
2201         struct namecache *ncp, *nnp;
2202         u_long i, j, n_nchash;
2203         int error;
2204
2205         /* Scan hash tables for applicable entries */
2206         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2207         if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2208                 return;
2209         TAILQ_INIT(&ncps);
2210         n_nchash = nchash + 1;
2211         vlp1 = vlp2 = NULL;
2212         for (i = 0; i < numbucketlocks; i++) {
2213                 blp = (struct rwlock *)&bucketlocks[i];
2214                 rw_wlock(blp);
2215                 for (j = i; j < n_nchash; j += numbucketlocks) {
2216 retry:
2217                         bucket = &nchashtbl[j];
2218                         CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2219                                 cache_assert_bucket_locked(ncp, RA_WLOCKED);
2220                                 if (ncp->nc_dvp->v_mount != mp)
2221                                         continue;
2222                                 error = cache_zap_wlocked_bucket_kl(ncp, blp,
2223                                     &vlp1, &vlp2);
2224                                 if (error != 0)
2225                                         goto retry;
2226                                 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2227                         }
2228                 }
2229                 rw_wunlock(blp);
2230                 if (vlp1 == NULL && vlp2 == NULL)
2231                         cache_maybe_yield();
2232         }
2233         if (vlp1 != NULL)
2234                 mtx_unlock(vlp1);
2235         if (vlp2 != NULL)
2236                 mtx_unlock(vlp2);
2237
2238         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2239                 cache_free(ncp);
2240         }
2241 }
2242
2243 /*
2244  * Perform canonical checks and cache lookup and pass on to filesystem
2245  * through the vop_cachedlookup only if needed.
2246  */
2247
2248 int
2249 vfs_cache_lookup(struct vop_lookup_args *ap)
2250 {
2251         struct vnode *dvp;
2252         int error;
2253         struct vnode **vpp = ap->a_vpp;
2254         struct componentname *cnp = ap->a_cnp;
2255         int flags = cnp->cn_flags;
2256
2257         *vpp = NULL;
2258         dvp = ap->a_dvp;
2259
2260         if (dvp->v_type != VDIR)
2261                 return (ENOTDIR);
2262
2263         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2264             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2265                 return (EROFS);
2266
2267         error = vn_dir_check_exec(dvp, cnp);
2268         if (error != 0)
2269                 return (error);
2270
2271         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2272         if (error == 0)
2273                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2274         if (error == -1)
2275                 return (0);
2276         return (error);
2277 }
2278
2279 /* Implementation of the getcwd syscall. */
2280 int
2281 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2282 {
2283         char *buf, *retbuf;
2284         size_t buflen;
2285         int error;
2286
2287         buflen = uap->buflen;
2288         if (__predict_false(buflen < 2))
2289                 return (EINVAL);
2290         if (buflen > MAXPATHLEN)
2291                 buflen = MAXPATHLEN;
2292
2293         buf = malloc(buflen, M_TEMP, M_WAITOK);
2294         error = vn_getcwd(td, buf, &retbuf, &buflen);
2295         if (error == 0)
2296                 error = copyout(retbuf, uap->buf, buflen);
2297         free(buf, M_TEMP);
2298         return (error);
2299 }
2300
2301 int
2302 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2303 {
2304         struct pwd *pwd;
2305         int error;
2306
2307         pwd = pwd_hold(td);
2308         error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2309         pwd_drop(pwd);
2310
2311 #ifdef KTRACE
2312         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2313                 ktrnamei(*retbuf);
2314 #endif
2315         return (error);
2316 }
2317
2318 static int
2319 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2320     size_t size, int flags, enum uio_seg pathseg)
2321 {
2322         struct nameidata nd;
2323         char *retbuf, *freebuf;
2324         int error;
2325
2326         if (flags != 0)
2327                 return (EINVAL);
2328         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2329             pathseg, path, fd, &cap_fstat_rights, td);
2330         if ((error = namei(&nd)) != 0)
2331                 return (error);
2332         error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2333         if (error == 0) {
2334                 error = copyout(retbuf, buf, size);
2335                 free(freebuf, M_TEMP);
2336         }
2337         NDFREE(&nd, 0);
2338         return (error);
2339 }
2340
2341 int
2342 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2343 {
2344
2345         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2346             uap->flags, UIO_USERSPACE));
2347 }
2348
2349 /*
2350  * Retrieve the full filesystem path that correspond to a vnode from the name
2351  * cache (if available)
2352  */
2353 int
2354 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2355 {
2356         struct pwd *pwd;
2357         char *buf;
2358         size_t buflen;
2359         int error;
2360
2361         if (__predict_false(vn == NULL))
2362                 return (EINVAL);
2363
2364         buflen = MAXPATHLEN;
2365         buf = malloc(buflen, M_TEMP, M_WAITOK);
2366         pwd = pwd_hold(td);
2367         error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2368         pwd_drop(pwd);
2369
2370         if (!error)
2371                 *freebuf = buf;
2372         else
2373                 free(buf, M_TEMP);
2374         return (error);
2375 }
2376
2377 /*
2378  * This function is similar to vn_fullpath, but it attempts to lookup the
2379  * pathname relative to the global root mount point.  This is required for the
2380  * auditing sub-system, as audited pathnames must be absolute, relative to the
2381  * global root mount point.
2382  */
2383 int
2384 vn_fullpath_global(struct thread *td, struct vnode *vn,
2385     char **retbuf, char **freebuf)
2386 {
2387         char *buf;
2388         size_t buflen;
2389         int error;
2390
2391         if (__predict_false(vn == NULL))
2392                 return (EINVAL);
2393         buflen = MAXPATHLEN;
2394         buf = malloc(buflen, M_TEMP, M_WAITOK);
2395         error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2396         if (!error)
2397                 *freebuf = buf;
2398         else
2399                 free(buf, M_TEMP);
2400         return (error);
2401 }
2402
2403 int
2404 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2405 {
2406         struct vnode *dvp;
2407         struct namecache *ncp;
2408         struct mtx *vlp;
2409         int error;
2410
2411         vlp = VP2VNODELOCK(*vp);
2412         mtx_lock(vlp);
2413         TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2414                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2415                         break;
2416         }
2417         if (ncp != NULL) {
2418                 if (*buflen < ncp->nc_nlen) {
2419                         mtx_unlock(vlp);
2420                         vrele(*vp);
2421                         counter_u64_add(numfullpathfail4, 1);
2422                         error = ENOMEM;
2423                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2424                             vp, NULL);
2425                         return (error);
2426                 }
2427                 *buflen -= ncp->nc_nlen;
2428                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2429                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2430                     ncp->nc_name, vp);
2431                 dvp = *vp;
2432                 *vp = ncp->nc_dvp;
2433                 vref(*vp);
2434                 mtx_unlock(vlp);
2435                 vrele(dvp);
2436                 return (0);
2437         }
2438         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2439
2440         mtx_unlock(vlp);
2441         vn_lock(*vp, LK_SHARED | LK_RETRY);
2442         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2443         vput(*vp);
2444         if (error) {
2445                 counter_u64_add(numfullpathfail2, 1);
2446                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2447                 return (error);
2448         }
2449
2450         *vp = dvp;
2451         if (VN_IS_DOOMED(dvp)) {
2452                 /* forced unmount */
2453                 vrele(dvp);
2454                 error = ENOENT;
2455                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2456                 return (error);
2457         }
2458         /*
2459          * *vp has its use count incremented still.
2460          */
2461
2462         return (0);
2463 }
2464
2465 /*
2466  * Resolve a directory to a pathname.
2467  *
2468  * The name of the directory can always be found in the namecache or fetched
2469  * from the filesystem. There is also guaranteed to be only one parent, meaning
2470  * we can just follow vnodes up until we find the root.
2471  *
2472  * The vnode must be referenced.
2473  */
2474 static int
2475 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2476     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2477 {
2478 #ifdef KDTRACE_HOOKS
2479         struct vnode *startvp = vp;
2480 #endif
2481         struct vnode *vp1;
2482         size_t buflen;
2483         int error;
2484
2485         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2486         VNPASS(vp->v_usecount > 0, vp);
2487
2488         buflen = *len;
2489
2490         if (!slash_prefixed) {
2491                 MPASS(*len >= 2);
2492                 buflen--;
2493                 buf[buflen] = '\0';
2494         }
2495
2496         error = 0;
2497
2498         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2499         counter_u64_add(numfullpathcalls, 1);
2500         while (vp != rdir && vp != rootvnode) {
2501                 /*
2502                  * The vp vnode must be already fully constructed,
2503                  * since it is either found in namecache or obtained
2504                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2505                  * without obtaining the vnode lock.
2506                  */
2507                 if ((vp->v_vflag & VV_ROOT) != 0) {
2508                         vn_lock(vp, LK_RETRY | LK_SHARED);
2509
2510                         /*
2511                          * With the vnode locked, check for races with
2512                          * unmount, forced or not.  Note that we
2513                          * already verified that vp is not equal to
2514                          * the root vnode, which means that
2515                          * mnt_vnodecovered can be NULL only for the
2516                          * case of unmount.
2517                          */
2518                         if (VN_IS_DOOMED(vp) ||
2519                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2520                             vp1->v_mountedhere != vp->v_mount) {
2521                                 vput(vp);
2522                                 error = ENOENT;
2523                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2524                                     error, vp, NULL);
2525                                 break;
2526                         }
2527
2528                         vref(vp1);
2529                         vput(vp);
2530                         vp = vp1;
2531                         continue;
2532                 }
2533                 if (vp->v_type != VDIR) {
2534                         vrele(vp);
2535                         counter_u64_add(numfullpathfail1, 1);
2536                         error = ENOTDIR;
2537                         SDT_PROBE3(vfs, namecache, fullpath, return,
2538                             error, vp, NULL);
2539                         break;
2540                 }
2541                 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2542                 if (error)
2543                         break;
2544                 if (buflen == 0) {
2545                         vrele(vp);
2546                         error = ENOMEM;
2547                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2548                             startvp, NULL);
2549                         break;
2550                 }
2551                 buf[--buflen] = '/';
2552                 slash_prefixed = true;
2553         }
2554         if (error)
2555                 return (error);
2556         if (!slash_prefixed) {
2557                 if (buflen == 0) {
2558                         vrele(vp);
2559                         counter_u64_add(numfullpathfail4, 1);
2560                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2561                             startvp, NULL);
2562                         return (ENOMEM);
2563                 }
2564                 buf[--buflen] = '/';
2565         }
2566         counter_u64_add(numfullpathfound, 1);
2567         vrele(vp);
2568
2569         *retbuf = buf + buflen;
2570         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2571         *len -= buflen;
2572         *len += addend;
2573         return (0);
2574 }
2575
2576 /*
2577  * Resolve an arbitrary vnode to a pathname.
2578  *
2579  * Note 2 caveats:
2580  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2581  *   resolve to a different path than the one used to find it
2582  * - namecache is not mandatory, meaning names are not guaranteed to be added
2583  *   (in which case resolving fails)
2584  */
2585 static int
2586 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2587     char *buf, char **retbuf, size_t *buflen)
2588 {
2589         size_t orig_buflen;
2590         bool slash_prefixed;
2591         int error;
2592
2593         if (*buflen < 2)
2594                 return (EINVAL);
2595
2596         orig_buflen = *buflen;
2597
2598         vref(vp);
2599         slash_prefixed = false;
2600         if (vp->v_type != VDIR) {
2601                 *buflen -= 1;
2602                 buf[*buflen] = '\0';
2603                 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2604                 if (error)
2605                         return (error);
2606                 if (*buflen == 0) {
2607                         vrele(vp);
2608                         return (ENOMEM);
2609                 }
2610                 *buflen -= 1;
2611                 buf[*buflen] = '/';
2612                 slash_prefixed = true;
2613         }
2614
2615         return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2616             orig_buflen - *buflen));
2617 }
2618
2619 /*
2620  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2621  *
2622  * Since the namecache does not track handlings, the caller is expected to first
2623  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2624  *
2625  * Then we have 2 cases:
2626  * - if the found vnode is a directory, the path can be constructed just by
2627  *   fullowing names up the chain
2628  * - otherwise we populate the buffer with the saved name and start resolving
2629  *   from the parent
2630  */
2631 static int
2632 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2633     char **freebuf, size_t *buflen)
2634 {
2635         char *buf, *tmpbuf;
2636         struct pwd *pwd;
2637         struct componentname *cnp;
2638         struct vnode *vp;
2639         size_t addend;
2640         int error;
2641         bool slash_prefixed;
2642
2643         if (*buflen < 2)
2644                 return (EINVAL);
2645         if (*buflen > MAXPATHLEN)
2646                 *buflen = MAXPATHLEN;
2647
2648         slash_prefixed = false;
2649
2650         buf = malloc(*buflen, M_TEMP, M_WAITOK);
2651         pwd = pwd_hold(td);
2652
2653         addend = 0;
2654         vp = ndp->ni_vp;
2655         if (vp->v_type != VDIR) {
2656                 cnp = &ndp->ni_cnd;
2657                 addend = cnp->cn_namelen + 2;
2658                 if (*buflen < addend) {
2659                         error = ENOMEM;
2660                         goto out_bad;
2661                 }
2662                 *buflen -= addend;
2663                 tmpbuf = buf + *buflen;
2664                 tmpbuf[0] = '/';
2665                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2666                 tmpbuf[addend - 1] = '\0';
2667                 slash_prefixed = true;
2668                 vp = ndp->ni_dvp;
2669         }
2670
2671         vref(vp);
2672         error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2673             slash_prefixed, addend);
2674         if (error != 0)
2675                 goto out_bad;
2676
2677         pwd_drop(pwd);
2678         *freebuf = buf;
2679
2680         return (0);
2681 out_bad:
2682         pwd_drop(pwd);
2683         free(buf, M_TEMP);
2684         return (error);
2685 }
2686
2687 struct vnode *
2688 vn_dir_dd_ino(struct vnode *vp)
2689 {
2690         struct namecache *ncp;
2691         struct vnode *ddvp;
2692         struct mtx *vlp;
2693         enum vgetstate vs;
2694
2695         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2696         vlp = VP2VNODELOCK(vp);
2697         mtx_lock(vlp);
2698         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2699                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2700                         continue;
2701                 ddvp = ncp->nc_dvp;
2702                 vs = vget_prep(ddvp);
2703                 mtx_unlock(vlp);
2704                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2705                         return (NULL);
2706                 return (ddvp);
2707         }
2708         mtx_unlock(vlp);
2709         return (NULL);
2710 }
2711
2712 int
2713 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2714 {
2715         struct namecache *ncp;
2716         struct mtx *vlp;
2717         int l;
2718
2719         vlp = VP2VNODELOCK(vp);
2720         mtx_lock(vlp);
2721         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2722                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2723                         break;
2724         if (ncp == NULL) {
2725                 mtx_unlock(vlp);
2726                 return (ENOENT);
2727         }
2728         l = min(ncp->nc_nlen, buflen - 1);
2729         memcpy(buf, ncp->nc_name, l);
2730         mtx_unlock(vlp);
2731         buf[l] = '\0';
2732         return (0);
2733 }
2734
2735 /*
2736  * This function updates path string to vnode's full global path
2737  * and checks the size of the new path string against the pathlen argument.
2738  *
2739  * Requires a locked, referenced vnode.
2740  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2741  *
2742  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2743  * because it falls back to the ".." lookup if the namecache lookup fails.
2744  */
2745 int
2746 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2747     u_int pathlen)
2748 {
2749         struct nameidata nd;
2750         struct vnode *vp1;
2751         char *rpath, *fbuf;
2752         int error;
2753
2754         ASSERT_VOP_ELOCKED(vp, __func__);
2755
2756         /* Construct global filesystem path from vp. */
2757         VOP_UNLOCK(vp);
2758         error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2759
2760         if (error != 0) {
2761                 vrele(vp);
2762                 return (error);
2763         }
2764
2765         if (strlen(rpath) >= pathlen) {
2766                 vrele(vp);
2767                 error = ENAMETOOLONG;
2768                 goto out;
2769         }
2770
2771         /*
2772          * Re-lookup the vnode by path to detect a possible rename.
2773          * As a side effect, the vnode is relocked.
2774          * If vnode was renamed, return ENOENT.
2775          */
2776         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2777             UIO_SYSSPACE, path, td);
2778         error = namei(&nd);
2779         if (error != 0) {
2780                 vrele(vp);
2781                 goto out;
2782         }
2783         NDFREE(&nd, NDF_ONLY_PNBUF);
2784         vp1 = nd.ni_vp;
2785         vrele(vp);
2786         if (vp1 == vp)
2787                 strcpy(path, rpath);
2788         else {
2789                 vput(vp1);
2790                 error = ENOENT;
2791         }
2792
2793 out:
2794         free(fbuf, M_TEMP);
2795         return (error);
2796 }
2797
2798 #ifdef DDB
2799 static void
2800 db_print_vpath(struct vnode *vp)
2801 {
2802
2803         while (vp != NULL) {
2804                 db_printf("%p: ", vp);
2805                 if (vp == rootvnode) {
2806                         db_printf("/");
2807                         vp = NULL;
2808                 } else {
2809                         if (vp->v_vflag & VV_ROOT) {
2810                                 db_printf("<mount point>");
2811                                 vp = vp->v_mount->mnt_vnodecovered;
2812                         } else {
2813                                 struct namecache *ncp;
2814                                 char *ncn;
2815                                 int i;
2816
2817                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2818                                 if (ncp != NULL) {
2819                                         ncn = ncp->nc_name;
2820                                         for (i = 0; i < ncp->nc_nlen; i++)
2821                                                 db_printf("%c", *ncn++);
2822                                         vp = ncp->nc_dvp;
2823                                 } else {
2824                                         vp = NULL;
2825                                 }
2826                         }
2827                 }
2828                 db_printf("\n");
2829         }
2830
2831         return;
2832 }
2833
2834 DB_SHOW_COMMAND(vpath, db_show_vpath)
2835 {
2836         struct vnode *vp;
2837
2838         if (!have_addr) {
2839                 db_printf("usage: show vpath <struct vnode *>\n");
2840                 return;
2841         }
2842
2843         vp = (struct vnode *)addr;
2844         db_print_vpath(vp);
2845 }
2846
2847 #endif
2848
2849 extern uma_zone_t namei_zone;
2850
2851 static bool __read_frequently cache_fast_lookup = true;
2852 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
2853     &cache_fast_lookup, 0, "");
2854
2855 #define CACHE_FPL_FAILED        -2020
2856
2857 static void
2858 cache_fpl_cleanup_cnp(struct componentname *cnp)
2859 {
2860
2861         uma_zfree(namei_zone, cnp->cn_pnbuf);
2862 #ifdef DIAGNOSTIC
2863         cnp->cn_pnbuf = NULL;
2864         cnp->cn_nameptr = NULL;
2865 #endif
2866 }
2867
2868 static void
2869 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
2870 {
2871         struct componentname *cnp;
2872
2873         cnp = &ndp->ni_cnd;
2874         while (*(cnp->cn_nameptr) == '/') {
2875                 cnp->cn_nameptr++;
2876                 ndp->ni_pathlen--;
2877         }
2878
2879         *dpp = ndp->ni_rootdir;
2880 }
2881
2882 /*
2883  * Components of nameidata (or objects it can point to) which may
2884  * need restoring in case fast path lookup fails.
2885  */
2886 struct nameidata_saved {
2887         int cn_flags;
2888         long cn_namelen;
2889         char *cn_nameptr;
2890         size_t ni_pathlen;
2891 };
2892
2893 struct cache_fpl {
2894         int line;
2895         enum cache_fpl_status status;
2896         bool in_smr;
2897         struct nameidata *ndp;
2898         struct nameidata_saved snd;
2899         struct componentname *cnp;
2900         struct vnode *dvp;
2901         seqc_t dvp_seqc;
2902         struct vnode *tvp;
2903         seqc_t tvp_seqc;
2904         struct pwd *pwd;
2905 };
2906
2907 static void
2908 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
2909 {
2910
2911         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
2912         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
2913         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
2914         snd->ni_pathlen = fpl->ndp->ni_pathlen;
2915 }
2916
2917 static void
2918 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
2919 {
2920
2921         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
2922         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
2923         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
2924         fpl->ndp->ni_pathlen = snd->ni_pathlen;
2925 }
2926
2927 #ifdef INVARIANTS
2928 #define cache_fpl_smr_assert_entered(fpl) ({                    \
2929         struct cache_fpl *_fpl = (fpl);                         \
2930         MPASS(_fpl->in_smr == true);                            \
2931         VFS_SMR_ASSERT_ENTERED();                               \
2932 })
2933 #define cache_fpl_smr_assert_not_entered(fpl) ({                \
2934         struct cache_fpl *_fpl = (fpl);                         \
2935         MPASS(_fpl->in_smr == false);                           \
2936         VFS_SMR_ASSERT_NOT_ENTERED();                           \
2937 })
2938 #else
2939 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
2940 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
2941 #endif
2942
2943 #define cache_fpl_smr_enter(fpl) ({                             \
2944         struct cache_fpl *_fpl = (fpl);                         \
2945         MPASS(_fpl->in_smr == false);                           \
2946         vfs_smr_enter();                                        \
2947         _fpl->in_smr = true;                                    \
2948 })
2949
2950 #define cache_fpl_smr_exit(fpl) ({                              \
2951         struct cache_fpl *_fpl = (fpl);                         \
2952         MPASS(_fpl->in_smr == true);                            \
2953         vfs_smr_exit();                                         \
2954         _fpl->in_smr = false;                                   \
2955 })
2956
2957 static int
2958 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
2959 {
2960
2961         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
2962                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
2963                     ("%s: converting to abort from %d at %d, set at %d\n",
2964                     __func__, fpl->status, line, fpl->line));
2965         }
2966         fpl->status = CACHE_FPL_STATUS_ABORTED;
2967         fpl->line = line;
2968         return (CACHE_FPL_FAILED);
2969 }
2970
2971 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
2972
2973 static int
2974 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
2975 {
2976
2977         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2978             ("%s: setting to partial at %d, but already set to %d at %d\n",
2979             __func__, line, fpl->status, fpl->line));
2980         cache_fpl_smr_assert_entered(fpl);
2981         fpl->status = CACHE_FPL_STATUS_PARTIAL;
2982         fpl->line = line;
2983         return (CACHE_FPL_FAILED);
2984 }
2985
2986 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
2987
2988 static int
2989 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
2990 {
2991
2992         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
2993             ("%s: setting to handled at %d, but already set to %d at %d\n",
2994             __func__, line, fpl->status, fpl->line));
2995         cache_fpl_smr_assert_not_entered(fpl);
2996         MPASS(error != CACHE_FPL_FAILED);
2997         fpl->status = CACHE_FPL_STATUS_HANDLED;
2998         fpl->line = line;
2999         return (error);
3000 }
3001
3002 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3003
3004 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3005         (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
3006
3007 static bool
3008 cache_can_fplookup(struct cache_fpl *fpl)
3009 {
3010         struct nameidata *ndp;
3011         struct componentname *cnp;
3012         struct thread *td;
3013
3014         ndp = fpl->ndp;
3015         cnp = fpl->cnp;
3016         td = cnp->cn_thread;
3017
3018         if (!cache_fast_lookup) {
3019                 cache_fpl_aborted(fpl);
3020                 return (false);
3021         }
3022 #ifdef MAC
3023         if (mac_vnode_check_lookup_enabled()) {
3024                 cache_fpl_aborted(fpl);
3025                 return (false);
3026         }
3027 #endif
3028         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3029                 cache_fpl_aborted(fpl);
3030                 return (false);
3031         }
3032         if (cnp->cn_nameiop != LOOKUP) {
3033                 cache_fpl_aborted(fpl);
3034                 return (false);
3035         }
3036         if (ndp->ni_dirfd != AT_FDCWD) {
3037                 cache_fpl_aborted(fpl);
3038                 return (false);
3039         }
3040         if (IN_CAPABILITY_MODE(td)) {
3041                 cache_fpl_aborted(fpl);
3042                 return (false);
3043         }
3044         if (AUDITING_TD(td)) {
3045                 cache_fpl_aborted(fpl);
3046                 return (false);
3047         }
3048         if (ndp->ni_startdir != NULL) {
3049                 cache_fpl_aborted(fpl);
3050                 return (false);
3051         }
3052         return (true);
3053 }
3054
3055 static bool
3056 cache_fplookup_vnode_supported(struct vnode *vp)
3057 {
3058
3059         return (vp->v_type != VLNK);
3060 }
3061
3062 /*
3063  * The target vnode is not supported, prepare for the slow path to take over.
3064  */
3065 static int
3066 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3067 {
3068         struct componentname *cnp;
3069         enum vgetstate dvs;
3070         struct vnode *dvp;
3071         struct pwd *pwd;
3072         seqc_t dvp_seqc;
3073
3074         cnp = fpl->cnp;
3075         dvp = fpl->dvp;
3076         dvp_seqc = fpl->dvp_seqc;
3077
3078         dvs = vget_prep_smr(dvp);
3079         if (dvs == VGET_NONE) {
3080                 cache_fpl_smr_exit(fpl);
3081                 return (cache_fpl_aborted(fpl));
3082         }
3083
3084         cache_fpl_smr_exit(fpl);
3085
3086         vget_finish_ref(dvp, dvs);
3087         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3088                 vrele(dvp);
3089                 return (cache_fpl_aborted(fpl));
3090         }
3091
3092         pwd = pwd_hold(curthread);
3093         if (fpl->pwd != pwd) {
3094                 vrele(dvp);
3095                 pwd_drop(pwd);
3096                 return (cache_fpl_aborted(fpl));
3097         }
3098
3099         fpl->ndp->ni_startdir = dvp;
3100         return (0);
3101 }
3102
3103 static int
3104 cache_fplookup_final(struct cache_fpl *fpl)
3105 {
3106         struct componentname *cnp;
3107         enum vgetstate tvs;
3108         struct vnode *dvp, *tvp;
3109         seqc_t dvp_seqc, tvp_seqc;
3110         int error;
3111
3112         cnp = fpl->cnp;
3113         dvp = fpl->dvp;
3114         dvp_seqc = fpl->dvp_seqc;
3115         tvp = fpl->tvp;
3116         tvp_seqc = fpl->tvp_seqc;
3117
3118         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3119
3120         tvs = vget_prep_smr(tvp);
3121         if (tvs == VGET_NONE) {
3122                 return (cache_fpl_partial(fpl));
3123         }
3124
3125         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3126                 cache_fpl_smr_exit(fpl);
3127                 vget_abort(tvp, tvs);
3128                 return (cache_fpl_aborted(fpl));
3129         }
3130
3131         cache_fpl_smr_exit(fpl);
3132
3133         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3134                 error = vget_finish(tvp, cnp->cn_lkflags, tvs);
3135                 if (error != 0) {
3136                         return (cache_fpl_aborted(fpl));
3137                 }
3138         } else {
3139                 vget_finish_ref(tvp, tvs);
3140         }
3141
3142         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3143                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3144                         vput(tvp);
3145                 else
3146                         vrele(tvp);
3147                 return (cache_fpl_aborted(fpl));
3148         }
3149
3150         return (cache_fpl_handled(fpl, 0));
3151 }
3152
3153 static int
3154 cache_fplookup_next(struct cache_fpl *fpl)
3155 {
3156         struct componentname *cnp;
3157         struct namecache *ncp;
3158         struct negstate *negstate;
3159         struct vnode *dvp, *tvp;
3160         u_char nc_flag;
3161         uint32_t hash;
3162         bool neg_hot;
3163
3164         cnp = fpl->cnp;
3165         dvp = fpl->dvp;
3166
3167         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
3168                 fpl->tvp = dvp;
3169                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3170                 if (seqc_in_modify(fpl->tvp_seqc)) {
3171                         return (cache_fpl_aborted(fpl));
3172                 }
3173                 return (0);
3174         }
3175
3176         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
3177
3178         CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
3179                 counter_u64_add(numchecks, 1);
3180                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
3181                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
3182                         break;
3183         }
3184
3185         /*
3186          * If there is no entry we have to punt to the slow path to perform
3187          * actual lookup. Should there be nothing with this name a negative
3188          * entry will be created.
3189          */
3190         if (__predict_false(ncp == NULL)) {
3191                 return (cache_fpl_partial(fpl));
3192         }
3193
3194         tvp = atomic_load_ptr(&ncp->nc_vp);
3195         nc_flag = atomic_load_char(&ncp->nc_flag);
3196         if ((nc_flag & NCF_NEGATIVE) != 0) {
3197                 negstate = NCP2NEGSTATE(ncp);
3198                 neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
3199                 if (__predict_false(cache_ncp_invalid(ncp))) {
3200                         return (cache_fpl_partial(fpl));
3201                 }
3202                 if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
3203                         return (cache_fpl_partial(fpl));
3204                 }
3205                 if (!neg_hot) {
3206                         /*
3207                          * TODO
3208                          * Promoting to hot negative requires locks, thus is
3209                          * left not yet supported for simplicity.
3210                          */
3211                         return (cache_fpl_partial(fpl));
3212                 }
3213                 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
3214                     ncp->nc_name);
3215                 counter_u64_add(numneghits, 1);
3216                 cache_fpl_smr_exit(fpl);
3217                 return (cache_fpl_handled(fpl, ENOENT));
3218         }
3219
3220         if (__predict_false(cache_ncp_invalid(ncp))) {
3221                 return (cache_fpl_partial(fpl));
3222         }
3223
3224         fpl->tvp = tvp;
3225         fpl->tvp_seqc = vn_seqc_read_any(tvp);
3226         if (seqc_in_modify(fpl->tvp_seqc)) {
3227                 return (cache_fpl_partial(fpl));
3228         }
3229
3230         if (!cache_fplookup_vnode_supported(tvp)) {
3231                 return (cache_fpl_partial(fpl));
3232         }
3233
3234         counter_u64_add(numposhits, 1);
3235         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
3236         return (0);
3237 }
3238
3239 static bool
3240 cache_fplookup_mp_supported(struct mount *mp)
3241 {
3242
3243         if (mp == NULL)
3244                 return (false);
3245         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
3246                 return (false);
3247         if ((mp->mnt_flag & MNT_UNION) != 0)
3248                 return (false);
3249         return (true);
3250 }
3251
3252 /*
3253  * Walk up the mount stack (if any).
3254  *
3255  * Correctness is provided in the following ways:
3256  * - all vnodes are protected from freeing with SMR
3257  * - struct mount objects are type stable making them always safe to access
3258  * - stability of the particular mount is provided by busying it
3259  * - relationship between the vnode which is mounted on and the mount is
3260  *   verified with the vnode sequence counter after busying
3261  * - association between root vnode of the mount and the mount is protected
3262  *   by busy
3263  *
3264  * From that point on we can read the sequence counter of the root vnode
3265  * and get the next mount on the stack (if any) using the same protection.
3266  *
3267  * By the end of successful walk we are guaranteed the reached state was
3268  * indeed present at least at some point which matches the regular lookup.
3269  */
3270 static int
3271 cache_fplookup_climb_mount(struct cache_fpl *fpl)
3272 {
3273         struct mount *mp, *prev_mp;
3274         struct vnode *vp;
3275         seqc_t vp_seqc;
3276
3277         vp = fpl->tvp;
3278         vp_seqc = fpl->tvp_seqc;
3279         if (vp->v_type != VDIR)
3280                 return (0);
3281
3282         mp = atomic_load_ptr(&vp->v_mountedhere);
3283         if (mp == NULL)
3284                 return (0);
3285
3286         prev_mp = NULL;
3287         for (;;) {
3288                 if (!vfs_op_thread_enter(mp)) {
3289                         if (prev_mp != NULL)
3290                                 vfs_op_thread_exit(prev_mp);
3291                         return (cache_fpl_partial(fpl));
3292                 }
3293                 if (prev_mp != NULL)
3294                         vfs_op_thread_exit(prev_mp);
3295                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3296                         vfs_op_thread_exit(mp);
3297                         return (cache_fpl_partial(fpl));
3298                 }
3299                 if (!cache_fplookup_mp_supported(mp)) {
3300                         vfs_op_thread_exit(mp);
3301                         return (cache_fpl_partial(fpl));
3302                 }
3303                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
3304                 if (vp == NULL || VN_IS_DOOMED(vp)) {
3305                         vfs_op_thread_exit(mp);
3306                         return (cache_fpl_partial(fpl));
3307                 }
3308                 vp_seqc = vn_seqc_read_any(vp);
3309                 if (seqc_in_modify(vp_seqc)) {
3310                         vfs_op_thread_exit(mp);
3311                         return (cache_fpl_partial(fpl));
3312                 }
3313                 prev_mp = mp;
3314                 mp = atomic_load_ptr(&vp->v_mountedhere);
3315                 if (mp == NULL)
3316                         break;
3317         }
3318
3319         vfs_op_thread_exit(prev_mp);
3320         fpl->tvp = vp;
3321         fpl->tvp_seqc = vp_seqc;
3322         return (0);
3323 }
3324
3325 /*
3326  * Parse the path.
3327  *
3328  * The code is mostly copy-pasted from regular lookup, see lookup().
3329  * The structure is maintained along with comments for easier maintenance.
3330  * Deduplicating the code will become feasible after fast path lookup
3331  * becomes more feature-complete.
3332  */
3333 static int
3334 cache_fplookup_parse(struct cache_fpl *fpl)
3335 {
3336         struct nameidata *ndp;
3337         struct componentname *cnp;
3338         char *cp;
3339         char *prev_ni_next;             /* saved ndp->ni_next */
3340         size_t prev_ni_pathlen;         /* saved ndp->ni_pathlen */
3341
3342         ndp = fpl->ndp;
3343         cnp = fpl->cnp;
3344
3345         /*
3346          * Search a new directory.
3347          *
3348          * The last component of the filename is left accessible via
3349          * cnp->cn_nameptr for callers that need the name. Callers needing
3350          * the name set the SAVENAME flag. When done, they assume
3351          * responsibility for freeing the pathname buffer.
3352          */
3353         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
3354                 continue;
3355         cnp->cn_namelen = cp - cnp->cn_nameptr;
3356         if (cnp->cn_namelen > NAME_MAX) {
3357                 cache_fpl_smr_exit(fpl);
3358                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
3359         }
3360         prev_ni_pathlen = ndp->ni_pathlen;
3361         ndp->ni_pathlen -= cnp->cn_namelen;
3362         KASSERT(ndp->ni_pathlen <= PATH_MAX,
3363             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
3364         prev_ni_next = ndp->ni_next;
3365         ndp->ni_next = cp;
3366
3367         /*
3368          * Replace multiple slashes by a single slash and trailing slashes
3369          * by a null.  This must be done before VOP_LOOKUP() because some
3370          * fs's don't know about trailing slashes.  Remember if there were
3371          * trailing slashes to handle symlinks, existing non-directories
3372          * and non-existing files that won't be directories specially later.
3373          */
3374         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
3375                 cp++;
3376                 ndp->ni_pathlen--;
3377                 if (*cp == '\0') {
3378                         /*
3379                          * TODO
3380                          * Regular lookup performs the following:
3381                          * *ndp->ni_next = '\0';
3382                          * cnp->cn_flags |= TRAILINGSLASH;
3383                          *
3384                          * Which is problematic since it modifies data read
3385                          * from userspace. Then if fast path lookup was to
3386                          * abort we would have to either restore it or convey
3387                          * the flag. Since this is a corner case just ignore
3388                          * it for simplicity.
3389                          */
3390                         return (cache_fpl_partial(fpl));
3391                 }
3392         }
3393         ndp->ni_next = cp;
3394
3395         cnp->cn_flags |= MAKEENTRY;
3396
3397         if (cnp->cn_namelen == 2 &&
3398             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3399                 cnp->cn_flags |= ISDOTDOT;
3400         else
3401                 cnp->cn_flags &= ~ISDOTDOT;
3402         if (*ndp->ni_next == 0)
3403                 cnp->cn_flags |= ISLASTCN;
3404         else
3405                 cnp->cn_flags &= ~ISLASTCN;
3406
3407         /*
3408          * Check for degenerate name (e.g. / or "")
3409          * which is a way of talking about a directory,
3410          * e.g. like "/." or ".".
3411          *
3412          * TODO
3413          * Another corner case handled by the regular lookup
3414          */
3415         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
3416                 return (cache_fpl_partial(fpl));
3417         }
3418         return (0);
3419 }
3420
3421 static void
3422 cache_fplookup_parse_advance(struct cache_fpl *fpl)
3423 {
3424         struct nameidata *ndp;
3425         struct componentname *cnp;
3426
3427         ndp = fpl->ndp;
3428         cnp = fpl->cnp;
3429
3430         cnp->cn_nameptr = ndp->ni_next;
3431         while (*cnp->cn_nameptr == '/') {
3432                 cnp->cn_nameptr++;
3433                 ndp->ni_pathlen--;
3434         }
3435 }
3436
3437 static int
3438 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
3439 {
3440         struct nameidata *ndp;
3441         struct componentname *cnp;
3442         struct mount *mp;
3443         int error;
3444
3445         error = CACHE_FPL_FAILED;
3446         ndp = fpl->ndp;
3447         ndp->ni_lcf = 0;
3448         cnp = fpl->cnp;
3449         cnp->cn_lkflags = LK_SHARED;
3450         if ((cnp->cn_flags & LOCKSHARED) == 0)
3451                 cnp->cn_lkflags = LK_EXCLUSIVE;
3452
3453         cache_fpl_checkpoint(fpl, &fpl->snd);
3454
3455         fpl->dvp = dvp;
3456         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
3457         if (seqc_in_modify(fpl->dvp_seqc)) {
3458                 cache_fpl_aborted(fpl);
3459                 goto out;
3460         }
3461         mp = atomic_load_ptr(&fpl->dvp->v_mount);
3462         if (!cache_fplookup_mp_supported(mp)) {
3463                 cache_fpl_aborted(fpl);
3464                 goto out;
3465         }
3466
3467         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3468
3469         for (;;) {
3470                 error = cache_fplookup_parse(fpl);
3471                 if (__predict_false(error != 0)) {
3472                         break;
3473                 }
3474
3475                 if (cnp->cn_flags & ISDOTDOT) {
3476                         error = cache_fpl_partial(fpl);
3477                         break;
3478                 }
3479
3480                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
3481
3482                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
3483                 if (__predict_false(error != 0)) {
3484                         switch (error) {
3485                         case EAGAIN:
3486                         case EOPNOTSUPP: /* can happen when racing against vgone */
3487                                 cache_fpl_partial(fpl);
3488                                 break;
3489                         default:
3490                                 /*
3491                                  * See the API contract for VOP_FPLOOKUP_VEXEC.
3492                                  */
3493                                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3494                                         error = cache_fpl_aborted(fpl);
3495                                 } else {
3496                                         cache_fpl_smr_exit(fpl);
3497                                         cache_fpl_handled(fpl, error);
3498                                 }
3499                                 break;
3500                         }
3501                         break;
3502                 }
3503
3504                 error = cache_fplookup_next(fpl);
3505                 if (__predict_false(error != 0)) {
3506                         break;
3507                 }
3508
3509                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3510
3511                 error = cache_fplookup_climb_mount(fpl);
3512                 if (__predict_false(error != 0)) {
3513                         break;
3514                 }
3515
3516                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
3517
3518                 if (cnp->cn_flags & ISLASTCN) {
3519                         error = cache_fplookup_final(fpl);
3520                         break;
3521                 }
3522
3523                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
3524                         error = cache_fpl_aborted(fpl);
3525                         break;
3526                 }
3527
3528                 fpl->dvp = fpl->tvp;
3529                 fpl->dvp_seqc = fpl->tvp_seqc;
3530
3531                 cache_fplookup_parse_advance(fpl);
3532                 cache_fpl_checkpoint(fpl, &fpl->snd);
3533         }
3534 out:
3535         switch (fpl->status) {
3536         case CACHE_FPL_STATUS_UNSET:
3537                 __assert_unreachable();
3538                 break;
3539         case CACHE_FPL_STATUS_PARTIAL:
3540                 cache_fpl_smr_assert_entered(fpl);
3541                 return (cache_fplookup_partial_setup(fpl));
3542         case CACHE_FPL_STATUS_ABORTED:
3543                 if (fpl->in_smr)
3544                         cache_fpl_smr_exit(fpl);
3545                 return (CACHE_FPL_FAILED);
3546         case CACHE_FPL_STATUS_HANDLED:
3547                 cache_fpl_smr_assert_not_entered(fpl);
3548                 if (__predict_false(error != 0)) {
3549                         ndp->ni_dvp = NULL;
3550                         ndp->ni_vp = NULL;
3551                         cache_fpl_cleanup_cnp(cnp);
3552                         return (error);
3553                 }
3554                 ndp->ni_dvp = fpl->dvp;
3555                 ndp->ni_vp = fpl->tvp;
3556                 if (cnp->cn_flags & SAVENAME)
3557                         cnp->cn_flags |= HASBUF;
3558                 else
3559                         cache_fpl_cleanup_cnp(cnp);
3560                 return (error);
3561         }
3562 }
3563
3564 /*
3565  * Fast path lookup protected with SMR and sequence counters.
3566  *
3567  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
3568  *
3569  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
3570  * outlined below.
3571  *
3572  * Traditional vnode lookup conceptually looks like this:
3573  *
3574  * vn_lock(current);
3575  * for (;;) {
3576  *      next = find();
3577  *      vn_lock(next);
3578  *      vn_unlock(current);
3579  *      current = next;
3580  *      if (last)
3581  *          break;
3582  * }
3583  * return (current);
3584  *
3585  * Each jump to the next vnode is safe memory-wise and atomic with respect to
3586  * any modifications thanks to holding respective locks.
3587  *
3588  * The same guarantee can be provided with a combination of safe memory
3589  * reclamation and sequence counters instead. If all operations which affect
3590  * the relationship between the current vnode and the one we are looking for
3591  * also modify the counter, we can verify whether all the conditions held as
3592  * we made the jump. This includes things like permissions, mount points etc.
3593  * Counter modification is provided by enclosing relevant places in
3594  * vn_seqc_write_begin()/end() calls.
3595  *
3596  * Thus this translates to:
3597  *
3598  * vfs_smr_enter();
3599  * dvp_seqc = seqc_read_any(dvp);
3600  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
3601  *     abort();
3602  * for (;;) {
3603  *      tvp = find();
3604  *      tvp_seqc = seqc_read_any(tvp);
3605  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
3606  *          abort();
3607  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
3608  *          abort();
3609  *      dvp = tvp; // we know nothing of importance has changed
3610  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
3611  *      if (last)
3612  *          break;
3613  * }
3614  * vget(); // secure the vnode
3615  * if (!seqc_consistent(tvp, tvp_seqc) // final check
3616  *          abort();
3617  * // at this point we know nothing has changed for any parent<->child pair
3618  * // as they were crossed during the lookup, meaning we matched the guarantee
3619  * // of the locked variant
3620  * return (tvp);
3621  *
3622  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
3623  * - they are called while within vfs_smr protection which they must never exit
3624  * - EAGAIN can be returned to denote checking could not be performed, it is
3625  *   always valid to return it
3626  * - if the sequence counter has not changed the result must be valid
3627  * - if the sequence counter has changed both false positives and false negatives
3628  *   are permitted (since the result will be rejected later)
3629  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
3630  *
3631  * Caveats to watch out for:
3632  * - vnodes are passed unlocked and unreferenced with nothing stopping
3633  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
3634  *   to use atomic_load_ptr to fetch it.
3635  * - the aforementioned object can also get freed, meaning absent other means it
3636  *   should be protected with vfs_smr
3637  * - either safely checking permissions as they are modified or guaranteeing
3638  *   their stability is left to the routine
3639  */
3640 int
3641 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
3642     struct pwd **pwdp)
3643 {
3644         struct cache_fpl fpl;
3645         struct pwd *pwd;
3646         struct vnode *dvp;
3647         struct componentname *cnp;
3648         struct nameidata_saved orig;
3649         int error;
3650
3651         *status = CACHE_FPL_STATUS_UNSET;
3652         bzero(&fpl, sizeof(fpl));
3653         fpl.status = CACHE_FPL_STATUS_UNSET;
3654         fpl.ndp = ndp;
3655         fpl.cnp = &ndp->ni_cnd;
3656         MPASS(curthread == fpl.cnp->cn_thread);
3657
3658         if (!cache_can_fplookup(&fpl)) {
3659                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3660                 *status = fpl.status;
3661                 return (EOPNOTSUPP);
3662         }
3663
3664         cache_fpl_checkpoint(&fpl, &orig);
3665
3666         cache_fpl_smr_enter(&fpl);
3667         pwd = pwd_get_smr();
3668         fpl.pwd = pwd;
3669         ndp->ni_rootdir = pwd->pwd_rdir;
3670         ndp->ni_topdir = pwd->pwd_jdir;
3671
3672         cnp = fpl.cnp;
3673         cnp->cn_nameptr = cnp->cn_pnbuf;
3674         if (cnp->cn_pnbuf[0] == '/') {
3675                 cache_fpl_handle_root(ndp, &dvp);
3676         } else {
3677                 MPASS(ndp->ni_dirfd == AT_FDCWD);
3678                 dvp = pwd->pwd_cdir;
3679         }
3680
3681         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
3682
3683         error = cache_fplookup_impl(dvp, &fpl);
3684         cache_fpl_smr_assert_not_entered(&fpl);
3685         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
3686
3687         *status = fpl.status;
3688         switch (fpl.status) {
3689         case CACHE_FPL_STATUS_UNSET:
3690                 __assert_unreachable();
3691                 break;
3692         case CACHE_FPL_STATUS_HANDLED:
3693                 SDT_PROBE3(vfs, namei, lookup, return, error,
3694                     (error == 0 ? ndp->ni_vp : NULL), true);
3695                 break;
3696         case CACHE_FPL_STATUS_PARTIAL:
3697                 *pwdp = fpl.pwd;
3698                 cache_fpl_restore(&fpl, &fpl.snd);
3699                 break;
3700         case CACHE_FPL_STATUS_ABORTED:
3701                 cache_fpl_restore(&fpl, &orig);
3702                 break;
3703         }
3704         return (error);
3705 }