sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  83     "Name cache");
  84
  85 SDT_PROVIDER_DECLARE(vfs);
  86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  87     "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  89     "struct vnode *");
  90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  91     "char *");
  92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  93     "const char *");
  94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  95     "struct namecache *", "int", "int");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  98     "char *", "struct vnode *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 101     "struct vnode *", "char *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 105     "struct vnode *", "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 107     "char *");
 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 109     "struct componentname *");
 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 111     "struct componentname *");
 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 113 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 114 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 115 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 116     "struct vnode *");
 117 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 118     "char *");
 119 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 120     "char *");
 121
 122 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 123 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 124 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 125
 126 /*
 127  * This structure describes the elements in the cache of recent
 128  * names looked up by namei.
 129  */
  130 struct negstate {  /* per-entry state of a negative entry; overlays nu_vp in struct namecache */
  131         u_char neg_flag;        /* NEG_* bits (see NEG_HOT) */
  132         u_char neg_hit;         /* presumably a hit counter -- TODO confirm against users */
  133 };
      /* Guarantees that union n_un in struct namecache stays the size of a pointer. */
  134 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
  135     "the state must fit in a union with a pointer without growing it");
 136
  137 struct  namecache {
      /*
       * One cached name.  The name bytes are stored directly behind the
       * structure (nc_name), which is why items are allocated with
       * sizeof(struct namecache) plus room for the name and its nul.
       * Access n_un through the nc_vp / nc_neg macros.
       */
  138         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
  139         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
  140         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
  141         struct  vnode *nc_dvp;          /* vnode of parent of name */
  142         union {
  143                 struct  vnode *nu_vp;   /* vnode the name refers to */
  144                 struct  negstate nu_neg;/* negative entry state (NCF_NEGATIVE set) */
  145         } n_un;
  146         u_char  nc_flag;                /* NCF_* flag bits */
  147         u_char  nc_nlen;                /* length of name, nul not counted */
  148         char    nc_name[0];             /* segment name + nul */
  149 };
 150
 151 /*
 152  * struct namecache_ts repeats struct namecache layout up to the
 153  * nc_nlen member.
 154  * struct namecache_ts is used in place of struct namecache when time(s) need
 155  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 156  * both a non-dotdot directory name plus dotdot for the directory's
 157  * parent.
 158  *
 159  * See below for alignment requirement.
 160  */
  161 struct  namecache_ts {
      /*
       * Entries flagged NCF_TS are allocated as this structure; the rest of
       * the code is handed &nc_nc and gets back here with __containerof().
       */
  162         struct  timespec nc_time;       /* timespec provided by fs */
  163         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
  164         int     nc_ticks;               /* ticks value when entry was added */
  165         struct namecache nc_nc;         /* embedded entry; last, so the name bytes trail it */
  166 };
 167
  168 /*
  169  * At least mips n32 performs 64-bit accesses to timespec as found
  170  * in namecache_ts and requires them to be aligned. Since others
  171  * may be in the same spot suffer a little bit and enforce the
  172  * alignment for everyone. Note this is a nop for 64-bit platforms.
  173  */
  174 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
  175 #define CACHE_PATH_CUTOFF       39      /* longest name length served from the "small" zones */
  176
      /*
       * Item sizes of the four zones: the entry header, the longest name the
       * zone has to hold and one byte for the terminating nul.
       */
  177 #define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
  178 #define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
  179 #define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
  180 #define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)
  181
      /*
       * Every item size must be a multiple of the enforced alignment.
       * NOTE(review): the "+ 1" presumably turns an alignment mask returned by
       * UMA_ALIGNOF() back into a byte count -- confirm against vm/uma.h.
       */
  182 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  183 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  184 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
  185 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 186
 187 #define nc_vp           n_un.nu_vp
 188 #define nc_neg          n_un.nu_neg
 189
  190 /*
  191  * Flags in namecache.nc_flag
  192  */
  193 #define NCF_WHITE       0x01    /* presumably a whiteout entry -- TODO confirm */
  194 #define NCF_ISDOTDOT    0x02    /* presumably the entry is a ".." mapping -- TODO confirm */
  195 #define NCF_TS          0x04    /* entry is embedded in a struct namecache_ts */
  196 #define NCF_DTS         0x08    /* presumably nc_dotdottime is valid -- TODO confirm */
  197 #define NCF_DVDROP      0x10    /* cache_free() has to vdrop() nc_dvp */
  198 #define NCF_NEGATIVE    0x20    /* negative entry; n_un holds struct negstate */
  199 #define NCF_INVALID     0x40    /* set by cache_ncp_invalidate(); entry must not be used */
  200 #define NCF_WIP         0x80    /* cache_ncp_canuse() rejects the entry while this is set */
  201
  202 /*
  203  * Flags in negstate.neg_flag
  204  */
  205 #define NEG_HOT         0x01    /* presumably the entry is on the hot list -- TODO confirm */
  206
 207 /*
 208  * Mark an entry as invalid.
 209  *
 210  * This is called before it starts getting deconstructed.
 211  */
 212 static void
 213 cache_ncp_invalidate(struct namecache *ncp)
 214 {
 215
 216         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 217             ("%s: entry %p already invalid", __func__, ncp));
 218         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 219         atomic_thread_fence_rel();
 220 }
 221
 222 /*
 223  * Check whether the entry can be safely used.
 224  *
 225  * All places which elide locks are supposed to call this after they are
 226  * done with reading from an entry.
 227  */
 228 static bool
 229 cache_ncp_canuse(struct namecache *ncp)
 230 {
 231
 232         atomic_thread_fence_acq();
 233         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 234 }
 235
 236 /*
 237  * Name caching works as follows:
 238  *
 239  * Names found by directory scans are retained in a cache
 240  * for future reference.  It is managed LRU, so frequently
 241  * used names will hang around.  Cache is indexed by hash value
 242  * obtained from (dvp, name) where dvp refers to the directory
 243  * containing name.
 244  *
 245  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 246  * exist) the vnode pointer will be NULL.
 247  *
 248  * Upon reaching the last segment of a path, if the reference
 249  * is for DELETE, or NOCACHE is set (rewrite), and the
 250  * name is located in the cache, it will be dropped.
 251  *
 252  * These locks are used (in the order in which they can be taken):
 253  * NAME         TYPE    ROLE
 254  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 255  * bucketlock   mtx     for access to given set of hash buckets
 256  * neglist      mtx     negative entry LRU management
 257  *
 258  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 259  * order is lower address first. Both are recursive.
 260  *
 261  * "." lookups are lockless.
 262  *
 263  * ".." and vnode -> name lookups require vnodelock.
 264  *
 265  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 266  *
 267  * Insertions and removals of entries require involved vnodes and bucketlocks
 268  * to be locked to provide safe operation against other threads modifying the
 269  * cache.
 270  *
 271  * Some lookups result in removal of the found entry (e.g. getting rid of a
 272  * negative entry with the intent to create a positive one), which poses a
 273  * problem when multiple threads reach the state. Similarly, two different
 274  * threads can purge two different vnodes and try to remove the same name.
 275  *
 276  * If the already held vnode lock is lower than the second required lock, we
 277  * can just take the other lock. However, in the opposite case, this could
 278  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 279  * the first node, locking everything in order and revalidating the state.
 280  */
 281
 282 VFS_SMR_DECLARE;
 283
 284 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 285     "Name cache parameters");
 286
 287 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 288 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 289     "Total namecache capacity");
 290
 291 u_int ncsizefactor = 2;
 292 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 293     "Size factor for namecache");
 294
 295 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 296 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 297     "Ratio of negative namecache entries");
 298
 299 /*
  300  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 301  *
 302  * Check cache_neg_evict_cond for details.
 303  */
 304 static u_int ncnegminpct = 3;
 305
 306 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 307 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 308     "Negative entry count above which automatic eviction is allowed");
 309
 310 /*
 311  * Structures associated with name caching.
 312  */
 313 #define NCHHASH(hash) \
 314         (&nchashtbl[(hash) & nchash])
 315 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 316 static u_long __read_mostly     nchash;                 /* size of hash table */
 317 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 318     "Size of namecache hash table");
 319 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 320 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 321
 322 struct nchstats nchstats;               /* cache effectiveness statistics */
 323
 324 static bool __read_frequently cache_fast_revlookup = true;
 325 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 326     &cache_fast_revlookup, 0, "");
 327
 328 static u_int __exclusive_cache_line neg_cycle;
 329
 330 #define ncneghash       3
 331 #define numneglists     (ncneghash + 1)
 332
  333 struct neglist {  /* one of numneglists lists of negative entries, see NCP2NEGLIST() */
  334         struct mtx              nl_evict_lock;  /* NOTE(review): presumably serializes eviction -- confirm */
  335         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);     /* on its own cache line */
  336         TAILQ_HEAD(, namecache) nl_list;        /* presumably the cold entries -- TODO confirm */
  337         TAILQ_HEAD(, namecache) nl_hotlist;     /* presumably the hot (recently hit) entries */
  338         u_long                  nl_hotnum;      /* presumably the length of nl_hotlist */
  339 } __aligned(CACHE_LINE_SIZE);
 340
 341 static struct neglist neglists[numneglists];
 342
 343 static inline struct neglist *
 344 NCP2NEGLIST(struct namecache *ncp)
 345 {
 346
 347         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 348 }
 349
 350 static inline struct negstate *
 351 NCP2NEGSTATE(struct namecache *ncp)
 352 {
 353
 354         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 355         return (&ncp->nc_neg);
 356 }
 357
 358 #define numbucketlocks (ncbuckethash + 1)
 359 static u_int __read_mostly  ncbuckethash;
 360 static struct mtx_padalign __read_mostly  *bucketlocks;
 361 #define HASH2BUCKETLOCK(hash) \
 362         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 363
 364 #define numvnodelocks (ncvnodehash + 1)
 365 static u_int __read_mostly  ncvnodehash;
 366 static struct mtx __read_mostly *vnodelocks;
 367 static inline struct mtx *
 368 VP2VNODELOCK(struct vnode *vp)
 369 {
 370
 371         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 372 }
 373
 374 /*
 375  * UMA zones for the VFS cache.
 376  *
 377  * The small cache is used for entries with short names, which are the
 378  * most common.  The large cache is used for entries which are too big to
 379  * fit in the small cache.
 380  */
 381 static uma_zone_t __read_mostly cache_zone_small;
 382 static uma_zone_t __read_mostly cache_zone_small_ts;
 383 static uma_zone_t __read_mostly cache_zone_large;
 384 static uma_zone_t __read_mostly cache_zone_large_ts;
 385
 386 static struct namecache *
 387 cache_alloc(int len, int ts)
 388 {
 389         struct namecache_ts *ncp_ts;
 390         struct namecache *ncp;
 391
 392         if (__predict_false(ts)) {
 393                 if (len <= CACHE_PATH_CUTOFF)
 394                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 395                 else
 396                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 397                 ncp = &ncp_ts->nc_nc;
 398         } else {
 399                 if (len <= CACHE_PATH_CUTOFF)
 400                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 401                 else
 402                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 403         }
 404         return (ncp);
 405 }
 406
 407 static void
 408 cache_free(struct namecache *ncp)
 409 {
 410         struct namecache_ts *ncp_ts;
 411
 412         MPASS(ncp != NULL);
 413         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 414                 vdrop(ncp->nc_dvp);
 415         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 416                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 417                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 418                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 419                 else
 420                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 421         } else {
 422                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 423                         uma_zfree_smr(cache_zone_small, ncp);
 424                 else
 425                         uma_zfree_smr(cache_zone_large, ncp);
 426         }
 427 }
 428
 429 static void
 430 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 431 {
 432         struct namecache_ts *ncp_ts;
 433
 434         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 435             (tsp == NULL && ticksp == NULL),
 436             ("No NCF_TS"));
 437
 438         if (tsp == NULL)
 439                 return;
 440
 441         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 442         *tsp = ncp_ts->nc_time;
 443         *ticksp = ncp_ts->nc_ticks;
 444 }
 445
 446 #ifdef DEBUG_CACHE
 447 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 448 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 449     "VFS namecache enabled");
 450 #endif
 451
 452 /* Export size information to userland */
 453 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 454     sizeof(struct namecache), "sizeof(struct namecache)");
 455
  456 /*
  457  * The new name cache statistics
  458  */
  459 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  460     "Name cache statistics");
  461
      /*
       * Helpers for the entries below: "name" becomes the OID under
       * vfs.cache.stats, "varname" is the variable being exported.
       * STATNODE_COUNTER also defines the counter itself.
       */
  462 #define STATNODE_ULONG(name, varname, descr)                                    \
  463         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
  464 #define STATNODE_COUNTER(name, varname, descr)                                  \
  465         static COUNTER_U64_DEFINE_EARLY(varname);                               \
  466         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
  467             descr);
  468 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
  469 STATNODE_ULONG(count, numcache, "Number of cache entries");
  470 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
  471 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
  472 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
      /* NOTE(review): the OID is spelled "dotdothis" (sic) while the counter is dotdothits; renaming would change the exported name. */
  473 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
  474 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
  475 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
      /* NOTE(review): the OID is spelled "posszaps" (sic) while the counter is numposzaps. */
  476 STATNODE_COUNTER(posszaps, numposzaps,
  477     "Number of cache hits (positive) we do not want to cache");
  478 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
  479 STATNODE_COUNTER(negzaps, numnegzaps,
  480     "Number of cache hits (negative) we do not want to cache");
  481 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
  482 /* These count for vn_getcwd(), too. */
  483 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
  484 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
  485 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
  486     "Number of fullpath search errors (VOP_VPTOCNP failures)");
  487 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
  488 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 489
  490 /*
  491  * Debug or developer statistics.
  492  */
  493 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  494     "Name cache debugging");
      /* Same shape as the STATNODE_* helpers, but the OIDs land under vfs.cache.debug. */
  495 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
  496         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
  497 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
  498         static COUNTER_U64_DEFINE_EARLY(varname);                               \
  499         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
  500             descr);
  501 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
  502     "Number of successful removals after relocking");
      /*
       * NOTE(review): the three variables below are declared (signed) long yet
       * exported through SYSCTL_ULONG; presumably only the size has to match --
       * confirm.  Two of them also carry an empty description.
       */
  503 static long zap_bucket_fail;
  504 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
  505 static long zap_bucket_fail2;
  506 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
  507 static long cache_lock_vnodes_cel_3_failures;
  508 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
  509     "Number of times 3-way vnode locking failed");
 510
 511 static void cache_zap_locked(struct namecache *ncp);
 512 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 513     char **freebuf, size_t *buflen);
 514 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 515     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend);
 516 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 517     char **retbuf, size_t *buflen);
 518 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 519     char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
 520
 521 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 522
 523 static inline void
 524 cache_assert_vlp_locked(struct mtx *vlp)
 525 {
 526
 527         if (vlp != NULL)
 528                 mtx_assert(vlp, MA_OWNED);
 529 }
 530
 531 static inline void
 532 cache_assert_vnode_locked(struct vnode *vp)
 533 {
 534         struct mtx *vlp;
 535
 536         vlp = VP2VNODELOCK(vp);
 537         cache_assert_vlp_locked(vlp);
 538 }
 539
 540 /*
 541  * TODO: With the value stored we can do better than computing the hash based
 542  * on the address. The choice of FNV should also be revisited.
 543  */
 544 static void
 545 cache_prehash(struct vnode *vp)
 546 {
 547
 548         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 549 }
 550
 551 static uint32_t
 552 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 553 {
 554
 555         return (fnv_32_buf(name, len, dvp->v_nchash));
 556 }
 557
 558 static inline struct nchashhead *
 559 NCP2BUCKET(struct namecache *ncp)
 560 {
 561         uint32_t hash;
 562
 563         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 564         return (NCHHASH(hash));
 565 }
 566
 567 static inline struct mtx *
 568 NCP2BUCKETLOCK(struct namecache *ncp)
 569 {
 570         uint32_t hash;
 571
 572         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 573         return (HASH2BUCKETLOCK(hash));
 574 }
 575
 576 #ifdef INVARIANTS
 577 static void
 578 cache_assert_bucket_locked(struct namecache *ncp)
 579 {
 580         struct mtx *blp;
 581
 582         blp = NCP2BUCKETLOCK(ncp);
 583         mtx_assert(blp, MA_OWNED);
 584 }
 585
 586 static void
 587 cache_assert_bucket_unlocked(struct namecache *ncp)
 588 {
 589         struct mtx *blp;
 590
 591         blp = NCP2BUCKETLOCK(ncp);
 592         mtx_assert(blp, MA_NOTOWNED);
 593 }
 594 #else
 595 #define cache_assert_bucket_locked(x) do { } while (0)
 596 #define cache_assert_bucket_unlocked(x) do { } while (0)
 597 #endif
 598
 599 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 600 static void
 601 _cache_sort_vnodes(void **p1, void **p2)
 602 {
 603         void *tmp;
 604
 605         MPASS(*p1 != NULL || *p2 != NULL);
 606
 607         if (*p1 > *p2) {
 608                 tmp = *p2;
 609                 *p2 = *p1;
 610                 *p1 = tmp;
 611         }
 612 }
 613
 614 static void
 615 cache_lock_all_buckets(void)
 616 {
 617         u_int i;
 618
 619         for (i = 0; i < numbucketlocks; i++)
 620                 mtx_lock(&bucketlocks[i]);
 621 }
 622
 623 static void
 624 cache_unlock_all_buckets(void)
 625 {
 626         u_int i;
 627
 628         for (i = 0; i < numbucketlocks; i++)
 629                 mtx_unlock(&bucketlocks[i]);
 630 }
 631
 632 static void
 633 cache_lock_all_vnodes(void)
 634 {
 635         u_int i;
 636
 637         for (i = 0; i < numvnodelocks; i++)
 638                 mtx_lock(&vnodelocks[i]);
 639 }
 640
 641 static void
 642 cache_unlock_all_vnodes(void)
 643 {
 644         u_int i;
 645
 646         for (i = 0; i < numvnodelocks; i++)
 647                 mtx_unlock(&vnodelocks[i]);
 648 }
 649
 650 static int
 651 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 652 {
 653
 654         cache_sort_vnodes(&vlp1, &vlp2);
 655
 656         if (vlp1 != NULL) {
 657                 if (!mtx_trylock(vlp1))
 658                         return (EAGAIN);
 659         }
 660         if (!mtx_trylock(vlp2)) {
 661                 if (vlp1 != NULL)
 662                         mtx_unlock(vlp1);
 663                 return (EAGAIN);
 664         }
 665
 666         return (0);
 667 }
 668
 669 static void
 670 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 671 {
 672
 673         MPASS(vlp1 != NULL || vlp2 != NULL);
 674         MPASS(vlp1 <= vlp2);
 675
 676         if (vlp1 != NULL)
 677                 mtx_lock(vlp1);
 678         if (vlp2 != NULL)
 679                 mtx_lock(vlp2);
 680 }
 681
 682 static void
 683 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 684 {
 685
 686         MPASS(vlp1 != NULL || vlp2 != NULL);
 687
 688         if (vlp1 != NULL)
 689                 mtx_unlock(vlp1);
 690         if (vlp2 != NULL)
 691                 mtx_unlock(vlp2);
 692 }
 693
 694 static int
 695 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 696 {
 697         struct nchstats snap;
 698
 699         if (req->oldptr == NULL)
 700                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 701
 702         snap = nchstats;
 703         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 704         snap.ncs_neghits = counter_u64_fetch(numneghits);
 705         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 706             counter_u64_fetch(numnegzaps);
 707         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 708             counter_u64_fetch(nummiss);
 709
 710         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 711 }
 712 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 713     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 714     "VFS cache effectiveness statistics");
 715
 716 static void
 717 cache_recalc_neg_min(u_int val)
 718 {
 719
 720         neg_min = (ncsize * val) / 100;
 721 }
 722
 723 static int
 724 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 725 {
 726         u_int val;
 727         int error;
 728
 729         val = ncnegminpct;
 730         error = sysctl_handle_int(oidp, &val, 0, req);
 731         if (error != 0 || req->newptr == NULL)
 732                 return (error);
 733
 734         if (val == ncnegminpct)
 735                 return (0);
 736         if (val < 0 || val > 99)
 737                 return (EINVAL);
 738         ncnegminpct = val;
 739         cache_recalc_neg_min(val);
 740         return (0);
 741 }
 742
 743 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 744     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 745     "I", "Negative entry \% of namecahe capacity above which automatic eviction is allowed");
 746
 747 #ifdef DIAGNOSTIC
 748 /*
 749  * Grab an atomic snapshot of the name cache hash chain lengths
 750  */
 751 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 752     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 753     "hash table stats");
 754
 755 static int
 756 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 757 {
 758         struct nchashhead *ncpp;
 759         struct namecache *ncp;
 760         int i, error, n_nchash, *cntbuf;
 761
 762 retry:
 763         n_nchash = nchash + 1;  /* nchash is max index, not count */
 764         if (req->oldptr == NULL)
 765                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 766         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 767         cache_lock_all_buckets();
 768         if (n_nchash != nchash + 1) {
 769                 cache_unlock_all_buckets();
 770                 free(cntbuf, M_TEMP);
 771                 goto retry;
 772         }
 773         /* Scan hash tables counting entries */
 774         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 775                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 776                         cntbuf[i]++;
 777         cache_unlock_all_buckets();
 778         for (error = 0, i = 0; i < n_nchash; i++)
 779                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 780                         break;
 781         free(cntbuf, M_TEMP);
 782         return (error);
 783 }
 784 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 785     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 786     "nchash chain lengths");
 787
 788 static int
 789 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 790 {
 791         int error;
 792         struct nchashhead *ncpp;
 793         struct namecache *ncp;
 794         int n_nchash;
 795         int count, maxlength, used, pct;
 796
 797         if (!req->oldptr)
 798                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 799
 800         cache_lock_all_buckets();
 801         n_nchash = nchash + 1;  /* nchash is max index, not count */
 802         used = 0;
 803         maxlength = 0;
 804
 805         /* Scan hash tables for applicable entries */
 806         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 807                 count = 0;
 808                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 809                         count++;
 810                 }
 811                 if (count)
 812                         used++;
 813                 if (maxlength < count)
 814                         maxlength = count;
 815         }
 816         n_nchash = nchash + 1;
 817         cache_unlock_all_buckets();
 818         pct = (used * 100) / (n_nchash / 100);
 819         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 820         if (error)
 821                 return (error);
 822         error = SYSCTL_OUT(req, &used, sizeof(used));
 823         if (error)
 824                 return (error);
 825         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 826         if (error)
 827                 return (error);
 828         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 829         if (error)
 830                 return (error);
 831         return (0);
 832 }
 833 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 834     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 835     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 836 #endif
 837
 838 /*
 839  * Negative entries management
 840  *
 841  * Various workloads create plenty of negative entries and barely use them
 842  * afterwards. Moreover malicious users can keep performing bogus lookups
 843  * adding even more entries. For example "make tinderbox" as of writing this
 844  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 845  * negative.
 846  *
 847  * As such, a rather aggressive eviction method is needed. The currently
 848  * employed method is a placeholder.
 849  *
 850  * Entries are split over numneglists separate lists, each of which is further
 851  * split into hot and cold entries. Entries get promoted after getting a hit.
 852  * Eviction happens on addition of new entry.
 853  */
 854 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 855     "Name cache negative entry statistics");
 856
 857 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 858     "Number of negative cache entries");
 859
 860 static COUNTER_U64_DEFINE_EARLY(neg_created);
 861 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 862     "Number of created negative entries");
 863
 864 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 865 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 866     "Number of evicted negative entries");
 867
 868 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 869 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 870     &neg_evict_skipped_empty,
 871     "Number of times evicting failed due to lack of entries");
 872
 873 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 874 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 875     &neg_evict_skipped_missed,
 876     "Number of times evicting failed due to target entry disappearing");
 877
 878 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 879 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 880     &neg_evict_skipped_contended,
 881     "Number of times evicting failed due to contention");
 882
 883 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 884     "Number of cache hits (negative)");
 885
 886 static int
 887 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 888 {
 889         int i, out;
 890
 891         out = 0;
 892         for (i = 0; i < numneglists; i++)
 893                 out += neglists[i].nl_hotnum;
 894
 895         return (SYSCTL_OUT(req, &out, sizeof(out)));
 896 }
 897 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 898     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 899     "Number of hot negative entries");
 900
 901 static void
 902 cache_neg_init(struct namecache *ncp)
 903 {
 904         struct negstate *ns;
 905
 906         ncp->nc_flag |= NCF_NEGATIVE;
 907         ns = NCP2NEGSTATE(ncp);
 908         ns->neg_flag = 0;
 909         ns->neg_hit = 0;
 910         counter_u64_add(neg_created, 1);
 911 }
 912
 913 #define CACHE_NEG_PROMOTION_THRESH 2
 914
 915 static bool
 916 cache_neg_hit_prep(struct namecache *ncp)
 917 {
 918         struct negstate *ns;
 919         u_char n;
 920
 921         ns = NCP2NEGSTATE(ncp);
 922         n = atomic_load_char(&ns->neg_hit);
 923         for (;;) {
 924                 if (n >= CACHE_NEG_PROMOTION_THRESH)
 925                         return (false);
 926                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 927                         break;
 928         }
 929         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 930 }
 931
 932 /*
 933  * Nothing to do here but it is provided for completeness as some
 934  * cache_neg_hit_prep callers may end up returning without even
 935  * trying to promote.
 936  */
 937 #define cache_neg_hit_abort(ncp)        do { } while (0)
 938
 939 static void
 940 cache_neg_hit_finish(struct namecache *ncp)
 941 {
 942
 943         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 944         counter_u64_add(numneghits, 1);
 945 }
 946
 947 /*
 948  * Move a negative entry to the hot list.
 949  */
 950 static void
 951 cache_neg_promote_locked(struct namecache *ncp)
 952 {
 953         struct neglist *nl;
 954         struct negstate *ns;
 955
 956         ns = NCP2NEGSTATE(ncp);
 957         nl = NCP2NEGLIST(ncp);
 958         mtx_assert(&nl->nl_lock, MA_OWNED);
 959         if ((ns->neg_flag & NEG_HOT) == 0) {
 960                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 961                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 962                 nl->nl_hotnum++;
 963                 ns->neg_flag |= NEG_HOT;
 964         }
 965 }
 966
 967 /*
 968  * Move a hot negative entry to the cold list.
 969  */
 970 static void
 971 cache_neg_demote_locked(struct namecache *ncp)
 972 {
 973         struct neglist *nl;
 974         struct negstate *ns;
 975
 976         ns = NCP2NEGSTATE(ncp);
 977         nl = NCP2NEGLIST(ncp);
 978         mtx_assert(&nl->nl_lock, MA_OWNED);
 979         MPASS(ns->neg_flag & NEG_HOT);
 980         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 981         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 982         nl->nl_hotnum--;
 983         ns->neg_flag &= ~NEG_HOT;
 984         atomic_store_char(&ns->neg_hit, 0);
 985 }
 986
 987 /*
 988  * Move a negative entry to the hot list if it matches the lookup.
 989  *
 990  * We have to take locks, but they may be contended and in the worst
 991  * case we may need to go off CPU. We don't want to spin within the
 992  * smr section and we can't block with it. Exiting the section means
 993  * the found entry could have been evicted. We are going to look it
 994  * up again.
 995  */
 996 static bool
 997 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
 998     struct namecache *oncp, uint32_t hash)
 999 {
1000         struct namecache *ncp;
1001         struct neglist *nl;
1002         u_char nc_flag;
1003
1004         nl = NCP2NEGLIST(oncp);
1005
1006         mtx_lock(&nl->nl_lock);
1007         /*
1008          * For hash iteration.
1009          */
1010         vfs_smr_enter();
1011
1012         /*
1013          * Avoid all surprises by only succeeding if we got the same entry and
1014          * bailing completely otherwise.
1015          * XXX There are no provisions to keep the vnode around, meaning we may
1016          * end up promoting a negative entry for a *new* vnode and returning
1017          * ENOENT on its account. This is the error we want to return anyway
1018          * and promotion is harmless.
1019          *
1020          * In particular at this point there can be a new ncp which matches the
1021          * search but hashes to a different neglist.
1022          */
1023         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1024                 if (ncp == oncp)
1025                         break;
1026         }
1027
1028         /*
1029          * No match to begin with.
1030          */
1031         if (__predict_false(ncp == NULL)) {
1032                 goto out_abort;
1033         }
1034
1035         /*
1036          * The newly found entry may be something different...
1037          */
1038         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1039             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1040                 goto out_abort;
1041         }
1042
1043         /*
1044          * ... and not even negative.
1045          */
1046         nc_flag = atomic_load_char(&ncp->nc_flag);
1047         if ((nc_flag & NCF_NEGATIVE) == 0) {
1048                 goto out_abort;
1049         }
1050
1051         if (__predict_false(!cache_ncp_canuse(ncp))) {
1052                 goto out_abort;
1053         }
1054
1055         cache_neg_promote_locked(ncp);
1056         cache_neg_hit_finish(ncp);
1057         vfs_smr_exit();
1058         mtx_unlock(&nl->nl_lock);
1059         return (true);
1060 out_abort:
1061         vfs_smr_exit();
1062         mtx_unlock(&nl->nl_lock);
1063         return (false);
1064 }
1065
1066 static void
1067 cache_neg_promote(struct namecache *ncp)
1068 {
1069         struct neglist *nl;
1070
1071         nl = NCP2NEGLIST(ncp);
1072         mtx_lock(&nl->nl_lock);
1073         cache_neg_promote_locked(ncp);
1074         mtx_unlock(&nl->nl_lock);
1075 }
1076
1077 static void
1078 cache_neg_insert(struct namecache *ncp)
1079 {
1080         struct neglist *nl;
1081
1082         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1083         cache_assert_bucket_locked(ncp);
1084         nl = NCP2NEGLIST(ncp);
1085         mtx_lock(&nl->nl_lock);
1086         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1087         mtx_unlock(&nl->nl_lock);
1088         atomic_add_long(&numneg, 1);
1089 }
1090
1091 static void
1092 cache_neg_remove(struct namecache *ncp)
1093 {
1094         struct neglist *nl;
1095         struct negstate *ns;
1096
1097         cache_assert_bucket_locked(ncp);
1098         nl = NCP2NEGLIST(ncp);
1099         ns = NCP2NEGSTATE(ncp);
1100         mtx_lock(&nl->nl_lock);
1101         if ((ns->neg_flag & NEG_HOT) != 0) {
1102                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1103                 nl->nl_hotnum--;
1104         } else {
1105                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1106         }
1107         mtx_unlock(&nl->nl_lock);
1108         atomic_subtract_long(&numneg, 1);
1109 }
1110
1111 static struct neglist *
1112 cache_neg_evict_select_list(void)
1113 {
1114         struct neglist *nl;
1115         u_int c;
1116
1117         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1118         nl = &neglists[c % numneglists];
1119         if (!mtx_trylock(&nl->nl_evict_lock)) {
1120                 counter_u64_add(neg_evict_skipped_contended, 1);
1121                 return (NULL);
1122         }
1123         return (nl);
1124 }
1125
1126 static struct namecache *
1127 cache_neg_evict_select_entry(struct neglist *nl)
1128 {
1129         struct namecache *ncp, *lncp;
1130         struct negstate *ns, *lns;
1131         int i;
1132
1133         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1134         mtx_assert(&nl->nl_lock, MA_OWNED);
1135         ncp = TAILQ_FIRST(&nl->nl_list);
1136         if (ncp == NULL)
1137                 return (NULL);
1138         lncp = ncp;
1139         lns = NCP2NEGSTATE(lncp);
1140         for (i = 1; i < 4; i++) {
1141                 ncp = TAILQ_NEXT(ncp, nc_dst);
1142                 if (ncp == NULL)
1143                         break;
1144                 ns = NCP2NEGSTATE(ncp);
1145                 if (ns->neg_hit < lns->neg_hit) {
1146                         lncp = ncp;
1147                         lns = ns;
1148                 }
1149         }
1150         return (lncp);
1151 }
1152
1153 static bool
1154 cache_neg_evict(void)
1155 {
1156         struct namecache *ncp, *ncp2;
1157         struct neglist *nl;
1158         struct negstate *ns;
1159         struct vnode *dvp;
1160         struct mtx *dvlp;
1161         struct mtx *blp;
1162         uint32_t hash;
1163         u_char nlen;
1164         bool evicted;
1165
1166         nl = cache_neg_evict_select_list();
1167         if (nl == NULL) {
1168                 return (false);
1169         }
1170
1171         mtx_lock(&nl->nl_lock);
1172         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1173         if (ncp != NULL) {
1174                 cache_neg_demote_locked(ncp);
1175         }
1176         ncp = cache_neg_evict_select_entry(nl);
1177         if (ncp == NULL) {
1178                 counter_u64_add(neg_evict_skipped_empty, 1);
1179                 mtx_unlock(&nl->nl_lock);
1180                 mtx_unlock(&nl->nl_evict_lock);
1181                 return (false);
1182         }
1183         ns = NCP2NEGSTATE(ncp);
1184         nlen = ncp->nc_nlen;
1185         dvp = ncp->nc_dvp;
1186         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1187         dvlp = VP2VNODELOCK(dvp);
1188         blp = HASH2BUCKETLOCK(hash);
1189         mtx_unlock(&nl->nl_lock);
1190         mtx_unlock(&nl->nl_evict_lock);
1191         mtx_lock(dvlp);
1192         mtx_lock(blp);
1193         /*
1194          * Note that since all locks were dropped above, the entry may be
1195          * gone or reallocated to be something else.
1196          */
1197         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1198                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1199                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1200                         break;
1201         }
1202         if (ncp2 == NULL) {
1203                 counter_u64_add(neg_evict_skipped_missed, 1);
1204                 ncp = NULL;
1205                 evicted = false;
1206         } else {
1207                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1208                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1209                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1210                     ncp->nc_name);
1211                 cache_zap_locked(ncp);
1212                 counter_u64_add(neg_evicted, 1);
1213                 evicted = true;
1214         }
1215         mtx_unlock(blp);
1216         mtx_unlock(dvlp);
1217         if (ncp != NULL)
1218                 cache_free(ncp);
1219         return (evicted);
1220 }
1221
1222 /*
1223  * Maybe evict a negative entry to create more room.
1224  *
1225  * The ncnegfactor parameter limits what fraction of the total count
1226  * can comprise of negative entries. However, if the cache is just
1227  * warming up this leads to excessive evictions.  As such, ncnegminpct
1228  * (recomputed to neg_min) dictates whether the above should be
1229  * applied.
1230  *
1231  * Try evicting if the cache is close to full capacity regardless of
1232  * other considerations.
1233  */
1234 static bool
1235 cache_neg_evict_cond(u_long lnumcache)
1236 {
1237         u_long lnumneg;
1238
1239         if (ncsize - 1000 < lnumcache)
1240                 goto out_evict;
1241         lnumneg = atomic_load_long(&numneg);
1242         if (lnumneg < neg_min)
1243                 return (false);
1244         if (lnumneg * ncnegfactor < lnumcache)
1245                 return (false);
1246 out_evict:
1247         return (cache_neg_evict());
1248 }
1249
1250 /*
1251  * cache_zap_locked():
1252  *
1253  *   Removes a namecache entry from cache, whether it contains an actual
1254  *   pointer to a vnode or if it is just a negative cache entry.
1255  */
1256 static void
1257 cache_zap_locked(struct namecache *ncp)
1258 {
1259         struct nchashhead *ncpp;
1260
1261         if (!(ncp->nc_flag & NCF_NEGATIVE))
1262                 cache_assert_vnode_locked(ncp->nc_vp);
1263         cache_assert_vnode_locked(ncp->nc_dvp);
1264         cache_assert_bucket_locked(ncp);
1265
1266         cache_ncp_invalidate(ncp);
1267
1268         ncpp = NCP2BUCKET(ncp);
1269         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1270         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1271                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1272                     ncp->nc_name, ncp->nc_vp);
1273                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1274                 if (ncp == ncp->nc_vp->v_cache_dd) {
1275                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1276                         ncp->nc_vp->v_cache_dd = NULL;
1277                         vn_seqc_write_end(ncp->nc_vp);
1278                 }
1279         } else {
1280                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1281                     ncp->nc_name);
1282                 cache_neg_remove(ncp);
1283         }
1284         if (ncp->nc_flag & NCF_ISDOTDOT) {
1285                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1286                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1287                         ncp->nc_dvp->v_cache_dd = NULL;
1288                         vn_seqc_write_end(ncp->nc_dvp);
1289                 }
1290         } else {
1291                 LIST_REMOVE(ncp, nc_src);
1292                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1293                         ncp->nc_flag |= NCF_DVDROP;
1294                         counter_u64_add(numcachehv, -1);
1295                 }
1296         }
1297         atomic_subtract_long(&numcache, 1);
1298 }
1299
1300 static void
1301 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1302 {
1303         struct mtx *blp;
1304
1305         MPASS(ncp->nc_dvp == vp);
1306         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1307         cache_assert_vnode_locked(vp);
1308
1309         blp = NCP2BUCKETLOCK(ncp);
1310         mtx_lock(blp);
1311         cache_zap_locked(ncp);
1312         mtx_unlock(blp);
1313 }
1314
1315 static bool
1316 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1317     struct mtx **vlpp)
1318 {
1319         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1320         struct mtx *blp;
1321
1322         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1323         cache_assert_vnode_locked(vp);
1324
1325         if (ncp->nc_flag & NCF_NEGATIVE) {
1326                 if (*vlpp != NULL) {
1327                         mtx_unlock(*vlpp);
1328                         *vlpp = NULL;
1329                 }
1330                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1331                 return (true);
1332         }
1333
1334         pvlp = VP2VNODELOCK(vp);
1335         blp = NCP2BUCKETLOCK(ncp);
1336         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1337         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1338
1339         if (*vlpp == vlp1 || *vlpp == vlp2) {
1340                 to_unlock = *vlpp;
1341                 *vlpp = NULL;
1342         } else {
1343                 if (*vlpp != NULL) {
1344                         mtx_unlock(*vlpp);
1345                         *vlpp = NULL;
1346                 }
1347                 cache_sort_vnodes(&vlp1, &vlp2);
1348                 if (vlp1 == pvlp) {
1349                         mtx_lock(vlp2);
1350                         to_unlock = vlp2;
1351                 } else {
1352                         if (!mtx_trylock(vlp1))
1353                                 goto out_relock;
1354                         to_unlock = vlp1;
1355                 }
1356         }
1357         mtx_lock(blp);
1358         cache_zap_locked(ncp);
1359         mtx_unlock(blp);
1360         if (to_unlock != NULL)
1361                 mtx_unlock(to_unlock);
1362         return (true);
1363
1364 out_relock:
1365         mtx_unlock(vlp2);
1366         mtx_lock(vlp1);
1367         mtx_lock(vlp2);
1368         MPASS(*vlpp == NULL);
1369         *vlpp = vlp1;
1370         return (false);
1371 }
1372
1373 /*
1374  * If trylocking failed we can get here. We know enough to take all needed locks
1375  * in the right order and re-lookup the entry.
1376  */
1377 static int
1378 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1379     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1380     struct mtx *blp)
1381 {
1382         struct namecache *rncp;
1383
1384         cache_assert_bucket_unlocked(ncp);
1385
1386         cache_sort_vnodes(&dvlp, &vlp);
1387         cache_lock_vnodes(dvlp, vlp);
1388         mtx_lock(blp);
1389         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1390                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1391                     rncp->nc_nlen == cnp->cn_namelen &&
1392                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1393                         break;
1394         }
1395         if (rncp != NULL) {
1396                 cache_zap_locked(rncp);
1397                 mtx_unlock(blp);
1398                 cache_unlock_vnodes(dvlp, vlp);
1399                 counter_u64_add(zap_bucket_relock_success, 1);
1400                 return (0);
1401         }
1402
1403         mtx_unlock(blp);
1404         cache_unlock_vnodes(dvlp, vlp);
1405         return (EAGAIN);
1406 }
1407
1408 static int __noinline
1409 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1410     uint32_t hash, struct mtx *blp)
1411 {
1412         struct mtx *dvlp, *vlp;
1413         struct vnode *dvp;
1414
1415         cache_assert_bucket_locked(ncp);
1416
1417         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1418         vlp = NULL;
1419         if (!(ncp->nc_flag & NCF_NEGATIVE))
1420                 vlp = VP2VNODELOCK(ncp->nc_vp);
1421         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1422                 cache_zap_locked(ncp);
1423                 mtx_unlock(blp);
1424                 cache_unlock_vnodes(dvlp, vlp);
1425                 return (0);
1426         }
1427
1428         dvp = ncp->nc_dvp;
1429         mtx_unlock(blp);
1430         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1431 }
1432
1433 static __noinline int
1434 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1435 {
1436         struct namecache *ncp;
1437         struct mtx *blp;
1438         struct mtx *dvlp, *dvlp2;
1439         uint32_t hash;
1440         int error;
1441
1442         if (cnp->cn_namelen == 2 &&
1443             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1444                 dvlp = VP2VNODELOCK(dvp);
1445                 dvlp2 = NULL;
1446                 mtx_lock(dvlp);
1447 retry_dotdot:
1448                 ncp = dvp->v_cache_dd;
1449                 if (ncp == NULL) {
1450                         mtx_unlock(dvlp);
1451                         if (dvlp2 != NULL)
1452                                 mtx_unlock(dvlp2);
1453                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1454                         return (0);
1455                 }
1456                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1457                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1458                                 goto retry_dotdot;
1459                         MPASS(dvp->v_cache_dd == NULL);
1460                         mtx_unlock(dvlp);
1461                         if (dvlp2 != NULL)
1462                                 mtx_unlock(dvlp2);
1463                         cache_free(ncp);
1464                 } else {
1465                         vn_seqc_write_begin(dvp);
1466                         dvp->v_cache_dd = NULL;
1467                         vn_seqc_write_end(dvp);
1468                         mtx_unlock(dvlp);
1469                         if (dvlp2 != NULL)
1470                                 mtx_unlock(dvlp2);
1471                 }
1472                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1473                 return (1);
1474         }
1475
1476         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1477         blp = HASH2BUCKETLOCK(hash);
1478 retry:
1479         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1480                 goto out_no_entry;
1481
1482         mtx_lock(blp);
1483
1484         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1485                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1486                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1487                         break;
1488         }
1489
1490         if (ncp == NULL) {
1491                 mtx_unlock(blp);
1492                 goto out_no_entry;
1493         }
1494
1495         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1496         if (__predict_false(error != 0)) {
1497                 zap_bucket_fail++;
1498                 goto retry;
1499         }
1500         counter_u64_add(numposzaps, 1);
1501         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1502         cache_free(ncp);
1503         return (1);
1504 out_no_entry:
1505         counter_u64_add(nummisszap, 1);
1506         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1507         return (0);
1508 }
1509
1510 static int __noinline
1511 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1512     struct timespec *tsp, int *ticksp)
1513 {
1514         int ltype;
1515
1516         *vpp = dvp;
1517         counter_u64_add(dothits, 1);
1518         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1519         if (tsp != NULL)
1520                 timespecclear(tsp);
1521         if (ticksp != NULL)
1522                 *ticksp = ticks;
1523         vrefact(*vpp);
1524         /*
1525          * When we lookup "." we still can be asked to lock it
1526          * differently...
1527          */
1528         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1529         if (ltype != VOP_ISLOCKED(*vpp)) {
1530                 if (ltype == LK_EXCLUSIVE) {
1531                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1532                         if (VN_IS_DOOMED((*vpp))) {
1533                                 /* forced unmount */
1534                                 vrele(*vpp);
1535                                 *vpp = NULL;
1536                                 return (ENOENT);
1537                         }
1538                 } else
1539                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1540         }
1541         return (-1);
1542 }
1543
1544 static int __noinline
1545 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1546     struct timespec *tsp, int *ticksp)
1547 {
1548         struct namecache_ts *ncp_ts;
1549         struct namecache *ncp;
1550         struct mtx *dvlp;
1551         enum vgetstate vs;
1552         int error, ltype;
1553         bool whiteout;
1554
1555         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1556
1557         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1558                 cache_remove_cnp(dvp, cnp);
1559                 return (0);
1560         }
1561
1562         counter_u64_add(dotdothits, 1);
1563 retry:
1564         dvlp = VP2VNODELOCK(dvp);
1565         mtx_lock(dvlp);
1566         ncp = dvp->v_cache_dd;
1567         if (ncp == NULL) {
1568                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1569                 mtx_unlock(dvlp);
1570                 return (0);
1571         }
1572         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1573                 if (ncp->nc_flag & NCF_NEGATIVE)
1574                         *vpp = NULL;
1575                 else
1576                         *vpp = ncp->nc_vp;
1577         } else
1578                 *vpp = ncp->nc_dvp;
1579         if (*vpp == NULL)
1580                 goto negative_success;
1581         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1582         cache_out_ts(ncp, tsp, ticksp);
1583         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1584             NCF_DTS && tsp != NULL) {
1585                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1586                 *tsp = ncp_ts->nc_dotdottime;
1587         }
1588
1589         MPASS(dvp != *vpp);
1590         ltype = VOP_ISLOCKED(dvp);
1591         VOP_UNLOCK(dvp);
1592         vs = vget_prep(*vpp);
1593         mtx_unlock(dvlp);
1594         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1595         vn_lock(dvp, ltype | LK_RETRY);
1596         if (VN_IS_DOOMED(dvp)) {
1597                 if (error == 0)
1598                         vput(*vpp);
1599                 *vpp = NULL;
1600                 return (ENOENT);
1601         }
1602         if (error) {
1603                 *vpp = NULL;
1604                 goto retry;
1605         }
1606         return (-1);
1607 negative_success:
1608         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1609                 if (cnp->cn_flags & ISLASTCN) {
1610                         counter_u64_add(numnegzaps, 1);
1611                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1612                         mtx_unlock(dvlp);
1613                         cache_free(ncp);
1614                         return (0);
1615                 }
1616         }
1617
1618         whiteout = (ncp->nc_flag & NCF_WHITE);
1619         cache_out_ts(ncp, tsp, ticksp);
1620         if (cache_neg_hit_prep(ncp))
1621                 cache_neg_promote(ncp);
1622         else
1623                 cache_neg_hit_finish(ncp);
1624         mtx_unlock(dvlp);
1625         if (whiteout)
1626                 cnp->cn_flags |= ISWHITEOUT;
1627         return (ENOENT);
1628 }
1629
1630 /**
1631  * Lookup a name in the name cache
1632  *
1633  * # Arguments
1634  *
1635  * - dvp:       Parent directory in which to search.
1636  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1637  * - cnp:       Parameters of the name search.  The most interesting bits of
1638  *              the cn_flags field have the following meanings:
1639  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1640  *                      it up.
1641  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1642  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1643  *              or negative) lookup, tsp will be filled with any timespec that
1644  *              was stored when this cache entry was created.  However, it will
1645  *              be clear for "." entries.
1646  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1647  *              (positive or negative) lookup, it will contain the ticks value
1648  *              that was current when the cache entry was created, unless cnp
1649  *              was ".".
1650  *
1651  * Either both tsp and ticks have to be provided or neither of them.
1652  *
1653  * # Returns
1654  *
1655  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1656  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1657  *              to a forced unmount.  vpp will not be modified.  If the entry
1658  *              is a whiteout, then the ISWHITEOUT flag will be set in
1659  *              cnp->cn_flags.
1660  * - 0:         A cache miss.  vpp will not be modified.
1661  *
1662  * # Locking
1663  *
1664  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1665  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1666  * lock is not recursively acquired.
1667  */
1668 static int __noinline
1669 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1670     struct timespec *tsp, int *ticksp)
1671 {
1672         struct namecache *ncp;
1673         struct mtx *blp;
1674         uint32_t hash;
1675         enum vgetstate vs;
1676         int error;
1677         bool whiteout;
1678
1679         MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY);
1680
1681 retry:
1682         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1683         blp = HASH2BUCKETLOCK(hash);
1684         mtx_lock(blp);
1685
1686         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1687                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1688                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1689                         break;
1690         }
1691
1692         if (__predict_false(ncp == NULL)) {
1693                 mtx_unlock(blp);
1694                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1695                     NULL);
1696                 counter_u64_add(nummiss, 1);
1697                 return (0);
1698         }
1699
1700         if (ncp->nc_flag & NCF_NEGATIVE)
1701                 goto negative_success;
1702
1703         counter_u64_add(numposhits, 1);
1704         *vpp = ncp->nc_vp;
1705         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1706         cache_out_ts(ncp, tsp, ticksp);
1707         MPASS(dvp != *vpp);
1708         vs = vget_prep(*vpp);
1709         mtx_unlock(blp);
1710         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1711         if (error) {
1712                 *vpp = NULL;
1713                 goto retry;
1714         }
1715         return (-1);
1716 negative_success:
1717         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1718                 if (cnp->cn_flags & ISLASTCN) {
1719                         counter_u64_add(numnegzaps, 1);
1720                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1721                         if (__predict_false(error != 0)) {
1722                                 zap_bucket_fail2++;
1723                                 goto retry;
1724                         }
1725                         cache_free(ncp);
1726                         return (0);
1727                 }
1728         }
1729
1730         whiteout = (ncp->nc_flag & NCF_WHITE);
1731         cache_out_ts(ncp, tsp, ticksp);
1732         if (cache_neg_hit_prep(ncp))
1733                 cache_neg_promote(ncp);
1734         else
1735                 cache_neg_hit_finish(ncp);
1736         mtx_unlock(blp);
1737         if (whiteout)
1738                 cnp->cn_flags |= ISWHITEOUT;
1739         return (ENOENT);
1740 }
1741
1742 int
1743 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1744     struct timespec *tsp, int *ticksp)
1745 {
1746         struct namecache *ncp;
1747         uint32_t hash;
1748         enum vgetstate vs;
1749         int error;
1750         bool whiteout, neg_promote;
1751         u_short nc_flag;
1752
1753         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1754
1755 #ifdef DEBUG_CACHE
1756         if (__predict_false(!doingcache)) {
1757                 cnp->cn_flags &= ~MAKEENTRY;
1758                 return (0);
1759         }
1760 #endif
1761
1762         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1763                 if (cnp->cn_namelen == 1)
1764                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1765                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1766                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1767         }
1768
1769         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1770
1771         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1772                 cache_remove_cnp(dvp, cnp);
1773                 return (0);
1774         }
1775
1776         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1777         vfs_smr_enter();
1778
1779         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1780                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1781                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1782                         break;
1783         }
1784
1785         if (__predict_false(ncp == NULL)) {
1786                 vfs_smr_exit();
1787                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1788                     NULL);
1789                 counter_u64_add(nummiss, 1);
1790                 return (0);
1791         }
1792
1793         nc_flag = atomic_load_char(&ncp->nc_flag);
1794         if (nc_flag & NCF_NEGATIVE)
1795                 goto negative_success;
1796
1797         counter_u64_add(numposhits, 1);
1798         *vpp = ncp->nc_vp;
1799         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1800         cache_out_ts(ncp, tsp, ticksp);
1801         MPASS(dvp != *vpp);
1802         if (!cache_ncp_canuse(ncp)) {
1803                 vfs_smr_exit();
1804                 *vpp = NULL;
1805                 goto out_fallback;
1806         }
1807         vs = vget_prep_smr(*vpp);
1808         vfs_smr_exit();
1809         if (__predict_false(vs == VGET_NONE)) {
1810                 *vpp = NULL;
1811                 goto out_fallback;
1812         }
1813         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1814         if (error) {
1815                 *vpp = NULL;
1816                 goto out_fallback;
1817         }
1818         return (-1);
1819 negative_success:
1820         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1821                 if (cnp->cn_flags & ISLASTCN) {
1822                         vfs_smr_exit();
1823                         goto out_fallback;
1824                 }
1825         }
1826
1827         cache_out_ts(ncp, tsp, ticksp);
1828         whiteout = (ncp->nc_flag & NCF_WHITE);
1829         neg_promote = cache_neg_hit_prep(ncp);
1830         if (__predict_false(!cache_ncp_canuse(ncp))) {
1831                 cache_neg_hit_abort(ncp);
1832                 vfs_smr_exit();
1833                 goto out_fallback;
1834         }
1835         if (neg_promote) {
1836                 vfs_smr_exit();
1837                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1838                         goto out_fallback;
1839         } else {
1840                 cache_neg_hit_finish(ncp);
1841                 vfs_smr_exit();
1842         }
1843         if (whiteout)
1844                 cnp->cn_flags |= ISWHITEOUT;
1845         return (ENOENT);
1846 out_fallback:
1847         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1848 }
1849
1850 struct celockstate {
1851         struct mtx *vlp[3];
1852         struct mtx *blp[2];
1853 };
1854 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1855 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1856
1857 static inline void
1858 cache_celockstate_init(struct celockstate *cel)
1859 {
1860
1861         bzero(cel, sizeof(*cel));
1862 }
1863
1864 static void
1865 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1866     struct vnode *dvp)
1867 {
1868         struct mtx *vlp1, *vlp2;
1869
1870         MPASS(cel->vlp[0] == NULL);
1871         MPASS(cel->vlp[1] == NULL);
1872         MPASS(cel->vlp[2] == NULL);
1873
1874         MPASS(vp != NULL || dvp != NULL);
1875
1876         vlp1 = VP2VNODELOCK(vp);
1877         vlp2 = VP2VNODELOCK(dvp);
1878         cache_sort_vnodes(&vlp1, &vlp2);
1879
1880         if (vlp1 != NULL) {
1881                 mtx_lock(vlp1);
1882                 cel->vlp[0] = vlp1;
1883         }
1884         mtx_lock(vlp2);
1885         cel->vlp[1] = vlp2;
1886 }
1887
1888 static void
1889 cache_unlock_vnodes_cel(struct celockstate *cel)
1890 {
1891
1892         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1893
1894         if (cel->vlp[0] != NULL)
1895                 mtx_unlock(cel->vlp[0]);
1896         if (cel->vlp[1] != NULL)
1897                 mtx_unlock(cel->vlp[1]);
1898         if (cel->vlp[2] != NULL)
1899                 mtx_unlock(cel->vlp[2]);
1900 }
1901
1902 static bool
1903 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1904 {
1905         struct mtx *vlp;
1906         bool ret;
1907
1908         cache_assert_vlp_locked(cel->vlp[0]);
1909         cache_assert_vlp_locked(cel->vlp[1]);
1910         MPASS(cel->vlp[2] == NULL);
1911
1912         MPASS(vp != NULL);
1913         vlp = VP2VNODELOCK(vp);
1914
1915         ret = true;
1916         if (vlp >= cel->vlp[1]) {
1917                 mtx_lock(vlp);
1918         } else {
1919                 if (mtx_trylock(vlp))
1920                         goto out;
1921                 cache_lock_vnodes_cel_3_failures++;
1922                 cache_unlock_vnodes_cel(cel);
1923                 if (vlp < cel->vlp[0]) {
1924                         mtx_lock(vlp);
1925                         mtx_lock(cel->vlp[0]);
1926                         mtx_lock(cel->vlp[1]);
1927                 } else {
1928                         if (cel->vlp[0] != NULL)
1929                                 mtx_lock(cel->vlp[0]);
1930                         mtx_lock(vlp);
1931                         mtx_lock(cel->vlp[1]);
1932                 }
1933                 ret = false;
1934         }
1935 out:
1936         cel->vlp[2] = vlp;
1937         return (ret);
1938 }
1939
1940 static void
1941 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1942     struct mtx *blp2)
1943 {
1944
1945         MPASS(cel->blp[0] == NULL);
1946         MPASS(cel->blp[1] == NULL);
1947
1948         cache_sort_vnodes(&blp1, &blp2);
1949
1950         if (blp1 != NULL) {
1951                 mtx_lock(blp1);
1952                 cel->blp[0] = blp1;
1953         }
1954         mtx_lock(blp2);
1955         cel->blp[1] = blp2;
1956 }
1957
1958 static void
1959 cache_unlock_buckets_cel(struct celockstate *cel)
1960 {
1961
1962         if (cel->blp[0] != NULL)
1963                 mtx_unlock(cel->blp[0]);
1964         mtx_unlock(cel->blp[1]);
1965 }
1966
1967 /*
1968  * Lock part of the cache affected by the insertion.
1969  *
1970  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1971  * However, insertion can result in removal of an old entry. In this
1972  * case we have an additional vnode and bucketlock pair to lock.
1973  *
1974  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1975  * preserving the locking order (smaller address first).
1976  */
1977 static void
1978 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1979     uint32_t hash)
1980 {
1981         struct namecache *ncp;
1982         struct mtx *blps[2];
1983
1984         blps[0] = HASH2BUCKETLOCK(hash);
1985         for (;;) {
1986                 blps[1] = NULL;
1987                 cache_lock_vnodes_cel(cel, dvp, vp);
1988                 if (vp == NULL || vp->v_type != VDIR)
1989                         break;
1990                 ncp = vp->v_cache_dd;
1991                 if (ncp == NULL)
1992                         break;
1993                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1994                         break;
1995                 MPASS(ncp->nc_dvp == vp);
1996                 blps[1] = NCP2BUCKETLOCK(ncp);
1997                 if (ncp->nc_flag & NCF_NEGATIVE)
1998                         break;
1999                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2000                         break;
2001                 /*
2002                  * All vnodes got re-locked. Re-validate the state and if
2003                  * nothing changed we are done. Otherwise restart.
2004                  */
2005                 if (ncp == vp->v_cache_dd &&
2006                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2007                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2008                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2009                         break;
2010                 cache_unlock_vnodes_cel(cel);
2011                 cel->vlp[0] = NULL;
2012                 cel->vlp[1] = NULL;
2013                 cel->vlp[2] = NULL;
2014         }
2015         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2016 }
2017
2018 static void
2019 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2020     uint32_t hash)
2021 {
2022         struct namecache *ncp;
2023         struct mtx *blps[2];
2024
2025         blps[0] = HASH2BUCKETLOCK(hash);
2026         for (;;) {
2027                 blps[1] = NULL;
2028                 cache_lock_vnodes_cel(cel, dvp, vp);
2029                 ncp = dvp->v_cache_dd;
2030                 if (ncp == NULL)
2031                         break;
2032                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2033                         break;
2034                 MPASS(ncp->nc_dvp == dvp);
2035                 blps[1] = NCP2BUCKETLOCK(ncp);
2036                 if (ncp->nc_flag & NCF_NEGATIVE)
2037                         break;
2038                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2039                         break;
2040                 if (ncp == dvp->v_cache_dd &&
2041                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2042                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2043                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2044                         break;
2045                 cache_unlock_vnodes_cel(cel);
2046                 cel->vlp[0] = NULL;
2047                 cel->vlp[1] = NULL;
2048                 cel->vlp[2] = NULL;
2049         }
2050         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2051 }
2052
/*
 * Drop everything taken by cache_enter_lock() or cache_enter_lock_dd():
 * the bucket locks first, the vnode locks after them.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

        cache_unlock_buckets_cel(cel);  /* taken last, released first */
        cache_unlock_vnodes_cel(cel);
}
2060
2061 static void __noinline
2062 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2063     struct componentname *cnp)
2064 {
2065         struct celockstate cel;
2066         struct namecache *ncp;
2067         uint32_t hash;
2068         int len;
2069
2070         if (dvp->v_cache_dd == NULL)
2071                 return;
2072         len = cnp->cn_namelen;
2073         cache_celockstate_init(&cel);
2074         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2075         cache_enter_lock_dd(&cel, dvp, vp, hash);
2076         vn_seqc_write_begin(dvp);
2077         ncp = dvp->v_cache_dd;
2078         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2079                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2080                 cache_zap_locked(ncp);
2081         } else {
2082                 ncp = NULL;
2083         }
2084         dvp->v_cache_dd = NULL;
2085         vn_seqc_write_end(dvp);
2086         cache_enter_unlock(&cel);
2087         if (ncp != NULL)
2088                 cache_free(ncp);
2089 }
2090
2091 /*
2092  * Add an entry to the cache.
      *
      * The entry maps the name in cnp, as found in directory dvp, to vp.  A NULL
      * vp creates a negative entry.  If tsp is not NULL the entry carries a
      * timestamp; dtsp, when also not NULL, is stored as its ".." timestamp.
      *
      * The name "." is never entered.  The call also returns without adding
      * anything when the cache is at its size limit, when an entry for the same
      * name is already present, or when a ".." entry is being added while
      * dvp->v_cache_dd is already populated.
2093  */
2094 void
2095 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2096     struct timespec *tsp, struct timespec *dtsp)
2097 {
2098         struct celockstate cel;
2099         struct namecache *ncp, *n2, *ndd;
2100         struct namecache_ts *ncp_ts;
2101         struct nchashhead *ncpp;
2102         uint32_t hash;
2103         int flag;
2104         int len;
2105         u_long lnumcache;
2106
2107         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2108         VNPASS(dvp->v_type != VNON, dvp);
2109         if (vp != NULL) {
2110                 VNPASS(!VN_IS_DOOMED(vp), vp);
2111                 VNPASS(vp->v_type != VNON, vp);
2112         }
2113
2114 #ifdef DEBUG_CACHE
2115         if (__predict_false(!doingcache))
2116                 return;
2117 #endif
2118
             /*
              * "." is not cached at all.  For ".." whatever dvp->v_cache_dd
              * currently refers to is cleared out first and the new entry is
              * flagged accordingly.
              */
2119         flag = 0;
2120         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2121                 if (cnp->cn_namelen == 1)
2122                         return;
2123                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2124                         cache_enter_dotdot_prep(dvp, vp, cnp);
2125                         flag = NCF_ISDOTDOT;
2126                 }
2127         }
2128
2129         /*
2130          * Avoid blowout in namecache entries.
2131          *
2132          * Bugs:
2133          * 1. filesystems may end up trying to add an already existing entry
2134          * (for example this can happen after a cache miss during concurrent
2135          * lookup), in which case we will call cache_neg_evict despite not
2136          * adding anything.
2137          * 2. the routine may fail to free anything and no provisions are made
2138          * to make it try harder (see the inside for failure modes)
2139          * 3. it only ever looks at negative entries.
2140          */
             /*
              * The slot for the new entry is reserved in numcache up front and
              * given back on every path which ends up not inserting it.
              */
2141         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
2142         if (cache_neg_evict_cond(lnumcache)) {
2143                 lnumcache = atomic_load_long(&numcache);
2144         }
2145         if (__predict_false(lnumcache >= ncsize)) {
2146                 atomic_subtract_long(&numcache, 1);
2147                 counter_u64_add(numdrops, 1);
2148                 return;
2149         }
2150
2151         cache_celockstate_init(&cel);
2152         ndd = NULL;
2153         ncp_ts = NULL;
2154
2155         /*
2156          * Calculate the hash key and setup as much of the new
2157          * namecache entry as possible before acquiring the lock.
2158          */
2159         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
2160         ncp->nc_flag = flag | NCF_WIP;
2161         ncp->nc_vp = vp;
2162         if (vp == NULL)
2163                 cache_neg_init(ncp);
2164         ncp->nc_dvp = dvp;
2165         if (tsp != NULL) {
2166                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2167                 ncp_ts->nc_time = *tsp;
2168                 ncp_ts->nc_ticks = ticks;
2169                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2170                 if (dtsp != NULL) {
2171                         ncp_ts->nc_dotdottime = *dtsp;
2172                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2173                 }
2174         }
2175         len = ncp->nc_nlen = cnp->cn_namelen;
2176         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2177         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2178         ncp->nc_name[len] = '\0';
2179         cache_enter_lock(&cel, dvp, vp, hash);
2180
2181         /*
2182          * See if this vnode or negative entry is already in the cache
2183          * with this name.  This can happen with concurrent lookups of
2184          * the same path name.
2185          */
2186         ncpp = NCHHASH(hash);
2187         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2188                 if (n2->nc_dvp == dvp &&
2189                     n2->nc_nlen == cnp->cn_namelen &&
2190                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2191                         MPASS(cache_ncp_canuse(n2));
2192                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2193                                 KASSERT(vp == NULL,
2194                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2195                                     __func__, NULL, vp));
2196                         else
2197                                 KASSERT(n2->nc_vp == vp,
2198                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2199                                     __func__, n2->nc_vp, vp));
2200                         /*
2201                          * Entries are supposed to be immutable unless in the
2202                          * process of getting destroyed. Accommodating for
2203                          * changing timestamps is possible but not worth it.
2204                          * This should be harmless in terms of correctness, in
2205                          * the worst case resulting in an earlier expiration.
2206                          * Alternatively, the found entry can be replaced
2207                          * altogether.
2208                          */
2209                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2210 #if 0
2211                         if (tsp != NULL) {
2212                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2213                                     ("no NCF_TS"));
2214                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2215                                 n2_ts->nc_time = ncp_ts->nc_time;
2216                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2217                                 if (dtsp != NULL) {
2218                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2219                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2220                                 }
2221                         }
2222 #endif
2223                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2224                             vp);
2225                         goto out_unlock_free;
2226                 }
2227         }
2228
2229         if (flag == NCF_ISDOTDOT) {
2230                 /*
2231                  * See if we are trying to add .. entry, but some other lookup
2232                  * has populated v_cache_dd pointer already.
2233                  */
2234                 if (dvp->v_cache_dd != NULL)
2235                         goto out_unlock_free;
2236                 KASSERT(vp == NULL || vp->v_type == VDIR,
2237                     ("wrong vnode type %p", vp));
2238                 vn_seqc_write_begin(dvp);
2239                 dvp->v_cache_dd = ncp;
2240                 vn_seqc_write_end(dvp);
2241         }
2242
2243         if (vp != NULL) {
2244                 if (flag != NCF_ISDOTDOT) {
2245                         /*
2246                          * For this case, the cache entry maps both the
2247                          * directory name in it and the name ".." for the
2248                          * directory's parent.
2249                          */
2250                         vn_seqc_write_begin(vp);
2251                         if ((ndd = vp->v_cache_dd) != NULL) {
2252                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2253                                         cache_zap_locked(ndd);
2254                                 else
2255                                         ndd = NULL;
2256                         }
2257                         vp->v_cache_dd = ncp;
2258                         vn_seqc_write_end(vp);
2259                 } else if (vp->v_type != VDIR) {
2260                         if (vp->v_cache_dd != NULL) {
2261                                 vn_seqc_write_begin(vp);
2262                                 vp->v_cache_dd = NULL;
2263                                 vn_seqc_write_end(vp);
2264                         }
2265                 }
2266         }
2267
             /*
              * The first entry originating from dvp takes a hold on the
              * directory vnode.
              */
2268         if (flag != NCF_ISDOTDOT) {
2269                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2270                         vhold(dvp);
2271                         counter_u64_add(numcachehv, 1);
2272                 }
2273                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2274         }
2275
2276         /*
2277          * If the entry is "negative", we place it into the
2278          * "negative" cache queue, otherwise, we place it into the
2279          * destination vnode's cache entries queue.
2280          */
2281         if (vp != NULL) {
2282                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2283                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2284                     vp);
2285         } else {
2286                 if (cnp->cn_flags & ISWHITEOUT)
2287                         ncp->nc_flag |= NCF_WHITE;
2288                 cache_neg_insert(ncp);
2289                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2290                     ncp->nc_name);
2291         }
2292
2293         /*
2294          * Insert the new namecache entry into the appropriate chain
2295          * within the cache entries table.
2296          */
2297         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2298
             /*
              * Release fence: everything stored into the entry above is
              * ordered before the store clearing NCF_WIP below.
              * NOTE(review): presumably this pairs with the lockless lookup
              * checking the entry through cache_ncp_canuse() -- confirm.
              */
2299         atomic_thread_fence_rel();
2300         /*
2301          * Mark the entry as fully constructed.
2302          * It is immutable past this point until its removal.
2303          */
2304         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2305
2306         cache_enter_unlock(&cel);
             /* A replaced ".." entry is freed only after the locks are gone. */
2307         if (ndd != NULL)
2308                 cache_free(ndd);
2309         return;
2310 out_unlock_free:
             /* Nothing was inserted: give the slot back and drop the entry. */
2311         cache_enter_unlock(&cel);
2312         atomic_subtract_long(&numcache, 1);
2313         cache_free(ncp);
2314         return;
2315 }
2316
/*
 * Return the smallest power of two strictly greater than val.
 *
 * When no such value fits in a u_int (val has its top bit set) the largest
 * representable power of two is returned instead.  Without the bound the
 * shifted value would wrap around to 0 and the loop would never terminate.
 */
static u_int
cache_roundup_2(u_int val)
{
        /* Only the most significant bit set. */
        const u_int top = ~(~(u_int)0 >> 1);
        u_int res;

        for (res = 1; res <= val && res != top; res <<= 1)
                continue;

        return (res);
}
2327
2328 static struct nchashhead *
2329 nchinittbl(u_long elements, u_long *hashmask)
2330 {
2331         struct nchashhead *hashtbl;
2332         u_long hashsize, i;
2333
2334         hashsize = cache_roundup_2(elements) / 2;
2335
2336         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2337         for (i = 0; i < hashsize; i++)
2338                 CK_SLIST_INIT(&hashtbl[i]);
2339         *hashmask = hashsize - 1;
2340         return (hashtbl);
2341 }
2342
2343 static void
2344 ncfreetbl(struct nchashhead *hashtbl)
2345 {
2346
2347         free(hashtbl, M_VFSCACHE);
2348 }
2349
2350 /*
2351  * Name cache initialization, from vfs_init() when we are booting
2352  */
2353 static void
2354 nchinit(void *dummy __unused)
2355 {
2356         u_int i;
2357
2358         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2359             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2360         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2361             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2362         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2363             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2364         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2365             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2366
2367         VFS_SMR_ZONE_SET(cache_zone_small);
2368         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2369         VFS_SMR_ZONE_SET(cache_zone_large);
2370         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2371
2372         ncsize = desiredvnodes * ncsizefactor;
2373         cache_recalc_neg_min(ncnegminpct);
2374         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2375         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2376         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2377                 ncbuckethash = 7;
2378         if (ncbuckethash > nchash)
2379                 ncbuckethash = nchash;
2380         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2381             M_WAITOK | M_ZERO);
2382         for (i = 0; i < numbucketlocks; i++)
2383                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2384         ncvnodehash = ncbuckethash;
2385         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2386             M_WAITOK | M_ZERO);
2387         for (i = 0; i < numvnodelocks; i++)
2388                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2389
2390         for (i = 0; i < numneglists; i++) {
2391                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2392                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2393                 TAILQ_INIT(&neglists[i].nl_list);
2394                 TAILQ_INIT(&neglists[i].nl_hotlist);
2395         }
2396 }
2397 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2398
2399 void
2400 cache_vnode_init(struct vnode *vp)
2401 {
2402
2403         LIST_INIT(&vp->v_cache_src);
2404         TAILQ_INIT(&vp->v_cache_dst);
2405         vp->v_cache_dd = NULL;
2406         cache_prehash(vp);
2407 }
2408
2409 void
2410 cache_changesize(u_long newmaxvnodes)
2411 {
2412         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2413         u_long new_nchash, old_nchash;
2414         struct namecache *ncp;
2415         uint32_t hash;
2416         u_long newncsize;
2417         int i;
2418
2419         newncsize = newmaxvnodes * ncsizefactor;
2420         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2421         if (newmaxvnodes < numbucketlocks)
2422                 newmaxvnodes = numbucketlocks;
2423
2424         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2425         /* If same hash table size, nothing to do */
2426         if (nchash == new_nchash) {
2427                 ncfreetbl(new_nchashtbl);
2428                 return;
2429         }
2430         /*
2431          * Move everything from the old hash table to the new table.
2432          * None of the namecache entries in the table can be removed
2433          * because to do so, they have to be removed from the hash table.
2434          */
2435         cache_lock_all_vnodes();
2436         cache_lock_all_buckets();
2437         old_nchashtbl = nchashtbl;
2438         old_nchash = nchash;
2439         nchashtbl = new_nchashtbl;
2440         nchash = new_nchash;
2441         for (i = 0; i <= old_nchash; i++) {
2442                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2443                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2444                             ncp->nc_dvp);
2445                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2446                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2447                 }
2448         }
2449         ncsize = newncsize;
2450         cache_recalc_neg_min(ncnegminpct);
2451         cache_unlock_all_buckets();
2452         cache_unlock_all_vnodes();
2453         ncfreetbl(old_nchashtbl);
2454 }
2455
2456 /*
2457  * Invalidate all entries from and to a particular vnode.
2458  */
2459 static void
2460 cache_purge_impl(struct vnode *vp)
2461 {
2462         TAILQ_HEAD(, namecache) ncps;
2463         struct namecache *ncp, *nnp;
2464         struct mtx *vlp, *vlp2;
2465
2466         TAILQ_INIT(&ncps);
2467         vlp = VP2VNODELOCK(vp);
2468         vlp2 = NULL;
2469         mtx_lock(vlp);
2470 retry:
2471         while (!LIST_EMPTY(&vp->v_cache_src)) {
2472                 ncp = LIST_FIRST(&vp->v_cache_src);
2473                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2474                         goto retry;
2475                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2476         }
2477         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2478                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2479                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2480                         goto retry;
2481                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2482         }
2483         ncp = vp->v_cache_dd;
2484         if (ncp != NULL) {
2485                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2486                    ("lost dotdot link"));
2487                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2488                         goto retry;
2489                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2490         }
2491         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2492         mtx_unlock(vlp);
2493         if (vlp2 != NULL)
2494                 mtx_unlock(vlp2);
2495         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2496                 cache_free(ncp);
2497         }
2498 }
2499
2500 /*
2501  * Opportunistic check to see if there is anything to do.
2502  */
2503 static bool
2504 cache_has_entries(struct vnode *vp)
2505 {
2506
2507         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2508             vp->v_cache_dd == NULL)
2509                 return (false);
2510         return (true);
2511 }
2512
2513 void
2514 cache_purge(struct vnode *vp)
2515 {
2516
2517         SDT_PROBE1(vfs, namecache, purge, done, vp);
2518         if (!cache_has_entries(vp))
2519                 return;
2520         cache_purge_impl(vp);
2521 }
2522
/*
 * Only to be used by vgone.
 *
 * Purges the entries of an already doomed vnode.  When the first unlocked
 * check finds nothing, the check is repeated after waiting for the vnode
 * lock to be released.
 */
void
cache_purge_vgone(struct vnode *vp)
{
        struct mtx *vlp;

        VNPASS(VN_IS_DOOMED(vp), vp);
        if (!cache_has_entries(vp)) {
                /*
                 * Serialize against a potential thread doing cache_purge.
                 */
                vlp = VP2VNODELOCK(vp);
                mtx_wait_unlocked(vlp);
                if (!cache_has_entries(vp))
                        return;
        }
        cache_purge_impl(vp);
}
2548
2549 /*
2550  * Invalidate all negative entries for a particular directory vnode.
2551  */
2552 void
2553 cache_purge_negative(struct vnode *vp)
2554 {
2555         TAILQ_HEAD(, namecache) ncps;
2556         struct namecache *ncp, *nnp;
2557         struct mtx *vlp;
2558
2559         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2560         if (LIST_EMPTY(&vp->v_cache_src))
2561                 return;
2562         TAILQ_INIT(&ncps);
2563         vlp = VP2VNODELOCK(vp);
2564         mtx_lock(vlp);
2565         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2566                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2567                         continue;
2568                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2569                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2570         }
2571         mtx_unlock(vlp);
2572         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2573                 cache_free(ncp);
2574         }
2575 }
2576
2577 void
2578 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2579     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2580 {
2581
2582         ASSERT_VOP_IN_SEQC(fdvp);
2583         ASSERT_VOP_IN_SEQC(fvp);
2584         ASSERT_VOP_IN_SEQC(tdvp);
2585         if (tvp != NULL)
2586                 ASSERT_VOP_IN_SEQC(tvp);
2587
2588         cache_purge(fvp);
2589         if (tvp != NULL) {
2590                 cache_purge(tvp);
2591                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2592                     ("%s: lingering negative entry", __func__));
2593         } else {
2594                 cache_remove_cnp(tdvp, tcnp);
2595         }
2596 }
2597
2598 /*
2599  * Flush all entries referencing a particular filesystem.
2600  */
2601 void
2602 cache_purgevfs(struct mount *mp)
2603 {
2604         struct vnode *vp, *mvp;
2605
2606         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2607         /*
2608          * Somewhat wasteful iteration over all vnodes. Would be better to
2609          * support filtering and avoid the interlock to begin with.
2610          */
2611         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2612                 if (!cache_has_entries(vp)) {
2613                         VI_UNLOCK(vp);
2614                         continue;
2615                 }
2616                 vholdl(vp);
2617                 VI_UNLOCK(vp);
2618                 cache_purge(vp);
2619                 vdrop(vp);
2620         }
2621 }
2622
2623 /*
2624  * Perform canonical checks and cache lookup and pass on to filesystem
2625  * through the vop_cachedlookup only if needed.
2626  */
2627
2628 int
2629 vfs_cache_lookup(struct vop_lookup_args *ap)
2630 {
2631         struct vnode *dvp;
2632         int error;
2633         struct vnode **vpp = ap->a_vpp;
2634         struct componentname *cnp = ap->a_cnp;
2635         int flags = cnp->cn_flags;
2636
2637         *vpp = NULL;
2638         dvp = ap->a_dvp;
2639
2640         if (dvp->v_type != VDIR)
2641                 return (ENOTDIR);
2642
2643         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2644             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2645                 return (EROFS);
2646
2647         error = vn_dir_check_exec(dvp, cnp);
2648         if (error != 0)
2649                 return (error);
2650
2651         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2652         if (error == 0)
2653                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2654         if (error == -1)
2655                 return (0);
2656         return (error);
2657 }
2658
2659 /* Implementation of the getcwd syscall. */
2660 int
2661 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2662 {
2663         char *buf, *retbuf;
2664         size_t buflen;
2665         int error;
2666
2667         buflen = uap->buflen;
2668         if (__predict_false(buflen < 2))
2669                 return (EINVAL);
2670         if (buflen > MAXPATHLEN)
2671                 buflen = MAXPATHLEN;
2672
2673         buf = uma_zalloc(namei_zone, M_WAITOK);
2674         error = vn_getcwd(buf, &retbuf, &buflen);
2675         if (error == 0)
2676                 error = copyout(retbuf, uap->buf, buflen);
2677         uma_zfree(namei_zone, buf);
2678         return (error);
2679 }
2680
2681 int
2682 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2683 {
2684         struct pwd *pwd;
2685         int error;
2686
2687         vfs_smr_enter();
2688         pwd = pwd_get_smr();
2689         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2690             buflen, false, 0);
2691         VFS_SMR_ASSERT_NOT_ENTERED();
2692         if (error < 0) {
2693                 pwd = pwd_hold(curthread);
2694                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2695                     retbuf, buflen);
2696                 pwd_drop(pwd);
2697         }
2698
2699 #ifdef KTRACE
2700         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2701                 ktrnamei(*retbuf);
2702 #endif
2703         return (error);
2704 }
2705
2706 static int
2707 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2708     size_t size, int flags, enum uio_seg pathseg)
2709 {
2710         struct nameidata nd;
2711         char *retbuf, *freebuf;
2712         int error;
2713
2714         if (flags != 0)
2715                 return (EINVAL);
2716         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2717             pathseg, path, fd, &cap_fstat_rights, td);
2718         if ((error = namei(&nd)) != 0)
2719                 return (error);
2720         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2721         if (error == 0) {
2722                 error = copyout(retbuf, buf, size);
2723                 free(freebuf, M_TEMP);
2724         }
2725         NDFREE(&nd, 0);
2726         return (error);
2727 }
2728
2729 int
2730 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2731 {
2732
2733         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2734             uap->flags, UIO_USERSPACE));
2735 }
2736
2737 /*
2738  * Retrieve the full filesystem path that correspond to a vnode from the name
2739  * cache (if available)
2740  */
2741 int
2742 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2743 {
2744         struct pwd *pwd;
2745         char *buf;
2746         size_t buflen;
2747         int error;
2748
2749         if (__predict_false(vp == NULL))
2750                 return (EINVAL);
2751
2752         buflen = MAXPATHLEN;
2753         buf = malloc(buflen, M_TEMP, M_WAITOK);
2754         vfs_smr_enter();
2755         pwd = pwd_get_smr();
2756         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, false, 0);
2757         VFS_SMR_ASSERT_NOT_ENTERED();
2758         if (error < 0) {
2759                 pwd = pwd_hold(curthread);
2760                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2761                 pwd_drop(pwd);
2762         }
2763         if (error == 0)
2764                 *freebuf = buf;
2765         else
2766                 free(buf, M_TEMP);
2767         return (error);
2768 }
2769
2770 /*
2771  * This function is similar to vn_fullpath, but it attempts to lookup the
2772  * pathname relative to the global root mount point.  This is required for the
2773  * auditing sub-system, as audited pathnames must be absolute, relative to the
2774  * global root mount point.
2775  */
2776 int
2777 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2778 {
2779         char *buf;
2780         size_t buflen;
2781         int error;
2782
2783         if (__predict_false(vp == NULL))
2784                 return (EINVAL);
2785         buflen = MAXPATHLEN;
2786         buf = malloc(buflen, M_TEMP, M_WAITOK);
2787         vfs_smr_enter();
2788         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, false, 0);
2789         VFS_SMR_ASSERT_NOT_ENTERED();
2790         if (error < 0) {
2791                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2792         }
2793         if (error == 0)
2794                 *freebuf = buf;
2795         else
2796                 free(buf, M_TEMP);
2797         return (error);
2798 }
2799
2800 static struct namecache *
2801 vn_dd_from_dst(struct vnode *vp)
2802 {
2803         struct namecache *ncp;
2804
2805         cache_assert_vnode_locked(vp);
2806         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2807                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2808                         return (ncp);
2809         }
2810         return (NULL);
2811 }
2812
2813 int
2814 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2815 {
2816         struct vnode *dvp;
2817         struct namecache *ncp;
2818         struct mtx *vlp;
2819         int error;
2820
2821         vlp = VP2VNODELOCK(*vp);
2822         mtx_lock(vlp);
2823         ncp = (*vp)->v_cache_dd;
2824         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2825                 KASSERT(ncp == vn_dd_from_dst(*vp),
2826                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2827                     ncp, vn_dd_from_dst(*vp)));
2828         } else {
2829                 ncp = vn_dd_from_dst(*vp);
2830         }
2831         if (ncp != NULL) {
2832                 if (*buflen < ncp->nc_nlen) {
2833                         mtx_unlock(vlp);
2834                         vrele(*vp);
2835                         counter_u64_add(numfullpathfail4, 1);
2836                         error = ENOMEM;
2837                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2838                             vp, NULL);
2839                         return (error);
2840                 }
2841                 *buflen -= ncp->nc_nlen;
2842                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2843                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2844                     ncp->nc_name, vp);
2845                 dvp = *vp;
2846                 *vp = ncp->nc_dvp;
2847                 vref(*vp);
2848                 mtx_unlock(vlp);
2849                 vrele(dvp);
2850                 return (0);
2851         }
2852         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2853
2854         mtx_unlock(vlp);
2855         vn_lock(*vp, LK_SHARED | LK_RETRY);
2856         error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2857         vput(*vp);
2858         if (error) {
2859                 counter_u64_add(numfullpathfail2, 1);
2860                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2861                 return (error);
2862         }
2863
2864         *vp = dvp;
2865         if (VN_IS_DOOMED(dvp)) {
2866                 /* forced unmount */
2867                 vrele(dvp);
2868                 error = ENOENT;
2869                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2870                 return (error);
2871         }
2872         /*
2873          * *vp has its use count incremented still.
2874          */
2875
2876         return (0);
2877 }
2878
2879 /*
2880  * Resolve a directory to a pathname.
2881  *
2882  * The name of the directory can always be found in the namecache or fetched
2883  * from the filesystem. There is also guaranteed to be only one parent, meaning
2884  * we can just follow vnodes up until we find the root.
2885  *
2886  * The vnode must be referenced.
2887  */
2888 static int
2889 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2890     size_t *len, bool slash_prefixed, size_t addend)
2891 {
2892 #ifdef KDTRACE_HOOKS
2893         struct vnode *startvp = vp;
2894 #endif
2895         struct vnode *vp1;
2896         size_t buflen;
2897         int error;
2898
2899         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2900         VNPASS(vp->v_usecount > 0, vp);
2901
2902         buflen = *len;
2903
2904         if (!slash_prefixed) {
2905                 MPASS(*len >= 2);
2906                 buflen--;
2907                 buf[buflen] = '\0';
2908         }
2909
2910         error = 0;
2911
2912         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2913         counter_u64_add(numfullpathcalls, 1);
2914         while (vp != rdir && vp != rootvnode) {
2915                 /*
2916                  * The vp vnode must be already fully constructed,
2917                  * since it is either found in namecache or obtained
2918                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2919                  * without obtaining the vnode lock.
2920                  */
2921                 if ((vp->v_vflag & VV_ROOT) != 0) {
2922                         vn_lock(vp, LK_RETRY | LK_SHARED);
2923
2924                         /*
2925                          * With the vnode locked, check for races with
2926                          * unmount, forced or not.  Note that we
2927                          * already verified that vp is not equal to
2928                          * the root vnode, which means that
2929                          * mnt_vnodecovered can be NULL only for the
2930                          * case of unmount.
2931                          */
2932                         if (VN_IS_DOOMED(vp) ||
2933                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2934                             vp1->v_mountedhere != vp->v_mount) {
2935                                 vput(vp);
2936                                 error = ENOENT;
2937                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2938                                     error, vp, NULL);
2939                                 break;
2940                         }
2941
2942                         vref(vp1);
2943                         vput(vp);
2944                         vp = vp1;
2945                         continue;
2946                 }
2947                 if (vp->v_type != VDIR) {
2948                         vrele(vp);
2949                         counter_u64_add(numfullpathfail1, 1);
2950                         error = ENOTDIR;
2951                         SDT_PROBE3(vfs, namecache, fullpath, return,
2952                             error, vp, NULL);
2953                         break;
2954                 }
2955                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, &buflen);
2956                 if (error)
2957                         break;
2958                 if (buflen == 0) {
2959                         vrele(vp);
2960                         error = ENOMEM;
2961                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2962                             startvp, NULL);
2963                         break;
2964                 }
2965                 buf[--buflen] = '/';
2966                 slash_prefixed = true;
2967         }
2968         if (error)
2969                 return (error);
2970         if (!slash_prefixed) {
2971                 if (buflen == 0) {
2972                         vrele(vp);
2973                         counter_u64_add(numfullpathfail4, 1);
2974                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2975                             startvp, NULL);
2976                         return (ENOMEM);
2977                 }
2978                 buf[--buflen] = '/';
2979         }
2980         counter_u64_add(numfullpathfound, 1);
2981         vrele(vp);
2982
2983         *retbuf = buf + buflen;
2984         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2985         *len -= buflen;
2986         *len += addend;
2987         return (0);
2988 }
2989
/*
 * Record why a lockless reverse lookup was abandoned.  The "reason" is the
 * source line of the cache_rev_failed() invocation.
 */
static void __inline
cache_rev_failed_impl(int *reason, int line)
{
        *reason = line;
}
#define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)

/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
3006
3007 static int
3008 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3009     char **retbuf, size_t *buflen, bool slash_prefixed, size_t addend)
3010 {
3011 #ifdef KDTRACE_HOOKS
3012         struct vnode *startvp = vp;
3013 #endif
3014         struct vnode *tvp;
3015         struct mount *mp;
3016         struct namecache *ncp;
3017         size_t orig_buflen;
3018         int reason;
3019         int error;
3020 #ifdef KDTRACE_HOOKS
3021         int i;
3022 #endif
3023         seqc_t vp_seqc, tvp_seqc;
3024         u_char nc_flag;
3025
3026         VFS_SMR_ASSERT_ENTERED();
3027
3028         if (!cache_fast_revlookup) {
3029                 vfs_smr_exit();
3030                 return (-1);
3031         }
3032
3033         orig_buflen = *buflen;
3034
3035         if (!slash_prefixed) {
3036                 MPASS(*buflen >= 2);
3037                 *buflen -= 1;
3038                 buf[*buflen] = '\0';
3039         }
3040
3041         if (vp == rdir || vp == rootvnode) {
3042                 if (!slash_prefixed) {
3043                         *buflen -= 1;
3044                         buf[*buflen] = '/';
3045                 }
3046                 goto out_ok;
3047         }
3048
3049 #ifdef KDTRACE_HOOKS
3050         i = 0;
3051 #endif
3052         error = -1;
3053         ncp = NULL; /* for sdt probe down below */
3054         vp_seqc = vn_seqc_read_any(vp);
3055         if (seqc_in_modify(vp_seqc)) {
3056                 cache_rev_failed(&reason);
3057                 goto out_abort;
3058         }
3059
3060         for (;;) {
3061 #ifdef KDTRACE_HOOKS
3062                 i++;
3063 #endif
3064                 if ((vp->v_vflag & VV_ROOT) != 0) {
3065                         mp = atomic_load_ptr(&vp->v_mount);
3066                         if (mp == NULL) {
3067                                 cache_rev_failed(&reason);
3068                                 goto out_abort;
3069                         }
3070                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3071                         tvp_seqc = vn_seqc_read_any(tvp);
3072                         if (seqc_in_modify(tvp_seqc)) {
3073                                 cache_rev_failed(&reason);
3074                                 goto out_abort;
3075                         }
3076                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3077                                 cache_rev_failed(&reason);
3078                                 goto out_abort;
3079                         }
3080                         vp = tvp;
3081                         vp_seqc = tvp_seqc;
3082                         continue;
3083                 }
3084                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3085                 if (ncp == NULL) {
3086                         cache_rev_failed(&reason);
3087                         goto out_abort;
3088                 }
3089                 nc_flag = atomic_load_char(&ncp->nc_flag);
3090                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3091                         cache_rev_failed(&reason);
3092                         goto out_abort;
3093                 }
3094                 if (!cache_ncp_canuse(ncp)) {
3095                         cache_rev_failed(&reason);
3096                         goto out_abort;
3097                 }
3098                 if (ncp->nc_nlen >= *buflen) {
3099                         cache_rev_failed(&reason);
3100                         error = ENOMEM;
3101                         goto out_abort;
3102                 }
3103                 *buflen -= ncp->nc_nlen;
3104                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3105                 *buflen -= 1;
3106                 buf[*buflen] = '/';
3107                 tvp = ncp->nc_dvp;
3108                 tvp_seqc = vn_seqc_read_any(tvp);
3109                 if (seqc_in_modify(tvp_seqc)) {
3110                         cache_rev_failed(&reason);
3111                         goto out_abort;
3112                 }
3113                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3114                         cache_rev_failed(&reason);
3115                         goto out_abort;
3116                 }
3117                 vp = tvp;
3118                 vp_seqc = tvp_seqc;
3119                 if (vp == rdir || vp == rootvnode)
3120                         break;
3121         }
3122 out_ok:
3123         vfs_smr_exit();
3124         *retbuf = buf + *buflen;
3125         *buflen = orig_buflen - *buflen + addend;
3126         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3127         return (0);
3128
3129 out_abort:
3130         *buflen = orig_buflen;
3131         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3132         vfs_smr_exit();
3133         return (error);
3134 }
3135
3136 static int
3137 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3138     size_t *buflen)
3139 {
3140         size_t orig_buflen;
3141         bool slash_prefixed;
3142         int error;
3143
3144         if (*buflen < 2)
3145                 return (EINVAL);
3146
3147         orig_buflen = *buflen;
3148
3149         vref(vp);
3150         slash_prefixed = false;
3151         if (vp->v_type != VDIR) {
3152                 *buflen -= 1;
3153                 buf[*buflen] = '\0';
3154                 error = vn_vptocnp(&vp, curthread->td_ucred, buf, buflen);
3155                 if (error)
3156                         return (error);
3157                 if (*buflen == 0) {
3158                         vrele(vp);
3159                         return (ENOMEM);
3160                 }
3161                 *buflen -= 1;
3162                 buf[*buflen] = '/';
3163                 slash_prefixed = true;
3164         }
3165
3166         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, slash_prefixed,
3167             orig_buflen - *buflen));
3168 }
3169
3170 /*
3171  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3172  *
3173  * Since the namecache does not track handlings, the caller is expected to first
3174  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3175  *
3176  * Then we have 2 cases:
3177  * - if the found vnode is a directory, the path can be constructed just by
3178  *   fullowing names up the chain
3179  * - otherwise we populate the buffer with the saved name and start resolving
3180  *   from the parent
3181  */
3182 static int
3183 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3184     size_t *buflen)
3185 {
3186         char *buf, *tmpbuf;
3187         struct pwd *pwd;
3188         struct componentname *cnp;
3189         struct vnode *vp;
3190         size_t addend;
3191         int error;
3192         bool slash_prefixed;
3193         enum vtype type;
3194
3195         if (*buflen < 2)
3196                 return (EINVAL);
3197         if (*buflen > MAXPATHLEN)
3198                 *buflen = MAXPATHLEN;
3199
3200         slash_prefixed = false;
3201
3202         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3203
3204         addend = 0;
3205         vp = ndp->ni_vp;
3206         /*
3207          * Check for VBAD to work around the vp_crossmp bug in lookup().
3208          *
3209          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3210          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3211          * If the type is VDIR (like in this very case) we can skip looking
3212          * at ni_dvp in the first place. However, since vnodes get passed here
3213          * unlocked the target may transition to doomed state (type == VBAD)
3214          * before we get to evaluate the condition. If this happens, we will
3215          * populate part of the buffer and descend to vn_fullpath_dir with
3216          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3217          *
3218          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3219          * an address of a bit field, even if said field is sized to char.
3220          * Work around the problem by reading the value into a full-sized enum
3221          * and then re-reading it with atomic_load which will still prevent
3222          * the compiler from re-reading down the road.
3223          */
3224         type = vp->v_type;
3225         type = atomic_load_int(&type);
3226         if (type == VBAD) {
3227                 error = ENOENT;
3228                 goto out_bad;
3229         }
3230         if (type != VDIR) {
3231                 cnp = &ndp->ni_cnd;
3232                 addend = cnp->cn_namelen + 2;
3233                 if (*buflen < addend) {
3234                         error = ENOMEM;
3235                         goto out_bad;
3236                 }
3237                 *buflen -= addend;
3238                 tmpbuf = buf + *buflen;
3239                 tmpbuf[0] = '/';
3240                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3241                 tmpbuf[addend - 1] = '\0';
3242                 slash_prefixed = true;
3243                 vp = ndp->ni_dvp;
3244         }
3245
3246         vfs_smr_enter();
3247         pwd = pwd_get_smr();
3248         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3249             slash_prefixed, addend);
3250         VFS_SMR_ASSERT_NOT_ENTERED();
3251         if (error < 0) {
3252                 pwd = pwd_hold(curthread);
3253                 vref(vp);
3254                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3255                     slash_prefixed, addend);
3256                 pwd_drop(pwd);
3257                 if (error != 0)
3258                         goto out_bad;
3259         }
3260
3261         *freebuf = buf;
3262
3263         return (0);
3264 out_bad:
3265         free(buf, M_TEMP);
3266         return (error);
3267 }
3268
3269 struct vnode *
3270 vn_dir_dd_ino(struct vnode *vp)
3271 {
3272         struct namecache *ncp;
3273         struct vnode *ddvp;
3274         struct mtx *vlp;
3275         enum vgetstate vs;
3276
3277         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3278         vlp = VP2VNODELOCK(vp);
3279         mtx_lock(vlp);
3280         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3281                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3282                         continue;
3283                 ddvp = ncp->nc_dvp;
3284                 vs = vget_prep(ddvp);
3285                 mtx_unlock(vlp);
3286                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3287                         return (NULL);
3288                 return (ddvp);
3289         }
3290         mtx_unlock(vlp);
3291         return (NULL);
3292 }
3293
3294 int
3295 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3296 {
3297         struct namecache *ncp;
3298         struct mtx *vlp;
3299         int l;
3300
3301         vlp = VP2VNODELOCK(vp);
3302         mtx_lock(vlp);
3303         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3304                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3305                         break;
3306         if (ncp == NULL) {
3307                 mtx_unlock(vlp);
3308                 return (ENOENT);
3309         }
3310         l = min(ncp->nc_nlen, buflen - 1);
3311         memcpy(buf, ncp->nc_name, l);
3312         mtx_unlock(vlp);
3313         buf[l] = '\0';
3314         return (0);
3315 }
3316
3317 /*
3318  * This function updates path string to vnode's full global path
3319  * and checks the size of the new path string against the pathlen argument.
3320  *
3321  * Requires a locked, referenced vnode.
3322  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3323  *
3324  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3325  * because it falls back to the ".." lookup if the namecache lookup fails.
3326  */
3327 int
3328 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3329     u_int pathlen)
3330 {
3331         struct nameidata nd;
3332         struct vnode *vp1;
3333         char *rpath, *fbuf;
3334         int error;
3335
3336         ASSERT_VOP_ELOCKED(vp, __func__);
3337
3338         /* Construct global filesystem path from vp. */
3339         VOP_UNLOCK(vp);
3340         error = vn_fullpath_global(vp, &rpath, &fbuf);
3341
3342         if (error != 0) {
3343                 vrele(vp);
3344                 return (error);
3345         }
3346
3347         if (strlen(rpath) >= pathlen) {
3348                 vrele(vp);
3349                 error = ENAMETOOLONG;
3350                 goto out;
3351         }
3352
3353         /*
3354          * Re-lookup the vnode by path to detect a possible rename.
3355          * As a side effect, the vnode is relocked.
3356          * If vnode was renamed, return ENOENT.
3357          */
3358         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3359             UIO_SYSSPACE, path, td);
3360         error = namei(&nd);
3361         if (error != 0) {
3362                 vrele(vp);
3363                 goto out;
3364         }
3365         NDFREE(&nd, NDF_ONLY_PNBUF);
3366         vp1 = nd.ni_vp;
3367         vrele(vp);
3368         if (vp1 == vp)
3369                 strcpy(path, rpath);
3370         else {
3371                 vput(vp1);
3372                 error = ENOENT;
3373         }
3374
3375 out:
3376         free(fbuf, M_TEMP);
3377         return (error);
3378 }
3379
3380 #ifdef DDB
3381 static void
3382 db_print_vpath(struct vnode *vp)
3383 {
3384
3385         while (vp != NULL) {
3386                 db_printf("%p: ", vp);
3387                 if (vp == rootvnode) {
3388                         db_printf("/");
3389                         vp = NULL;
3390                 } else {
3391                         if (vp->v_vflag & VV_ROOT) {
3392                                 db_printf("<mount point>");
3393                                 vp = vp->v_mount->mnt_vnodecovered;
3394                         } else {
3395                                 struct namecache *ncp;
3396                                 char *ncn;
3397                                 int i;
3398
3399                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3400                                 if (ncp != NULL) {
3401                                         ncn = ncp->nc_name;
3402                                         for (i = 0; i < ncp->nc_nlen; i++)
3403                                                 db_printf("%c", *ncn++);
3404                                         vp = ncp->nc_dvp;
3405                                 } else {
3406                                         vp = NULL;
3407                                 }
3408                         }
3409                 }
3410                 db_printf("\n");
3411         }
3412
3413         return;
3414 }
3415
3416 DB_SHOW_COMMAND(vpath, db_show_vpath)
3417 {
3418         struct vnode *vp;
3419
3420         if (!have_addr) {
3421                 db_printf("usage: show vpath <struct vnode *>\n");
3422                 return;
3423         }
3424
3425         vp = (struct vnode *)addr;
3426         db_print_vpath(vp);
3427 }
3428
3429 #endif
3430
/*
 * Run-time tunable exported read-write as sysctl vfs.cache_fast_lookup,
 * enabled by default.
 * NOTE(review): presumably gates the fast path lookup implemented by the
 * cache_fpl_* code below -- confirm against its users.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * NOTE(review): looks like an internal sentinel for a failed fast path
 * lookup, chosen not to collide with errno values -- confirm against users.
 */
#define CACHE_FPL_FAILED        -2020
3436
3437 static void
3438 cache_fpl_cleanup_cnp(struct componentname *cnp)
3439 {
3440
3441         uma_zfree(namei_zone, cnp->cn_pnbuf);
3442 #ifdef DIAGNOSTIC
3443         cnp->cn_pnbuf = NULL;
3444         cnp->cn_nameptr = NULL;
3445 #endif
3446 }
3447
3448 static void
3449 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3450 {
3451         struct componentname *cnp;
3452
3453         cnp = &ndp->ni_cnd;
3454         while (*(cnp->cn_nameptr) == '/') {
3455                 cnp->cn_nameptr++;
3456                 ndp->ni_pathlen--;
3457         }
3458
3459         *dpp = ndp->ni_rootdir;
3460 }
3461
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore().
 */
struct nameidata_saved {
	long cn_namelen;	/* copy of ni_cnd.cn_namelen */
	char *cn_nameptr;	/* copy of ni_cnd.cn_nameptr */
	size_t ni_pathlen;	/* copy of ni_pathlen */
	int cn_flags;		/* copy of ni_cnd.cn_flags */
};
3472
/*
 * State of a single lockless ("fast path") lookup attempt, passed to all
 * cache_fpl_* and cache_fplookup_* routines.
 */
struct cache_fpl {
	struct nameidata *ndp;		/* lookup being serviced */
	struct componentname *cnp;	/* component name being looked up */
	struct pwd *pwd;		/* held with pwd_hold_smr() on fallback */
	struct vnode *dvp;		/* directory currently being searched */
	struct vnode *tvp;		/* vnode found for the current component */
	seqc_t dvp_seqc;		/* sequence counter sampled for dvp */
	seqc_t tvp_seqc;		/* sequence counter sampled for tvp */
	struct nameidata_saved snd;	/* checkpoint taken at lookup start */
	int line;			/* source line which last set status */
	enum cache_fpl_status status:8;	/* unset, aborted, partial or handled */
	bool in_smr;			/* maintained by cache_fpl_smr_* macros */
	bool fsearch;			/* as reported by fgetvp_lookup_smr() */
};
3487
3488 static void
3489 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3490 {
3491
3492         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3493         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3494         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3495         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3496 }
3497
3498 static void
3499 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3500 {
3501
3502         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3503         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3504         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3505         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3506 }
3507
/*
 * Assertions checking both the in_smr flag kept in struct cache_fpl and
 * the corresponding VFS_SMR_ASSERT_* condition. Without INVARIANTS they
 * expand to empty statements.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3523
/*
 * Enter the VFS SMR section and note it in fpl->in_smr. Unlike
 * cache_fpl_smr_enter() this variant does not assert the previous value
 * of the flag.
 */
#define cache_fpl_smr_enter_initial(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Enter the VFS SMR section; asserts the section was not already entered
 * according to fpl->in_smr.
 */
#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

/*
 * Exit the VFS SMR section; asserts the section was entered according to
 * fpl->in_smr and clears the flag.
 */
#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
3543
3544 static int
3545 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3546 {
3547
3548         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3549                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3550                     ("%s: converting to abort from %d at %d, set at %d\n",
3551                     __func__, fpl->status, line, fpl->line));
3552         }
3553         fpl->status = CACHE_FPL_STATUS_ABORTED;
3554         fpl->line = line;
3555         return (CACHE_FPL_FAILED);
3556 }
3557
3558 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3559
3560 static int
3561 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3562 {
3563
3564         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3565             ("%s: setting to partial at %d, but already set to %d at %d\n",
3566             __func__, line, fpl->status, fpl->line));
3567         cache_fpl_smr_assert_entered(fpl);
3568         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3569         fpl->line = line;
3570         return (CACHE_FPL_FAILED);
3571 }
3572
3573 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3574
3575 static int
3576 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3577 {
3578
3579         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3580             ("%s: setting to handled at %d, but already set to %d at %d\n",
3581             __func__, line, fpl->status, fpl->line));
3582         cache_fpl_smr_assert_not_entered(fpl);
3583         MPASS(error != CACHE_FPL_FAILED);
3584         fpl->status = CACHE_FPL_STATUS_HANDLED;
3585         fpl->line = line;
3586         return (error);
3587 }
3588
3589 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3590
3591 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3592         (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3593          SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3594
3595 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3596         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3597
3598 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3599     "supported and internal flags overlap");
3600
3601 static bool
3602 cache_fpl_islastcn(struct nameidata *ndp)
3603 {
3604
3605         return (*ndp->ni_next == 0);
3606 }
3607
3608 static bool
3609 cache_fpl_isdotdot(struct componentname *cnp)
3610 {
3611
3612         if (cnp->cn_namelen == 2 &&
3613             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3614                 return (true);
3615         return (false);
3616 }
3617
3618 static bool
3619 cache_can_fplookup(struct cache_fpl *fpl)
3620 {
3621         struct nameidata *ndp;
3622         struct componentname *cnp;
3623         struct thread *td;
3624
3625         ndp = fpl->ndp;
3626         cnp = fpl->cnp;
3627         td = cnp->cn_thread;
3628
3629         if (!cache_fast_lookup) {
3630                 cache_fpl_aborted(fpl);
3631                 return (false);
3632         }
3633 #ifdef MAC
3634         if (mac_vnode_check_lookup_enabled()) {
3635                 cache_fpl_aborted(fpl);
3636                 return (false);
3637         }
3638 #endif
3639         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3640                 cache_fpl_aborted(fpl);
3641                 return (false);
3642         }
3643         if (IN_CAPABILITY_MODE(td)) {
3644                 cache_fpl_aborted(fpl);
3645                 return (false);
3646         }
3647         if (AUDITING_TD(td)) {
3648                 cache_fpl_aborted(fpl);
3649                 return (false);
3650         }
3651         if (ndp->ni_startdir != NULL) {
3652                 cache_fpl_aborted(fpl);
3653                 return (false);
3654         }
3655         return (true);
3656 }
3657
3658 static int
3659 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3660 {
3661         struct nameidata *ndp;
3662         int error;
3663         bool fsearch;
3664
3665         ndp = fpl->ndp;
3666         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3667         if (__predict_false(error != 0)) {
3668                 cache_fpl_smr_exit(fpl);
3669                 return (cache_fpl_aborted(fpl));
3670         }
3671         fpl->fsearch = fsearch;
3672         return (0);
3673 }
3674
3675 static bool
3676 cache_fplookup_vnode_supported(struct vnode *vp)
3677 {
3678
3679         return (vp->v_type != VLNK);
3680 }
3681
3682 static int __noinline
3683 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3684     uint32_t hash)
3685 {
3686         struct componentname *cnp;
3687         struct vnode *dvp;
3688
3689         cnp = fpl->cnp;
3690         dvp = fpl->dvp;
3691
3692         cache_fpl_smr_exit(fpl);
3693         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3694                 return (cache_fpl_handled(fpl, ENOENT));
3695         else
3696                 return (cache_fpl_aborted(fpl));
3697 }
3698
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Must be called with the SMR section entered; it is exited on every path.
 * On success returns 0 with a hold on pwd and a reference on dvp, with
 * the nameidata restored from the checkpoint and ni_startdir set to dvp.
 * On failure nothing is kept and the lookup is aborted.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	enum vgetstate dvs;
	struct vnode *dvp;
	struct pwd *pwd;
	seqc_t dvp_seqc;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	pwd = fpl->pwd;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	if (!pwd_hold_smr(pwd)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_aborted(fpl));
	}

	/* Both prep calls are made before leaving the SMR section. */
	dvs = vget_prep_smr(dvp);
	cache_fpl_smr_exit(fpl);
	if (__predict_false(dvs == VGET_NONE)) {
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * With the reference obtained check that dvp has not changed since
	 * its sequence counter was sampled.
	 */
	vget_finish_ref(dvp, dvs);
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_restore(fpl, &fpl->snd);

	/*
	 * Set up the starting directory and the internal flags derived
	 * from the restored state.
	 */
	ndp->ni_startdir = dvp;
	cnp->cn_flags |= MAKEENTRY;
	if (cache_fpl_islastcn(ndp))
		cnp->cn_flags |= ISLASTCN;
	if (cache_fpl_isdotdot(cnp))
		cnp->cn_flags |= ISDOTDOT;

	return (0);
}
3748
3749 static int
3750 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3751 {
3752         struct componentname *cnp;
3753         struct vnode *tvp;
3754         seqc_t tvp_seqc;
3755         int error, lkflags;
3756
3757         cnp = fpl->cnp;
3758         tvp = fpl->tvp;
3759         tvp_seqc = fpl->tvp_seqc;
3760
3761         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3762                 lkflags = LK_SHARED;
3763                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3764                         lkflags = LK_EXCLUSIVE;
3765                 error = vget_finish(tvp, lkflags, tvs);
3766                 if (__predict_false(error != 0)) {
3767                         return (cache_fpl_aborted(fpl));
3768                 }
3769         } else {
3770                 vget_finish_ref(tvp, tvs);
3771         }
3772
3773         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3774                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3775                         vput(tvp);
3776                 else
3777                         vrele(tvp);
3778                 return (cache_fpl_aborted(fpl));
3779         }
3780
3781         return (cache_fpl_handled(fpl, 0));
3782 }
3783
3784 /*
3785  * They want to possibly modify the state of the namecache.
3786  *
3787  * Don't try to match the API contract, just leave.
3788  * TODO: this leaves scalability on the table
3789  */
3790 static int
3791 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3792 {
3793         struct componentname *cnp;
3794
3795         cnp = fpl->cnp;
3796         MPASS(cnp->cn_nameiop != LOOKUP);
3797         return (cache_fpl_partial(fpl));
3798 }
3799
/*
 * Final step of the lookup when the caller asked for the parent directory
 * as well (LOCKPARENT or WANTPARENT).
 *
 * The parent is locked exclusively for LOCKPARENT and only referenced
 * otherwise. The child is acquired with cache_fplookup_final_child().
 * Any failure releases what was obtained so far and aborts the lookup.
 */
static int __noinline
cache_fplookup_final_withparent(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	enum vgetstate dvs, tvs;
	struct vnode *dvp, *tvp;
	seqc_t dvp_seqc;
	int error;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;
	tvp = fpl->tvp;

	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);

	/*
	 * This is less efficient than it can be for simplicity.
	 */
	dvs = vget_prep_smr(dvp);
	if (__predict_false(dvs == VGET_NONE)) {
		/*
		 * NOTE(review): this path returns with the SMR section still
		 * entered; presumably the caller exits it for aborted
		 * lookups -- confirm.
		 */
		return (cache_fpl_aborted(fpl));
	}
	tvs = vget_prep_smr(tvp);
	if (__predict_false(tvs == VGET_NONE)) {
		cache_fpl_smr_exit(fpl);
		vget_abort(dvp, dvs);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_smr_exit(fpl);

	if ((cnp->cn_flags & LOCKPARENT) != 0) {
		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
		if (__predict_false(error != 0)) {
			vget_abort(tvp, tvs);
			return (cache_fpl_aborted(fpl));
		}
	} else {
		vget_finish_ref(dvp, dvs);
	}

	/* Validate the parent before committing to the child. */
	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vget_abort(tvp, tvs);
		if ((cnp->cn_flags & LOCKPARENT) != 0)
			vput(dvp);
		else
			vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	error = cache_fplookup_final_child(fpl, tvs);
	if (__predict_false(error != 0)) {
		/* The child routine already aborted, only drop the parent. */
		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
		if ((cnp->cn_flags & LOCKPARENT) != 0)
			vput(dvp);
		else
			vrele(dvp);
		return (error);
	}

	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
	return (0);
}
3864
3865 static int
3866 cache_fplookup_final(struct cache_fpl *fpl)
3867 {
3868         struct componentname *cnp;
3869         enum vgetstate tvs;
3870         struct vnode *dvp, *tvp;
3871         seqc_t dvp_seqc;
3872
3873         cnp = fpl->cnp;
3874         dvp = fpl->dvp;
3875         dvp_seqc = fpl->dvp_seqc;
3876         tvp = fpl->tvp;
3877
3878         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3879
3880         if (cnp->cn_nameiop != LOOKUP) {
3881                 return (cache_fplookup_final_modifying(fpl));
3882         }
3883
3884         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3885                 return (cache_fplookup_final_withparent(fpl));
3886
3887         tvs = vget_prep_smr(tvp);
3888         if (__predict_false(tvs == VGET_NONE)) {
3889                 return (cache_fpl_partial(fpl));
3890         }
3891
3892         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3893                 cache_fpl_smr_exit(fpl);
3894                 vget_abort(tvp, tvs);
3895                 return (cache_fpl_aborted(fpl));
3896         }
3897
3898         cache_fpl_smr_exit(fpl);
3899         return (cache_fplookup_final_child(fpl, tvs));
3900 }
3901
3902 static int __noinline
3903 cache_fplookup_dot(struct cache_fpl *fpl)
3904 {
3905         struct vnode *dvp;
3906
3907         dvp = fpl->dvp;
3908
3909         fpl->tvp = dvp;
3910         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3911         if (seqc_in_modify(fpl->tvp_seqc)) {
3912                 return (cache_fpl_aborted(fpl));
3913         }
3914
3915         counter_u64_add(dothits, 1);
3916         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3917
3918         return (0);
3919 }
3920
/*
 * Handle the ".." component.
 *
 * On success returns 0 with fpl->tvp and fpl->tvp_seqc set to the vnode
 * resolved for ".." and its sampled sequence counter. Otherwise the
 * lookup is marked as aborted or partial.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp;
	struct prison *pr;
	u_char nc_flag;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	dvp = fpl->dvp;

	/*
	 * XXX this is racy the same way regular lookup is
	 */
	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
	    pr = pr->pr_parent)
		if (dvp == pr->pr_root)
			break;

	/*
	 * At the lookup root, the top directory, the global root or the
	 * root of a prison in the credential's prison chain ".." resolves
	 * to the directory itself.
	 */
	if (dvp == ndp->ni_rootdir ||
	    dvp == ndp->ni_topdir ||
	    dvp == rootvnode ||
	    pr != NULL) {
		fpl->tvp = dvp;
		fpl->tvp_seqc = vn_seqc_read_any(dvp);
		if (seqc_in_modify(fpl->tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		return (0);
	}

	if ((dvp->v_vflag & VV_ROOT) != 0) {
		/*
		 * TODO
		 * The opposite of climb mount is needed here.
		 */
		return (cache_fpl_aborted(fpl));
	}

	ncp = atomic_load_ptr(&dvp->v_cache_dd);
	if (ncp == NULL) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * The entry is either a ".." entry, in which case the parent is its
	 * target (unless the entry is negative), or a regular entry whose
	 * directory is the parent.
	 */
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_ISDOTDOT) != 0) {
		if ((nc_flag & NCF_NEGATIVE) != 0)
			return (cache_fpl_aborted(fpl));
		fpl->tvp = ncp->nc_vp;
	} else {
		fpl->tvp = ncp->nc_dvp;
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		return (cache_fpl_aborted(fpl));
	}

	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(dotdothits, 1);
	return (0);
}
3989
/*
 * Handle a hit on a negative entry.
 *
 * Operations other than LOOKUP, entries which cannot be used and
 * whiteout entries mark the lookup as partial. Otherwise the lookup is
 * completed with ENOENT, either directly or by way of
 * cache_fplookup_negative_promote() when cache_neg_hit_prep() asked for
 * a promotion.
 */
static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
	u_char nc_flag;
	bool neg_promote;

	nc_flag = atomic_load_char(&ncp->nc_flag);
	MPASS((nc_flag & NCF_NEGATIVE) != 0);
	/*
	 * If they want to create an entry we need to replace this one.
	 */
	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
		/*
		 * TODO
		 * This should call something similar to
		 * cache_fplookup_final_modifying.
		 */
		return (cache_fpl_partial(fpl));
	}
	/* Every bail out past this point has to undo the hit preparation. */
	neg_promote = cache_neg_hit_prep(ncp);
	if (__predict_false(!cache_ncp_canuse(ncp))) {
		cache_neg_hit_abort(ncp);
		return (cache_fpl_partial(fpl));
	}
	if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
		cache_neg_hit_abort(ncp);
		return (cache_fpl_partial(fpl));
	}
	if (neg_promote) {
		return (cache_fplookup_negative_promote(fpl, ncp, hash));
	}
	cache_neg_hit_finish(ncp);
	cache_fpl_smr_exit(fpl);
	return (cache_fpl_handled(fpl, ENOENT));
}
4025
/*
 * Resolve the current path component in the directory fpl->dvp using the
 * name cache.
 *
 * On a usable positive hit returns 0 with fpl->tvp and fpl->tvp_seqc
 * filled in. "." and negative entries are delegated to dedicated
 * routines. Anything else marks the lookup as partial.
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp, *tvp;
	u_char nc_flag;
	uint32_t hash;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
		return (cache_fplookup_dot(fpl));
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

	/* Find an entry matching the directory, the length and the name. */
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * actual lookup. Should there be nothing with this name a negative
	 * entry will be created.
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	/* The target and the flags are loaded before the usability check. */
	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		return (cache_fplookup_neg(fpl, ncp, hash));
	}

	if (__predict_false(!cache_ncp_canuse(ncp))) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}
4083
4084 static bool
4085 cache_fplookup_mp_supported(struct mount *mp)
4086 {
4087
4088         if (mp == NULL)
4089                 return (false);
4090         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4091                 return (false);
4092         return (true);
4093 }
4094
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * On success returns 0 with fpl->tvp and fpl->tvp_seqc updated to the
 * vnode reached at the top of the stack. Any failure marks the lookup
 * as partial.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;

	VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		/*
		 * Enter the critical section of the next mount before
		 * leaving the one of the previous mount.
		 */
		if (!vfs_op_thread_enter_crit(mp)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit_crit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit_crit(prev_mp);
		/* Verify the covered vnode did not change in the meantime. */
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		/* Continue with the root vnode of the mount. */
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit_crit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit_crit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
4166
4167 static bool
4168 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4169 {
4170         struct mount *mp;
4171         struct vnode *vp;
4172
4173         vp = fpl->tvp;
4174
4175         /*
4176          * Hack: while this is a union, the pointer tends to be NULL so save on
4177          * a branch.
4178          */
4179         mp = atomic_load_ptr(&vp->v_mountedhere);
4180         if (mp == NULL)
4181                 return (false);
4182         if (vp->v_type == VDIR)
4183                 return (true);
4184         return (false);
4185 }
4186
/*
 * Parse the path.
 *
 * The code was originally copy-pasted from regular lookup and despite
 * clean ups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 *
 * On success returns 0 with cn_namelen covering the current component
 * and ni_next pointing past it. A component longer than NAME_MAX
 * completes the lookup with ENAMETOOLONG. Trailing slashes and an empty
 * name mark the lookup as partial.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	ndp->ni_pathlen -= cnp->cn_namelen;
	/* An unsigned underflow shows up as a value above PATH_MAX. */
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null.  This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes.  Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if fast path lookup was to
			 * abort we would have to either restore it or convey
			 * the flag. Since this is a corner case just ignore
			 * it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}
4266
4267 static void
4268 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4269 {
4270         struct nameidata *ndp;
4271         struct componentname *cnp;
4272
4273         ndp = fpl->ndp;
4274         cnp = fpl->cnp;
4275
4276         cnp->cn_nameptr = ndp->ni_next;
4277         while (*cnp->cn_nameptr == '/') {
4278                 cnp->cn_nameptr++;
4279                 ndp->ni_pathlen--;
4280         }
4281 }
4282
/*
 * See the API contract for VOP_FPLOOKUP_VEXEC.
 *
 * Called with the non-zero error returned by VOP_FPLOOKUP_VEXEC for
 * fpl->dvp. Sets the lookup status (aborted, partial or handled) and
 * returns the value to pass up.
 */
static int __noinline
cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
{
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	/*
	 * Hack: they may be looking up foo/bar, where foo is a
	 * regular file. In such a case we need to return ENOTDIR,
	 * but we may happen to get here with a different error.
	 */
	if (dvp->v_type != VDIR) {
		/*
		 * The check here is predominantly to catch
		 * EOPNOTSUPP from dead_vnodeops. If the vnode
		 * gets doomed past this point it is going to
		 * fail seqc verification.
		 */
		if (VN_IS_DOOMED(dvp)) {
			return (cache_fpl_aborted(fpl));
		}
		error = ENOTDIR;
	}

	/*
	 * Hack: handle O_SEARCH.
	 *
	 * Open Group Base Specifications Issue 7, 2018 edition states:
	 * If the access mode of the open file description associated with the
	 * file descriptor is not O_SEARCH, the function shall check whether
	 * directory searches are permitted using the current permissions of
	 * the directory underlying the file descriptor. If the access mode is
	 * O_SEARCH, the function shall not perform the check.
	 *
	 * Regular lookup tests for the NOEXECCHECK flag for every path
	 * component to decide whether to do the permission check. However,
	 * since most lookups never have the flag (and when they do it is only
	 * present for the first path component), lockless lookup only acts on
	 * it if there is a permission problem. Here the flag is represented
	 * with a boolean so that we don't have to clear it on the way out.
	 *
	 * For simplicity this always aborts.
	 * TODO: check if this is the first lookup and ignore the permission
	 * problem. Note the flag has to survive fallback (if it happens to be
	 * performed).
	 */
	if (fpl->fsearch) {
		return (cache_fpl_aborted(fpl));
	}

	/*
	 * In both cases the error is only trusted if the directory did not
	 * change since its sequence counter was sampled.
	 */
	switch (error) {
	case EAGAIN:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			/* Note the EAGAIN itself is what gets returned. */
			cache_fpl_partial(fpl);
		}
		break;
	default:
		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
		} else {
			cache_fpl_smr_exit(fpl);
			cache_fpl_handled(fpl, error);
		}
		break;
	}
	return (error);
}
4360
4361 static int
4362 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4363 {
4364         struct nameidata *ndp;
4365         struct componentname *cnp;
4366         struct mount *mp;
4367         int error;
4368
4369         error = CACHE_FPL_FAILED;
4370         ndp = fpl->ndp;
4371         cnp = fpl->cnp;
4372
4373         cache_fpl_checkpoint(fpl, &fpl->snd);
4374
4375         fpl->dvp = dvp;
4376         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4377         if (seqc_in_modify(fpl->dvp_seqc)) {
4378                 cache_fpl_aborted(fpl);
4379                 goto out;
4380         }
4381         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4382         if (!cache_fplookup_mp_supported(mp)) {
4383                 cache_fpl_aborted(fpl);
4384                 goto out;
4385         }
4386
4387         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4388
4389         for (;;) {
4390                 error = cache_fplookup_parse(fpl);
4391                 if (__predict_false(error != 0)) {
4392                         break;
4393                 }
4394
4395                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4396
4397                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4398                 if (__predict_false(error != 0)) {
4399                         error = cache_fplookup_failed_vexec(fpl, error);
4400                         break;
4401                 }
4402
4403                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4404                         error = cache_fplookup_dotdot(fpl);
4405                         if (__predict_false(error != 0)) {
4406                                 break;
4407                         }
4408                 } else {
4409                         error = cache_fplookup_next(fpl);
4410                         if (__predict_false(error != 0)) {
4411                                 break;
4412                         }
4413
4414                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4415
4416                         if (cache_fplookup_need_climb_mount(fpl)) {
4417                                 error = cache_fplookup_climb_mount(fpl);
4418                                 if (__predict_false(error != 0)) {
4419                                         break;
4420                                 }
4421                         }
4422                 }
4423
4424                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4425
4426                 if (cache_fpl_islastcn(ndp)) {
4427                         error = cache_fplookup_final(fpl);
4428                         break;
4429                 }
4430
4431                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4432                         error = cache_fpl_aborted(fpl);
4433                         break;
4434                 }
4435
4436                 fpl->dvp = fpl->tvp;
4437                 fpl->dvp_seqc = fpl->tvp_seqc;
4438
4439                 cache_fplookup_parse_advance(fpl);
4440                 cache_fpl_checkpoint(fpl, &fpl->snd);
4441         }
4442 out:
4443         switch (fpl->status) {
4444         case CACHE_FPL_STATUS_UNSET:
4445                 __assert_unreachable();
4446                 break;
4447         case CACHE_FPL_STATUS_PARTIAL:
4448                 cache_fpl_smr_assert_entered(fpl);
4449                 return (cache_fplookup_partial_setup(fpl));
4450         case CACHE_FPL_STATUS_ABORTED:
4451                 if (fpl->in_smr)
4452                         cache_fpl_smr_exit(fpl);
4453                 return (CACHE_FPL_FAILED);
4454         case CACHE_FPL_STATUS_HANDLED:
4455                 MPASS(error != CACHE_FPL_FAILED);
4456                 cache_fpl_smr_assert_not_entered(fpl);
4457                 if (__predict_false(error != 0)) {
4458                         ndp->ni_dvp = NULL;
4459                         ndp->ni_vp = NULL;
4460                         cache_fpl_cleanup_cnp(cnp);
4461                         return (error);
4462                 }
4463                 ndp->ni_dvp = fpl->dvp;
4464                 ndp->ni_vp = fpl->tvp;
4465                 if (cnp->cn_flags & SAVENAME)
4466                         cnp->cn_flags |= HASBUF;
4467                 else
4468                         cache_fpl_cleanup_cnp(cnp);
4469                 return (error);
4470         }
4471 }
4472
4473 /*
4474  * Fast path lookup protected with SMR and sequence counters.
4475  *
4476  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4477  *
4478  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4479  * outlined below.
4480  *
4481  * Traditional vnode lookup conceptually looks like this:
4482  *
4483  * vn_lock(current);
4484  * for (;;) {
4485  *      next = find();
4486  *      vn_lock(next);
4487  *      vn_unlock(current);
4488  *      current = next;
4489  *      if (last)
4490  *          break;
4491  * }
4492  * return (current);
4493  *
4494  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4495  * any modifications thanks to holding respective locks.
4496  *
4497  * The same guarantee can be provided with a combination of safe memory
4498  * reclamation and sequence counters instead. If all operations which affect
4499  * the relationship between the current vnode and the one we are looking for
4500  * also modify the counter, we can verify whether all the conditions held as
4501  * we made the jump. This includes things like permissions, mount points etc.
4502  * Counter modification is provided by enclosing relevant places in
4503  * vn_seqc_write_begin()/end() calls.
4504  *
4505  * Thus this translates to:
4506  *
4507  * vfs_smr_enter();
4508  * dvp_seqc = seqc_read_any(dvp);
4509  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4510  *     abort();
4511  * for (;;) {
4512  *      tvp = find();
4513  *      tvp_seqc = seqc_read_any(tvp);
4514  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4515  *          abort();
4516  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4517  *          abort();
4518  *      dvp = tvp; // we know nothing of importance has changed
4519  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4520  *      if (last)
4521  *          break;
4522  * }
4523  * vget(); // secure the vnode
4524  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4525  *          abort();
4526  * // at this point we know nothing has changed for any parent<->child pair
4527  * // as they were crossed during the lookup, meaning we matched the guarantee
4528  * // of the locked variant
4529  * return (tvp);
4530  *
4531  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4532  * - they are called while within vfs_smr protection which they must never exit
4533  * - EAGAIN can be returned to denote checking could not be performed, it is
4534  *   always valid to return it
4535  * - if the sequence counter has not changed the result must be valid
4536  * - if the sequence counter has changed both false positives and false negatives
4537  *   are permitted (since the result will be rejected later)
4538  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4539  *
4540  * Caveats to watch out for:
4541  * - vnodes are passed unlocked and unreferenced with nothing stopping
4542  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4543  *   to use atomic_load_ptr to fetch it.
4544  * - the aforementioned object can also get freed, meaning absent other means it
4545  *   should be protected with vfs_smr
4546  * - either safely checking permissions as they are modified or guaranteeing
4547  *   their stability is left to the routine
4548  */
4549 int
4550 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4551     struct pwd **pwdp)
4552 {
4553         struct cache_fpl fpl;
4554         struct pwd *pwd;
4555         struct vnode *dvp;
4556         struct componentname *cnp;
4557         struct nameidata_saved orig;
4558         int error;
4559
4560         MPASS(ndp->ni_lcf == 0);
4561
4562         fpl.status = CACHE_FPL_STATUS_UNSET;
4563         fpl.ndp = ndp;
4564         fpl.cnp = &ndp->ni_cnd;
4565         MPASS(curthread == fpl.cnp->cn_thread);
4566
4567         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4568                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4569
4570         if (!cache_can_fplookup(&fpl)) {
4571                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4572                 *status = fpl.status;
4573                 return (EOPNOTSUPP);
4574         }
4575
4576         cache_fpl_checkpoint(&fpl, &orig);
4577
4578         cache_fpl_smr_enter_initial(&fpl);
4579         fpl.fsearch = false;
4580         pwd = pwd_get_smr();
4581         fpl.pwd = pwd;
4582         ndp->ni_rootdir = pwd->pwd_rdir;
4583         ndp->ni_topdir = pwd->pwd_jdir;
4584
4585         cnp = fpl.cnp;
4586         cnp->cn_nameptr = cnp->cn_pnbuf;
4587         if (cnp->cn_pnbuf[0] == '/') {
4588                 cache_fpl_handle_root(ndp, &dvp);
4589         } else {
4590                 if (ndp->ni_dirfd == AT_FDCWD) {
4591                         dvp = pwd->pwd_cdir;
4592                 } else {
4593                         error = cache_fplookup_dirfd(&fpl, &dvp);
4594                         if (__predict_false(error != 0)) {
4595                                 goto out;
4596                         }
4597                 }
4598         }
4599
4600         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4601
4602         error = cache_fplookup_impl(dvp, &fpl);
4603 out:
4604         cache_fpl_smr_assert_not_entered(&fpl);
4605         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4606
4607         *status = fpl.status;
4608         switch (fpl.status) {
4609         case CACHE_FPL_STATUS_UNSET:
4610                 __assert_unreachable();
4611                 break;
4612         case CACHE_FPL_STATUS_HANDLED:
4613                 SDT_PROBE3(vfs, namei, lookup, return, error,
4614                     (error == 0 ? ndp->ni_vp : NULL), true);
4615                 break;
4616         case CACHE_FPL_STATUS_PARTIAL:
4617                 *pwdp = fpl.pwd;
4618                 /*
4619                  * Status restored by cache_fplookup_partial_setup.
4620                  */
4621                 break;
4622         case CACHE_FPL_STATUS_ABORTED:
4623                 cache_fpl_restore(&fpl, &orig);
4624                 break;
4625         }
4626         return (error);
4627 }