sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
/* Root of the vfs.cache sysctl tree; the nodes declared further down hang off it. */
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache");
  84
/*
 * SDT probes of the "vfs" provider, "namecache" module.  The string arguments
 * name the C type of each probe argument.
 */
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
    "const char *");
SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
    "struct namecache *", "int", "int");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
    "char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
    "struct vnode *", "char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
    "struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
    "struct componentname *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
    "char *");

/*
 * Probe of the "fplookup" module, followed by the namei lookup probes.  The
 * latter are only declared here, so they are presumably defined in another
 * file -- confirm.
 */
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 125
/*
 * The structures below describe the elements in the cache of recent
 * names looked up by namei.
 */
/*
 * State kept for a negative entry.  It shares storage with the target vnode
 * pointer of struct namecache (the n_un union), which is why the assertion
 * below requires it to be no bigger than a pointer.
 */
struct negstate {
        u_char neg_flag;        /* NEG_* flags */
        u_char neg_hit;         /* presumably a hit counter -- confirm at the use sites */
};
_Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
    "the state must fit in a union with a pointer without growing it");
 136
/*
 * A single name cache entry.
 *
 * The name is stored inline right behind the fixed part (nc_name), so the
 * size of an allocation depends on the name length; see the
 * CACHE_ZONE_*_SIZE definitions.  NCP2NEGSTATE() asserts NCF_NEGATIVE in
 * nc_flag before handing out n_un.nu_neg, i.e. that flag tells which member
 * of the union is in use.
 */
struct  namecache {
        LIST_ENTRY(namecache) nc_src;   /* source vnode list */
        TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
        CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
        struct  vnode *nc_dvp;          /* vnode of parent of name */
        union {
                struct  vnode *nu_vp;   /* vnode the name refers to */
                struct  negstate nu_neg;/* negative entry state */
        } n_un;
        u_char  nc_flag;                /* flag bits */
        u_char  nc_nlen;                /* length of name */
        char    nc_name[0];             /* segment name + nul */
};
 150
/*
 * struct namecache_ts is used in place of struct namecache when time(s) need
 * to be stored.  The regular entry is embedded as the last member (nc_nc):
 * cache_alloc() hands out a pointer to that member and the containing
 * structure is recovered with __containerof() for entries which have NCF_TS
 * set.  The nc_dotdottime field is used when a cache entry is mapping
 * both a non-dotdot directory name plus dotdot for the directory's
 * parent.
 *
 * See below for alignment requirement.
 */
struct  namecache_ts {
        struct  timespec nc_time;       /* timespec provided by fs */
        struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
        int     nc_ticks;               /* ticks value when entry was added */
        struct namecache nc_nc;
};
 167
/*
 * At least mips n32 performs 64-bit accesses to timespec as found
 * in namecache_ts and requires them to be aligned.  Other platforms may
 * have the same requirement, so accept a little waste and enforce the
 * alignment for everyone.  Note this is a nop for 64-bit platforms.
 */
#define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
/* Longest name (not counting the nul) which is still served from the small zones. */
#define CACHE_PATH_CUTOFF       39

/* Item sizes of the four zones: fixed part + longest name + nul. */
#define CACHE_ZONE_SMALL_SIZE           (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_SMALL_TS_SIZE        (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1)
#define CACHE_ZONE_LARGE_SIZE           (sizeof(struct namecache) + NAME_MAX + 1)
#define CACHE_ZONE_LARGE_TS_SIZE        (sizeof(struct namecache_ts) + NAME_MAX + 1)

/*
 * Every item size has to be a multiple of the alignment.  The "+ 1" suggests
 * CACHE_ZONE_ALIGNMENT is a mask (alignment - 1) -- confirm against
 * UMA_ALIGNOF().
 */
_Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
_Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 186
/* Shorthands for the members of the n_un union in struct namecache. */
#define nc_vp           n_un.nu_vp
#define nc_neg          n_un.nu_neg

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE       0x01
#define NCF_ISDOTDOT    0x02
#define NCF_TS          0x04    /* entry is embedded in struct namecache_ts */
#define NCF_DTS         0x08
#define NCF_DVDROP      0x10    /* cache_free() calls vdrop() on nc_dvp */
#define NCF_NEGATIVE    0x20    /* n_un holds negative entry state */
#define NCF_INVALID     0x40    /* set by cache_ncp_invalidate() */
#define NCF_WIP         0x80    /* makes cache_ncp_canuse() fail, like NCF_INVALID */

/*
 * Flags in negstate.neg_flag
 */
#define NEG_HOT         0x01
 206
 207 /*
 208  * Mark an entry as invalid.
 209  *
 210  * This is called before it starts getting deconstructed.
 211  */
 212 static void
 213 cache_ncp_invalidate(struct namecache *ncp)
 214 {
 215
 216         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 217             ("%s: entry %p already invalid", __func__, ncp));
 218         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 219         atomic_thread_fence_rel();
 220 }
 221
 222 /*
 223  * Check whether the entry can be safely used.
 224  *
 225  * All places which elide locks are supposed to call this after they are
 226  * done with reading from an entry.
 227  */
 228 static bool
 229 cache_ncp_canuse(struct namecache *ncp)
 230 {
 231
 232         atomic_thread_fence_acq();
 233         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 234 }
 235
 236 /*
 237  * Name caching works as follows:
 238  *
 239  * Names found by directory scans are retained in a cache
 240  * for future reference.  It is managed LRU, so frequently
 241  * used names will hang around.  Cache is indexed by hash value
 242  * obtained from (dvp, name) where dvp refers to the directory
 243  * containing name.
 244  *
 245  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 246  * exist) the vnode pointer will be NULL.
 247  *
 248  * Upon reaching the last segment of a path, if the reference
 249  * is for DELETE, or NOCACHE is set (rewrite), and the
 250  * name is located in the cache, it will be dropped.
 251  *
 252  * These locks are used (in the order in which they can be taken):
 253  * NAME         TYPE    ROLE
 254  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 255  * bucketlock   mtx     for access to given set of hash buckets
 256  * neglist      mtx     negative entry LRU management
 257  *
 258  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 259  * order is lower address first. Both are recursive.
 260  *
 261  * "." lookups are lockless.
 262  *
 263  * ".." and vnode -> name lookups require vnodelock.
 264  *
 265  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 266  *
 267  * Insertions and removals of entries require involved vnodes and bucketlocks
 268  * to be locked to provide safe operation against other threads modifying the
 269  * cache.
 270  *
 271  * Some lookups result in removal of the found entry (e.g. getting rid of a
 272  * negative entry with the intent to create a positive one), which poses a
 273  * problem when multiple threads reach the state. Similarly, two different
 274  * threads can purge two different vnodes and try to remove the same name.
 275  *
 276  * If the already held vnode lock is lower than the second required lock, we
 277  * can just take the other lock. However, in the opposite case, this could
 278  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 279  * the first node, locking everything in order and revalidating the state.
 280  */
 281
/*
 * NOTE(review): presumably declares the SMR state used together with the
 * uma_*_smr() calls in this file -- confirm against the macro definition.
 */
VFS_SMR_DECLARE;

/* Tunables and derived values, exported under vfs.cache.param. */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache parameters");

static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
    "Total namecache capacity");

u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
    "Size factor for namecache");

static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

/*
 * Negative entry % of namecache capacity above which automatic eviction is allowed.
 *
 * Check cache_neg_evict_cond for details.  The value is changed through
 * sysctl_negminpct().
 */
static u_int ncnegminpct = 3;

/* Kept up to date by cache_recalc_neg_min(); read-only for userland. */
static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
    "Negative entry count above which automatic eviction is allowed");
 309
/*
 * Structures associated with name caching.
 */
/* Map a hash value to its chain head; nchash is applied as a mask. */
#define NCHHASH(hash) \
        (&nchashtbl[(hash) & nchash])
static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
static u_long __read_mostly     nchash;                 /* hash table mask: highest index, not the count */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");
static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */

/* Exported by sysctl_nchstats(), which overrides some fields with counter values. */
struct nchstats nchstats;               /* cache effectiveness statistics */

/* Runtime knob (vfs.cache_fast_revlookup); its consumers are outside this part of the file. */
static bool __read_frequently cache_fast_revlookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
    &cache_fast_revlookup, 0, "");

/* NOTE(review): not used in this part of the file; purpose to be confirmed at the use sites. */
static u_int __exclusive_cache_line neg_cycle;
/* Mask used by NCP2NEGLIST() and the resulting number of negative entry lists. */
#define ncneghash       3
#define numneglists     (ncneghash + 1)

/*
 * One set of negative entry lists.  Entries are distributed over the sets
 * based on their address, see NCP2NEGLIST().
 */
struct neglist {
        struct mtx              nl_evict_lock;
        struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
        TAILQ_HEAD(, namecache) nl_list;        /* presumably the cold entries -- confirm */
        TAILQ_HEAD(, namecache) nl_hotlist;     /* presumably the hot entries -- confirm */
        u_long                  nl_hotnum;      /* presumably the length of nl_hotlist -- confirm */
} __aligned(CACHE_LINE_SIZE);

static struct neglist neglists[numneglists];
 342
 343 static inline struct neglist *
 344 NCP2NEGLIST(struct namecache *ncp)
 345 {
 346
 347         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 348 }
 349
 350 static inline struct negstate *
 351 NCP2NEGSTATE(struct namecache *ncp)
 352 {
 353
 354         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 355         return (&ncp->nc_neg);
 356 }
 357
/*
 * Bucket locks.  A hash value is mapped to its lock by masking it with
 * ncbuckethash, hence there are ncbuckethash + 1 locks.
 */
#define numbucketlocks (ncbuckethash + 1)
static u_int __read_mostly  ncbuckethash;
static struct mtx_padalign __read_mostly  *bucketlocks;
#define HASH2BUCKETLOCK(hash) \
        ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))

/* Vnode locks; a vnode is mapped to its lock by VP2VNODELOCK(). */
#define numvnodelocks (ncvnodehash + 1)
static u_int __read_mostly  ncvnodehash;
static struct mtx __read_mostly *vnodelocks;
 367 static inline struct mtx *
 368 VP2VNODELOCK(struct vnode *vp)
 369 {
 370
 371         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 372 }
 373
/*
 * UMA zones for the VFS cache.
 *
 * The small cache is used for entries with short names, which are the
 * most common.  The large cache is used for entries which are too big to
 * fit in the small cache.  The cutoff is CACHE_PATH_CUTOFF.
 *
 * Each size has a _ts variant, which cache_alloc() uses when timestamps are
 * requested; those zones hold struct namecache_ts.
 */
static uma_zone_t __read_mostly cache_zone_small;
static uma_zone_t __read_mostly cache_zone_small_ts;
static uma_zone_t __read_mostly cache_zone_large;
static uma_zone_t __read_mostly cache_zone_large_ts;
 385
 386 static struct namecache *
 387 cache_alloc(int len, int ts)
 388 {
 389         struct namecache_ts *ncp_ts;
 390         struct namecache *ncp;
 391
 392         if (__predict_false(ts)) {
 393                 if (len <= CACHE_PATH_CUTOFF)
 394                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 395                 else
 396                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 397                 ncp = &ncp_ts->nc_nc;
 398         } else {
 399                 if (len <= CACHE_PATH_CUTOFF)
 400                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 401                 else
 402                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 403         }
 404         return (ncp);
 405 }
 406
 407 static void
 408 cache_free(struct namecache *ncp)
 409 {
 410         struct namecache_ts *ncp_ts;
 411
 412         MPASS(ncp != NULL);
 413         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 414                 vdrop(ncp->nc_dvp);
 415         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 416                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 417                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 418                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 419                 else
 420                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 421         } else {
 422                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 423                         uma_zfree_smr(cache_zone_small, ncp);
 424                 else
 425                         uma_zfree_smr(cache_zone_large, ncp);
 426         }
 427 }
 428
 429 static void
 430 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 431 {
 432         struct namecache_ts *ncp_ts;
 433
 434         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 435             (tsp == NULL && ticksp == NULL),
 436             ("No NCF_TS"));
 437
 438         if (tsp == NULL)
 439                 return;
 440
 441         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 442         *tsp = ncp_ts->nc_time;
 443         *ticksp = ncp_ts->nc_ticks;
 444 }
 445
/* Debugging knob, only compiled in with DEBUG_CACHE (debug.vfscache). */
#ifdef DEBUG_CACHE
static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
    "VFS namecache enabled");
#endif

/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
    sizeof(struct namecache), "sizeof(struct namecache)");
 455
/*
 * The new name cache statistics
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache statistics");

/*
 * Helpers declaring read-only nodes under vfs.cache.stats.  STATNODE_ULONG
 * exports an existing variable, STATNODE_COUNTER also defines the counter.
 */
#define STATNODE_ULONG(name, varname, descr)                                    \
        SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define STATNODE_COUNTER(name, varname, descr)                                  \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
STATNODE_ULONG(count, numcache, "Number of cache entries");
STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
/*
 * NOTE(review): the node names "dotdothis" and "posszaps" below do not match
 * their variables (dotdothits, numposzaps) and look like typos.  They are
 * visible to userland, so renaming them is an interface change.
 */
STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
STATNODE_COUNTER(posszaps, numposzaps,
    "Number of cache hits (positive) we do not want to cache");
STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
STATNODE_COUNTER(negzaps, numnegzaps,
    "Number of cache hits (negative) we do not want to cache");
STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
/* These count for vn_getcwd(), too. */
STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
    "Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 489
/*
 * Debug or developer statistics.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache debugging");
/* Same helpers as for vfs.cache.stats, but rooted at vfs.cache.debug. */
#define DEBUGNODE_ULONG(name, varname, descr)                                   \
        SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
#define DEBUGNODE_COUNTER(name, varname, descr)                                 \
        static COUNTER_U64_DEFINE_EARLY(varname);                               \
        SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
            descr);
DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
    "Number of successful removals after relocking");
/*
 * NOTE(review): the variables below are declared long but exported with
 * SYSCTL_ULONG.
 */
static long zap_bucket_fail;
DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
static long zap_bucket_fail2;
DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
static long cache_lock_vnodes_cel_3_failures;
DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
    "Number of times 3-way vnode locking failed");
 510
/* Forward declarations of routines defined later in the file. */
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
    char **freebuf, size_t *buflen);
static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen, size_t addend);
static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *buflen);
static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
    char **retbuf, size_t *len, size_t addend);

/* malloc(9) type "vfscache". */
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 522
 523 static inline void
 524 cache_assert_vlp_locked(struct mtx *vlp)
 525 {
 526
 527         if (vlp != NULL)
 528                 mtx_assert(vlp, MA_OWNED);
 529 }
 530
 531 static inline void
 532 cache_assert_vnode_locked(struct vnode *vp)
 533 {
 534         struct mtx *vlp;
 535
 536         vlp = VP2VNODELOCK(vp);
 537         cache_assert_vlp_locked(vlp);
 538 }
 539
 540 /*
 541  * TODO: With the value stored we can do better than computing the hash based
 542  * on the address. The choice of FNV should also be revisited.
 543  */
 544 static void
 545 cache_prehash(struct vnode *vp)
 546 {
 547
 548         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 549 }
 550
 551 static uint32_t
 552 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 553 {
 554
 555         return (fnv_32_buf(name, len, dvp->v_nchash));
 556 }
 557
 558 static inline struct nchashhead *
 559 NCP2BUCKET(struct namecache *ncp)
 560 {
 561         uint32_t hash;
 562
 563         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 564         return (NCHHASH(hash));
 565 }
 566
 567 static inline struct mtx *
 568 NCP2BUCKETLOCK(struct namecache *ncp)
 569 {
 570         uint32_t hash;
 571
 572         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 573         return (HASH2BUCKETLOCK(hash));
 574 }
 575
 576 #ifdef INVARIANTS
 577 static void
 578 cache_assert_bucket_locked(struct namecache *ncp)
 579 {
 580         struct mtx *blp;
 581
 582         blp = NCP2BUCKETLOCK(ncp);
 583         mtx_assert(blp, MA_OWNED);
 584 }
 585
 586 static void
 587 cache_assert_bucket_unlocked(struct namecache *ncp)
 588 {
 589         struct mtx *blp;
 590
 591         blp = NCP2BUCKETLOCK(ncp);
 592         mtx_assert(blp, MA_NOTOWNED);
 593 }
 594 #else
 595 #define cache_assert_bucket_locked(x) do { } while (0)
 596 #define cache_assert_bucket_unlocked(x) do { } while (0)
 597 #endif
 598
 599 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 600 static void
 601 _cache_sort_vnodes(void **p1, void **p2)
 602 {
 603         void *tmp;
 604
 605         MPASS(*p1 != NULL || *p2 != NULL);
 606
 607         if (*p1 > *p2) {
 608                 tmp = *p2;
 609                 *p2 = *p1;
 610                 *p1 = tmp;
 611         }
 612 }
 613
 614 static void
 615 cache_lock_all_buckets(void)
 616 {
 617         u_int i;
 618
 619         for (i = 0; i < numbucketlocks; i++)
 620                 mtx_lock(&bucketlocks[i]);
 621 }
 622
 623 static void
 624 cache_unlock_all_buckets(void)
 625 {
 626         u_int i;
 627
 628         for (i = 0; i < numbucketlocks; i++)
 629                 mtx_unlock(&bucketlocks[i]);
 630 }
 631
 632 static void
 633 cache_lock_all_vnodes(void)
 634 {
 635         u_int i;
 636
 637         for (i = 0; i < numvnodelocks; i++)
 638                 mtx_lock(&vnodelocks[i]);
 639 }
 640
 641 static void
 642 cache_unlock_all_vnodes(void)
 643 {
 644         u_int i;
 645
 646         for (i = 0; i < numvnodelocks; i++)
 647                 mtx_unlock(&vnodelocks[i]);
 648 }
 649
 650 static int
 651 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 652 {
 653
 654         cache_sort_vnodes(&vlp1, &vlp2);
 655
 656         if (vlp1 != NULL) {
 657                 if (!mtx_trylock(vlp1))
 658                         return (EAGAIN);
 659         }
 660         if (!mtx_trylock(vlp2)) {
 661                 if (vlp1 != NULL)
 662                         mtx_unlock(vlp1);
 663                 return (EAGAIN);
 664         }
 665
 666         return (0);
 667 }
 668
 669 static void
 670 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 671 {
 672
 673         MPASS(vlp1 != NULL || vlp2 != NULL);
 674         MPASS(vlp1 <= vlp2);
 675
 676         if (vlp1 != NULL)
 677                 mtx_lock(vlp1);
 678         if (vlp2 != NULL)
 679                 mtx_lock(vlp2);
 680 }
 681
 682 static void
 683 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 684 {
 685
 686         MPASS(vlp1 != NULL || vlp2 != NULL);
 687
 688         if (vlp1 != NULL)
 689                 mtx_unlock(vlp1);
 690         if (vlp2 != NULL)
 691                 mtx_unlock(vlp2);
 692 }
 693
 694 static int
 695 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 696 {
 697         struct nchstats snap;
 698
 699         if (req->oldptr == NULL)
 700                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 701
 702         snap = nchstats;
 703         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 704         snap.ncs_neghits = counter_u64_fetch(numneghits);
 705         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 706             counter_u64_fetch(numnegzaps);
 707         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 708             counter_u64_fetch(nummiss);
 709
 710         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 711 }
 712 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 713     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 714     "VFS cache effectiveness statistics");
 715
 716 static void
 717 cache_recalc_neg_min(u_int val)
 718 {
 719
 720         neg_min = (ncsize * val) / 100;
 721 }
 722
 723 static int
 724 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 725 {
 726         u_int val;
 727         int error;
 728
 729         val = ncnegminpct;
 730         error = sysctl_handle_int(oidp, &val, 0, req);
 731         if (error != 0 || req->newptr == NULL)
 732                 return (error);
 733
 734         if (val == ncnegminpct)
 735                 return (0);
 736         if (val < 0 || val > 99)
 737                 return (EINVAL);
 738         ncnegminpct = val;
 739         cache_recalc_neg_min(val);
 740         return (0);
 741 }
 742
 743 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 744     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 745     "I", "Negative entry \% of namecahe capacity above which automatic eviction is allowed");
 746
 747 #ifdef DIAGNOSTIC
/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 *
 * The handlers below are attached to the debug.hashstat node declared here.
 */
static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "hash table stats");
 754
 755 static int
 756 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 757 {
 758         struct nchashhead *ncpp;
 759         struct namecache *ncp;
 760         int i, error, n_nchash, *cntbuf;
 761
 762 retry:
 763         n_nchash = nchash + 1;  /* nchash is max index, not count */
 764         if (req->oldptr == NULL)
 765                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 766         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 767         cache_lock_all_buckets();
 768         if (n_nchash != nchash + 1) {
 769                 cache_unlock_all_buckets();
 770                 free(cntbuf, M_TEMP);
 771                 goto retry;
 772         }
 773         /* Scan hash tables counting entries */
 774         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 775                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 776                         cntbuf[i]++;
 777         cache_unlock_all_buckets();
 778         for (error = 0, i = 0; i < n_nchash; i++)
 779                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 780                         break;
 781         free(cntbuf, M_TEMP);
 782         return (error);
 783 }
 784 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 785     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 786     "nchash chain lengths");
 787
 788 static int
 789 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 790 {
 791         int error;
 792         struct nchashhead *ncpp;
 793         struct namecache *ncp;
 794         int n_nchash;
 795         int count, maxlength, used, pct;
 796
 797         if (!req->oldptr)
 798                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 799
 800         cache_lock_all_buckets();
 801         n_nchash = nchash + 1;  /* nchash is max index, not count */
 802         used = 0;
 803         maxlength = 0;
 804
 805         /* Scan hash tables for applicable entries */
 806         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 807                 count = 0;
 808                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 809                         count++;
 810                 }
 811                 if (count)
 812                         used++;
 813                 if (maxlength < count)
 814                         maxlength = count;
 815         }
 816         n_nchash = nchash + 1;
 817         cache_unlock_all_buckets();
 818         pct = (used * 100) / (n_nchash / 100);
 819         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 820         if (error)
 821                 return (error);
 822         error = SYSCTL_OUT(req, &used, sizeof(used));
 823         if (error)
 824                 return (error);
 825         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 826         if (error)
 827                 return (error);
 828         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 829         if (error)
 830                 return (error);
 831         return (0);
 832 }
 833 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 834     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 835     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 836 #endif
 837
 838 /*
 839  * Negative entries management
 840  *
 841  * Various workloads create plenty of negative entries and barely use them
 842  * afterwards. Moreover malicious users can keep performing bogus lookups
 843  * adding even more entries. For example "make tinderbox" as of writing this
 844  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 845  * negative.
 846  *
 847  * As such, a rather aggressive eviction method is needed. The currently
 848  * employed method is a placeholder.
 849  *
 850  * Entries are split over numneglists separate lists, each of which is further
 851  * split into hot and cold entries. Entries get promoted after getting a hit.
 852  * Eviction happens on addition of new entry.
 853  */
     /* Sysctl node "neg" below _vfs_cache; parent of the statistics that follow. */
 854 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 855     "Name cache negative entry statistics");
 856
     /* numneg is adjusted by cache_neg_insert() and cache_neg_remove(). */
 857 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 858     "Number of negative cache entries");
 859
     /* Bumped by cache_neg_init() for every entry set up as negative. */
 860 static COUNTER_U64_DEFINE_EARLY(neg_created);
 861 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 862     "Number of created negative entries");
 863
     /* Bumped by cache_neg_evict() once it zapped the selected entry. */
 864 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 865 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 866     "Number of evicted negative entries");
 867
     /* Bumped by cache_neg_evict() when the selected list had no entry to offer. */
 868 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 869 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 870     &neg_evict_skipped_empty,
 871     "Number of times evicting failed due to lack of entries");
 872
     /*
      * Bumped by cache_neg_evict() when the selected entry could no longer be
      * found in its hash chain after the locks were re-taken.
      */
 873 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 874 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 875     &neg_evict_skipped_missed,
 876     "Number of times evicting failed due to target entry disappearing");
 877
     /* Bumped by cache_neg_evict_select_list() when the evict lock trylock fails. */
 878 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 879 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 880     &neg_evict_skipped_contended,
 881     "Number of times evicting failed due to contention");
 882
     /* numneghits is bumped by cache_neg_hit_finish(). */
 883 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 884     "Number of cache hits (negative)");
 885
 886 static int
 887 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 888 {
 889         int i, out;
 890
 891         out = 0;
 892         for (i = 0; i < numneglists; i++)
 893                 out += neglists[i].nl_hotnum;
 894
 895         return (SYSCTL_OUT(req, &out, sizeof(out)));
 896 }
 897 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 898     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 899     "Number of hot negative entries");
 900
 901 static void
 902 cache_neg_init(struct namecache *ncp)
 903 {
 904         struct negstate *ns;
 905
 906         ncp->nc_flag |= NCF_NEGATIVE;
 907         ns = NCP2NEGSTATE(ncp);
 908         ns->neg_flag = 0;
 909         ns->neg_hit = 0;
 910         counter_u64_add(neg_created, 1);
 911 }
 912
 913 #define CACHE_NEG_PROMOTION_THRESH 2
 914
 915 static bool
 916 cache_neg_hit_prep(struct namecache *ncp)
 917 {
 918         struct negstate *ns;
 919         u_char n;
 920
 921         ns = NCP2NEGSTATE(ncp);
 922         n = atomic_load_char(&ns->neg_hit);
 923         for (;;) {
 924                 if (n >= CACHE_NEG_PROMOTION_THRESH)
 925                         return (false);
 926                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 927                         break;
 928         }
 929         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 930 }
 931
 932 /*
 933  * Expands to an empty statement. It is provided for completeness, as
 934  * some cache_neg_hit_prep callers may end up returning without even
 935  * trying to promote.
 936  */
 937 #define cache_neg_hit_abort(ncp)        do { } while (0)
 938
 939 static void
 940 cache_neg_hit_finish(struct namecache *ncp)
 941 {
 942
 943         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 944         counter_u64_add(numneghits, 1);
 945 }
 946
 947 /*
 948  * Move a negative entry to the hot list.
 949  */
 950 static void
 951 cache_neg_promote_locked(struct namecache *ncp)
 952 {
 953         struct neglist *nl;
 954         struct negstate *ns;
 955
 956         ns = NCP2NEGSTATE(ncp);
 957         nl = NCP2NEGLIST(ncp);
 958         mtx_assert(&nl->nl_lock, MA_OWNED);
 959         if ((ns->neg_flag & NEG_HOT) == 0) {
 960                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 961                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 962                 nl->nl_hotnum++;
 963                 ns->neg_flag |= NEG_HOT;
 964         }
 965 }
 966
 967 /*
 968  * Move a hot negative entry to the cold list.
 969  */
 970 static void
 971 cache_neg_demote_locked(struct namecache *ncp)
 972 {
 973         struct neglist *nl;
 974         struct negstate *ns;
 975
 976         ns = NCP2NEGSTATE(ncp);
 977         nl = NCP2NEGLIST(ncp);
 978         mtx_assert(&nl->nl_lock, MA_OWNED);
 979         MPASS(ns->neg_flag & NEG_HOT);
 980         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 981         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 982         nl->nl_hotnum--;
 983         ns->neg_flag &= ~NEG_HOT;
 984         atomic_store_char(&ns->neg_hit, 0);
 985 }
 986
 987 /*
 988  * Move a negative entry to the hot list if it matches the lookup.
 989  *
 990  * We have to take locks, but they may be contended and in the worst
 991  * case we may need to go off CPU. We don't want to spin within the
 992  * smr section and we can't block with it. Exiting the section means
 993  * the found entry could have been evicted. We are going to look it
 994  * up again.
 995  */
 996 static bool
 997 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
 998     struct namecache *oncp, uint32_t hash)
 999 {
1000         struct namecache *ncp;
1001         struct neglist *nl;
1002         u_char nc_flag;
1003
1004         nl = NCP2NEGLIST(oncp);
1005
1006         mtx_lock(&nl->nl_lock);
1007         /*
1008          * For hash iteration.
1009          */
1010         vfs_smr_enter();
1011
1012         /*
1013          * Avoid all surprises by only succeeding if we got the same entry and
1014          * bailing completely otherwise.
1015          * XXX There are no provisions to keep the vnode around, meaning we may
1016          * end up promoting a negative entry for a *new* vnode and returning
1017          * ENOENT on its account. This is the error we want to return anyway
1018          * and promotion is harmless.
1019          *
1020          * In particular at this point there can be a new ncp which matches the
1021          * search but hashes to a different neglist.
1022          */
1023         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1024                 if (ncp == oncp)
1025                         break;
1026         }
1027
1028         /*
1029          * No match to begin with.
1030          */
1031         if (__predict_false(ncp == NULL)) {
1032                 goto out_abort;
1033         }
1034
1035         /*
1036          * The newly found entry may be something different...
1037          */
1038         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1039             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1040                 goto out_abort;
1041         }
1042
1043         /*
1044          * ... and not even negative.
1045          */
1046         nc_flag = atomic_load_char(&ncp->nc_flag);
1047         if ((nc_flag & NCF_NEGATIVE) == 0) {
1048                 goto out_abort;
1049         }
1050
1051         if (__predict_false(!cache_ncp_canuse(ncp))) {
1052                 goto out_abort;
1053         }
1054
1055         cache_neg_promote_locked(ncp);
1056         cache_neg_hit_finish(ncp);
1057         vfs_smr_exit();
1058         mtx_unlock(&nl->nl_lock);
1059         return (true);
1060 out_abort:
1061         vfs_smr_exit();
1062         mtx_unlock(&nl->nl_lock);
1063         return (false);
1064 }
1065
1066 static void
1067 cache_neg_promote(struct namecache *ncp)
1068 {
1069         struct neglist *nl;
1070
1071         nl = NCP2NEGLIST(ncp);
1072         mtx_lock(&nl->nl_lock);
1073         cache_neg_promote_locked(ncp);
1074         mtx_unlock(&nl->nl_lock);
1075 }
1076
1077 static void
1078 cache_neg_insert(struct namecache *ncp)
1079 {
1080         struct neglist *nl;
1081
1082         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1083         cache_assert_bucket_locked(ncp);
1084         nl = NCP2NEGLIST(ncp);
1085         mtx_lock(&nl->nl_lock);
1086         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1087         mtx_unlock(&nl->nl_lock);
1088         atomic_add_long(&numneg, 1);
1089 }
1090
1091 static void
1092 cache_neg_remove(struct namecache *ncp)
1093 {
1094         struct neglist *nl;
1095         struct negstate *ns;
1096
1097         cache_assert_bucket_locked(ncp);
1098         nl = NCP2NEGLIST(ncp);
1099         ns = NCP2NEGSTATE(ncp);
1100         mtx_lock(&nl->nl_lock);
1101         if ((ns->neg_flag & NEG_HOT) != 0) {
1102                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1103                 nl->nl_hotnum--;
1104         } else {
1105                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1106         }
1107         mtx_unlock(&nl->nl_lock);
1108         atomic_subtract_long(&numneg, 1);
1109 }
1110
1111 static struct neglist *
1112 cache_neg_evict_select_list(void)
1113 {
1114         struct neglist *nl;
1115         u_int c;
1116
1117         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1118         nl = &neglists[c % numneglists];
1119         if (!mtx_trylock(&nl->nl_evict_lock)) {
1120                 counter_u64_add(neg_evict_skipped_contended, 1);
1121                 return (NULL);
1122         }
1123         return (nl);
1124 }
1125
1126 static struct namecache *
1127 cache_neg_evict_select_entry(struct neglist *nl)
1128 {
1129         struct namecache *ncp, *lncp;
1130         struct negstate *ns, *lns;
1131         int i;
1132
1133         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1134         mtx_assert(&nl->nl_lock, MA_OWNED);
1135         ncp = TAILQ_FIRST(&nl->nl_list);
1136         if (ncp == NULL)
1137                 return (NULL);
1138         lncp = ncp;
1139         lns = NCP2NEGSTATE(lncp);
1140         for (i = 1; i < 4; i++) {
1141                 ncp = TAILQ_NEXT(ncp, nc_dst);
1142                 if (ncp == NULL)
1143                         break;
1144                 ns = NCP2NEGSTATE(ncp);
1145                 if (ns->neg_hit < lns->neg_hit) {
1146                         lncp = ncp;
1147                         lns = ns;
1148                 }
1149         }
1150         return (lncp);
1151 }
1152
1153 static bool
1154 cache_neg_evict(void)
1155 {
1156         struct namecache *ncp, *ncp2;
1157         struct neglist *nl;
1158         struct negstate *ns;
1159         struct vnode *dvp;
1160         struct mtx *dvlp;
1161         struct mtx *blp;
1162         uint32_t hash;
1163         u_char nlen;
1164         bool evicted;
1165
1166         nl = cache_neg_evict_select_list();
1167         if (nl == NULL) {
1168                 return (false);
1169         }
1170
1171         mtx_lock(&nl->nl_lock);
1172         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1173         if (ncp != NULL) {
1174                 cache_neg_demote_locked(ncp);
1175         }
1176         ncp = cache_neg_evict_select_entry(nl);
1177         if (ncp == NULL) {
1178                 counter_u64_add(neg_evict_skipped_empty, 1);
1179                 mtx_unlock(&nl->nl_lock);
1180                 mtx_unlock(&nl->nl_evict_lock);
1181                 return (false);
1182         }
1183         ns = NCP2NEGSTATE(ncp);
1184         nlen = ncp->nc_nlen;
1185         dvp = ncp->nc_dvp;
1186         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1187         dvlp = VP2VNODELOCK(dvp);
1188         blp = HASH2BUCKETLOCK(hash);
1189         mtx_unlock(&nl->nl_lock);
1190         mtx_unlock(&nl->nl_evict_lock);
1191         mtx_lock(dvlp);
1192         mtx_lock(blp);
1193         /*
1194          * Note that since all locks were dropped above, the entry may be
1195          * gone or reallocated to be something else.
1196          */
1197         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1198                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1199                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1200                         break;
1201         }
1202         if (ncp2 == NULL) {
1203                 counter_u64_add(neg_evict_skipped_missed, 1);
1204                 ncp = NULL;
1205                 evicted = false;
1206         } else {
1207                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1208                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1209                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1210                     ncp->nc_name);
1211                 cache_zap_locked(ncp);
1212                 counter_u64_add(neg_evicted, 1);
1213                 evicted = true;
1214         }
1215         mtx_unlock(blp);
1216         mtx_unlock(dvlp);
1217         if (ncp != NULL)
1218                 cache_free(ncp);
1219         return (evicted);
1220 }
1221
1222 /*
1223  * Maybe evict a negative entry to create more room.
1224  *
1225  * The ncnegfactor parameter limits what fraction of the total count
1226  * can comprise of negative entries. However, if the cache is just
1227  * warming up this leads to excessive evictions.  As such, ncnegminpct
1228  * (recomputed to neg_min) dictates whether the above should be
1229  * applied.
1230  *
1231  * Try evicting if the cache is close to full capacity regardless of
1232  * other considerations.
1233  */
1234 static bool
1235 cache_neg_evict_cond(u_long lnumcache)
1236 {
1237         u_long lnumneg;
1238
1239         if (ncsize - 1000 < lnumcache)
1240                 goto out_evict;
1241         lnumneg = atomic_load_long(&numneg);
1242         if (lnumneg < neg_min)
1243                 return (false);
1244         if (lnumneg * ncnegfactor < lnumcache)
1245                 return (false);
1246 out_evict:
1247         return (cache_neg_evict());
1248 }
1249
1250 /*
1251  * cache_zap_locked():
1252  *
1253  *   Removes a namecache entry from cache, whether it contains an actual
1254  *   pointer to a vnode or if it is just a negative cache entry.
1255  */
1256 static void
1257 cache_zap_locked(struct namecache *ncp)
1258 {
1259         struct nchashhead *ncpp;
1260
1261         if (!(ncp->nc_flag & NCF_NEGATIVE))
1262                 cache_assert_vnode_locked(ncp->nc_vp);
1263         cache_assert_vnode_locked(ncp->nc_dvp);
1264         cache_assert_bucket_locked(ncp);
1265
1266         cache_ncp_invalidate(ncp);
1267
1268         ncpp = NCP2BUCKET(ncp);
1269         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1270         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1271                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1272                     ncp->nc_name, ncp->nc_vp);
1273                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1274                 if (ncp == ncp->nc_vp->v_cache_dd) {
1275                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1276                         ncp->nc_vp->v_cache_dd = NULL;
1277                         vn_seqc_write_end(ncp->nc_vp);
1278                 }
1279         } else {
1280                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1281                     ncp->nc_name);
1282                 cache_neg_remove(ncp);
1283         }
1284         if (ncp->nc_flag & NCF_ISDOTDOT) {
1285                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1286                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1287                         ncp->nc_dvp->v_cache_dd = NULL;
1288                         vn_seqc_write_end(ncp->nc_dvp);
1289                 }
1290         } else {
1291                 LIST_REMOVE(ncp, nc_src);
1292                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1293                         ncp->nc_flag |= NCF_DVDROP;
1294                         counter_u64_add(numcachehv, -1);
1295                 }
1296         }
1297         atomic_subtract_long(&numcache, 1);
1298 }
1299
1300 static void
1301 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1302 {
1303         struct mtx *blp;
1304
1305         MPASS(ncp->nc_dvp == vp);
1306         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1307         cache_assert_vnode_locked(vp);
1308
1309         blp = NCP2BUCKETLOCK(ncp);
1310         mtx_lock(blp);
1311         cache_zap_locked(ncp);
1312         mtx_unlock(blp);
1313 }
1314
     /*
      * Zap ncp on behalf of a caller which already holds the vnode lock of vp.
      *
      * vp has to be either the directory or the target vnode of the entry
      * (asserted below). *vlpp is either NULL or one more mutex held by the
      * caller; it gets unlocked or reused here as needed.
      *
      * Returns true once the entry got zapped, with *vlpp set to NULL. Returns
      * false if the missing vnode lock could not be trylocked: nothing was
      * zapped, both vnode locks of the entry were taken with blocking acquires
      * instead and *vlpp names the first of them. The caller visible in this
      * file (cache_remove_cnp) then re-reads the entry and tries again.
      */
1315 static bool
1316 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1317     struct mtx **vlpp)
1318 {
1319         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1320         struct mtx *blp;
1321
1322         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1323         cache_assert_vnode_locked(vp);
1324
             /*
              * A negative entry is zapped by the helper with just the lock
              * of vp held; the extra lock, if any, is not needed.
              */
1325         if (ncp->nc_flag & NCF_NEGATIVE) {
1326                 if (*vlpp != NULL) {
1327                         mtx_unlock(*vlpp);
1328                         *vlpp = NULL;
1329                 }
1330                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1331                 return (true);
1332         }
1333
1334         pvlp = VP2VNODELOCK(vp);
1335         blp = NCP2BUCKETLOCK(ncp);
1336         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1337         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1338
1339         if (*vlpp == vlp1 || *vlpp == vlp2) {
                     /*
                      * The extra lock is one of the two vnode locks of the
                      * entry: keep it for the zap and release it afterwards.
                      */
1340                 to_unlock = *vlpp;
1341                 *vlpp = NULL;
1342         } else {
1343                 if (*vlpp != NULL) {
1344                         mtx_unlock(*vlpp);
1345                         *vlpp = NULL;
1346                 }
                     /*
                      * cache_sort_vnodes presumably puts the two locks into
                      * locking order (its definition is not visible here).
                      * If the lock already held comes first, the other one
                      * is taken with a blocking acquire; otherwise only a
                      * trylock of the first one is attempted.
                      */
1347                 cache_sort_vnodes(&vlp1, &vlp2);
1348                 if (vlp1 == pvlp) {
1349                         mtx_lock(vlp2);
1350                         to_unlock = vlp2;
1351                 } else {
1352                         if (!mtx_trylock(vlp1))
1353                                 goto out_relock;
1354                         to_unlock = vlp1;
1355                 }
1356         }
1357         mtx_lock(blp);
1358         cache_zap_locked(ncp);
1359         mtx_unlock(blp);
1360         if (to_unlock != NULL)
1361                 mtx_unlock(to_unlock);
1362         return (true);
1363
1364 out_relock:
             /*
              * Only reached when vlp1 is not the lock of vp, which makes
              * vlp2 the lock held on entry. Drop it, then take both locks
              * in sorted order and hand vlp1 to the caller via *vlpp.
              */
1365         mtx_unlock(vlp2);
1366         mtx_lock(vlp1);
1367         mtx_lock(vlp2);
1368         MPASS(*vlpp == NULL);
1369         *vlpp = vlp1;
1370         return (false);
1371 }
1372
1373 /*
1374  * If trylocking failed we can get here. We know enough to take all needed locks
1375  * in the right order and re-lookup the entry.
1376  */
1377 static int
1378 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1379     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1380     struct mtx *blp)
1381 {
1382         struct namecache *rncp;
1383
1384         cache_assert_bucket_unlocked(ncp);
1385
1386         cache_sort_vnodes(&dvlp, &vlp);
1387         cache_lock_vnodes(dvlp, vlp);
1388         mtx_lock(blp);
1389         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1390                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1391                     rncp->nc_nlen == cnp->cn_namelen &&
1392                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1393                         break;
1394         }
1395         if (rncp != NULL) {
1396                 cache_zap_locked(rncp);
1397                 mtx_unlock(blp);
1398                 cache_unlock_vnodes(dvlp, vlp);
1399                 counter_u64_add(zap_bucket_relock_success, 1);
1400                 return (0);
1401         }
1402
1403         mtx_unlock(blp);
1404         cache_unlock_vnodes(dvlp, vlp);
1405         return (EAGAIN);
1406 }
1407
1408 static int __noinline
1409 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1410     uint32_t hash, struct mtx *blp)
1411 {
1412         struct mtx *dvlp, *vlp;
1413         struct vnode *dvp;
1414
1415         cache_assert_bucket_locked(ncp);
1416
1417         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1418         vlp = NULL;
1419         if (!(ncp->nc_flag & NCF_NEGATIVE))
1420                 vlp = VP2VNODELOCK(ncp->nc_vp);
1421         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1422                 cache_zap_locked(ncp);
1423                 mtx_unlock(blp);
1424                 cache_unlock_vnodes(dvlp, vlp);
1425                 return (0);
1426         }
1427
1428         dvp = ncp->nc_dvp;
1429         mtx_unlock(blp);
1430         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1431 }
1432
     /*
      * Remove the cache entry for the name described by cnp in directory dvp.
      *
      * Returns 1 if an entry (or the ".." shortcut of dvp) was removed and 0
      * if there was nothing to remove.
      */
1433 static __noinline int
1434 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1435 {
1436         struct namecache *ncp;
1437         struct mtx *blp;
1438         struct mtx *dvlp, *dvlp2;
1439         uint32_t hash;
1440         int error;
1441
             /*
              * ".." is not looked up in the hash; it is reached through
              * dvp->v_cache_dd under the vnode lock of dvp.
              */
1442         if (cnp->cn_namelen == 2 &&
1443             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1444                 dvlp = VP2VNODELOCK(dvp);
1445                 dvlp2 = NULL;
1446                 mtx_lock(dvlp);
1447 retry_dotdot:
1448                 ncp = dvp->v_cache_dd;
1449                 if (ncp == NULL) {
1450                         mtx_unlock(dvlp);
1451                         if (dvlp2 != NULL)
1452                                 mtx_unlock(dvlp2);
1453                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1454                         return (0);
1455                 }
1456                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                             /*
                              * A real ".." entry: zap it. On failure the
                              * helper leaves an extra lock in dvlp2 and the
                              * entry has to be read again.
                              */
1457                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1458                                 goto retry_dotdot;
1459                         MPASS(dvp->v_cache_dd == NULL);
1460                         mtx_unlock(dvlp);
1461                         if (dvlp2 != NULL)
1462                                 mtx_unlock(dvlp2);
1463                         cache_free(ncp);
1464                 } else {
                             /*
                              * v_cache_dd points at an entry which is not
                              * a ".." entry: only the pointer is cleared,
                              * the entry itself is left alone.
                              */
1465                         vn_seqc_write_begin(dvp);
1466                         dvp->v_cache_dd = NULL;
1467                         vn_seqc_write_end(dvp);
1468                         mtx_unlock(dvlp);
1469                         if (dvlp2 != NULL)
1470                                 mtx_unlock(dvlp2);
1471                 }
1472                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1473                 return (1);
1474         }
1475
1476         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1477         blp = HASH2BUCKETLOCK(hash);
1478 retry:
             /* Check for an empty chain before taking the bucket lock. */
1479         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1480                 goto out_no_entry;
1481
1482         mtx_lock(blp);
1483
1484         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1485                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1486                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1487                         break;
1488         }
1489
1490         if (ncp == NULL) {
1491                 mtx_unlock(blp);
1492                 goto out_no_entry;
1493         }
1494
             /* The helper unlocks blp whether or not it managed to zap. */
1495         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1496         if (__predict_false(error != 0)) {
                     /*
                      * NOTE(review): plain increment of a file-level counter
                      * with none of the locks taken by this function held,
                      * whereas other counters here go through counter(9) or
                      * atomic ops - confirm a lossy count is acceptable.
                      */
1497                 zap_bucket_fail++;
1498                 goto retry;
1499         }
1500         counter_u64_add(numposzaps, 1);
1501         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1502         cache_free(ncp);
1503         return (1);
1504 out_no_entry:
1505         counter_u64_add(nummisszap, 1);
1506         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1507         return (0);
1508 }
1509
1510 static int __noinline
1511 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1512     struct timespec *tsp, int *ticksp)
1513 {
1514         int ltype;
1515
1516         *vpp = dvp;
1517         counter_u64_add(dothits, 1);
1518         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1519         if (tsp != NULL)
1520                 timespecclear(tsp);
1521         if (ticksp != NULL)
1522                 *ticksp = ticks;
1523         vrefact(*vpp);
1524         /*
1525          * When we lookup "." we still can be asked to lock it
1526          * differently...
1527          */
1528         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1529         if (ltype != VOP_ISLOCKED(*vpp)) {
1530                 if (ltype == LK_EXCLUSIVE) {
1531                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1532                         if (VN_IS_DOOMED((*vpp))) {
1533                                 /* forced unmount */
1534                                 vrele(*vpp);
1535                                 *vpp = NULL;
1536                                 return (ENOENT);
1537                         }
1538                 } else
1539                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1540         }
1541         return (-1);
1542 }
1543
1544 static int __noinline
1545 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1546     struct timespec *tsp, int *ticksp)
1547 {
1548         struct namecache_ts *ncp_ts;
1549         struct namecache *ncp;
1550         struct mtx *dvlp;
1551         enum vgetstate vs;
1552         int error, ltype;
1553         bool whiteout;
1554
1555         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1556
1557         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1558                 cache_remove_cnp(dvp, cnp);
1559                 return (0);
1560         }
1561
1562         counter_u64_add(dotdothits, 1);
1563 retry:
1564         dvlp = VP2VNODELOCK(dvp);
1565         mtx_lock(dvlp);
1566         ncp = dvp->v_cache_dd;
1567         if (ncp == NULL) {
1568                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1569                 mtx_unlock(dvlp);
1570                 return (0);
1571         }
1572         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1573                 if (ncp->nc_flag & NCF_NEGATIVE)
1574                         *vpp = NULL;
1575                 else
1576                         *vpp = ncp->nc_vp;
1577         } else
1578                 *vpp = ncp->nc_dvp;
1579         if (*vpp == NULL)
1580                 goto negative_success;
1581         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1582         cache_out_ts(ncp, tsp, ticksp);
1583         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1584             NCF_DTS && tsp != NULL) {
1585                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1586                 *tsp = ncp_ts->nc_dotdottime;
1587         }
1588
1589         MPASS(dvp != *vpp);
1590         ltype = VOP_ISLOCKED(dvp);
1591         VOP_UNLOCK(dvp);
1592         vs = vget_prep(*vpp);
1593         mtx_unlock(dvlp);
1594         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1595         vn_lock(dvp, ltype | LK_RETRY);
1596         if (VN_IS_DOOMED(dvp)) {
1597                 if (error == 0)
1598                         vput(*vpp);
1599                 *vpp = NULL;
1600                 return (ENOENT);
1601         }
1602         if (error) {
1603                 *vpp = NULL;
1604                 goto retry;
1605         }
1606         return (-1);
1607 negative_success:
1608         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1609                 if (cnp->cn_flags & ISLASTCN) {
1610                         counter_u64_add(numnegzaps, 1);
1611                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1612                         mtx_unlock(dvlp);
1613                         cache_free(ncp);
1614                         return (0);
1615                 }
1616         }
1617
1618         whiteout = (ncp->nc_flag & NCF_WHITE);
1619         cache_out_ts(ncp, tsp, ticksp);
1620         if (cache_neg_hit_prep(ncp))
1621                 cache_neg_promote(ncp);
1622         else
1623                 cache_neg_hit_finish(ncp);
1624         mtx_unlock(dvlp);
1625         if (whiteout)
1626                 cnp->cn_flags |= ISWHITEOUT;
1627         return (ENOENT);
1628 }
1629
1630 /**
1631  * Lookup a name in the name cache
1632  *
1633  * # Arguments
1634  *
1635  * - dvp:       Parent directory in which to search.
1636  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1637  * - cnp:       Parameters of the name search.  The most interesting bits of
1638  *              the cn_flags field have the following meanings:
1639  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1640  *                      it up.
1641  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1642  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1643  *              or negative) lookup, tsp will be filled with any timespec that
1644  *              was stored when this cache entry was created.  However, it will
1645  *              be clear for "." entries.
1646  * - ticksp:    Return storage for alternate cache timestamp.  On a successful
1647  *              (positive or negative) lookup, it will contain the ticks value
1648  *              that was current when the cache entry was created, unless cnp
1649  *              was ".".
1650  *
1651  * Either both tsp and ticksp have to be provided or neither of them.
1652  *
1653  * # Returns
1654  *
1655  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1656  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1657  *              to a forced unmount.  vpp will not be modified.  If the entry
1658  *              is a whiteout, then the ISWHITEOUT flag will be set in
1659  *              cnp->cn_flags.
1660  * - 0:         A cache miss.  vpp will not be modified.
1661  *
1662  * # Locking
1663  *
1664  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1665  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1666  * lock is not recursively acquired.
1667  */
/*
 * Locked slow path of cache_lookup().
 *
 * Repeats the hash chain search for cnp in dvp while holding the bucket
 * mutex selected by the name hash.  cache_lookup() ends up here (through its
 * out_fallback label) whenever the lockless attempt cannot be completed.
 * The caller must not pass a ".." lookup and must have MAKEENTRY or
 * NC_KEEPPOSENTRY set in cnp->cn_flags; both are asserted.
 *
 * Returns -1 on a positive hit (*vpp is set and vget_finish() succeeded),
 * ENOENT on a negative hit (ISWHITEOUT is added to cnp->cn_flags for entries
 * flagged NCF_WHITE) and 0 on a miss.  A negative entry matched by a CREATE
 * of the last path component is zapped and reported as 0 as well.
 */
1668 static int __noinline
1669 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1670     struct timespec *tsp, int *ticksp)
1671 {
1672         struct namecache *ncp;
1673         struct mtx *blp;
1674         uint32_t hash;
1675         enum vgetstate vs;
1676         int error;
1677         bool whiteout;
1678
1679         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1680         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1681
             /*
              * The whole lookup is restarted from here whenever the entry which
              * was found could not be used (vget_finish() or the zap failed).
              */
1682 retry:
1683         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1684         blp = HASH2BUCKETLOCK(hash);
1685         mtx_lock(blp);
1686
             /* An entry matches on parent directory, name length and name bytes. */
1687         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1688                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1689                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1690                         break;
1691         }
1692
1693         if (__predict_false(ncp == NULL)) {
1694                 mtx_unlock(blp);
1695                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1696                     NULL);
1697                 counter_u64_add(nummiss, 1);
1698                 return (0);
1699         }
1700
1701         if (ncp->nc_flag & NCF_NEGATIVE)
1702                 goto negative_success;
1703
1704         counter_u64_add(numposhits, 1);
1705         *vpp = ncp->nc_vp;
1706         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1707         cache_out_ts(ncp, tsp, ticksp);
1708         MPASS(dvp != *vpp);
             /*
              * vget_prep() runs while the bucket lock is still held; the lock is
              * dropped before vget_finish() is called with the caller's lock
              * flags.
              */
1709         vs = vget_prep(*vpp);
1710         mtx_unlock(blp);
1711         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1712         if (error) {
1713                 *vpp = NULL;
1714                 goto retry;
1715         }
1716         return (-1);
1717 negative_success:
             /*
              * A CREATE of the last component removes the negative entry and
              * reports a miss.
              * NOTE(review): neither outcome unlocks blp here, so
              * cache_zap_locked_bucket() presumably drops it itself -- confirm.
              */
1718         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1719                 if (cnp->cn_flags & ISLASTCN) {
1720                         counter_u64_add(numnegzaps, 1);
1721                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1722                         if (__predict_false(error != 0)) {
1723                                 zap_bucket_fail2++;
1724                                 goto retry;
1725                         }
1726                         cache_free(ncp);
1727                         return (0);
1728                 }
1729         }
1730
             /* Everything needed from the entry is read before the lock is dropped. */
1731         whiteout = (ncp->nc_flag & NCF_WHITE);
1732         cache_out_ts(ncp, tsp, ticksp);
1733         if (cache_neg_hit_prep(ncp))
1734                 cache_neg_promote(ncp);
1735         else
1736                 cache_neg_hit_finish(ncp);
1737         mtx_unlock(blp);
1738         if (whiteout)
1739                 cnp->cn_flags |= ISWHITEOUT;
1740         return (ENOENT);
1741 }
1742
/*
 * Look up the name described by cnp in directory dvp.
 *
 * "." and ".." are handed off to cache_lookup_dot() and
 * cache_lookup_dotdot().  When neither MAKEENTRY nor NC_KEEPPOSENTRY is set
 * in cnp->cn_flags, cache_remove_cnp() is called for the name and 0 is
 * returned.  Everything else is first attempted without taking the bucket
 * lock, inside a vfs_smr_enter()/vfs_smr_exit() section; any case which
 * cannot be completed there is redone by cache_lookup_fallback().
 *
 * tsp and ticksp must be both NULL or both non-NULL (asserted).
 */
1743 int
1744 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1745     struct timespec *tsp, int *ticksp)
1746 {
1747         struct namecache *ncp;
1748         uint32_t hash;
1749         enum vgetstate vs;
1750         int error;
1751         bool whiteout, neg_promote;
1752         u_short nc_flag;
1753
1754         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1755
1756 #ifdef DEBUG_CACHE
1757         if (__predict_false(!doingcache)) {
1758                 cnp->cn_flags &= ~MAKEENTRY;
1759                 return (0);
1760         }
1761 #endif
1762
1763         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1764                 if (cnp->cn_namelen == 1)
1765                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1766                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1767                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1768         }
1769
1770         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1771
1772         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1773                 cache_remove_cnp(dvp, cnp);
1774                 return (0);
1775         }
1776
1777         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1778         vfs_smr_enter();
1779
             /* Same match criteria as the locked path: parent, length, name bytes. */
1780         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1781                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1782                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1783                         break;
1784         }
1785
1786         if (__predict_false(ncp == NULL)) {
1787                 vfs_smr_exit();
1788                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1789                     NULL);
1790                 counter_u64_add(nummiss, 1);
1791                 return (0);
1792         }
1793
             /* No lock is held on the entry here, hence the atomic flag load. */
1794         nc_flag = atomic_load_char(&ncp->nc_flag);
1795         if (nc_flag & NCF_NEGATIVE)
1796                 goto negative_success;
1797
1798         counter_u64_add(numposhits, 1);
1799         *vpp = ncp->nc_vp;
1800         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1801         cache_out_ts(ncp, tsp, ticksp);
1802         MPASS(dvp != *vpp);
             /*
              * The entry is re-checked after its fields were read.
              * NOTE(review): cache_ncp_canuse() presumably rejects entries still
              * under construction (NCF_WIP) or being torn down -- confirm.
              */
1803         if (!cache_ncp_canuse(ncp)) {
1804                 vfs_smr_exit();
1805                 *vpp = NULL;
1806                 goto out_fallback;
1807         }
             /*
              * vget_prep_smr() is called inside the SMR section; VGET_NONE means
              * the vnode could not be prepared and the locked path takes over.
              */
1808         vs = vget_prep_smr(*vpp);
1809         vfs_smr_exit();
1810         if (__predict_false(vs == VGET_NONE)) {
1811                 *vpp = NULL;
1812                 goto out_fallback;
1813         }
1814         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1815         if (error) {
1816                 *vpp = NULL;
1817                 goto out_fallback;
1818         }
1819         return (-1);
1820 negative_success:
             /*
              * A CREATE of the last component has to remove the negative entry;
              * cache_lookup_fallback() does that under the bucket lock.
              */
1821         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1822                 if (cnp->cn_flags & ISLASTCN) {
1823                         vfs_smr_exit();
1824                         goto out_fallback;
1825                 }
1826         }
1827
1828         cache_out_ts(ncp, tsp, ticksp);
1829         whiteout = (ncp->nc_flag & NCF_WHITE);
1830         neg_promote = cache_neg_hit_prep(ncp);
             /* Undo the hit preparation if the entry turns out to be unusable. */
1831         if (__predict_false(!cache_ncp_canuse(ncp))) {
1832                 cache_neg_hit_abort(ncp);
1833                 vfs_smr_exit();
1834                 goto out_fallback;
1835         }
             /*
              * Promotion is attempted only after leaving the SMR section; if
              * cache_neg_promote_cond() returns false the lookup is redone with
              * locks held.
              */
1836         if (neg_promote) {
1837                 vfs_smr_exit();
1838                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1839                         goto out_fallback;
1840         } else {
1841                 cache_neg_hit_finish(ncp);
1842                 vfs_smr_exit();
1843         }
1844         if (whiteout)
1845                 cnp->cn_flags |= ISWHITEOUT;
1846         return (ENOENT);
             /* Every jump to this label has already called vfs_smr_exit(). */
1847 out_fallback:
1848         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1849 }
1850
/*
 * Set of locks held while an entry is being added to the cache.
 *
 * vlp holds up to three vnode locks and blp up to two bucket locks; a NULL
 * slot means that no lock is recorded there.  The unlock routines skip NULL
 * slots, except for blp[1], which is unlocked unconditionally.
 */
1851 struct celockstate {
1852         struct mtx *vlp[3];
1853         struct mtx *blp[2];
1854 };
     /* The lock and unlock routines index the slots by hand; pin the sizes. */
1855 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1856 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1857
1858 static inline void
1859 cache_celockstate_init(struct celockstate *cel)
1860 {
1861
1862         bzero(cel, sizeof(*cel));
1863 }
1864
1865 static void
1866 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1867     struct vnode *dvp)
1868 {
1869         struct mtx *vlp1, *vlp2;
1870
1871         MPASS(cel->vlp[0] == NULL);
1872         MPASS(cel->vlp[1] == NULL);
1873         MPASS(cel->vlp[2] == NULL);
1874
1875         MPASS(vp != NULL || dvp != NULL);
1876
1877         vlp1 = VP2VNODELOCK(vp);
1878         vlp2 = VP2VNODELOCK(dvp);
1879         cache_sort_vnodes(&vlp1, &vlp2);
1880
1881         if (vlp1 != NULL) {
1882                 mtx_lock(vlp1);
1883                 cel->vlp[0] = vlp1;
1884         }
1885         mtx_lock(vlp2);
1886         cel->vlp[1] = vlp2;
1887 }
1888
1889 static void
1890 cache_unlock_vnodes_cel(struct celockstate *cel)
1891 {
1892
1893         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1894
1895         if (cel->vlp[0] != NULL)
1896                 mtx_unlock(cel->vlp[0]);
1897         if (cel->vlp[1] != NULL)
1898                 mtx_unlock(cel->vlp[1]);
1899         if (cel->vlp[2] != NULL)
1900                 mtx_unlock(cel->vlp[2]);
1901 }
1902
1903 static bool
1904 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1905 {
1906         struct mtx *vlp;
1907         bool ret;
1908
1909         cache_assert_vlp_locked(cel->vlp[0]);
1910         cache_assert_vlp_locked(cel->vlp[1]);
1911         MPASS(cel->vlp[2] == NULL);
1912
1913         MPASS(vp != NULL);
1914         vlp = VP2VNODELOCK(vp);
1915
1916         ret = true;
1917         if (vlp >= cel->vlp[1]) {
1918                 mtx_lock(vlp);
1919         } else {
1920                 if (mtx_trylock(vlp))
1921                         goto out;
1922                 cache_lock_vnodes_cel_3_failures++;
1923                 cache_unlock_vnodes_cel(cel);
1924                 if (vlp < cel->vlp[0]) {
1925                         mtx_lock(vlp);
1926                         mtx_lock(cel->vlp[0]);
1927                         mtx_lock(cel->vlp[1]);
1928                 } else {
1929                         if (cel->vlp[0] != NULL)
1930                                 mtx_lock(cel->vlp[0]);
1931                         mtx_lock(vlp);
1932                         mtx_lock(cel->vlp[1]);
1933                 }
1934                 ret = false;
1935         }
1936 out:
1937         cel->vlp[2] = vlp;
1938         return (ret);
1939 }
1940
1941 static void
1942 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1943     struct mtx *blp2)
1944 {
1945
1946         MPASS(cel->blp[0] == NULL);
1947         MPASS(cel->blp[1] == NULL);
1948
1949         cache_sort_vnodes(&blp1, &blp2);
1950
1951         if (blp1 != NULL) {
1952                 mtx_lock(blp1);
1953                 cel->blp[0] = blp1;
1954         }
1955         mtx_lock(blp2);
1956         cel->blp[1] = blp2;
1957 }
1958
1959 static void
1960 cache_unlock_buckets_cel(struct celockstate *cel)
1961 {
1962
1963         if (cel->blp[0] != NULL)
1964                 mtx_unlock(cel->blp[0]);
1965         mtx_unlock(cel->blp[1]);
1966 }
1967
1968 /*
1969  * Lock part of the cache affected by the insertion.
1970  *
1971  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1972  * However, insertion can result in removal of an old entry. In this
1973  * case we have an additional vnode and bucketlock pair to lock.
1974  *
1975  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1976  * preserving the locking order (smaller address first).
      *
      * The old entry in question is the ".." entry hanging off vp->v_cache_dd
      * when vp is a directory; cache_enter_time() zaps it when it installs the
      * new entry.  On return cel records every lock taken; release them with
      * cache_enter_unlock().
1977  */
1978 static void
1979 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1980     uint32_t hash)
1981 {
1982         struct namecache *ncp;
1983         struct mtx *blps[2];
1984
             /* blps[0]: bucket of the new entry, blps[1]: bucket of the old one. */
1985         blps[0] = HASH2BUCKETLOCK(hash);
1986         for (;;) {
1987                 blps[1] = NULL;
1988                 cache_lock_vnodes_cel(cel, dvp, vp);
                     /* Only a directory vp can carry a ".." entry to get rid of. */
1989                 if (vp == NULL || vp->v_type != VDIR)
1990                         break;
1991                 ncp = vp->v_cache_dd;
1992                 if (ncp == NULL)
1993                         break;
1994                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1995                         break;
1996                 MPASS(ncp->nc_dvp == vp);
1997                 blps[1] = NCP2BUCKETLOCK(ncp);
                     /* A negative ".." entry has no target vnode to lock. */
1998                 if (ncp->nc_flag & NCF_NEGATIVE)
1999                         break;
                     /* true: the third lock was added without dropping the others. */
2000                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2001                         break;
2002                 /*
2003                  * All vnodes got re-locked. Re-validate the state and if
2004                  * nothing changed we are done. Otherwise restart.
2005                  */
2006                 if (ncp == vp->v_cache_dd &&
2007                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2008                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2009                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2010                         break;
                     /* Start over with a clean slate. */
2011                 cache_unlock_vnodes_cel(cel);
2012                 cel->vlp[0] = NULL;
2013                 cel->vlp[1] = NULL;
2014                 cel->vlp[2] = NULL;
2015         }
             /* Bucket locks are taken only after all vnode locks are held. */
2016         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2017 }
2018
/*
 * Variant of cache_enter_lock() used when the ".." entry of dvp itself is
 * about to be removed (see cache_enter_dotdot_prep()).
 *
 * Locks the vnode locks of dvp and vp and the bucket lock for hash.  If
 * dvp->v_cache_dd is a ".." entry, the bucket lock of that entry is taken
 * too and, unless the entry is negative, so is the vnode lock of its target.
 * Unlike cache_enter_lock(), the type of vp is not looked at.
 */
2019 static void
2020 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2021     uint32_t hash)
2022 {
2023         struct namecache *ncp;
2024         struct mtx *blps[2];
2025
2026         blps[0] = HASH2BUCKETLOCK(hash);
2027         for (;;) {
2028                 blps[1] = NULL;
2029                 cache_lock_vnodes_cel(cel, dvp, vp);
2030                 ncp = dvp->v_cache_dd;
2031                 if (ncp == NULL)
2032                         break;
2033                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2034                         break;
2035                 MPASS(ncp->nc_dvp == dvp);
2036                 blps[1] = NCP2BUCKETLOCK(ncp);
                     /* A negative ".." entry has no target vnode to lock. */
2037                 if (ncp->nc_flag & NCF_NEGATIVE)
2038                         break;
                     /* true: the third lock was added without dropping the others. */
2039                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2040                         break;
                     /*
                      * Everything was relocked; accept the result only if the
                      * entry and the locks derived from it are unchanged.
                      */
2041                 if (ncp == dvp->v_cache_dd &&
2042                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2043                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2044                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2045                         break;
                     /* Something changed, start over with a clean slate. */
2046                 cache_unlock_vnodes_cel(cel);
2047                 cel->vlp[0] = NULL;
2048                 cel->vlp[1] = NULL;
2049                 cel->vlp[2] = NULL;
2050         }
2051         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2052 }
2053
/*
 * Release everything recorded in cel: the bucket locks first, then the vnode
 * locks.  Counterpart of cache_enter_lock() and cache_enter_lock_dd().
 */
2054 static void
2055 cache_enter_unlock(struct celockstate *cel)
2056 {
2057
2058         cache_unlock_buckets_cel(cel);
2059         cache_unlock_vnodes_cel(cel);
2060 }
2061
2062 static void __noinline
2063 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2064     struct componentname *cnp)
2065 {
2066         struct celockstate cel;
2067         struct namecache *ncp;
2068         uint32_t hash;
2069         int len;
2070
2071         if (dvp->v_cache_dd == NULL)
2072                 return;
2073         len = cnp->cn_namelen;
2074         cache_celockstate_init(&cel);
2075         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2076         cache_enter_lock_dd(&cel, dvp, vp, hash);
2077         vn_seqc_write_begin(dvp);
2078         ncp = dvp->v_cache_dd;
2079         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2080                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2081                 cache_zap_locked(ncp);
2082         } else {
2083                 ncp = NULL;
2084         }
2085         dvp->v_cache_dd = NULL;
2086         vn_seqc_write_end(dvp);
2087         cache_enter_unlock(&cel);
2088         if (ncp != NULL)
2089                 cache_free(ncp);
2090 }
2091
2092 /*
2093  * Add an entry to the cache.
      *
      * dvp is the directory and cnp the name.  vp is the vnode the name resolves
      * to, or NULL to record a negative entry (NCF_WHITE is set on it when
      * cnp->cn_flags has ISWHITEOUT).  tsp, when not NULL, is stored in the entry
      * together with the current ticks; dtsp is only looked at when tsp is not
      * NULL.
      *
      * Nothing is added for ".", when the cache is at its size limit (ncsize), or
      * when an entry for the same name is already present.
2094  */
2095 void
2096 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2097     struct timespec *tsp, struct timespec *dtsp)
2098 {
2099         struct celockstate cel;
2100         struct namecache *ncp, *n2, *ndd;
2101         struct namecache_ts *ncp_ts;
2102         struct nchashhead *ncpp;
2103         uint32_t hash;
2104         int flag;
2105         int len;
2106         u_long lnumcache;
2107
2108         VNPASS(dvp != vp, dvp);
2109         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2110         VNPASS(dvp->v_type != VNON, dvp);
2111         if (vp != NULL) {
2112                 VNPASS(!VN_IS_DOOMED(vp), vp);
2113                 VNPASS(vp->v_type != VNON, vp);
2114         }
2115
2116 #ifdef DEBUG_CACHE
2117         if (__predict_false(!doingcache))
2118                 return;
2119 #endif
2120
             /* "." is never cached; ".." first gets the old entry of dvp removed. */
2121         flag = 0;
2122         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2123                 if (cnp->cn_namelen == 1)
2124                         return;
2125                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2126                         cache_enter_dotdot_prep(dvp, vp, cnp);
2127                         flag = NCF_ISDOTDOT;
2128                 }
2129         }
2130
2131         /*
2132          * Avoid blowout in namecache entries.
2133          *
2134          * Bugs:
2135          * 1. filesystems may end up trying to add an already existing entry
2136          * (for example this can happen after a cache miss during concurrent
2137          * lookup), in which case we will call cache_neg_evict despite not
2138          * adding anything.
2139          * 2. the routine may fail to free anything and no provisions are made
2140          * to make it try harder (see the inside for failure modes)
2141          * 3. it only ever looks at negative entries.
2142          */
             /*
              * The slot is reserved in numcache up front; every exit which does
              * not insert the entry gives it back.
              */
2143         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
2144         if (cache_neg_evict_cond(lnumcache)) {
2145                 lnumcache = atomic_load_long(&numcache);
2146         }
2147         if (__predict_false(lnumcache >= ncsize)) {
2148                 atomic_subtract_long(&numcache, 1);
2149                 counter_u64_add(numdrops, 1);
2150                 return;
2151         }
2152
2153         cache_celockstate_init(&cel);
2154         ndd = NULL;
2155         ncp_ts = NULL;
2156
2157         /*
2158          * Calculate the hash key and setup as much of the new
2159          * namecache entry as possible before acquiring the lock.
2160          */
2161         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
             /* NCF_WIP stays set until the entry is fully linked in, see below. */
2162         ncp->nc_flag = flag | NCF_WIP;
2163         ncp->nc_vp = vp;
2164         if (vp == NULL)
2165                 cache_neg_init(ncp);
2166         ncp->nc_dvp = dvp;
2167         if (tsp != NULL) {
2168                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2169                 ncp_ts->nc_time = *tsp;
2170                 ncp_ts->nc_ticks = ticks;
2171                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2172                 if (dtsp != NULL) {
2173                         ncp_ts->nc_dotdottime = *dtsp;
2174                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2175                 }
2176         }
2177         len = ncp->nc_nlen = cnp->cn_namelen;
2178         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2179         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2180         ncp->nc_name[len] = '\0';
2181         cache_enter_lock(&cel, dvp, vp, hash);
2182
2183         /*
2184          * See if this vnode or negative entry is already in the cache
2185          * with this name.  This can happen with concurrent lookups of
2186          * the same path name.
2187          */
2188         ncpp = NCHHASH(hash);
2189         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2190                 if (n2->nc_dvp == dvp &&
2191                     n2->nc_nlen == cnp->cn_namelen &&
2192                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2193                         MPASS(cache_ncp_canuse(n2));
2194                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2195                                 KASSERT(vp == NULL,
2196                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2197                                     __func__, NULL, vp));
2198                         else
2199                                 KASSERT(n2->nc_vp == vp,
2200                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2201                                     __func__, n2->nc_vp, vp));
2202                         /*
2203                          * Entries are supposed to be immutable unless in the
2204                          * process of getting destroyed. Accommodating for
2205                          * changing timestamps is possible but not worth it.
2206                          * This should be harmless in terms of correctness, in
2207                          * the worst case resulting in an earlier expiration.
2208                          * Alternatively, the found entry can be replaced
2209                          * altogether.
2210                          */
2211                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2212 #if 0
2213                         if (tsp != NULL) {
2214                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2215                                     ("no NCF_TS"));
2216                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2217                                 n2_ts->nc_time = ncp_ts->nc_time;
2218                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2219                                 if (dtsp != NULL) {
2220                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2221                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2222                                 }
2223                         }
2224 #endif
2225                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2226                             vp);
2227                         goto out_unlock_free;
2228                 }
2229         }
2230
2231         if (flag == NCF_ISDOTDOT) {
2232                 /*
2233                  * See if we are trying to add .. entry, but some other lookup
2234                  * has populated v_cache_dd pointer already.
2235                  */
2236                 if (dvp->v_cache_dd != NULL)
2237                         goto out_unlock_free;
2238                 KASSERT(vp == NULL || vp->v_type == VDIR,
2239                     ("wrong vnode type %p", vp));
2240                 vn_seqc_write_begin(dvp);
2241                 dvp->v_cache_dd = ncp;
2242                 vn_seqc_write_end(dvp);
2243         }
2244
2245         if (vp != NULL) {
2246                 if (flag != NCF_ISDOTDOT) {
2247                         /*
2248                          * For this case, the cache entry maps both the
2249                          * directory name in it and the name ".." for the
2250                          * directory's parent.
2251                          */
2252                         vn_seqc_write_begin(vp);
                             /*
                              * A ".." entry previously hanging off vp is zapped
                              * here and freed once the locks are dropped; any
                              * other entry is merely unhooked from v_cache_dd.
                              */
2253                         if ((ndd = vp->v_cache_dd) != NULL) {
2254                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2255                                         cache_zap_locked(ndd);
2256                                 else
2257                                         ndd = NULL;
2258                         }
2259                         vp->v_cache_dd = ncp;
2260                         vn_seqc_write_end(vp);
2261                 } else if (vp->v_type != VDIR) {
2262                         if (vp->v_cache_dd != NULL) {
2263                                 vn_seqc_write_begin(vp);
2264                                 vp->v_cache_dd = NULL;
2265                                 vn_seqc_write_end(vp);
2266                         }
2267                 }
2268         }
2269
             /*
              * ".." entries are not put on the source list of dvp.  The first
              * entry placed on that list takes a hold on dvp.
              */
2270         if (flag != NCF_ISDOTDOT) {
2271                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2272                         vhold(dvp);
2273                         counter_u64_add(numcachehv, 1);
2274                 }
2275                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2276         }
2277
2278         /*
2279          * If the entry is "negative", we place it into the
2280          * "negative" cache queue, otherwise, we place it into the
2281          * destination vnode's cache entries queue.
2282          */
2283         if (vp != NULL) {
2284                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2285                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2286                     vp);
2287         } else {
2288                 if (cnp->cn_flags & ISWHITEOUT)
2289                         ncp->nc_flag |= NCF_WHITE;
2290                 cache_neg_insert(ncp);
2291                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2292                     ncp->nc_name);
2293         }
2294
2295         /*
2296          * Insert the new namecache entry into the appropriate chain
2297          * within the cache entries table.
2298          */
2299         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2300
             /* Everything stored above is ordered before NCF_WIP gets cleared. */
2301         atomic_thread_fence_rel();
2302         /*
2303          * Mark the entry as fully constructed.
2304          * It is immutable past this point until its removal.
2305          */
2306         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2307
2308         cache_enter_unlock(&cel);
2309         if (ndd != NULL)
2310                 cache_free(ndd);
2311         return;
             /* Entry not inserted: return the numcache slot and free the entry. */
2312 out_unlock_free:
2313         cache_enter_unlock(&cel);
2314         atomic_subtract_long(&numcache, 1);
2315         cache_free(ncp);
2316         return;
2317 }
2318
/*
 * Return the smallest power of two which is strictly greater than val.
 *
 * The result is capped at the largest power of two a u_int can hold.
 * Without the cap the shift wraps res to 0 for any val with the top bit set,
 * 0 <= val holds forever and the loop never terminates.
 */
static u_int
cache_roundup_2(u_int val)
{
	/* Only the most significant bit set. */
	const u_int maxpow2 = ~(~(u_int)0 >> 1);
	u_int res;

	for (res = 1; res <= val; res <<= 1) {
		if (res == maxpow2)
			break;
	}

	return (res);
}
2329
2330 static struct nchashhead *
2331 nchinittbl(u_long elements, u_long *hashmask)
2332 {
2333         struct nchashhead *hashtbl;
2334         u_long hashsize, i;
2335
2336         hashsize = cache_roundup_2(elements) / 2;
2337
2338         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2339         for (i = 0; i < hashsize; i++)
2340                 CK_SLIST_INIT(&hashtbl[i]);
2341         *hashmask = hashsize - 1;
2342         return (hashtbl);
2343 }
2344
/*
 * Release a hash table obtained from nchinittbl().  Only the array of chain
 * heads is freed; entries still linked into it are not touched.
 */
2345 static void
2346 ncfreetbl(struct nchashhead *hashtbl)
2347 {
2348
2349         free(hashtbl, M_VFSCACHE);
2350 }
2351
2352 /*
2353  * Name cache initialization, run while booting through the SYSINIT
      * registered right below the function (SI_SUB_VFS, SI_ORDER_SECOND).
      *
      * Creates the four entry zones, sizes the cache and the hash table from
      * desiredvnodes, and sets up the bucket locks, the vnode locks and the
      * negative entry lists.
2354  */
2355 static void
2356 nchinit(void *dummy __unused)
2357 {
2358         u_int i;
2359
             /* One zone per combination of small/large name, with/without timestamps. */
2360         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2361             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2362         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2363             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2364         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2365             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2366         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2367             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2368
2369         VFS_SMR_ZONE_SET(cache_zone_small);
2370         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2371         VFS_SMR_ZONE_SET(cache_zone_large);
2372         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2373
2374         ncsize = desiredvnodes * ncsizefactor;
2375         cache_recalc_neg_min(ncnegminpct);
2376         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
             /*
              * The bucket lock mask is derived from the CPU count, kept at 7 or
              * more and never larger than the hash table mask.
              * NOTE(review): numbucketlocks and numvnodelocks are presumably
              * derived from ncbuckethash and ncvnodehash, which is why those are
              * assigned before the respective malloc() -- confirm.
              */
2377         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2378         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2379                 ncbuckethash = 7;
2380         if (ncbuckethash > nchash)
2381                 ncbuckethash = nchash;
2382         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2383             M_WAITOK | M_ZERO);
2384         for (i = 0; i < numbucketlocks; i++)
2385                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2386         ncvnodehash = ncbuckethash;
2387         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2388             M_WAITOK | M_ZERO);
2389         for (i = 0; i < numvnodelocks; i++)
2390                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2391
             /* Each negative list has two locks and two queues (regular and hot). */
2392         for (i = 0; i < numneglists; i++) {
2393                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2394                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2395                 TAILQ_INIT(&neglists[i].nl_list);
2396                 TAILQ_INIT(&neglists[i].nl_hotlist);
2397         }
2398 }
2399 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2400
2401 void
2402 cache_vnode_init(struct vnode *vp)
2403 {
2404
2405         LIST_INIT(&vp->v_cache_src);
2406         TAILQ_INIT(&vp->v_cache_dst);
2407         vp->v_cache_dd = NULL;
2408         cache_prehash(vp);
2409 }
2410
2411 void
2412 cache_changesize(u_long newmaxvnodes)
2413 {
2414         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2415         u_long new_nchash, old_nchash;
2416         struct namecache *ncp;
2417         uint32_t hash;
2418         u_long newncsize;
2419         int i;
2420
2421         newncsize = newmaxvnodes * ncsizefactor;
2422         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2423         if (newmaxvnodes < numbucketlocks)
2424                 newmaxvnodes = numbucketlocks;
2425
2426         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2427         /* If same hash table size, nothing to do */
2428         if (nchash == new_nchash) {
2429                 ncfreetbl(new_nchashtbl);
2430                 return;
2431         }
2432         /*
2433          * Move everything from the old hash table to the new table.
2434          * None of the namecache entries in the table can be removed
2435          * because to do so, they have to be removed from the hash table.
2436          */
2437         cache_lock_all_vnodes();
2438         cache_lock_all_buckets();
2439         old_nchashtbl = nchashtbl;
2440         old_nchash = nchash;
2441         nchashtbl = new_nchashtbl;
2442         nchash = new_nchash;
2443         for (i = 0; i <= old_nchash; i++) {
2444                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2445                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2446                             ncp->nc_dvp);
2447                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2448                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2449                 }
2450         }
2451         ncsize = newncsize;
2452         cache_recalc_neg_min(ncnegminpct);
2453         cache_unlock_all_buckets();
2454         cache_unlock_all_vnodes();
2455         ncfreetbl(old_nchashtbl);
2456 }
2457
2458 /*
2459  * Invalidate all entries from and to a particular vnode.
      *
      * Entries with vp as the source (v_cache_src), entries with vp as the
      * destination (v_cache_dst) and the entry linked at v_cache_dd are zapped
      * while the vnode lock of vp is held.  The zapped entries are collected on
      * a local list, linked through nc_dst, and freed only after the locks are
      * dropped.
2460  */
2461 static void
2462 cache_purge_impl(struct vnode *vp)
2463 {
2464         TAILQ_HEAD(, namecache) ncps;
2465         struct namecache *ncp, *nnp;
2466         struct mtx *vlp, *vlp2;
2467
2468         TAILQ_INIT(&ncps);
2469         vlp = VP2VNODELOCK(vp);
2470         vlp2 = NULL;
2471         mtx_lock(vlp);
             /*
              * Whenever cache_zap_locked_vnode_kl2() returns false the entry was
              * not zapped and the scan starts over from the top.
              * NOTE(review): vlp2 presumably tracks a second vnode lock which
              * the helper may leave held between calls -- confirm.
              */
2472 retry:
2473         while (!LIST_EMPTY(&vp->v_cache_src)) {
2474                 ncp = LIST_FIRST(&vp->v_cache_src);
2475                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2476                         goto retry;
2477                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2478         }
2479         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2480                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2481                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2482                         goto retry;
2483                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2484         }
             /* Whatever is still linked here must be a ".." entry. */
2485         ncp = vp->v_cache_dd;
2486         if (ncp != NULL) {
2487                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2488                    ("lost dotdot link"));
2489                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2490                         goto retry;
2491                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2492         }
2493         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2494         mtx_unlock(vlp);
2495         if (vlp2 != NULL)
2496                 mtx_unlock(vlp2);
             /* No locks are held anymore, the collected entries can be freed. */
2497         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2498                 cache_free(ncp);
2499         }
2500 }
2501
2502 /*
2503  * Opportunistic check to see if there is anything to do.
2504  */
2505 static bool
2506 cache_has_entries(struct vnode *vp)
2507 {
2508
2509         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2510             vp->v_cache_dd == NULL)
2511                 return (false);
2512         return (true);
2513 }
2514
2515 void
2516 cache_purge(struct vnode *vp)
2517 {
2518
2519         SDT_PROBE1(vfs, namecache, purge, done, vp);
2520         if (!cache_has_entries(vp))
2521                 return;
2522         cache_purge_impl(vp);
2523 }
2524
/*
 * Only to be used by vgone.
 *
 * Purge variant for a vnode which is already doomed (asserted).  If the
 * first unlocked check finds no entries, the routine waits for the vnode
 * lock to be released and checks once more before concluding that there is
 * nothing to purge.
 */
void
cache_purge_vgone(struct vnode *vp)
{

	VNPASS(VN_IS_DOOMED(vp), vp);
	if (!cache_has_entries(vp)) {
		/*
		 * Serialize against a potential thread doing cache_purge.
		 */
		mtx_wait_unlocked(VP2VNODELOCK(vp));
		if (!cache_has_entries(vp))
			return;
	}
	cache_purge_impl(vp);
}
2550
2551 /*
2552  * Invalidate all negative entries for a particular directory vnode.
2553  */
2554 void
2555 cache_purge_negative(struct vnode *vp)
2556 {
2557         TAILQ_HEAD(, namecache) ncps;
2558         struct namecache *ncp, *nnp;
2559         struct mtx *vlp;
2560
2561         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2562         if (LIST_EMPTY(&vp->v_cache_src))
2563                 return;
2564         TAILQ_INIT(&ncps);
2565         vlp = VP2VNODELOCK(vp);
2566         mtx_lock(vlp);
2567         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2568                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2569                         continue;
2570                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2571                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2572         }
2573         mtx_unlock(vlp);
2574         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2575                 cache_free(ncp);
2576         }
2577 }
2578
2579 void
2580 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2581     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2582 {
2583
2584         ASSERT_VOP_IN_SEQC(fdvp);
2585         ASSERT_VOP_IN_SEQC(fvp);
2586         ASSERT_VOP_IN_SEQC(tdvp);
2587         if (tvp != NULL)
2588                 ASSERT_VOP_IN_SEQC(tvp);
2589
2590         cache_purge(fvp);
2591         if (tvp != NULL) {
2592                 cache_purge(tvp);
2593                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2594                     ("%s: lingering negative entry", __func__));
2595         } else {
2596                 cache_remove_cnp(tdvp, tcnp);
2597         }
2598 }
2599
#ifdef INVARIANTS
/*
 * Debug check: if the cache holds an entry for (dvp, cnp), it must point at
 * vp.  Panics on a mismatch; a missing entry is fine.
 */
void
cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct mtx *blp;
	uint32_t hash;

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		return;

	blp = HASH2BUCKETLOCK(hash);
	mtx_lock(blp);
	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		/* Skip chain members which are for a different name. */
		if (ncp->nc_dvp != dvp || ncp->nc_nlen != cnp->cn_namelen)
			continue;
		if (bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) != 0)
			continue;
		if (ncp->nc_vp == vp)
			continue;
		panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
		    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
		    ncp->nc_vp);
	}
	mtx_unlock(blp);
}
#endif
2628
2629 /*
2630  * Flush all entries referencing a particular filesystem.
2631  */
2632 void
2633 cache_purgevfs(struct mount *mp)
2634 {
2635         struct vnode *vp, *mvp;
2636
2637         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2638         /*
2639          * Somewhat wasteful iteration over all vnodes. Would be better to
2640          * support filtering and avoid the interlock to begin with.
2641          */
2642         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2643                 if (!cache_has_entries(vp)) {
2644                         VI_UNLOCK(vp);
2645                         continue;
2646                 }
2647                 vholdl(vp);
2648                 VI_UNLOCK(vp);
2649                 cache_purge(vp);
2650                 vdrop(vp);
2651         }
2652 }
2653
2654 /*
2655  * Perform canonical checks and cache lookup and pass on to filesystem
2656  * through the vop_cachedlookup only if needed.
2657  */
2658
2659 int
2660 vfs_cache_lookup(struct vop_lookup_args *ap)
2661 {
2662         struct vnode *dvp;
2663         int error;
2664         struct vnode **vpp = ap->a_vpp;
2665         struct componentname *cnp = ap->a_cnp;
2666         int flags = cnp->cn_flags;
2667
2668         *vpp = NULL;
2669         dvp = ap->a_dvp;
2670
2671         if (dvp->v_type != VDIR)
2672                 return (ENOTDIR);
2673
2674         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2675             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2676                 return (EROFS);
2677
2678         error = vn_dir_check_exec(dvp, cnp);
2679         if (error != 0)
2680                 return (error);
2681
2682         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2683         if (error == 0)
2684                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2685         if (error == -1)
2686                 return (0);
2687         return (error);
2688 }
2689
2690 /* Implementation of the getcwd syscall. */
2691 int
2692 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2693 {
2694         char *buf, *retbuf;
2695         size_t buflen;
2696         int error;
2697
2698         buflen = uap->buflen;
2699         if (__predict_false(buflen < 2))
2700                 return (EINVAL);
2701         if (buflen > MAXPATHLEN)
2702                 buflen = MAXPATHLEN;
2703
2704         buf = uma_zalloc(namei_zone, M_WAITOK);
2705         error = vn_getcwd(buf, &retbuf, &buflen);
2706         if (error == 0)
2707                 error = copyout(retbuf, uap->buf, buflen);
2708         uma_zfree(namei_zone, buf);
2709         return (error);
2710 }
2711
2712 int
2713 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2714 {
2715         struct pwd *pwd;
2716         int error;
2717
2718         vfs_smr_enter();
2719         pwd = pwd_get_smr();
2720         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2721             buflen, 0);
2722         VFS_SMR_ASSERT_NOT_ENTERED();
2723         if (error < 0) {
2724                 pwd = pwd_hold(curthread);
2725                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2726                     retbuf, buflen);
2727                 pwd_drop(pwd);
2728         }
2729
2730 #ifdef KTRACE
2731         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2732                 ktrnamei(*retbuf);
2733 #endif
2734         return (error);
2735 }
2736
2737 static int
2738 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2739     size_t size, int flags, enum uio_seg pathseg)
2740 {
2741         struct nameidata nd;
2742         char *retbuf, *freebuf;
2743         int error;
2744
2745         if (flags != 0)
2746                 return (EINVAL);
2747         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2748             pathseg, path, fd, &cap_fstat_rights, td);
2749         if ((error = namei(&nd)) != 0)
2750                 return (error);
2751         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2752         if (error == 0) {
2753                 error = copyout(retbuf, buf, size);
2754                 free(freebuf, M_TEMP);
2755         }
2756         NDFREE(&nd, 0);
2757         return (error);
2758 }
2759
2760 int
2761 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2762 {
2763
2764         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2765             uap->flags, UIO_USERSPACE));
2766 }
2767
2768 /*
2769  * Retrieve the full filesystem path that correspond to a vnode from the name
2770  * cache (if available)
2771  */
2772 int
2773 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2774 {
2775         struct pwd *pwd;
2776         char *buf;
2777         size_t buflen;
2778         int error;
2779
2780         if (__predict_false(vp == NULL))
2781                 return (EINVAL);
2782
2783         buflen = MAXPATHLEN;
2784         buf = malloc(buflen, M_TEMP, M_WAITOK);
2785         vfs_smr_enter();
2786         pwd = pwd_get_smr();
2787         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2788         VFS_SMR_ASSERT_NOT_ENTERED();
2789         if (error < 0) {
2790                 pwd = pwd_hold(curthread);
2791                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2792                 pwd_drop(pwd);
2793         }
2794         if (error == 0)
2795                 *freebuf = buf;
2796         else
2797                 free(buf, M_TEMP);
2798         return (error);
2799 }
2800
2801 /*
2802  * This function is similar to vn_fullpath, but it attempts to lookup the
2803  * pathname relative to the global root mount point.  This is required for the
2804  * auditing sub-system, as audited pathnames must be absolute, relative to the
2805  * global root mount point.
2806  */
2807 int
2808 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2809 {
2810         char *buf;
2811         size_t buflen;
2812         int error;
2813
2814         if (__predict_false(vp == NULL))
2815                 return (EINVAL);
2816         buflen = MAXPATHLEN;
2817         buf = malloc(buflen, M_TEMP, M_WAITOK);
2818         vfs_smr_enter();
2819         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2820         VFS_SMR_ASSERT_NOT_ENTERED();
2821         if (error < 0) {
2822                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2823         }
2824         if (error == 0)
2825                 *freebuf = buf;
2826         else
2827                 free(buf, M_TEMP);
2828         return (error);
2829 }
2830
2831 static struct namecache *
2832 vn_dd_from_dst(struct vnode *vp)
2833 {
2834         struct namecache *ncp;
2835
2836         cache_assert_vnode_locked(vp);
2837         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2838                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2839                         return (ncp);
2840         }
2841         return (NULL);
2842 }
2843
2844 int
2845 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2846 {
2847         struct vnode *dvp;
2848         struct namecache *ncp;
2849         struct mtx *vlp;
2850         int error;
2851
2852         vlp = VP2VNODELOCK(*vp);
2853         mtx_lock(vlp);
2854         ncp = (*vp)->v_cache_dd;
2855         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2856                 KASSERT(ncp == vn_dd_from_dst(*vp),
2857                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2858                     ncp, vn_dd_from_dst(*vp)));
2859         } else {
2860                 ncp = vn_dd_from_dst(*vp);
2861         }
2862         if (ncp != NULL) {
2863                 if (*buflen < ncp->nc_nlen) {
2864                         mtx_unlock(vlp);
2865                         vrele(*vp);
2866                         counter_u64_add(numfullpathfail4, 1);
2867                         error = ENOMEM;
2868                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2869                             vp, NULL);
2870                         return (error);
2871                 }
2872                 *buflen -= ncp->nc_nlen;
2873                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2874                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2875                     ncp->nc_name, vp);
2876                 dvp = *vp;
2877                 *vp = ncp->nc_dvp;
2878                 vref(*vp);
2879                 mtx_unlock(vlp);
2880                 vrele(dvp);
2881                 return (0);
2882         }
2883         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2884
2885         mtx_unlock(vlp);
2886         vn_lock(*vp, LK_SHARED | LK_RETRY);
2887         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2888         vput(*vp);
2889         if (error) {
2890                 counter_u64_add(numfullpathfail2, 1);
2891                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2892                 return (error);
2893         }
2894
2895         *vp = dvp;
2896         if (VN_IS_DOOMED(dvp)) {
2897                 /* forced unmount */
2898                 vrele(dvp);
2899                 error = ENOENT;
2900                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2901                 return (error);
2902         }
2903         /*
2904          * *vp has its use count incremented still.
2905          */
2906
2907         return (0);
2908 }
2909
2910 /*
2911  * Resolve a directory to a pathname.
2912  *
2913  * The name of the directory can always be found in the namecache or fetched
2914  * from the filesystem. There is also guaranteed to be only one parent, meaning
2915  * we can just follow vnodes up until we find the root.
2916  *
2917  * The vnode must be referenced.
2918  */
/*
 * The path is assembled backwards, from the end of buf towards its start.
 *
 * vp     - directory to resolve; the caller's reference is released on every
 *          return path below, either here or by vn_vptocnp()
 * rdir   - the walk stops at this vnode or at rootvnode, whichever is first
 * buf    - output buffer
 * retbuf - on success set to the first byte of the path inside buf
 * len    - in: bytes of buf available to this function; out (success only):
 *          bytes consumed here plus addend
 * addend - number of bytes at the tail of the buffer already filled in by the
 *          caller; if 0 the NUL terminator is written here.  The callers in
 *          this file which pass a non-zero addend have placed a '/' at the
 *          front of their tail.
 *
 * Returns 0 on success or an errno value (ENOENT, ENOTDIR, ENOMEM or whatever
 * vn_vptocnp() reported).
 */
2919 static int
2920 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2921     size_t *len, size_t addend)
2922 {
2923 #ifdef KDTRACE_HOOKS
2924         struct vnode *startvp = vp;
2925 #endif
2926         struct vnode *vp1;
2927         size_t buflen;
2928         int error;
2929         bool slash_prefixed;
2930
2931         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2932         VNPASS(vp->v_usecount > 0, vp);
2933
2934         buflen = *len;
2935
        /*
         * slash_prefixed tracks whether the text assembled so far already
         * starts with '/'.  It starts out true when the caller supplied a
         * tail and false when only the terminator has been written.
         */
2936         slash_prefixed = true;
2937         if (addend == 0) {
2938                 MPASS(*len >= 2);
2939                 buflen--;
2940                 buf[buflen] = '\0';
2941                 slash_prefixed = false;
2942         }
2943
2944         error = 0;
2945
2946         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2947         counter_u64_add(numfullpathcalls, 1);
2948         while (vp != rdir && vp != rootvnode) {
2949                 /*
2950                  * The vp vnode must be already fully constructed,
2951                  * since it is either found in namecache or obtained
2952                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2953                  * without obtaining the vnode lock.
2954                  */
2955                 if ((vp->v_vflag & VV_ROOT) != 0) {
2956                         vn_lock(vp, LK_RETRY | LK_SHARED);
2957
2958                         /*
2959                          * With the vnode locked, check for races with
2960                          * unmount, forced or not.  Note that we
2961                          * already verified that vp is not equal to
2962                          * the root vnode, which means that
2963                          * mnt_vnodecovered can be NULL only for the
2964                          * case of unmount.
2965                          */
2966                         if (VN_IS_DOOMED(vp) ||
2967                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2968                             vp1->v_mountedhere != vp->v_mount) {
2969                                 vput(vp);
2970                                 error = ENOENT;
2971                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2972                                     error, vp, NULL);
2973                                 break;
2974                         }
2975
                        /*
                         * Continue from the covered vnode: take a reference
                         * on it before dropping the one held on vp.
                         */
2976                         vref(vp1);
2977                         vput(vp);
2978                         vp = vp1;
2979                         continue;
2980                 }
2981                 if (vp->v_type != VDIR) {
2982                         vrele(vp);
2983                         counter_u64_add(numfullpathfail1, 1);
2984                         error = ENOTDIR;
2985                         SDT_PROBE3(vfs, namecache, fullpath, return,
2986                             error, vp, NULL);
2987                         break;
2988                 }
                /*
                 * Prepends the name of vp and moves vp to its parent.  On
                 * failure the reference has already been released by the
                 * callee, hence the plain break.
                 */
2989                 error = vn_vptocnp(&vp, buf, &buflen);
2990                 if (error)
2991                         break;
                /* No room left for the separator in front of the name. */
2992                 if (buflen == 0) {
2993                         vrele(vp);
2994                         error = ENOMEM;
2995                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2996                             startvp, NULL);
2997                         break;
2998                 }
2999                 buf[--buflen] = '/';
3000                 slash_prefixed = true;
3001         }
3002         if (error)
3003                 return (error);
        /*
         * No component was emitted and there is no caller-supplied tail:
         * the result is just "/".
         */
3004         if (!slash_prefixed) {
3005                 if (buflen == 0) {
3006                         vrele(vp);
3007                         counter_u64_add(numfullpathfail4, 1);
3008                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3009                             startvp, NULL);
3010                         return (ENOMEM);
3011                 }
3012                 buf[--buflen] = '/';
3013         }
3014         counter_u64_add(numfullpathfound, 1);
3015         vrele(vp);
3016
3017         *retbuf = buf + buflen;
3018         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
        /* Convert "space still free" into "bytes used", tail included. */
3019         *len -= buflen;
3020         *len += addend;
3021         return (0);
3022 }
3023
/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * This describes the vn_fullpath_any*() routines defined further down, not
 * the small cache_rev_failed() helper which immediately follows.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */
/*
 * Store the given source line number in *reason.  Use through the
 * cache_rev_failed() wrapper, which supplies the line of the call site.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

	reason[0] = line;
}
#define	cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3040
/*
 * Lockless attempt at resolving vp to a path by following v_cache_dd entries
 * (and mnt_vnodecovered for filesystem roots) upwards until rdir or
 * rootvnode is reached.
 *
 * Must be called with the VFS SMR section entered; every return path below
 * leaves it with vfs_smr_exit().
 *
 * The path is assembled backwards from the end of buf.  With addend == 0 the
 * NUL terminator is written here; otherwise the final addend bytes are
 * expected to have been filled in by the caller already, with *buflen not
 * counting them.
 *
 * Returns:
 *   0       - success; *retbuf points at the start of the path and *buflen
 *             is the number of bytes used, addend included
 *   ENOMEM  - a name did not fit
 *   -1      - the walk was abandoned (or cache_fast_revlookup is off); the
 *             callers in this file then retry with a locked resolver
 * On failure *buflen holds the value it had on entry.
 */
3041 static int
3042 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3043     char **retbuf, size_t *buflen, size_t addend)
3044 {
3045 #ifdef KDTRACE_HOOKS
3046         struct vnode *startvp = vp;
3047 #endif
3048         struct vnode *tvp;
3049         struct mount *mp;
3050         struct namecache *ncp;
3051         size_t orig_buflen;
3052         int reason;
3053         int error;
3054 #ifdef KDTRACE_HOOKS
3055         int i;
3056 #endif
3057         seqc_t vp_seqc, tvp_seqc;
3058         u_char nc_flag;
3059
3060         VFS_SMR_ASSERT_ENTERED();
3061
3062         if (!cache_fast_revlookup) {
3063                 vfs_smr_exit();
3064                 return (-1);
3065         }
3066
3067         orig_buflen = *buflen;
3068
3069         if (addend == 0) {
3070                 MPASS(*buflen >= 2);
3071                 *buflen -= 1;
3072                 buf[*buflen] = '\0';
3073         }
3074
        /*
         * Already at the top.  Without a caller-supplied tail the result is
         * "/"; with one there is nothing to prepend.
         */
3075         if (vp == rdir || vp == rootvnode) {
3076                 if (addend == 0) {
3077                         *buflen -= 1;
3078                         buf[*buflen] = '/';
3079                 }
3080                 goto out_ok;
3081         }
3082
3083 #ifdef KDTRACE_HOOKS
3084         i = 0;
3085 #endif
        /* Default result for out_abort; only the ENOMEM case overrides it. */
3086         error = -1;
3087         ncp = NULL; /* for sdt probe down below */
3088         vp_seqc = vn_seqc_read_any(vp);
3089         if (seqc_in_modify(vp_seqc)) {
3090                 cache_rev_failed(&reason);
3091                 goto out_abort;
3092         }
3093
        /*
         * Each iteration moves one vnode up.  The sequence counter of the
         * next vnode is sampled first and only then is the current vnode
         * re-validated against the counter sampled for it earlier.
         */
3094         for (;;) {
3095 #ifdef KDTRACE_HOOKS
3096                 i++;
3097 #endif
                /*
                 * Root of a filesystem: hop to the vnode the mount covers
                 * without emitting a path component.
                 */
3098                 if ((vp->v_vflag & VV_ROOT) != 0) {
3099                         mp = atomic_load_ptr(&vp->v_mount);
3100                         if (mp == NULL) {
3101                                 cache_rev_failed(&reason);
3102                                 goto out_abort;
3103                         }
3104                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3105                         tvp_seqc = vn_seqc_read_any(tvp);
3106                         if (seqc_in_modify(tvp_seqc)) {
3107                                 cache_rev_failed(&reason);
3108                                 goto out_abort;
3109                         }
3110                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3111                                 cache_rev_failed(&reason);
3112                                 goto out_abort;
3113                         }
3114                         vp = tvp;
3115                         vp_seqc = tvp_seqc;
3116                         continue;
3117                 }
3118                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3119                 if (ncp == NULL) {
3120                         cache_rev_failed(&reason);
3121                         goto out_abort;
3122                 }
                /* A ".." entry does not provide the name of vp; give up. */
3123                 nc_flag = atomic_load_char(&ncp->nc_flag);
3124                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3125                         cache_rev_failed(&reason);
3126                         goto out_abort;
3127                 }
3128                 if (!cache_ncp_canuse(ncp)) {
3129                         cache_rev_failed(&reason);
3130                         goto out_abort;
3131                 }
                /* Room is needed for the name and the '/' in front of it. */
3132                 if (ncp->nc_nlen >= *buflen) {
3133                         cache_rev_failed(&reason);
3134                         error = ENOMEM;
3135                         goto out_abort;
3136                 }
3137                 *buflen -= ncp->nc_nlen;
3138                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3139                 *buflen -= 1;
3140                 buf[*buflen] = '/';
3141                 tvp = ncp->nc_dvp;
3142                 tvp_seqc = vn_seqc_read_any(tvp);
3143                 if (seqc_in_modify(tvp_seqc)) {
3144                         cache_rev_failed(&reason);
3145                         goto out_abort;
3146                 }
3147                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3148                         cache_rev_failed(&reason);
3149                         goto out_abort;
3150                 }
3151                 vp = tvp;
3152                 vp_seqc = tvp_seqc;
3153                 if (vp == rdir || vp == rootvnode)
3154                         break;
3155         }
3156 out_ok:
3157         vfs_smr_exit();
3158         *retbuf = buf + *buflen;
        /* Convert "space still free" into "bytes used", tail included. */
3159         *buflen = orig_buflen - *buflen + addend;
3160         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3161         return (0);
3162
3163 out_abort:
        /*
         * Undo the length accounting so that the caller can retry with the
         * same buffer.  reason (the line which bailed) and i (iteration
         * count) are only consumed by the probe.
         */
3164         *buflen = orig_buflen;
3165         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3166         vfs_smr_exit();
3167         return (error);
3168 }
3169
3170 static int
3171 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3172     size_t *buflen)
3173 {
3174         size_t orig_buflen, addend;
3175         int error;
3176
3177         if (*buflen < 2)
3178                 return (EINVAL);
3179
3180         orig_buflen = *buflen;
3181
3182         vref(vp);
3183         addend = 0;
3184         if (vp->v_type != VDIR) {
3185                 *buflen -= 1;
3186                 buf[*buflen] = '\0';
3187                 error = vn_vptocnp(&vp, buf, buflen);
3188                 if (error)
3189                         return (error);
3190                 if (*buflen == 0) {
3191                         vrele(vp);
3192                         return (ENOMEM);
3193                 }
3194                 *buflen -= 1;
3195                 buf[*buflen] = '/';
3196                 addend = orig_buflen - *buflen;
3197         }
3198
3199         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3200 }
3201
3202 /*
3203  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3204  *
3205  * Since the namecache does not track handlings, the caller is expected to first
3206  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3207  *
3208  * Then we have 2 cases:
3209  * - if the found vnode is a directory, the path can be constructed just by
3210  *   fullowing names up the chain
3211  * - otherwise we populate the buffer with the saved name and start resolving
3212  *   from the parent
3213  */
3214 static int
3215 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3216     size_t *buflen)
3217 {
3218         char *buf, *tmpbuf;
3219         struct pwd *pwd;
3220         struct componentname *cnp;
3221         struct vnode *vp;
3222         size_t addend;
3223         int error;
3224         enum vtype type;
3225
3226         if (*buflen < 2)
3227                 return (EINVAL);
3228         if (*buflen > MAXPATHLEN)
3229                 *buflen = MAXPATHLEN;
3230
3231         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3232
3233         addend = 0;
3234         vp = ndp->ni_vp;
3235         /*
3236          * Check for VBAD to work around the vp_crossmp bug in lookup().
3237          *
3238          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3239          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3240          * If the type is VDIR (like in this very case) we can skip looking
3241          * at ni_dvp in the first place. However, since vnodes get passed here
3242          * unlocked the target may transition to doomed state (type == VBAD)
3243          * before we get to evaluate the condition. If this happens, we will
3244          * populate part of the buffer and descend to vn_fullpath_dir with
3245          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3246          *
3247          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3248          * an address of a bit field, even if said field is sized to char.
3249          * Work around the problem by reading the value into a full-sized enum
3250          * and then re-reading it with atomic_load which will still prevent
3251          * the compiler from re-reading down the road.
3252          */
3253         type = vp->v_type;
3254         type = atomic_load_int(&type);
3255         if (type == VBAD) {
3256                 error = ENOENT;
3257                 goto out_bad;
3258         }
3259         if (type != VDIR) {
3260                 cnp = &ndp->ni_cnd;
3261                 addend = cnp->cn_namelen + 2;
3262                 if (*buflen < addend) {
3263                         error = ENOMEM;
3264                         goto out_bad;
3265                 }
3266                 *buflen -= addend;
3267                 tmpbuf = buf + *buflen;
3268                 tmpbuf[0] = '/';
3269                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3270                 tmpbuf[addend - 1] = '\0';
3271                 vp = ndp->ni_dvp;
3272         }
3273
3274         vfs_smr_enter();
3275         pwd = pwd_get_smr();
3276         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3277             addend);
3278         VFS_SMR_ASSERT_NOT_ENTERED();
3279         if (error < 0) {
3280                 pwd = pwd_hold(curthread);
3281                 vref(vp);
3282                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3283                     addend);
3284                 pwd_drop(pwd);
3285                 if (error != 0)
3286                         goto out_bad;
3287         }
3288
3289         *freebuf = buf;
3290
3291         return (0);
3292 out_bad:
3293         free(buf, M_TEMP);
3294         return (error);
3295 }
3296
3297 struct vnode *
3298 vn_dir_dd_ino(struct vnode *vp)
3299 {
3300         struct namecache *ncp;
3301         struct vnode *ddvp;
3302         struct mtx *vlp;
3303         enum vgetstate vs;
3304
3305         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3306         vlp = VP2VNODELOCK(vp);
3307         mtx_lock(vlp);
3308         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3309                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3310                         continue;
3311                 ddvp = ncp->nc_dvp;
3312                 vs = vget_prep(ddvp);
3313                 mtx_unlock(vlp);
3314                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3315                         return (NULL);
3316                 return (ddvp);
3317         }
3318         mtx_unlock(vlp);
3319         return (NULL);
3320 }
3321
3322 int
3323 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3324 {
3325         struct namecache *ncp;
3326         struct mtx *vlp;
3327         int l;
3328
3329         vlp = VP2VNODELOCK(vp);
3330         mtx_lock(vlp);
3331         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3332                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3333                         break;
3334         if (ncp == NULL) {
3335                 mtx_unlock(vlp);
3336                 return (ENOENT);
3337         }
3338         l = min(ncp->nc_nlen, buflen - 1);
3339         memcpy(buf, ncp->nc_name, l);
3340         mtx_unlock(vlp);
3341         buf[l] = '\0';
3342         return (0);
3343 }
3344
3345 /*
3346  * This function updates path string to vnode's full global path
3347  * and checks the size of the new path string against the pathlen argument.
3348  *
3349  * Requires a locked, referenced vnode.
3350  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3351  *
3352  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3353  * because it falls back to the ".." lookup if the namecache lookup fails.
3354  */
3355 int
3356 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3357     u_int pathlen)
3358 {
3359         struct nameidata nd;
3360         struct vnode *vp1;
3361         char *rpath, *fbuf;
3362         int error;
3363
3364         ASSERT_VOP_ELOCKED(vp, __func__);
3365
3366         /* Construct global filesystem path from vp. */
3367         VOP_UNLOCK(vp);
3368         error = vn_fullpath_global(vp, &rpath, &fbuf);
3369
3370         if (error != 0) {
3371                 vrele(vp);
3372                 return (error);
3373         }
3374
3375         if (strlen(rpath) >= pathlen) {
3376                 vrele(vp);
3377                 error = ENAMETOOLONG;
3378                 goto out;
3379         }
3380
3381         /*
3382          * Re-lookup the vnode by path to detect a possible rename.
3383          * As a side effect, the vnode is relocked.
3384          * If vnode was renamed, return ENOENT.
3385          */
3386         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3387             UIO_SYSSPACE, path, td);
3388         error = namei(&nd);
3389         if (error != 0) {
3390                 vrele(vp);
3391                 goto out;
3392         }
3393         NDFREE(&nd, NDF_ONLY_PNBUF);
3394         vp1 = nd.ni_vp;
3395         vrele(vp);
3396         if (vp1 == vp)
3397                 strcpy(path, rpath);
3398         else {
3399                 vput(vp1);
3400                 error = ENOENT;
3401         }
3402
3403 out:
3404         free(fbuf, M_TEMP);
3405         return (error);
3406 }
3407
3408 #ifdef DDB
3409 static void
3410 db_print_vpath(struct vnode *vp)
3411 {
3412
3413         while (vp != NULL) {
3414                 db_printf("%p: ", vp);
3415                 if (vp == rootvnode) {
3416                         db_printf("/");
3417                         vp = NULL;
3418                 } else {
3419                         if (vp->v_vflag & VV_ROOT) {
3420                                 db_printf("<mount point>");
3421                                 vp = vp->v_mount->mnt_vnodecovered;
3422                         } else {
3423                                 struct namecache *ncp;
3424                                 char *ncn;
3425                                 int i;
3426
3427                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3428                                 if (ncp != NULL) {
3429                                         ncn = ncp->nc_name;
3430                                         for (i = 0; i < ncp->nc_nlen; i++)
3431                                                 db_printf("%c", *ncn++);
3432                                         vp = ncp->nc_dvp;
3433                                 } else {
3434                                         vp = NULL;
3435                                 }
3436                         }
3437                 }
3438                 db_printf("\n");
3439         }
3440
3441         return;
3442 }
3443
3444 DB_SHOW_COMMAND(vpath, db_show_vpath)
3445 {
3446         struct vnode *vp;
3447
3448         if (!have_addr) {
3449                 db_printf("usage: show vpath <struct vnode *>\n");
3450                 return;
3451         }
3452
3453         vp = (struct vnode *)addr;
3454         db_print_vpath(vp);
3455 }
3456
3457 #endif
3458
/*
 * Run-time switch for the lockless fast path lookup, exported read-write as
 * the vfs.cache_fast_lookup sysctl.  When false, cache_can_fplookup()
 * aborts before any fast path work is done.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Value returned by the fast path helpers when they set the status to
 * aborted or partial.  cache_fpl_handled_impl() asserts that a real error
 * never equals it.
 */
#define CACHE_FPL_FAILED        -2020
3464
3465 static void
3466 cache_fpl_cleanup_cnp(struct componentname *cnp)
3467 {
3468
3469         uma_zfree(namei_zone, cnp->cn_pnbuf);
3470 #ifdef DIAGNOSTIC
3471         cnp->cn_pnbuf = NULL;
3472         cnp->cn_nameptr = NULL;
3473 #endif
3474 }
3475
3476 static void
3477 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3478 {
3479         struct componentname *cnp;
3480
3481         cnp = &ndp->ni_cnd;
3482         while (*(cnp->cn_nameptr) == '/') {
3483                 cnp->cn_nameptr++;
3484                 ndp->ni_pathlen--;
3485         }
3486
3487         *dpp = ndp->ni_rootdir;
3488 }
3489
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled by cache_fpl_checkpoint() and written back by cache_fpl_restore().
 */
struct nameidata_saved {
        long cn_namelen;        /* copy of ni_cnd.cn_namelen */
        char *cn_nameptr;       /* copy of ni_cnd.cn_nameptr */
        size_t ni_pathlen;      /* copy of ni_pathlen */
        int cn_flags;           /* copy of ni_cnd.cn_flags */
};
3500
/*
 * State of a single fast path lookup, passed to all cache_fpl* and
 * cache_fplookup* helpers.
 */
struct cache_fpl {
        struct nameidata *ndp;          /* lookup being serviced */
        struct componentname *cnp;      /* presumably &ndp->ni_cnd; set by the caller */
        struct pwd *pwd;                /* held with pwd_hold_smr() on fallback */
        struct vnode *dvp;              /* directory currently being searched */
        struct vnode *tvp;              /* vnode found for the current component */
        seqc_t dvp_seqc;                /* sequence counter sampled for dvp */
        seqc_t tvp_seqc;                /* sequence counter sampled for tvp */
        struct nameidata_saved snd;     /* checkpoint restored on partial fallback */
        int line;                       /* source line which last set status */
        enum cache_fpl_status status:8; /* unset, aborted, partial or handled */
        bool in_smr;                    /* tracked by the cache_fpl_smr_* macros */
        bool fsearch;                   /* reported by fgetvp_lookup_smr() */
};
3515
3516 static void
3517 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3518 {
3519
3520         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3521         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3522         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3523         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3524 }
3525
3526 static void
3527 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3528 {
3529
3530         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3531         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3532         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3533         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3534 }
3535
/*
 * Assertions tying fpl->in_smr to the actual SMR state.  With INVARIANTS
 * both the flag and the VFS SMR state are checked, otherwise the macros
 * expand to nothing.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({                    \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        VFS_SMR_ASSERT_ENTERED();                               \
})
#define cache_fpl_smr_assert_not_entered(fpl) ({                \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        VFS_SMR_ASSERT_NOT_ENTERED();                           \
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3551
/*
 * Enter the VFS SMR section and mark it in fpl.  Unlike
 * cache_fpl_smr_enter() this variant does not assert the previous value of
 * in_smr (presumably because the flag is not initialized yet on first
 * entry -- confirm against the caller).
 */
#define cache_fpl_smr_enter_initial(fpl) ({                     \
        struct cache_fpl *_fpl = (fpl);                         \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Enter the VFS SMR section; asserts the section is not already marked as
 * entered in fpl.
 */
#define cache_fpl_smr_enter(fpl) ({                             \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Leave the VFS SMR section; asserts the section is marked as entered in
 * fpl.
 */
#define cache_fpl_smr_exit(fpl) ({                              \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        vfs_smr_exit();                                         \
        _fpl->in_smr = false;                                   \
})
3571
3572 static int
3573 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3574 {
3575
3576         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3577                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3578                     ("%s: converting to abort from %d at %d, set at %d\n",
3579                     __func__, fpl->status, line, fpl->line));
3580         }
3581         fpl->status = CACHE_FPL_STATUS_ABORTED;
3582         fpl->line = line;
3583         return (CACHE_FPL_FAILED);
3584 }
3585
3586 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3587
3588 static int
3589 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3590 {
3591
3592         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3593             ("%s: setting to partial at %d, but already set to %d at %d\n",
3594             __func__, line, fpl->status, fpl->line));
3595         cache_fpl_smr_assert_entered(fpl);
3596         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3597         fpl->line = line;
3598         return (CACHE_FPL_FAILED);
3599 }
3600
3601 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3602
3603 static int
3604 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3605 {
3606
3607         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3608             ("%s: setting to handled at %d, but already set to %d at %d\n",
3609             __func__, line, fpl->status, fpl->line));
3610         cache_fpl_smr_assert_not_entered(fpl);
3611         MPASS(error != CACHE_FPL_FAILED);
3612         fpl->status = CACHE_FPL_STATUS_HANDLED;
3613         fpl->line = line;
3614         return (error);
3615 }
3616
3617 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3618
/*
 * Flags a caller may pass in cn_flags and still be serviced by the fast
 * path; cache_can_fplookup() aborts when any other bit is set.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
        (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
         SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)

/*
 * Flags which are not accepted from the caller; cache_fplookup_partial_setup()
 * sets them itself when handing the lookup to the slow path.
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
        (ISDOTDOT | MAKEENTRY | ISLASTCN)

/* The two sets must stay disjoint. */
_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
3628
3629 static bool
3630 cache_fpl_islastcn(struct nameidata *ndp)
3631 {
3632
3633         return (*ndp->ni_next == 0);
3634 }
3635
3636 static bool
3637 cache_fpl_isdotdot(struct componentname *cnp)
3638 {
3639
3640         if (cnp->cn_namelen == 2 &&
3641             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3642                 return (true);
3643         return (false);
3644 }
3645
3646 static bool
3647 cache_can_fplookup(struct cache_fpl *fpl)
3648 {
3649         struct nameidata *ndp;
3650         struct componentname *cnp;
3651         struct thread *td;
3652
3653         ndp = fpl->ndp;
3654         cnp = fpl->cnp;
3655         td = cnp->cn_thread;
3656
3657         if (!cache_fast_lookup) {
3658                 cache_fpl_aborted(fpl);
3659                 return (false);
3660         }
3661 #ifdef MAC
3662         if (mac_vnode_check_lookup_enabled()) {
3663                 cache_fpl_aborted(fpl);
3664                 return (false);
3665         }
3666 #endif
3667         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3668                 cache_fpl_aborted(fpl);
3669                 return (false);
3670         }
3671         if (IN_CAPABILITY_MODE(td)) {
3672                 cache_fpl_aborted(fpl);
3673                 return (false);
3674         }
3675         if (AUDITING_TD(td)) {
3676                 cache_fpl_aborted(fpl);
3677                 return (false);
3678         }
3679         if (ndp->ni_startdir != NULL) {
3680                 cache_fpl_aborted(fpl);
3681                 return (false);
3682         }
3683         return (true);
3684 }
3685
3686 static int
3687 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3688 {
3689         struct nameidata *ndp;
3690         int error;
3691         bool fsearch;
3692
3693         ndp = fpl->ndp;
3694         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3695         if (__predict_false(error != 0)) {
3696                 cache_fpl_smr_exit(fpl);
3697                 return (cache_fpl_aborted(fpl));
3698         }
3699         fpl->fsearch = fsearch;
3700         return (0);
3701 }
3702
3703 static bool
3704 cache_fplookup_vnode_supported(struct vnode *vp)
3705 {
3706
3707         return (vp->v_type != VLNK);
3708 }
3709
3710 static int __noinline
3711 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3712     uint32_t hash)
3713 {
3714         struct componentname *cnp;
3715         struct vnode *dvp;
3716
3717         cnp = fpl->cnp;
3718         dvp = fpl->dvp;
3719
3720         cache_fpl_smr_exit(fpl);
3721         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3722                 return (cache_fpl_handled(fpl, ENOENT));
3723         else
3724                 return (cache_fpl_aborted(fpl));
3725 }
3726
/*
 * The target vnode is not supported, prepare for the slow path to take over.
 *
 * Expects to be called within the SMR section.  Holds the pwd, references
 * the current directory vnode, leaves the SMR section and verifies the
 * directory did not change in the meantime.  On success nameidata is rolled
 * back to the checkpoint stored in fpl->snd, the directory is handed over
 * in ni_startdir and 0 is returned.  On any failure everything acquired so
 * far is released and the lookup is aborted.
 */
static int __noinline
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        enum vgetstate dvs;
        struct vnode *dvp;
        struct pwd *pwd;
        seqc_t dvp_seqc;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        pwd = fpl->pwd;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;

        if (!pwd_hold_smr(pwd)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_aborted(fpl));
        }

        /* Must be started before SMR is exited. */
        dvs = vget_prep_smr(dvp);
        cache_fpl_smr_exit(fpl);
        if (__predict_false(dvs == VGET_NONE)) {
                pwd_drop(pwd);
                return (cache_fpl_aborted(fpl));
        }

        vget_finish_ref(dvp, dvs);
        /* The reference is only useful if dvp is still what we looked at. */
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vrele(dvp);
                pwd_drop(pwd);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_restore(fpl, &fpl->snd);

        /* Recompute the internal flags for the restored component. */
        ndp->ni_startdir = dvp;
        cnp->cn_flags |= MAKEENTRY;
        if (cache_fpl_islastcn(ndp))
                cnp->cn_flags |= ISLASTCN;
        if (cache_fpl_isdotdot(cnp))
                cnp->cn_flags |= ISDOTDOT;

        return (0);
}
3776
3777 static int
3778 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3779 {
3780         struct componentname *cnp;
3781         struct vnode *tvp;
3782         seqc_t tvp_seqc;
3783         int error, lkflags;
3784
3785         cnp = fpl->cnp;
3786         tvp = fpl->tvp;
3787         tvp_seqc = fpl->tvp_seqc;
3788
3789         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3790                 lkflags = LK_SHARED;
3791                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3792                         lkflags = LK_EXCLUSIVE;
3793                 error = vget_finish(tvp, lkflags, tvs);
3794                 if (__predict_false(error != 0)) {
3795                         return (cache_fpl_aborted(fpl));
3796                 }
3797         } else {
3798                 vget_finish_ref(tvp, tvs);
3799         }
3800
3801         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3802                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3803                         vput(tvp);
3804                 else
3805                         vrele(tvp);
3806                 return (cache_fpl_aborted(fpl));
3807         }
3808
3809         return (cache_fpl_handled(fpl, 0));
3810 }
3811
/*
 * They want to possibly modify the state of the namecache.
 *
 * Don't try to match the API contract, just leave.
 * TODO: this leaves scalability on the table
 *
 * Only valid for operations other than LOOKUP; always marks the lookup as
 * partial so that the slow path finishes it.
 */
static int
cache_fplookup_final_modifying(struct cache_fpl *fpl)
{
        struct componentname *cnp;

        cnp = fpl->cnp;
        MPASS(cnp->cn_nameiop != LOOKUP);
        return (cache_fpl_partial(fpl));
}
3827
/*
 * Finish a lookup whose caller asked for the parent directory as well
 * (LOCKPARENT or WANTPARENT).
 *
 * Both the parent and the target are prepared while still in the SMR
 * section.  The parent is then locked exclusively (LOCKPARENT) or merely
 * referenced, validated against its sequence counter, and the target is
 * finished by cache_fplookup_final_child().  Any failure releases whatever
 * was acquired and aborts the lookup.  Returns 0 with the status set to
 * handled on success.
 */
static int __noinline
cache_fplookup_final_withparent(struct cache_fpl *fpl)
{
        struct componentname *cnp;
        enum vgetstate dvs, tvs;
        struct vnode *dvp, *tvp;
        seqc_t dvp_seqc;
        int error;

        cnp = fpl->cnp;
        dvp = fpl->dvp;
        dvp_seqc = fpl->dvp_seqc;
        tvp = fpl->tvp;

        MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);

        /*
         * This is less efficient than it can be for simplicity.
         */
        dvs = vget_prep_smr(dvp);
        if (__predict_false(dvs == VGET_NONE)) {
                /* Nothing acquired yet; the SMR section is still entered. */
                return (cache_fpl_aborted(fpl));
        }
        tvs = vget_prep_smr(tvp);
        if (__predict_false(tvs == VGET_NONE)) {
                cache_fpl_smr_exit(fpl);
                vget_abort(dvp, dvs);
                return (cache_fpl_aborted(fpl));
        }

        cache_fpl_smr_exit(fpl);

        if ((cnp->cn_flags & LOCKPARENT) != 0) {
                error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
                if (__predict_false(error != 0)) {
                        vget_abort(tvp, tvs);
                        return (cache_fpl_aborted(fpl));
                }
        } else {
                vget_finish_ref(dvp, dvs);
        }

        /* The parent is held now; make sure it is still the one we saw. */
        if (!vn_seqc_consistent(dvp, dvp_seqc)) {
                vget_abort(tvp, tvs);
                if ((cnp->cn_flags & LOCKPARENT) != 0)
                        vput(dvp);
                else
                        vrele(dvp);
                return (cache_fpl_aborted(fpl));
        }

        /* On failure the child helper has already set the aborted status. */
        error = cache_fplookup_final_child(fpl, tvs);
        if (__predict_false(error != 0)) {
                MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
                if ((cnp->cn_flags & LOCKPARENT) != 0)
                        vput(dvp);
                else
                        vrele(dvp);
                return (error);
        }

        MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
        return (0);
}
3892
3893 static int
3894 cache_fplookup_final(struct cache_fpl *fpl)
3895 {
3896         struct componentname *cnp;
3897         enum vgetstate tvs;
3898         struct vnode *dvp, *tvp;
3899         seqc_t dvp_seqc;
3900
3901         cnp = fpl->cnp;
3902         dvp = fpl->dvp;
3903         dvp_seqc = fpl->dvp_seqc;
3904         tvp = fpl->tvp;
3905
3906         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3907
3908         if (cnp->cn_nameiop != LOOKUP) {
3909                 return (cache_fplookup_final_modifying(fpl));
3910         }
3911
3912         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3913                 return (cache_fplookup_final_withparent(fpl));
3914
3915         tvs = vget_prep_smr(tvp);
3916         if (__predict_false(tvs == VGET_NONE)) {
3917                 return (cache_fpl_partial(fpl));
3918         }
3919
3920         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3921                 cache_fpl_smr_exit(fpl);
3922                 vget_abort(tvp, tvs);
3923                 return (cache_fpl_aborted(fpl));
3924         }
3925
3926         cache_fpl_smr_exit(fpl);
3927         return (cache_fplookup_final_child(fpl, tvs));
3928 }
3929
3930 static int __noinline
3931 cache_fplookup_dot(struct cache_fpl *fpl)
3932 {
3933         struct vnode *dvp;
3934
3935         dvp = fpl->dvp;
3936
3937         fpl->tvp = dvp;
3938         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3939         if (seqc_in_modify(fpl->tvp_seqc)) {
3940                 return (cache_fpl_aborted(fpl));
3941         }
3942
3943         counter_u64_add(dothits, 1);
3944         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3945
3946         return (0);
3947 }
3948
/*
 * Resolve the ".." component.
 *
 * If the directory is a boundary the lookup may not cross (the root or top
 * directory of the lookup, the global root vnode or the root of a prison in
 * the credential's prison chain), ".." resolves to the directory itself.
 * Mount point roots are not supported and abort.  Otherwise the parent is
 * taken from the v_cache_dd entry of the directory.  Returns 0 with
 * tvp/tvp_seqc filled in, or marks the lookup aborted or partial.
 */
static int __noinline
cache_fplookup_dotdot(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp;
        struct prison *pr;
        u_char nc_flag;

        ndp = fpl->ndp;
        cnp = fpl->cnp;
        dvp = fpl->dvp;

        /*
         * XXX this is racy the same way regular lookup is
         */
        /* pr ends up non-NULL iff dvp is the root of one of the prisons. */
        for (pr = cnp->cn_cred->cr_prison; pr != NULL;
            pr = pr->pr_parent)
                if (dvp == pr->pr_root)
                        break;

        if (dvp == ndp->ni_rootdir ||
            dvp == ndp->ni_topdir ||
            dvp == rootvnode ||
            pr != NULL) {
                fpl->tvp = dvp;
                fpl->tvp_seqc = vn_seqc_read_any(dvp);
                if (seqc_in_modify(fpl->tvp_seqc)) {
                        return (cache_fpl_aborted(fpl));
                }
                return (0);
        }

        if ((dvp->v_vflag & VV_ROOT) != 0) {
                /*
                 * TODO
                 * The opposite of climb mount is needed here.
                 */
                return (cache_fpl_aborted(fpl));
        }

        ncp = atomic_load_ptr(&dvp->v_cache_dd);
        if (ncp == NULL) {
                return (cache_fpl_aborted(fpl));
        }

        /*
         * The entry is either a ".." entry, in which case the parent is its
         * target, or a regular entry naming dvp, in which case the parent
         * is its directory.
         */
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_ISDOTDOT) != 0) {
                if ((nc_flag & NCF_NEGATIVE) != 0)
                        return (cache_fpl_aborted(fpl));
                fpl->tvp = ncp->nc_vp;
        } else {
                fpl->tvp = ncp->nc_dvp;
        }

        /* Checked only after the fields of interest have been read. */
        if (__predict_false(!cache_ncp_canuse(ncp))) {
                return (cache_fpl_aborted(fpl));
        }

        fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(dotdothits, 1);
        return (0);
}
4017
/*
 * Handle a hit on the negative entry ncp found under the given hash.
 *
 * Operations other than LOOKUP, entries which can no longer be used and
 * whiteout entries are left to the slow path (partial).  An entry due for
 * promotion is passed to cache_fplookup_negative_promote().  Otherwise the
 * hit is finalized, the SMR section is exited and the lookup is handled
 * with ENOENT.
 */
static int __noinline
cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
{
        u_char nc_flag;
        bool neg_promote;

        nc_flag = atomic_load_char(&ncp->nc_flag);
        MPASS((nc_flag & NCF_NEGATIVE) != 0);
        /*
         * If they want to create an entry we need to replace this one.
         */
        if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
                /*
                 * TODO
                 * This should call something similar to
                 * cache_fplookup_final_modifying.
                 */
                return (cache_fpl_partial(fpl));
        }
        /* From here on every early exit has to undo the hit preparation. */
        neg_promote = cache_neg_hit_prep(ncp);
        if (__predict_false(!cache_ncp_canuse(ncp))) {
                cache_neg_hit_abort(ncp);
                return (cache_fpl_partial(fpl));
        }
        if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
                cache_neg_hit_abort(ncp);
                return (cache_fpl_partial(fpl));
        }
        if (neg_promote) {
                return (cache_fplookup_negative_promote(fpl, ncp, hash));
        }
        cache_neg_hit_finish(ncp);
        cache_fpl_smr_exit(fpl);
        return (cache_fpl_handled(fpl, ENOENT));
}
4053
/*
 * Look up the current path component in the current directory.
 *
 * "." is delegated to cache_fplookup_dot().  Everything else is searched
 * for in the name cache hash chain.  Negative entries are delegated to
 * cache_fplookup_neg().  On a usable positive hit tvp and tvp_seqc are
 * filled in and 0 is returned.  A missing or unusable entry, a target
 * under modification or an unsupported target vnode makes the lookup
 * partial so that the slow path takes over.
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
        struct componentname *cnp;
        struct namecache *ncp;
        struct vnode *dvp, *tvp;
        u_char nc_flag;
        uint32_t hash;

        cnp = fpl->cnp;
        dvp = fpl->dvp;

        if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
                return (cache_fplookup_dot(fpl));
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

        /* An entry matches on directory, name length and name bytes. */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        /*
         * If there is no entry we have to punt to the slow path to perform
         * actual lookup. Should there be nothing with this name a negative
         * entry will be created.
         */
        if (__predict_false(ncp == NULL)) {
                return (cache_fpl_partial(fpl));
        }

        /* Read the fields first, validate the entry afterwards. */
        tvp = atomic_load_ptr(&ncp->nc_vp);
        nc_flag = atomic_load_char(&ncp->nc_flag);
        if ((nc_flag & NCF_NEGATIVE) != 0) {
                return (cache_fplookup_neg(fpl, ncp, hash));
        }

        if (__predict_false(!cache_ncp_canuse(ncp))) {
                return (cache_fpl_partial(fpl));
        }

        fpl->tvp = tvp;
        fpl->tvp_seqc = vn_seqc_read_any(tvp);
        if (seqc_in_modify(fpl->tvp_seqc)) {
                return (cache_fpl_partial(fpl));
        }

        if (!cache_fplookup_vnode_supported(tvp)) {
                return (cache_fpl_partial(fpl));
        }

        counter_u64_add(numposhits, 1);
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
        return (0);
}
4111
4112 static bool
4113 cache_fplookup_mp_supported(struct mount *mp)
4114 {
4115
4116         if (mp == NULL)
4117                 return (false);
4118         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4119                 return (false);
4120         return (true);
4121 }
4122
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 *
 * Returns 0 with tvp/tvp_seqc replaced by the root vnode of the topmost
 * mount (or left untouched if nothing is mounted on tvp).  Any failure along
 * the way makes the lookup partial.
 */
static int __noinline
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
        struct mount *mp, *prev_mp;
        struct vnode *vp;
        seqc_t vp_seqc;

        vp = fpl->tvp;
        vp_seqc = fpl->tvp_seqc;

        VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
        mp = atomic_load_ptr(&vp->v_mountedhere);
        if (mp == NULL)
                return (0);

        prev_mp = NULL;
        for (;;) {
                /*
                 * Enter the next mount before leaving the previous one so
                 * that there is no window without protection.
                 */
                if (!vfs_op_thread_enter_crit(mp)) {
                        if (prev_mp != NULL)
                                vfs_op_thread_exit_crit(prev_mp);
                        return (cache_fpl_partial(fpl));
                }
                if (prev_mp != NULL)
                        vfs_op_thread_exit_crit(prev_mp);
                /* Verify vp is still what mp was read from. */
                if (!vn_seqc_consistent(vp, vp_seqc)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                if (!cache_fplookup_mp_supported(mp)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                vp = atomic_load_ptr(&mp->mnt_rootvnode);
                if (vp == NULL || VN_IS_DOOMED(vp)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                vp_seqc = vn_seqc_read_any(vp);
                if (seqc_in_modify(vp_seqc)) {
                        vfs_op_thread_exit_crit(mp);
                        return (cache_fpl_partial(fpl));
                }
                /* Something may be mounted on this root as well. */
                prev_mp = mp;
                mp = atomic_load_ptr(&vp->v_mountedhere);
                if (mp == NULL)
                        break;
        }

        vfs_op_thread_exit_crit(prev_mp);
        fpl->tvp = vp;
        fpl->tvp_seqc = vp_seqc;
        return (0);
}
4194
4195 static bool
4196 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4197 {
4198         struct mount *mp;
4199         struct vnode *vp;
4200
4201         vp = fpl->tvp;
4202
4203         /*
4204          * Hack: while this is a union, the pointer tends to be NULL so save on
4205          * a branch.
4206          */
4207         mp = atomic_load_ptr(&vp->v_mountedhere);
4208         if (mp == NULL)
4209                 return (false);
4210         if (vp->v_type == VDIR)
4211                 return (true);
4212         return (false);
4213 }
4214
/*
 * Parse the path.
 *
 * The code was originally copy-pasted from regular lookup and despite
 * clean ups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
 * nameidata state has to be compatible with the original.
 *
 * Determines the length of the component starting at cn_nameptr, updates
 * ni_pathlen and points ni_next past the component.  Returns 0 when the
 * component is ready to be looked up.  A component longer than NAME_MAX is
 * handled with ENAMETOOLONG; trailing slashes and an empty component make
 * the lookup partial.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
        struct nameidata *ndp;
        struct componentname *cnp;
        char *cp;

        ndp = fpl->ndp;
        cnp = fpl->cnp;

        /*
         * Search a new directory.
         *
         * The last component of the filename is left accessible via
         * cnp->cn_nameptr for callers that need the name. Callers needing
         * the name set the SAVENAME flag. When done, they assume
         * responsibility for freeing the pathname buffer.
         */
        for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
                continue;
        cnp->cn_namelen = cp - cnp->cn_nameptr;
        if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
                cache_fpl_smr_exit(fpl);
                return (cache_fpl_handled(fpl, ENAMETOOLONG));
        }
        ndp->ni_pathlen -= cnp->cn_namelen;
        /* ni_pathlen is unsigned, an underflow shows up as a huge value. */
        KASSERT(ndp->ni_pathlen <= PATH_MAX,
            ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
        ndp->ni_next = cp;

        /*
         * Replace multiple slashes by a single slash and trailing slashes
         * by a null.  This must be done before VOP_LOOKUP() because some
         * fs's don't know about trailing slashes.  Remember if there were
         * trailing slashes to handle symlinks, existing non-directories
         * and non-existing files that won't be directories specially later.
         */
        while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
                cp++;
                ndp->ni_pathlen--;
                if (*cp == '\0') {
                        /*
                         * TODO
                         * Regular lookup performs the following:
                         * *ndp->ni_next = '\0';
                         * cnp->cn_flags |= TRAILINGSLASH;
                         *
                         * Which is problematic since it modifies data read
                         * from userspace. Then if fast path lookup was to
                         * abort we would have to either restore it or convey
                         * the flag. Since this is a corner case just ignore
                         * it for simplicity.
                         */
                        return (cache_fpl_partial(fpl));
                }
        }
        ndp->ni_next = cp;

        /*
         * Check for degenerate name (e.g. / or "")
         * which is a way of talking about a directory,
         * e.g. like "/." or ".".
         *
         * TODO
         * Another corner case handled by the regular lookup
         */
        if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
                return (cache_fpl_partial(fpl));
        }
        return (0);
}
4294
4295 static void
4296 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4297 {
4298         struct nameidata *ndp;
4299         struct componentname *cnp;
4300
4301         ndp = fpl->ndp;
4302         cnp = fpl->cnp;
4303
4304         cnp->cn_nameptr = ndp->ni_next;
4305         while (*cnp->cn_nameptr == '/') {
4306                 cnp->cn_nameptr++;
4307                 ndp->ni_pathlen--;
4308         }
4309 }
4310
4311 /*
4312  * See the API contract for VOP_FPLOOKUP_VEXEC.
4313  */
4314 static int __noinline
4315 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4316 {
4317         struct componentname *cnp;
4318         struct vnode *dvp;
4319         seqc_t dvp_seqc;
4320
4321         cnp = fpl->cnp;
4322         dvp = fpl->dvp;
4323         dvp_seqc = fpl->dvp_seqc;
4324
4325         /*
4326          * Hack: they may be looking up foo/bar, where foo is a
4327          * regular file. In such a case we need to turn ENOTDIR,
4328          * but we may happen to get here with a different error.
4329          */
4330         if (dvp->v_type != VDIR) {
4331                 /*
4332                  * The check here is predominantly to catch
4333                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4334                  * gets doomed past this point it is going to
4335                  * fail seqc verification.
4336                  */
4337                 if (VN_IS_DOOMED(dvp)) {
4338                         return (cache_fpl_aborted(fpl));
4339                 }
4340                 error = ENOTDIR;
4341         }
4342
4343         /*
4344          * Hack: handle O_SEARCH.
4345          *
4346          * Open Group Base Specifications Issue 7, 2018 edition states:
4347          * If the access mode of the open file description associated with the
4348          * file descriptor is not O_SEARCH, the function shall check whether
4349          * directory searches are permitted using the current permissions of
4350          * the directory underlying the file descriptor. If the access mode is
4351          * O_SEARCH, the function shall not perform the check.
4352          *
4353          * Regular lookup tests for the NOEXECCHECK flag for every path
4354          * component to decide whether to do the permission check. However,
4355          * since most lookups never have the flag (and when they do it is only
4356          * present for the first path component), lockless lookup only acts on
4357          * it if there is a permission problem. Here the flag is represented
4358          * with a boolean so that we don't have to clear it on the way out.
4359          *
4360          * For simplicity this always aborts.
4361          * TODO: check if this is the first lookup and ignore the permission
4362          * problem. Note the flag has to survive fallback (if it happens to be
4363          * performed).
4364          */
4365         if (fpl->fsearch) {
4366                 return (cache_fpl_aborted(fpl));
4367         }
4368
4369         switch (error) {
4370         case EAGAIN:
4371                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4372                         error = cache_fpl_aborted(fpl);
4373                 } else {
4374                         cache_fpl_partial(fpl);
4375                 }
4376                 break;
4377         default:
4378                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4379                         error = cache_fpl_aborted(fpl);
4380                 } else {
4381                         cache_fpl_smr_exit(fpl);
4382                         cache_fpl_handled(fpl, error);
4383                 }
4384                 break;
4385         }
4386         return (error);
4387 }
4388
4389 static int
4390 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4391 {
4392         struct nameidata *ndp;
4393         struct componentname *cnp;
4394         struct mount *mp;
4395         int error;
4396
4397         error = CACHE_FPL_FAILED;
4398         ndp = fpl->ndp;
4399         cnp = fpl->cnp;
4400
4401         cache_fpl_checkpoint(fpl, &fpl->snd);
4402
4403         fpl->dvp = dvp;
4404         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4405         if (seqc_in_modify(fpl->dvp_seqc)) {
4406                 cache_fpl_aborted(fpl);
4407                 goto out;
4408         }
4409         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4410         if (!cache_fplookup_mp_supported(mp)) {
4411                 cache_fpl_aborted(fpl);
4412                 goto out;
4413         }
4414
4415         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4416
4417         for (;;) {
4418                 error = cache_fplookup_parse(fpl);
4419                 if (__predict_false(error != 0)) {
4420                         break;
4421                 }
4422
4423                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4424
4425                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4426                 if (__predict_false(error != 0)) {
4427                         error = cache_fplookup_failed_vexec(fpl, error);
4428                         break;
4429                 }
4430
4431                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4432                         error = cache_fplookup_dotdot(fpl);
4433                         if (__predict_false(error != 0)) {
4434                                 break;
4435                         }
4436                 } else {
4437                         error = cache_fplookup_next(fpl);
4438                         if (__predict_false(error != 0)) {
4439                                 break;
4440                         }
4441
4442                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4443
4444                         if (cache_fplookup_need_climb_mount(fpl)) {
4445                                 error = cache_fplookup_climb_mount(fpl);
4446                                 if (__predict_false(error != 0)) {
4447                                         break;
4448                                 }
4449                         }
4450                 }
4451
4452                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4453
4454                 if (cache_fpl_islastcn(ndp)) {
4455                         error = cache_fplookup_final(fpl);
4456                         break;
4457                 }
4458
4459                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4460                         error = cache_fpl_aborted(fpl);
4461                         break;
4462                 }
4463
4464                 fpl->dvp = fpl->tvp;
4465                 fpl->dvp_seqc = fpl->tvp_seqc;
4466
4467                 cache_fplookup_parse_advance(fpl);
4468                 cache_fpl_checkpoint(fpl, &fpl->snd);
4469         }
4470 out:
4471         switch (fpl->status) {
4472         case CACHE_FPL_STATUS_UNSET:
4473                 __assert_unreachable();
4474                 break;
4475         case CACHE_FPL_STATUS_PARTIAL:
4476                 cache_fpl_smr_assert_entered(fpl);
4477                 return (cache_fplookup_partial_setup(fpl));
4478         case CACHE_FPL_STATUS_ABORTED:
4479                 if (fpl->in_smr)
4480                         cache_fpl_smr_exit(fpl);
4481                 return (CACHE_FPL_FAILED);
4482         case CACHE_FPL_STATUS_HANDLED:
4483                 MPASS(error != CACHE_FPL_FAILED);
4484                 cache_fpl_smr_assert_not_entered(fpl);
4485                 if (__predict_false(error != 0)) {
4486                         ndp->ni_dvp = NULL;
4487                         ndp->ni_vp = NULL;
4488                         cache_fpl_cleanup_cnp(cnp);
4489                         return (error);
4490                 }
4491                 ndp->ni_dvp = fpl->dvp;
4492                 ndp->ni_vp = fpl->tvp;
4493                 if (cnp->cn_flags & SAVENAME)
4494                         cnp->cn_flags |= HASBUF;
4495                 else
4496                         cache_fpl_cleanup_cnp(cnp);
4497                 return (error);
4498         }
4499 }
4500
4501 /*
4502  * Fast path lookup protected with SMR and sequence counters.
4503  *
4504  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4505  *
4506  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4507  * outlined below.
4508  *
4509  * Traditional vnode lookup conceptually looks like this:
4510  *
4511  * vn_lock(current);
4512  * for (;;) {
4513  *      next = find();
4514  *      vn_lock(next);
4515  *      vn_unlock(current);
4516  *      current = next;
4517  *      if (last)
4518  *          break;
4519  * }
4520  * return (current);
4521  *
4522  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4523  * any modifications thanks to holding respective locks.
4524  *
4525  * The same guarantee can be provided with a combination of safe memory
4526  * reclamation and sequence counters instead. If all operations which affect
4527  * the relationship between the current vnode and the one we are looking for
4528  * also modify the counter, we can verify whether all the conditions held as
4529  * we made the jump. This includes things like permissions, mount points etc.
4530  * Counter modification is provided by enclosing relevant places in
4531  * vn_seqc_write_begin()/end() calls.
4532  *
4533  * Thus this translates to:
4534  *
4535  * vfs_smr_enter();
4536  * dvp_seqc = seqc_read_any(dvp);
4537  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4538  *     abort();
4539  * for (;;) {
4540  *      tvp = find();
4541  *      tvp_seqc = seqc_read_any(tvp);
4542  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4543  *          abort();
4544  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4545  *          abort();
4546  *      dvp = tvp; // we know nothing of importance has changed
4547  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4548  *      if (last)
4549  *          break;
4550  * }
4551  * vget(); // secure the vnode
4552  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4553  *          abort();
4554  * // at this point we know nothing has changed for any parent<->child pair
4555  * // as they were crossed during the lookup, meaning we matched the guarantee
4556  * // of the locked variant
4557  * return (tvp);
4558  *
4559  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4560  * - they are called while within vfs_smr protection which they must never exit
4561  * - EAGAIN can be returned to denote checking could not be performed, it is
4562  *   always valid to return it
4563  * - if the sequence counter has not changed the result must be valid
4564  * - if the sequence counter has changed both false positives and false negatives
4565  *   are permitted (since the result will be rejected later)
4566  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4567  *
4568  * Caveats to watch out for:
4569  * - vnodes are passed unlocked and unreferenced with nothing stopping
4570  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4571  *   to use atomic_load_ptr to fetch it.
4572  * - the aforementioned object can also get freed, meaning absent other means it
4573  *   should be protected with vfs_smr
4574  * - either safely checking permissions as they are modified or guaranteeing
4575  *   their stability is left to the routine
4576  */
4577 int
4578 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4579     struct pwd **pwdp)
4580 {
4581         struct cache_fpl fpl;
4582         struct pwd *pwd;
4583         struct vnode *dvp;
4584         struct componentname *cnp;
4585         struct nameidata_saved orig;
4586         int error;
4587
4588         MPASS(ndp->ni_lcf == 0);
4589
4590         fpl.status = CACHE_FPL_STATUS_UNSET;
4591         fpl.ndp = ndp;
4592         fpl.cnp = &ndp->ni_cnd;
4593         MPASS(curthread == fpl.cnp->cn_thread);
4594
4595         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4596                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4597
4598         if (!cache_can_fplookup(&fpl)) {
4599                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4600                 *status = fpl.status;
4601                 return (EOPNOTSUPP);
4602         }
4603
4604         cache_fpl_checkpoint(&fpl, &orig);
4605
4606         cache_fpl_smr_enter_initial(&fpl);
4607         fpl.fsearch = false;
4608         pwd = pwd_get_smr();
4609         fpl.pwd = pwd;
4610         ndp->ni_rootdir = pwd->pwd_rdir;
4611         ndp->ni_topdir = pwd->pwd_jdir;
4612
4613         cnp = fpl.cnp;
4614         cnp->cn_nameptr = cnp->cn_pnbuf;
4615         if (cnp->cn_pnbuf[0] == '/') {
4616                 cache_fpl_handle_root(ndp, &dvp);
4617         } else {
4618                 if (ndp->ni_dirfd == AT_FDCWD) {
4619                         dvp = pwd->pwd_cdir;
4620                 } else {
4621                         error = cache_fplookup_dirfd(&fpl, &dvp);
4622                         if (__predict_false(error != 0)) {
4623                                 goto out;
4624                         }
4625                 }
4626         }
4627
4628         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4629
4630         error = cache_fplookup_impl(dvp, &fpl);
4631 out:
4632         cache_fpl_smr_assert_not_entered(&fpl);
4633         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4634
4635         *status = fpl.status;
4636         switch (fpl.status) {
4637         case CACHE_FPL_STATUS_UNSET:
4638                 __assert_unreachable();
4639                 break;
4640         case CACHE_FPL_STATUS_HANDLED:
4641                 SDT_PROBE3(vfs, namei, lookup, return, error,
4642                     (error == 0 ? ndp->ni_vp : NULL), true);
4643                 break;
4644         case CACHE_FPL_STATUS_PARTIAL:
4645                 *pwdp = fpl.pwd;
4646                 /*
4647                  * Status restored by cache_fplookup_partial_setup.
4648                  */
4649                 break;
4650         case CACHE_FPL_STATUS_ABORTED:
4651                 cache_fpl_restore(&fpl, &orig);
4652                 break;
4653         }
4654         return (error);
4655 }