sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  83     "Name cache");
  84
  85 SDT_PROVIDER_DECLARE(vfs);
  86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  87     "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  89     "struct vnode *");
  90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  91     "char *");
  92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  93     "const char *");
  94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  95     "struct namecache *", "int", "int");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  98     "char *", "struct vnode *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 101     "struct vnode *", "char *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 105     "struct vnode *", "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 107     "char *");
 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 109     "struct componentname *");
 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 111     "struct componentname *");
 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 113 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 114 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 115 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 116     "struct vnode *");
 117 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 118     "char *");
 119 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 120     "char *");
 121
 122 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 123 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 124 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 125
 126 /*
 127  * This structure describes the elements in the cache of recent
 128  * names looked up by namei.
 129  */
 130 struct negstate {
 131         u_char neg_flag;
 132         u_char neg_hit;
 133 };
 134 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 135     "the state must fit in a union with a pointer without growing it");
 136
 137 struct  namecache {
 138         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 139         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 140         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 141         struct  vnode *nc_dvp;          /* vnode of parent of name */
 142         union {
 143                 struct  vnode *nu_vp;   /* vnode the name refers to */
 144                 struct  negstate nu_neg;/* negative entry state */
 145         } n_un;
 146         u_char  nc_flag;                /* flag bits */
 147         u_char  nc_nlen;                /* length of name */
 148         char    nc_name[0];             /* segment name + nul */
 149 };
 150
 151 /*
 152  * struct namecache_ts repeats struct namecache layout up to the
 153  * nc_nlen member.
 154  * struct namecache_ts is used in place of struct namecache when time(s) need
 155  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 156  * both a non-dotdot directory name plus dotdot for the directory's
 157  * parent.
 158  *
 159  * See below for alignment requirement.
 160  */
 161 struct  namecache_ts {
 162         struct  timespec nc_time;       /* timespec provided by fs */
 163         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 164         int     nc_ticks;               /* ticks value when entry was added */
 165         int     nc_pad;
 166         struct namecache nc_nc;
 167 };
 168
 169 /*
 170  * At least mips n32 performs 64-bit accesses to timespec as found
 171  * in namecache_ts and requires them to be aligned. Since others
 172  * may be in the same spot suffer a little bit and enforce the
 173  * alignment for everyone. Note this is a nop for 64-bit platforms.
 174  */
 175 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 176
 177 #ifdef __LP64__
 178 #define CACHE_PATH_CUTOFF       45
 179 #define CACHE_LARGE_PAD         6
 180 #else
 181 #define CACHE_PATH_CUTOFF       41
 182 #define CACHE_LARGE_PAD         2
 183 #endif
 184
 185 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 186 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 187 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 188 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 189
 190 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 191 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 192 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 193 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 194
 195 #define nc_vp           n_un.nu_vp
 196 #define nc_neg          n_un.nu_neg
 197
 198 /*
 199  * Flags in namecache.nc_flag
 200  */
 201 #define NCF_WHITE       0x01
 202 #define NCF_ISDOTDOT    0x02
 203 #define NCF_TS          0x04
 204 #define NCF_DTS         0x08
 205 #define NCF_DVDROP      0x10
 206 #define NCF_NEGATIVE    0x20
 207 #define NCF_INVALID     0x40
 208 #define NCF_WIP         0x80
 209
 210 /*
 211  * Flags in negstate.neg_flag
 212  */
 213 #define NEG_HOT         0x01
 214
 215 /*
 216  * Mark an entry as invalid.
 217  *
 218  * This is called before it starts getting deconstructed.
 219  */
 220 static void
 221 cache_ncp_invalidate(struct namecache *ncp)
 222 {
 223
 224         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 225             ("%s: entry %p already invalid", __func__, ncp));
 226         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 227         atomic_thread_fence_rel();
 228 }
 229
 230 /*
 231  * Check whether the entry can be safely used.
 232  *
 233  * All places which elide locks are supposed to call this after they are
 234  * done with reading from an entry.
 235  */
 236 static bool
 237 cache_ncp_canuse(struct namecache *ncp)
 238 {
 239
 240         atomic_thread_fence_acq();
 241         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 242 }
 243
 244 /*
 245  * Name caching works as follows:
 246  *
 247  * Names found by directory scans are retained in a cache
 248  * for future reference.  It is managed LRU, so frequently
 249  * used names will hang around.  Cache is indexed by hash value
 250  * obtained from (dvp, name) where dvp refers to the directory
 251  * containing name.
 252  *
 253  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 254  * exist) the vnode pointer will be NULL.
 255  *
 256  * Upon reaching the last segment of a path, if the reference
 257  * is for DELETE, or NOCACHE is set (rewrite), and the
 258  * name is located in the cache, it will be dropped.
 259  *
 260  * These locks are used (in the order in which they can be taken):
 261  * NAME         TYPE    ROLE
 262  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 263  * bucketlock   mtx     for access to given set of hash buckets
 264  * neglist      mtx     negative entry LRU management
 265  *
 266  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 267  * order is lower address first. Both are recursive.
 268  *
 269  * "." lookups are lockless.
 270  *
 271  * ".." and vnode -> name lookups require vnodelock.
 272  *
 273  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 274  *
 275  * Insertions and removals of entries require involved vnodes and bucketlocks
 276  * to be locked to provide safe operation against other threads modifying the
 277  * cache.
 278  *
 279  * Some lookups result in removal of the found entry (e.g. getting rid of a
 280  * negative entry with the intent to create a positive one), which poses a
 281  * problem when multiple threads reach the state. Similarly, two different
 282  * threads can purge two different vnodes and try to remove the same name.
 283  *
 284  * If the already held vnode lock is lower than the second required lock, we
 285  * can just take the other lock. However, in the opposite case, this could
 286  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 287  * the first node, locking everything in order and revalidating the state.
 288  */
 289
 290 VFS_SMR_DECLARE;
 291
 292 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 293     "Name cache parameters");
 294
 295 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 296 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 297     "Total namecache capacity");
 298
 299 u_int ncsizefactor = 2;
 300 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 301     "Size factor for namecache");
 302
 303 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 304 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 305     "Ratio of negative namecache entries");
 306
 307 /*
  308  * Negative entry % of namecache capacity above which automatic eviction is allowed.
 309  *
 310  * Check cache_neg_evict_cond for details.
 311  */
 312 static u_int ncnegminpct = 3;
 313
 314 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 316     "Negative entry count above which automatic eviction is allowed");
 317
 318 /*
 319  * Structures associated with name caching.
 320  */
 321 #define NCHHASH(hash) \
 322         (&nchashtbl[(hash) & nchash])
 323 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 324 static u_long __read_mostly     nchash;                 /* size of hash table */
 325 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 326     "Size of namecache hash table");
 327 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 328 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 329
 330 struct nchstats nchstats;               /* cache effectiveness statistics */
 331
 332 static bool __read_frequently cache_fast_revlookup = true;
 333 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 334     &cache_fast_revlookup, 0, "");
 335
 336 static u_int __exclusive_cache_line neg_cycle;
 337
 338 #define ncneghash       3
 339 #define numneglists     (ncneghash + 1)
 340
 341 struct neglist {
 342         struct mtx              nl_evict_lock;
 343         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 344         TAILQ_HEAD(, namecache) nl_list;
 345         TAILQ_HEAD(, namecache) nl_hotlist;
 346         u_long                  nl_hotnum;
 347 } __aligned(CACHE_LINE_SIZE);
 348
 349 static struct neglist neglists[numneglists];
 350
 351 static inline struct neglist *
 352 NCP2NEGLIST(struct namecache *ncp)
 353 {
 354
 355         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 356 }
 357
 358 static inline struct negstate *
 359 NCP2NEGSTATE(struct namecache *ncp)
 360 {
 361
 362         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 363         return (&ncp->nc_neg);
 364 }
 365
 366 #define numbucketlocks (ncbuckethash + 1)
 367 static u_int __read_mostly  ncbuckethash;
 368 static struct mtx_padalign __read_mostly  *bucketlocks;
 369 #define HASH2BUCKETLOCK(hash) \
 370         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 371
 372 #define numvnodelocks (ncvnodehash + 1)
 373 static u_int __read_mostly  ncvnodehash;
 374 static struct mtx __read_mostly *vnodelocks;
 375 static inline struct mtx *
 376 VP2VNODELOCK(struct vnode *vp)
 377 {
 378
 379         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 380 }
 381
 382 /*
 383  * UMA zones for the VFS cache.
 384  *
 385  * The small cache is used for entries with short names, which are the
 386  * most common.  The large cache is used for entries which are too big to
 387  * fit in the small cache.
 388  */
 389 static uma_zone_t __read_mostly cache_zone_small;
 390 static uma_zone_t __read_mostly cache_zone_small_ts;
 391 static uma_zone_t __read_mostly cache_zone_large;
 392 static uma_zone_t __read_mostly cache_zone_large_ts;
 393
 394 static struct namecache *
 395 cache_alloc(int len, int ts)
 396 {
 397         struct namecache_ts *ncp_ts;
 398         struct namecache *ncp;
 399
 400         if (__predict_false(ts)) {
 401                 if (len <= CACHE_PATH_CUTOFF)
 402                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 403                 else
 404                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 405                 ncp = &ncp_ts->nc_nc;
 406         } else {
 407                 if (len <= CACHE_PATH_CUTOFF)
 408                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 409                 else
 410                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 411         }
 412         return (ncp);
 413 }
 414
 415 static void
 416 cache_free(struct namecache *ncp)
 417 {
 418         struct namecache_ts *ncp_ts;
 419
 420         MPASS(ncp != NULL);
 421         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 422                 vdrop(ncp->nc_dvp);
 423         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 424                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 425                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 426                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 427                 else
 428                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 429         } else {
 430                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 431                         uma_zfree_smr(cache_zone_small, ncp);
 432                 else
 433                         uma_zfree_smr(cache_zone_large, ncp);
 434         }
 435 }
 436
 437 static void
 438 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 439 {
 440         struct namecache_ts *ncp_ts;
 441
 442         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 443             (tsp == NULL && ticksp == NULL),
 444             ("No NCF_TS"));
 445
 446         if (tsp == NULL)
 447                 return;
 448
 449         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 450         *tsp = ncp_ts->nc_time;
 451         *ticksp = ncp_ts->nc_ticks;
 452 }
 453
 454 #ifdef DEBUG_CACHE
 455 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 456 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 457     "VFS namecache enabled");
 458 #endif
 459
 460 /* Export size information to userland */
 461 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 462     sizeof(struct namecache), "sizeof(struct namecache)");
 463
 464 /*
 465  * The new name cache statistics
 466  */
 467 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 468     "Name cache statistics");
 469
 470 #define STATNODE_ULONG(name, varname, descr)                                    \
 471         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 472 #define STATNODE_COUNTER(name, varname, descr)                                  \
 473         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 474         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 475             descr);
 476 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 477 STATNODE_ULONG(count, numcache, "Number of cache entries");
 478 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 479 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 480 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
 481 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
 482 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 483 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 484 STATNODE_COUNTER(posszaps, numposzaps,
 485     "Number of cache hits (positive) we do not want to cache");
 486 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 487 STATNODE_COUNTER(negzaps, numnegzaps,
 488     "Number of cache hits (negative) we do not want to cache");
 489 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 490 /* These count for vn_getcwd(), too. */
 491 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 492 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 493 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 494     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 495 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 496 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 497
 498 /*
 499  * Debug or developer statistics.
 500  */
 501 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 502     "Name cache debugging");
 503 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 504         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 505 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 506         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 507         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 508             descr);
 509 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 510     "Number of successful removals after relocking");
 511 static long zap_bucket_fail;
 512 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 513 static long zap_bucket_fail2;
 514 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 515 static long cache_lock_vnodes_cel_3_failures;
 516 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 517     "Number of times 3-way vnode locking failed");
 518
 519 static void cache_zap_locked(struct namecache *ncp);
 520 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 521     char **freebuf, size_t *buflen);
 522 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 523     char **retbuf, size_t *buflen, size_t addend);
 524 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 525     char **retbuf, size_t *buflen);
 526 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 527     char **retbuf, size_t *len, size_t addend);
 528
 529 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 530
 531 static inline void
 532 cache_assert_vlp_locked(struct mtx *vlp)
 533 {
 534
 535         if (vlp != NULL)
 536                 mtx_assert(vlp, MA_OWNED);
 537 }
 538
 539 static inline void
 540 cache_assert_vnode_locked(struct vnode *vp)
 541 {
 542         struct mtx *vlp;
 543
 544         vlp = VP2VNODELOCK(vp);
 545         cache_assert_vlp_locked(vlp);
 546 }
 547
 548 /*
 549  * TODO: With the value stored we can do better than computing the hash based
 550  * on the address. The choice of FNV should also be revisited.
 551  */
 552 static void
 553 cache_prehash(struct vnode *vp)
 554 {
 555
 556         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 557 }
 558
 559 static uint32_t
 560 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 561 {
 562
 563         return (fnv_32_buf(name, len, dvp->v_nchash));
 564 }
 565
 566 static inline struct nchashhead *
 567 NCP2BUCKET(struct namecache *ncp)
 568 {
 569         uint32_t hash;
 570
 571         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 572         return (NCHHASH(hash));
 573 }
 574
 575 static inline struct mtx *
 576 NCP2BUCKETLOCK(struct namecache *ncp)
 577 {
 578         uint32_t hash;
 579
 580         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 581         return (HASH2BUCKETLOCK(hash));
 582 }
 583
 584 #ifdef INVARIANTS
 585 static void
 586 cache_assert_bucket_locked(struct namecache *ncp)
 587 {
 588         struct mtx *blp;
 589
 590         blp = NCP2BUCKETLOCK(ncp);
 591         mtx_assert(blp, MA_OWNED);
 592 }
 593
 594 static void
 595 cache_assert_bucket_unlocked(struct namecache *ncp)
 596 {
 597         struct mtx *blp;
 598
 599         blp = NCP2BUCKETLOCK(ncp);
 600         mtx_assert(blp, MA_NOTOWNED);
 601 }
 602 #else
 603 #define cache_assert_bucket_locked(x) do { } while (0)
 604 #define cache_assert_bucket_unlocked(x) do { } while (0)
 605 #endif
 606
 607 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 608 static void
 609 _cache_sort_vnodes(void **p1, void **p2)
 610 {
 611         void *tmp;
 612
 613         MPASS(*p1 != NULL || *p2 != NULL);
 614
 615         if (*p1 > *p2) {
 616                 tmp = *p2;
 617                 *p2 = *p1;
 618                 *p1 = tmp;
 619         }
 620 }
 621
 622 static void
 623 cache_lock_all_buckets(void)
 624 {
 625         u_int i;
 626
 627         for (i = 0; i < numbucketlocks; i++)
 628                 mtx_lock(&bucketlocks[i]);
 629 }
 630
 631 static void
 632 cache_unlock_all_buckets(void)
 633 {
 634         u_int i;
 635
 636         for (i = 0; i < numbucketlocks; i++)
 637                 mtx_unlock(&bucketlocks[i]);
 638 }
 639
 640 static void
 641 cache_lock_all_vnodes(void)
 642 {
 643         u_int i;
 644
 645         for (i = 0; i < numvnodelocks; i++)
 646                 mtx_lock(&vnodelocks[i]);
 647 }
 648
 649 static void
 650 cache_unlock_all_vnodes(void)
 651 {
 652         u_int i;
 653
 654         for (i = 0; i < numvnodelocks; i++)
 655                 mtx_unlock(&vnodelocks[i]);
 656 }
 657
 658 static int
 659 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 660 {
 661
 662         cache_sort_vnodes(&vlp1, &vlp2);
 663
 664         if (vlp1 != NULL) {
 665                 if (!mtx_trylock(vlp1))
 666                         return (EAGAIN);
 667         }
 668         if (!mtx_trylock(vlp2)) {
 669                 if (vlp1 != NULL)
 670                         mtx_unlock(vlp1);
 671                 return (EAGAIN);
 672         }
 673
 674         return (0);
 675 }
 676
 677 static void
 678 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 679 {
 680
 681         MPASS(vlp1 != NULL || vlp2 != NULL);
 682         MPASS(vlp1 <= vlp2);
 683
 684         if (vlp1 != NULL)
 685                 mtx_lock(vlp1);
 686         if (vlp2 != NULL)
 687                 mtx_lock(vlp2);
 688 }
 689
 690 static void
 691 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 692 {
 693
 694         MPASS(vlp1 != NULL || vlp2 != NULL);
 695
 696         if (vlp1 != NULL)
 697                 mtx_unlock(vlp1);
 698         if (vlp2 != NULL)
 699                 mtx_unlock(vlp2);
 700 }
 701
 702 static int
 703 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 704 {
 705         struct nchstats snap;
 706
 707         if (req->oldptr == NULL)
 708                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 709
 710         snap = nchstats;
 711         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 712         snap.ncs_neghits = counter_u64_fetch(numneghits);
 713         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 714             counter_u64_fetch(numnegzaps);
 715         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 716             counter_u64_fetch(nummiss);
 717
 718         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 719 }
 720 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 721     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 722     "VFS cache effectiveness statistics");
 723
 724 static void
 725 cache_recalc_neg_min(u_int val)
 726 {
 727
 728         neg_min = (ncsize * val) / 100;
 729 }
 730
 731 static int
 732 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 733 {
 734         u_int val;
 735         int error;
 736
 737         val = ncnegminpct;
 738         error = sysctl_handle_int(oidp, &val, 0, req);
 739         if (error != 0 || req->newptr == NULL)
 740                 return (error);
 741
 742         if (val == ncnegminpct)
 743                 return (0);
 744         if (val < 0 || val > 99)
 745                 return (EINVAL);
 746         ncnegminpct = val;
 747         cache_recalc_neg_min(val);
 748         return (0);
 749 }
 750
 751 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 752     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 753     "I", "Negative entry \% of namecahe capacity above which automatic eviction is allowed");
 754
 755 #ifdef DIAGNOSTIC
 756 /*
 757  * Grab an atomic snapshot of the name cache hash chain lengths
 758  */
 759 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 760     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 761     "hash table stats");
 762
 763 static int
 764 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 765 {
 766         struct nchashhead *ncpp;
 767         struct namecache *ncp;
 768         int i, error, n_nchash, *cntbuf;
 769
 770 retry:
 771         n_nchash = nchash + 1;  /* nchash is max index, not count */
 772         if (req->oldptr == NULL)
 773                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 774         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 775         cache_lock_all_buckets();
 776         if (n_nchash != nchash + 1) {
 777                 cache_unlock_all_buckets();
 778                 free(cntbuf, M_TEMP);
 779                 goto retry;
 780         }
 781         /* Scan hash tables counting entries */
 782         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 783                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 784                         cntbuf[i]++;
 785         cache_unlock_all_buckets();
 786         for (error = 0, i = 0; i < n_nchash; i++)
 787                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 788                         break;
 789         free(cntbuf, M_TEMP);
 790         return (error);
 791 }
 792 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 793     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 794     "nchash chain lengths");
 795
 796 static int
 797 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 798 {
 799         int error;
 800         struct nchashhead *ncpp;
 801         struct namecache *ncp;
 802         int n_nchash;
 803         int count, maxlength, used, pct;
 804
 805         if (!req->oldptr)
 806                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 807
 808         cache_lock_all_buckets();
 809         n_nchash = nchash + 1;  /* nchash is max index, not count */
 810         used = 0;
 811         maxlength = 0;
 812
 813         /* Scan hash tables for applicable entries */
 814         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 815                 count = 0;
 816                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 817                         count++;
 818                 }
 819                 if (count)
 820                         used++;
 821                 if (maxlength < count)
 822                         maxlength = count;
 823         }
 824         n_nchash = nchash + 1;
 825         cache_unlock_all_buckets();
 826         pct = (used * 100) / (n_nchash / 100);
 827         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 828         if (error)
 829                 return (error);
 830         error = SYSCTL_OUT(req, &used, sizeof(used));
 831         if (error)
 832                 return (error);
 833         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 834         if (error)
 835                 return (error);
 836         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 837         if (error)
 838                 return (error);
 839         return (0);
 840 }
 841 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 842     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 843     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 844 #endif
 845
 846 /*
 847  * Negative entries management
 848  *
 849  * Various workloads create plenty of negative entries and barely use them
 850  * afterwards. Moreover malicious users can keep performing bogus lookups
 851  * adding even more entries. For example "make tinderbox" as of writing this
 852  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 853  * negative.
 854  *
 855  * As such, a rather aggressive eviction method is needed. The currently
 856  * employed method is a placeholder.
 857  *
 858  * Entries are split over numneglists separate lists, each of which is further
 859  * split into hot and cold entries. Entries get promoted after getting a hit.
 860  * Eviction happens on addition of new entry.
 861  */
/*
 * Sysctl node "neg" under _vfs_cache; everything below is exported
 * read-only beneath it.
 */
static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Name cache negative entry statistics");

/* numneg is defined elsewhere in this file. */
SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
    "Number of negative cache entries");

/* Bumped by cache_neg_init. */
static COUNTER_U64_DEFINE_EARLY(neg_created);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
    "Number of created negative entries");

/* Bumped by cache_neg_evict when an entry was actually zapped. */
static COUNTER_U64_DEFINE_EARLY(neg_evicted);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
    "Number of evicted negative entries");

/* Bumped by cache_neg_evict when the selected list had no candidate. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
    &neg_evict_skipped_empty,
    "Number of times evicting failed due to lack of entries");

/* Bumped by cache_neg_evict when the candidate vanished before the zap. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
    &neg_evict_skipped_missed,
    "Number of times evicting failed due to target entry disappearing");

/* Bumped by cache_neg_evict_select_list when the evict lock trylock fails. */
static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
    &neg_evict_skipped_contended,
    "Number of times evicting failed due to contention");

/* numneghits is defined elsewhere; cache_neg_hit_finish bumps it. */
SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
    "Number of cache hits (negative)");
 893
 894 static int
 895 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 896 {
 897         int i, out;
 898
 899         out = 0;
 900         for (i = 0; i < numneglists; i++)
 901                 out += neglists[i].nl_hotnum;
 902
 903         return (SYSCTL_OUT(req, &out, sizeof(out)));
 904 }
 905 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 906     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 907     "Number of hot negative entries");
 908
 909 static void
 910 cache_neg_init(struct namecache *ncp)
 911 {
 912         struct negstate *ns;
 913
 914         ncp->nc_flag |= NCF_NEGATIVE;
 915         ns = NCP2NEGSTATE(ncp);
 916         ns->neg_flag = 0;
 917         ns->neg_hit = 0;
 918         counter_u64_add(neg_created, 1);
 919 }
 920
 921 #define CACHE_NEG_PROMOTION_THRESH 2
 922
 923 static bool
 924 cache_neg_hit_prep(struct namecache *ncp)
 925 {
 926         struct negstate *ns;
 927         u_char n;
 928
 929         ns = NCP2NEGSTATE(ncp);
 930         n = atomic_load_char(&ns->neg_hit);
 931         for (;;) {
 932                 if (n >= CACHE_NEG_PROMOTION_THRESH)
 933                         return (false);
 934                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 935                         break;
 936         }
 937         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 938 }
 939
/*
 * Counterpart of cache_neg_hit_prep for callers which end up returning
 * without even trying to promote the entry.  There is nothing to do here,
 * so this expands to an empty statement; it is provided for completeness.
 */
#define cache_neg_hit_abort(ncp)        do { } while (0)
 946
 947 static void
 948 cache_neg_hit_finish(struct namecache *ncp)
 949 {
 950
 951         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 952         counter_u64_add(numneghits, 1);
 953 }
 954
 955 /*
 956  * Move a negative entry to the hot list.
 957  */
 958 static void
 959 cache_neg_promote_locked(struct namecache *ncp)
 960 {
 961         struct neglist *nl;
 962         struct negstate *ns;
 963
 964         ns = NCP2NEGSTATE(ncp);
 965         nl = NCP2NEGLIST(ncp);
 966         mtx_assert(&nl->nl_lock, MA_OWNED);
 967         if ((ns->neg_flag & NEG_HOT) == 0) {
 968                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 969                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 970                 nl->nl_hotnum++;
 971                 ns->neg_flag |= NEG_HOT;
 972         }
 973 }
 974
 975 /*
 976  * Move a hot negative entry to the cold list.
 977  */
 978 static void
 979 cache_neg_demote_locked(struct namecache *ncp)
 980 {
 981         struct neglist *nl;
 982         struct negstate *ns;
 983
 984         ns = NCP2NEGSTATE(ncp);
 985         nl = NCP2NEGLIST(ncp);
 986         mtx_assert(&nl->nl_lock, MA_OWNED);
 987         MPASS(ns->neg_flag & NEG_HOT);
 988         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 989         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 990         nl->nl_hotnum--;
 991         ns->neg_flag &= ~NEG_HOT;
 992         atomic_store_char(&ns->neg_hit, 0);
 993 }
 994
 995 /*
 996  * Move a negative entry to the hot list if it matches the lookup.
 997  *
 998  * We have to take locks, but they may be contended and in the worst
 999  * case we may need to go off CPU. We don't want to spin within the
1000  * smr section and we can't block with it. Exiting the section means
1001  * the found entry could have been evicted. We are going to look it
1002  * up again.
1003  */
1004 static bool
1005 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1006     struct namecache *oncp, uint32_t hash)
1007 {
1008         struct namecache *ncp;
1009         struct neglist *nl;
1010         u_char nc_flag;
1011
1012         nl = NCP2NEGLIST(oncp);
1013
1014         mtx_lock(&nl->nl_lock);
1015         /*
1016          * For hash iteration.
1017          */
1018         vfs_smr_enter();
1019
1020         /*
1021          * Avoid all surprises by only succeeding if we got the same entry and
1022          * bailing completely otherwise.
1023          * XXX There are no provisions to keep the vnode around, meaning we may
1024          * end up promoting a negative entry for a *new* vnode and returning
1025          * ENOENT on its account. This is the error we want to return anyway
1026          * and promotion is harmless.
1027          *
1028          * In particular at this point there can be a new ncp which matches the
1029          * search but hashes to a different neglist.
1030          */
1031         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1032                 if (ncp == oncp)
1033                         break;
1034         }
1035
1036         /*
1037          * No match to begin with.
1038          */
1039         if (__predict_false(ncp == NULL)) {
1040                 goto out_abort;
1041         }
1042
1043         /*
1044          * The newly found entry may be something different...
1045          */
1046         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1047             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1048                 goto out_abort;
1049         }
1050
1051         /*
1052          * ... and not even negative.
1053          */
1054         nc_flag = atomic_load_char(&ncp->nc_flag);
1055         if ((nc_flag & NCF_NEGATIVE) == 0) {
1056                 goto out_abort;
1057         }
1058
1059         if (__predict_false(!cache_ncp_canuse(ncp))) {
1060                 goto out_abort;
1061         }
1062
1063         cache_neg_promote_locked(ncp);
1064         cache_neg_hit_finish(ncp);
1065         vfs_smr_exit();
1066         mtx_unlock(&nl->nl_lock);
1067         return (true);
1068 out_abort:
1069         vfs_smr_exit();
1070         mtx_unlock(&nl->nl_lock);
1071         return (false);
1072 }
1073
1074 static void
1075 cache_neg_promote(struct namecache *ncp)
1076 {
1077         struct neglist *nl;
1078
1079         nl = NCP2NEGLIST(ncp);
1080         mtx_lock(&nl->nl_lock);
1081         cache_neg_promote_locked(ncp);
1082         mtx_unlock(&nl->nl_lock);
1083 }
1084
1085 static void
1086 cache_neg_insert(struct namecache *ncp)
1087 {
1088         struct neglist *nl;
1089
1090         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1091         cache_assert_bucket_locked(ncp);
1092         nl = NCP2NEGLIST(ncp);
1093         mtx_lock(&nl->nl_lock);
1094         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1095         mtx_unlock(&nl->nl_lock);
1096         atomic_add_long(&numneg, 1);
1097 }
1098
1099 static void
1100 cache_neg_remove(struct namecache *ncp)
1101 {
1102         struct neglist *nl;
1103         struct negstate *ns;
1104
1105         cache_assert_bucket_locked(ncp);
1106         nl = NCP2NEGLIST(ncp);
1107         ns = NCP2NEGSTATE(ncp);
1108         mtx_lock(&nl->nl_lock);
1109         if ((ns->neg_flag & NEG_HOT) != 0) {
1110                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1111                 nl->nl_hotnum--;
1112         } else {
1113                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1114         }
1115         mtx_unlock(&nl->nl_lock);
1116         atomic_subtract_long(&numneg, 1);
1117 }
1118
1119 static struct neglist *
1120 cache_neg_evict_select_list(void)
1121 {
1122         struct neglist *nl;
1123         u_int c;
1124
1125         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1126         nl = &neglists[c % numneglists];
1127         if (!mtx_trylock(&nl->nl_evict_lock)) {
1128                 counter_u64_add(neg_evict_skipped_contended, 1);
1129                 return (NULL);
1130         }
1131         return (nl);
1132 }
1133
1134 static struct namecache *
1135 cache_neg_evict_select_entry(struct neglist *nl)
1136 {
1137         struct namecache *ncp, *lncp;
1138         struct negstate *ns, *lns;
1139         int i;
1140
1141         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1142         mtx_assert(&nl->nl_lock, MA_OWNED);
1143         ncp = TAILQ_FIRST(&nl->nl_list);
1144         if (ncp == NULL)
1145                 return (NULL);
1146         lncp = ncp;
1147         lns = NCP2NEGSTATE(lncp);
1148         for (i = 1; i < 4; i++) {
1149                 ncp = TAILQ_NEXT(ncp, nc_dst);
1150                 if (ncp == NULL)
1151                         break;
1152                 ns = NCP2NEGSTATE(ncp);
1153                 if (ns->neg_hit < lns->neg_hit) {
1154                         lncp = ncp;
1155                         lns = ns;
1156                 }
1157         }
1158         return (lncp);
1159 }
1160
1161 static bool
1162 cache_neg_evict(void)
1163 {
1164         struct namecache *ncp, *ncp2;
1165         struct neglist *nl;
1166         struct vnode *dvp;
1167         struct mtx *dvlp;
1168         struct mtx *blp;
1169         uint32_t hash;
1170         u_char nlen;
1171         bool evicted;
1172
1173         nl = cache_neg_evict_select_list();
1174         if (nl == NULL) {
1175                 return (false);
1176         }
1177
1178         mtx_lock(&nl->nl_lock);
1179         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1180         if (ncp != NULL) {
1181                 cache_neg_demote_locked(ncp);
1182         }
1183         ncp = cache_neg_evict_select_entry(nl);
1184         if (ncp == NULL) {
1185                 counter_u64_add(neg_evict_skipped_empty, 1);
1186                 mtx_unlock(&nl->nl_lock);
1187                 mtx_unlock(&nl->nl_evict_lock);
1188                 return (false);
1189         }
1190         nlen = ncp->nc_nlen;
1191         dvp = ncp->nc_dvp;
1192         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1193         dvlp = VP2VNODELOCK(dvp);
1194         blp = HASH2BUCKETLOCK(hash);
1195         mtx_unlock(&nl->nl_lock);
1196         mtx_unlock(&nl->nl_evict_lock);
1197         mtx_lock(dvlp);
1198         mtx_lock(blp);
1199         /*
1200          * Note that since all locks were dropped above, the entry may be
1201          * gone or reallocated to be something else.
1202          */
1203         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1204                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1205                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1206                         break;
1207         }
1208         if (ncp2 == NULL) {
1209                 counter_u64_add(neg_evict_skipped_missed, 1);
1210                 ncp = NULL;
1211                 evicted = false;
1212         } else {
1213                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1214                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1215                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1216                     ncp->nc_name);
1217                 cache_zap_locked(ncp);
1218                 counter_u64_add(neg_evicted, 1);
1219                 evicted = true;
1220         }
1221         mtx_unlock(blp);
1222         mtx_unlock(dvlp);
1223         if (ncp != NULL)
1224                 cache_free(ncp);
1225         return (evicted);
1226 }
1227
1228 /*
1229  * Maybe evict a negative entry to create more room.
1230  *
1231  * The ncnegfactor parameter limits what fraction of the total count
1232  * can comprise of negative entries. However, if the cache is just
1233  * warming up this leads to excessive evictions.  As such, ncnegminpct
1234  * (recomputed to neg_min) dictates whether the above should be
1235  * applied.
1236  *
1237  * Try evicting if the cache is close to full capacity regardless of
1238  * other considerations.
1239  */
1240 static bool
1241 cache_neg_evict_cond(u_long lnumcache)
1242 {
1243         u_long lnumneg;
1244
1245         if (ncsize - 1000 < lnumcache)
1246                 goto out_evict;
1247         lnumneg = atomic_load_long(&numneg);
1248         if (lnumneg < neg_min)
1249                 return (false);
1250         if (lnumneg * ncnegfactor < lnumcache)
1251                 return (false);
1252 out_evict:
1253         return (cache_neg_evict());
1254 }
1255
1256 /*
1257  * cache_zap_locked():
1258  *
1259  *   Removes a namecache entry from cache, whether it contains an actual
1260  *   pointer to a vnode or if it is just a negative cache entry.
1261  */
1262 static void
1263 cache_zap_locked(struct namecache *ncp)
1264 {
1265         struct nchashhead *ncpp;
1266
1267         if (!(ncp->nc_flag & NCF_NEGATIVE))
1268                 cache_assert_vnode_locked(ncp->nc_vp);
1269         cache_assert_vnode_locked(ncp->nc_dvp);
1270         cache_assert_bucket_locked(ncp);
1271
1272         cache_ncp_invalidate(ncp);
1273
1274         ncpp = NCP2BUCKET(ncp);
1275         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1276         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1277                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1278                     ncp->nc_name, ncp->nc_vp);
1279                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1280                 if (ncp == ncp->nc_vp->v_cache_dd) {
1281                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1282                         ncp->nc_vp->v_cache_dd = NULL;
1283                         vn_seqc_write_end(ncp->nc_vp);
1284                 }
1285         } else {
1286                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1287                     ncp->nc_name);
1288                 cache_neg_remove(ncp);
1289         }
1290         if (ncp->nc_flag & NCF_ISDOTDOT) {
1291                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1292                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1293                         ncp->nc_dvp->v_cache_dd = NULL;
1294                         vn_seqc_write_end(ncp->nc_dvp);
1295                 }
1296         } else {
1297                 LIST_REMOVE(ncp, nc_src);
1298                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1299                         ncp->nc_flag |= NCF_DVDROP;
1300                         counter_u64_add(numcachehv, -1);
1301                 }
1302         }
1303         atomic_subtract_long(&numcache, 1);
1304 }
1305
1306 static void
1307 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1308 {
1309         struct mtx *blp;
1310
1311         MPASS(ncp->nc_dvp == vp);
1312         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1313         cache_assert_vnode_locked(vp);
1314
1315         blp = NCP2BUCKETLOCK(ncp);
1316         mtx_lock(blp);
1317         cache_zap_locked(ncp);
1318         mtx_unlock(blp);
1319 }
1320
/*
 * Zap an entry while the caller holds the vnode lock of vp, which has to be
 * either the directory or the target of the entry (asserted).  That lock is
 * still held on return.
 *
 * *vlpp is either NULL or a second vnode lock held by the caller, typically
 * left over from a previous failed attempt.
 *
 * Returns true if the entry was zapped; *vlpp is NULL then and any lock it
 * named has been released.  Returns false if the missing vnode lock could
 * not be taken without blocking; in that case nothing was zapped, both
 * vnode locks have been acquired in sorted order, *vlpp names the first of
 * them and the caller is expected to re-validate the entry and try again.
 */
static bool
cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
    struct mtx **vlpp)
{
        struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
        struct mtx *blp;

        MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
        cache_assert_vnode_locked(vp);

        /* Negative entries only need the directory lock, which is held. */
        if (ncp->nc_flag & NCF_NEGATIVE) {
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                cache_zap_negative_locked_vnode_kl(ncp, vp);
                return (true);
        }

        pvlp = VP2VNODELOCK(vp);
        blp = NCP2BUCKETLOCK(ncp);
        vlp1 = VP2VNODELOCK(ncp->nc_dvp);
        vlp2 = VP2VNODELOCK(ncp->nc_vp);

        if (*vlpp == vlp1 || *vlpp == vlp2) {
                /* The extra lock passed in is one of the two we need. */
                to_unlock = *vlpp;
                *vlpp = NULL;
        } else {
                /* The extra lock, if any, is of no use; let go of it. */
                if (*vlpp != NULL) {
                        mtx_unlock(*vlpp);
                        *vlpp = NULL;
                }
                /* NOTE(review): presumably orders the pair by lock order. */
                cache_sort_vnodes(&vlp1, &vlp2);
                if (vlp1 == pvlp) {
                        /* Already holding the first one; block on the second. */
                        mtx_lock(vlp2);
                        to_unlock = vlp2;
                } else {
                        /*
                         * The lock we hold is not the first of the pair, so
                         * the other one may only be tried, not waited for.
                         */
                        if (!mtx_trylock(vlp1))
                                goto out_relock;
                        to_unlock = vlp1;
                }
        }
        mtx_lock(blp);
        cache_zap_locked(ncp);
        mtx_unlock(blp);
        if (to_unlock != NULL)
                mtx_unlock(to_unlock);
        return (true);

out_relock:
        /*
         * Drop vlp2 and take both locks in sorted order.  vlp1 is handed
         * back through *vlpp so the caller can retry with it held.
         */
        mtx_unlock(vlp2);
        mtx_lock(vlp1);
        mtx_lock(vlp2);
        MPASS(*vlpp == NULL);
        *vlpp = vlp1;
        return (false);
}
1378
1379 /*
1380  * If trylocking failed we can get here. We know enough to take all needed locks
1381  * in the right order and re-lookup the entry.
1382  */
1383 static int
1384 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1385     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1386     struct mtx *blp)
1387 {
1388         struct namecache *rncp;
1389
1390         cache_assert_bucket_unlocked(ncp);
1391
1392         cache_sort_vnodes(&dvlp, &vlp);
1393         cache_lock_vnodes(dvlp, vlp);
1394         mtx_lock(blp);
1395         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1396                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1397                     rncp->nc_nlen == cnp->cn_namelen &&
1398                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1399                         break;
1400         }
1401         if (rncp != NULL) {
1402                 cache_zap_locked(rncp);
1403                 mtx_unlock(blp);
1404                 cache_unlock_vnodes(dvlp, vlp);
1405                 counter_u64_add(zap_bucket_relock_success, 1);
1406                 return (0);
1407         }
1408
1409         mtx_unlock(blp);
1410         cache_unlock_vnodes(dvlp, vlp);
1411         return (EAGAIN);
1412 }
1413
1414 static int __noinline
1415 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1416     uint32_t hash, struct mtx *blp)
1417 {
1418         struct mtx *dvlp, *vlp;
1419         struct vnode *dvp;
1420
1421         cache_assert_bucket_locked(ncp);
1422
1423         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1424         vlp = NULL;
1425         if (!(ncp->nc_flag & NCF_NEGATIVE))
1426                 vlp = VP2VNODELOCK(ncp->nc_vp);
1427         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1428                 cache_zap_locked(ncp);
1429                 mtx_unlock(blp);
1430                 cache_unlock_vnodes(dvlp, vlp);
1431                 return (0);
1432         }
1433
1434         dvp = ncp->nc_dvp;
1435         mtx_unlock(blp);
1436         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1437 }
1438
/*
 * Remove the cache entry for the name described by cnp in directory dvp.
 *
 * ".." is handled through dvp->v_cache_dd under the vnode lock of dvp;
 * every other name is looked up in its hash chain.
 *
 * Returns 1 if an entry was removed (or, for "..", the v_cache_dd pointer
 * was cleared) and 0 if there was nothing to remove.
 */
static __noinline int
cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        struct mtx *dvlp, *dvlp2;
        uint32_t hash;
        int error;

        if (cnp->cn_namelen == 2 &&
            cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
                dvlp = VP2VNODELOCK(dvp);
                /* Second vnode lock, possibly handed back by kl2 below. */
                dvlp2 = NULL;
                mtx_lock(dvlp);
retry_dotdot:
                ncp = dvp->v_cache_dd;
                if (ncp == NULL) {
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
                        return (0);
                }
                if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                        /*
                         * A failure means locks were dropped and retaken,
                         * so v_cache_dd has to be read again.
                         */
                        if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
                                goto retry_dotdot;
                        MPASS(dvp->v_cache_dd == NULL);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                        cache_free(ncp);
                } else {
                        /*
                         * v_cache_dd points at an entry which is not a ".."
                         * entry; only the pointer is cleared, the entry
                         * itself is left in the cache.
                         */
                        vn_seqc_write_begin(dvp);
                        dvp->v_cache_dd = NULL;
                        vn_seqc_write_end(dvp);
                        mtx_unlock(dvlp);
                        if (dvlp2 != NULL)
                                mtx_unlock(dvlp2);
                }
                SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
                return (1);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        blp = HASH2BUCKETLOCK(hash);
retry:
        /* Unlocked peek: an empty chain cannot hold the entry. */
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                goto out_no_entry;

        mtx_lock(blp);

        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (ncp == NULL) {
                mtx_unlock(blp);
                goto out_no_entry;
        }

        /* The bucket lock is released by the callee on every path. */
        error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
        if (__predict_false(error != 0)) {
                /* The entry changed while relocking; start over. */
                zap_bucket_fail++;
                goto retry;
        }
        counter_u64_add(numposzaps, 1);
        SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
        cache_free(ncp);
        return (1);
out_no_entry:
        counter_u64_add(nummisszap, 1);
        SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
        return (0);
}
1515
1516 static int __noinline
1517 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1518     struct timespec *tsp, int *ticksp)
1519 {
1520         int ltype;
1521
1522         *vpp = dvp;
1523         counter_u64_add(dothits, 1);
1524         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1525         if (tsp != NULL)
1526                 timespecclear(tsp);
1527         if (ticksp != NULL)
1528                 *ticksp = ticks;
1529         vrefact(*vpp);
1530         /*
1531          * When we lookup "." we still can be asked to lock it
1532          * differently...
1533          */
1534         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1535         if (ltype != VOP_ISLOCKED(*vpp)) {
1536                 if (ltype == LK_EXCLUSIVE) {
1537                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1538                         if (VN_IS_DOOMED((*vpp))) {
1539                                 /* forced unmount */
1540                                 vrele(*vpp);
1541                                 *vpp = NULL;
1542                                 return (ENOENT);
1543                         }
1544                 } else
1545                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1546         }
1547         return (-1);
1548 }
1549
/*
 * Resolve a lookup of ".." in dvp by way of dvp->v_cache_dd.
 *
 * cnp must have ISDOTDOT set (asserted).  If MAKEENTRY is clear the cached
 * entry is removed instead of being looked up.
 *
 * Returns:
 * - -1:        positive hit; *vpp holds the parent, obtained with
 *              vget_finish using cnp->cn_lkflags.
 * - ENOENT:    negative hit (ISWHITEOUT is set in cnp->cn_flags for
 *              whiteout entries), or dvp was found doomed after its lock
 *              was reacquired.
 * - 0:         miss, or the entry was removed rather than used.
 */
static int __noinline
cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache_ts *ncp_ts;
        struct namecache *ncp;
        struct mtx *dvlp;
        enum vgetstate vs;
        int error, ltype;
        bool whiteout;

        MPASS((cnp->cn_flags & ISDOTDOT) != 0);

        if ((cnp->cn_flags & MAKEENTRY) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        counter_u64_add(dotdothits, 1);
retry:
        dvlp = VP2VNODELOCK(dvp);
        mtx_lock(dvlp);
        ncp = dvp->v_cache_dd;
        if (ncp == NULL) {
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
                mtx_unlock(dvlp);
                return (0);
        }
        /*
         * v_cache_dd is either a real ".." entry, whose target is the
         * parent (or nothing, if negative), or some other entry, in which
         * case its nc_dvp is used as the result.
         */
        if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
                if (ncp->nc_flag & NCF_NEGATIVE)
                        *vpp = NULL;
                else
                        *vpp = ncp->nc_vp;
        } else
                *vpp = ncp->nc_dvp;
        if (*vpp == NULL)
                goto negative_success;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        /*
         * Entries which are not ".." entries but carry timestamps have a
         * separate timestamp for the ".." direction; report that one.
         */
        if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
            NCF_DTS && tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                *tsp = ncp_ts->nc_dotdottime;
        }

        MPASS(dvp != *vpp);
        /*
         * dvp is unlocked while the parent is being locked and relocked
         * the same way afterwards.
         */
        ltype = VOP_ISLOCKED(dvp);
        VOP_UNLOCK(dvp);
        vs = vget_prep(*vpp);
        mtx_unlock(dvlp);
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        vn_lock(dvp, ltype | LK_RETRY);
        if (VN_IS_DOOMED(dvp)) {
                /* dvp went away while unlocked; undo a successful vget. */
                if (error == 0)
                        vput(*vpp);
                *vpp = NULL;
                return (ENOENT);
        }
        if (error) {
                *vpp = NULL;
                goto retry;
        }
        return (-1);
negative_success:
        /* Reached with dvlp held. */
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        /* About to be created: drop the negative entry. */
                        counter_u64_add(numnegzaps, 1);
                        cache_zap_negative_locked_vnode_kl(ncp, dvp);
                        mtx_unlock(dvlp);
                        cache_free(ncp);
                        return (0);
                }
        }

        /* Read everything needed from the entry before unlocking. */
        whiteout = (ncp->nc_flag & NCF_WHITE);
        cache_out_ts(ncp, tsp, ticksp);
        if (cache_neg_hit_prep(ncp))
                cache_neg_promote(ncp);
        else
                cache_neg_hit_finish(ncp);
        mtx_unlock(dvlp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
}
1635
1636 /**
1637  * Lookup a name in the name cache
1638  *
1639  * # Arguments
1640  *
1641  * - dvp:       Parent directory in which to search.
1642  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1643  * - cnp:       Parameters of the name search.  The most interesting bits of
1644  *              the cn_flags field have the following meanings:
1645  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1646  *                      it up.
1647  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1648  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1649  *              or negative) lookup, tsp will be filled with any timespec that
1650  *              was stored when this cache entry was created.  However, it will
1651  *              be clear for "." entries.
1652  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1653  *              (positive or negative) lookup, it will contain the ticks value
1654  *              that was current when the cache entry was created, unless cnp
1655  *              was ".".
1656  *
1657  * Either both tsp and ticks have to be provided or neither of them.
1658  *
1659  * # Returns
1660  *
1661  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1662  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1663  *              to a forced unmount.  vpp will not be modified.  If the entry
1664  *              is a whiteout, then the ISWHITEOUT flag will be set in
1665  *              cnp->cn_flags.
1666  * - 0:         A cache miss.  vpp will not be modified.
1667  *
1668  * # Locking
1669  *
1670  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1671  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1672  * lock is not recursively acquired.
1673  */
/*
 * Locked slow path of cache_lookup().
 *
 * cache_lookup() jumps here whenever its lockless attempt has to be given up.
 * The hash chain is searched again, this time with the bucket lock held.
 *
 * Returns 0 for a miss, -1 for a positive hit (*vpp is set) and ENOENT for a
 * negative hit.  A negative entry found for the last component of a CREATE
 * is removed from the cache and reported as a miss.
 *
 * The caller must have filtered out ".." lookups and requests which carry
 * neither MAKEENTRY nor NC_KEEPPOSENTRY; both conditions are asserted below.
 */
static int __noinline
cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache *ncp;
        struct mtx *blp;
        uint32_t hash;
        enum vgetstate vs;
        int error;
        bool whiteout;

        MPASS((cnp->cn_flags & ISDOTDOT) == 0);
        MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);

retry:
        /*
         * Every path which jumps back here has dropped the bucket lock, so
         * the hash, the lock and the chain walk are all redone from scratch.
         */
        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        blp = HASH2BUCKETLOCK(hash);
        mtx_lock(blp);

        /* An entry matches on parent directory, name length and name bytes. */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        /* ncp is NULL when the loop ran off the end of the chain. */
        if (__predict_false(ncp == NULL)) {
                mtx_unlock(blp);
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
                    NULL);
                counter_u64_add(nummiss, 1);
                return (0);
        }

        if (ncp->nc_flag & NCF_NEGATIVE)
                goto negative_success;

        /* Positive hit. */
        counter_u64_add(numposhits, 1);
        *vpp = ncp->nc_vp;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        MPASS(dvp != *vpp);
        /*
         * vget_prep() runs while the bucket lock is still held; the part
         * which takes the vnode lock (vget_finish) only runs after the
         * bucket lock has been released.
         */
        vs = vget_prep(*vpp);
        mtx_unlock(blp);
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        if (error) {
                /* The vnode could not be obtained; start over. */
                *vpp = NULL;
                goto retry;
        }
        return (-1);
negative_success:
        /*
         * The name is about to be created, so the negative entry is removed
         * and the lookup reported as a miss.
         */
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        counter_u64_add(numnegzaps, 1);
                        error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
                        if (__predict_false(error != 0)) {
                                zap_bucket_fail2++;
                                goto retry;
                        }
                        /*
                         * NOTE(review): no mtx_unlock(blp) on this path, so
                         * cache_zap_locked_bucket() presumably returns with
                         * the bucket lock released -- confirm.
                         */
                        cache_free(ncp);
                        return (0);
                }
        }

        /* Sample the whiteout bit while the entry is still locked. */
        whiteout = (ncp->nc_flag & NCF_WHITE);
        cache_out_ts(ncp, tsp, ticksp);
        if (cache_neg_hit_prep(ncp))
                cache_neg_promote(ncp);
        else
                cache_neg_hit_finish(ncp);
        mtx_unlock(blp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
}
1748
/*
 * Look up the name described by cnp in directory dvp.
 *
 * The common case is served without taking any cache lock: the hash chain is
 * walked inside a vfs_smr_enter()/vfs_smr_exit() section.  Whenever that
 * attempt cannot be completed safely, the request is handed over to
 * cache_lookup_fallback(), which redoes the work under the bucket lock.
 *
 * Returns 0 for a miss, -1 for a positive hit (*vpp is set) and ENOENT for a
 * negative hit, in which case ISWHITEOUT is set in cnp->cn_flags if the entry
 * is a whiteout.  tsp and ticksp must be both NULL or both non-NULL.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    struct timespec *tsp, int *ticksp)
{
        struct namecache *ncp;
        uint32_t hash;
        enum vgetstate vs;
        int error;
        bool whiteout, neg_promote;
        u_short nc_flag;

        MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache)) {
                cnp->cn_flags &= ~MAKEENTRY;
                return (0);
        }
#endif

        /* "." and ".." are served by dedicated routines. */
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                if (cnp->cn_namelen == 1)
                        return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
                        return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
        }

        MPASS((cnp->cn_flags & ISDOTDOT) == 0);

        /*
         * The caller wants no entry for this name: remove whatever is cached
         * and report a miss.
         */
        if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
                cache_remove_cnp(dvp, cnp);
                return (0);
        }

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        vfs_smr_enter();

        /* An entry matches on parent directory, name length and name bytes. */
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
                    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
                        break;
        }

        if (__predict_false(ncp == NULL)) {
                vfs_smr_exit();
                SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
                    NULL);
                counter_u64_add(nummiss, 1);
                return (0);
        }

        nc_flag = atomic_load_char(&ncp->nc_flag);
        if (nc_flag & NCF_NEGATIVE)
                goto negative_success;

        /* Positive hit. */
        counter_u64_add(numposhits, 1);
        *vpp = ncp->nc_vp;
        SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
        cache_out_ts(ncp, tsp, ticksp);
        MPASS(dvp != *vpp);
        /*
         * The fields above were read without a lock; the entry is only
         * trusted if cache_ncp_canuse() still accepts it afterwards.
         * NOTE(review): presumably this rejects entries which are under
         * construction or being torn down -- confirm in cache_ncp_canuse().
         */
        if (!cache_ncp_canuse(ncp)) {
                vfs_smr_exit();
                *vpp = NULL;
                goto out_fallback;
        }
        /* vget_prep_smr() must run inside the SMR section; finish outside. */
        vs = vget_prep_smr(*vpp);
        vfs_smr_exit();
        if (__predict_false(vs == VGET_NONE)) {
                *vpp = NULL;
                goto out_fallback;
        }
        error = vget_finish(*vpp, cnp->cn_lkflags, vs);
        if (error) {
                *vpp = NULL;
                goto out_fallback;
        }
        return (-1);
negative_success:
        /*
         * Creating the last component requires removing the negative entry,
         * which is left to the locked fallback.
         */
        if (__predict_false(cnp->cn_nameiop == CREATE)) {
                if (cnp->cn_flags & ISLASTCN) {
                        vfs_smr_exit();
                        goto out_fallback;
                }
        }

        cache_out_ts(ncp, tsp, ticksp);
        whiteout = (ncp->nc_flag & NCF_WHITE);
        neg_promote = cache_neg_hit_prep(ncp);
        /* Same validation as for the positive case, undoing the prep step. */
        if (__predict_false(!cache_ncp_canuse(ncp))) {
                cache_neg_hit_abort(ncp);
                vfs_smr_exit();
                goto out_fallback;
        }
        if (neg_promote) {
                /* Promotion is attempted only after leaving the SMR section. */
                vfs_smr_exit();
                if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
                        goto out_fallback;
        } else {
                cache_neg_hit_finish(ncp);
                vfs_smr_exit();
        }
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
        return (ENOENT);
out_fallback:
        /* Every jump to this label has already left the SMR section. */
        return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
}
1856
/*
 * Locks held for the duration of a cache insertion: up to three vnode locks
 * and up to two bucket locks.  A NULL slot means "not held".
 */
struct celockstate {
        struct mtx *vlp[3];
        struct mtx *blp[2];
};
/* The lock/unlock helpers index these arrays with hard-coded subscripts. */
CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1863
1864 static inline void
1865 cache_celockstate_init(struct celockstate *cel)
1866 {
1867
1868         bzero(cel, sizeof(*cel));
1869 }
1870
1871 static void
1872 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1873     struct vnode *dvp)
1874 {
1875         struct mtx *vlp1, *vlp2;
1876
1877         MPASS(cel->vlp[0] == NULL);
1878         MPASS(cel->vlp[1] == NULL);
1879         MPASS(cel->vlp[2] == NULL);
1880
1881         MPASS(vp != NULL || dvp != NULL);
1882
1883         vlp1 = VP2VNODELOCK(vp);
1884         vlp2 = VP2VNODELOCK(dvp);
1885         cache_sort_vnodes(&vlp1, &vlp2);
1886
1887         if (vlp1 != NULL) {
1888                 mtx_lock(vlp1);
1889                 cel->vlp[0] = vlp1;
1890         }
1891         mtx_lock(vlp2);
1892         cel->vlp[1] = vlp2;
1893 }
1894
1895 static void
1896 cache_unlock_vnodes_cel(struct celockstate *cel)
1897 {
1898
1899         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1900
1901         if (cel->vlp[0] != NULL)
1902                 mtx_unlock(cel->vlp[0]);
1903         if (cel->vlp[1] != NULL)
1904                 mtx_unlock(cel->vlp[1]);
1905         if (cel->vlp[2] != NULL)
1906                 mtx_unlock(cel->vlp[2]);
1907 }
1908
/*
 * Take the vnode lock covering vp as the third lock, on top of the two
 * already recorded in cel->vlp[0] and cel->vlp[1].
 *
 * Returns true if the lock was obtained while the other two stayed held the
 * whole time.  Returns false if the held locks had to be dropped and all of
 * them were re-taken; state inspected by the caller before the call may be
 * stale in that case and has to be re-validated.
 *
 * In both cases the lock ends up recorded in cel->vlp[2].
 */
static bool
cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
{
        struct mtx *vlp;
        bool ret;

        cache_assert_vlp_locked(cel->vlp[0]);
        cache_assert_vlp_locked(cel->vlp[1]);
        MPASS(cel->vlp[2] == NULL);

        MPASS(vp != NULL);
        vlp = VP2VNODELOCK(vp);

        ret = true;
        if (vlp >= cel->vlp[1]) {
                /*
                 * Not below the highest lock held, so blocking here keeps the
                 * address order.  vlp may equal vlp[1]; the vnode locks are
                 * initialized with MTX_RECURSE in nchinit().
                 */
                mtx_lock(vlp);
        } else {
                /* Out of order: only a non-blocking attempt is allowed. */
                if (mtx_trylock(vlp))
                        goto out;
                cache_lock_vnodes_cel_3_failures++;
                /* Drop everything and take all locks again, lowest first. */
                cache_unlock_vnodes_cel(cel);
                if (vlp < cel->vlp[0]) {
                        mtx_lock(vlp);
                        mtx_lock(cel->vlp[0]);
                        mtx_lock(cel->vlp[1]);
                } else {
                        /* vlp[0] may be an empty slot. */
                        if (cel->vlp[0] != NULL)
                                mtx_lock(cel->vlp[0]);
                        mtx_lock(vlp);
                        mtx_lock(cel->vlp[1]);
                }
                ret = false;
        }
out:
        cel->vlp[2] = vlp;
        return (ret);
}
1946
1947 static void
1948 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1949     struct mtx *blp2)
1950 {
1951
1952         MPASS(cel->blp[0] == NULL);
1953         MPASS(cel->blp[1] == NULL);
1954
1955         cache_sort_vnodes(&blp1, &blp2);
1956
1957         if (blp1 != NULL) {
1958                 mtx_lock(blp1);
1959                 cel->blp[0] = blp1;
1960         }
1961         mtx_lock(blp2);
1962         cel->blp[1] = blp2;
1963 }
1964
1965 static void
1966 cache_unlock_buckets_cel(struct celockstate *cel)
1967 {
1968
1969         if (cel->blp[0] != NULL)
1970                 mtx_unlock(cel->blp[0]);
1971         mtx_unlock(cel->blp[1]);
1972 }
1973
/*
 * Lock part of the cache affected by the insertion.
 *
 * This means vnodelocks for dvp, vp and the relevant bucketlock.
 * However, insertion can result in removal of an old entry. In this
 * case we have an additional vnode and bucketlock pair to lock.
 *
 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
 * preserving the locking order (smaller address first).
 *
 * The old entry in question is the ".." entry hanging off vp->v_cache_dd when
 * vp is a directory.  All locks taken are recorded in cel and are released
 * with cache_enter_unlock().
 */
static void
cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct mtx *blps[2];

        /* Bucket of the entry being inserted. */
        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                /* Only a directory can carry a ".." entry to get rid of. */
                if (vp == NULL || vp->v_type != VDIR)
                        break;
                ncp = vp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == vp);
                /* Bucket of the ".." entry. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                /* A negative ".." entry has no target vnode to lock. */
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * All vnodes got re-locked. Re-validate the state and if
                 * nothing changed we are done. Otherwise restart.
                 */
                if (ncp == vp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                /* Clear the slots so that the next iteration starts clean. */
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        /* Bucket locks are taken last, after all vnode locks. */
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
2024
/*
 * Variant of cache_enter_lock() used when the ".." entry of dvp itself is
 * about to be removed (see cache_enter_dotdot_prep()).
 *
 * The structure is the same, except that the entry inspected is
 * dvp->v_cache_dd rather than vp->v_cache_dd and there is no check for the
 * vnode type.
 */
static void
cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
    uint32_t hash)
{
        struct namecache *ncp;
        struct mtx *blps[2];

        blps[0] = HASH2BUCKETLOCK(hash);
        for (;;) {
                blps[1] = NULL;
                cache_lock_vnodes_cel(cel, dvp, vp);
                ncp = dvp->v_cache_dd;
                if (ncp == NULL)
                        break;
                if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
                        break;
                MPASS(ncp->nc_dvp == dvp);
                /* Bucket of the existing ".." entry. */
                blps[1] = NCP2BUCKETLOCK(ncp);
                /* A negative ".." entry has no target vnode to lock. */
                if (ncp->nc_flag & NCF_NEGATIVE)
                        break;
                if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
                        break;
                /*
                 * The vnode locks were dropped and re-taken.  We are done if
                 * the state observed before still holds, otherwise restart.
                 */
                if (ncp == dvp->v_cache_dd &&
                    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
                    blps[1] == NCP2BUCKETLOCK(ncp) &&
                    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
                        break;
                /* Clear the slots so that the next iteration starts clean. */
                cache_unlock_vnodes_cel(cel);
                cel->vlp[0] = NULL;
                cel->vlp[1] = NULL;
                cel->vlp[2] = NULL;
        }
        /* Bucket locks are taken last, after all vnode locks. */
        cache_lock_buckets_cel(cel, blps[0], blps[1]);
}
2059
/*
 * Release everything taken by cache_enter_lock() or cache_enter_lock_dd():
 * the bucket locks first, then the vnode locks.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

        cache_unlock_buckets_cel(cel);
        cache_unlock_vnodes_cel(cel);
}
2067
2068 static void __noinline
2069 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2070     struct componentname *cnp)
2071 {
2072         struct celockstate cel;
2073         struct namecache *ncp;
2074         uint32_t hash;
2075         int len;
2076
2077         if (dvp->v_cache_dd == NULL)
2078                 return;
2079         len = cnp->cn_namelen;
2080         cache_celockstate_init(&cel);
2081         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2082         cache_enter_lock_dd(&cel, dvp, vp, hash);
2083         vn_seqc_write_begin(dvp);
2084         ncp = dvp->v_cache_dd;
2085         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2086                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2087                 cache_zap_locked(ncp);
2088         } else {
2089                 ncp = NULL;
2090         }
2091         dvp->v_cache_dd = NULL;
2092         vn_seqc_write_end(dvp);
2093         cache_enter_unlock(&cel);
2094         if (ncp != NULL)
2095                 cache_free(ncp);
2096 }
2097
/*
 * Add an entry to the cache.
 *
 * dvp is the directory and cnp carries the name.  vp is the vnode the name
 * resolves to, or NULL to record a negative entry.  When tsp is not NULL the
 * entry is allocated with room for timestamps and *tsp (plus *dtsp, when
 * given) is stored in it.
 *
 * Nothing is added when the name is ".", when the number of entries reaches
 * ncsize, when an entry for the same name in dvp already exists, or when a
 * ".." entry is requested and dvp->v_cache_dd got populated in the meantime.
 */
void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
    struct timespec *tsp, struct timespec *dtsp)
{
        struct celockstate cel;
        struct namecache *ncp, *n2, *ndd;
        struct namecache_ts *ncp_ts;
        struct nchashhead *ncpp;
        uint32_t hash;
        int flag;
        int len;
        u_long lnumcache;

        VNPASS(dvp != vp, dvp);
        VNPASS(!VN_IS_DOOMED(dvp), dvp);
        VNPASS(dvp->v_type != VNON, dvp);
        if (vp != NULL) {
                VNPASS(!VN_IS_DOOMED(vp), vp);
                VNPASS(vp->v_type != VNON, vp);
        }

#ifdef DEBUG_CACHE
        if (__predict_false(!doingcache))
                return;
#endif

        /* "." is never cached; ".." gets the NCF_ISDOTDOT treatment. */
        flag = 0;
        if (__predict_false(cnp->cn_nameptr[0] == '.')) {
                if (cnp->cn_namelen == 1)
                        return;
                if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
                        cache_enter_dotdot_prep(dvp, vp, cnp);
                        flag = NCF_ISDOTDOT;
                }
        }

        /*
         * Avoid blowout in namecache entries.
         *
         * Bugs:
         * 1. filesystems may end up trying to add an already existing entry
         * (for example this can happen after a cache miss during concurrent
         * lookup), in which case we will call cache_neg_evict despite not
         * adding anything.
         * 2. the routine may fail to free anything and no provisions are made
         * to make it try harder (see the inside for failure modes)
         * 3. it only ever looks at negative entries.
         */
        /* The slot is reserved up front and given back on every bail out. */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (cache_neg_evict_cond(lnumcache)) {
                lnumcache = atomic_load_long(&numcache);
        }
        if (__predict_false(lnumcache >= ncsize)) {
                atomic_subtract_long(&numcache, 1);
                counter_u64_add(numdrops, 1);
                return;
        }

        cache_celockstate_init(&cel);
        ndd = NULL;
        ncp_ts = NULL;

        /*
         * Calculate the hash key and setup as much of the new
         * namecache entry as possible before acquiring the lock.
         */
        ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
        /* NCF_WIP stays set until the entry is fully hooked up below. */
        ncp->nc_flag = flag | NCF_WIP;
        ncp->nc_vp = vp;
        if (vp == NULL)
                cache_neg_init(ncp);
        ncp->nc_dvp = dvp;
        if (tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
                ncp_ts->nc_time = *tsp;
                ncp_ts->nc_ticks = ticks;
                ncp_ts->nc_nc.nc_flag |= NCF_TS;
                if (dtsp != NULL) {
                        ncp_ts->nc_dotdottime = *dtsp;
                        ncp_ts->nc_nc.nc_flag |= NCF_DTS;
                }
        }
        len = ncp->nc_nlen = cnp->cn_namelen;
        hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
        memcpy(ncp->nc_name, cnp->cn_nameptr, len);
        ncp->nc_name[len] = '\0';
        cache_enter_lock(&cel, dvp, vp, hash);

        /*
         * See if this vnode or negative entry is already in the cache
         * with this name.  This can happen with concurrent lookups of
         * the same path name.
         */
        ncpp = NCHHASH(hash);
        CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
                if (n2->nc_dvp == dvp &&
                    n2->nc_nlen == cnp->cn_namelen &&
                    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
                        MPASS(cache_ncp_canuse(n2));
                        if ((n2->nc_flag & NCF_NEGATIVE) != 0)
                                KASSERT(vp == NULL,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, NULL, vp));
                        else
                                KASSERT(n2->nc_vp == vp,
                                    ("%s: found entry pointing to a different vnode (%p != %p)",
                                    __func__, n2->nc_vp, vp));
                        /*
                         * Entries are supposed to be immutable unless in the
                         * process of getting destroyed. Accommodating for
                         * changing timestamps is possible but not worth it.
                         * This should be harmless in terms of correctness, in
                         * the worst case resulting in an earlier expiration.
                         * Alternatively, the found entry can be replaced
                         * altogether.
                         */
                        MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
#if 0
                        if (tsp != NULL) {
                                KASSERT((n2->nc_flag & NCF_TS) != 0,
                                    ("no NCF_TS"));
                                n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
                                n2_ts->nc_time = ncp_ts->nc_time;
                                n2_ts->nc_ticks = ncp_ts->nc_ticks;
                                if (dtsp != NULL) {
                                        n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
                                        n2_ts->nc_nc.nc_flag |= NCF_DTS;
                                }
                        }
#endif
                        SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
                            vp);
                        goto out_unlock_free;
                }
        }

        if (flag == NCF_ISDOTDOT) {
                /*
                 * See if we are trying to add .. entry, but some other lookup
                 * has populated v_cache_dd pointer already.
                 */
                if (dvp->v_cache_dd != NULL)
                        goto out_unlock_free;
                KASSERT(vp == NULL || vp->v_type == VDIR,
                    ("wrong vnode type %p", vp));
                vn_seqc_write_begin(dvp);
                dvp->v_cache_dd = ncp;
                vn_seqc_write_end(dvp);
        }

        if (vp != NULL) {
                if (flag != NCF_ISDOTDOT) {
                        /*
                         * For this case, the cache entry maps both the
                         * directory name in it and the name ".." for the
                         * directory's parent.
                         */
                        vn_seqc_write_begin(vp);
                        /*
                         * An old ".." entry hanging off vp is zapped here and
                         * freed at the end, once the locks are dropped.
                         */
                        if ((ndd = vp->v_cache_dd) != NULL) {
                                if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
                                        cache_zap_locked(ndd);
                                else
                                        ndd = NULL;
                        }
                        vp->v_cache_dd = ncp;
                        vn_seqc_write_end(vp);
                } else if (vp->v_type != VDIR) {
                        if (vp->v_cache_dd != NULL) {
                                vn_seqc_write_begin(vp);
                                vp->v_cache_dd = NULL;
                                vn_seqc_write_end(vp);
                        }
                }
        }

        if (flag != NCF_ISDOTDOT) {
                /* The first entry hanging off dvp puts a hold on it. */
                if (LIST_EMPTY(&dvp->v_cache_src)) {
                        vhold(dvp);
                        counter_u64_add(numcachehv, 1);
                }
                LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
        }

        /*
         * If the entry is "negative", we place it into the
         * "negative" cache queue, otherwise, we place it into the
         * destination vnode's cache entries queue.
         */
        if (vp != NULL) {
                TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
                SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
                    vp);
        } else {
                if (cnp->cn_flags & ISWHITEOUT)
                        ncp->nc_flag |= NCF_WHITE;
                cache_neg_insert(ncp);
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    ncp->nc_name);
        }

        /*
         * Insert the new namecache entry into the appropriate chain
         * within the cache entries table.
         */
        CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);

        /* Order all of the stores above before the flag update below. */
        atomic_thread_fence_rel();
        /*
         * Mark the entry as fully constructed.
         * It is immutable past this point until its removal.
         */
        atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);

        cache_enter_unlock(&cel);
        if (ndd != NULL)
                cache_free(ndd);
        return;
out_unlock_free:
        /* Nothing was inserted: give back the reserved slot and the entry. */
        cache_enter_unlock(&cel);
        atomic_subtract_long(&numcache, 1);
        cache_free(ncp);
        return;
}
2324
/*
 * Return the smallest power of two which is strictly greater than val.
 *
 * If no such value fits in u_int (val has its top bit set), the largest
 * representable power of two is returned instead.  Without this cap the
 * running value would wrap to 0 and the loop would spin forever.
 */
static u_int
cache_roundup_2(u_int val)
{
        u_int res;

        for (res = 1; res <= val; res <<= 1) {
                /* One more shift would wrap to 0: res is as big as it gets. */
                if ((u_int)(res << 1) == 0)
                        break;
        }

        return (res);
}
2335
2336 static struct nchashhead *
2337 nchinittbl(u_long elements, u_long *hashmask)
2338 {
2339         struct nchashhead *hashtbl;
2340         u_long hashsize, i;
2341
2342         hashsize = cache_roundup_2(elements) / 2;
2343
2344         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2345         for (i = 0; i < hashsize; i++)
2346                 CK_SLIST_INIT(&hashtbl[i]);
2347         *hashmask = hashsize - 1;
2348         return (hashtbl);
2349 }
2350
/*
 * Free a hash table allocated by nchinittbl().  Only the table itself is
 * released, not any entries linked into its chains.
 */
static void
ncfreetbl(struct nchashhead *hashtbl)
{

        free(hashtbl, M_VFSCACHE);
}
2357
/*
 * Name cache initialization, from vfs_init() when we are booting
 *
 * Creates the four allocation zones for cache entries (small/large name,
 * with and without timestamps), sizes the cache and its hash table from
 * desiredvnodes and initializes the bucket, vnode and negative list locks.
 */
static void
nchinit(void *dummy __unused)
{
        u_int i;

        cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
        cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
            NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);

        /* Entries are looked up locklessly, see cache_lookup(). */
        VFS_SMR_ZONE_SET(cache_zone_small);
        VFS_SMR_ZONE_SET(cache_zone_small_ts);
        VFS_SMR_ZONE_SET(cache_zone_large);
        VFS_SMR_ZONE_SET(cache_zone_large_ts);

        ncsize = desiredvnodes * ncsizefactor;
        cache_recalc_neg_min(ncnegminpct);
        nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
        /* Lock hash mask: scales with the CPU count, capped by the table. */
        ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
        if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
                ncbuckethash = 7;
        if (ncbuckethash > nchash)
                ncbuckethash = nchash;
        /*
         * NOTE(review): numbucketlocks and numvnodelocks are defined outside
         * of this function; presumably they are derived from ncbuckethash
         * and ncvnodehash, which would explain why those are assigned right
         * before each allocation -- confirm.
         */
        bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numbucketlocks; i++)
                mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
        ncvnodehash = ncbuckethash;
        vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
            M_WAITOK | M_ZERO);
        for (i = 0; i < numvnodelocks; i++)
                mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);

        /* Each negative list has two locks and two queues. */
        for (i = 0; i < numneglists; i++) {
                mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
                mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
                TAILQ_INIT(&neglists[i].nl_list);
                TAILQ_INIT(&neglists[i].nl_hotlist);
        }
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2406
/*
 * Set up the name cache fields of a vnode: no entries with the vnode as the
 * parent (v_cache_src), none pointing at it (v_cache_dst) and nothing hooked
 * up in v_cache_dd.
 */
void
cache_vnode_init(struct vnode *vp)
{

        LIST_INIT(&vp->v_cache_src);
        TAILQ_INIT(&vp->v_cache_dst);
        vp->v_cache_dd = NULL;
        /* NOTE(review): presumably precomputes per-vnode hash state -- confirm. */
        cache_prehash(vp);
}
2416
/*
 * Resize the name cache for a new vnode limit.
 *
 * A replacement hash table is allocated and, unless it turns out to have the
 * same size as the current one, all entries are rehashed into it while every
 * vnode and bucket lock is held.  The entry limit (ncsize) is updated as part
 * of the switch.
 */
void
cache_changesize(u_long newmaxvnodes)
{
        struct nchashhead *new_nchashtbl, *old_nchashtbl;
        u_long new_nchash, old_nchash;
        struct namecache *ncp;
        uint32_t hash;
        u_long newncsize;
        int i;

        /* The entry limit uses the raw value, the table size a rounded one. */
        newncsize = newmaxvnodes * ncsizefactor;
        newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
        if (newmaxvnodes < numbucketlocks)
                newmaxvnodes = numbucketlocks;

        /* Allocated before taking any lock; the allocation may sleep. */
        new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
        /* If same hash table size, nothing to do */
        if (nchash == new_nchash) {
                ncfreetbl(new_nchashtbl);
                return;
        }
        /*
         * Move everything from the old hash table to the new table.
         * None of the namecache entries in the table can be removed
         * because to do so, they have to be removed from the hash table.
         */
        cache_lock_all_vnodes();
        cache_lock_all_buckets();
        old_nchashtbl = nchashtbl;
        old_nchash = nchash;
        /* Switch first: NCHHASH() below has to resolve to the new table. */
        nchashtbl = new_nchashtbl;
        nchash = new_nchash;
        /*
         * old_nchash is a mask, hence the inclusive bound.
         * NOTE(review): i is an int compared against an u_long.
         */
        for (i = 0; i <= old_nchash; i++) {
                while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
                        hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
                            ncp->nc_dvp);
                        CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
                        CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
                }
        }
        ncsize = newncsize;
        cache_recalc_neg_min(ncnegminpct);
        cache_unlock_all_buckets();
        cache_unlock_all_vnodes();
        /* The old table is empty by now. */
        ncfreetbl(old_nchashtbl);
}
2463
/*
 * Invalidate all entries from and to a particular vnode.
 *
 * That is: entries with vp as the parent directory (v_cache_src), entries
 * resolving to vp (v_cache_dst) and the ".." entry in v_cache_dd.
 *
 * Removed entries are collected on a local list, reusing their nc_dst
 * linkage, and are freed only after the vnode locks have been dropped.
 */
static void
cache_purge_impl(struct vnode *vp)
{
        TAILQ_HEAD(, namecache) ncps;
        struct namecache *ncp, *nnp;
        struct mtx *vlp, *vlp2;

        TAILQ_INIT(&ncps);
        vlp = VP2VNODELOCK(vp);
        /*
         * Second vnode lock, managed by cache_zap_locked_vnode_kl2() through
         * its last argument.  If it is still set at the end, it gets
         * released here.
         */
        vlp2 = NULL;
        mtx_lock(vlp);
retry:
        /*
         * A failed zap restarts from the top: all three sources of entries
         * are examined again.
         */
        while (!LIST_EMPTY(&vp->v_cache_src)) {
                ncp = LIST_FIRST(&vp->v_cache_src);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
                ncp = TAILQ_FIRST(&vp->v_cache_dst);
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        /*
         * With both lists drained, anything left in v_cache_dd is expected
         * to be a ".." entry.
         */
        ncp = vp->v_cache_dd;
        if (ncp != NULL) {
                KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
                   ("lost dotdot link"));
                if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
                        goto retry;
                TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
        }
        KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
        mtx_unlock(vlp);
        if (vlp2 != NULL)
                mtx_unlock(vlp2);
        /* _SAFE variant: cache_free() invalidates the element being visited. */
        TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
                cache_free(ncp);
        }
}
2507
2508 /*
2509  * Opportunistic check to see if there is anything to do.
2510  */
2511 static bool
2512 cache_has_entries(struct vnode *vp)
2513 {
2514
2515         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2516             vp->v_cache_dd == NULL)
2517                 return (false);
2518         return (true);
2519 }
2520
2521 void
2522 cache_purge(struct vnode *vp)
2523 {
2524
2525         SDT_PROBE1(vfs, namecache, purge, done, vp);
2526         if (!cache_has_entries(vp))
2527                 return;
2528         cache_purge_impl(vp);
2529 }
2530
/*
 * Only to be used by vgone.
 *
 * Same job as cache_purge() for a vnode which is already doomed.  When the
 * vnode appears to have no entries, the check is repeated after waiting for
 * the vnode lock to be free.
 */
void
cache_purge_vgone(struct vnode *vp)
{
        struct mtx *lock;

        VNPASS(VN_IS_DOOMED(vp), vp);
        if (!cache_has_entries(vp)) {
                /*
                 * Serialize against a potential thread doing cache_purge.
                 */
                lock = VP2VNODELOCK(vp);
                mtx_wait_unlocked(lock);
                if (!cache_has_entries(vp))
                        return;
        }
        cache_purge_impl(vp);
}
2556
2557 /*
2558  * Invalidate all negative entries for a particular directory vnode.
2559  */
2560 void
2561 cache_purge_negative(struct vnode *vp)
2562 {
2563         TAILQ_HEAD(, namecache) ncps;
2564         struct namecache *ncp, *nnp;
2565         struct mtx *vlp;
2566
2567         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2568         if (LIST_EMPTY(&vp->v_cache_src))
2569                 return;
2570         TAILQ_INIT(&ncps);
2571         vlp = VP2VNODELOCK(vp);
2572         mtx_lock(vlp);
2573         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2574                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2575                         continue;
2576                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2577                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2578         }
2579         mtx_unlock(vlp);
2580         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2581                 cache_free(ncp);
2582         }
2583 }
2584
/*
 * Namecache maintenance for a rename.
 *
 * fvp is the vnode being renamed, tdvp/tcnp describe the target directory and
 * name, and tvp is the vnode currently found at the target (NULL if none).
 * Every non-NULL vnode argument is checked with ASSERT_VOP_IN_SEQC().  fdvp is
 * only used for that check and fcnp is not used at all.
 *
 * The entries of fvp (and of tvp, when present) are purged.  When there is no
 * tvp, cache_remove_cnp() is additionally called for the target name.
 */
2585 void
2586 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2587     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2588 {
2589
2590         ASSERT_VOP_IN_SEQC(fdvp);
2591         ASSERT_VOP_IN_SEQC(fvp);
2592         ASSERT_VOP_IN_SEQC(tdvp);
2593         if (tvp != NULL)
2594                 ASSERT_VOP_IN_SEQC(tvp);
2595
2596         cache_purge(fvp);
2597         if (tvp != NULL) {
2598                 cache_purge(tvp);
                     /*
                      * NOTE(review): cache_remove_cnp() is invoked from inside
                      * the KASSERT() condition, so it only runs in builds where
                      * KASSERT evaluates its argument -- confirm that skipping
                      * it otherwise is intended.
                      */
2599                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2600                     ("%s: lingering negative entry", __func__));
2601         } else {
2602                 cache_remove_cnp(tdvp, tcnp);
2603         }
2604 }
2605
#ifdef INVARIANTS
/*
 * Validate that if an entry exists it matches.
 *
 * The hash chain for the (dvp, name) pair is walked with the bucket lock held;
 * an entry with the same directory and name which points at a vnode other
 * than vp results in a panic.
 */
void
cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
        struct namecache *ncp;
        struct mtx *blp;
        uint32_t hash;

        hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
        if (CK_SLIST_EMPTY(NCHHASH(hash)))
                return;
        blp = HASH2BUCKETLOCK(hash);
        mtx_lock(blp);
        CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
                /* Skip entries for other directories or other names. */
                if (ncp->nc_dvp != dvp || ncp->nc_nlen != cnp->cn_namelen)
                        continue;
                if (bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) != 0)
                        continue;
                if (ncp->nc_vp == vp)
                        continue;
                panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
                    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp,
                    ncp->nc_vp);
        }
        mtx_unlock(blp);
}
#endif
2634
2635 /*
2636  * Flush all entries referencing a particular filesystem.
2637  */
2638 void
2639 cache_purgevfs(struct mount *mp)
2640 {
2641         struct vnode *vp, *mvp;
2642
2643         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2644         /*
2645          * Somewhat wasteful iteration over all vnodes. Would be better to
2646          * support filtering and avoid the interlock to begin with.
2647          */
2648         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2649                 if (!cache_has_entries(vp)) {
2650                         VI_UNLOCK(vp);
2651                         continue;
2652                 }
2653                 vholdl(vp);
2654                 VI_UNLOCK(vp);
2655                 cache_purge(vp);
2656                 vdrop(vp);
2657         }
2658 }
2659
2660 /*
2661  * Perform canonical checks and cache lookup and pass on to filesystem
2662  * through the vop_cachedlookup only if needed.
2663  */
2664
2665 int
2666 vfs_cache_lookup(struct vop_lookup_args *ap)
2667 {
2668         struct vnode *dvp;
2669         int error;
2670         struct vnode **vpp = ap->a_vpp;
2671         struct componentname *cnp = ap->a_cnp;
2672         int flags = cnp->cn_flags;
2673
2674         *vpp = NULL;
2675         dvp = ap->a_dvp;
2676
2677         if (dvp->v_type != VDIR)
2678                 return (ENOTDIR);
2679
2680         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2681             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2682                 return (EROFS);
2683
2684         error = vn_dir_check_exec(dvp, cnp);
2685         if (error != 0)
2686                 return (error);
2687
2688         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2689         if (error == 0)
2690                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2691         if (error == -1)
2692                 return (0);
2693         return (error);
2694 }
2695
2696 /* Implementation of the getcwd syscall. */
2697 int
2698 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2699 {
2700         char *buf, *retbuf;
2701         size_t buflen;
2702         int error;
2703
2704         buflen = uap->buflen;
2705         if (__predict_false(buflen < 2))
2706                 return (EINVAL);
2707         if (buflen > MAXPATHLEN)
2708                 buflen = MAXPATHLEN;
2709
2710         buf = uma_zalloc(namei_zone, M_WAITOK);
2711         error = vn_getcwd(buf, &retbuf, &buflen);
2712         if (error == 0)
2713                 error = copyout(retbuf, uap->buf, buflen);
2714         uma_zfree(namei_zone, buf);
2715         return (error);
2716 }
2717
/*
 * Resolve the current directory (pwd_cdir) into a path relative to the root
 * directory (pwd_rdir) of the pwd structure.
 *
 * buf is caller-provided storage of *buflen bytes.  On success *retbuf points
 * at the start of the path inside buf and *buflen has been updated by the
 * callee to the number of bytes used.  Returns 0 or an errno value.
 */
2718 int
2719 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2720 {
2721         struct pwd *pwd;
2722         int error;
2723
          /*
           * Try the lockless lookup first.  It leaves the SMR section on its
           * own, which the assertion below double checks.
           */
2724         vfs_smr_enter();
2725         pwd = pwd_get_smr();
2726         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2727             buflen, 0);
2728         VFS_SMR_ASSERT_NOT_ENTERED();
2729         if (error < 0) {
                  /*
                   * A negative value asks for the locked fallback; positive
                   * values are real errors and are returned as is.
                   */
2730                 pwd = pwd_hold(curthread);
2731                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2732                     retbuf, buflen);
2733                 pwd_drop(pwd);
2734         }
2735
2736 #ifdef KTRACE
2737         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2738                 ktrnamei(*retbuf);
2739 #endif
2740         return (error);
2741 }
2742
2743 static int
2744 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2745     size_t size, int flags, enum uio_seg pathseg)
2746 {
2747         struct nameidata nd;
2748         char *retbuf, *freebuf;
2749         int error;
2750
2751         if (flags != 0)
2752                 return (EINVAL);
2753         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2754             pathseg, path, fd, &cap_fstat_rights, td);
2755         if ((error = namei(&nd)) != 0)
2756                 return (error);
2757         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2758         if (error == 0) {
2759                 error = copyout(retbuf, buf, size);
2760                 free(freebuf, M_TEMP);
2761         }
2762         NDFREE(&nd, 0);
2763         return (error);
2764 }
2765
/*
 * Syscall entry point: forwards the user-supplied arguments to
 * kern___realpathat() with the path taken from user space.
 */
2766 int
2767 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2768 {
2769
2770         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2771             uap->flags, UIO_USERSPACE));
2772 }
2773
2774 /*
2775  * Retrieve the full filesystem path that correspond to a vnode from the name
2776  * cache (if available)
2777  */
2778 int
2779 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2780 {
2781         struct pwd *pwd;
2782         char *buf;
2783         size_t buflen;
2784         int error;
2785
2786         if (__predict_false(vp == NULL))
2787                 return (EINVAL);
2788
2789         buflen = MAXPATHLEN;
2790         buf = malloc(buflen, M_TEMP, M_WAITOK);
2791         vfs_smr_enter();
2792         pwd = pwd_get_smr();
2793         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2794         VFS_SMR_ASSERT_NOT_ENTERED();
2795         if (error < 0) {
2796                 pwd = pwd_hold(curthread);
2797                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2798                 pwd_drop(pwd);
2799         }
2800         if (error == 0)
2801                 *freebuf = buf;
2802         else
2803                 free(buf, M_TEMP);
2804         return (error);
2805 }
2806
2807 /*
2808  * This function is similar to vn_fullpath, but it attempts to lookup the
2809  * pathname relative to the global root mount point.  This is required for the
2810  * auditing sub-system, as audited pathnames must be absolute, relative to the
2811  * global root mount point.
2812  */
2813 int
2814 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2815 {
2816         char *buf;
2817         size_t buflen;
2818         int error;
2819
2820         if (__predict_false(vp == NULL))
2821                 return (EINVAL);
2822         buflen = MAXPATHLEN;
2823         buf = malloc(buflen, M_TEMP, M_WAITOK);
2824         vfs_smr_enter();
2825         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2826         VFS_SMR_ASSERT_NOT_ENTERED();
2827         if (error < 0) {
2828                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2829         }
2830         if (error == 0)
2831                 *freebuf = buf;
2832         else
2833                 free(buf, M_TEMP);
2834         return (error);
2835 }
2836
2837 static struct namecache *
2838 vn_dd_from_dst(struct vnode *vp)
2839 {
2840         struct namecache *ncp;
2841
2842         cache_assert_vnode_locked(vp);
2843         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2844                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2845                         return (ncp);
2846         }
2847         return (NULL);
2848 }
2849
/*
 * Prepend the name of *vp to the part of buf ending at offset *buflen and
 * step *vp up to its parent directory.
 *
 * The name is taken from the namecache when a suitable entry exists; otherwise
 * the filesystem is asked through VOP_VPTOCNP().  No path separator is added
 * here.
 *
 * *vp must be referenced on entry and that reference is always consumed.  On
 * success 0 is returned and *vp is the parent vnode carrying a reference.  On
 * failure an errno value is returned and no reference is held.
 */
2850 int
2851 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2852 {
2853         struct vnode *dvp;
2854         struct namecache *ncp;
2855         struct mtx *vlp;
2856         int error;
2857
2858         vlp = VP2VNODELOCK(*vp);
2859         mtx_lock(vlp);
          /*
           * Prefer the entry recorded in v_cache_dd when it is not a dotdot
           * entry; otherwise scan v_cache_dst for a usable one.
           */
2860         ncp = (*vp)->v_cache_dd;
2861         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2862                 KASSERT(ncp == vn_dd_from_dst(*vp),
2863                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2864                     ncp, vn_dd_from_dst(*vp)));
2865         } else {
2866                 ncp = vn_dd_from_dst(*vp);
2867         }
2868         if (ncp != NULL) {
2869                 if (*buflen < ncp->nc_nlen) {
2870                         mtx_unlock(vlp);
2871                         vrele(*vp);
2872                         counter_u64_add(numfullpathfail4, 1);
2873                         error = ENOMEM;
2874                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2875                             vp, NULL);
2876                         return (error);
2877                 }
2878                 *buflen -= ncp->nc_nlen;
2879                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2880                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2881                     ncp->nc_name, vp);
                  /*
                   * Despite its name, dvp temporarily holds the old (child)
                   * vnode here.  The parent is referenced while the vnode lock
                   * is still held and the child is released after unlocking.
                   */
2882                 dvp = *vp;
2883                 *vp = ncp->nc_dvp;
2884                 vref(*vp);
2885                 mtx_unlock(vlp);
2886                 vrele(dvp);
2887                 return (0);
2888         }
2889         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2890
          /* Nothing in the namecache: ask the filesystem instead. */
2891         mtx_unlock(vlp);
2892         vn_lock(*vp, LK_SHARED | LK_RETRY);
2893         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2894         vput(*vp);
2895         if (error) {
2896                 counter_u64_add(numfullpathfail2, 1);
2897                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2898                 return (error);
2899         }
2900
2901         *vp = dvp;
2902         if (VN_IS_DOOMED(dvp)) {
2903                 /* forced unmount */
2904                 vrele(dvp);
2905                 error = ENOENT;
2906                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2907                 return (error);
2908         }
2909         /*
2910          * *vp has its use count incremented still.
2911          */
2912
2913         return (0);
2914 }
2915
2916 /*
2917  * Resolve a directory to a pathname.
2918  *
2919  * The name of the directory can always be found in the namecache or fetched
2920  * from the filesystem. There is also guaranteed to be only one parent, meaning
2921  * we can just follow vnodes up until we find the root.
2922  *
2923  * The vnode must be referenced.
      *
      * The path is assembled backwards, from the end of the available space
      * towards the start of buf, stopping once rdir or rootvnode is reached.
      * *len is the number of bytes of buf available on entry.  addend is the
      * number of bytes the caller already stored right behind that space; if it
      * is 0 the NUL terminator is written here.
      *
      * On success 0 is returned, *retbuf points at the first character of the
      * path and *len is set to the number of bytes used plus addend.  On failure
      * an errno value is returned and *len is left unchanged.  The reference on
      * vp is consumed in both cases.
2924  */
2925 static int
2926 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2927     size_t *len, size_t addend)
2928 {
2929 #ifdef KDTRACE_HOOKS
2930         struct vnode *startvp = vp;
2931 #endif
2932         struct vnode *vp1;
2933         size_t buflen;
2934         int error;
2935         bool slash_prefixed;
2936
2937         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2938         VNPASS(vp->v_usecount > 0, vp);
2939
2940         buflen = *len;
2941
          /*
           * With no tail supplied by the caller the string is terminated here
           * and slash_prefixed starts out false, so that a lone "/" can be
           * emitted below if the loop produces no component.
           */
2942         slash_prefixed = true;
2943         if (addend == 0) {
2944                 MPASS(*len >= 2);
2945                 buflen--;
2946                 buf[buflen] = '\0';
2947                 slash_prefixed = false;
2948         }
2949
2950         error = 0;
2951
2952         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2953         counter_u64_add(numfullpathcalls, 1);
2954         while (vp != rdir && vp != rootvnode) {
2955                 /*
2956                  * The vp vnode must be already fully constructed,
2957                  * since it is either found in namecache or obtained
2958                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2959                  * without obtaining the vnode lock.
2960                  */
2961                 if ((vp->v_vflag & VV_ROOT) != 0) {
2962                         vn_lock(vp, LK_RETRY | LK_SHARED);
2963
2964                         /*
2965                          * With the vnode locked, check for races with
2966                          * unmount, forced or not.  Note that we
2967                          * already verified that vp is not equal to
2968                          * the root vnode, which means that
2969                          * mnt_vnodecovered can be NULL only for the
2970                          * case of unmount.
2971                          */
2972                         if (VN_IS_DOOMED(vp) ||
2973                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2974                             vp1->v_mountedhere != vp->v_mount) {
2975                                 vput(vp);
2976                                 error = ENOENT;
2977                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2978                                     error, vp, NULL);
2979                                 break;
2980                         }
2981
                          /* Cross the mount point: continue from the covered vnode. */
2982                         vref(vp1);
2983                         vput(vp);
2984                         vp = vp1;
2985                         continue;
2986                 }
2987                 if (vp->v_type != VDIR) {
2988                         vrele(vp);
2989                         counter_u64_add(numfullpathfail1, 1);
2990                         error = ENOTDIR;
2991                         SDT_PROBE3(vfs, namecache, fullpath, return,
2992                             error, vp, NULL);
2993                         break;
2994                 }
                  /* On failure vn_vptocnp() has already dropped the reference. */
2995                 error = vn_vptocnp(&vp, buf, &buflen);
2996                 if (error)
2997                         break;
2998                 if (buflen == 0) {
2999                         vrele(vp);
3000                         error = ENOMEM;
3001                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3002                             startvp, NULL);
3003                         break;
3004                 }
3005                 buf[--buflen] = '/';
3006                 slash_prefixed = true;
3007         }
3008         if (error)
3009                 return (error);
3010         if (!slash_prefixed) {
                  /* No component was emitted: the result is just "/". */
3011                 if (buflen == 0) {
3012                         vrele(vp);
3013                         counter_u64_add(numfullpathfail4, 1);
3014                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3015                             startvp, NULL);
3016                         return (ENOMEM);
3017                 }
3018                 buf[--buflen] = '/';
3019         }
3020         counter_u64_add(numfullpathfound, 1);
3021         vrele(vp);
3022
3023         *retbuf = buf + buflen;
3024         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3025         *len -= buflen;
3026         *len += addend;
3027         return (0);
3028 }
3029
3030 /*
3031  * Resolve an arbitrary vnode to a pathname.
3032  *
3033  * Note 2 caveats:
3034  * - hardlinks are not tracked, thus if the vnode is not a directory this can
3035  *   resolve to a different path than the one used to find it
3036  * - namecache is not mandatory, meaning names are not guaranteed to be added
3037  *   (in which case resolving fails)
3038  */
/*
 * Helper for the lockless reverse lookup: store the source line of the bail
 * out site in *reason.  Use it through the cache_rev_failed() macro, which
 * supplies __LINE__.
 */
3039 static void __inline
3040 cache_rev_failed_impl(int *reason, int line)
3041 {
3042
3043         *reason = line;
3044 }
3045 #define cache_rev_failed(var)   cache_rev_failed_impl((var), __LINE__)
3046
/*
 * Lockless reverse lookup which relies solely on the namecache.
 *
 * Must be called with the vfs SMR section entered; the section is exited on
 * every return path.  The path is built backwards in buf.  *buflen is the
 * number of bytes available and addend the number of bytes the caller already
 * stored right behind them (0 means the NUL terminator is written here).
 *
 * Returns:
 *   0       success; *retbuf points at the path and *buflen is the number of
 *           bytes used plus addend
 *   ENOMEM  a name did not fit
 *   -1      the lookup could not be completed locklessly (feature disabled, a
 *           vnode being modified or changed in the meantime, a missing or
 *           unusable namecache entry); callers fall back to the locked code
 *
 * On failure *buflen holds the value it had on entry.
 */
3047 static int
3048 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3049     char **retbuf, size_t *buflen, size_t addend)
3050 {
3051 #ifdef KDTRACE_HOOKS
3052         struct vnode *startvp = vp;
3053 #endif
3054         struct vnode *tvp;
3055         struct mount *mp;
3056         struct namecache *ncp;
3057         size_t orig_buflen;
3058         int reason;
3059         int error;
3060 #ifdef KDTRACE_HOOKS
3061         int i;
3062 #endif
3063         seqc_t vp_seqc, tvp_seqc;
3064         u_char nc_flag;
3065
3066         VFS_SMR_ASSERT_ENTERED();
3067
3068         if (!cache_fast_revlookup) {
3069                 vfs_smr_exit();
3070                 return (-1);
3071         }
3072
3073         orig_buflen = *buflen;
3074
3075         if (addend == 0) {
3076                 MPASS(*buflen >= 2);
3077                 *buflen -= 1;
3078                 buf[*buflen] = '\0';
3079         }
3080
          /* Already at the root: with no caller-provided tail the path is "/". */
3081         if (vp == rdir || vp == rootvnode) {
3082                 if (addend == 0) {
3083                         *buflen -= 1;
3084                         buf[*buflen] = '/';
3085                 }
3086                 goto out_ok;
3087         }
3088
3089 #ifdef KDTRACE_HOOKS
3090         i = 0;
3091 #endif
3092         error = -1;
3093         ncp = NULL; /* for sdt probe down below */
3094         vp_seqc = vn_seqc_read_any(vp);
3095         if (seqc_in_modify(vp_seqc)) {
3096                 cache_rev_failed(&reason);
3097                 goto out_abort;
3098         }
3099
          /*
           * Walk up one vnode per iteration.  The sequence counter of the next
           * vnode is sampled before the current one is re-validated.
           */
3100         for (;;) {
3101 #ifdef KDTRACE_HOOKS
3102                 i++;
3103 #endif
3104                 if ((vp->v_vflag & VV_ROOT) != 0) {
                          /* Filesystem root: continue from the covered vnode. */
3105                         mp = atomic_load_ptr(&vp->v_mount);
3106                         if (mp == NULL) {
3107                                 cache_rev_failed(&reason);
3108                                 goto out_abort;
3109                         }
3110                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3111                         tvp_seqc = vn_seqc_read_any(tvp);
3112                         if (seqc_in_modify(tvp_seqc)) {
3113                                 cache_rev_failed(&reason);
3114                                 goto out_abort;
3115                         }
3116                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3117                                 cache_rev_failed(&reason);
3118                                 goto out_abort;
3119                         }
3120                         vp = tvp;
3121                         vp_seqc = tvp_seqc;
3122                         continue;
3123                 }
3124                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3125                 if (ncp == NULL) {
3126                         cache_rev_failed(&reason);
3127                         goto out_abort;
3128                 }
3129                 nc_flag = atomic_load_char(&ncp->nc_flag);
3130                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3131                         cache_rev_failed(&reason);
3132                         goto out_abort;
3133                 }
3134                 if (!cache_ncp_canuse(ncp)) {
3135                         cache_rev_failed(&reason);
3136                         goto out_abort;
3137                 }
                  /* ">=" leaves room for the '/' stored in front of the name. */
3138                 if (ncp->nc_nlen >= *buflen) {
3139                         cache_rev_failed(&reason);
3140                         error = ENOMEM;
3141                         goto out_abort;
3142                 }
3143                 *buflen -= ncp->nc_nlen;
3144                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3145                 *buflen -= 1;
3146                 buf[*buflen] = '/';
3147                 tvp = ncp->nc_dvp;
3148                 tvp_seqc = vn_seqc_read_any(tvp);
3149                 if (seqc_in_modify(tvp_seqc)) {
3150                         cache_rev_failed(&reason);
3151                         goto out_abort;
3152                 }
3153                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3154                         cache_rev_failed(&reason);
3155                         goto out_abort;
3156                 }
3157                 vp = tvp;
3158                 vp_seqc = tvp_seqc;
3159                 if (vp == rdir || vp == rootvnode)
3160                         break;
3161         }
3162 out_ok:
3163         vfs_smr_exit();
3164         *retbuf = buf + *buflen;
3165         *buflen = orig_buflen - *buflen + addend;
3166         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3167         return (0);
3168
3169 out_abort:
          /* Undo the length adjustments so that the caller can start over. */
3170         *buflen = orig_buflen;
3171         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3172         vfs_smr_exit();
3173         return (error);
3174 }
3175
/*
 * Reverse lookup of an arbitrary vnode using the locked primitives.
 *
 * For a non-directory the name of the vnode itself is resolved first with
 * vn_vptocnp() and stored at the tail of buf, NUL-terminated and preceded by
 * '/'.  The rest of the path is then produced by vn_fullpath_dir() starting
 * from the parent, with addend telling it how many bytes are already in
 * place.
 *
 * The function takes its own reference on vp, which the callees consume.
 * Returns EINVAL if *buflen is smaller than 2, otherwise 0 or an errno value
 * from the callees.
 */
3176 static int
3177 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3178     size_t *buflen)
3179 {
3180         size_t orig_buflen, addend;
3181         int error;
3182
3183         if (*buflen < 2)
3184                 return (EINVAL);
3185
3186         orig_buflen = *buflen;
3187
3188         vref(vp);
3189         addend = 0;
3190         if (vp->v_type != VDIR) {
3191                 *buflen -= 1;
3192                 buf[*buflen] = '\0';
                  /* On failure vn_vptocnp() has already dropped the reference. */
3193                 error = vn_vptocnp(&vp, buf, buflen);
3194                 if (error)
3195                         return (error);
3196                 if (*buflen == 0) {
3197                         vrele(vp);
3198                         return (ENOMEM);
3199                 }
3200                 *buflen -= 1;
3201                 buf[*buflen] = '/';
                  /* Bytes used so far: "/<name>" and the terminator. */
3202                 addend = orig_buflen - *buflen;
3203         }
3204
3205         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3206 }
3207
3208 /*
3209  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3210  *
3211  * Since the namecache does not track handlings, the caller is expected to first
3212  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3213  *
3214  * Then we have 2 cases:
3215  * - if the found vnode is a directory, the path can be constructed just by
3216  *   fullowing names up the chain
3217  * - otherwise we populate the buffer with the saved name and start resolving
3218  *   from the parent
3219  */
3220 static int
3221 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3222     size_t *buflen)
3223 {
3224         char *buf, *tmpbuf;
3225         struct pwd *pwd;
3226         struct componentname *cnp;
3227         struct vnode *vp;
3228         size_t addend;
3229         int error;
3230         enum vtype type;
3231
3232         if (*buflen < 2)
3233                 return (EINVAL);
3234         if (*buflen > MAXPATHLEN)
3235                 *buflen = MAXPATHLEN;
3236
3237         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3238
3239         addend = 0;
3240         vp = ndp->ni_vp;
3241         /*
3242          * Check for VBAD to work around the vp_crossmp bug in lookup().
3243          *
3244          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3245          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3246          * If the type is VDIR (like in this very case) we can skip looking
3247          * at ni_dvp in the first place. However, since vnodes get passed here
3248          * unlocked the target may transition to doomed state (type == VBAD)
3249          * before we get to evaluate the condition. If this happens, we will
3250          * populate part of the buffer and descend to vn_fullpath_dir with
3251          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3252          *
3253          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3254          * an address of a bit field, even if said field is sized to char.
3255          * Work around the problem by reading the value into a full-sized enum
3256          * and then re-reading it with atomic_load which will still prevent
3257          * the compiler from re-reading down the road.
3258          */
3259         type = vp->v_type;
3260         type = atomic_load_int(&type);
3261         if (type == VBAD) {
3262                 error = ENOENT;
3263                 goto out_bad;
3264         }
3265         if (type != VDIR) {
3266                 cnp = &ndp->ni_cnd;
3267                 addend = cnp->cn_namelen + 2;
3268                 if (*buflen < addend) {
3269                         error = ENOMEM;
3270                         goto out_bad;
3271                 }
3272                 *buflen -= addend;
3273                 tmpbuf = buf + *buflen;
3274                 tmpbuf[0] = '/';
3275                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3276                 tmpbuf[addend - 1] = '\0';
3277                 vp = ndp->ni_dvp;
3278         }
3279
3280         vfs_smr_enter();
3281         pwd = pwd_get_smr();
3282         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3283             addend);
3284         VFS_SMR_ASSERT_NOT_ENTERED();
3285         if (error < 0) {
3286                 pwd = pwd_hold(curthread);
3287                 vref(vp);
3288                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3289                     addend);
3290                 pwd_drop(pwd);
3291                 if (error != 0)
3292                         goto out_bad;
3293         }
3294
3295         *freebuf = buf;
3296
3297         return (0);
3298 out_bad:
3299         free(buf, M_TEMP);
3300         return (error);
3301 }
3302
/*
 * Return the directory recorded in the namecache as containing vp, or NULL.
 *
 * The first entry on v_cache_dst which is not a dotdot entry is used; its
 * nc_dvp is obtained with vget using LK_SHARED | LK_NOWAIT.  NULL is returned
 * when there is no such entry or when the vget fails.  vp must be locked by
 * the caller.
 *
 * NOTE(review): the returned vnode is presumably referenced and share-locked
 * as per vget_finish() -- confirm.
 */
3303 struct vnode *
3304 vn_dir_dd_ino(struct vnode *vp)
3305 {
3306         struct namecache *ncp;
3307         struct vnode *ddvp;
3308         struct mtx *vlp;
3309         enum vgetstate vs;
3310
3311         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3312         vlp = VP2VNODELOCK(vp);
3313         mtx_lock(vlp);
3314         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3315                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3316                         continue;
                  /*
                   * The first usable entry decides the outcome.  vget_prep()
                   * runs under the vnode lock; the lock is dropped before the
                   * vget is finished.
                   */
3317                 ddvp = ncp->nc_dvp;
3318                 vs = vget_prep(ddvp);
3319                 mtx_unlock(vlp);
3320                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3321                         return (NULL);
3322                 return (ddvp);
3323         }
3324         mtx_unlock(vlp);
3325         return (NULL);
3326 }
3327
/*
 * Copy the name under which vp is cached (taken from the first non-dotdot
 * entry on v_cache_dst) into buf.
 *
 * At most buflen - 1 bytes of the name are copied and the result is always
 * NUL-terminated, so longer names are silently truncated.  Returns ENOENT if
 * the vnode has no such entry and 0 otherwise.
 *
 * NOTE(review): buflen is unsigned, so a buflen of 0 makes buflen - 1 wrap
 * around and the copy would not be bounded by the buffer.  Callers presumably
 * always pass a non-empty buffer -- confirm.
 */
3328 int
3329 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3330 {
3331         struct namecache *ncp;
3332         struct mtx *vlp;
3333         int l;
3334
3335         vlp = VP2VNODELOCK(vp);
3336         mtx_lock(vlp);
3337         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3338                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3339                         break;
          /* ncp is NULL if the loop ran off the end of the list. */
3340         if (ncp == NULL) {
3341                 mtx_unlock(vlp);
3342                 return (ENOENT);
3343         }
3344         l = min(ncp->nc_nlen, buflen - 1);
3345         memcpy(buf, ncp->nc_name, l);
3346         mtx_unlock(vlp);
3347         buf[l] = '\0';
3348         return (0);
3349 }
3350
3351 /*
3352  * This function updates path string to vnode's full global path
3353  * and checks the size of the new path string against the pathlen argument.
3354  *
3355  * Requires a locked, referenced vnode.
3356  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3357  *
3358  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3359  * because it falls back to the ".." lookup if the namecache lookup fails.
      *
      * path is both input and output: it is looked up again with namei() to
      * verify that it still names vp and, if so, overwritten with the global
      * path.  ENAMETOOLONG is returned if the global path does not fit in
      * pathlen bytes and ENOENT if the lookup yields a different vnode.
      *
      * The reference passed in on vp is released on every path.  On success the
      * vnode is held through what namei() returned instead.
3360  */
3361 int
3362 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3363     u_int pathlen)
3364 {
3365         struct nameidata nd;
3366         struct vnode *vp1;
3367         char *rpath, *fbuf;
3368         int error;
3369
3370         ASSERT_VOP_ELOCKED(vp, __func__);
3371
3372         /* Construct global filesystem path from vp. */
3373         VOP_UNLOCK(vp);
3374         error = vn_fullpath_global(vp, &rpath, &fbuf);
3375
3376         if (error != 0) {
                  /* No buffer was handed out, so there is nothing to free. */
3377                 vrele(vp);
3378                 return (error);
3379         }
3380
3381         if (strlen(rpath) >= pathlen) {
3382                 vrele(vp);
3383                 error = ENAMETOOLONG;
3384                 goto out;
3385         }
3386
3387         /*
3388          * Re-lookup the vnode by path to detect a possible rename.
3389          * As a side effect, the vnode is relocked.
3390          * If vnode was renamed, return ENOENT.
3391          */
3392         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3393             UIO_SYSSPACE, path, td);
3394         error = namei(&nd);
3395         if (error != 0) {
3396                 vrele(vp);
3397                 goto out;
3398         }
3399         NDFREE(&nd, NDF_ONLY_PNBUF);
3400         vp1 = nd.ni_vp;
3401         vrele(vp);
          /*
           * Only the pointer value of vp is used from here on.  The strcpy() is
           * bounded by the length check above.
           */
3402         if (vp1 == vp)
3403                 strcpy(path, rpath);
3404         else {
3405                 vput(vp1);
3406                 error = ENOENT;
3407         }
3408
3409 out:
3410         free(fbuf, M_TEMP);
3411         return (error);
3412 }
3413
3414 #ifdef DDB
3415 static void
3416 db_print_vpath(struct vnode *vp)
3417 {
3418
3419         while (vp != NULL) {
3420                 db_printf("%p: ", vp);
3421                 if (vp == rootvnode) {
3422                         db_printf("/");
3423                         vp = NULL;
3424                 } else {
3425                         if (vp->v_vflag & VV_ROOT) {
3426                                 db_printf("<mount point>");
3427                                 vp = vp->v_mount->mnt_vnodecovered;
3428                         } else {
3429                                 struct namecache *ncp;
3430                                 char *ncn;
3431                                 int i;
3432
3433                                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
3434                                 if (ncp != NULL) {
3435                                         ncn = ncp->nc_name;
3436                                         for (i = 0; i < ncp->nc_nlen; i++)
3437                                                 db_printf("%c", *ncn++);
3438                                         vp = ncp->nc_dvp;
3439                                 } else {
3440                                         vp = NULL;
3441                                 }
3442                         }
3443                 }
3444                 db_printf("\n");
3445         }
3446
3447         return;
3448 }
3449
3450 DB_SHOW_COMMAND(vpath, db_show_vpath)
3451 {
3452         struct vnode *vp;
3453
3454         if (!have_addr) {
3455                 db_printf("usage: show vpath <struct vnode *>\n");
3456                 return;
3457         }
3458
3459         vp = (struct vnode *)addr;
3460         db_print_vpath(vp);
3461 }
3462
3463 #endif
3464
/*
 * Run-time switch for the fast path lookup.  cache_can_fplookup() aborts the
 * fast path whenever this is false.  Exported read-write as a sysctl under
 * the vfs node.
 */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Value returned by cache_fpl_aborted() and cache_fpl_partial() to tell the
 * caller the fast path did not produce a final result.
 */
#define CACHE_FPL_FAILED        -2020
3470
3471 static void
3472 cache_fpl_cleanup_cnp(struct componentname *cnp)
3473 {
3474
3475         uma_zfree(namei_zone, cnp->cn_pnbuf);
3476 #ifdef DIAGNOSTIC
3477         cnp->cn_pnbuf = NULL;
3478         cnp->cn_nameptr = NULL;
3479 #endif
3480 }
3481
3482 static void
3483 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3484 {
3485         struct componentname *cnp;
3486
3487         cnp = &ndp->ni_cnd;
3488         while (*(cnp->cn_nameptr) == '/') {
3489                 cnp->cn_nameptr++;
3490                 ndp->ni_pathlen--;
3491         }
3492
3493         *dpp = ndp->ni_rootdir;
3494 }
3495
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 *
 * Filled in by cache_fpl_checkpoint() and written back by
 * cache_fpl_restore().
 */
struct nameidata_saved {
        long cn_namelen;        /* copy of ni_cnd.cn_namelen */
        char *cn_nameptr;       /* copy of ni_cnd.cn_nameptr */
        size_t ni_pathlen;      /* copy of ni_pathlen */
        int cn_flags;           /* copy of ni_cnd.cn_flags */
};
3506
/*
 * State of a single fast path lookup, handed to every cache_fpl* and
 * cache_fplookup* routine.
 */
struct cache_fpl {
        /* The lookup being serviced. */
        struct nameidata *ndp;
        /* Component name state operated on by the lookup. */
        struct componentname *cnp;
        /* Held by cache_fplookup_partial_setup() before falling back. */
        struct pwd *pwd;
        /* Directory currently being searched. */
        struct vnode *dvp;
        /* Vnode found for the current path component. */
        struct vnode *tvp;
        /* Sequence counter value sampled for dvp. */
        seqc_t dvp_seqc;
        /* Sequence counter value sampled for tvp. */
        seqc_t tvp_seqc;
        /* Checkpoint restored by cache_fplookup_partial_setup(). */
        struct nameidata_saved snd;
        /* Source line which last set the status, for diagnostics. */
        int line;
        /* Set by the cache_fpl_{aborted,partial,handled} helpers. */
        enum cache_fpl_status status:8;
        /* Maintained by the cache_fpl_smr_* macros. */
        bool in_smr;
        /*
         * Set by cache_fplookup_dirfd(); when true,
         * cache_fplookup_failed_vexec() aborts instead of reporting the error.
         */
        bool fsearch;
};
3521
3522 static void
3523 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3524 {
3525
3526         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3527         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3528         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3529         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3530 }
3531
3532 static void
3533 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3534 {
3535
3536         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3537         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3538         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3539         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3540 }
3541
/*
 * Assertions checking that the lookup's own bookkeeping (fpl->in_smr) and
 * the VFS SMR state agree.  Without INVARIANTS both expand to nothing.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({                    \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        VFS_SMR_ASSERT_ENTERED();                               \
})
#define cache_fpl_smr_assert_not_entered(fpl) ({                \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        VFS_SMR_ASSERT_NOT_ENTERED();                           \
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif
3557
/*
 * Enter the VFS SMR section and note it in fpl->in_smr.  Unlike
 * cache_fpl_smr_enter() this variant does not check the previous value of
 * in_smr.
 */
#define cache_fpl_smr_enter_initial(fpl) ({                     \
        struct cache_fpl *_fpl = (fpl);                         \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Enter the VFS SMR section; the lookup must not already be marked as
 * being inside one.
 */
#define cache_fpl_smr_enter(fpl) ({                             \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == false);                           \
        vfs_smr_enter();                                        \
        _fpl->in_smr = true;                                    \
})

/*
 * Leave the VFS SMR section; the lookup must be marked as being inside one.
 */
#define cache_fpl_smr_exit(fpl) ({                              \
        struct cache_fpl *_fpl = (fpl);                         \
        MPASS(_fpl->in_smr == true);                            \
        vfs_smr_exit();                                         \
        _fpl->in_smr = false;                                   \
})
3577
3578 static int
3579 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3580 {
3581
3582         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3583                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3584                     ("%s: converting to abort from %d at %d, set at %d\n",
3585                     __func__, fpl->status, line, fpl->line));
3586         }
3587         fpl->status = CACHE_FPL_STATUS_ABORTED;
3588         fpl->line = line;
3589         return (CACHE_FPL_FAILED);
3590 }
3591
3592 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3593
3594 static int
3595 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3596 {
3597
3598         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3599             ("%s: setting to partial at %d, but already set to %d at %d\n",
3600             __func__, line, fpl->status, fpl->line));
3601         cache_fpl_smr_assert_entered(fpl);
3602         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3603         fpl->line = line;
3604         return (CACHE_FPL_FAILED);
3605 }
3606
3607 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3608
3609 static int
3610 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3611 {
3612
3613         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3614             ("%s: setting to handled at %d, but already set to %d at %d\n",
3615             __func__, line, fpl->status, fpl->line));
3616         cache_fpl_smr_assert_not_entered(fpl);
3617         MPASS(error != CACHE_FPL_FAILED);
3618         fpl->status = CACHE_FPL_STATUS_HANDLED;
3619         fpl->line = line;
3620         return (error);
3621 }
3622
3623 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3624
/*
 * cn_flags bits a caller may pass and still be served by the fast path;
 * cache_can_fplookup() aborts when any bit outside this mask is set.
 */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
        (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
         SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)

/*
 * cn_flags bits managed by the lookup code itself;
 * cache_fplookup_partial_setup() sets them when preparing the fallback.
 */
#define CACHE_FPL_INTERNAL_CN_FLAGS \
        (ISDOTDOT | MAKEENTRY | ISLASTCN)

/* The two sets must stay disjoint. */
_Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
    "supported and internal flags overlap");
3634
3635 static bool
3636 cache_fpl_islastcn(struct nameidata *ndp)
3637 {
3638
3639         return (*ndp->ni_next == 0);
3640 }
3641
3642 static bool
3643 cache_fpl_isdotdot(struct componentname *cnp)
3644 {
3645
3646         if (cnp->cn_namelen == 2 &&
3647             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3648                 return (true);
3649         return (false);
3650 }
3651
3652 static bool
3653 cache_can_fplookup(struct cache_fpl *fpl)
3654 {
3655         struct nameidata *ndp;
3656         struct componentname *cnp;
3657         struct thread *td;
3658
3659         ndp = fpl->ndp;
3660         cnp = fpl->cnp;
3661         td = cnp->cn_thread;
3662
3663         if (!cache_fast_lookup) {
3664                 cache_fpl_aborted(fpl);
3665                 return (false);
3666         }
3667 #ifdef MAC
3668         if (mac_vnode_check_lookup_enabled()) {
3669                 cache_fpl_aborted(fpl);
3670                 return (false);
3671         }
3672 #endif
3673         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3674                 cache_fpl_aborted(fpl);
3675                 return (false);
3676         }
3677         if (IN_CAPABILITY_MODE(td)) {
3678                 cache_fpl_aborted(fpl);
3679                 return (false);
3680         }
3681         if (AUDITING_TD(td)) {
3682                 cache_fpl_aborted(fpl);
3683                 return (false);
3684         }
3685         if (ndp->ni_startdir != NULL) {
3686                 cache_fpl_aborted(fpl);
3687                 return (false);
3688         }
3689         return (true);
3690 }
3691
3692 static int
3693 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3694 {
3695         struct nameidata *ndp;
3696         int error;
3697         bool fsearch;
3698
3699         ndp = fpl->ndp;
3700         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3701         if (__predict_false(error != 0)) {
3702                 cache_fpl_smr_exit(fpl);
3703                 return (cache_fpl_aborted(fpl));
3704         }
3705         fpl->fsearch = fsearch;
3706         return (0);
3707 }
3708
3709 static bool
3710 cache_fplookup_vnode_supported(struct vnode *vp)
3711 {
3712
3713         return (vp->v_type != VLNK);
3714 }
3715
3716 static int __noinline
3717 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3718     uint32_t hash)
3719 {
3720         struct componentname *cnp;
3721         struct vnode *dvp;
3722
3723         cnp = fpl->cnp;
3724         dvp = fpl->dvp;
3725
3726         cache_fpl_smr_exit(fpl);
3727         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3728                 return (cache_fpl_handled(fpl, ENOENT));
3729         else
3730                 return (cache_fpl_aborted(fpl));
3731 }
3732
3733 /*
3734  * The target vnode is not supported, prepare for the slow path to take over.
3735  */
3736 static int __noinline
3737 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3738 {
3739         struct nameidata *ndp;
3740         struct componentname *cnp;
3741         enum vgetstate dvs;
3742         struct vnode *dvp;
3743         struct pwd *pwd;
3744         seqc_t dvp_seqc;
3745
3746         ndp = fpl->ndp;
3747         cnp = fpl->cnp;
3748         pwd = fpl->pwd;
3749         dvp = fpl->dvp;
3750         dvp_seqc = fpl->dvp_seqc;
3751
3752         if (!pwd_hold_smr(pwd)) {
3753                 cache_fpl_smr_exit(fpl);
3754                 return (cache_fpl_aborted(fpl));
3755         }
3756
3757         dvs = vget_prep_smr(dvp);
3758         cache_fpl_smr_exit(fpl);
3759         if (__predict_false(dvs == VGET_NONE)) {
3760                 pwd_drop(pwd);
3761                 return (cache_fpl_aborted(fpl));
3762         }
3763
3764         vget_finish_ref(dvp, dvs);
3765         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3766                 vrele(dvp);
3767                 pwd_drop(pwd);
3768                 return (cache_fpl_aborted(fpl));
3769         }
3770
3771         cache_fpl_restore(fpl, &fpl->snd);
3772
3773         ndp->ni_startdir = dvp;
3774         cnp->cn_flags |= MAKEENTRY;
3775         if (cache_fpl_islastcn(ndp))
3776                 cnp->cn_flags |= ISLASTCN;
3777         if (cache_fpl_isdotdot(cnp))
3778                 cnp->cn_flags |= ISDOTDOT;
3779
3780         return (0);
3781 }
3782
3783 static int
3784 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3785 {
3786         struct componentname *cnp;
3787         struct vnode *tvp;
3788         seqc_t tvp_seqc;
3789         int error, lkflags;
3790
3791         cnp = fpl->cnp;
3792         tvp = fpl->tvp;
3793         tvp_seqc = fpl->tvp_seqc;
3794
3795         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3796                 lkflags = LK_SHARED;
3797                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3798                         lkflags = LK_EXCLUSIVE;
3799                 error = vget_finish(tvp, lkflags, tvs);
3800                 if (__predict_false(error != 0)) {
3801                         return (cache_fpl_aborted(fpl));
3802                 }
3803         } else {
3804                 vget_finish_ref(tvp, tvs);
3805         }
3806
3807         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3808                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3809                         vput(tvp);
3810                 else
3811                         vrele(tvp);
3812                 return (cache_fpl_aborted(fpl));
3813         }
3814
3815         return (cache_fpl_handled(fpl, 0));
3816 }
3817
3818 /*
3819  * They want to possibly modify the state of the namecache.
3820  *
3821  * Don't try to match the API contract, just leave.
3822  * TODO: this leaves scalability on the table
3823  */
3824 static int
3825 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3826 {
3827         struct componentname *cnp;
3828
3829         cnp = fpl->cnp;
3830         MPASS(cnp->cn_nameiop != LOOKUP);
3831         return (cache_fpl_partial(fpl));
3832 }
3833
3834 static int __noinline
3835 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3836 {
3837         struct componentname *cnp;
3838         enum vgetstate dvs, tvs;
3839         struct vnode *dvp, *tvp;
3840         seqc_t dvp_seqc;
3841         int error;
3842
3843         cnp = fpl->cnp;
3844         dvp = fpl->dvp;
3845         dvp_seqc = fpl->dvp_seqc;
3846         tvp = fpl->tvp;
3847
3848         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3849
3850         /*
3851          * This is less efficient than it can be for simplicity.
3852          */
3853         dvs = vget_prep_smr(dvp);
3854         if (__predict_false(dvs == VGET_NONE)) {
3855                 return (cache_fpl_aborted(fpl));
3856         }
3857         tvs = vget_prep_smr(tvp);
3858         if (__predict_false(tvs == VGET_NONE)) {
3859                 cache_fpl_smr_exit(fpl);
3860                 vget_abort(dvp, dvs);
3861                 return (cache_fpl_aborted(fpl));
3862         }
3863
3864         cache_fpl_smr_exit(fpl);
3865
3866         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3867                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3868                 if (__predict_false(error != 0)) {
3869                         vget_abort(tvp, tvs);
3870                         return (cache_fpl_aborted(fpl));
3871                 }
3872         } else {
3873                 vget_finish_ref(dvp, dvs);
3874         }
3875
3876         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3877                 vget_abort(tvp, tvs);
3878                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3879                         vput(dvp);
3880                 else
3881                         vrele(dvp);
3882                 return (cache_fpl_aborted(fpl));
3883         }
3884
3885         error = cache_fplookup_final_child(fpl, tvs);
3886         if (__predict_false(error != 0)) {
3887                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3888                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3889                         vput(dvp);
3890                 else
3891                         vrele(dvp);
3892                 return (error);
3893         }
3894
3895         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3896         return (0);
3897 }
3898
3899 static int
3900 cache_fplookup_final(struct cache_fpl *fpl)
3901 {
3902         struct componentname *cnp;
3903         enum vgetstate tvs;
3904         struct vnode *dvp, *tvp;
3905         seqc_t dvp_seqc;
3906
3907         cnp = fpl->cnp;
3908         dvp = fpl->dvp;
3909         dvp_seqc = fpl->dvp_seqc;
3910         tvp = fpl->tvp;
3911
3912         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3913
3914         if (cnp->cn_nameiop != LOOKUP) {
3915                 return (cache_fplookup_final_modifying(fpl));
3916         }
3917
3918         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3919                 return (cache_fplookup_final_withparent(fpl));
3920
3921         tvs = vget_prep_smr(tvp);
3922         if (__predict_false(tvs == VGET_NONE)) {
3923                 return (cache_fpl_partial(fpl));
3924         }
3925
3926         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3927                 cache_fpl_smr_exit(fpl);
3928                 vget_abort(tvp, tvs);
3929                 return (cache_fpl_aborted(fpl));
3930         }
3931
3932         cache_fpl_smr_exit(fpl);
3933         return (cache_fplookup_final_child(fpl, tvs));
3934 }
3935
3936 static int __noinline
3937 cache_fplookup_dot(struct cache_fpl *fpl)
3938 {
3939         struct vnode *dvp;
3940
3941         dvp = fpl->dvp;
3942
3943         fpl->tvp = dvp;
3944         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3945         if (seqc_in_modify(fpl->tvp_seqc)) {
3946                 return (cache_fpl_aborted(fpl));
3947         }
3948
3949         counter_u64_add(dothits, 1);
3950         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3951
3952         return (0);
3953 }
3954
3955 static int __noinline
3956 cache_fplookup_dotdot(struct cache_fpl *fpl)
3957 {
3958         struct nameidata *ndp;
3959         struct componentname *cnp;
3960         struct namecache *ncp;
3961         struct vnode *dvp;
3962         struct prison *pr;
3963         u_char nc_flag;
3964
3965         ndp = fpl->ndp;
3966         cnp = fpl->cnp;
3967         dvp = fpl->dvp;
3968
3969         /*
3970          * XXX this is racy the same way regular lookup is
3971          */
3972         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3973             pr = pr->pr_parent)
3974                 if (dvp == pr->pr_root)
3975                         break;
3976
3977         if (dvp == ndp->ni_rootdir ||
3978             dvp == ndp->ni_topdir ||
3979             dvp == rootvnode ||
3980             pr != NULL) {
3981                 fpl->tvp = dvp;
3982                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3983                 if (seqc_in_modify(fpl->tvp_seqc)) {
3984                         return (cache_fpl_aborted(fpl));
3985                 }
3986                 return (0);
3987         }
3988
3989         if ((dvp->v_vflag & VV_ROOT) != 0) {
3990                 /*
3991                  * TODO
3992                  * The opposite of climb mount is needed here.
3993                  */
3994                 return (cache_fpl_aborted(fpl));
3995         }
3996
3997         ncp = atomic_load_ptr(&dvp->v_cache_dd);
3998         if (ncp == NULL) {
3999                 return (cache_fpl_aborted(fpl));
4000         }
4001
4002         nc_flag = atomic_load_char(&ncp->nc_flag);
4003         if ((nc_flag & NCF_ISDOTDOT) != 0) {
4004                 if ((nc_flag & NCF_NEGATIVE) != 0)
4005                         return (cache_fpl_aborted(fpl));
4006                 fpl->tvp = ncp->nc_vp;
4007         } else {
4008                 fpl->tvp = ncp->nc_dvp;
4009         }
4010
4011         if (__predict_false(!cache_ncp_canuse(ncp))) {
4012                 return (cache_fpl_aborted(fpl));
4013         }
4014
4015         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4016         if (seqc_in_modify(fpl->tvp_seqc)) {
4017                 return (cache_fpl_partial(fpl));
4018         }
4019
4020         counter_u64_add(dotdothits, 1);
4021         return (0);
4022 }
4023
4024 static int __noinline
4025 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4026 {
4027         u_char nc_flag;
4028         bool neg_promote;
4029
4030         nc_flag = atomic_load_char(&ncp->nc_flag);
4031         MPASS((nc_flag & NCF_NEGATIVE) != 0);
4032         /*
4033          * If they want to create an entry we need to replace this one.
4034          */
4035         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4036                 /*
4037                  * TODO
4038                  * This should call something similar to
4039                  * cache_fplookup_final_modifying.
4040                  */
4041                 return (cache_fpl_partial(fpl));
4042         }
4043         neg_promote = cache_neg_hit_prep(ncp);
4044         if (__predict_false(!cache_ncp_canuse(ncp))) {
4045                 cache_neg_hit_abort(ncp);
4046                 return (cache_fpl_partial(fpl));
4047         }
4048         if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4049                 cache_neg_hit_abort(ncp);
4050                 return (cache_fpl_partial(fpl));
4051         }
4052         if (neg_promote) {
4053                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4054         }
4055         cache_neg_hit_finish(ncp);
4056         cache_fpl_smr_exit(fpl);
4057         return (cache_fpl_handled(fpl, ENOENT));
4058 }
4059
4060 static int
4061 cache_fplookup_next(struct cache_fpl *fpl)
4062 {
4063         struct componentname *cnp;
4064         struct namecache *ncp;
4065         struct vnode *dvp, *tvp;
4066         u_char nc_flag;
4067         uint32_t hash;
4068
4069         cnp = fpl->cnp;
4070         dvp = fpl->dvp;
4071
4072         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4073                 return (cache_fplookup_dot(fpl));
4074         }
4075
4076         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4077
4078         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4079                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4080                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4081                         break;
4082         }
4083
4084         /*
4085          * If there is no entry we have to punt to the slow path to perform
4086          * actual lookup. Should there be nothing with this name a negative
4087          * entry will be created.
4088          */
4089         if (__predict_false(ncp == NULL)) {
4090                 return (cache_fpl_partial(fpl));
4091         }
4092
4093         tvp = atomic_load_ptr(&ncp->nc_vp);
4094         nc_flag = atomic_load_char(&ncp->nc_flag);
4095         if ((nc_flag & NCF_NEGATIVE) != 0) {
4096                 return (cache_fplookup_neg(fpl, ncp, hash));
4097         }
4098
4099         if (__predict_false(!cache_ncp_canuse(ncp))) {
4100                 return (cache_fpl_partial(fpl));
4101         }
4102
4103         fpl->tvp = tvp;
4104         fpl->tvp_seqc = vn_seqc_read_any(tvp);
4105         if (seqc_in_modify(fpl->tvp_seqc)) {
4106                 return (cache_fpl_partial(fpl));
4107         }
4108
4109         if (!cache_fplookup_vnode_supported(tvp)) {
4110                 return (cache_fpl_partial(fpl));
4111         }
4112
4113         counter_u64_add(numposhits, 1);
4114         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4115         return (0);
4116 }
4117
4118 static bool
4119 cache_fplookup_mp_supported(struct mount *mp)
4120 {
4121
4122         if (mp == NULL)
4123                 return (false);
4124         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4125                 return (false);
4126         return (true);
4127 }
4128
4129 /*
4130  * Walk up the mount stack (if any).
4131  *
4132  * Correctness is provided in the following ways:
4133  * - all vnodes are protected from freeing with SMR
4134  * - struct mount objects are type stable making them always safe to access
4135  * - stability of the particular mount is provided by busying it
4136  * - relationship between the vnode which is mounted on and the mount is
4137  *   verified with the vnode sequence counter after busying
4138  * - association between root vnode of the mount and the mount is protected
4139  *   by busy
4140  *
4141  * From that point on we can read the sequence counter of the root vnode
4142  * and get the next mount on the stack (if any) using the same protection.
4143  *
4144  * By the end of successful walk we are guaranteed the reached state was
4145  * indeed present at least at some point which matches the regular lookup.
4146  */
4147 static int __noinline
4148 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4149 {
4150         struct mount *mp, *prev_mp;
4151         struct vnode *vp;
4152         seqc_t vp_seqc;
4153
4154         vp = fpl->tvp;
4155         vp_seqc = fpl->tvp_seqc;
4156
4157         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4158         mp = atomic_load_ptr(&vp->v_mountedhere);
4159         if (mp == NULL)
4160                 return (0);
4161
4162         prev_mp = NULL;
4163         for (;;) {
4164                 if (!vfs_op_thread_enter_crit(mp)) {
4165                         if (prev_mp != NULL)
4166                                 vfs_op_thread_exit_crit(prev_mp);
4167                         return (cache_fpl_partial(fpl));
4168                 }
4169                 if (prev_mp != NULL)
4170                         vfs_op_thread_exit_crit(prev_mp);
4171                 if (!vn_seqc_consistent(vp, vp_seqc)) {
4172                         vfs_op_thread_exit_crit(mp);
4173                         return (cache_fpl_partial(fpl));
4174                 }
4175                 if (!cache_fplookup_mp_supported(mp)) {
4176                         vfs_op_thread_exit_crit(mp);
4177                         return (cache_fpl_partial(fpl));
4178                 }
4179                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4180                 if (vp == NULL || VN_IS_DOOMED(vp)) {
4181                         vfs_op_thread_exit_crit(mp);
4182                         return (cache_fpl_partial(fpl));
4183                 }
4184                 vp_seqc = vn_seqc_read_any(vp);
4185                 if (seqc_in_modify(vp_seqc)) {
4186                         vfs_op_thread_exit_crit(mp);
4187                         return (cache_fpl_partial(fpl));
4188                 }
4189                 prev_mp = mp;
4190                 mp = atomic_load_ptr(&vp->v_mountedhere);
4191                 if (mp == NULL)
4192                         break;
4193         }
4194
4195         vfs_op_thread_exit_crit(prev_mp);
4196         fpl->tvp = vp;
4197         fpl->tvp_seqc = vp_seqc;
4198         return (0);
4199 }
4200
4201 static bool
4202 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4203 {
4204         struct mount *mp;
4205         struct vnode *vp;
4206
4207         vp = fpl->tvp;
4208
4209         /*
4210          * Hack: while this is a union, the pointer tends to be NULL so save on
4211          * a branch.
4212          */
4213         mp = atomic_load_ptr(&vp->v_mountedhere);
4214         if (mp == NULL)
4215                 return (false);
4216         if (vp->v_type == VDIR)
4217                 return (true);
4218         return (false);
4219 }
4220
4221 /*
4222  * Parse the path.
4223  *
4224  * The code was originally copy-pasted from regular lookup and despite
4225  * clean ups leaves performance on the table. Any modifications here
 * must take into account that in case of fallback the resulting
4227  * nameidata state has to be compatible with the original.
4228  */
4229 static int
4230 cache_fplookup_parse(struct cache_fpl *fpl)
4231 {
4232         struct nameidata *ndp;
4233         struct componentname *cnp;
4234         char *cp;
4235
4236         ndp = fpl->ndp;
4237         cnp = fpl->cnp;
4238
4239         /*
4240          * Search a new directory.
4241          *
4242          * The last component of the filename is left accessible via
4243          * cnp->cn_nameptr for callers that need the name. Callers needing
4244          * the name set the SAVENAME flag. When done, they assume
4245          * responsibility for freeing the pathname buffer.
4246          */
4247         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4248                 continue;
4249         cnp->cn_namelen = cp - cnp->cn_nameptr;
4250         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4251                 cache_fpl_smr_exit(fpl);
4252                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4253         }
4254         ndp->ni_pathlen -= cnp->cn_namelen;
4255         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4256             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4257         ndp->ni_next = cp;
4258
4259         /*
4260          * Replace multiple slashes by a single slash and trailing slashes
4261          * by a null.  This must be done before VOP_LOOKUP() because some
4262          * fs's don't know about trailing slashes.  Remember if there were
4263          * trailing slashes to handle symlinks, existing non-directories
4264          * and non-existing files that won't be directories specially later.
4265          */
4266         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4267                 cp++;
4268                 ndp->ni_pathlen--;
4269                 if (*cp == '\0') {
4270                         /*
4271                          * TODO
4272                          * Regular lookup performs the following:
4273                          * *ndp->ni_next = '\0';
4274                          * cnp->cn_flags |= TRAILINGSLASH;
4275                          *
4276                          * Which is problematic since it modifies data read
4277                          * from userspace. Then if fast path lookup was to
4278                          * abort we would have to either restore it or convey
4279                          * the flag. Since this is a corner case just ignore
4280                          * it for simplicity.
4281                          */
4282                         return (cache_fpl_partial(fpl));
4283                 }
4284         }
4285         ndp->ni_next = cp;
4286
4287         /*
4288          * Check for degenerate name (e.g. / or "")
4289          * which is a way of talking about a directory,
4290          * e.g. like "/." or ".".
4291          *
4292          * TODO
4293          * Another corner case handled by the regular lookup
4294          */
4295         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4296                 return (cache_fpl_partial(fpl));
4297         }
4298         return (0);
4299 }
4300
4301 static void
4302 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4303 {
4304         struct nameidata *ndp;
4305         struct componentname *cnp;
4306
4307         ndp = fpl->ndp;
4308         cnp = fpl->cnp;
4309
4310         cnp->cn_nameptr = ndp->ni_next;
4311         while (*cnp->cn_nameptr == '/') {
4312                 cnp->cn_nameptr++;
4313                 ndp->ni_pathlen--;
4314         }
4315 }
4316
4317 /*
4318  * See the API contract for VOP_FPLOOKUP_VEXEC.
4319  */
4320 static int __noinline
4321 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4322 {
4323         struct vnode *dvp;
4324         seqc_t dvp_seqc;
4325
4326         dvp = fpl->dvp;
4327         dvp_seqc = fpl->dvp_seqc;
4328
4329         /*
4330          * Hack: they may be looking up foo/bar, where foo is a
4331          * regular file. In such a case we need to turn ENOTDIR,
4332          * but we may happen to get here with a different error.
4333          */
4334         if (dvp->v_type != VDIR) {
4335                 /*
4336                  * The check here is predominantly to catch
4337                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4338                  * gets doomed past this point it is going to
4339                  * fail seqc verification.
4340                  */
4341                 if (VN_IS_DOOMED(dvp)) {
4342                         return (cache_fpl_aborted(fpl));
4343                 }
4344                 error = ENOTDIR;
4345         }
4346
4347         /*
4348          * Hack: handle O_SEARCH.
4349          *
4350          * Open Group Base Specifications Issue 7, 2018 edition states:
4351          * If the access mode of the open file description associated with the
4352          * file descriptor is not O_SEARCH, the function shall check whether
4353          * directory searches are permitted using the current permissions of
4354          * the directory underlying the file descriptor. If the access mode is
4355          * O_SEARCH, the function shall not perform the check.
4356          *
4357          * Regular lookup tests for the NOEXECCHECK flag for every path
4358          * component to decide whether to do the permission check. However,
4359          * since most lookups never have the flag (and when they do it is only
4360          * present for the first path component), lockless lookup only acts on
4361          * it if there is a permission problem. Here the flag is represented
4362          * with a boolean so that we don't have to clear it on the way out.
4363          *
4364          * For simplicity this always aborts.
4365          * TODO: check if this is the first lookup and ignore the permission
4366          * problem. Note the flag has to survive fallback (if it happens to be
4367          * performed).
4368          */
4369         if (fpl->fsearch) {
4370                 return (cache_fpl_aborted(fpl));
4371         }
4372
4373         switch (error) {
4374         case EAGAIN:
4375                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4376                         error = cache_fpl_aborted(fpl);
4377                 } else {
4378                         cache_fpl_partial(fpl);
4379                 }
4380                 break;
4381         default:
4382                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4383                         error = cache_fpl_aborted(fpl);
4384                 } else {
4385                         cache_fpl_smr_exit(fpl);
4386                         cache_fpl_handled(fpl, error);
4387                 }
4388                 break;
4389         }
4390         return (error);
4391 }
4392
4393 static int
4394 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4395 {
4396         struct nameidata *ndp;
4397         struct componentname *cnp;
4398         struct mount *mp;
4399         int error;
4400
4401         error = CACHE_FPL_FAILED;
4402         ndp = fpl->ndp;
4403         cnp = fpl->cnp;
4404
4405         cache_fpl_checkpoint(fpl, &fpl->snd);
4406
4407         fpl->dvp = dvp;
4408         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4409         if (seqc_in_modify(fpl->dvp_seqc)) {
4410                 cache_fpl_aborted(fpl);
4411                 goto out;
4412         }
4413         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4414         if (!cache_fplookup_mp_supported(mp)) {
4415                 cache_fpl_aborted(fpl);
4416                 goto out;
4417         }
4418
4419         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4420
4421         for (;;) {
4422                 error = cache_fplookup_parse(fpl);
4423                 if (__predict_false(error != 0)) {
4424                         break;
4425                 }
4426
4427                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4428
4429                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4430                 if (__predict_false(error != 0)) {
4431                         error = cache_fplookup_failed_vexec(fpl, error);
4432                         break;
4433                 }
4434
4435                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4436                         error = cache_fplookup_dotdot(fpl);
4437                         if (__predict_false(error != 0)) {
4438                                 break;
4439                         }
4440                 } else {
4441                         error = cache_fplookup_next(fpl);
4442                         if (__predict_false(error != 0)) {
4443                                 break;
4444                         }
4445
4446                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4447
4448                         if (cache_fplookup_need_climb_mount(fpl)) {
4449                                 error = cache_fplookup_climb_mount(fpl);
4450                                 if (__predict_false(error != 0)) {
4451                                         break;
4452                                 }
4453                         }
4454                 }
4455
4456                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4457
4458                 if (cache_fpl_islastcn(ndp)) {
4459                         error = cache_fplookup_final(fpl);
4460                         break;
4461                 }
4462
4463                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4464                         error = cache_fpl_aborted(fpl);
4465                         break;
4466                 }
4467
4468                 fpl->dvp = fpl->tvp;
4469                 fpl->dvp_seqc = fpl->tvp_seqc;
4470
4471                 cache_fplookup_parse_advance(fpl);
4472                 cache_fpl_checkpoint(fpl, &fpl->snd);
4473         }
4474 out:
4475         switch (fpl->status) {
4476         case CACHE_FPL_STATUS_UNSET:
4477                 __assert_unreachable();
4478                 break;
4479         case CACHE_FPL_STATUS_PARTIAL:
4480                 cache_fpl_smr_assert_entered(fpl);
4481                 return (cache_fplookup_partial_setup(fpl));
4482         case CACHE_FPL_STATUS_ABORTED:
4483                 if (fpl->in_smr)
4484                         cache_fpl_smr_exit(fpl);
4485                 return (CACHE_FPL_FAILED);
4486         case CACHE_FPL_STATUS_HANDLED:
4487                 MPASS(error != CACHE_FPL_FAILED);
4488                 cache_fpl_smr_assert_not_entered(fpl);
4489                 if (__predict_false(error != 0)) {
4490                         ndp->ni_dvp = NULL;
4491                         ndp->ni_vp = NULL;
4492                         cache_fpl_cleanup_cnp(cnp);
4493                         return (error);
4494                 }
4495                 ndp->ni_dvp = fpl->dvp;
4496                 ndp->ni_vp = fpl->tvp;
4497                 if (cnp->cn_flags & SAVENAME)
4498                         cnp->cn_flags |= HASBUF;
4499                 else
4500                         cache_fpl_cleanup_cnp(cnp);
4501                 return (error);
4502         }
4503 }
4504
4505 /*
4506  * Fast path lookup protected with SMR and sequence counters.
4507  *
4508  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4509  *
4510  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4511  * outlined below.
4512  *
4513  * Traditional vnode lookup conceptually looks like this:
4514  *
4515  * vn_lock(current);
4516  * for (;;) {
4517  *      next = find();
4518  *      vn_lock(next);
4519  *      vn_unlock(current);
4520  *      current = next;
4521  *      if (last)
4522  *          break;
4523  * }
4524  * return (current);
4525  *
4526  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4527  * any modifications thanks to holding respective locks.
4528  *
4529  * The same guarantee can be provided with a combination of safe memory
4530  * reclamation and sequence counters instead. If all operations which affect
4531  * the relationship between the current vnode and the one we are looking for
4532  * also modify the counter, we can verify whether all the conditions held as
4533  * we made the jump. This includes things like permissions, mount points etc.
4534  * Counter modification is provided by enclosing relevant places in
4535  * vn_seqc_write_begin()/end() calls.
4536  *
4537  * Thus this translates to:
4538  *
4539  * vfs_smr_enter();
4540  * dvp_seqc = seqc_read_any(dvp);
4541  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4542  *     abort();
4543  * for (;;) {
4544  *      tvp = find();
4545  *      tvp_seqc = seqc_read_any(tvp);
4546  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4547  *          abort();
4548  *      if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
4549  *          abort();
4550  *      dvp = tvp; // we know nothing of importance has changed
4551  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4552  *      if (last)
4553  *          break;
4554  * }
4555  * vget(); // secure the vnode
4556  * if (!seqc_consistent(tvp, tvp_seqc) // final check
4557  *          abort();
4558  * // at this point we know nothing has changed for any parent<->child pair
4559  * // as they were crossed during the lookup, meaning we matched the guarantee
4560  * // of the locked variant
4561  * return (tvp);
4562  *
4563  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4564  * - they are called while within vfs_smr protection which they must never exit
4565  * - EAGAIN can be returned to denote checking could not be performed, it is
4566  *   always valid to return it
4567  * - if the sequence counter has not changed the result must be valid
4568  * - if the sequence counter has changed both false positives and false negatives
4569  *   are permitted (since the result will be rejected later)
4570  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4571  *
4572  * Caveats to watch out for:
4573  * - vnodes are passed unlocked and unreferenced with nothing stopping
4574  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4575  *   to use atomic_load_ptr to fetch it.
4576  * - the aforementioned object can also get freed, meaning absent other means it
4577  *   should be protected with vfs_smr
4578  * - either safely checking permissions as they are modified or guaranteeing
4579  *   their stability is left to the routine
4580  */
4581 int
4582 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4583     struct pwd **pwdp)
4584 {
4585         struct cache_fpl fpl;
4586         struct pwd *pwd;
4587         struct vnode *dvp;
4588         struct componentname *cnp;
4589         struct nameidata_saved orig;
4590         int error;
4591
4592         MPASS(ndp->ni_lcf == 0);
4593
4594         fpl.status = CACHE_FPL_STATUS_UNSET;
4595         fpl.ndp = ndp;
4596         fpl.cnp = &ndp->ni_cnd;
4597         MPASS(curthread == fpl.cnp->cn_thread);
4598
4599         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4600                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4601
4602         if (!cache_can_fplookup(&fpl)) {
4603                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4604                 *status = fpl.status;
4605                 return (EOPNOTSUPP);
4606         }
4607
4608         cache_fpl_checkpoint(&fpl, &orig);
4609
4610         cache_fpl_smr_enter_initial(&fpl);
4611         fpl.fsearch = false;
4612         pwd = pwd_get_smr();
4613         fpl.pwd = pwd;
4614         ndp->ni_rootdir = pwd->pwd_rdir;
4615         ndp->ni_topdir = pwd->pwd_jdir;
4616
4617         cnp = fpl.cnp;
4618         cnp->cn_nameptr = cnp->cn_pnbuf;
4619         if (cnp->cn_pnbuf[0] == '/') {
4620                 cache_fpl_handle_root(ndp, &dvp);
4621         } else {
4622                 if (ndp->ni_dirfd == AT_FDCWD) {
4623                         dvp = pwd->pwd_cdir;
4624                 } else {
4625                         error = cache_fplookup_dirfd(&fpl, &dvp);
4626                         if (__predict_false(error != 0)) {
4627                                 goto out;
4628                         }
4629                 }
4630         }
4631
4632         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4633
4634         error = cache_fplookup_impl(dvp, &fpl);
4635 out:
4636         cache_fpl_smr_assert_not_entered(&fpl);
4637         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4638
4639         *status = fpl.status;
4640         switch (fpl.status) {
4641         case CACHE_FPL_STATUS_UNSET:
4642                 __assert_unreachable();
4643                 break;
4644         case CACHE_FPL_STATUS_HANDLED:
4645                 SDT_PROBE3(vfs, namei, lookup, return, error,
4646                     (error == 0 ? ndp->ni_vp : NULL), true);
4647                 break;
4648         case CACHE_FPL_STATUS_PARTIAL:
4649                 *pwdp = fpl.pwd;
4650                 /*
4651                  * Status restored by cache_fplookup_partial_setup.
4652                  */
4653                 break;
4654         case CACHE_FPL_STATUS_ABORTED:
4655                 cache_fpl_restore(&fpl, &orig);
4656                 break;
4657         }
4658         return (error);
4659 }