sys/kern/vfs_cache.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1989, 1993, 1995
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to Berkeley by
   8  * Poul-Henning Kamp of the FreeBSD Project.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_ddb.h"
  41 #include "opt_ktrace.h"
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/capsicum.h>
  46 #include <sys/counter.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/fnv_hash.h>
  49 #include <sys/kernel.h>
  50 #include <sys/ktr.h>
  51 #include <sys/lock.h>
  52 #include <sys/malloc.h>
  53 #include <sys/fcntl.h>
  54 #include <sys/jail.h>
  55 #include <sys/mount.h>
  56 #include <sys/namei.h>
  57 #include <sys/proc.h>
  58 #include <sys/seqc.h>
  59 #include <sys/sdt.h>
  60 #include <sys/smr.h>
  61 #include <sys/smp.h>
  62 #include <sys/syscallsubr.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/sysproto.h>
  65 #include <sys/vnode.h>
  66 #include <ck_queue.h>
  67 #ifdef KTRACE
  68 #include <sys/ktrace.h>
  69 #endif
  70
  71 #include <sys/capsicum.h>
  72
  73 #include <security/audit/audit.h>
  74 #include <security/mac/mac_framework.h>
  75
  76 #ifdef DDB
  77 #include <ddb/ddb.h>
  78 #endif
  79
  80 #include <vm/uma.h>
  81
  82 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  83     "Name cache");
  84
  85 SDT_PROVIDER_DECLARE(vfs);
  86 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
  87     "struct vnode *");
  88 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
  89     "struct vnode *");
  90 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
  91     "char *");
  92 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
  93     "const char *");
  94 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
  95     "struct namecache *", "int", "int");
  96 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
  97 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
  98     "char *", "struct vnode *");
  99 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 100 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
 101     "struct vnode *", "char *");
 102 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
 103     "struct vnode *");
 104 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
 105     "struct vnode *", "char *");
 106 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
 107     "char *");
 108 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
 109     "struct componentname *");
 110 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
 111     "struct componentname *");
 112 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 113 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 114 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 115 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
 116     "struct vnode *");
 117 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
 118     "char *");
 119 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
 120     "char *");
 121
 122 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 123 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 124 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 125
 126 /*
 127  * This structure describes the elements in the cache of recent
 128  * names looked up by namei.
 129  */
 130 struct negstate {
 131         u_char neg_flag;
 132         u_char neg_hit;
 133 };
 134 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
 135     "the state must fit in a union with a pointer without growing it");
 136
 137 struct  namecache {
 138         LIST_ENTRY(namecache) nc_src;   /* source vnode list */
 139         TAILQ_ENTRY(namecache) nc_dst;  /* destination vnode list */
 140         CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 141         struct  vnode *nc_dvp;          /* vnode of parent of name */
 142         union {
 143                 struct  vnode *nu_vp;   /* vnode the name refers to */
 144                 struct  negstate nu_neg;/* negative entry state */
 145         } n_un;
 146         u_char  nc_flag;                /* flag bits */
 147         u_char  nc_nlen;                /* length of name */
 148         char    nc_name[0];             /* segment name + nul */
 149 };
 150
 151 /*
 152  * struct namecache_ts repeats struct namecache layout up to the
 153  * nc_nlen member.
 154  * struct namecache_ts is used in place of struct namecache when time(s) need
 155  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
 156  * both a non-dotdot directory name plus dotdot for the directory's
 157  * parent.
 158  *
 159  * See below for alignment requirement.
 160  */
 161 struct  namecache_ts {
 162         struct  timespec nc_time;       /* timespec provided by fs */
 163         struct  timespec nc_dotdottime; /* dotdot timespec provided by fs */
 164         int     nc_ticks;               /* ticks value when entry was added */
 165         int     nc_pad;
 166         struct namecache nc_nc;
 167 };
 168
 169 /*
 170  * At least mips n32 performs 64-bit accesses to timespec as found
 171  * in namecache_ts and requires them to be aligned. Since others
 172  * may be in the same spot suffer a little bit and enforce the
 173  * alignment for everyone. Note this is a nop for 64-bit platforms.
 174  */
 175 #define CACHE_ZONE_ALIGNMENT    UMA_ALIGNOF(time_t)
 176
 177 #ifdef __LP64__
 178 #define CACHE_PATH_CUTOFF       45
 179 #define CACHE_LARGE_PAD         6
 180 #else
 181 #define CACHE_PATH_CUTOFF       41
 182 #define CACHE_LARGE_PAD         2
 183 #endif
 184
 185 #define CACHE_ZONE_SMALL_SIZE           (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 186 #define CACHE_ZONE_SMALL_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 187 #define CACHE_ZONE_LARGE_SIZE           (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 188 #define CACHE_ZONE_LARGE_TS_SIZE        (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 189
 190 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 191 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 192 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 193 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 194
 195 #define nc_vp           n_un.nu_vp
 196 #define nc_neg          n_un.nu_neg
 197
 198 /*
 199  * Flags in namecache.nc_flag
 200  */
 201 #define NCF_WHITE       0x01
 202 #define NCF_ISDOTDOT    0x02
 203 #define NCF_TS          0x04
 204 #define NCF_DTS         0x08
 205 #define NCF_DVDROP      0x10
 206 #define NCF_NEGATIVE    0x20
 207 #define NCF_INVALID     0x40
 208 #define NCF_WIP         0x80
 209
 210 /*
 211  * Flags in negstate.neg_flag
 212  */
 213 #define NEG_HOT         0x01
 214
 215 /*
 216  * Mark an entry as invalid.
 217  *
 218  * This is called before it starts getting deconstructed.
 219  */
 220 static void
 221 cache_ncp_invalidate(struct namecache *ncp)
 222 {
 223
 224         KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 225             ("%s: entry %p already invalid", __func__, ncp));
 226         atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 227         atomic_thread_fence_rel();
 228 }
 229
 230 /*
 231  * Check whether the entry can be safely used.
 232  *
 233  * All places which elide locks are supposed to call this after they are
 234  * done with reading from an entry.
 235  */
 236 static bool
 237 cache_ncp_canuse(struct namecache *ncp)
 238 {
 239
 240         atomic_thread_fence_acq();
 241         return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0);
 242 }
 243
 244 /*
 245  * Name caching works as follows:
 246  *
 247  * Names found by directory scans are retained in a cache
 248  * for future reference.  It is managed LRU, so frequently
 249  * used names will hang around.  Cache is indexed by hash value
 250  * obtained from (dvp, name) where dvp refers to the directory
 251  * containing name.
 252  *
 253  * If it is a "negative" entry, (i.e. for a name that is known NOT to
 254  * exist) the vnode pointer will be NULL.
 255  *
 256  * Upon reaching the last segment of a path, if the reference
 257  * is for DELETE, or NOCACHE is set (rewrite), and the
 258  * name is located in the cache, it will be dropped.
 259  *
 260  * These locks are used (in the order in which they can be taken):
 261  * NAME         TYPE    ROLE
 262  * vnodelock    mtx     vnode lists and v_cache_dd field protection
 263  * bucketlock   mtx     for access to given set of hash buckets
 264  * neglist      mtx     negative entry LRU management
 265  *
 266  * It is legal to take multiple vnodelock and bucketlock locks. The locking
 267  * order is lower address first. Both are recursive.
 268  *
 269  * "." lookups are lockless.
 270  *
 271  * ".." and vnode -> name lookups require vnodelock.
 272  *
 273  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
 274  *
 275  * Insertions and removals of entries require involved vnodes and bucketlocks
 276  * to be locked to provide safe operation against other threads modifying the
 277  * cache.
 278  *
 279  * Some lookups result in removal of the found entry (e.g. getting rid of a
 280  * negative entry with the intent to create a positive one), which poses a
 281  * problem when multiple threads reach the state. Similarly, two different
 282  * threads can purge two different vnodes and try to remove the same name.
 283  *
 284  * If the already held vnode lock is lower than the second required lock, we
 285  * can just take the other lock. However, in the opposite case, this could
 286  * deadlock. As such, this is resolved by trylocking and if that fails unlocking
 287  * the first node, locking everything in order and revalidating the state.
 288  */
 289
 290 VFS_SMR_DECLARE;
 291
 292 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 293     "Name cache parameters");
 294
 295 static u_int __read_mostly      ncsize; /* the size as computed on creation or resizing */
 296 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0,
 297     "Total namecache capacity");
 298
 299 u_int ncsizefactor = 2;
 300 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
 301     "Size factor for namecache");
 302
 303 static u_long __read_mostly     ncnegfactor = 5; /* ratio of negative entries */
 304 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
 305     "Ratio of negative namecache entries");
 306
 307 /*
 308  * Negative entry % of namecahe capacity above which automatic eviction is allowed.
 309  *
 310  * Check cache_neg_evict_cond for details.
 311  */
 312 static u_int ncnegminpct = 3;
 313
 314 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 315 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
 316     "Negative entry count above which automatic eviction is allowed");
 317
 318 /*
 319  * Structures associated with name caching.
 320  */
 321 #define NCHHASH(hash) \
 322         (&nchashtbl[(hash) & nchash])
 323 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 324 static u_long __read_mostly     nchash;                 /* size of hash table */
 325 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 326     "Size of namecache hash table");
 327 static u_long __exclusive_cache_line    numneg; /* number of negative entries allocated */
 328 static u_long __exclusive_cache_line    numcache;/* number of cache entries allocated */
 329
 330 struct nchstats nchstats;               /* cache effectiveness statistics */
 331
 332 static bool __read_frequently cache_fast_revlookup = true;
 333 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
 334     &cache_fast_revlookup, 0, "");
 335
 336 static u_int __exclusive_cache_line neg_cycle;
 337
 338 #define ncneghash       3
 339 #define numneglists     (ncneghash + 1)
 340
 341 struct neglist {
 342         struct mtx              nl_evict_lock;
 343         struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
 344         TAILQ_HEAD(, namecache) nl_list;
 345         TAILQ_HEAD(, namecache) nl_hotlist;
 346         u_long                  nl_hotnum;
 347 } __aligned(CACHE_LINE_SIZE);
 348
 349 static struct neglist neglists[numneglists];
 350
 351 static inline struct neglist *
 352 NCP2NEGLIST(struct namecache *ncp)
 353 {
 354
 355         return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
 356 }
 357
 358 static inline struct negstate *
 359 NCP2NEGSTATE(struct namecache *ncp)
 360 {
 361
 362         MPASS(ncp->nc_flag & NCF_NEGATIVE);
 363         return (&ncp->nc_neg);
 364 }
 365
 366 #define numbucketlocks (ncbuckethash + 1)
 367 static u_int __read_mostly  ncbuckethash;
 368 static struct mtx_padalign __read_mostly  *bucketlocks;
 369 #define HASH2BUCKETLOCK(hash) \
 370         ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 371
 372 #define numvnodelocks (ncvnodehash + 1)
 373 static u_int __read_mostly  ncvnodehash;
 374 static struct mtx __read_mostly *vnodelocks;
 375 static inline struct mtx *
 376 VP2VNODELOCK(struct vnode *vp)
 377 {
 378
 379         return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
 380 }
 381
 382 /*
 383  * UMA zones for the VFS cache.
 384  *
 385  * The small cache is used for entries with short names, which are the
 386  * most common.  The large cache is used for entries which are too big to
 387  * fit in the small cache.
 388  */
 389 static uma_zone_t __read_mostly cache_zone_small;
 390 static uma_zone_t __read_mostly cache_zone_small_ts;
 391 static uma_zone_t __read_mostly cache_zone_large;
 392 static uma_zone_t __read_mostly cache_zone_large_ts;
 393
 394 static struct namecache *
 395 cache_alloc(int len, int ts)
 396 {
 397         struct namecache_ts *ncp_ts;
 398         struct namecache *ncp;
 399
 400         if (__predict_false(ts)) {
 401                 if (len <= CACHE_PATH_CUTOFF)
 402                         ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 403                 else
 404                         ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 405                 ncp = &ncp_ts->nc_nc;
 406         } else {
 407                 if (len <= CACHE_PATH_CUTOFF)
 408                         ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 409                 else
 410                         ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 411         }
 412         return (ncp);
 413 }
 414
 415 static void
 416 cache_free(struct namecache *ncp)
 417 {
 418         struct namecache_ts *ncp_ts;
 419
 420         MPASS(ncp != NULL);
 421         if ((ncp->nc_flag & NCF_DVDROP) != 0)
 422                 vdrop(ncp->nc_dvp);
 423         if (__predict_false(ncp->nc_flag & NCF_TS)) {
 424                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 425                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 426                         uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 427                 else
 428                         uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 429         } else {
 430                 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 431                         uma_zfree_smr(cache_zone_small, ncp);
 432                 else
 433                         uma_zfree_smr(cache_zone_large, ncp);
 434         }
 435 }
 436
 437 static void
 438 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 439 {
 440         struct namecache_ts *ncp_ts;
 441
 442         KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 443             (tsp == NULL && ticksp == NULL),
 444             ("No NCF_TS"));
 445
 446         if (tsp == NULL)
 447                 return;
 448
 449         ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 450         *tsp = ncp_ts->nc_time;
 451         *ticksp = ncp_ts->nc_ticks;
 452 }
 453
 454 #ifdef DEBUG_CACHE
 455 static int __read_mostly        doingcache = 1; /* 1 => enable the cache */
 456 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
 457     "VFS namecache enabled");
 458 #endif
 459
 460 /* Export size information to userland */
 461 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
 462     sizeof(struct namecache), "sizeof(struct namecache)");
 463
 464 /*
 465  * The new name cache statistics
 466  */
 467 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 468     "Name cache statistics");
 469
 470 #define STATNODE_ULONG(name, varname, descr)                                    \
 471         SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 472 #define STATNODE_COUNTER(name, varname, descr)                                  \
 473         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 474         SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 475             descr);
 476 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 477 STATNODE_ULONG(count, numcache, "Number of cache entries");
 478 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 479 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 480 STATNODE_COUNTER(dothits, dothits, "Number of '.' hits");
 481 STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits");
 482 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 483 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 484 STATNODE_COUNTER(posszaps, numposzaps,
 485     "Number of cache hits (positive) we do not want to cache");
 486 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 487 STATNODE_COUNTER(negzaps, numnegzaps,
 488     "Number of cache hits (negative) we do not want to cache");
 489 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 490 /* These count for vn_getcwd(), too. */
 491 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 492 STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 493 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
 494     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 495 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 496 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 497
 498 /*
 499  * Debug or developer statistics.
 500  */
 501 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 502     "Name cache debugging");
 503 #define DEBUGNODE_ULONG(name, varname, descr)                                   \
 504         SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 505 #define DEBUGNODE_COUNTER(name, varname, descr)                                 \
 506         static COUNTER_U64_DEFINE_EARLY(varname);                               \
 507         SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \
 508             descr);
 509 DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success,
 510     "Number of successful removals after relocking");
 511 static long zap_bucket_fail;
 512 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 513 static long zap_bucket_fail2;
 514 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 515 static long cache_lock_vnodes_cel_3_failures;
 516 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
 517     "Number of times 3-way vnode locking failed");
 518
 519 static void cache_zap_locked(struct namecache *ncp);
 520 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
 521     char **freebuf, size_t *buflen);
 522 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
 523     char **retbuf, size_t *buflen, size_t addend);
 524 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
 525     char **retbuf, size_t *buflen);
 526 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
 527     char **retbuf, size_t *len, size_t addend);
 528
 529 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 530
 531 static inline void
 532 cache_assert_vlp_locked(struct mtx *vlp)
 533 {
 534
 535         if (vlp != NULL)
 536                 mtx_assert(vlp, MA_OWNED);
 537 }
 538
 539 static inline void
 540 cache_assert_vnode_locked(struct vnode *vp)
 541 {
 542         struct mtx *vlp;
 543
 544         vlp = VP2VNODELOCK(vp);
 545         cache_assert_vlp_locked(vlp);
 546 }
 547
 548 /*
 549  * TODO: With the value stored we can do better than computing the hash based
 550  * on the address. The choice of FNV should also be revisited.
 551  */
 552 static void
 553 cache_prehash(struct vnode *vp)
 554 {
 555
 556         vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 557 }
 558
 559 static uint32_t
 560 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 561 {
 562
 563         return (fnv_32_buf(name, len, dvp->v_nchash));
 564 }
 565
 566 static inline struct nchashhead *
 567 NCP2BUCKET(struct namecache *ncp)
 568 {
 569         uint32_t hash;
 570
 571         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 572         return (NCHHASH(hash));
 573 }
 574
 575 static inline struct mtx *
 576 NCP2BUCKETLOCK(struct namecache *ncp)
 577 {
 578         uint32_t hash;
 579
 580         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
 581         return (HASH2BUCKETLOCK(hash));
 582 }
 583
 584 #ifdef INVARIANTS
 585 static void
 586 cache_assert_bucket_locked(struct namecache *ncp)
 587 {
 588         struct mtx *blp;
 589
 590         blp = NCP2BUCKETLOCK(ncp);
 591         mtx_assert(blp, MA_OWNED);
 592 }
 593
 594 static void
 595 cache_assert_bucket_unlocked(struct namecache *ncp)
 596 {
 597         struct mtx *blp;
 598
 599         blp = NCP2BUCKETLOCK(ncp);
 600         mtx_assert(blp, MA_NOTOWNED);
 601 }
 602 #else
 603 #define cache_assert_bucket_locked(x) do { } while (0)
 604 #define cache_assert_bucket_unlocked(x) do { } while (0)
 605 #endif
 606
 607 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y))
 608 static void
 609 _cache_sort_vnodes(void **p1, void **p2)
 610 {
 611         void *tmp;
 612
 613         MPASS(*p1 != NULL || *p2 != NULL);
 614
 615         if (*p1 > *p2) {
 616                 tmp = *p2;
 617                 *p2 = *p1;
 618                 *p1 = tmp;
 619         }
 620 }
 621
 622 static void
 623 cache_lock_all_buckets(void)
 624 {
 625         u_int i;
 626
 627         for (i = 0; i < numbucketlocks; i++)
 628                 mtx_lock(&bucketlocks[i]);
 629 }
 630
 631 static void
 632 cache_unlock_all_buckets(void)
 633 {
 634         u_int i;
 635
 636         for (i = 0; i < numbucketlocks; i++)
 637                 mtx_unlock(&bucketlocks[i]);
 638 }
 639
 640 static void
 641 cache_lock_all_vnodes(void)
 642 {
 643         u_int i;
 644
 645         for (i = 0; i < numvnodelocks; i++)
 646                 mtx_lock(&vnodelocks[i]);
 647 }
 648
 649 static void
 650 cache_unlock_all_vnodes(void)
 651 {
 652         u_int i;
 653
 654         for (i = 0; i < numvnodelocks; i++)
 655                 mtx_unlock(&vnodelocks[i]);
 656 }
 657
 658 static int
 659 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 660 {
 661
 662         cache_sort_vnodes(&vlp1, &vlp2);
 663
 664         if (vlp1 != NULL) {
 665                 if (!mtx_trylock(vlp1))
 666                         return (EAGAIN);
 667         }
 668         if (!mtx_trylock(vlp2)) {
 669                 if (vlp1 != NULL)
 670                         mtx_unlock(vlp1);
 671                 return (EAGAIN);
 672         }
 673
 674         return (0);
 675 }
 676
 677 static void
 678 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 679 {
 680
 681         MPASS(vlp1 != NULL || vlp2 != NULL);
 682         MPASS(vlp1 <= vlp2);
 683
 684         if (vlp1 != NULL)
 685                 mtx_lock(vlp1);
 686         if (vlp2 != NULL)
 687                 mtx_lock(vlp2);
 688 }
 689
 690 static void
 691 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 692 {
 693
 694         MPASS(vlp1 != NULL || vlp2 != NULL);
 695
 696         if (vlp1 != NULL)
 697                 mtx_unlock(vlp1);
 698         if (vlp2 != NULL)
 699                 mtx_unlock(vlp2);
 700 }
 701
 702 static int
 703 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 704 {
 705         struct nchstats snap;
 706
 707         if (req->oldptr == NULL)
 708                 return (SYSCTL_OUT(req, 0, sizeof(snap)));
 709
 710         snap = nchstats;
 711         snap.ncs_goodhits = counter_u64_fetch(numposhits);
 712         snap.ncs_neghits = counter_u64_fetch(numneghits);
 713         snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 714             counter_u64_fetch(numnegzaps);
 715         snap.ncs_miss = counter_u64_fetch(nummisszap) +
 716             counter_u64_fetch(nummiss);
 717
 718         return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 719 }
 720 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
 721     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
 722     "VFS cache effectiveness statistics");
 723
 724 static void
 725 cache_recalc_neg_min(u_int val)
 726 {
 727
 728         neg_min = (ncsize * val) / 100;
 729 }
 730
 731 static int
 732 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 733 {
 734         u_int val;
 735         int error;
 736
 737         val = ncnegminpct;
 738         error = sysctl_handle_int(oidp, &val, 0, req);
 739         if (error != 0 || req->newptr == NULL)
 740                 return (error);
 741
 742         if (val == ncnegminpct)
 743                 return (0);
 744         if (val < 0 || val > 99)
 745                 return (EINVAL);
 746         ncnegminpct = val;
 747         cache_recalc_neg_min(val);
 748         return (0);
 749 }
 750
 751 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
 752     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
 753     "I", "Negative entry \% of namecahe capacity above which automatic eviction is allowed");
 754
 755 #ifdef DIAGNOSTIC
 756 /*
 757  * Grab an atomic snapshot of the name cache hash chain lengths
 758  */
 759 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
 760     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
 761     "hash table stats");
 762
 763 static int
 764 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 765 {
 766         struct nchashhead *ncpp;
 767         struct namecache *ncp;
 768         int i, error, n_nchash, *cntbuf;
 769
 770 retry:
 771         n_nchash = nchash + 1;  /* nchash is max index, not count */
 772         if (req->oldptr == NULL)
 773                 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 774         cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
 775         cache_lock_all_buckets();
 776         if (n_nchash != nchash + 1) {
 777                 cache_unlock_all_buckets();
 778                 free(cntbuf, M_TEMP);
 779                 goto retry;
 780         }
 781         /* Scan hash tables counting entries */
 782         for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 783                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 784                         cntbuf[i]++;
 785         cache_unlock_all_buckets();
 786         for (error = 0, i = 0; i < n_nchash; i++)
 787                 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 788                         break;
 789         free(cntbuf, M_TEMP);
 790         return (error);
 791 }
 792 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
 793     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
 794     "nchash chain lengths");
 795
 796 static int
 797 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 798 {
 799         int error;
 800         struct nchashhead *ncpp;
 801         struct namecache *ncp;
 802         int n_nchash;
 803         int count, maxlength, used, pct;
 804
 805         if (!req->oldptr)
 806                 return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 807
 808         cache_lock_all_buckets();
 809         n_nchash = nchash + 1;  /* nchash is max index, not count */
 810         used = 0;
 811         maxlength = 0;
 812
 813         /* Scan hash tables for applicable entries */
 814         for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 815                 count = 0;
 816                 CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 817                         count++;
 818                 }
 819                 if (count)
 820                         used++;
 821                 if (maxlength < count)
 822                         maxlength = count;
 823         }
 824         n_nchash = nchash + 1;
 825         cache_unlock_all_buckets();
 826         pct = (used * 100) / (n_nchash / 100);
 827         error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 828         if (error)
 829                 return (error);
 830         error = SYSCTL_OUT(req, &used, sizeof(used));
 831         if (error)
 832                 return (error);
 833         error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 834         if (error)
 835                 return (error);
 836         error = SYSCTL_OUT(req, &pct, sizeof(pct));
 837         if (error)
 838                 return (error);
 839         return (0);
 840 }
 841 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
 842     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
 843     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 844 #endif
 845
 846 /*
 847  * Negative entries management
 848  *
 849  * Various workloads create plenty of negative entries and barely use them
 850  * afterwards. Moreover malicious users can keep performing bogus lookups
 851  * adding even more entries. For example "make tinderbox" as of writing this
 852  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
 853  * negative.
 854  *
 855  * As such, a rather aggressive eviction method is needed. The currently
 856  * employed method is a placeholder.
 857  *
 858  * Entries are split over numneglists separate lists, each of which is further
 859  * split into hot and cold entries. Entries get promoted after getting a hit.
 860  * Eviction happens on addition of new entry.
 861  */
 862 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 863     "Name cache negative entry statistics");
 864
 865 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
 866     "Number of negative cache entries");
 867
 868 static COUNTER_U64_DEFINE_EARLY(neg_created);
 869 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
 870     "Number of created negative entries");
 871
 872 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 873 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
 874     "Number of evicted negative entries");
 875
 876 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 877 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
 878     &neg_evict_skipped_empty,
 879     "Number of times evicting failed due to lack of entries");
 880
 881 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 882 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
 883     &neg_evict_skipped_missed,
 884     "Number of times evicting failed due to target entry disappearing");
 885
 886 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 887 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
 888     &neg_evict_skipped_contended,
 889     "Number of times evicting failed due to contention");
 890
 891 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
 892     "Number of cache hits (negative)");
 893
 894 static int
 895 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 896 {
 897         int i, out;
 898
 899         out = 0;
 900         for (i = 0; i < numneglists; i++)
 901                 out += neglists[i].nl_hotnum;
 902
 903         return (SYSCTL_OUT(req, &out, sizeof(out)));
 904 }
 905 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
 906     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
 907     "Number of hot negative entries");
 908
 909 static void
 910 cache_neg_init(struct namecache *ncp)
 911 {
 912         struct negstate *ns;
 913
 914         ncp->nc_flag |= NCF_NEGATIVE;
 915         ns = NCP2NEGSTATE(ncp);
 916         ns->neg_flag = 0;
 917         ns->neg_hit = 0;
 918         counter_u64_add(neg_created, 1);
 919 }
 920
 921 #define CACHE_NEG_PROMOTION_THRESH 2
 922
 923 static bool
 924 cache_neg_hit_prep(struct namecache *ncp)
 925 {
 926         struct negstate *ns;
 927         u_char n;
 928
 929         ns = NCP2NEGSTATE(ncp);
 930         n = atomic_load_char(&ns->neg_hit);
 931         for (;;) {
 932                 if (n >= CACHE_NEG_PROMOTION_THRESH)
 933                         return (false);
 934                 if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 935                         break;
 936         }
 937         return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 938 }
 939
 940 /*
 941  * Nothing to do here but it is provided for completeness as some
 942  * cache_neg_hit_prep callers may end up returning without even
 943  * trying to promote.
 944  */
 945 #define cache_neg_hit_abort(ncp)        do { } while (0)
 946
 947 static void
 948 cache_neg_hit_finish(struct namecache *ncp)
 949 {
 950
 951         SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 952         counter_u64_add(numneghits, 1);
 953 }
 954
 955 /*
 956  * Move a negative entry to the hot list.
 957  */
 958 static void
 959 cache_neg_promote_locked(struct namecache *ncp)
 960 {
 961         struct neglist *nl;
 962         struct negstate *ns;
 963
 964         ns = NCP2NEGSTATE(ncp);
 965         nl = NCP2NEGLIST(ncp);
 966         mtx_assert(&nl->nl_lock, MA_OWNED);
 967         if ((ns->neg_flag & NEG_HOT) == 0) {
 968                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 969                 TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 970                 nl->nl_hotnum++;
 971                 ns->neg_flag |= NEG_HOT;
 972         }
 973 }
 974
 975 /*
 976  * Move a hot negative entry to the cold list.
 977  */
 978 static void
 979 cache_neg_demote_locked(struct namecache *ncp)
 980 {
 981         struct neglist *nl;
 982         struct negstate *ns;
 983
 984         ns = NCP2NEGSTATE(ncp);
 985         nl = NCP2NEGLIST(ncp);
 986         mtx_assert(&nl->nl_lock, MA_OWNED);
 987         MPASS(ns->neg_flag & NEG_HOT);
 988         TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 989         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 990         nl->nl_hotnum--;
 991         ns->neg_flag &= ~NEG_HOT;
 992         atomic_store_char(&ns->neg_hit, 0);
 993 }
 994
 995 /*
 996  * Move a negative entry to the hot list if it matches the lookup.
 997  *
 998  * We have to take locks, but they may be contended and in the worst
 999  * case we may need to go off CPU. We don't want to spin within the
1000  * smr section and we can't block with it. Exiting the section means
1001  * the found entry could have been evicted. We are going to look it
1002  * up again.
1003  */
1004 static bool
1005 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
1006     struct namecache *oncp, uint32_t hash)
1007 {
1008         struct namecache *ncp;
1009         struct neglist *nl;
1010         u_char nc_flag;
1011
1012         nl = NCP2NEGLIST(oncp);
1013
1014         mtx_lock(&nl->nl_lock);
1015         /*
1016          * For hash iteration.
1017          */
1018         vfs_smr_enter();
1019
1020         /*
1021          * Avoid all surprises by only succeeding if we got the same entry and
1022          * bailing completely otherwise.
1023          * XXX There are no provisions to keep the vnode around, meaning we may
1024          * end up promoting a negative entry for a *new* vnode and returning
1025          * ENOENT on its account. This is the error we want to return anyway
1026          * and promotion is harmless.
1027          *
1028          * In particular at this point there can be a new ncp which matches the
1029          * search but hashes to a different neglist.
1030          */
1031         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1032                 if (ncp == oncp)
1033                         break;
1034         }
1035
1036         /*
1037          * No match to begin with.
1038          */
1039         if (__predict_false(ncp == NULL)) {
1040                 goto out_abort;
1041         }
1042
1043         /*
1044          * The newly found entry may be something different...
1045          */
1046         if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1047             !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
1048                 goto out_abort;
1049         }
1050
1051         /*
1052          * ... and not even negative.
1053          */
1054         nc_flag = atomic_load_char(&ncp->nc_flag);
1055         if ((nc_flag & NCF_NEGATIVE) == 0) {
1056                 goto out_abort;
1057         }
1058
1059         if (__predict_false(!cache_ncp_canuse(ncp))) {
1060                 goto out_abort;
1061         }
1062
1063         cache_neg_promote_locked(ncp);
1064         cache_neg_hit_finish(ncp);
1065         vfs_smr_exit();
1066         mtx_unlock(&nl->nl_lock);
1067         return (true);
1068 out_abort:
1069         vfs_smr_exit();
1070         mtx_unlock(&nl->nl_lock);
1071         return (false);
1072 }
1073
1074 static void
1075 cache_neg_promote(struct namecache *ncp)
1076 {
1077         struct neglist *nl;
1078
1079         nl = NCP2NEGLIST(ncp);
1080         mtx_lock(&nl->nl_lock);
1081         cache_neg_promote_locked(ncp);
1082         mtx_unlock(&nl->nl_lock);
1083 }
1084
1085 static void
1086 cache_neg_insert(struct namecache *ncp)
1087 {
1088         struct neglist *nl;
1089
1090         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1091         cache_assert_bucket_locked(ncp);
1092         nl = NCP2NEGLIST(ncp);
1093         mtx_lock(&nl->nl_lock);
1094         TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
1095         mtx_unlock(&nl->nl_lock);
1096         atomic_add_long(&numneg, 1);
1097 }
1098
1099 static void
1100 cache_neg_remove(struct namecache *ncp)
1101 {
1102         struct neglist *nl;
1103         struct negstate *ns;
1104
1105         cache_assert_bucket_locked(ncp);
1106         nl = NCP2NEGLIST(ncp);
1107         ns = NCP2NEGSTATE(ncp);
1108         mtx_lock(&nl->nl_lock);
1109         if ((ns->neg_flag & NEG_HOT) != 0) {
1110                 TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
1111                 nl->nl_hotnum--;
1112         } else {
1113                 TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
1114         }
1115         mtx_unlock(&nl->nl_lock);
1116         atomic_subtract_long(&numneg, 1);
1117 }
1118
1119 static struct neglist *
1120 cache_neg_evict_select_list(void)
1121 {
1122         struct neglist *nl;
1123         u_int c;
1124
1125         c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
1126         nl = &neglists[c % numneglists];
1127         if (!mtx_trylock(&nl->nl_evict_lock)) {
1128                 counter_u64_add(neg_evict_skipped_contended, 1);
1129                 return (NULL);
1130         }
1131         return (nl);
1132 }
1133
1134 static struct namecache *
1135 cache_neg_evict_select_entry(struct neglist *nl)
1136 {
1137         struct namecache *ncp, *lncp;
1138         struct negstate *ns, *lns;
1139         int i;
1140
1141         mtx_assert(&nl->nl_evict_lock, MA_OWNED);
1142         mtx_assert(&nl->nl_lock, MA_OWNED);
1143         ncp = TAILQ_FIRST(&nl->nl_list);
1144         if (ncp == NULL)
1145                 return (NULL);
1146         lncp = ncp;
1147         lns = NCP2NEGSTATE(lncp);
1148         for (i = 1; i < 4; i++) {
1149                 ncp = TAILQ_NEXT(ncp, nc_dst);
1150                 if (ncp == NULL)
1151                         break;
1152                 ns = NCP2NEGSTATE(ncp);
1153                 if (ns->neg_hit < lns->neg_hit) {
1154                         lncp = ncp;
1155                         lns = ns;
1156                 }
1157         }
1158         return (lncp);
1159 }
1160
1161 static bool
1162 cache_neg_evict(void)
1163 {
1164         struct namecache *ncp, *ncp2;
1165         struct neglist *nl;
1166         struct negstate *ns;
1167         struct vnode *dvp;
1168         struct mtx *dvlp;
1169         struct mtx *blp;
1170         uint32_t hash;
1171         u_char nlen;
1172         bool evicted;
1173
1174         nl = cache_neg_evict_select_list();
1175         if (nl == NULL) {
1176                 return (false);
1177         }
1178
1179         mtx_lock(&nl->nl_lock);
1180         ncp = TAILQ_FIRST(&nl->nl_hotlist);
1181         if (ncp != NULL) {
1182                 cache_neg_demote_locked(ncp);
1183         }
1184         ncp = cache_neg_evict_select_entry(nl);
1185         if (ncp == NULL) {
1186                 counter_u64_add(neg_evict_skipped_empty, 1);
1187                 mtx_unlock(&nl->nl_lock);
1188                 mtx_unlock(&nl->nl_evict_lock);
1189                 return (false);
1190         }
1191         ns = NCP2NEGSTATE(ncp);
1192         nlen = ncp->nc_nlen;
1193         dvp = ncp->nc_dvp;
1194         hash = cache_get_hash(ncp->nc_name, nlen, dvp);
1195         dvlp = VP2VNODELOCK(dvp);
1196         blp = HASH2BUCKETLOCK(hash);
1197         mtx_unlock(&nl->nl_lock);
1198         mtx_unlock(&nl->nl_evict_lock);
1199         mtx_lock(dvlp);
1200         mtx_lock(blp);
1201         /*
1202          * Note that since all locks were dropped above, the entry may be
1203          * gone or reallocated to be something else.
1204          */
1205         CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
1206                 if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
1207                     ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
1208                         break;
1209         }
1210         if (ncp2 == NULL) {
1211                 counter_u64_add(neg_evict_skipped_missed, 1);
1212                 ncp = NULL;
1213                 evicted = false;
1214         } else {
1215                 MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
1216                 MPASS(blp == NCP2BUCKETLOCK(ncp));
1217                 SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
1218                     ncp->nc_name);
1219                 cache_zap_locked(ncp);
1220                 counter_u64_add(neg_evicted, 1);
1221                 evicted = true;
1222         }
1223         mtx_unlock(blp);
1224         mtx_unlock(dvlp);
1225         if (ncp != NULL)
1226                 cache_free(ncp);
1227         return (evicted);
1228 }
1229
1230 /*
1231  * Maybe evict a negative entry to create more room.
1232  *
1233  * The ncnegfactor parameter limits what fraction of the total count
1234  * can comprise of negative entries. However, if the cache is just
1235  * warming up this leads to excessive evictions.  As such, ncnegminpct
1236  * (recomputed to neg_min) dictates whether the above should be
1237  * applied.
1238  *
1239  * Try evicting if the cache is close to full capacity regardless of
1240  * other considerations.
1241  */
1242 static bool
1243 cache_neg_evict_cond(u_long lnumcache)
1244 {
1245         u_long lnumneg;
1246
1247         if (ncsize - 1000 < lnumcache)
1248                 goto out_evict;
1249         lnumneg = atomic_load_long(&numneg);
1250         if (lnumneg < neg_min)
1251                 return (false);
1252         if (lnumneg * ncnegfactor < lnumcache)
1253                 return (false);
1254 out_evict:
1255         return (cache_neg_evict());
1256 }
1257
1258 /*
1259  * cache_zap_locked():
1260  *
1261  *   Removes a namecache entry from cache, whether it contains an actual
1262  *   pointer to a vnode or if it is just a negative cache entry.
1263  */
1264 static void
1265 cache_zap_locked(struct namecache *ncp)
1266 {
1267         struct nchashhead *ncpp;
1268
1269         if (!(ncp->nc_flag & NCF_NEGATIVE))
1270                 cache_assert_vnode_locked(ncp->nc_vp);
1271         cache_assert_vnode_locked(ncp->nc_dvp);
1272         cache_assert_bucket_locked(ncp);
1273
1274         cache_ncp_invalidate(ncp);
1275
1276         ncpp = NCP2BUCKET(ncp);
1277         CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
1278         if (!(ncp->nc_flag & NCF_NEGATIVE)) {
1279                 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
1280                     ncp->nc_name, ncp->nc_vp);
1281                 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
1282                 if (ncp == ncp->nc_vp->v_cache_dd) {
1283                         vn_seqc_write_begin_unheld(ncp->nc_vp);
1284                         ncp->nc_vp->v_cache_dd = NULL;
1285                         vn_seqc_write_end(ncp->nc_vp);
1286                 }
1287         } else {
1288                 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
1289                     ncp->nc_name);
1290                 cache_neg_remove(ncp);
1291         }
1292         if (ncp->nc_flag & NCF_ISDOTDOT) {
1293                 if (ncp == ncp->nc_dvp->v_cache_dd) {
1294                         vn_seqc_write_begin_unheld(ncp->nc_dvp);
1295                         ncp->nc_dvp->v_cache_dd = NULL;
1296                         vn_seqc_write_end(ncp->nc_dvp);
1297                 }
1298         } else {
1299                 LIST_REMOVE(ncp, nc_src);
1300                 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
1301                         ncp->nc_flag |= NCF_DVDROP;
1302                         counter_u64_add(numcachehv, -1);
1303                 }
1304         }
1305         atomic_subtract_long(&numcache, 1);
1306 }
1307
1308 static void
1309 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
1310 {
1311         struct mtx *blp;
1312
1313         MPASS(ncp->nc_dvp == vp);
1314         MPASS(ncp->nc_flag & NCF_NEGATIVE);
1315         cache_assert_vnode_locked(vp);
1316
1317         blp = NCP2BUCKETLOCK(ncp);
1318         mtx_lock(blp);
1319         cache_zap_locked(ncp);
1320         mtx_unlock(blp);
1321 }
1322
1323 static bool
1324 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
1325     struct mtx **vlpp)
1326 {
1327         struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1328         struct mtx *blp;
1329
1330         MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1331         cache_assert_vnode_locked(vp);
1332
1333         if (ncp->nc_flag & NCF_NEGATIVE) {
1334                 if (*vlpp != NULL) {
1335                         mtx_unlock(*vlpp);
1336                         *vlpp = NULL;
1337                 }
1338                 cache_zap_negative_locked_vnode_kl(ncp, vp);
1339                 return (true);
1340         }
1341
1342         pvlp = VP2VNODELOCK(vp);
1343         blp = NCP2BUCKETLOCK(ncp);
1344         vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1345         vlp2 = VP2VNODELOCK(ncp->nc_vp);
1346
1347         if (*vlpp == vlp1 || *vlpp == vlp2) {
1348                 to_unlock = *vlpp;
1349                 *vlpp = NULL;
1350         } else {
1351                 if (*vlpp != NULL) {
1352                         mtx_unlock(*vlpp);
1353                         *vlpp = NULL;
1354                 }
1355                 cache_sort_vnodes(&vlp1, &vlp2);
1356                 if (vlp1 == pvlp) {
1357                         mtx_lock(vlp2);
1358                         to_unlock = vlp2;
1359                 } else {
1360                         if (!mtx_trylock(vlp1))
1361                                 goto out_relock;
1362                         to_unlock = vlp1;
1363                 }
1364         }
1365         mtx_lock(blp);
1366         cache_zap_locked(ncp);
1367         mtx_unlock(blp);
1368         if (to_unlock != NULL)
1369                 mtx_unlock(to_unlock);
1370         return (true);
1371
1372 out_relock:
1373         mtx_unlock(vlp2);
1374         mtx_lock(vlp1);
1375         mtx_lock(vlp2);
1376         MPASS(*vlpp == NULL);
1377         *vlpp = vlp1;
1378         return (false);
1379 }
1380
1381 /*
1382  * If trylocking failed we can get here. We know enough to take all needed locks
1383  * in the right order and re-lookup the entry.
1384  */
1385 static int
1386 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1387     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1388     struct mtx *blp)
1389 {
1390         struct namecache *rncp;
1391
1392         cache_assert_bucket_unlocked(ncp);
1393
1394         cache_sort_vnodes(&dvlp, &vlp);
1395         cache_lock_vnodes(dvlp, vlp);
1396         mtx_lock(blp);
1397         CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1398                 if (rncp == ncp && rncp->nc_dvp == dvp &&
1399                     rncp->nc_nlen == cnp->cn_namelen &&
1400                     !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1401                         break;
1402         }
1403         if (rncp != NULL) {
1404                 cache_zap_locked(rncp);
1405                 mtx_unlock(blp);
1406                 cache_unlock_vnodes(dvlp, vlp);
1407                 counter_u64_add(zap_bucket_relock_success, 1);
1408                 return (0);
1409         }
1410
1411         mtx_unlock(blp);
1412         cache_unlock_vnodes(dvlp, vlp);
1413         return (EAGAIN);
1414 }
1415
1416 static int __noinline
1417 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
1418     uint32_t hash, struct mtx *blp)
1419 {
1420         struct mtx *dvlp, *vlp;
1421         struct vnode *dvp;
1422
1423         cache_assert_bucket_locked(ncp);
1424
1425         dvlp = VP2VNODELOCK(ncp->nc_dvp);
1426         vlp = NULL;
1427         if (!(ncp->nc_flag & NCF_NEGATIVE))
1428                 vlp = VP2VNODELOCK(ncp->nc_vp);
1429         if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1430                 cache_zap_locked(ncp);
1431                 mtx_unlock(blp);
1432                 cache_unlock_vnodes(dvlp, vlp);
1433                 return (0);
1434         }
1435
1436         dvp = ncp->nc_dvp;
1437         mtx_unlock(blp);
1438         return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1439 }
1440
1441 static __noinline int
1442 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
1443 {
1444         struct namecache *ncp;
1445         struct mtx *blp;
1446         struct mtx *dvlp, *dvlp2;
1447         uint32_t hash;
1448         int error;
1449
1450         if (cnp->cn_namelen == 2 &&
1451             cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1452                 dvlp = VP2VNODELOCK(dvp);
1453                 dvlp2 = NULL;
1454                 mtx_lock(dvlp);
1455 retry_dotdot:
1456                 ncp = dvp->v_cache_dd;
1457                 if (ncp == NULL) {
1458                         mtx_unlock(dvlp);
1459                         if (dvlp2 != NULL)
1460                                 mtx_unlock(dvlp2);
1461                         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1462                         return (0);
1463                 }
1464                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1465                         if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
1466                                 goto retry_dotdot;
1467                         MPASS(dvp->v_cache_dd == NULL);
1468                         mtx_unlock(dvlp);
1469                         if (dvlp2 != NULL)
1470                                 mtx_unlock(dvlp2);
1471                         cache_free(ncp);
1472                 } else {
1473                         vn_seqc_write_begin(dvp);
1474                         dvp->v_cache_dd = NULL;
1475                         vn_seqc_write_end(dvp);
1476                         mtx_unlock(dvlp);
1477                         if (dvlp2 != NULL)
1478                                 mtx_unlock(dvlp2);
1479                 }
1480                 SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1481                 return (1);
1482         }
1483
1484         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1485         blp = HASH2BUCKETLOCK(hash);
1486 retry:
1487         if (CK_SLIST_EMPTY(NCHHASH(hash)))
1488                 goto out_no_entry;
1489
1490         mtx_lock(blp);
1491
1492         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1493                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1494                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1495                         break;
1496         }
1497
1498         if (ncp == NULL) {
1499                 mtx_unlock(blp);
1500                 goto out_no_entry;
1501         }
1502
1503         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1504         if (__predict_false(error != 0)) {
1505                 zap_bucket_fail++;
1506                 goto retry;
1507         }
1508         counter_u64_add(numposzaps, 1);
1509         SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
1510         cache_free(ncp);
1511         return (1);
1512 out_no_entry:
1513         counter_u64_add(nummisszap, 1);
1514         SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
1515         return (0);
1516 }
1517
1518 static int __noinline
1519 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1520     struct timespec *tsp, int *ticksp)
1521 {
1522         int ltype;
1523
1524         *vpp = dvp;
1525         counter_u64_add(dothits, 1);
1526         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1527         if (tsp != NULL)
1528                 timespecclear(tsp);
1529         if (ticksp != NULL)
1530                 *ticksp = ticks;
1531         vrefact(*vpp);
1532         /*
1533          * When we lookup "." we still can be asked to lock it
1534          * differently...
1535          */
1536         ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1537         if (ltype != VOP_ISLOCKED(*vpp)) {
1538                 if (ltype == LK_EXCLUSIVE) {
1539                         vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1540                         if (VN_IS_DOOMED((*vpp))) {
1541                                 /* forced unmount */
1542                                 vrele(*vpp);
1543                                 *vpp = NULL;
1544                                 return (ENOENT);
1545                         }
1546                 } else
1547                         vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1548         }
1549         return (-1);
1550 }
1551
1552 static int __noinline
1553 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1554     struct timespec *tsp, int *ticksp)
1555 {
1556         struct namecache_ts *ncp_ts;
1557         struct namecache *ncp;
1558         struct mtx *dvlp;
1559         enum vgetstate vs;
1560         int error, ltype;
1561         bool whiteout;
1562
1563         MPASS((cnp->cn_flags & ISDOTDOT) != 0);
1564
1565         if ((cnp->cn_flags & MAKEENTRY) == 0) {
1566                 cache_remove_cnp(dvp, cnp);
1567                 return (0);
1568         }
1569
1570         counter_u64_add(dotdothits, 1);
1571 retry:
1572         dvlp = VP2VNODELOCK(dvp);
1573         mtx_lock(dvlp);
1574         ncp = dvp->v_cache_dd;
1575         if (ncp == NULL) {
1576                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL);
1577                 mtx_unlock(dvlp);
1578                 return (0);
1579         }
1580         if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1581                 if (ncp->nc_flag & NCF_NEGATIVE)
1582                         *vpp = NULL;
1583                 else
1584                         *vpp = ncp->nc_vp;
1585         } else
1586                 *vpp = ncp->nc_dvp;
1587         if (*vpp == NULL)
1588                 goto negative_success;
1589         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
1590         cache_out_ts(ncp, tsp, ticksp);
1591         if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1592             NCF_DTS && tsp != NULL) {
1593                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1594                 *tsp = ncp_ts->nc_dotdottime;
1595         }
1596
1597         MPASS(dvp != *vpp);
1598         ltype = VOP_ISLOCKED(dvp);
1599         VOP_UNLOCK(dvp);
1600         vs = vget_prep(*vpp);
1601         mtx_unlock(dvlp);
1602         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1603         vn_lock(dvp, ltype | LK_RETRY);
1604         if (VN_IS_DOOMED(dvp)) {
1605                 if (error == 0)
1606                         vput(*vpp);
1607                 *vpp = NULL;
1608                 return (ENOENT);
1609         }
1610         if (error) {
1611                 *vpp = NULL;
1612                 goto retry;
1613         }
1614         return (-1);
1615 negative_success:
1616         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1617                 if (cnp->cn_flags & ISLASTCN) {
1618                         counter_u64_add(numnegzaps, 1);
1619                         cache_zap_negative_locked_vnode_kl(ncp, dvp);
1620                         mtx_unlock(dvlp);
1621                         cache_free(ncp);
1622                         return (0);
1623                 }
1624         }
1625
1626         whiteout = (ncp->nc_flag & NCF_WHITE);
1627         cache_out_ts(ncp, tsp, ticksp);
1628         if (cache_neg_hit_prep(ncp))
1629                 cache_neg_promote(ncp);
1630         else
1631                 cache_neg_hit_finish(ncp);
1632         mtx_unlock(dvlp);
1633         if (whiteout)
1634                 cnp->cn_flags |= ISWHITEOUT;
1635         return (ENOENT);
1636 }
1637
1638 /**
1639  * Lookup a name in the name cache
1640  *
1641  * # Arguments
1642  *
1643  * - dvp:       Parent directory in which to search.
1644  * - vpp:       Return argument.  Will contain desired vnode on cache hit.
1645  * - cnp:       Parameters of the name search.  The most interesting bits of
1646  *              the cn_flags field have the following meanings:
1647  *      - MAKEENTRY:    If clear, free an entry from the cache rather than look
1648  *                      it up.
1649  *      - ISDOTDOT:     Must be set if and only if cn_nameptr == ".."
1650  * - tsp:       Return storage for cache timestamp.  On a successful (positive
1651  *              or negative) lookup, tsp will be filled with any timespec that
1652  *              was stored when this cache entry was created.  However, it will
1653  *              be clear for "." entries.
1654  * - ticks:     Return storage for alternate cache timestamp.  On a successful
1655  *              (positive or negative) lookup, it will contain the ticks value
1656  *              that was current when the cache entry was created, unless cnp
1657  *              was ".".
1658  *
1659  * Either both tsp and ticks have to be provided or neither of them.
1660  *
1661  * # Returns
1662  *
1663  * - -1:        A positive cache hit.  vpp will contain the desired vnode.
1664  * - ENOENT:    A negative cache hit, or dvp was recycled out from under us due
1665  *              to a forced unmount.  vpp will not be modified.  If the entry
1666  *              is a whiteout, then the ISWHITEOUT flag will be set in
1667  *              cnp->cn_flags.
1668  * - 0:         A cache miss.  vpp will not be modified.
1669  *
1670  * # Locking
1671  *
1672  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1673  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1674  * lock is not recursively acquired.
1675  */
/*
 * Locked variant of the hash chain lookup, used by cache_lookup() whenever
 * its lockless attempt cannot be completed.  The chain is walked with the
 * bucket mutex held.
 *
 * The caller must not pass a ".." lookup and must have MAKEENTRY or
 * NC_KEEPPOSENTRY set in cnp->cn_flags (both asserted below).
 *
 * Returns 0 on a miss (including a negative entry zapped on behalf of a
 * CREATE of the last component), -1 on a positive hit with *vpp set, and
 * ENOENT on a negative hit.
 */
1676 static int __noinline
1677 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1678     struct timespec *tsp, int *ticksp)
1679 {
1680         struct namecache *ncp;
1681         struct mtx *blp;
1682         uint32_t hash;
1683         enum vgetstate vs;
1684         int error;
1685         bool whiteout;
1686
1687         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1688         MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
1689
             /* Each retry recomputes the hash and takes the bucket lock again. */
1690 retry:
1691         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1692         blp = HASH2BUCKETLOCK(hash);
1693         mtx_lock(blp);
1694
             /* Match on parent directory, name length and name bytes. */
1695         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1696                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1697                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1698                         break;
1699         }
1700
1701         if (__predict_false(ncp == NULL)) {
1702                 mtx_unlock(blp);
1703                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1704                     NULL);
1705                 counter_u64_add(nummiss, 1);
1706                 return (0);
1707         }
1708
             /* The bucket lock is still held when jumping to negative_success. */
1709         if (ncp->nc_flag & NCF_NEGATIVE)
1710                 goto negative_success;
1711
1712         counter_u64_add(numposhits, 1);
1713         *vpp = ncp->nc_vp;
1714         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1715         cache_out_ts(ncp, tsp, ticksp);
1716         MPASS(dvp != *vpp);
             /*
              * vget_prep() runs under the bucket lock; vget_finish() only
              * after it has been dropped.
              */
1717         vs = vget_prep(*vpp);
1718         mtx_unlock(blp);
1719         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1720         if (error) {
                     /* Could not obtain the vnode: clear *vpp and start over. */
1721                 *vpp = NULL;
1722                 goto retry;
1723         }
1724         return (-1);
1725 negative_success:
             /*
              * A CREATE of the last path component removes the negative entry
              * and reports a miss instead of ENOENT.
              */
1726         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1727                 if (cnp->cn_flags & ISLASTCN) {
1728                         counter_u64_add(numnegzaps, 1);
1729                         error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
1730                         if (__predict_false(error != 0)) {
                                     /*
                                      * NOTE(review): presumably blp is no
                                      * longer held on failure, since retry
                                      * locks it again -- confirm against
                                      * cache_zap_locked_bucket().
                                      */
1731                                 zap_bucket_fail2++;
1732                                 goto retry;
1733                         }
1734                         cache_free(ncp);
1735                         return (0);
1736                 }
1737         }
1738
             /* Sample the whiteout flag while the bucket lock is held. */
1739         whiteout = (ncp->nc_flag & NCF_WHITE);
1740         cache_out_ts(ncp, tsp, ticksp);
1741         if (cache_neg_hit_prep(ncp))
1742                 cache_neg_promote(ncp);
1743         else
1744                 cache_neg_hit_finish(ncp);
1745         mtx_unlock(blp);
1746         if (whiteout)
1747                 cnp->cn_flags |= ISWHITEOUT;
1748         return (ENOENT);
1749 }
1750
/*
 * Look up a name in the cache.
 *
 * "." and ".." are handed to cache_lookup_dot() and cache_lookup_dotdot().
 * If neither MAKEENTRY nor NC_KEEPPOSENTRY is set, any entry for the name is
 * removed with cache_remove_cnp() and a miss is reported.  Otherwise the
 * hash chain is searched inside a vfs_smr_enter()/vfs_smr_exit() section
 * without taking the bucket lock; whenever the entry found cannot be used
 * as-is, the lookup is redone by cache_lookup_fallback().
 *
 * tsp and ticksp must be both NULL or both non-NULL (asserted).
 *
 * Returns 0 on a miss, -1 on a positive hit (*vpp is set) and ENOENT on a
 * negative hit (ISWHITEOUT is set in cnp->cn_flags for whiteout entries).
 */
1751 int
1752 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1753     struct timespec *tsp, int *ticksp)
1754 {
1755         struct namecache *ncp;
1756         uint32_t hash;
1757         enum vgetstate vs;
1758         int error;
1759         bool whiteout, neg_promote;
1760         u_short nc_flag;
1761
1762         MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
1763
1764 #ifdef DEBUG_CACHE
             /* Caching disabled: also strip MAKEENTRY and report a miss. */
1765         if (__predict_false(!doingcache)) {
1766                 cnp->cn_flags &= ~MAKEENTRY;
1767                 return (0);
1768         }
1769 #endif
1770
1771         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1772                 if (cnp->cn_namelen == 1)
1773                         return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1774                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
1775                         return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
1776         }
1777
1778         MPASS((cnp->cn_flags & ISDOTDOT) == 0);
1779
1780         if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
1781                 cache_remove_cnp(dvp, cnp);
1782                 return (0);
1783         }
1784
1785         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1786         vfs_smr_enter();
1787
             /* Match on parent directory, name length and name bytes. */
1788         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1789                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1790                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1791                         break;
1792         }
1793
1794         if (__predict_false(ncp == NULL)) {
1795                 vfs_smr_exit();
1796                 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1797                     NULL);
1798                 counter_u64_add(nummiss, 1);
1799                 return (0);
1800         }
1801
1802         nc_flag = atomic_load_char(&ncp->nc_flag);
1803         if (nc_flag & NCF_NEGATIVE)
1804                 goto negative_success;
1805
1806         counter_u64_add(numposhits, 1);
1807         *vpp = ncp->nc_vp;
1808         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
1809         cache_out_ts(ncp, tsp, ticksp);
1810         MPASS(dvp != *vpp);
             /*
              * The entry is re-checked with cache_ncp_canuse() only after its
              * fields were read; if the check fails, *vpp is cleared and the
              * locked path redoes the lookup.
              */
1811         if (!cache_ncp_canuse(ncp)) {
1812                 vfs_smr_exit();
1813                 *vpp = NULL;
1814                 goto out_fallback;
1815         }
1816         vs = vget_prep_smr(*vpp);
1817         vfs_smr_exit();
             /* VGET_NONE: the vnode could not be prepared within SMR. */
1818         if (__predict_false(vs == VGET_NONE)) {
1819                 *vpp = NULL;
1820                 goto out_fallback;
1821         }
1822         error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1823         if (error) {
1824                 *vpp = NULL;
1825                 goto out_fallback;
1826         }
1827         return (-1);
1828 negative_success:
             /*
              * A CREATE of the last component is not handled locklessly;
              * cache_lookup_fallback() deals with the negative entry.
              */
1829         if (__predict_false(cnp->cn_nameiop == CREATE)) {
1830                 if (cnp->cn_flags & ISLASTCN) {
1831                         vfs_smr_exit();
1832                         goto out_fallback;
1833                 }
1834         }
1835
1836         cache_out_ts(ncp, tsp, ticksp);
1837         whiteout = (ncp->nc_flag & NCF_WHITE);
1838         neg_promote = cache_neg_hit_prep(ncp);
             /* Undo the hit preparation if the entry turns out to be unusable. */
1839         if (__predict_false(!cache_ncp_canuse(ncp))) {
1840                 cache_neg_hit_abort(ncp);
1841                 vfs_smr_exit();
1842                 goto out_fallback;
1843         }
1844         if (neg_promote) {
                     /* Promotion is attempted after leaving the SMR section. */
1845                 vfs_smr_exit();
1846                 if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
1847                         goto out_fallback;
1848         } else {
1849                 cache_neg_hit_finish(ncp);
1850                 vfs_smr_exit();
1851         }
1852         if (whiteout)
1853                 cnp->cn_flags |= ISWHITEOUT;
1854         return (ENOENT);
             /* Every path reaching this label has already left the SMR section. */
1855 out_fallback:
1856         return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
1857 }
1858
1859 struct celockstate {
1860         struct mtx *vlp[3];
1861         struct mtx *blp[2];
1862 };
1863 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1864 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1865
1866 static inline void
1867 cache_celockstate_init(struct celockstate *cel)
1868 {
1869
1870         bzero(cel, sizeof(*cel));
1871 }
1872
1873 static void
1874 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1875     struct vnode *dvp)
1876 {
1877         struct mtx *vlp1, *vlp2;
1878
1879         MPASS(cel->vlp[0] == NULL);
1880         MPASS(cel->vlp[1] == NULL);
1881         MPASS(cel->vlp[2] == NULL);
1882
1883         MPASS(vp != NULL || dvp != NULL);
1884
1885         vlp1 = VP2VNODELOCK(vp);
1886         vlp2 = VP2VNODELOCK(dvp);
1887         cache_sort_vnodes(&vlp1, &vlp2);
1888
1889         if (vlp1 != NULL) {
1890                 mtx_lock(vlp1);
1891                 cel->vlp[0] = vlp1;
1892         }
1893         mtx_lock(vlp2);
1894         cel->vlp[1] = vlp2;
1895 }
1896
1897 static void
1898 cache_unlock_vnodes_cel(struct celockstate *cel)
1899 {
1900
1901         MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1902
1903         if (cel->vlp[0] != NULL)
1904                 mtx_unlock(cel->vlp[0]);
1905         if (cel->vlp[1] != NULL)
1906                 mtx_unlock(cel->vlp[1]);
1907         if (cel->vlp[2] != NULL)
1908                 mtx_unlock(cel->vlp[2]);
1909 }
1910
1911 static bool
1912 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1913 {
1914         struct mtx *vlp;
1915         bool ret;
1916
1917         cache_assert_vlp_locked(cel->vlp[0]);
1918         cache_assert_vlp_locked(cel->vlp[1]);
1919         MPASS(cel->vlp[2] == NULL);
1920
1921         MPASS(vp != NULL);
1922         vlp = VP2VNODELOCK(vp);
1923
1924         ret = true;
1925         if (vlp >= cel->vlp[1]) {
1926                 mtx_lock(vlp);
1927         } else {
1928                 if (mtx_trylock(vlp))
1929                         goto out;
1930                 cache_lock_vnodes_cel_3_failures++;
1931                 cache_unlock_vnodes_cel(cel);
1932                 if (vlp < cel->vlp[0]) {
1933                         mtx_lock(vlp);
1934                         mtx_lock(cel->vlp[0]);
1935                         mtx_lock(cel->vlp[1]);
1936                 } else {
1937                         if (cel->vlp[0] != NULL)
1938                                 mtx_lock(cel->vlp[0]);
1939                         mtx_lock(vlp);
1940                         mtx_lock(cel->vlp[1]);
1941                 }
1942                 ret = false;
1943         }
1944 out:
1945         cel->vlp[2] = vlp;
1946         return (ret);
1947 }
1948
1949 static void
1950 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
1951     struct mtx *blp2)
1952 {
1953
1954         MPASS(cel->blp[0] == NULL);
1955         MPASS(cel->blp[1] == NULL);
1956
1957         cache_sort_vnodes(&blp1, &blp2);
1958
1959         if (blp1 != NULL) {
1960                 mtx_lock(blp1);
1961                 cel->blp[0] = blp1;
1962         }
1963         mtx_lock(blp2);
1964         cel->blp[1] = blp2;
1965 }
1966
1967 static void
1968 cache_unlock_buckets_cel(struct celockstate *cel)
1969 {
1970
1971         if (cel->blp[0] != NULL)
1972                 mtx_unlock(cel->blp[0]);
1973         mtx_unlock(cel->blp[1]);
1974 }
1975
1976 /*
1977  * Lock part of the cache affected by the insertion.
1978  *
1979  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1980  * However, insertion can result in removal of an old entry. In this
1981  * case we have an additional vnode and bucketlock pair to lock.
1982  *
1983  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1984  * preserving the locking order (smaller address first).
1985  */
1986 static void
1987 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1988     uint32_t hash)
1989 {
1990         struct namecache *ncp;
1991         struct mtx *blps[2];
1992
1993         blps[0] = HASH2BUCKETLOCK(hash);
1994         for (;;) {
1995                 blps[1] = NULL;
1996                 cache_lock_vnodes_cel(cel, dvp, vp);
1997                 if (vp == NULL || vp->v_type != VDIR)
1998                         break;
1999                 ncp = vp->v_cache_dd;
2000                 if (ncp == NULL)
2001                         break;
2002                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2003                         break;
2004                 MPASS(ncp->nc_dvp == vp);
2005                 blps[1] = NCP2BUCKETLOCK(ncp);
2006                 if (ncp->nc_flag & NCF_NEGATIVE)
2007                         break;
2008                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2009                         break;
2010                 /*
2011                  * All vnodes got re-locked. Re-validate the state and if
2012                  * nothing changed we are done. Otherwise restart.
2013                  */
2014                 if (ncp == vp->v_cache_dd &&
2015                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2016                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2017                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2018                         break;
2019                 cache_unlock_vnodes_cel(cel);
2020                 cel->vlp[0] = NULL;
2021                 cel->vlp[1] = NULL;
2022                 cel->vlp[2] = NULL;
2023         }
2024         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2025 }
2026
2027 static void
2028 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
2029     uint32_t hash)
2030 {
2031         struct namecache *ncp;
2032         struct mtx *blps[2];
2033
2034         blps[0] = HASH2BUCKETLOCK(hash);
2035         for (;;) {
2036                 blps[1] = NULL;
2037                 cache_lock_vnodes_cel(cel, dvp, vp);
2038                 ncp = dvp->v_cache_dd;
2039                 if (ncp == NULL)
2040                         break;
2041                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2042                         break;
2043                 MPASS(ncp->nc_dvp == dvp);
2044                 blps[1] = NCP2BUCKETLOCK(ncp);
2045                 if (ncp->nc_flag & NCF_NEGATIVE)
2046                         break;
2047                 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
2048                         break;
2049                 if (ncp == dvp->v_cache_dd &&
2050                     (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
2051                     blps[1] == NCP2BUCKETLOCK(ncp) &&
2052                     VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
2053                         break;
2054                 cache_unlock_vnodes_cel(cel);
2055                 cel->vlp[0] = NULL;
2056                 cel->vlp[1] = NULL;
2057                 cel->vlp[2] = NULL;
2058         }
2059         cache_lock_buckets_cel(cel, blps[0], blps[1]);
2060 }
2061
/*
 * Drop everything taken by cache_enter_lock() or cache_enter_lock_dd():
 * bucket locks first, vnode locks second.
 */
static void
cache_enter_unlock(struct celockstate *cel)
{

	cache_unlock_buckets_cel(cel);
	cache_unlock_vnodes_cel(cel);
}
2069
2070 static void __noinline
2071 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
2072     struct componentname *cnp)
2073 {
2074         struct celockstate cel;
2075         struct namecache *ncp;
2076         uint32_t hash;
2077         int len;
2078
2079         if (dvp->v_cache_dd == NULL)
2080                 return;
2081         len = cnp->cn_namelen;
2082         cache_celockstate_init(&cel);
2083         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2084         cache_enter_lock_dd(&cel, dvp, vp, hash);
2085         vn_seqc_write_begin(dvp);
2086         ncp = dvp->v_cache_dd;
2087         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
2088                 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
2089                 cache_zap_locked(ncp);
2090         } else {
2091                 ncp = NULL;
2092         }
2093         dvp->v_cache_dd = NULL;
2094         vn_seqc_write_end(dvp);
2095         cache_enter_unlock(&cel);
2096         if (ncp != NULL)
2097                 cache_free(ncp);
2098 }
2099
2100 /*
2101  * Add an entry to the cache.
 *
 * The entry maps the name in cnp, looked up in directory dvp, to vp.  A
 * NULL vp creates a negative entry (flagged NCF_WHITE if ISWHITEOUT is set
 * in cnp->cn_flags).  If tsp is not NULL the entry also records *tsp and
 * the current ticks value; dtsp, if not NULL, is recorded as the ".."
 * timestamp (it is only looked at when tsp is provided).
 *
 * No entry is ever created for ".".  For ".." the existing v_cache_dd state
 * of dvp is cleared with cache_enter_dotdot_prep() first.
 *
 * The request is silently dropped when the cache has reached ncsize
 * entries, when an entry for the same (dvp, name) already exists, or, for
 * "..", when dvp->v_cache_dd got populated in the meantime.
2102  */
2103 void
2104 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
2105     struct timespec *tsp, struct timespec *dtsp)
2106 {
2107         struct celockstate cel;
2108         struct namecache *ncp, *n2, *ndd;
2109         struct namecache_ts *ncp_ts;
2110         struct nchashhead *ncpp;
2111         uint32_t hash;
2112         int flag;
2113         int len;
2114         u_long lnumcache;
2115
2116         VNPASS(dvp != vp, dvp);
2117         VNPASS(!VN_IS_DOOMED(dvp), dvp);
2118         VNPASS(dvp->v_type != VNON, dvp);
2119         if (vp != NULL) {
2120                 VNPASS(!VN_IS_DOOMED(vp), vp);
2121                 VNPASS(vp->v_type != VNON, vp);
2122         }
2123
2124 #ifdef DEBUG_CACHE
2125         if (__predict_false(!doingcache))
2126                 return;
2127 #endif
2128
2129         flag = 0;
2130         if (__predict_false(cnp->cn_nameptr[0] == '.')) {
2131                 if (cnp->cn_namelen == 1)
2132                         return;
2133                 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
2134                         cache_enter_dotdot_prep(dvp, vp, cnp);
2135                         flag = NCF_ISDOTDOT;
2136                 }
2137         }
2138
2139         /*
2140          * Avoid blowout in namecache entries.
2141          *
2142          * Bugs:
2143          * 1. filesystems may end up trying to add an already existing entry
2144          * (for example this can happen after a cache miss during concurrent
2145          * lookup), in which case we will call cache_neg_evict despite not
2146          * adding anything.
2147          * 2. the routine may fail to free anything and no provisions are made
2148          * to make it try harder (see the inside for failure modes)
2149          * 3. it only ever looks at negative entries.
2150          */
             /*
              * numcache is bumped up front; every exit that does not insert
              * the entry subtracts it again.
              */
2151         lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
2152         if (cache_neg_evict_cond(lnumcache)) {
2153                 lnumcache = atomic_load_long(&numcache);
2154         }
2155         if (__predict_false(lnumcache >= ncsize)) {
2156                 atomic_subtract_long(&numcache, 1);
2157                 counter_u64_add(numdrops, 1);
2158                 return;
2159         }
2160
2161         cache_celockstate_init(&cel);
2162         ndd = NULL;
2163         ncp_ts = NULL;
2164
2165         /*
2166          * Calculate the hash key and setup as much of the new
2167          * namecache entry as possible before acquiring the lock.
2168          */
2169         ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
             /* NCF_WIP stays set until the entry is fully linked (see below). */
2170         ncp->nc_flag = flag | NCF_WIP;
2171         ncp->nc_vp = vp;
2172         if (vp == NULL)
2173                 cache_neg_init(ncp);
2174         ncp->nc_dvp = dvp;
2175         if (tsp != NULL) {
2176                 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
2177                 ncp_ts->nc_time = *tsp;
2178                 ncp_ts->nc_ticks = ticks;
2179                 ncp_ts->nc_nc.nc_flag |= NCF_TS;
2180                 if (dtsp != NULL) {
2181                         ncp_ts->nc_dotdottime = *dtsp;
2182                         ncp_ts->nc_nc.nc_flag |= NCF_DTS;
2183                 }
2184         }
2185         len = ncp->nc_nlen = cnp->cn_namelen;
2186         hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
2187         memcpy(ncp->nc_name, cnp->cn_nameptr, len);
2188         ncp->nc_name[len] = '\0';
2189         cache_enter_lock(&cel, dvp, vp, hash);
2190
2191         /*
2192          * See if this vnode or negative entry is already in the cache
2193          * with this name.  This can happen with concurrent lookups of
2194          * the same path name.
2195          */
2196         ncpp = NCHHASH(hash);
2197         CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
2198                 if (n2->nc_dvp == dvp &&
2199                     n2->nc_nlen == cnp->cn_namelen &&
2200                     !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
2201                         MPASS(cache_ncp_canuse(n2));
2202                         if ((n2->nc_flag & NCF_NEGATIVE) != 0)
2203                                 KASSERT(vp == NULL,
2204                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2205                                     __func__, NULL, vp));
2206                         else
2207                                 KASSERT(n2->nc_vp == vp,
2208                                     ("%s: found entry pointing to a different vnode (%p != %p)",
2209                                     __func__, n2->nc_vp, vp));
2210                         /*
2211                          * Entries are supposed to be immutable unless in the
2212                          * process of getting destroyed. Accommodating for
2213                          * changing timestamps is possible but not worth it.
2214                          * This should be harmless in terms of correctness, in
2215                          * the worst case resulting in an earlier expiration.
2216                          * Alternatively, the found entry can be replaced
2217                          * altogether.
2218                          */
2219                         MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
2220 #if 0
2221                         if (tsp != NULL) {
2222                                 KASSERT((n2->nc_flag & NCF_TS) != 0,
2223                                     ("no NCF_TS"));
2224                                 n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
2225                                 n2_ts->nc_time = ncp_ts->nc_time;
2226                                 n2_ts->nc_ticks = ncp_ts->nc_ticks;
2227                                 if (dtsp != NULL) {
2228                                         n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
2229                                         n2_ts->nc_nc.nc_flag |= NCF_DTS;
2230                                 }
2231                         }
2232 #endif
2233                         SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
2234                             vp);
2235                         goto out_unlock_free;
2236                 }
2237         }
2238
2239         if (flag == NCF_ISDOTDOT) {
2240                 /*
2241                  * See if we are trying to add .. entry, but some other lookup
2242                  * has populated v_cache_dd pointer already.
2243                  */
2244                 if (dvp->v_cache_dd != NULL)
2245                         goto out_unlock_free;
2246                 KASSERT(vp == NULL || vp->v_type == VDIR,
2247                     ("wrong vnode type %p", vp));
2248                 vn_seqc_write_begin(dvp);
2249                 dvp->v_cache_dd = ncp;
2250                 vn_seqc_write_end(dvp);
2251         }
2252
2253         if (vp != NULL) {
2254                 if (flag != NCF_ISDOTDOT) {
2255                         /*
2256                          * For this case, the cache entry maps both the
2257                          * directory name in it and the name ".." for the
2258                          * directory's parent.
2259                          */
2260                         vn_seqc_write_begin(vp);
                             /*
                              * A ".." entry previously hanging off vp is
                              * zapped here and remembered in ndd so that it
                              * can be freed once the locks are dropped.
                              */
2261                         if ((ndd = vp->v_cache_dd) != NULL) {
2262                                 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
2263                                         cache_zap_locked(ndd);
2264                                 else
2265                                         ndd = NULL;
2266                         }
2267                         vp->v_cache_dd = ncp;
2268                         vn_seqc_write_end(vp);
2269                 } else if (vp->v_type != VDIR) {
2270                         if (vp->v_cache_dd != NULL) {
2271                                 vn_seqc_write_begin(vp);
2272                                 vp->v_cache_dd = NULL;
2273                                 vn_seqc_write_end(vp);
2274                         }
2275                 }
2276         }
2277
2278         if (flag != NCF_ISDOTDOT) {
                     /* First entry sourced from dvp: take a hold on it. */
2279                 if (LIST_EMPTY(&dvp->v_cache_src)) {
2280                         vhold(dvp);
2281                         counter_u64_add(numcachehv, 1);
2282                 }
2283                 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
2284         }
2285
2286         /*
2287          * If the entry is "negative", we place it into the
2288          * "negative" cache queue, otherwise, we place it into the
2289          * destination vnode's cache entries queue.
2290          */
2291         if (vp != NULL) {
2292                 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
2293                 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
2294                     vp);
2295         } else {
2296                 if (cnp->cn_flags & ISWHITEOUT)
2297                         ncp->nc_flag |= NCF_WHITE;
2298                 cache_neg_insert(ncp);
2299                 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
2300                     ncp->nc_name);
2301         }
2302
2303         /*
2304          * Insert the new namecache entry into the appropriate chain
2305          * within the cache entries table.
2306          */
2307         CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
2308
             /* Release fence placed before NCF_WIP is cleared below. */
2309         atomic_thread_fence_rel();
2310         /*
2311          * Mark the entry as fully constructed.
2312          * It is immutable past this point until its removal.
2313          */
2314         atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
2315
2316         cache_enter_unlock(&cel);
2317         if (ndd != NULL)
2318                 cache_free(ndd);
2319         return;
             /*
              * Nothing was inserted: give back the numcache reservation and
              * free the unused entry after dropping the locks.
              */
2320 out_unlock_free:
2321         cache_enter_unlock(&cel);
2322         atomic_subtract_long(&numcache, 1);
2323         cache_free(ncp);
2324         return;
2325 }
2326
/*
 * Return the smallest power of two strictly greater than val.
 *
 * The result saturates at the largest power of two representable in a
 * u_int (only the top bit set).  Without the cap, any val with the top bit
 * set made the shift wrap res around to 0, after which "res <= val" stayed
 * true forever and the loop never terminated.  Results for all smaller
 * values of val are unchanged.
 */
static u_int
cache_roundup_2(u_int val)
{
	const u_int maxpow2 = ~(~0U >> 1);	/* top bit only */
	u_int res;

	for (res = 1; res <= val && res != maxpow2; res <<= 1)
		continue;

	return (res);
}
2337
2338 static struct nchashhead *
2339 nchinittbl(u_long elements, u_long *hashmask)
2340 {
2341         struct nchashhead *hashtbl;
2342         u_long hashsize, i;
2343
2344         hashsize = cache_roundup_2(elements) / 2;
2345
2346         hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK);
2347         for (i = 0; i < hashsize; i++)
2348                 CK_SLIST_INIT(&hashtbl[i]);
2349         *hashmask = hashsize - 1;
2350         return (hashtbl);
2351 }
2352
2353 static void
2354 ncfreetbl(struct nchashhead *hashtbl)
2355 {
2356
2357         free(hashtbl, M_VFSCACHE);
2358 }
2359
2360 /*
2361  * Name cache initialization, from vfs_init() when we are booting
2362  */
2363 static void
2364 nchinit(void *dummy __unused)
2365 {
2366         u_int i;
2367
2368         cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
2369             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2370         cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
2371             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2372         cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
2373             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2374         cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
2375             NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
2376
2377         VFS_SMR_ZONE_SET(cache_zone_small);
2378         VFS_SMR_ZONE_SET(cache_zone_small_ts);
2379         VFS_SMR_ZONE_SET(cache_zone_large);
2380         VFS_SMR_ZONE_SET(cache_zone_large_ts);
2381
2382         ncsize = desiredvnodes * ncsizefactor;
2383         cache_recalc_neg_min(ncnegminpct);
2384         nchashtbl = nchinittbl(desiredvnodes * 2, &nchash);
2385         ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2386         if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2387                 ncbuckethash = 7;
2388         if (ncbuckethash > nchash)
2389                 ncbuckethash = nchash;
2390         bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2391             M_WAITOK | M_ZERO);
2392         for (i = 0; i < numbucketlocks; i++)
2393                 mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
2394         ncvnodehash = ncbuckethash;
2395         vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2396             M_WAITOK | M_ZERO);
2397         for (i = 0; i < numvnodelocks; i++)
2398                 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2399
2400         for (i = 0; i < numneglists; i++) {
2401                 mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
2402                 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2403                 TAILQ_INIT(&neglists[i].nl_list);
2404                 TAILQ_INIT(&neglists[i].nl_hotlist);
2405         }
2406 }
2407 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2408
2409 void
2410 cache_vnode_init(struct vnode *vp)
2411 {
2412
2413         LIST_INIT(&vp->v_cache_src);
2414         TAILQ_INIT(&vp->v_cache_dst);
2415         vp->v_cache_dd = NULL;
2416         cache_prehash(vp);
2417 }
2418
2419 void
2420 cache_changesize(u_long newmaxvnodes)
2421 {
2422         struct nchashhead *new_nchashtbl, *old_nchashtbl;
2423         u_long new_nchash, old_nchash;
2424         struct namecache *ncp;
2425         uint32_t hash;
2426         u_long newncsize;
2427         int i;
2428
2429         newncsize = newmaxvnodes * ncsizefactor;
2430         newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2431         if (newmaxvnodes < numbucketlocks)
2432                 newmaxvnodes = numbucketlocks;
2433
2434         new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
2435         /* If same hash table size, nothing to do */
2436         if (nchash == new_nchash) {
2437                 ncfreetbl(new_nchashtbl);
2438                 return;
2439         }
2440         /*
2441          * Move everything from the old hash table to the new table.
2442          * None of the namecache entries in the table can be removed
2443          * because to do so, they have to be removed from the hash table.
2444          */
2445         cache_lock_all_vnodes();
2446         cache_lock_all_buckets();
2447         old_nchashtbl = nchashtbl;
2448         old_nchash = nchash;
2449         nchashtbl = new_nchashtbl;
2450         nchash = new_nchash;
2451         for (i = 0; i <= old_nchash; i++) {
2452                 while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
2453                         hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2454                             ncp->nc_dvp);
2455                         CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
2456                         CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2457                 }
2458         }
2459         ncsize = newncsize;
2460         cache_recalc_neg_min(ncnegminpct);
2461         cache_unlock_all_buckets();
2462         cache_unlock_all_vnodes();
2463         ncfreetbl(old_nchashtbl);
2464 }
2465
2466 /*
2467  * Invalidate all entries from and to a particular vnode.
2468  */
2469 static void
2470 cache_purge_impl(struct vnode *vp)
2471 {
2472         TAILQ_HEAD(, namecache) ncps;
2473         struct namecache *ncp, *nnp;
2474         struct mtx *vlp, *vlp2;
2475
2476         TAILQ_INIT(&ncps);
2477         vlp = VP2VNODELOCK(vp);
2478         vlp2 = NULL;
2479         mtx_lock(vlp);
2480 retry:
2481         while (!LIST_EMPTY(&vp->v_cache_src)) {
2482                 ncp = LIST_FIRST(&vp->v_cache_src);
2483                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2484                         goto retry;
2485                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2486         }
2487         while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2488                 ncp = TAILQ_FIRST(&vp->v_cache_dst);
2489                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2490                         goto retry;
2491                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2492         }
2493         ncp = vp->v_cache_dd;
2494         if (ncp != NULL) {
2495                 KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2496                    ("lost dotdot link"));
2497                 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2498                         goto retry;
2499                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2500         }
2501         KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2502         mtx_unlock(vlp);
2503         if (vlp2 != NULL)
2504                 mtx_unlock(vlp2);
2505         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2506                 cache_free(ncp);
2507         }
2508 }
2509
2510 /*
2511  * Opportunistic check to see if there is anything to do.
2512  */
2513 static bool
2514 cache_has_entries(struct vnode *vp)
2515 {
2516
2517         if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2518             vp->v_cache_dd == NULL)
2519                 return (false);
2520         return (true);
2521 }
2522
2523 void
2524 cache_purge(struct vnode *vp)
2525 {
2526
2527         SDT_PROBE1(vfs, namecache, purge, done, vp);
2528         if (!cache_has_entries(vp))
2529                 return;
2530         cache_purge_impl(vp);
2531 }
2532
/*
 * Only to be used by vgone.
 *
 * Purges a doomed vnode.  When the unlocked check finds nothing, wait for
 * the vnode lock to be free in order to serialize against a potential
 * thread doing cache_purge, then check once more before giving up.
 */
void
cache_purge_vgone(struct vnode *vp)
{

	VNPASS(VN_IS_DOOMED(vp), vp);
	if (!cache_has_entries(vp)) {
		mtx_wait_unlocked(VP2VNODELOCK(vp));
		if (!cache_has_entries(vp))
			return;
	}
	cache_purge_impl(vp);
}
2558
2559 /*
2560  * Invalidate all negative entries for a particular directory vnode.
2561  */
2562 void
2563 cache_purge_negative(struct vnode *vp)
2564 {
2565         TAILQ_HEAD(, namecache) ncps;
2566         struct namecache *ncp, *nnp;
2567         struct mtx *vlp;
2568
2569         SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2570         if (LIST_EMPTY(&vp->v_cache_src))
2571                 return;
2572         TAILQ_INIT(&ncps);
2573         vlp = VP2VNODELOCK(vp);
2574         mtx_lock(vlp);
2575         LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2576                 if (!(ncp->nc_flag & NCF_NEGATIVE))
2577                         continue;
2578                 cache_zap_negative_locked_vnode_kl(ncp, vp);
2579                 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2580         }
2581         mtx_unlock(vlp);
2582         TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2583                 cache_free(ncp);
2584         }
2585 }
2586
2587 void
2588 cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
2589     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
2590 {
2591
2592         ASSERT_VOP_IN_SEQC(fdvp);
2593         ASSERT_VOP_IN_SEQC(fvp);
2594         ASSERT_VOP_IN_SEQC(tdvp);
2595         if (tvp != NULL)
2596                 ASSERT_VOP_IN_SEQC(tvp);
2597
2598         cache_purge(fvp);
2599         if (tvp != NULL) {
2600                 cache_purge(tvp);
2601                 KASSERT(!cache_remove_cnp(tdvp, tcnp),
2602                     ("%s: lingering negative entry", __func__));
2603         } else {
2604                 cache_remove_cnp(tdvp, tcnp);
2605         }
2606 }
2607
#ifdef INVARIANTS
/*
 * Validate that if an entry exists it matches.
 *
 * Walks the hash chain for (dvp, name) with the bucket lock held and
 * panics if an entry with that parent and name resolves to a vnode other
 * than vp.
 */
void
cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *entry;
	struct mtx *bucketlock;
	uint32_t hash;

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
	/* Unlocked peek: an empty chain cannot hold a mismatching entry. */
	if (CK_SLIST_EMPTY(NCHHASH(hash)))
		return;
	bucketlock = HASH2BUCKETLOCK(hash);
	mtx_lock(bucketlock);
	CK_SLIST_FOREACH(entry, (NCHHASH(hash)), nc_hash) {
		if (entry->nc_dvp != dvp || entry->nc_nlen != cnp->cn_namelen)
			continue;
		if (bcmp(entry->nc_name, cnp->cn_nameptr, entry->nc_nlen) != 0)
			continue;
		if (entry->nc_vp == vp)
			continue;
		panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n",
		    __func__, vp, entry->nc_vp, entry, entry->nc_name,
		    entry->nc_dvp, entry->nc_vp);
	}
	mtx_unlock(bucketlock);
}
#endif
2636
2637 /*
2638  * Flush all entries referencing a particular filesystem.
2639  */
2640 void
2641 cache_purgevfs(struct mount *mp)
2642 {
2643         struct vnode *vp, *mvp;
2644
2645         SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2646         /*
2647          * Somewhat wasteful iteration over all vnodes. Would be better to
2648          * support filtering and avoid the interlock to begin with.
2649          */
2650         MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2651                 if (!cache_has_entries(vp)) {
2652                         VI_UNLOCK(vp);
2653                         continue;
2654                 }
2655                 vholdl(vp);
2656                 VI_UNLOCK(vp);
2657                 cache_purge(vp);
2658                 vdrop(vp);
2659         }
2660 }
2661
2662 /*
2663  * Perform canonical checks and cache lookup and pass on to filesystem
2664  * through the vop_cachedlookup only if needed.
2665  */
2666
2667 int
2668 vfs_cache_lookup(struct vop_lookup_args *ap)
2669 {
2670         struct vnode *dvp;
2671         int error;
2672         struct vnode **vpp = ap->a_vpp;
2673         struct componentname *cnp = ap->a_cnp;
2674         int flags = cnp->cn_flags;
2675
2676         *vpp = NULL;
2677         dvp = ap->a_dvp;
2678
2679         if (dvp->v_type != VDIR)
2680                 return (ENOTDIR);
2681
2682         if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2683             (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2684                 return (EROFS);
2685
2686         error = vn_dir_check_exec(dvp, cnp);
2687         if (error != 0)
2688                 return (error);
2689
2690         error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2691         if (error == 0)
2692                 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2693         if (error == -1)
2694                 return (0);
2695         return (error);
2696 }
2697
2698 /* Implementation of the getcwd syscall. */
2699 int
2700 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2701 {
2702         char *buf, *retbuf;
2703         size_t buflen;
2704         int error;
2705
2706         buflen = uap->buflen;
2707         if (__predict_false(buflen < 2))
2708                 return (EINVAL);
2709         if (buflen > MAXPATHLEN)
2710                 buflen = MAXPATHLEN;
2711
2712         buf = uma_zalloc(namei_zone, M_WAITOK);
2713         error = vn_getcwd(buf, &retbuf, &buflen);
2714         if (error == 0)
2715                 error = copyout(retbuf, uap->buf, buflen);
2716         uma_zfree(namei_zone, buf);
2717         return (error);
2718 }
2719
2720 int
2721 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
2722 {
2723         struct pwd *pwd;
2724         int error;
2725
2726         vfs_smr_enter();
2727         pwd = pwd_get_smr();
2728         error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
2729             buflen, 0);
2730         VFS_SMR_ASSERT_NOT_ENTERED();
2731         if (error < 0) {
2732                 pwd = pwd_hold(curthread);
2733                 error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
2734                     retbuf, buflen);
2735                 pwd_drop(pwd);
2736         }
2737
2738 #ifdef KTRACE
2739         if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2740                 ktrnamei(*retbuf);
2741 #endif
2742         return (error);
2743 }
2744
2745 static int
2746 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2747     size_t size, int flags, enum uio_seg pathseg)
2748 {
2749         struct nameidata nd;
2750         char *retbuf, *freebuf;
2751         int error;
2752
2753         if (flags != 0)
2754                 return (EINVAL);
2755         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2756             pathseg, path, fd, &cap_fstat_rights, td);
2757         if ((error = namei(&nd)) != 0)
2758                 return (error);
2759         error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size);
2760         if (error == 0) {
2761                 error = copyout(retbuf, buf, size);
2762                 free(freebuf, M_TEMP);
2763         }
2764         NDFREE(&nd, 0);
2765         return (error);
2766 }
2767
2768 int
2769 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2770 {
2771
2772         return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2773             uap->flags, UIO_USERSPACE));
2774 }
2775
2776 /*
2777  * Retrieve the full filesystem path that correspond to a vnode from the name
2778  * cache (if available)
2779  */
2780 int
2781 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
2782 {
2783         struct pwd *pwd;
2784         char *buf;
2785         size_t buflen;
2786         int error;
2787
2788         if (__predict_false(vp == NULL))
2789                 return (EINVAL);
2790
2791         buflen = MAXPATHLEN;
2792         buf = malloc(buflen, M_TEMP, M_WAITOK);
2793         vfs_smr_enter();
2794         pwd = pwd_get_smr();
2795         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
2796         VFS_SMR_ASSERT_NOT_ENTERED();
2797         if (error < 0) {
2798                 pwd = pwd_hold(curthread);
2799                 error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
2800                 pwd_drop(pwd);
2801         }
2802         if (error == 0)
2803                 *freebuf = buf;
2804         else
2805                 free(buf, M_TEMP);
2806         return (error);
2807 }
2808
2809 /*
2810  * This function is similar to vn_fullpath, but it attempts to lookup the
2811  * pathname relative to the global root mount point.  This is required for the
2812  * auditing sub-system, as audited pathnames must be absolute, relative to the
2813  * global root mount point.
2814  */
2815 int
2816 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
2817 {
2818         char *buf;
2819         size_t buflen;
2820         int error;
2821
2822         if (__predict_false(vp == NULL))
2823                 return (EINVAL);
2824         buflen = MAXPATHLEN;
2825         buf = malloc(buflen, M_TEMP, M_WAITOK);
2826         vfs_smr_enter();
2827         error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
2828         VFS_SMR_ASSERT_NOT_ENTERED();
2829         if (error < 0) {
2830                 error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
2831         }
2832         if (error == 0)
2833                 *freebuf = buf;
2834         else
2835                 free(buf, M_TEMP);
2836         return (error);
2837 }
2838
2839 static struct namecache *
2840 vn_dd_from_dst(struct vnode *vp)
2841 {
2842         struct namecache *ncp;
2843
2844         cache_assert_vnode_locked(vp);
2845         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
2846                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2847                         return (ncp);
2848         }
2849         return (NULL);
2850 }
2851
2852 int
2853 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
2854 {
2855         struct vnode *dvp;
2856         struct namecache *ncp;
2857         struct mtx *vlp;
2858         int error;
2859
2860         vlp = VP2VNODELOCK(*vp);
2861         mtx_lock(vlp);
2862         ncp = (*vp)->v_cache_dd;
2863         if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
2864                 KASSERT(ncp == vn_dd_from_dst(*vp),
2865                     ("%s: mismatch for dd entry (%p != %p)", __func__,
2866                     ncp, vn_dd_from_dst(*vp)));
2867         } else {
2868                 ncp = vn_dd_from_dst(*vp);
2869         }
2870         if (ncp != NULL) {
2871                 if (*buflen < ncp->nc_nlen) {
2872                         mtx_unlock(vlp);
2873                         vrele(*vp);
2874                         counter_u64_add(numfullpathfail4, 1);
2875                         error = ENOMEM;
2876                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
2877                             vp, NULL);
2878                         return (error);
2879                 }
2880                 *buflen -= ncp->nc_nlen;
2881                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2882                 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2883                     ncp->nc_name, vp);
2884                 dvp = *vp;
2885                 *vp = ncp->nc_dvp;
2886                 vref(*vp);
2887                 mtx_unlock(vlp);
2888                 vrele(dvp);
2889                 return (0);
2890         }
2891         SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2892
2893         mtx_unlock(vlp);
2894         vn_lock(*vp, LK_SHARED | LK_RETRY);
2895         error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
2896         vput(*vp);
2897         if (error) {
2898                 counter_u64_add(numfullpathfail2, 1);
2899                 SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2900                 return (error);
2901         }
2902
2903         *vp = dvp;
2904         if (VN_IS_DOOMED(dvp)) {
2905                 /* forced unmount */
2906                 vrele(dvp);
2907                 error = ENOENT;
2908                 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2909                 return (error);
2910         }
2911         /*
2912          * *vp has its use count incremented still.
2913          */
2914
2915         return (0);
2916 }
2917
2918 /*
2919  * Resolve a directory to a pathname.
2920  *
2921  * The name of the directory can always be found in the namecache or fetched
2922  * from the filesystem. There is also guaranteed to be only one parent, meaning
2923  * we can just follow vnodes up until we find the root.
2924  *
2925  * The vnode must be referenced.
2926  */
2927 static int
2928 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
2929     size_t *len, size_t addend)
2930 {
2931 #ifdef KDTRACE_HOOKS
2932         struct vnode *startvp = vp;
2933 #endif
2934         struct vnode *vp1;
2935         size_t buflen;
2936         int error;
2937         bool slash_prefixed;
2938
2939         VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2940         VNPASS(vp->v_usecount > 0, vp);
2941
2942         buflen = *len;
2943
2944         slash_prefixed = true;
2945         if (addend == 0) {
2946                 MPASS(*len >= 2);
2947                 buflen--;
2948                 buf[buflen] = '\0';
2949                 slash_prefixed = false;
2950         }
2951
2952         error = 0;
2953
2954         SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2955         counter_u64_add(numfullpathcalls, 1);
2956         while (vp != rdir && vp != rootvnode) {
2957                 /*
2958                  * The vp vnode must be already fully constructed,
2959                  * since it is either found in namecache or obtained
2960                  * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2961                  * without obtaining the vnode lock.
2962                  */
2963                 if ((vp->v_vflag & VV_ROOT) != 0) {
2964                         vn_lock(vp, LK_RETRY | LK_SHARED);
2965
2966                         /*
2967                          * With the vnode locked, check for races with
2968                          * unmount, forced or not.  Note that we
2969                          * already verified that vp is not equal to
2970                          * the root vnode, which means that
2971                          * mnt_vnodecovered can be NULL only for the
2972                          * case of unmount.
2973                          */
2974                         if (VN_IS_DOOMED(vp) ||
2975                             (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2976                             vp1->v_mountedhere != vp->v_mount) {
2977                                 vput(vp);
2978                                 error = ENOENT;
2979                                 SDT_PROBE3(vfs, namecache, fullpath, return,
2980                                     error, vp, NULL);
2981                                 break;
2982                         }
2983
2984                         vref(vp1);
2985                         vput(vp);
2986                         vp = vp1;
2987                         continue;
2988                 }
2989                 if (vp->v_type != VDIR) {
2990                         vrele(vp);
2991                         counter_u64_add(numfullpathfail1, 1);
2992                         error = ENOTDIR;
2993                         SDT_PROBE3(vfs, namecache, fullpath, return,
2994                             error, vp, NULL);
2995                         break;
2996                 }
2997                 error = vn_vptocnp(&vp, buf, &buflen);
2998                 if (error)
2999                         break;
3000                 if (buflen == 0) {
3001                         vrele(vp);
3002                         error = ENOMEM;
3003                         SDT_PROBE3(vfs, namecache, fullpath, return, error,
3004                             startvp, NULL);
3005                         break;
3006                 }
3007                 buf[--buflen] = '/';
3008                 slash_prefixed = true;
3009         }
3010         if (error)
3011                 return (error);
3012         if (!slash_prefixed) {
3013                 if (buflen == 0) {
3014                         vrele(vp);
3015                         counter_u64_add(numfullpathfail4, 1);
3016                         SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
3017                             startvp, NULL);
3018                         return (ENOMEM);
3019                 }
3020                 buf[--buflen] = '/';
3021         }
3022         counter_u64_add(numfullpathfound, 1);
3023         vrele(vp);
3024
3025         *retbuf = buf + buflen;
3026         SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
3027         *len -= buflen;
3028         *len += addend;
3029         return (0);
3030 }
3031
/*
 * Resolve an arbitrary vnode to a pathname.
 *
 * Note 2 caveats:
 * - hardlinks are not tracked, thus if the vnode is not a directory this can
 *   resolve to a different path than the one used to find it
 * - namecache is not mandatory, meaning names are not guaranteed to be added
 *   (in which case resolving fails)
 */

/*
 * Store a source line number into *reason.  Used through the
 * cache_rev_failed() wrapper below, which supplies the line of the call
 * site so that each bail-out point can be told apart.
 */
static __inline void
cache_rev_failed_impl(int *reason, int line)
{

	*reason = line;
}
#define	cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
3048
3049 static int
3050 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
3051     char **retbuf, size_t *buflen, size_t addend)
3052 {
3053 #ifdef KDTRACE_HOOKS
3054         struct vnode *startvp = vp;
3055 #endif
3056         struct vnode *tvp;
3057         struct mount *mp;
3058         struct namecache *ncp;
3059         size_t orig_buflen;
3060         int reason;
3061         int error;
3062 #ifdef KDTRACE_HOOKS
3063         int i;
3064 #endif
3065         seqc_t vp_seqc, tvp_seqc;
3066         u_char nc_flag;
3067
3068         VFS_SMR_ASSERT_ENTERED();
3069
3070         if (!cache_fast_revlookup) {
3071                 vfs_smr_exit();
3072                 return (-1);
3073         }
3074
3075         orig_buflen = *buflen;
3076
3077         if (addend == 0) {
3078                 MPASS(*buflen >= 2);
3079                 *buflen -= 1;
3080                 buf[*buflen] = '\0';
3081         }
3082
3083         if (vp == rdir || vp == rootvnode) {
3084                 if (addend == 0) {
3085                         *buflen -= 1;
3086                         buf[*buflen] = '/';
3087                 }
3088                 goto out_ok;
3089         }
3090
3091 #ifdef KDTRACE_HOOKS
3092         i = 0;
3093 #endif
3094         error = -1;
3095         ncp = NULL; /* for sdt probe down below */
3096         vp_seqc = vn_seqc_read_any(vp);
3097         if (seqc_in_modify(vp_seqc)) {
3098                 cache_rev_failed(&reason);
3099                 goto out_abort;
3100         }
3101
3102         for (;;) {
3103 #ifdef KDTRACE_HOOKS
3104                 i++;
3105 #endif
3106                 if ((vp->v_vflag & VV_ROOT) != 0) {
3107                         mp = atomic_load_ptr(&vp->v_mount);
3108                         if (mp == NULL) {
3109                                 cache_rev_failed(&reason);
3110                                 goto out_abort;
3111                         }
3112                         tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
3113                         tvp_seqc = vn_seqc_read_any(tvp);
3114                         if (seqc_in_modify(tvp_seqc)) {
3115                                 cache_rev_failed(&reason);
3116                                 goto out_abort;
3117                         }
3118                         if (!vn_seqc_consistent(vp, vp_seqc)) {
3119                                 cache_rev_failed(&reason);
3120                                 goto out_abort;
3121                         }
3122                         vp = tvp;
3123                         vp_seqc = tvp_seqc;
3124                         continue;
3125                 }
3126                 ncp = atomic_load_ptr(&vp->v_cache_dd);
3127                 if (ncp == NULL) {
3128                         cache_rev_failed(&reason);
3129                         goto out_abort;
3130                 }
3131                 nc_flag = atomic_load_char(&ncp->nc_flag);
3132                 if ((nc_flag & NCF_ISDOTDOT) != 0) {
3133                         cache_rev_failed(&reason);
3134                         goto out_abort;
3135                 }
3136                 if (!cache_ncp_canuse(ncp)) {
3137                         cache_rev_failed(&reason);
3138                         goto out_abort;
3139                 }
3140                 if (ncp->nc_nlen >= *buflen) {
3141                         cache_rev_failed(&reason);
3142                         error = ENOMEM;
3143                         goto out_abort;
3144                 }
3145                 *buflen -= ncp->nc_nlen;
3146                 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
3147                 *buflen -= 1;
3148                 buf[*buflen] = '/';
3149                 tvp = ncp->nc_dvp;
3150                 tvp_seqc = vn_seqc_read_any(tvp);
3151                 if (seqc_in_modify(tvp_seqc)) {
3152                         cache_rev_failed(&reason);
3153                         goto out_abort;
3154                 }
3155                 if (!vn_seqc_consistent(vp, vp_seqc)) {
3156                         cache_rev_failed(&reason);
3157                         goto out_abort;
3158                 }
3159                 vp = tvp;
3160                 vp_seqc = tvp_seqc;
3161                 if (vp == rdir || vp == rootvnode)
3162                         break;
3163         }
3164 out_ok:
3165         vfs_smr_exit();
3166         *retbuf = buf + *buflen;
3167         *buflen = orig_buflen - *buflen + addend;
3168         SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
3169         return (0);
3170
3171 out_abort:
3172         *buflen = orig_buflen;
3173         SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
3174         vfs_smr_exit();
3175         return (error);
3176 }
3177
3178 static int
3179 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
3180     size_t *buflen)
3181 {
3182         size_t orig_buflen, addend;
3183         int error;
3184
3185         if (*buflen < 2)
3186                 return (EINVAL);
3187
3188         orig_buflen = *buflen;
3189
3190         vref(vp);
3191         addend = 0;
3192         if (vp->v_type != VDIR) {
3193                 *buflen -= 1;
3194                 buf[*buflen] = '\0';
3195                 error = vn_vptocnp(&vp, buf, buflen);
3196                 if (error)
3197                         return (error);
3198                 if (*buflen == 0) {
3199                         vrele(vp);
3200                         return (ENOMEM);
3201                 }
3202                 *buflen -= 1;
3203                 buf[*buflen] = '/';
3204                 addend = orig_buflen - *buflen;
3205         }
3206
3207         return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend));
3208 }
3209
3210 /*
3211  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
3212  *
3213  * Since the namecache does not track handlings, the caller is expected to first
3214  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
3215  *
3216  * Then we have 2 cases:
3217  * - if the found vnode is a directory, the path can be constructed just by
3218  *   fullowing names up the chain
3219  * - otherwise we populate the buffer with the saved name and start resolving
3220  *   from the parent
3221  */
3222 static int
3223 vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf,
3224     size_t *buflen)
3225 {
3226         char *buf, *tmpbuf;
3227         struct pwd *pwd;
3228         struct componentname *cnp;
3229         struct vnode *vp;
3230         size_t addend;
3231         int error;
3232         enum vtype type;
3233
3234         if (*buflen < 2)
3235                 return (EINVAL);
3236         if (*buflen > MAXPATHLEN)
3237                 *buflen = MAXPATHLEN;
3238
3239         buf = malloc(*buflen, M_TEMP, M_WAITOK);
3240
3241         addend = 0;
3242         vp = ndp->ni_vp;
3243         /*
3244          * Check for VBAD to work around the vp_crossmp bug in lookup().
3245          *
3246          * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
3247          * set to mount point's root vnode while ni_dvp will be vp_crossmp.
3248          * If the type is VDIR (like in this very case) we can skip looking
3249          * at ni_dvp in the first place. However, since vnodes get passed here
3250          * unlocked the target may transition to doomed state (type == VBAD)
3251          * before we get to evaluate the condition. If this happens, we will
3252          * populate part of the buffer and descend to vn_fullpath_dir with
3253          * vp == vp_crossmp. Prevent the problem by checking for VBAD.
3254          *
3255          * This should be atomic_load(&vp->v_type) but it is ilegal to take
3256          * an address of a bit field, even if said field is sized to char.
3257          * Work around the problem by reading the value into a full-sized enum
3258          * and then re-reading it with atomic_load which will still prevent
3259          * the compiler from re-reading down the road.
3260          */
3261         type = vp->v_type;
3262         type = atomic_load_int(&type);
3263         if (type == VBAD) {
3264                 error = ENOENT;
3265                 goto out_bad;
3266         }
3267         if (type != VDIR) {
3268                 cnp = &ndp->ni_cnd;
3269                 addend = cnp->cn_namelen + 2;
3270                 if (*buflen < addend) {
3271                         error = ENOMEM;
3272                         goto out_bad;
3273                 }
3274                 *buflen -= addend;
3275                 tmpbuf = buf + *buflen;
3276                 tmpbuf[0] = '/';
3277                 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
3278                 tmpbuf[addend - 1] = '\0';
3279                 vp = ndp->ni_dvp;
3280         }
3281
3282         vfs_smr_enter();
3283         pwd = pwd_get_smr();
3284         error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3285             addend);
3286         VFS_SMR_ASSERT_NOT_ENTERED();
3287         if (error < 0) {
3288                 pwd = pwd_hold(curthread);
3289                 vref(vp);
3290                 error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
3291                     addend);
3292                 pwd_drop(pwd);
3293                 if (error != 0)
3294                         goto out_bad;
3295         }
3296
3297         *freebuf = buf;
3298
3299         return (0);
3300 out_bad:
3301         free(buf, M_TEMP);
3302         return (error);
3303 }
3304
3305 struct vnode *
3306 vn_dir_dd_ino(struct vnode *vp)
3307 {
3308         struct namecache *ncp;
3309         struct vnode *ddvp;
3310         struct mtx *vlp;
3311         enum vgetstate vs;
3312
3313         ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
3314         vlp = VP2VNODELOCK(vp);
3315         mtx_lock(vlp);
3316         TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
3317                 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
3318                         continue;
3319                 ddvp = ncp->nc_dvp;
3320                 vs = vget_prep(ddvp);
3321                 mtx_unlock(vlp);
3322                 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
3323                         return (NULL);
3324                 return (ddvp);
3325         }
3326         mtx_unlock(vlp);
3327         return (NULL);
3328 }
3329
3330 int
3331 vn_commname(struct vnode *vp, char *buf, u_int buflen)
3332 {
3333         struct namecache *ncp;
3334         struct mtx *vlp;
3335         int l;
3336
3337         vlp = VP2VNODELOCK(vp);
3338         mtx_lock(vlp);
3339         TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
3340                 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
3341                         break;
3342         if (ncp == NULL) {
3343                 mtx_unlock(vlp);
3344                 return (ENOENT);
3345         }
3346         l = min(ncp->nc_nlen, buflen - 1);
3347         memcpy(buf, ncp->nc_name, l);
3348         mtx_unlock(vlp);
3349         buf[l] = '\0';
3350         return (0);
3351 }
3352
3353 /*
3354  * This function updates path string to vnode's full global path
3355  * and checks the size of the new path string against the pathlen argument.
3356  *
3357  * Requires a locked, referenced vnode.
3358  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
3359  *
3360  * If vp is a directory, the call to vn_fullpath_global() always succeeds
3361  * because it falls back to the ".." lookup if the namecache lookup fails.
3362  */
3363 int
3364 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
3365     u_int pathlen)
3366 {
3367         struct nameidata nd;
3368         struct vnode *vp1;
3369         char *rpath, *fbuf;
3370         int error;
3371
3372         ASSERT_VOP_ELOCKED(vp, __func__);
3373
3374         /* Construct global filesystem path from vp. */
3375         VOP_UNLOCK(vp);
3376         error = vn_fullpath_global(vp, &rpath, &fbuf);
3377
3378         if (error != 0) {
3379                 vrele(vp);
3380                 return (error);
3381         }
3382
3383         if (strlen(rpath) >= pathlen) {
3384                 vrele(vp);
3385                 error = ENAMETOOLONG;
3386                 goto out;
3387         }
3388
3389         /*
3390          * Re-lookup the vnode by path to detect a possible rename.
3391          * As a side effect, the vnode is relocked.
3392          * If vnode was renamed, return ENOENT.
3393          */
3394         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
3395             UIO_SYSSPACE, path, td);
3396         error = namei(&nd);
3397         if (error != 0) {
3398                 vrele(vp);
3399                 goto out;
3400         }
3401         NDFREE(&nd, NDF_ONLY_PNBUF);
3402         vp1 = nd.ni_vp;
3403         vrele(vp);
3404         if (vp1 == vp)
3405                 strcpy(path, rpath);
3406         else {
3407                 vput(vp1);
3408                 error = ENOENT;
3409         }
3410
3411 out:
3412         free(fbuf, M_TEMP);
3413         return (error);
3414 }
3415
#ifdef DDB
/*
 * Debugger helper: print the chain of vnodes from vp towards the root,
 * one line per vnode, using the first namecache entry on v_cache_dst for
 * the name.  The walk stops at rootvnode or when a vnode has no entry.
 */
static void
db_print_vpath(struct vnode *vp)
{
	struct namecache *ncp;
	int i;

	while (vp != NULL) {
		db_printf("%p: ", vp);
		if (vp == rootvnode) {
			db_printf("/");
			vp = NULL;
		} else if (vp->v_vflag & VV_ROOT) {
			db_printf("<mount point>");
			vp = vp->v_mount->mnt_vnodecovered;
		} else {
			ncp = TAILQ_FIRST(&vp->v_cache_dst);
			if (ncp == NULL) {
				vp = NULL;
			} else {
				/* Printed byte by byte, nc_nlen bounds it. */
				for (i = 0; i < ncp->nc_nlen; i++)
					db_printf("%c", ncp->nc_name[i]);
				vp = ncp->nc_dvp;
			}
		}
		db_printf("\n");
	}
}

/*
 * "show vpath <addr>": treat addr as a struct vnode pointer and print its
 * path chain.
 */
DB_SHOW_COMMAND(vpath, db_show_vpath)
{

	if (!have_addr) {
		db_printf("usage: show vpath <struct vnode *>\n");
		return;
	}

	db_print_vpath((struct vnode *)addr);
}

#endif
3466
3467 static bool __read_frequently cache_fast_lookup = true;
3468 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
3469     &cache_fast_lookup, 0, "");
3470
3471 #define CACHE_FPL_FAILED        -2020
3472
3473 static void
3474 cache_fpl_cleanup_cnp(struct componentname *cnp)
3475 {
3476
3477         uma_zfree(namei_zone, cnp->cn_pnbuf);
3478 #ifdef DIAGNOSTIC
3479         cnp->cn_pnbuf = NULL;
3480         cnp->cn_nameptr = NULL;
3481 #endif
3482 }
3483
3484 static void
3485 cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
3486 {
3487         struct componentname *cnp;
3488
3489         cnp = &ndp->ni_cnd;
3490         while (*(cnp->cn_nameptr) == '/') {
3491                 cnp->cn_nameptr++;
3492                 ndp->ni_pathlen--;
3493         }
3494
3495         *dpp = ndp->ni_rootdir;
3496 }
3497
3498 /*
3499  * Components of nameidata (or objects it can point to) which may
3500  * need restoring in case fast path lookup fails.
3501  */
3502 struct nameidata_saved {
3503         long cn_namelen;
3504         char *cn_nameptr;
3505         size_t ni_pathlen;
3506         int cn_flags;
3507 };
3508
3509 struct cache_fpl {
3510         struct nameidata *ndp;
3511         struct componentname *cnp;
3512         struct pwd *pwd;
3513         struct vnode *dvp;
3514         struct vnode *tvp;
3515         seqc_t dvp_seqc;
3516         seqc_t tvp_seqc;
3517         struct nameidata_saved snd;
3518         int line;
3519         enum cache_fpl_status status:8;
3520         bool in_smr;
3521         bool fsearch;
3522 };
3523
3524 static void
3525 cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
3526 {
3527
3528         snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
3529         snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
3530         snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
3531         snd->ni_pathlen = fpl->ndp->ni_pathlen;
3532 }
3533
3534 static void
3535 cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
3536 {
3537
3538         fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
3539         fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
3540         fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
3541         fpl->ndp->ni_pathlen = snd->ni_pathlen;
3542 }
3543
/*
 * Assertions on the SMR state of a lookup. With INVARIANTS both the in_smr
 * flag kept in the lookup state and the VFS SMR assertion are checked;
 * otherwise the macros expand to empty statements.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({            \
        struct cache_fpl *_fpl = (fpl);                 \
                                                        \
        MPASS(_fpl->in_smr == true);                    \
        VFS_SMR_ASSERT_ENTERED();                       \
})
#define cache_fpl_smr_assert_not_entered(fpl) ({        \
        struct cache_fpl *_fpl = (fpl);                 \
                                                        \
        MPASS(_fpl->in_smr == false);                   \
        VFS_SMR_ASSERT_NOT_ENTERED();                   \
})
#else
#define cache_fpl_smr_assert_entered(fpl)       do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl)   do { } while (0)
#endif
3559
/*
 * Enter the VFS SMR section and note it in the lookup state. This variant
 * does not check the previous value of in_smr.
 */
#define cache_fpl_smr_enter_initial(fpl) ({     \
        struct cache_fpl *_fpl = (fpl);         \
                                                \
        vfs_smr_enter();                        \
        _fpl->in_smr = true;                    \
})

/*
 * Enter the VFS SMR section; the lookup must not be in one already.
 */
#define cache_fpl_smr_enter(fpl) ({             \
        struct cache_fpl *_fpl = (fpl);         \
                                                \
        MPASS(_fpl->in_smr == false);           \
        vfs_smr_enter();                        \
        _fpl->in_smr = true;                    \
})

/*
 * Leave the VFS SMR section; the lookup must currently be in one.
 */
#define cache_fpl_smr_exit(fpl) ({              \
        struct cache_fpl *_fpl = (fpl);         \
                                                \
        MPASS(_fpl->in_smr == true);            \
        vfs_smr_exit();                         \
        _fpl->in_smr = false;                   \
})
3579
3580 static int
3581 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
3582 {
3583
3584         if (fpl->status != CACHE_FPL_STATUS_UNSET) {
3585                 KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
3586                     ("%s: converting to abort from %d at %d, set at %d\n",
3587                     __func__, fpl->status, line, fpl->line));
3588         }
3589         fpl->status = CACHE_FPL_STATUS_ABORTED;
3590         fpl->line = line;
3591         return (CACHE_FPL_FAILED);
3592 }
3593
3594 #define cache_fpl_aborted(x)    cache_fpl_aborted_impl((x), __LINE__)
3595
3596 static int
3597 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
3598 {
3599
3600         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3601             ("%s: setting to partial at %d, but already set to %d at %d\n",
3602             __func__, line, fpl->status, fpl->line));
3603         cache_fpl_smr_assert_entered(fpl);
3604         fpl->status = CACHE_FPL_STATUS_PARTIAL;
3605         fpl->line = line;
3606         return (CACHE_FPL_FAILED);
3607 }
3608
3609 #define cache_fpl_partial(x)    cache_fpl_partial_impl((x), __LINE__)
3610
3611 static int
3612 cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
3613 {
3614
3615         KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
3616             ("%s: setting to handled at %d, but already set to %d at %d\n",
3617             __func__, line, fpl->status, fpl->line));
3618         cache_fpl_smr_assert_not_entered(fpl);
3619         MPASS(error != CACHE_FPL_FAILED);
3620         fpl->status = CACHE_FPL_STATUS_HANDLED;
3621         fpl->line = line;
3622         return (error);
3623 }
3624
3625 #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
3626
3627 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
3628         (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \
3629          SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK)
3630
3631 #define CACHE_FPL_INTERNAL_CN_FLAGS \
3632         (ISDOTDOT | MAKEENTRY | ISLASTCN)
3633
3634 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
3635     "supported and internal flags overlap");
3636
3637 static bool
3638 cache_fpl_islastcn(struct nameidata *ndp)
3639 {
3640
3641         return (*ndp->ni_next == 0);
3642 }
3643
3644 static bool
3645 cache_fpl_isdotdot(struct componentname *cnp)
3646 {
3647
3648         if (cnp->cn_namelen == 2 &&
3649             cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
3650                 return (true);
3651         return (false);
3652 }
3653
3654 static bool
3655 cache_can_fplookup(struct cache_fpl *fpl)
3656 {
3657         struct nameidata *ndp;
3658         struct componentname *cnp;
3659         struct thread *td;
3660
3661         ndp = fpl->ndp;
3662         cnp = fpl->cnp;
3663         td = cnp->cn_thread;
3664
3665         if (!cache_fast_lookup) {
3666                 cache_fpl_aborted(fpl);
3667                 return (false);
3668         }
3669 #ifdef MAC
3670         if (mac_vnode_check_lookup_enabled()) {
3671                 cache_fpl_aborted(fpl);
3672                 return (false);
3673         }
3674 #endif
3675         if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
3676                 cache_fpl_aborted(fpl);
3677                 return (false);
3678         }
3679         if (IN_CAPABILITY_MODE(td)) {
3680                 cache_fpl_aborted(fpl);
3681                 return (false);
3682         }
3683         if (AUDITING_TD(td)) {
3684                 cache_fpl_aborted(fpl);
3685                 return (false);
3686         }
3687         if (ndp->ni_startdir != NULL) {
3688                 cache_fpl_aborted(fpl);
3689                 return (false);
3690         }
3691         return (true);
3692 }
3693
3694 static int
3695 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
3696 {
3697         struct nameidata *ndp;
3698         int error;
3699         bool fsearch;
3700
3701         ndp = fpl->ndp;
3702         error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch);
3703         if (__predict_false(error != 0)) {
3704                 cache_fpl_smr_exit(fpl);
3705                 return (cache_fpl_aborted(fpl));
3706         }
3707         fpl->fsearch = fsearch;
3708         return (0);
3709 }
3710
3711 static bool
3712 cache_fplookup_vnode_supported(struct vnode *vp)
3713 {
3714
3715         return (vp->v_type != VLNK);
3716 }
3717
3718 static int __noinline
3719 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
3720     uint32_t hash)
3721 {
3722         struct componentname *cnp;
3723         struct vnode *dvp;
3724
3725         cnp = fpl->cnp;
3726         dvp = fpl->dvp;
3727
3728         cache_fpl_smr_exit(fpl);
3729         if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
3730                 return (cache_fpl_handled(fpl, ENOENT));
3731         else
3732                 return (cache_fpl_aborted(fpl));
3733 }
3734
3735 /*
3736  * The target vnode is not supported, prepare for the slow path to take over.
3737  */
3738 static int __noinline
3739 cache_fplookup_partial_setup(struct cache_fpl *fpl)
3740 {
3741         struct nameidata *ndp;
3742         struct componentname *cnp;
3743         enum vgetstate dvs;
3744         struct vnode *dvp;
3745         struct pwd *pwd;
3746         seqc_t dvp_seqc;
3747
3748         ndp = fpl->ndp;
3749         cnp = fpl->cnp;
3750         pwd = fpl->pwd;
3751         dvp = fpl->dvp;
3752         dvp_seqc = fpl->dvp_seqc;
3753
3754         if (!pwd_hold_smr(pwd)) {
3755                 cache_fpl_smr_exit(fpl);
3756                 return (cache_fpl_aborted(fpl));
3757         }
3758
3759         dvs = vget_prep_smr(dvp);
3760         cache_fpl_smr_exit(fpl);
3761         if (__predict_false(dvs == VGET_NONE)) {
3762                 pwd_drop(pwd);
3763                 return (cache_fpl_aborted(fpl));
3764         }
3765
3766         vget_finish_ref(dvp, dvs);
3767         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3768                 vrele(dvp);
3769                 pwd_drop(pwd);
3770                 return (cache_fpl_aborted(fpl));
3771         }
3772
3773         cache_fpl_restore(fpl, &fpl->snd);
3774
3775         ndp->ni_startdir = dvp;
3776         cnp->cn_flags |= MAKEENTRY;
3777         if (cache_fpl_islastcn(ndp))
3778                 cnp->cn_flags |= ISLASTCN;
3779         if (cache_fpl_isdotdot(cnp))
3780                 cnp->cn_flags |= ISDOTDOT;
3781
3782         return (0);
3783 }
3784
3785 static int
3786 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
3787 {
3788         struct componentname *cnp;
3789         struct vnode *tvp;
3790         seqc_t tvp_seqc;
3791         int error, lkflags;
3792
3793         cnp = fpl->cnp;
3794         tvp = fpl->tvp;
3795         tvp_seqc = fpl->tvp_seqc;
3796
3797         if ((cnp->cn_flags & LOCKLEAF) != 0) {
3798                 lkflags = LK_SHARED;
3799                 if ((cnp->cn_flags & LOCKSHARED) == 0)
3800                         lkflags = LK_EXCLUSIVE;
3801                 error = vget_finish(tvp, lkflags, tvs);
3802                 if (__predict_false(error != 0)) {
3803                         return (cache_fpl_aborted(fpl));
3804                 }
3805         } else {
3806                 vget_finish_ref(tvp, tvs);
3807         }
3808
3809         if (!vn_seqc_consistent(tvp, tvp_seqc)) {
3810                 if ((cnp->cn_flags & LOCKLEAF) != 0)
3811                         vput(tvp);
3812                 else
3813                         vrele(tvp);
3814                 return (cache_fpl_aborted(fpl));
3815         }
3816
3817         return (cache_fpl_handled(fpl, 0));
3818 }
3819
3820 /*
3821  * They want to possibly modify the state of the namecache.
3822  *
3823  * Don't try to match the API contract, just leave.
3824  * TODO: this leaves scalability on the table
3825  */
3826 static int
3827 cache_fplookup_final_modifying(struct cache_fpl *fpl)
3828 {
3829         struct componentname *cnp;
3830
3831         cnp = fpl->cnp;
3832         MPASS(cnp->cn_nameiop != LOOKUP);
3833         return (cache_fpl_partial(fpl));
3834 }
3835
3836 static int __noinline
3837 cache_fplookup_final_withparent(struct cache_fpl *fpl)
3838 {
3839         struct componentname *cnp;
3840         enum vgetstate dvs, tvs;
3841         struct vnode *dvp, *tvp;
3842         seqc_t dvp_seqc;
3843         int error;
3844
3845         cnp = fpl->cnp;
3846         dvp = fpl->dvp;
3847         dvp_seqc = fpl->dvp_seqc;
3848         tvp = fpl->tvp;
3849
3850         MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
3851
3852         /*
3853          * This is less efficient than it can be for simplicity.
3854          */
3855         dvs = vget_prep_smr(dvp);
3856         if (__predict_false(dvs == VGET_NONE)) {
3857                 return (cache_fpl_aborted(fpl));
3858         }
3859         tvs = vget_prep_smr(tvp);
3860         if (__predict_false(tvs == VGET_NONE)) {
3861                 cache_fpl_smr_exit(fpl);
3862                 vget_abort(dvp, dvs);
3863                 return (cache_fpl_aborted(fpl));
3864         }
3865
3866         cache_fpl_smr_exit(fpl);
3867
3868         if ((cnp->cn_flags & LOCKPARENT) != 0) {
3869                 error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
3870                 if (__predict_false(error != 0)) {
3871                         vget_abort(tvp, tvs);
3872                         return (cache_fpl_aborted(fpl));
3873                 }
3874         } else {
3875                 vget_finish_ref(dvp, dvs);
3876         }
3877
3878         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3879                 vget_abort(tvp, tvs);
3880                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3881                         vput(dvp);
3882                 else
3883                         vrele(dvp);
3884                 return (cache_fpl_aborted(fpl));
3885         }
3886
3887         error = cache_fplookup_final_child(fpl, tvs);
3888         if (__predict_false(error != 0)) {
3889                 MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED);
3890                 if ((cnp->cn_flags & LOCKPARENT) != 0)
3891                         vput(dvp);
3892                 else
3893                         vrele(dvp);
3894                 return (error);
3895         }
3896
3897         MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
3898         return (0);
3899 }
3900
3901 static int
3902 cache_fplookup_final(struct cache_fpl *fpl)
3903 {
3904         struct componentname *cnp;
3905         enum vgetstate tvs;
3906         struct vnode *dvp, *tvp;
3907         seqc_t dvp_seqc;
3908
3909         cnp = fpl->cnp;
3910         dvp = fpl->dvp;
3911         dvp_seqc = fpl->dvp_seqc;
3912         tvp = fpl->tvp;
3913
3914         VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
3915
3916         if (cnp->cn_nameiop != LOOKUP) {
3917                 return (cache_fplookup_final_modifying(fpl));
3918         }
3919
3920         if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
3921                 return (cache_fplookup_final_withparent(fpl));
3922
3923         tvs = vget_prep_smr(tvp);
3924         if (__predict_false(tvs == VGET_NONE)) {
3925                 return (cache_fpl_partial(fpl));
3926         }
3927
3928         if (!vn_seqc_consistent(dvp, dvp_seqc)) {
3929                 cache_fpl_smr_exit(fpl);
3930                 vget_abort(tvp, tvs);
3931                 return (cache_fpl_aborted(fpl));
3932         }
3933
3934         cache_fpl_smr_exit(fpl);
3935         return (cache_fplookup_final_child(fpl, tvs));
3936 }
3937
3938 static int __noinline
3939 cache_fplookup_dot(struct cache_fpl *fpl)
3940 {
3941         struct vnode *dvp;
3942
3943         dvp = fpl->dvp;
3944
3945         fpl->tvp = dvp;
3946         fpl->tvp_seqc = vn_seqc_read_any(dvp);
3947         if (seqc_in_modify(fpl->tvp_seqc)) {
3948                 return (cache_fpl_aborted(fpl));
3949         }
3950
3951         counter_u64_add(dothits, 1);
3952         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp);
3953
3954         return (0);
3955 }
3956
3957 static int __noinline
3958 cache_fplookup_dotdot(struct cache_fpl *fpl)
3959 {
3960         struct nameidata *ndp;
3961         struct componentname *cnp;
3962         struct namecache *ncp;
3963         struct vnode *dvp;
3964         struct prison *pr;
3965         u_char nc_flag;
3966
3967         ndp = fpl->ndp;
3968         cnp = fpl->cnp;
3969         dvp = fpl->dvp;
3970
3971         /*
3972          * XXX this is racy the same way regular lookup is
3973          */
3974         for (pr = cnp->cn_cred->cr_prison; pr != NULL;
3975             pr = pr->pr_parent)
3976                 if (dvp == pr->pr_root)
3977                         break;
3978
3979         if (dvp == ndp->ni_rootdir ||
3980             dvp == ndp->ni_topdir ||
3981             dvp == rootvnode ||
3982             pr != NULL) {
3983                 fpl->tvp = dvp;
3984                 fpl->tvp_seqc = vn_seqc_read_any(dvp);
3985                 if (seqc_in_modify(fpl->tvp_seqc)) {
3986                         return (cache_fpl_aborted(fpl));
3987                 }
3988                 return (0);
3989         }
3990
3991         if ((dvp->v_vflag & VV_ROOT) != 0) {
3992                 /*
3993                  * TODO
3994                  * The opposite of climb mount is needed here.
3995                  */
3996                 return (cache_fpl_aborted(fpl));
3997         }
3998
3999         ncp = atomic_load_ptr(&dvp->v_cache_dd);
4000         if (ncp == NULL) {
4001                 return (cache_fpl_aborted(fpl));
4002         }
4003
4004         nc_flag = atomic_load_char(&ncp->nc_flag);
4005         if ((nc_flag & NCF_ISDOTDOT) != 0) {
4006                 if ((nc_flag & NCF_NEGATIVE) != 0)
4007                         return (cache_fpl_aborted(fpl));
4008                 fpl->tvp = ncp->nc_vp;
4009         } else {
4010                 fpl->tvp = ncp->nc_dvp;
4011         }
4012
4013         if (__predict_false(!cache_ncp_canuse(ncp))) {
4014                 return (cache_fpl_aborted(fpl));
4015         }
4016
4017         fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
4018         if (seqc_in_modify(fpl->tvp_seqc)) {
4019                 return (cache_fpl_partial(fpl));
4020         }
4021
4022         counter_u64_add(dotdothits, 1);
4023         return (0);
4024 }
4025
4026 static int __noinline
4027 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
4028 {
4029         u_char nc_flag;
4030         bool neg_promote;
4031
4032         nc_flag = atomic_load_char(&ncp->nc_flag);
4033         MPASS((nc_flag & NCF_NEGATIVE) != 0);
4034         /*
4035          * If they want to create an entry we need to replace this one.
4036          */
4037         if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
4038                 /*
4039                  * TODO
4040                  * This should call something similar to
4041                  * cache_fplookup_final_modifying.
4042                  */
4043                 return (cache_fpl_partial(fpl));
4044         }
4045         neg_promote = cache_neg_hit_prep(ncp);
4046         if (__predict_false(!cache_ncp_canuse(ncp))) {
4047                 cache_neg_hit_abort(ncp);
4048                 return (cache_fpl_partial(fpl));
4049         }
4050         if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
4051                 cache_neg_hit_abort(ncp);
4052                 return (cache_fpl_partial(fpl));
4053         }
4054         if (neg_promote) {
4055                 return (cache_fplookup_negative_promote(fpl, ncp, hash));
4056         }
4057         cache_neg_hit_finish(ncp);
4058         cache_fpl_smr_exit(fpl);
4059         return (cache_fpl_handled(fpl, ENOENT));
4060 }
4061
4062 static int
4063 cache_fplookup_next(struct cache_fpl *fpl)
4064 {
4065         struct componentname *cnp;
4066         struct namecache *ncp;
4067         struct vnode *dvp, *tvp;
4068         u_char nc_flag;
4069         uint32_t hash;
4070
4071         cnp = fpl->cnp;
4072         dvp = fpl->dvp;
4073
4074         if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
4075                 return (cache_fplookup_dot(fpl));
4076         }
4077
4078         hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
4079
4080         CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
4081                 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
4082                     !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
4083                         break;
4084         }
4085
4086         /*
4087          * If there is no entry we have to punt to the slow path to perform
4088          * actual lookup. Should there be nothing with this name a negative
4089          * entry will be created.
4090          */
4091         if (__predict_false(ncp == NULL)) {
4092                 return (cache_fpl_partial(fpl));
4093         }
4094
4095         tvp = atomic_load_ptr(&ncp->nc_vp);
4096         nc_flag = atomic_load_char(&ncp->nc_flag);
4097         if ((nc_flag & NCF_NEGATIVE) != 0) {
4098                 return (cache_fplookup_neg(fpl, ncp, hash));
4099         }
4100
4101         if (__predict_false(!cache_ncp_canuse(ncp))) {
4102                 return (cache_fpl_partial(fpl));
4103         }
4104
4105         fpl->tvp = tvp;
4106         fpl->tvp_seqc = vn_seqc_read_any(tvp);
4107         if (seqc_in_modify(fpl->tvp_seqc)) {
4108                 return (cache_fpl_partial(fpl));
4109         }
4110
4111         if (!cache_fplookup_vnode_supported(tvp)) {
4112                 return (cache_fpl_partial(fpl));
4113         }
4114
4115         counter_u64_add(numposhits, 1);
4116         SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
4117         return (0);
4118 }
4119
4120 static bool
4121 cache_fplookup_mp_supported(struct mount *mp)
4122 {
4123
4124         if (mp == NULL)
4125                 return (false);
4126         if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
4127                 return (false);
4128         return (true);
4129 }
4130
4131 /*
4132  * Walk up the mount stack (if any).
4133  *
4134  * Correctness is provided in the following ways:
4135  * - all vnodes are protected from freeing with SMR
4136  * - struct mount objects are type stable making them always safe to access
4137  * - stability of the particular mount is provided by busying it
4138  * - relationship between the vnode which is mounted on and the mount is
4139  *   verified with the vnode sequence counter after busying
4140  * - association between root vnode of the mount and the mount is protected
4141  *   by busy
4142  *
4143  * From that point on we can read the sequence counter of the root vnode
4144  * and get the next mount on the stack (if any) using the same protection.
4145  *
4146  * By the end of successful walk we are guaranteed the reached state was
4147  * indeed present at least at some point which matches the regular lookup.
4148  */
4149 static int __noinline
4150 cache_fplookup_climb_mount(struct cache_fpl *fpl)
4151 {
4152         struct mount *mp, *prev_mp;
4153         struct vnode *vp;
4154         seqc_t vp_seqc;
4155
4156         vp = fpl->tvp;
4157         vp_seqc = fpl->tvp_seqc;
4158
4159         VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp);
4160         mp = atomic_load_ptr(&vp->v_mountedhere);
4161         if (mp == NULL)
4162                 return (0);
4163
4164         prev_mp = NULL;
4165         for (;;) {
4166                 if (!vfs_op_thread_enter_crit(mp)) {
4167                         if (prev_mp != NULL)
4168                                 vfs_op_thread_exit_crit(prev_mp);
4169                         return (cache_fpl_partial(fpl));
4170                 }
4171                 if (prev_mp != NULL)
4172                         vfs_op_thread_exit_crit(prev_mp);
4173                 if (!vn_seqc_consistent(vp, vp_seqc)) {
4174                         vfs_op_thread_exit_crit(mp);
4175                         return (cache_fpl_partial(fpl));
4176                 }
4177                 if (!cache_fplookup_mp_supported(mp)) {
4178                         vfs_op_thread_exit_crit(mp);
4179                         return (cache_fpl_partial(fpl));
4180                 }
4181                 vp = atomic_load_ptr(&mp->mnt_rootvnode);
4182                 if (vp == NULL || VN_IS_DOOMED(vp)) {
4183                         vfs_op_thread_exit_crit(mp);
4184                         return (cache_fpl_partial(fpl));
4185                 }
4186                 vp_seqc = vn_seqc_read_any(vp);
4187                 if (seqc_in_modify(vp_seqc)) {
4188                         vfs_op_thread_exit_crit(mp);
4189                         return (cache_fpl_partial(fpl));
4190                 }
4191                 prev_mp = mp;
4192                 mp = atomic_load_ptr(&vp->v_mountedhere);
4193                 if (mp == NULL)
4194                         break;
4195         }
4196
4197         vfs_op_thread_exit_crit(prev_mp);
4198         fpl->tvp = vp;
4199         fpl->tvp_seqc = vp_seqc;
4200         return (0);
4201 }
4202
4203 static bool
4204 cache_fplookup_need_climb_mount(struct cache_fpl *fpl)
4205 {
4206         struct mount *mp;
4207         struct vnode *vp;
4208
4209         vp = fpl->tvp;
4210
4211         /*
4212          * Hack: while this is a union, the pointer tends to be NULL so save on
4213          * a branch.
4214          */
4215         mp = atomic_load_ptr(&vp->v_mountedhere);
4216         if (mp == NULL)
4217                 return (false);
4218         if (vp->v_type == VDIR)
4219                 return (true);
4220         return (false);
4221 }
4222
4223 /*
4224  * Parse the path.
4225  *
4226  * The code was originally copy-pasted from regular lookup and despite
4227  * clean ups leaves performance on the table. Any modifications here
4228  * must take into account that in case off fallback the resulting
4229  * nameidata state has to be compatible with the original.
4230  */
4231 static int
4232 cache_fplookup_parse(struct cache_fpl *fpl)
4233 {
4234         struct nameidata *ndp;
4235         struct componentname *cnp;
4236         char *cp;
4237
4238         ndp = fpl->ndp;
4239         cnp = fpl->cnp;
4240
4241         /*
4242          * Search a new directory.
4243          *
4244          * The last component of the filename is left accessible via
4245          * cnp->cn_nameptr for callers that need the name. Callers needing
4246          * the name set the SAVENAME flag. When done, they assume
4247          * responsibility for freeing the pathname buffer.
4248          */
4249         for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
4250                 continue;
4251         cnp->cn_namelen = cp - cnp->cn_nameptr;
4252         if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
4253                 cache_fpl_smr_exit(fpl);
4254                 return (cache_fpl_handled(fpl, ENAMETOOLONG));
4255         }
4256         ndp->ni_pathlen -= cnp->cn_namelen;
4257         KASSERT(ndp->ni_pathlen <= PATH_MAX,
4258             ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
4259         ndp->ni_next = cp;
4260
4261         /*
4262          * Replace multiple slashes by a single slash and trailing slashes
4263          * by a null.  This must be done before VOP_LOOKUP() because some
4264          * fs's don't know about trailing slashes.  Remember if there were
4265          * trailing slashes to handle symlinks, existing non-directories
4266          * and non-existing files that won't be directories specially later.
4267          */
4268         while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
4269                 cp++;
4270                 ndp->ni_pathlen--;
4271                 if (*cp == '\0') {
4272                         /*
4273                          * TODO
4274                          * Regular lookup performs the following:
4275                          * *ndp->ni_next = '\0';
4276                          * cnp->cn_flags |= TRAILINGSLASH;
4277                          *
4278                          * Which is problematic since it modifies data read
4279                          * from userspace. Then if fast path lookup was to
4280                          * abort we would have to either restore it or convey
4281                          * the flag. Since this is a corner case just ignore
4282                          * it for simplicity.
4283                          */
4284                         return (cache_fpl_partial(fpl));
4285                 }
4286         }
4287         ndp->ni_next = cp;
4288
4289         /*
4290          * Check for degenerate name (e.g. / or "")
4291          * which is a way of talking about a directory,
4292          * e.g. like "/." or ".".
4293          *
4294          * TODO
4295          * Another corner case handled by the regular lookup
4296          */
4297         if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
4298                 return (cache_fpl_partial(fpl));
4299         }
4300         return (0);
4301 }
4302
4303 static void
4304 cache_fplookup_parse_advance(struct cache_fpl *fpl)
4305 {
4306         struct nameidata *ndp;
4307         struct componentname *cnp;
4308
4309         ndp = fpl->ndp;
4310         cnp = fpl->cnp;
4311
4312         cnp->cn_nameptr = ndp->ni_next;
4313         while (*cnp->cn_nameptr == '/') {
4314                 cnp->cn_nameptr++;
4315                 ndp->ni_pathlen--;
4316         }
4317 }
4318
4319 /*
      * Decide what to do after VOP_FPLOOKUP_VEXEC failed on fpl->dvp.
      *
      * Depending on the state of the vnode and on the error this either aborts
      * the lockless lookup, marks it as partial (EAGAIN with an unchanged
      * sequence counter) or finishes it with the error. A dvp which is not a
      * directory and is not doomed has its error replaced with ENOTDIR first.
      *
      * Returns the result of cache_fpl_aborted() if the lookup got aborted and
      * the (possibly replaced) error otherwise.
      *
4320  * See the API contract for VOP_FPLOOKUP_VEXEC.
4321  */
4322 static int __noinline
4323 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
4324 {
4325         struct componentname *cnp;
4326         struct vnode *dvp;
4327         seqc_t dvp_seqc;
4328
             /*
              * NOTE(review): cnp is assigned here but not read anywhere else
              * in this function.
              */
4329         cnp = fpl->cnp;
4330         dvp = fpl->dvp;
4331         dvp_seqc = fpl->dvp_seqc;
4332
4333         /*
4334          * Hack: they may be looking up foo/bar, where foo is a
4335          * regular file. In such a case we need to return ENOTDIR,
4336          * but we may happen to get here with a different error.
4337          */
4338         if (dvp->v_type != VDIR) {
4339                 /*
4340                  * The check here is predominantly to catch
4341                  * EOPNOTSUPP from dead_vnodeops. If the vnode
4342                  * gets doomed past this point it is going to
4343                  * fail seqc verification.
4344                  */
4345                 if (VN_IS_DOOMED(dvp)) {
4346                         return (cache_fpl_aborted(fpl));
4347                 }
4348                 error = ENOTDIR;
4349         }
4350
4351         /*
4352          * Hack: handle O_SEARCH.
4353          *
4354          * Open Group Base Specifications Issue 7, 2018 edition states:
4355          * If the access mode of the open file description associated with the
4356          * file descriptor is not O_SEARCH, the function shall check whether
4357          * directory searches are permitted using the current permissions of
4358          * the directory underlying the file descriptor. If the access mode is
4359          * O_SEARCH, the function shall not perform the check.
4360          *
4361          * Regular lookup tests for the NOEXECCHECK flag for every path
4362          * component to decide whether to do the permission check. However,
4363          * since most lookups never have the flag (and when they do it is only
4364          * present for the first path component), lockless lookup only acts on
4365          * it if there is a permission problem. Here the flag is represented
4366          * with a boolean so that we don't have to clear it on the way out.
4367          *
4368          * For simplicity this always aborts.
4369          * TODO: check if this is the first lookup and ignore the permission
4370          * problem. Note the flag has to survive fallback (if it happens to be
4371          * performed).
4372          */
4373         if (fpl->fsearch) {
4374                 return (cache_fpl_aborted(fpl));
4375         }
4376
             /*
              * Both arms below validate dvp against the saved sequence counter
              * first: if the counter changed, the result of the permission
              * check is not trustworthy and the lookup is aborted.
              */
4377         switch (error) {
4378         case EAGAIN:
                     /*
                      * Per the VOP_FPLOOKUP_VEXEC contract EAGAIN denotes
                      * that checking could not be performed. With dvp
                      * unchanged the lookup is marked as partial, presumably
                      * so that it can be continued by the regular lookup
                      * (confirm against cache_fpl_partial).
                      */
4379                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4380                         error = cache_fpl_aborted(fpl);
4381                 } else {
4382                         cache_fpl_partial(fpl);
4383                 }
4384                 break;
4385         default:
                     /*
                      * With dvp unchanged the error is final: leave the SMR
                      * section and record the error as the outcome of the
                      * lookup.
                      */
4386                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
4387                         error = cache_fpl_aborted(fpl);
4388                 } else {
4389                         cache_fpl_smr_exit(fpl);
4390                         cache_fpl_handled(fpl, error);
4391                 }
4392                 break;
4393         }
4394         return (error);
4395 }
4396
     /*
      * Main loop of the lockless lookup: walk the path one component at a time
      * starting from dvp.
      *
      * Each iteration parses a component, checks search permission on the
      * current directory with VOP_FPLOOKUP_VEXEC and resolves the component
      * (cache_fplookup_dotdot() for "..", otherwise cache_fplookup_next()
      * followed by cache_fplookup_climb_mount() when
      * cache_fplookup_need_climb_mount() asks for it). Unless this was the last
      * component, the sequence counter of the directory is validated and the
      * target becomes the directory for the next iteration.
      *
      * The return value depends on fpl->status once the loop is left:
      * - PARTIAL: whatever cache_fplookup_partial_setup() returns
      * - ABORTED: CACHE_FPL_FAILED, after leaving the SMR section if still in it
      * - HANDLED: the error produced by the loop; on success ni_dvp and ni_vp are
      *   filled in from fpl->dvp and fpl->tvp, on failure both are set to NULL
      */
4397 static int
4398 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
4399 {
4400         struct nameidata *ndp;
4401         struct componentname *cnp;
4402         struct mount *mp;
4403         int error;
4404
4405         error = CACHE_FPL_FAILED;
4406         ndp = fpl->ndp;
4407         cnp = fpl->cnp;
4408
             /*
              * Record the current state in fpl->snd; this is refreshed after
              * every component at the bottom of the loop.
              * NOTE(review): presumably consumed by
              * cache_fplookup_partial_setup -- confirm.
              */
4409         cache_fpl_checkpoint(fpl, &fpl->snd);
4410
             /*
              * Do not start from a directory which is in the middle of a
              * modification or which resides on a mount point that does not
              * support lockless lookup.
              */
4411         fpl->dvp = dvp;
4412         fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
4413         if (seqc_in_modify(fpl->dvp_seqc)) {
4414                 cache_fpl_aborted(fpl);
4415                 goto out;
4416         }
4417         mp = atomic_load_ptr(&fpl->dvp->v_mount);
4418         if (!cache_fplookup_mp_supported(mp)) {
4419                 cache_fpl_aborted(fpl);
4420                 goto out;
4421         }
4422
4423         VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4424
4425         for (;;) {
4426                 error = cache_fplookup_parse(fpl);
4427                 if (__predict_false(error != 0)) {
4428                         break;
4429                 }
4430
4431                 VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
4432
4433                 error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
4434                 if (__predict_false(error != 0)) {
4435                         error = cache_fplookup_failed_vexec(fpl, error);
4436                         break;
4437                 }
4438
4439                 if (__predict_false(cache_fpl_isdotdot(cnp))) {
4440                         error = cache_fplookup_dotdot(fpl);
4441                         if (__predict_false(error != 0)) {
4442                                 break;
4443                         }
4444                 } else {
4445                         error = cache_fplookup_next(fpl);
4446                         if (__predict_false(error != 0)) {
4447                                 break;
4448                         }
4449
4450                         VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4451
4452                         if (cache_fplookup_need_climb_mount(fpl)) {
4453                                 error = cache_fplookup_climb_mount(fpl);
4454                                 if (__predict_false(error != 0)) {
4455                                         break;
4456                                 }
4457                         }
4458                 }
4459
4460                 VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
4461
4462                 if (cache_fpl_islastcn(ndp)) {
4463                         error = cache_fplookup_final(fpl);
4464                         break;
4465                 }
4466
                     /*
                      * The step from dvp to tvp only counts if dvp did not
                      * change while it was being made.
                      */
4467                 if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
4468                         error = cache_fpl_aborted(fpl);
4469                         break;
4470                 }
4471
                     /*
                      * The target, along with the counter value it was found
                      * with, becomes the directory for the next component.
                      */
4472                 fpl->dvp = fpl->tvp;
4473                 fpl->dvp_seqc = fpl->tvp_seqc;
4474
4475                 cache_fplookup_parse_advance(fpl);
4476                 cache_fpl_checkpoint(fpl, &fpl->snd);
4477         }
4478 out:
             /*
              * NOTE(review): there is no default case and the UNSET case ends
              * with a break, so that path would reach the end of a non-void
              * function; the code relies on __assert_unreachable() there.
              */
4479         switch (fpl->status) {
4480         case CACHE_FPL_STATUS_UNSET:
4481                 __assert_unreachable();
4482                 break;
4483         case CACHE_FPL_STATUS_PARTIAL:
4484                 cache_fpl_smr_assert_entered(fpl);
4485                 return (cache_fplookup_partial_setup(fpl));
4486         case CACHE_FPL_STATUS_ABORTED:
                     /* An abort can happen both inside and outside of SMR. */
4487                 if (fpl->in_smr)
4488                         cache_fpl_smr_exit(fpl);
4489                 return (CACHE_FPL_FAILED);
4490         case CACHE_FPL_STATUS_HANDLED:
4491                 MPASS(error != CACHE_FPL_FAILED);
4492                 cache_fpl_smr_assert_not_entered(fpl);
4493                 if (__predict_false(error != 0)) {
4494                         ndp->ni_dvp = NULL;
4495                         ndp->ni_vp = NULL;
4496                         cache_fpl_cleanup_cnp(cnp);
4497                         return (error);
4498                 }
4499                 ndp->ni_dvp = fpl->dvp;
4500                 ndp->ni_vp = fpl->tvp;
                     /*
                      * With SAVENAME the component name is only flagged with
                      * HASBUF and cache_fpl_cleanup_cnp() is skipped;
                      * presumably that routine releases the path buffer
                      * (confirm against its definition).
                      */
4501                 if (cnp->cn_flags & SAVENAME)
4502                         cnp->cn_flags |= HASBUF;
4503                 else
4504                         cache_fpl_cleanup_cnp(cnp);
4505                 return (error);
4506         }
4507 }
4508
4509 /*
4510  * Fast path lookup protected with SMR and sequence counters.
4511  *
4512  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
4513  *
4514  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
4515  * outlined below.
4516  *
4517  * Traditional vnode lookup conceptually looks like this:
4518  *
4519  * vn_lock(current);
4520  * for (;;) {
4521  *      next = find();
4522  *      vn_lock(next);
4523  *      vn_unlock(current);
4524  *      current = next;
4525  *      if (last)
4526  *          break;
4527  * }
4528  * return (current);
4529  *
4530  * Each jump to the next vnode is safe memory-wise and atomic with respect to
4531  * any modifications thanks to holding respective locks.
4532  *
4533  * The same guarantee can be provided with a combination of safe memory
4534  * reclamation and sequence counters instead. If all operations which affect
4535  * the relationship between the current vnode and the one we are looking for
4536  * also modify the counter, we can verify whether all the conditions held as
4537  * we made the jump. This includes things like permissions, mount points etc.
4538  * Counter modification is provided by enclosing relevant places in
4539  * vn_seqc_write_begin()/end() calls.
4540  *
4541  * Thus this translates to:
4542  *
4543  * vfs_smr_enter();
4544  * dvp_seqc = seqc_read_any(dvp);
4545  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
4546  *     abort();
4547  * for (;;) {
4548  *      tvp = find();
4549  *      tvp_seqc = seqc_read_any(tvp);
4550  *      if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
4551  *          abort();
4552  *      if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
4553  *          abort();
4554  *      dvp = tvp; // we know nothing of importance has changed
4555  *      dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
4556  *      if (last)
4557  *          break;
4558  * }
4559  * vget(); // secure the vnode
4560  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
4561  *          abort();
4562  * // at this point we know nothing has changed for any parent<->child pair
4563  * // as they were crossed during the lookup, meaning we matched the guarantee
4564  * // of the locked variant
4565  * return (tvp);
4566  *
4567  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
4568  * - they are called while within vfs_smr protection which they must never exit
4569  * - EAGAIN can be returned to denote checking could not be performed, it is
4570  *   always valid to return it
4571  * - if the sequence counter has not changed the result must be valid
4572  * - if the sequence counter has changed both false positives and false negatives
4573  *   are permitted (since the result will be rejected later)
4574  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
4575  *
4576  * Caveats to watch out for:
4577  * - vnodes are passed unlocked and unreferenced with nothing stopping
4578  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
4579  *   to use atomic_load_ptr to fetch it.
4580  * - the aforementioned object can also get freed, meaning absent other means it
4581  *   should be protected with vfs_smr
4582  * - either safely checking permissions as they are modified or guaranteeing
4583  *   their stability is left to the routine
      *
      * Arguments:
      * - ndp: the lookup request; ni_rootdir and ni_topdir are overwritten from
      *   the pwd obtained with pwd_get_smr() and the component name state in
      *   ni_cnd is worked on directly
      * - status: set to the final status of the attempt on every return path
      * - pwdp: set to the pwd used for the lookup, but only if the status is
      *   CACHE_FPL_STATUS_PARTIAL
      *
      * Returns EOPNOTSUPP if cache_can_fplookup() rejects the request, otherwise
      * the error from cache_fplookup_dirfd() or cache_fplookup_impl().
4584  */
4585 int
4586 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
4587     struct pwd **pwdp)
4588 {
4589         struct cache_fpl fpl;
4590         struct pwd *pwd;
4591         struct vnode *dvp;
4592         struct componentname *cnp;
4593         struct nameidata_saved orig;
4594         int error;
4595
4596         MPASS(ndp->ni_lcf == 0);
4597
4598         fpl.status = CACHE_FPL_STATUS_UNSET;
4599         fpl.ndp = ndp;
4600         fpl.cnp = &ndp->ni_cnd;
4601         MPASS(curthread == fpl.cnp->cn_thread);
4602
             /* SAVESTART is only expected for operations other than LOOKUP. */
4603         if ((fpl.cnp->cn_flags & SAVESTART) != 0)
4604                 MPASS(fpl.cnp->cn_nameiop != LOOKUP);
4605
4606         if (!cache_can_fplookup(&fpl)) {
4607                 SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4608                 *status = fpl.status;
4609                 return (EOPNOTSUPP);
4610         }
4611
             /*
              * Snapshot the request so that an aborted attempt can be undone
              * with cache_fpl_restore() below.
              */
4612         cache_fpl_checkpoint(&fpl, &orig);
4613
4614         cache_fpl_smr_enter_initial(&fpl);
4615         fpl.fsearch = false;
4616         pwd = pwd_get_smr();
4617         fpl.pwd = pwd;
4618         ndp->ni_rootdir = pwd->pwd_rdir;
4619         ndp->ni_topdir = pwd->pwd_jdir;
4620
             /*
              * Pick the starting directory: cache_fpl_handle_root() for paths
              * starting with a slash, the current working directory for
              * AT_FDCWD and whatever cache_fplookup_dirfd() provides otherwise.
              * The assertion at the out label implies cache_fplookup_dirfd()
              * has left the SMR section if it fails.
              */
4621         cnp = fpl.cnp;
4622         cnp->cn_nameptr = cnp->cn_pnbuf;
4623         if (cnp->cn_pnbuf[0] == '/') {
4624                 cache_fpl_handle_root(ndp, &dvp);
4625         } else {
4626                 if (ndp->ni_dirfd == AT_FDCWD) {
4627                         dvp = pwd->pwd_cdir;
4628                 } else {
4629                         error = cache_fplookup_dirfd(&fpl, &dvp);
4630                         if (__predict_false(error != 0)) {
4631                                 goto out;
4632                         }
4633                 }
4634         }
4635
4636         SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
4637
4638         error = cache_fplookup_impl(dvp, &fpl);
4639 out:
4640         cache_fpl_smr_assert_not_entered(&fpl);
4641         SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
4642
4643         *status = fpl.status;
4644         switch (fpl.status) {
4645         case CACHE_FPL_STATUS_UNSET:
4646                 __assert_unreachable();
4647                 break;
4648         case CACHE_FPL_STATUS_HANDLED:
4649                 SDT_PROBE3(vfs, namei, lookup, return, error,
4650                     (error == 0 ? ndp->ni_vp : NULL), true);
4651                 break;
4652         case CACHE_FPL_STATUS_PARTIAL:
4653                 *pwdp = fpl.pwd;
4654                 /*
4655                  * Status restored by cache_fplookup_partial_setup.
4656                  */
4657                 break;
4658         case CACHE_FPL_STATUS_ABORTED:
                     /* Undo any changes made to the request by this attempt. */
4659                 cache_fpl_restore(&fpl, &orig);
4660                 break;
4661         }
4662         return (error);
4663 }